# IR - HW1 Vector Space Model
* In this project, we will have  
    * 50 Queries  
    * 4191 Documents

Our goal is to implement a vector space model, and print out the ranking results for all of the queries.
https://www.kaggle.com/t/7f84706b7b074267ae314582825fb725


In [1]:
import numpy as np
import pandas as pd
import math
import os
import time
from sklearn.preprocessing import normalize

## Open files

In [2]:
# return word vector
def open_files(path_query = "data/queries", path_docs = "data/docs", extension = ".txt"):
    qlf = open("data/query_list.txt")
    dlf = open("data/doc_list.txt")
    
    querys = {}
    for fname in qlf:
        fname = fname.strip("\n")
        file = os.path.join(path_query, fname + extension)
        
        fq = open(file)
        query = []
        for q in fq:
            q = q.strip("\n").split(" ")
            for term in q:
                query.append(term)
        querys[fname] = query
        fq.close()

    docs = {}
    for fname in dlf:
        fname = fname.strip("\n")
        file = os.path.join(path_docs, fname + extension)
        
        fd = open(file)
        doc = []
        for d in fd:
            d = d.strip("\n").split(" ")
            for term in d:
                doc.append(term)
        docs[fname] = doc
        fd.close()

    dlf.close()
    qlf.close()

    return querys, docs

In [3]:
# return sentence
def open_files2(path_query = "data/queries", path_docs = "data/docs", extension = ".txt"):
    qlf = open("data/query_list.txt")
    dlf = open("data/doc_list.txt")
    
    querys = {}
    for fname in qlf:
        fname = fname.strip("\n")
        file = os.path.join(path_query, fname + extension)
        
        fq = open(file)
        query = []
        for q in fq:
            query.append(q.strip("\n"))
        querys[fname] = query
        fq.close()

    docs = {}
    for fname in dlf:
        fname = fname.strip("\n")
        file = os.path.join(path_docs, fname + extension)
        
        fd = open(file)
        doc = []
        for d in fd:
            doc.append(d.strip("\n"))
        docs[fname] = doc
        fd.close()

    dlf.close()
    qlf.close()

    return querys, docs

## TF-IDF

In [4]:
def tfidf(all_dict):
    
    tf = {}
    ni = {}
    for name, terms in all_dict.items():
        tf[name] = {}
        check_term = {}
        for term in terms:
            # ni
            try:
                if term not in check_term:
                    ni[term] += 1
                    check_term[term] = True
            except:
                ni[term] = 1
            # TF
            try:
#                 tf[name][term] += 1 / len(terms)
                tf[name][term] += 1
            except:
#                 tf[name][term] = 1 / len(terms)
                tf[name][term] = 1
    
    # IDF
    idf = {}
    # score dict initial
    score_dict = {}
    # col name
    col_name = []
    for term, times in ni.items():
        # smooth idf :prevent zero divisions
        idf[term] = math.log( ((1 + len(all_dict)) / (1 + times)) + 1) 
        score_dict[term] = 0.0
        col_name.append(term)
    
    # TFIDF
    tfidf_dict = {}
    tfidf_score = []
    # index
    index = []
    for name, _ in all_dict.items():
        index.append(name)
        tfidf_dict[name] = {}
        for term, tf_score in tf[name].items():
            # sublinear_tf: replace tf with 1 + log(tf).
            sublinear_tf = (1 + math.log(tf_score))
            tfidf_dict[name][term] = sublinear_tf * idf[term]
            score_dict[term] = sublinear_tf * idf[term]
        tfidf_score.append(list(score_dict.values()))
        # Clean up score dict
        for term in tf[name].keys():
            score_dict[term] = 0.0
    tfidf_result = normalize(tfidf_score, norm='l2')
    
    tfidf_normalize = pd.DataFrame(tfidf_result, columns=col_name, index=index)
    return tfidf_dict, tfidf_normalize

In [5]:
def cosine(vec1, vec2):
    dot = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    cos = dot / (norm1 * norm2 + 1)
    return cos

## Test section

In [6]:
querys_dict, docs_dict = open_files()
# print(docs_dict["FBIS3-23"])

all_dict = querys_dict.copy()
all_dict.update(docs_dict)

In [7]:
start = time.time()
tfidf_dict, tfidf_normalize = tfidf(all_dict)
print(f"Cost time: {time.time() - start}")

Cost time: 27.143171072006226


In [8]:
tfidf_normalize

Unnamed: 0,intern,organ,crime,poliomyel,post,polio,hubbl,telescop,achiev,endang,...,hammanskra,mavuso,mzondi,jabulani,14100,14600,holcomb,reigel,verna,caraccilo
301,0.389657,0.544465,0.742782,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
302,0.000000,0.000000,0.000000,0.753336,0.339894,0.562989,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
303,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.689357,0.671312,0.272264,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
304,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.502012,...,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
305,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LA123090-0026,0.000000,0.096396,0.000000,0.000000,0.000000,0.225757,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
LA123189-0136,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.034706,...,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
LA123190-0040,0.000000,0.000000,0.018760,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.05774,0.05774,0.05774,0.05774,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
LA123190-0062,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## Vector space model
把算出來的COSINE SIM排序

In [9]:
start = time.time()

sim_dict = {}
sorted_sim_dict = {}
for fqname in querys_dict.keys():
    sim_dict[fqname] = {}
    query_vec = []
    query_term = list(tfidf_dict[fqname].keys())
    for term in query_term:
        query_vec.append(tfidf_normalize.loc[fqname, term])
    for fdname in docs_dict.keys():
        doc_vec = []
        for term in query_term:
            doc_vec.append(tfidf_normalize.loc[fdname, term])
        sim_dict[fqname][fdname] = cosine(query_vec, doc_vec)
    sorted_sim_dict[fqname] = sorted(sim_dict[fqname], key=sim_dict[fqname].get, reverse=True)

print(f"Cost time: {time.time() - start}")

Cost time: 9.94819188117981


In [10]:
sorted_sim_dict['301'][:5]

['FBIS3-23986', 'FBIS3-19199', 'FBIS3-55219', 'FBIS3-19646', 'FBIS3-21961']

## output file

In [11]:
if os.path.exists("result.txt"):
    os.remove("result.txt")
    
with open("result.txt", "w") as ofile:
    ofile.write("Query,RetrievedDocuments\n")
    for query_name, score_list in sorted_sim_dict.items():
        ofile.write(query_name + ",")
        for score in score_list:
            ofile.write(score + " ")
        ofile.write("\n")