# IR - HW1 Vector Space Model
* In this project, we will have  
    * 50 Queries  
    * 4191 Documents

Our goal is to implement a vector space model, and print out the ranking results for all of the queries.
https://www.kaggle.com/t/7f84706b7b074267ae314582825fb725


In [1]:
import numpy as np
import pandas as pd
import math
import os
import time

# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

## Open files

In [2]:
# return word vector
def open_files(path_query = "data/queries", path_docs = "data/docs", extension = ".txt"):
    qlf = open("data/query_list.txt")
    dlf = open("data/doc_list.txt")
    
    querys = {}
    for fname in qlf:
        fname = fname.strip("\n")
        file = os.path.join(path_query, fname + extension)
        
        fq = open(file)
        query = []
        for q in fq:
            q = q.strip("\n").split(" ")
            for term in q:
                query.append(term)
        querys[fname] = query
        fq.close()

    docs = {}
    for fname in dlf:
        fname = fname.strip("\n")
        file = os.path.join(path_docs, fname + extension)
        
        fd = open(file)
        doc = []
        for d in fd:
            d = d.strip("\n").split(" ")
            for term in d:
                doc.append(term)
        docs[fname] = doc
        fd.close()

    dlf.close()
    qlf.close()

    return querys, docs

In [3]:
# return sentence
def open_files2(path_query = "data/queries", path_docs = "data/docs", extension = ".txt"):
    qlf = open("data/query_list.txt")
    dlf = open("data/doc_list.txt")
    
    querys = {}
    for fname in qlf:
        fname = fname.strip("\n")
        file = os.path.join(path_query, fname + extension)
        
        fq = open(file)
        query = []
        for q in fq:
            query.append(q.strip("\n"))
        querys[fname] = query
        fq.close()

    docs = {}
    for fname in dlf:
        fname = fname.strip("\n")
        file = os.path.join(path_docs, fname + extension)
        
        fd = open(file)
        doc = []
        for d in fd:
            doc.append(d.strip("\n"))
        docs[fname] = doc
        fd.close()

    dlf.close()
    qlf.close()

    return querys, docs

## TF-IDF

In [4]:
def tfidf(all_dict):
    
    tf = {}
    ni = {}
    for name, terms in all_dict.items():
        tf[name] = {}
        check_term = {}
        for term in terms:
            # ni
            try:
                if term not in check_term:
                    ni[term] += 1
                    check_term[term] = True
            except:
                ni[term] = 1
            # TF
            try:
#                 tf[name][term] += 1 / len(terms)
                tf[name][term] += 1
            except:
#                 tf[name][term] = 1 / len(terms)
                tf[name][term] = 1
    
    # IDF
    idf = {}
    # score dict initial
    score_dict = {}
    # col name
    col_name = []
    for term, times in ni.items():
        # smooth idf :prevent zero divisions
        idf[term] = math.log( ((1 + len(all_dict)) / (1 + times)) ) + 1
        score_dict[term] = 0.0
        col_name.append(term)
    
    # TFIDF
    tfidf_dict = {}
    tfidf_score = []
    # index
    index = []
    for name, _ in all_dict.items():
        index.append(name)
        tfidf_dict[name] = {}
        for term, tf_score in tf[name].items():
            # sublinear_tf: replace tf with 1 + log(tf).
            sublinear_tf = (1 + math.log(tf_score))
            tfidf_dict[name][term] = sublinear_tf * idf[term]
            score_dict[term] = sublinear_tf * idf[term]
        tfidf_score.append(list(score_dict.values()))
        # Clean up score dict
        for term in tf[name].keys():
            score_dict[term] = 0.0
    tfidf_result = normalize(tfidf_score, norm='l2')
    
    return tfidf_dict, tfidf_result, col_name, index

## Vector space model
把算出來的COSINE SIM排序

In [5]:
def cosine_similarity(vec1, vec2):
    dot = sum(v1 * v2 for v1, v2 in zip(vec1, vec2))   
    norm1 = math.sqrt(sum(v1 * v1 for v1 in vec1))
    norm2 = math.sqrt(sum(v2 * v2 for v2 in vec2))
    
    cos_sim = dot / (norm1 * norm2)
    return cos_sim

In [6]:
# test cosine similarity
vec1 = [1, 2, 3]
vec2 = [3, 4, 5]
cosine_similarity(vec1, vec2)

0.9827076298239908

## Test section

In [7]:
querys_dict, docs_dict = open_files()
# print(docs_dict["FBIS3-23"])

all_dict = querys_dict.copy()
all_dict.update(docs_dict)

In [8]:
start = time.time()
tfidf_dict, tfidf_result, col_name, index = tfidf(all_dict)
print(time.time() - start)

18.85719084739685


In [9]:
df_tfidf = pd.DataFrame(tfidf_result, columns=col_name, index=index)
df_tfidf.loc['LA072189-0048', :]

intern       0.0
organ        0.0
crime        0.0
poliomyel    0.0
post         0.0
            ... 
14600        0.0
holcomb      0.0
reigel       0.0
verna        0.0
caraccilo    0.0
Name: LA072189-0048, Length: 59680, dtype: float64

In [10]:
df_tfidf

Unnamed: 0,intern,organ,crime,poliomyel,post,polio,hubbl,telescop,achiev,endang,...,hammanskra,mavuso,mzondi,jabulani,14100,14600,holcomb,reigel,verna,caraccilo
301,0.418991,0.557749,0.716493,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
302,0.000000,0.000000,0.000000,0.728294,0.379738,0.570427,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
303,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.678101,0.662881,0.31744,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
304,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.518474,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
305,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LA123090-0026,0.000000,0.110526,0.000000,0.000000,0.000000,0.219452,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
LA123189-0136,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.037941,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
LA123190-0040,0.000000,0.000000,0.020386,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.051888,0.051888,0.051888,0.051888,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
LA123190-0062,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [11]:
docs_vec = {}
col_name = []
for fdname in docs_dict.keys():
    docs_vec[fdname] = list(df_tfidf.loc[fdname,:])
    col_name.append(fdname)

In [12]:
query_vec = {}
row_name = []
for fqname in querys_dict.keys():
    query_vec[fqname] = list(df_tfidf.loc[fqname,:])
    row_name.append(fqname)

In [17]:
sim_dict = {}
sim_all = []
i = 0
for fqname in querys_dict.keys():
    sim_dict[fqname] = {}
    q_sim = []
    for fdname in docs_dict.keys():
        sim_dict[fqname][fdname] = cosine_similarity(query_vec[fqname], docs_vec[fdname])
        q_sim.append(sim_dict[fqname][fdname])
    sim_all.append(q_sim)

ZeroDivisionError: float division by zero

In [None]:
sim_all[0]

## Reference

In [None]:
# return sentence
def my_open_files(path_query = "data/queries", path_docs = "data/docs", extension = ".txt"):
    qlf = open("data/query_list.txt")
    dlf = open("data/doc_list.txt")
    
    querys = []
    query_list = []
    for fname in qlf:
        fname = fname.strip("\n")
        file = os.path.join(path_query, fname + extension)
        
        fq = open(file)
        query = []
        for q in fq:
            query.append(q.strip("\n"))
        querys.append(query)
        query_list.append(fname)
        fq.close()

    docs = []
    doc_list = []
    for fname in dlf:
        fname = fname.strip("\n")
        file = os.path.join(path_docs, fname + extension)
        if fname == 'LA072189-0048':
            continue
        
        fd = open(file)
        doc = []
        for d in fd:
            doc.append(d.strip("\n"))
        docs.append(doc)
        doc_list.append(fname)
        fd.close()

    dlf.close()
    qlf.close()

    return querys, docs, query_list, doc_list

In [None]:
def my_tfidf(all_input):
    col_name = []
    all_data_split = []
    print(all_input[0][0])
    for i in range(len(all_input)):
        all_data_split.append(all_input[i][0].split())
    
    voc_dic = {}
    score_dict_list = []
    example_dic = {}
    for i in range(len(all_data_split)):
        score_dic = {}
        updated = {}
        for j in range(len(all_data_split[i])):
            if(voc_dic.get(all_data_split[i][j]) != None):
                if all_data_split[i][j] not in updated:
                    voc_dic[all_data_split[i][j]] += 1.0
                    updated[all_data_split[i][j]] = 1.0
            else:
                voc_dic[all_data_split[i][j]] = 1.0
                updated[all_data_split[i][j]] = 1.0
            if score_dic.get(all_data_split[i][j]) == None:
                score_dic[all_data_split[i][j]] = 1.0
            else:
                score_dic[all_data_split[i][j]] += 1.0
            example_dic[all_data_split[i][j]] = 0
        score_dict_list.append(score_dic)
    col_name =list(voc_dic.keys())
    
    #idf
    idf_dic = {}
    N = len(all_input)
    for voc in voc_dic:
        idf_dic[voc] = math.log(1+((N+1)/(voc_dic[voc] +1)))
    result = []
    
    for i in range(N):
        for value in score_dict_list[i]:
            example_dic[value] = idf_dic[value] * (1+math.log(score_dict_list[i][value]))
        score = list(example_dic.values())
        result.append(score)
        for value in score_dict_list[i]:
            example_dic[value] = 0.0
    resul_score = normalize(result, norm='l2')
    return resul_score, col_name

In [None]:
# q, d, ql, dl = my_open_files()
# # print(ql[:5])
# # print(dl[:5])
# # print(q[0])
# # print(d[0])
# all_input = q + d
# tfidf_score, col_name = my_tfidf(all_input)

In [None]:
# df_tfidf = pd.DataFrame(tfidf_score, columns=col_name, index=(ql + dl))
# df_tfidf