# IR - HW1 Vector Space Model
* In this project, we will have  
    * 50 Queries  
    * 4191 Documents

Our goal is to implement a vector space model, and print out the ranking results for all of the queries.
https://www.kaggle.com/t/7f84706b7b074267ae314582825fb725


In [1]:
import numpy as np
import pandas as pd
import math
import os
import time

# from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

## Open files

In [2]:
# return word vector
def open_files(path_query = "data/queries", path_docs = "data/docs", extension = ".txt"):
    qlf = open("data/query_list.txt")
    dlf = open("data/doc_list.txt")
    
    querys = {}
    for fname in qlf:
        fname = fname.strip("\n")
        file = os.path.join(path_query, fname + extension)
        
        fq = open(file)
        query = []
        for q in fq:
            q = q.strip("\n").split(" ")
            for term in q:
                query.append(term)
        querys[fname] = query
        fq.close()

    docs = {}
    for fname in dlf:
        fname = fname.strip("\n")
        file = os.path.join(path_docs, fname + extension)
        
        fd = open(file)
        doc = []
        for d in fd:
            d = d.strip("\n").split(" ")
            for term in d:
                doc.append(term)
        docs[fname] = doc
        fd.close()

    dlf.close()
    qlf.close()

    return querys, docs

In [3]:
# return sentence
def open_files2(path_query = "data/queries", path_docs = "data/docs", extension = ".txt"):
    qlf = open("data/query_list.txt")
    dlf = open("data/doc_list.txt")
    
    querys = {}
    for fname in qlf:
        fname = fname.strip("\n")
        file = os.path.join(path_query, fname + extension)
        
        fq = open(file)
        query = []
        for q in fq:
            query.append(q.strip("\n"))
        querys[fname] = query
        fq.close()

    docs = {}
    for fname in dlf:
        fname = fname.strip("\n")
        file = os.path.join(path_docs, fname + extension)
        
        fd = open(file)
        doc = []
        for d in fd:
            doc.append(d.strip("\n"))
        docs[fname] = doc
        fd.close()

    dlf.close()
    qlf.close()

    return querys, docs

## TF-IDF

In [4]:
def tfidf(all_dict):
    
    tf = {}
    ni = {}
    for name, terms in all_dict.items():
        tf[name] = {}
        check_term = {}
        for term in terms:
            # ni
            try:
                if term not in check_term:
                    ni[term] += 1
                    check_term[term] = True
            except:
                ni[term] = 1
            # TF
            try:
#                 tf[name][term] += 1 / len(terms)
                tf[name][term] += 1
            except:
#                 tf[name][term] = 1 / len(terms)
                tf[name][term] = 1
    
    # IDF
    idf = {}
    # score dict initial
    score_dict = {}
    # col name
    col_name = []
    for term, times in ni.items():
        # smooth idf :prevent zero divisions
        idf[term] = math.log( ((1 + len(all_dict)) / (1 + times)) ) + 1
        score_dict[term] = 0.0
        col_name.append(term)
    
    # TFIDF
    tfidf_dict = {}
    tfidf_score = []
    # index
    index = []
    for name, _ in all_dict.items():
        index.append(name)
        tfidf_dict[name] = {}
        for term, tf_score in tf[name].items():
            # sublinear_tf: replace tf with 1 + log(tf).
            sublinear_tf = (1 + math.log(tf_score))
            tfidf_dict[name][term] = sublinear_tf * idf[term]
            score_dict[term] = sublinear_tf * idf[term]
        tfidf_score.append(list(score_dict.values()))
        # Clean up score dict
        for term in tf[name].keys():
            score_dict[term] = 0.0
    tfidf_result = normalize(tfidf_score, norm='l2')
    
    return tfidf_dict, tfidf_result, col_name, index

## Vector space model
把算出來的COSINE SIM排序

In [5]:
def cosine_similarity(vec1, vec2):
    dot = sum(v1 * v2 for v1, v2 in zip(vec1, vec2))
    norm1 = math.sqrt(sum(v1 * v1 for v1 in vec1))
    norm2 = math.sqrt(sum(v2 * v2 for v2 in vec2))
    
    cos_sim = dot / ((norm1 * norm2) + 1)
    return cos_sim

In [6]:
def cosine(vec1, vec2):
    dot = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    cos = dot / (norm1 * norm2 + 1)
    return cos

In [7]:
# test cosine similarity
vec1 = [1, 2, 3]
vec2 = [3, 4, 5]
cosine_similarity(vec1, vec2)

0.9469175119839679

In [8]:
cosine(vec1, vec2)

0.9469175119839679

## Test section

In [9]:
querys_dict, docs_dict = open_files()
# print(docs_dict["FBIS3-23"])

all_dict = querys_dict.copy()
all_dict.update(docs_dict)

In [10]:
start = time.time()
tfidf_dict, tfidf_result, col_name, index = tfidf(all_dict)
print(time.time() - start)

19.000770568847656


In [11]:
df_tfidf = pd.DataFrame(tfidf_result, columns=col_name, index=index)
df_tfidf.loc['LA072189-0048', :]

intern       0.0
organ        0.0
crime        0.0
poliomyel    0.0
post         0.0
            ... 
14600        0.0
holcomb      0.0
reigel       0.0
verna        0.0
caraccilo    0.0
Name: LA072189-0048, Length: 59680, dtype: float64

In [12]:
df_tfidf

Unnamed: 0,intern,organ,crime,poliomyel,post,polio,hubbl,telescop,achiev,endang,...,hammanskra,mavuso,mzondi,jabulani,14100,14600,holcomb,reigel,verna,caraccilo
301,0.418991,0.557749,0.716493,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
302,0.000000,0.000000,0.000000,0.728294,0.379738,0.570427,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
303,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.678101,0.662881,0.31744,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
304,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.518474,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
305,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LA123090-0026,0.000000,0.110526,0.000000,0.000000,0.000000,0.219452,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
LA123189-0136,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.037941,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
LA123190-0040,0.000000,0.000000,0.020386,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.051888,0.051888,0.051888,0.051888,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
LA123190-0062,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [13]:
tfidf_dict['FBIS3-10082']

{'languag': 2.722106749482256,
 'f': 3.909065807862998,
 'p': 2.1606080764113824,
 '105': 2.7570096211633164,
 'spanish': 3.5875990323397833,
 'articl': 2.452059471079455,
 'type': 2.43014624364874,
 'bfn': 2.3952927642476762,
 'text': 2.28632316498767,
 'santa': 4.020071341859259,
 'fe': 5.241916270951316,
 'de': 8.515344216964655,
 'bogota': 4.875453320646422,
 '28': 3.4021475825369007,
 'feb': 3.815455868106091,
 'dpa': 6.644739934022418,
 'today': 2.5360542545296596,
 'colombian': 9.280199333745513,
 'prosecutor': 9.19525348152518,
 'gener': 5.284136814679687,
 'gustavo': 5.591590019431066,
 'greiff': 15.530311770253034,
 'said': 1.4810974708009825,
 'u': 4.457328439245691,
 'govern': 3.095460622419392,
 'interest': 4.543208361062324,
 'suppli': 5.115485741797553,
 'evid': 8.02701368171311,
 'condemn': 4.708399235983255,
 'chief': 2.854507985648194,
 'drug': 5.696100369405269,
 'traffick': 7.0177111196256154,
 'mafia': 4.271385770140165,
 'trust': 4.127043461411427,
 'justic': 7.81

In [14]:
df_tfidf.loc['FBIS3-10082',]

intern       0.000000
organ        0.000000
crime        0.056907
poliomyel    0.000000
post         0.000000
               ...   
14600        0.000000
holcomb      0.000000
reigel       0.000000
verna        0.000000
caraccilo    0.000000
Name: FBIS3-10082, Length: 59680, dtype: float64

In [15]:
list(tfidf_dict['301'].values())

[1.9895105481615893, 2.6483757801605208, 3.4021475825369007]

In [16]:
list(tfidf_dict['301'].keys())

['intern', 'organ', 'crime']

In [17]:
tfidf_dict['LA123190-0065'].values()

dict_values([8.500388182565267, 3.093208671059705, 9.926586709367426, 10.74466104549039, 3.7484977143049343, 5.421774511516122, 3.4021475825369007, 9.931603817014764, 14.340369710660864, 20.48893781603049, 7.892507807832989, 6.872677506437524, 8.548234570150017, 9.915548328151633, 2.7090004019769554, 7.910373030805676, 9.113361688224408, 9.113361688224408, 4.996081308435036, 5.696100369405269, 3.3934099885671096, 3.142190058099975, 3.989238712673065, 2.6301603402691796, 7.801781792568596, 6.9427783014280635, 6.6111630536354316, 8.374425978630418, 6.669635671036137, 1.8336401774548046, 1.459591265580019, 5.524148738635533, 2.7420940904068685, 7.927520129935346, 5.491983774455558, 2.739405916745068, 4.573666642013098, 6.5195767910684115, 2.3943417418269726, 5.4253884876599585, 8.960722612524128, 6.713732805509369, 2.7103033337573716, 2.36069370770874, 2.552620066822428, 5.0562787589604365, 5.691218942294385, 4.661442252895484, 5.275508755050778, 12.115425649481297, 3.5005876553501536, 2.

In [18]:
sim_dict = {}
sim_score = []
for fqname in querys_dict.keys():
    sim_dict[fqname] = {}
    query_vec = list(tfidf_dict[fqname].values())
    query_term = list(tfidf_dict[fqname].keys())
    for fdname in docs_dict.keys():
        doc_vec = []
        for term in query_term:
            doc_vec.append(df_tfidf.loc[fdname, term])
        sim_dict[fqname][fdname] = cosine(query_vec, doc_vec)

In [19]:
sim_dict['301']

{'FBIS3-10082': 0.15242057352990052,
 'FBIS3-10231': 0.0730021479555938,
 'FBIS3-10243': 0.07036260365358846,
 'FBIS3-10285': 0.06642525660877155,
 'FBIS3-10291': 0.16854466033252988,
 'FBIS3-10302': 0.0,
 'FBIS3-10397': 0.21813275762575976,
 'FBIS3-10433': 0.04578113107959054,
 'FBIS3-10451': 0.0,
 'FBIS3-10464': 0.04578113107959054,
 'FBIS3-10491': 0.24192308739366208,
 'FBIS3-10501': 0.0,
 'FBIS3-10506': 0.0,
 'FBIS3-10535': 0.0,
 'FBIS3-10551': 0.0,
 'FBIS3-10552': 0.13178376441566125,
 'FBIS3-10622': 0.14322084891962125,
 'FBIS3-10632': 0.0,
 'FBIS3-10633': 0.0,
 'FBIS3-10697': 0.0,
 'FBIS3-10698': 0.0,
 'FBIS3-10721': 0.08541899348226303,
 'FBIS3-10753': 0.0,
 'FBIS3-10805': 0.14322084891962125,
 'FBIS3-10910': 0.24343236986973738,
 'FBIS3-10937': 0.11043097424018716,
 'FBIS3-11003': 0.16663228869422345,
 'FBIS3-11058': 0.07749807554029316,
 'FBIS3-11069': 0.0,
 'FBIS3-11095': 0.1578321377068733,
 'FBIS3-11099': 0.12708731522277125,
 'FBIS3-11106': 0.07492398901720551,
 'FBIS3-11

## Reference

In [20]:
# return sentence
def my_open_files(path_query = "data/queries", path_docs = "data/docs", extension = ".txt"):
    qlf = open("data/query_list.txt")
    dlf = open("data/doc_list.txt")
    
    querys = []
    query_list = []
    for fname in qlf:
        fname = fname.strip("\n")
        file = os.path.join(path_query, fname + extension)
        
        fq = open(file)
        query = []
        for q in fq:
            query.append(q.strip("\n"))
        querys.append(query)
        query_list.append(fname)
        fq.close()

    docs = []
    doc_list = []
    for fname in dlf:
        fname = fname.strip("\n")
        file = os.path.join(path_docs, fname + extension)
        if fname == 'LA072189-0048':
            continue
        
        fd = open(file)
        doc = []
        for d in fd:
            doc.append(d.strip("\n"))
        docs.append(doc)
        doc_list.append(fname)
        fd.close()

    dlf.close()
    qlf.close()

    return querys, docs, query_list, doc_list

In [21]:
def my_tfidf(all_input):
    col_name = []
    all_data_split = []
    print(all_input[0][0])
    for i in range(len(all_input)):
        all_data_split.append(all_input[i][0].split())
    
    voc_dic = {}
    score_dict_list = []
    example_dic = {}
    for i in range(len(all_data_split)):
        score_dic = {}
        updated = {}
        for j in range(len(all_data_split[i])):
            if(voc_dic.get(all_data_split[i][j]) != None):
                if all_data_split[i][j] not in updated:
                    voc_dic[all_data_split[i][j]] += 1.0
                    updated[all_data_split[i][j]] = 1.0
            else:
                voc_dic[all_data_split[i][j]] = 1.0
                updated[all_data_split[i][j]] = 1.0
            if score_dic.get(all_data_split[i][j]) == None:
                score_dic[all_data_split[i][j]] = 1.0
            else:
                score_dic[all_data_split[i][j]] += 1.0
            example_dic[all_data_split[i][j]] = 0
        score_dict_list.append(score_dic)
    col_name =list(voc_dic.keys())
    
    #idf
    idf_dic = {}
    N = len(all_input)
    for voc in voc_dic:
        idf_dic[voc] = math.log(1+((N+1)/(voc_dic[voc] +1)))
    result = []
    
    for i in range(N):
        for value in score_dict_list[i]:
            example_dic[value] = idf_dic[value] * (1+math.log(score_dict_list[i][value]))
        score = list(example_dic.values())
        result.append(score)
        for value in score_dict_list[i]:
            example_dic[value] = 0.0
    resul_score = normalize(result, norm='l2')
    return resul_score, col_name

In [22]:
# q, d, ql, dl = my_open_files()
# # print(ql[:5])
# # print(dl[:5])
# # print(q[0])
# # print(d[0])
# all_input = q + d
# tfidf_score, col_name = my_tfidf(all_input)

In [23]:
# df_tfidf = pd.DataFrame(tfidf_score, columns=col_name, index=(ql + dl))
# df_tfidf