In [10]:
import numpy as np
import math

In [11]:
def open_file():
    query = {}
    doc = {}

    with open('../ntust-ir-2020/query_list.txt', 'r') as file:
        query_list = file.read()
        for file_name in query_list.split('\n'):
            try:
                file_path = f'../ntust-ir-2020/queries/{file_name}.txt'
                with open(file_path, 'r') as f:
                    query[file_name] = f.read().lower()
            except Exception as e:
                print(e)

    with open('../ntust-ir-2020/doc_list.txt', 'r') as file:
        doc_list = file.read()
        for file_name in doc_list.split('\n'):
            try:
                file_path = f'../ntust-ir-2020/docs/{file_name}.txt'
                with open(file_path, 'r') as f:
                    doc[file_name] = f.read().lower()
            except Exception as e:
                print(e)

    print(len(query))
    print(len(doc))
    
    return query, doc

In [12]:
def cal_tf(doc_dict):
    tf_dict = {}   # tf[doc][word]
    
    for doc_name, doc in doc_dict.items(): # 讀取 key value
        tf_dict[doc_name] = {}
        for word in doc.split(' '):        # 將 document 的內容拆成 token
            if tf_dict[doc_name].get(word, 0):  # 計算 tf
                tf_dict[doc_name][word] += 1
            else:
                tf_dict[doc_name][word] = 1
        
    return tf_dict

In [13]:
def cal_idf(doc_dict, N):
    word_dict = {}      # word_dict[doc][word] = 1
    all_word_list = []
    df_dict = {}        # df[word]
    idf_dict = {}       # idf[word]
    
    for doc_name, doc in doc_dict.items():
        word_dict[doc_name] = {}
        for word in set(doc.split(' ')):    # 利用 set 過濾重複字，增加效能
            word_dict[doc_name][word] = 1   # 將找到的 word 根據 document 放在 key
            all_word_list.append(word)
    
    for word in list(set(all_word_list)):   # 計算 df (idf 的分母)
        freq = 0 
        for doc_name in word_dict.keys():
            if word_dict[doc_name].get(word, 0): # 若在這篇文章有找到此 word ，則 +1
                freq = freq + 1

        df_dict[word] = freq
    
    for word, df_freq in df_dict.items():   # 計算 idf
        idf_dict[word] = math.log(((N+1) / (df_freq+1)) , 10)  # smooth 公式
        
    return idf_dict

In [14]:
def cal_tf_idf(tf, idf, doc_dict):
    tf_idf_dict = {}  # tf_idf[doc][word]
    
    for doc_name, doc in doc_dict.items():  # 計算 tf_idf
        tf_idf_dict[doc_name] = {}
        for word in doc.split(' '):
            tf_idf_dict[doc_name][word] = (1 + math.log(tf[doc_name][word], 10)) * idf[word]
 
    return tf_idf_dict 

In [15]:
def vectorSpaceModel(query, doc_dict, tf_idf):
    score_dict = {}
    
    for doc_name, doc in doc_dict.items():
        query_vec = [tf_idf[doc_name][word] if tf_idf[doc_name].get(word, 0) else 0 for word in query.split(' ')] # 將 word 轉成 score
        doc_vec = [tf_idf[doc_name][word] if tf_idf[doc_name].get(word, 0) else 0 for word in query.split(' ')] # 將 word 轉成 score
        
        doc_vec_length = 0
        for word, value in tf_idf[doc_name].items():  # 實作 cosine similarity
            doc_vec_length += value * value
        score_dict[doc_name] = np.dot(query_vec, doc_vec) / math.sqrt(doc_vec_length)
    
    rank = sorted(score_dict.items(), key=lambda x: x[1], reverse = True) # 根據分數做排序
    return rank

In [16]:
query_dict, doc_dict = open_file()

[Errno 2] No such file or directory: '../ntust-ir-2020/queries/.txt'
[Errno 2] No such file or directory: '../ntust-ir-2020/docs/.txt'
50
4191


In [17]:
tf = cal_tf(doc_dict)
idf = cal_idf(doc_dict, len(doc_dict))
tf_idf = cal_tf_idf(tf, idf, doc_dict)

In [18]:
f = open('ans.txt', 'w')
string = 'Query,RetrievedDocuments\n'

for _id, _query in query_dict.items():
    rank = vectorSpaceModel(_query, doc_dict, tf_idf)

    string += _id + ','
    for doc in rank:
        string += doc[0] + ' '
    string += '\n'
    
f.write(string)
f.close()
print('done')

done
