In [76]:
import numpy as np
import math
from sklearn.preprocessing import normalize

In [77]:
query_dict = {}
doc_dict = {}
doc_avglen = 0
root_path = './data'

with open(f'{root_path}/query_list.txt', 'r') as file:
    query_list = file.read()
    for file_name in query_list.split('\n'): 
        try:
            file_path = f'{root_path}/queries/{file_name}.txt'
            with open(file_path, 'r') as f:
                query_dict[file_name] = f.read().split(' ')
        except Exception as e:
            print(e)

with open(f'{root_path}/doc_list.txt', 'r') as file:
    doc_list = file.read()
    for file_name in doc_list.split('\n'):
        try:
            file_path = f'{root_path}/docs/{file_name}.txt'
            with open(file_path, 'r') as f:
                doc_dict[file_name] = f.read().split(' ')
                doc_avglen += len(doc_dict[file_name])
        except Exception as e:
            print(e)

doc_avglen /= len(doc_dict)

print(len(query_dict))
print(len(doc_dict))

[Errno 2] No such file or directory: './data/queries/.txt'
[Errno 2] No such file or directory: './data/docs/.txt'
150
30000


In [78]:
query_word = []
for _, value in query_dict.items():
    query_word.append(value)
query_word = sum(query_word, [])

In [79]:
# global word
all_word_dict = {}
for doc_name, doc in doc_dict.items():
    for word in doc:
        if all_word_dict.get(word, 0):
            all_word_dict[word] += 1
        else:
            all_word_dict[word] = 1
print(len(all_word_dict))

# filter word
all_word_list = []
for word, count in all_word_dict.items():
    if count > 10 or word in query_word:
        all_word_list.append(word)
print(len(all_word_list))

154240
29735


In [88]:
# doc/word to id
doc2id, word2id, id2word = {}, {}, {}
for i, doc_name in enumerate(list(doc_dict.keys())):
    doc2id[doc_name] = i
for i, word in enumerate(all_word_list):
    word2id[word] = i
    id2word[i] = word

In [81]:
tf_dict = {}   # tf[doc][word]
    
for doc_name, doc in doc_dict.items(): # 讀取 key value
    tf_dict[doc_name] = {}
    for word in doc:
        if tf_dict[doc_name].get(word, 0):  # 計算 tf
            tf_dict[doc_name][word] += 1
        else:
            tf_dict[doc_name][word] = 1

In [82]:
word_dict = {}      # word_dict[doc][word] = 1
df_dict = {}        # df[word]
idf_dict = {}       # idf[word]

for doc_name, doc in doc_dict.items():
    word_dict[doc_name] = {}
    for word in set(doc):               # 利用 set 過濾重複字，增加效能
        word_dict[doc_name][word] = 1   # 將找到的 word 根據 document 放在 key

for word in all_word_list:   # 計算 df (idf 的分母)
    freq = 0 
    for doc_name in word_dict.keys():
        if word_dict[doc_name].get(word, 0): # 若在這篇文章有找到此 word ，則 +1
            freq += 1

    df_dict[word] = freq

for word, df_freq in df_dict.items():   # 計算 idf
    idf_dict[word] = math.log(((len(doc_dict)+1) / (df_freq+1)) , 10)  # smooth 公式

In [None]:
tf_idf = np.zeros((len(doc_dict), len(all_word_list)))
print(tf_idf.shape)

for i, doc_name in enumerate(list(doc_dict.keys())):  # 計算 tf_idf
    for j, word in enumerate(all_word_list):
        if word in doc_dict[doc_name]:
            tf_idf[i, j] = (1 + math.log(tf_dict[doc_name][word], 10)) * idf_dict[word]

In [83]:
tf_idf = np.load('./numpy/tf_idf.npy')
tf_idf = normalize(tf_idf, axis=1, norm='l2')

In [84]:
def bestMatchModel(query_list):
    K1, b = 0.8, 0.7
    score_dict = {}
    
    for doc_name, doc in doc_dict.items():
        doc_term_len = len(doc)
        score = 0
        
        for word in query_list:
            tf = tf_dict[doc_name][word] if tf_dict[doc_name].get(word, 0) else 0 # 將 word 轉成 score
            doc_weight = ((K1 + 1) * tf) / (K1 * ((1 - b) + (b * doc_term_len / doc_avglen)) + tf)
            idf_weight = pow(idf_dict[word], 2) if idf_dict.get(word, 0) else 0
            score += doc_weight * idf_weight
            
        score_dict[doc_name] = score
    
    rank = sorted(score_dict.items(), key=lambda x: x[1], reverse = True) # 根據分數做排序
    return rank

In [117]:
def bestMatchModel_again(query_vec):
    K1, b = 0.8, 0.75
    score_dict = {}
    query_dict = {}
    for index, score in enumerate(query_vec):
        if score == 0:
            continue
        query_dict[id2word[index]] = score
    
    for doc_name, doc in doc_dict.items():
        doc_term_len = len(doc)
        score = 0
        
        for word, score in query_dict.items():
            tf = tf_dict[doc_name][word] if tf_dict[doc_name].get(word, 0) else 0 # 將 word 轉成 score
            doc_weight = ((K1 + 1) * tf) / (K1 * ((1 - b) + (b * doc_term_len / doc_avglen)) + tf)
            idf_weight = pow(idf_dict[word], 2) if idf_dict.get(word, 0) else 0
            score += doc_weight * idf_weight
            
        score_dict[doc_name] = score
    
    rank = sorted(score_dict.items(), key=lambda x: x[1], reverse = True)
    return rank

In [122]:
def rocchioAlgorithm(query_list, rank):
    alpha, beta, gamma = 1, 0.8, 0.002
    top, iteration = 7, 3
    
    # first query
    query_vec = np.zeros(len(all_word_list))
    for word in query_list:
        word_id = word2id[word]
        query_vec[word_id] = idf_dict[word]
    
    # iterate
    for i in range(iteration):
        rel_doc_vec, non_rel_doc_vec = np.zeros(len(all_word_list)), np.zeros(len(all_word_list))
        score_dict = {}
        
        # select top 5 relevant documents
        for doc_name in rank[0:top]:
            doc_id = doc2id[doc_name[0]]
            rel_doc_vec += np.array(tf_idf[doc_id])
        rel_doc_vec /= top

        # select last 1 non-relevant documents
        for doc_name in rank[-1:]:
            doc_id = doc2id[doc_name[0]]
            non_rel_doc_vec = np.array(tf_idf[doc_id])

        # update query vector
        query_vec = alpha * query_vec + beta * rel_doc_vec - gamma * non_rel_doc_vec
        
        # cosine similarity
        for doc_id, doc_name in enumerate(list(doc_dict.keys())):
            doc_vec = tf_idf[doc_id]
            score_dict[doc_name] = np.dot(query_vec, doc_vec) 

        rank = sorted(score_dict.items(), key=lambda x: x[1], reverse = True)
        
    return rank

In [127]:
f = open('ans_bm25.txt', 'w')
string = 'Query,RetrievedDocuments\n'

for _id, _query in query_dict.items():
    first_rank = bestMatchModel(_query)
    rank = rocchioAlgorithm(_query, first_rank)
    
    string += _id + ','
    for doc in rank[0:5000]:
        string += doc[0] + ' '
    string += '\n'
    
f.write(string)
f.close()
print('done')

done
