In [46]:
import numpy as np
import math

In [47]:
query_dict = {}
doc_dict = {}
doc_avglen = 0
root_path = './data'

with open(f'{root_path}/query_list.txt', 'r') as file:
    query_list = file.read()
    for file_name in query_list.split('\n'): 
        try:
            file_path = f'{root_path}/queries/{file_name}.txt'
            with open(file_path, 'r') as f:
                query_dict[file_name] = f.read().split(' ')
        except Exception as e:
            print(e)

with open(f'{root_path}/doc_list.txt', 'r') as file:
    doc_list = file.read()
    for file_name in doc_list.split('\n'):
        try:
            file_path = f'{root_path}/docs/{file_name}.txt'
            with open(file_path, 'r') as f:
                doc_dict[file_name] = f.read().split(' ')
                doc_avglen += len(doc_dict[file_name])
        except Exception as e:
            print(e)

doc_avglen /= len(doc_dict)

print(len(query_dict))
print(len(doc_dict))

[Errno 2] No such file or directory: './data/queries/.txt'
[Errno 2] No such file or directory: './data/docs/.txt'
150
30000


In [48]:
query_word = []
for _, value in query_dict.items():
    query_word.append(value)
query_word = sum(query_word, [])

In [49]:
### Calculate word count & total word length
all_word_dict = {}   # word: count
all_word_len = 0 # 計算 total word length in document

for doc_name, doc in doc_dict.items():
    all_word_len += len(doc)
    for word in doc:
        if all_word_dict.get(word, 0):
            all_word_dict[word] += 1
        else:
            all_word_dict[word] = 1
print(len(all_word_dict))

# filter word
new_word_dict = {}
for word, count in all_word_dict.items():
    if count > 9 or word in query_word:
        new_word_dict[word] = count
print(len(new_word_dict))

154240
31389


In [50]:
# doc/word to id
doc2id, id2doc, word2id, id2word = {}, {}, {}, {}
for i, doc_name in enumerate(list(doc_dict.keys())):
    id2doc[i] = doc_name
    doc2id[doc_name] = i
    
for i, word in enumerate(list(new_word_dict.keys())):
    word2id[word] = i
    id2word[i] = word

In [51]:
tf_matrix = np.zeros((len(doc_dict), len(new_word_dict)))

for doc_name, doc in doc_dict.items():
    i = doc2id[doc_name]
    for word in doc:
        if new_word_dict.get(word, 0):
            j = word2id[word]
            tf_matrix[i][j] += 1

In [52]:
### Calculate BG word
BG_matrix = np.zeros(len(new_word_dict))
for word, count in new_word_dict.items():
    BG_matrix[word2id[word]] = count / all_word_len
    
print(len(BG_matrix))

31389


In [53]:
word_dict = {}      # word_dict[doc][word] = 1
df_dict = {}        # df[word]
idf_dict = {}       # idf[word]

for doc_name, doc in doc_dict.items():
    word_dict[doc_name] = {}
    for word in set(doc):               # 利用 set 過濾重複字，增加效能
        word_dict[doc_name][word] = 1   # 將找到的 word 根據 document 放在 key

for word in list(new_word_dict.keys()):   # 計算 df (idf 的分母)
    freq = 0 
    for doc_name in word_dict.keys():
        if word_dict[doc_name].get(word, 0): # 若在這篇文章有找到此 word ，則 +1
            freq += 1

    df_dict[word] = freq

for word, df_freq in df_dict.items():   # 計算 idf
    idf_dict[word] = math.log(((len(doc_dict)+1) / (df_freq+1)) , 10)  # smooth 公式

In [54]:
doc_len, word_len, query_len = len(doc_dict), len(new_word_dict), len(query_dict)
top = 5

In [55]:
def bestMatchModel(query_list):
    K1, b = 0.8, 0.7
    score_dict = {}
    
    for doc_name, doc in doc_dict.items():
        doc_id = doc2id[doc_name]
        score = 0
        
        for word in query_list:
            word_id = word2id[word]
            tf = tf_matrix[doc_id][word_id]
            doc_weight = ((K1 + 1) * tf) / (K1 * ((1 - b) + (b * len(doc) / doc_avglen)) + tf)
            idf_weight = pow(idf_dict[word], 2) if idf_dict.get(word, 0) else 0
            score += doc_weight * idf_weight
            
        score_dict[doc_name] = score
    
    rank = sorted(score_dict.items(), key=lambda x: x[1], reverse = True) # 根據分數做排序
    
    return rank[0:top]

In [56]:
# use BM25 find relevent document
relevant_doc = {}
for query_id, _query in enumerate(list(query_dict.values())):
    doc_rank = bestMatchModel(_query)
    
    rel_matrix = np.zeros((top, word_len))
    for i, doc in enumerate(doc_rank):
        doc_index = doc2id[doc[0]]
        rel_matrix[i] = tf_matrix[doc_index]
    
    relevant_doc[query_id] = rel_matrix

In [57]:
def EStep(Psmm):
    alpha = 0.3
    
    Tsmm = ((1 - alpha) * Psmm) / ((1 - alpha) * Psmm + alpha * BG_matrix)

    return Tsmm

In [58]:
def MStep(Psmm, Tsmm):
    for k in range(0, query_len):
        Psmm[k] = 0
    
    for query_id, rel_matrix in relevant_doc.items():
        Psmm[query_id] = (np.sum(rel_matrix, axis=0) * Tsmm[query_id]) / np.sum(rel_matrix)
    
    return Psmm

In [59]:
def SMM_algorithm():
    # initialize
    Psmm = np.random.random([query_len, word_len])
    for k in range(0, query_len):
        normalization = sum(Psmm[k, :])
        for i in range(0, word_len):
            Psmm[k, i] /= normalization
    
    # SMM iteration
    Iteration = 30
    for i in range(Iteration):
        Tsmm = EStep(Psmm)
        Psmm = MStep(Psmm, Tsmm)
    
    print('done')
    return Psmm

In [60]:
Psmm = SMM_algorithm()

done


In [66]:
def KL_divergence(query_id, query_list):
    alpha, beta, gamma = 0.15, 0.8, 0.3
    score_dict = {}
    
    q_ulm = np.zeros(word_len)
    for word in query_list:
        word_id = word2id[word]
        q_ulm[word_id] = (1/len(query_list))
        
    KL_1 = alpha * q_ulm + beta * Psmm[query_id] + (1-alpha-beta) * BG_matrix
    
    for doc_name, doc in doc_dict.items():
        doc_id = doc2id[doc_name]
        KL_2 = gamma * (tf_matrix[doc_id] / len(doc)) + (1-gamma) * BG_matrix
        
        score = np.sum(KL_1 * np.log(KL_2))
        score_dict[doc_name] = score
    
    rank = sorted(score_dict.items(), key=lambda x: x[1], reverse = True)
    return rank

In [67]:
f = open('ans_smm.txt', 'w')
string = 'Query,RetrievedDocuments\n'

query_id = 0
for _id, _query in query_dict.items():
    rank = KL_divergence(query_id, _query)
    query_id += 1

    string += _id + ','
    for doc in rank[0:5000]:
        string += doc[0] + ' '
    string += '\n'
    
f.write(string)
f.close()
print('done')

done
