In [112]:
import os, sys
sys.path.append("../Tools")
import numpy as np
import ProcDoc
from Evaluate import EvaluateModel
import math
from math import log


In [113]:
data = {}                # content of document (doc, content)
background_model = {}    # word count of 2265 document (word, number of words)
general_model = {}
query = {}                # query

query_lambda = 0.9
doc_lambda = 0.3
min_factor = 0.0000000001

document_path = "../Corpus/TDT2/SPLIT_DOC_WDID_NEW"
query_path = "../Corpus/TDT2/QUERY_WDID_NEW"
relevance_path = "../Corpus/TDT2/AssessmentTrainSet/AssessmentTrainSet.txt"

np.set_printoptions(threshold=sys.maxsize)

In [114]:
def read_background_dict_n(w_dicts, q_dicts, vocab_size=51253):
    BGTraingSetDict = {}
    BGTraingSetDict_prob = {}
    super_dict = {}
    #add docs to bg model 
    for top, d in w_dicts.iteritems():
        for (k, v) in d.iteritems():
            super_dict[k] = v if k not in super_dict else super_dict[k] + v
    #add query to bg model
    for top, d in q_dicts.iteritems():
        for (k, v) in d.iteritems():
            super_dict[k] = v if k not in super_dict else super_dict[k] + v
    
    for k, c in super_dict.iteritems():
        prob = float(float(c)/vocab_size)
        exp_prob = math.exp(prob)
        #print(k, c, prob)
        BGTraingSetDict[k] = exp_prob
        BGTraingSetDict_prob[k] = prob             
    return BGTraingSetDict, BGTraingSetDict_prob

def read_background_np_n(bg_dict):
    vocab_size = 51253
    obj_vec = np.zeros(vocab_size)
    for k, v in bg_dict.iteritems():
        obj_vec[int(k)] = v
    #print(obj_vec[:1000])
    return obj_vec

In [115]:
def dict2np_n(ori_dict, IDs_list = None, vocab_size=51253):
    num_tar = len(list(ori_dict.keys()))
    obj_vec = np.zeros((num_tar, vocab_size))
    if IDs_list is None:
        IDs_list = list(ori_dict.keys())
    for idx, o_id in enumerate(IDs_list):
        for o_wid, o_wc in ori_dict[o_id].items():
            #if o_wc:
                #print(o_wid, o_wc)
            obj_vec[idx][int(o_wid)] = o_wc
    return obj_vec, IDs_list


In [116]:
def TFIDF(docs_words_dict):
    doc_freq = {}
    total_docs = len(docs_words_dict.keys()) * 1.0
    # compute document frequency
    for doc, words_dict in docs_words_dict.items():
        for word in words_dict.keys():
            if word in doc_freq:
                doc_freq[word] += 1
            else:
                doc_freq[word] = 1
    # tfidf
    set_tfidf = {}
    for doc, words_list in docs_words_dict.items():
        t_doc_tfidf = {}
        for word, tf in words_list.items():
            idf = 1 / log(1 + total_docs / doc_freq[word])
            t_doc_tfidf[word] = (1 + log(tf)) * idf
        set_tfidf[doc] = t_doc_tfidf
    return set_tfidf    

In [117]:
def score_query(qnparr, d):
    dnparr = np.array(docs_words_dict[d].keys())
    intercept = np.intersect1d(dnparr, qnparr)    
    return len(intercept)

In [118]:
#prepare query top 10 doc for feedback module
def score_doc_by_query(query_wordcount_sp, q):
    score_doc_dict = {}
    qnparr = np.array(query_wordcount_sp.keys())
    
    for doc, count in doc_wordcount.items():
        dnparr = np.array(doc_wordcount[doc].keys())
        s = len(np.intersect1d(dnparr, qnparr))
        score_doc_dict.update({doc:s})
    
    sorted_dict = sorted(score_doc_dict.items(), key=lambda kv: kv[1])
    top_10 = sorted_dict[::-1][:10]
    doc_mdl_to_idx = {}
    for k, v in top_10:
        arr_idx = np.where(doc_IDs_t_np==k)
        doc_mdl_to_idx.update({k:arr_idx})
    return doc_mdl_to_idx

In [119]:
#feedback doc model teta_di
def query_feedback_mdl(doc_mdl_to_idx, l_lambda):
    doc_mdl_to_log = {}
    sum_log_di = np.zeros(len(doc_mdl[0]))
    for doc, mdl_idx in  doc_mdl_to_idx.iteritems():
        #print(doc)
        np_ml = np.array(doc_mdl[mdl_idx])
        np_ml[np_ml < min_factor] = min_factor
        dl_log = np.log(np_ml)
        sum_log_di = np.add(dl_log, sum_log_di)
        doc_mdl_to_log.update({doc : dl_log})    

    sum_log_di_norm = sum_log_di/len(doc_mdl_to_idx)    
    u_lambda = l_lambda/(1-l_lambda)
    fb_mdl = np.zeros(len(doc_mdl[0]))
    fb_mdl = np.add(u_lambda*sum_log_di_norm, u_lambda*bg_mdl_log)
    return np.exp(fb_mdl)    

In [120]:
#smooth 
def proccess_query_mdl(query_idx):
    np_query_mdl = np.array(query_mdl[query_idx])
    np_query_mdl[np_query_mdl < min_factor] = min_factor
    return np_query_mdl

In [121]:
def proccess_bg_mdl():
    background_model_np_prob[background_model_np_prob < min_factor] = min_factor
    return np.log(background_model_np_prob)

In [122]:
from itertools import count, takewhile
def frange(start, stop, step):
    return takewhile(lambda x: x< stop, count(start, step))

In [123]:
# document model, calc log(p(x))
data = ProcDoc.read_file(document_path)
doc_wordcount = ProcDoc.doc_preprocess(data)
doc_tfidf = TFIDF(doc_wordcount) 
doc_unigram = ProcDoc.unigram(dict(doc_tfidf))
doc_mdl, doc_IDs = ProcDoc.dict2np(doc_unigram)

# query model
query = ProcDoc.read_file(query_path)
query = ProcDoc.query_preprocess(query)
query_wordcount = {}

for q, q_content in query.items():
    query_wordcount[q] = ProcDoc.word_count(q_content, {})

query_tfidf = TFIDF(query_wordcount) 
query_unigram = ProcDoc.unigram(dict(query_tfidf))
#query_model = query_unigram
query_mdl, query_IDs = ProcDoc.dict2np(query_unigram)

# background_model, calc log(p(x)), p(x)=word_count/vocab_count 
background_model_exp, background_model_prob = read_background_dict_n(doc_wordcount, query_wordcount)
background_model_np_exp = read_background_np_n(background_model_exp)
bg_mdl_log = proccess_bg_mdl()

In [124]:
# document smoothing 
for doc_idx in xrange(doc_mdl.shape[0]):
    doc_vec = doc_mdl[doc_idx]
    doc_mdl[doc_idx] = doc_lambda * doc_vec + (1 - doc_lambda) * background_model_np_prob
    
# query smoothing 
for query_idx in xrange(query_mdl.shape[0]):
    query_vec = query_mdl[query_idx]
    query_mdl[query_idx] =  query_lambda * query_vec + (1 - query_lambda) * background_model_np_prob    
    
np_query_ids = np.array(query_IDs)    
doc_mdl_t, doc_IDs_t = dict2np_n(doc_unigram)
doc_IDs_t_np = np.array(doc_IDs_t)

# prepare the doc mdl
for doc_idx in range(np.shape(doc_mdl)[0]):    
    doc_mdl_vec = doc_mdl[doc_idx]
    doc_mdl_vec[doc_mdl_vec < min_factor] = min_factor    
    doc_mdl[doc_idx] = doc_mdl_vec
    

In [126]:
query_mdl_final, query_final_IDs = ProcDoc.dict2np(query_unigram)

#l_lambda = 0.8
alpha = 0.65

for l_lambda in  frange(0, 1, 0.1):
    for q, idx in query_wordcount.items():   
        query_idx = np.where(np_query_ids==q)
        doc_mdl_to_idx = score_doc_by_query(query_wordcount[q], q)
        #print(s)
        fb_mdl_exp = query_feedback_mdl(doc_mdl_to_idx, l_lambda)    
        np_query_mdl = proccess_query_mdl(query_idx)
        #print(np_query_mdl)
        query_mdl_final[query_idx] = \
            np.add((1-alpha)*np_query_mdl, alpha*fb_mdl_exp)  
    
    #multi fb query mdl and doc mdl - get sorted doc relevane 
    results = np.argsort(-np.dot(query_mdl_final, np.log(doc_mdl.T)), axis = 1)
    #arrange doc ranking by query
    qry_docs_ranking = {}
    for q_idx, q_ID in enumerate(query_IDs):
        docs_ranking = []
        for doc_idx in results[q_idx]:
            docs_ranking.append(doc_IDs[doc_idx])
        qry_docs_ranking[q_ID] = docs_ranking
        
    #evaluate score by compare to relevance     
    eval_mdl = EvaluateModel(relevance_path)
    rel_set = eval_mdl.getAset()
    mAP = eval_mdl.mAP(qry_docs_ranking)
    print(mAP)

Eval: mean Average Precision
0.0154112633394
Eval: mean Average Precision
0.016503497237
Eval: mean Average Precision
0.0183559552589
Eval: mean Average Precision
0.0307635333016
Eval: mean Average Precision
0.350608215229
Eval: mean Average Precision
0.405355498781
Eval: mean Average Precision
0.444158241578
Eval: mean Average Precision
0.480085621133
Eval: mean Average Precision
0.460551245323
Eval: mean Average Precision
0.461046295202
Eval: mean Average Precision
0.461740739646
