## TF-IDF schema1

### tf, df, dictionary

In [1]:
from collections import Counter

def dic_(allDocument): # alldocument=collection: list
    long_str = " ".join(allDocument)
    b=list(set(list(long_str.split())))
    return sorted(b)

def tf_(doc_que): # term frequqency
    counts = Counter(list(doc_que.split()))
    return dict(counts)

def df_(term, allDocuments):  # df: no. of occurance of a term in whole collection
    dic=dict.fromkeys(term, 0)
    for i in allDocuments:
        n=0
        for word in term:
            if word in i.split():         
                dic[word]=dic[word]+1
            n+=1  
    return dic  


### tf-idf for doc

In [2]:
import math

def tfidf_(query_s,c,df,allDocument): # query_s: Series (df['col']), allDocument:list
    
    all_tfidf=[]
    for query in query_s:
        q_tf=tf_(query)
        mac=q_tf.values()
        max_tf=max(q_tf.values())
        tfidf=[]
        for word in c:
            if word not in q_tf: # query item does not shown in doc collection
                    value=0
            else:
                value=(1+math.log10(q_tf.get(word)))/(1+math.log10(max_tf))*math.log10(len(allDocument)/df.get(word))
           
            tfidf.append(value)
        all_tfidf.append(tfidf)
    return np.array(all_tfidf) #vector

### tf-idf for query

In [3]:
def tfidf_q(query_s,c,df,allDocument): # query_s: Series (df['col']), allDocument:list
    
    all_tfidf=[]
    for query in query_s:
        q_tf=tf_(query)
        mac=q_tf.values()
        max_tf=max(q_tf.values())
        tfidf=[]
        for word in c:
            if word not in q_tf: # query item does not shown in doc collection
                    value=0
            else:
                value=q_tf.get(word)
           
            tfidf.append(value)
        all_tfidf.append(tfidf)
    return np.array(all_tfidf) #vector

### cosine similarity

In [4]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a,b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

### Import Data

In [5]:
import pandas as pd
import numpy as np

df_query=pd.read_excel('df_query_pre.xlsx')
df_doc=pd.read_excel('df_doc_pre.xlsx')

doc_col=df_doc.loc[: , "Text_tok"].tolist() #collection: list

### Import dictionary & document frequency

In [6]:
dic_df=pd.read_csv('dictionary.csv')
doc_fre_df=pd.read_csv('document_frequency.csv')

dic=dic_df['0'].tolist()
doc_fre=pd.Series(doc_fre_df.df.values,index=doc_fre_df.term).to_dict()

### Implementation: TFIDF

In [15]:
import time
start = time.time()

q_tfidf=tfidf_q(df_query["Text_tok"],dic,doc_fre,doc_col) # query tf-idf
d_tfidf=tfidf_(df_doc["Text_tok"],dic,doc_fre,doc_col) # doc tf-idf

end = time.time()
print(end - start)   # seconds

46.3126003742218


### Implementation: consine similarity

In [17]:
import time
start = time.time()

sim_all={}
for q in range(len(q_tfidf)):
    sim={}
    sim_all[q]=sim
    
    for d in range(len(d_tfidf)):
        sim[d]=cosine_similarity(d_tfidf[d],q_tfidf[q])
    #print(sim)
end = time.time()
print(end - start)

1157.573005437851


In [18]:
df_similarity= pd.DataFrame(data=sim_all)
df_similarity['d_index']=df_similarity.index
df1 = df_similarity.reset_index(drop=True)
df2 = pd.melt(df1, id_vars=["d_index"], var_name="q_index", value_name="similarity")
df2=df2.sort_values(by=['q_index','similarity'],ascending=[True,False])

In [19]:
df_query['q_index']=df_query.index
df_doc['d_index']=df_doc.index
df2=pd.merge(df2,df_query[['Query id','q_index']],on='q_index',how='left')
df2=pd.merge(df2,df_doc[['Doc id','d_index']],on='d_index',how='left')
df_similarity_final=df2[['Query id', 'Doc id', 'similarity']]

## Measurement

In [20]:
rel_label = pd.read_excel("all2-1-0.qrel.xlsx")
rel_similarity=pd.merge(df_similarity_final[['Query id','Doc id']],rel_label, on=['Query id','Doc id'], how='left')
rel_similarity=rel_similarity.fillna(value=0)
rel_similarity=rel_similarity.drop(['Doc id'],axis=1)
s = rel_similarity.groupby('Query id')['Rel_level'].apply(lambda x: x.tolist())
correct_input=s.values

### MAP

In [21]:
import numpy as np 
# presicion 
def precision(y_true):
#y_true: this list reordered y_true list
    p=[]
    a=0
    for i in range(len(y_true)):
        if y_true[i]==1:
            a+=1
            precision=a/(i+1)
            p.append(precision)
    #print(p)
    return p

# average precision
def AP(y_true):
    p=precision(y_true)
    if len(p)!=0:
        AP=sum(p)/len(p)
    else:
        AP=0
    #print(AP)
    return AP


def MAP(list_true):# query list 
    ap=[]
    for i in range(len(list_true)):
        ap.append(AP(list_true[i]))
    total=sum(ap)
    #print(ap)
    return total/(len(list_true))

print(MAP(correct_input))

0.10479293051214097


### nDCG1

In [22]:
import numpy as np 
def ndcg1(correct):
    dcg=0
    for i in range(len(correct)):
        gain=2**correct[i]-1
        discounts=np.log2(i+2)
        dcg=dcg+gain/discounts
    
    #idcg
    order = np.argsort(correct)
    sort_correct = np.take(correct, order[::-1])
    idcg=0
    for i in range(len(sort_correct)):
        gain=2**sort_correct[i]-1
        discounts=np.log2(i+2)
        idcg=idcg+gain/discounts
    #print(dcg)
    #print(idcg)
    return dcg/idcg

def mean_ndcg1(correct_list):
    b=0
    for correct in correct_list:
        a=ndcg1(correct)
        b=b+a
    return b/len(correct_list)


### nDCG2

In [23]:
def ndcg2(correct):
    dcg=0
    for i in range(len(correct)):
        gain=correct[i]
        discounts=np.log2(i+2)
        dcg=dcg+gain/discounts
    
    #idcg
    order = np.argsort(correct)
    sort_correct = np.take(correct, order[::-1])
    idcg=0
    for i in range(len(sort_correct)):
        gain=sort_correct[i]
        discounts=np.log2(i+2)
        idcg=idcg+gain/discounts
    #print(dcg)
    #print(idcg)
    return dcg/idcg

def mean_ndcg2(correct_list):
    b=0
    for correct in correct_list:
        a=ndcg2(correct)
        b=b+a
    return b/len(correct_list)


In [24]:
print('nDCG1:',mean_ndcg1(correct_input))
print('nDCG2:',mean_ndcg2(correct_input))

nDCG1: 0.462497814358
nDCG2: 0.463085521775
