## SVM - Random Projection

### tf, df, dictionary

In [47]:
from collections import Counter

def dic_(allDocument): # alldocument=collection: list
    """Build the sorted vocabulary of a document collection.

    Each document string is split on whitespace; the distinct tokens across
    all documents are returned as a sorted list.
    """
    vocabulary = set()
    for document in allDocument:
        vocabulary.update(document.split())
    return sorted(vocabulary)

def tf_(doc_que):
    """Return the raw term frequencies of one whitespace-tokenised text.

    The result is a plain dict mapping each token to the number of times it
    occurs in ``doc_que``.
    """
    frequencies = {}
    for token in doc_que.split():
        frequencies[token] = frequencies.get(token, 0) + 1
    return frequencies

def df_(term, allDocuments):  # df: number of documents that contain each term
    """Compute the document frequency of every term.

    Args:
        term: iterable of terms to count.
        allDocuments: iterable of whitespace-tokenised document strings.

    Returns:
        dict mapping each term to the number of documents in which it occurs
        at least once. Terms that occur in no document map to 0.
    """
    dic = dict.fromkeys(term, 0)
    for document in allDocuments:
        # Tokenise once per document and deduplicate: every document adds at
        # most 1 to a term, and the work is proportional to the document
        # length instead of the vocabulary size.
        for word in set(document.split()):
            if word in dic:
                dic[word] += 1
    return dic


### tf-idf

In [48]:
import math

def tfidf_(query_s,c,df,allDocument): # query_s: Series (df['col']), allDocument:list
    """Build tf-idf vectors over the vocabulary ``c`` for every text.

    For a term present in a text the weight is
    ``(1 + log10(tf)) / (1 + log10(max_tf)) * log10(N / df)``, where ``max_tf``
    is the largest term frequency in that text and ``N`` is
    ``len(allDocument)``. Terms absent from the text get weight 0.

    Args:
        query_s: iterable of whitespace-tokenised texts (queries or documents).
        c: ordered vocabulary; fixes the column order of the result.
        df: dict mapping term -> document frequency.
        allDocument: the document collection; only its length is used.

    Returns:
        numpy array with one row per text and one column per entry of ``c``.
    """
    n_docs = len(allDocument)
    all_tfidf = []
    for query in query_s:
        q_tf = tf_(query)
        weights = {}
        # An empty text has no maximum term frequency; its row stays all zero.
        if q_tf:
            tf_norm = 1 + math.log10(max(q_tf.values()))
            for word, count in q_tf.items():
                doc_freq = df.get(word)
                # A term with no (or zero) document frequency has no defined
                # idf; leave its weight at 0 instead of raising.
                if not doc_freq:
                    continue
                weights[word] = (1 + math.log10(count)) / tf_norm * math.log10(n_docs / doc_freq)
        all_tfidf.append([weights.get(word, 0) for word in c])
    return np.array(all_tfidf) #vector

### cosine similarity

In [49]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a,b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    When the product of the two norms is zero the similarity is reported
    as 0 rather than dividing by zero.
    """
    denominator = norm(a)*norm(b)
    if denominator == 0:
        return 0
    return dot(a, b)/denominator

In [50]:
def consine(q_tfidf,d_tfidf):
    """Score every document against every query with cosine similarity.

    Returns a dict keyed by query position; each value is a dict mapping
    document position to ``cosine_similarity(document, query)``. The elapsed
    wall-clock time in seconds is printed before returning.
    """
    import time
    started = time.time()

    n_docs = len(d_tfidf)
    sim_all = {}
    for q_pos in range(len(q_tfidf)):
        sim_all[q_pos] = {
            d_pos: cosine_similarity(d_tfidf[d_pos], q_tfidf[q_pos])
            for d_pos in range(n_docs)
        }
    print(time.time() - started)
    return sim_all

### hamming similarity

In [51]:
def Hamming_similarity(a,b):
    """Return the number of positions at which ``a`` and ``b`` hold equal values."""
    return np.sum(a == b)

In [52]:
def sim(q_tfidf,d_tfidf):
    """Score every document against every query with ``Hamming_similarity``.

    Returns a dict keyed by query position; each value is a dict mapping
    document position to ``Hamming_similarity(document, query)``. The elapsed
    wall-clock time in seconds is printed before returning.
    """
    import time
    started = time.time()

    n_docs = len(d_tfidf)
    sim_all = {}
    for q_pos in range(len(q_tfidf)):
        sim_all[q_pos] = {
            d_pos: Hamming_similarity(d_tfidf[d_pos], q_tfidf[q_pos])
            for d_pos in range(n_docs)
        }
    print(time.time() - started)
    return sim_all

### convert format

In [53]:
def df_similarity_final(sim_all):
    """Turn the nested similarity dict into a ranked (query, doc, score) table.

    ``sim_all`` is loaded into a DataFrame, reshaped to one row per
    (d_index, q_index) pair and sorted by query position ascending, then
    similarity descending. Positions are translated to 'Query id' / 'Doc id'
    through the module-level ``df_query`` and ``df_doc`` tables.

    Side effect: adds a 'q_index' column to ``df_query`` and a 'd_index'
    column to ``df_doc``.
    """
    wide = pd.DataFrame(data=sim_all)
    wide['d_index'] = wide.index
    wide = wide.reset_index(drop=True)

    # One row per (document, query) pair.
    ranked = pd.melt(wide, id_vars=["d_index"], var_name="q_index", value_name="similarity")
    ranked = ranked.sort_values(by=['q_index','similarity'],ascending=[True,False])

    # Expose the row positions of the global tables so they can be joined on.
    df_query['q_index'] = df_query.index
    df_doc['d_index'] = df_doc.index

    with_query = pd.merge(ranked,df_query[['Query id','q_index']],on='q_index',how='left')
    with_ids = pd.merge(with_query,df_doc[['Doc id','d_index']],on='d_index',how='left')
    return with_ids[['Query id', 'Doc id', 'similarity']]

In [54]:
def change_dataset_oder(df_similarity_final):
    """Attach relevance labels to a ranking and group them per query.

    The ('Query id', 'Doc id') pairs of ``df_similarity_final`` are left-joined
    with the labels read from "all2-1-0.qrel.xlsx"; pairs without a label get
    0. The 'Rel_level' values are then collected into one list per
    'Query id', and the lists are returned as an array.
    """
    judgements = pd.read_excel("all2-1-0.qrel.xlsx")
    ranked_pairs = df_similarity_final[['Query id','Doc id']]
    labelled = pd.merge(ranked_pairs,judgements, on=['Query id','Doc id'], how='left')
    labelled = labelled.fillna(value=0).drop(['Doc id'],axis=1)
    per_query = labelled.groupby('Query id')['Rel_level'].apply(lambda x: x.tolist())
    return per_query.values

### map

In [55]:
import numpy as np 
# precision
def precision(y_true):
    """Return precision@k at every rank k whose label equals 1.

    ``y_true`` is the list of relevance labels in ranked order. For each
    position holding the label 1, the fraction of 1-labels seen so far is
    recorded.

    NOTE(review): labels other than 1 (for example a higher relevance grade)
    are not counted as relevant -- confirm this is intended.
    """
    hits = 0
    p = []
    for rank, label in enumerate(y_true, start=1):
        if label == 1:
            hits += 1
            p.append(hits/rank)
    return p

# average precision
def AP(y_true):
    """Return the average precision of one ranked label list.

    This is the mean of the values produced by ``precision(y_true)``, or 0
    when that list is empty.
    """
    p = precision(y_true)
    if len(p) == 0:
        return 0
    return sum(p)/len(p)


def MAP(list_true):# query list
    """Return the mean average precision over a collection of queries.

    Args:
        list_true: sequence with one ranked relevance-label list per query.

    Returns:
        The mean of ``AP`` over all queries, or 0 when ``list_true`` is empty
        (previously this raised ZeroDivisionError).
    """
    n_queries = len(list_true)
    # len() rather than truthiness: list_true may be a numpy array.
    if n_queries == 0:
        return 0
    return sum(AP(y_true) for y_true in list_true)/n_queries

### ndcg

In [56]:
import numpy as np 
def ndcg(correct):
    """Return the normalised discounted cumulative gain of a ranked label list.

    Gain at 0-based position i is ``(2**label - 1) / log2(i + 2)``. The DCG of
    the given order is divided by the DCG of the same labels sorted in
    descending order. A list whose labels sum to 0 scores 0.
    """
    def discounted_gain(labels):
        total = 0
        for position in range(len(labels)):
            total = total + (2**labels[position]-1)/np.log2(position+2)
        return total

    if sum(correct)==0:
        return 0

    dcg = discounted_gain(correct)
    # Ideal ordering: the same labels, largest first.
    descending = np.argsort(correct)[::-1]
    idcg = discounted_gain(np.take(correct, descending))
    return dcg/idcg

def mean_ndcg(correct_list):
    """Return the mean NDCG over a collection of queries.

    Args:
        correct_list: sequence with one ranked relevance-label list per query.

    Returns:
        The mean of ``ndcg`` over all queries, or 0 when ``correct_list`` is
        empty (previously this raised ZeroDivisionError).
    """
    n_queries = len(correct_list)
    # len() rather than truthiness: correct_list may be a numpy array.
    if n_queries == 0:
        return 0
    total = 0
    for correct in correct_list:
        total = total + ndcg(correct)
    return total/n_queries

## 1. basic SVM model for reference

### import data

In [11]:
import pandas as pd
import numpy as np

# Load the query and document tables from their Excel files.
df_query=pd.read_excel('df_query_pre.xlsx')
df_doc=pd.read_excel('df_doc_pre.xlsx')

# The "Text_tok" column of every document row, as a plain Python list.
doc_col=df_doc.loc[: , "Text_tok"].tolist() #collection: list

### import dictionary & document frequency

In [12]:
# Load the stored dictionary and document-frequency tables from CSV.
dic_df=pd.read_csv('dictionary.csv')
doc_fre_df=pd.read_csv('document_frequency.csv')

# dic: values of the column named '0', as a list.
# doc_fre: dict mapping each value of the `term` column to its `df` value.
dic=dic_df['0'].tolist()
doc_fre=pd.Series(doc_fre_df.df.values,index=doc_fre_df.term).to_dict()

### implementation: TFIDF

In [13]:
import time
# Build the tf-idf matrices for queries and documents and time the whole step.
start = time.time()

q_tfidf=tfidf_(df_query["Text_tok"],dic,doc_fre,doc_col) # query tf-idf
d_tfidf=tfidf_(df_doc["Text_tok"],dic,doc_fre,doc_col) # doc tf-idf

end = time.time()
print(end - start)   # seconds

37.699589014053345


## 2. random projection

### random vector

In [14]:
# produce random vectors: each term's column is drawn from a normal distribution fitted to that term
import numpy as np
def random(doc,n):
    """Draw ``n`` random vectors matching the per-term statistics of ``doc``.

    For every column (term) of ``doc`` the mean and standard deviation are
    computed, and ``n`` values are sampled from the corresponding normal
    distribution with ``np.random.normal``.

    Args:
        doc: 2-D array with one row per text and one column per term.
        n: number of random vectors to generate.

    Returns:
        Array of shape ``(n, number_of_terms)``; row j is the j-th random
        vector. The number of terms is taken from ``doc`` itself instead of
        the previously hard-coded 18683.

    Note: this function's name is the same as the standard-library ``random``
    module; the name is kept so existing callers keep working.
    """
    doc = np.asarray(doc)
    mu = doc.mean(axis=0)
    std = doc.std(axis=0)
    # Sample as (terms, n) so values are drawn term by term, the same order
    # in which the former column-by-column loop consumed the global random
    # stream, then transpose to (n, terms). One call replaces the repeated
    # column_stack, which copied the whole growing matrix for every term.
    samples = np.random.normal(mu[:, None], std[:, None], size=(doc.shape[1], n))
    return np.ascontiguousarray(samples.T)

###  new vector

In [15]:
#compute new vector of document
def new_doc(doc,random,n):
    """Project every row of ``doc`` onto the random vectors.

    Entry ``[i, j]`` of the result is the dot product of ``doc[i]`` with
    ``random[j]``.

    Args:
        doc: 2-D array, one row per document.
        random: 2-D array, one row per random vector, with the same number of
            columns as ``doc``.
        n: number of columns of the result.

    Returns:
        Float array of shape ``(len(doc), n)``. The row count follows ``doc``
        instead of the previously hard-coded 5371. If ``random`` has fewer
        than ``n`` rows the remaining columns stay 0.

    Raises:
        IndexError: if ``random`` has more than ``n`` rows.
    """
    doc = np.asarray(doc)
    random = np.asarray(random)
    n_vectors = len(random)
    if n_vectors > n:
        raise IndexError("random has %d rows but only n=%d output columns" % (n_vectors, n))
    projected = np.zeros(shape=(len(doc), n))
    # One matrix product replaces the element-by-element double loop.
    projected[:, :n_vectors] = np.dot(doc, random.T)
    return projected

In [16]:
#compute new vector of query
def new_q(doc,random,n):
    """Project every row of ``doc`` (the query vectors) onto the random vectors.

    Entry ``[i, j]`` of the result is the dot product of ``doc[i]`` with
    ``random[j]``.

    Args:
        doc: 2-D array, one row per query.
        random: 2-D array, one row per random vector, with the same number of
            columns as ``doc``.
        n: number of columns of the result.

    Returns:
        Float array of shape ``(len(doc), n)``. The row count follows ``doc``
        instead of the previously hard-coded 3237. If ``random`` has fewer
        than ``n`` rows the remaining columns stay 0.

    Raises:
        IndexError: if ``random`` has more than ``n`` rows.
    """
    doc = np.asarray(doc)
    random = np.asarray(random)
    n_vectors = len(random)
    if n_vectors > n:
        raise IndexError("random has %d rows but only n=%d output columns" % (n_vectors, n))
    projected = np.zeros(shape=(len(doc), n))
    # One matrix product replaces the element-by-element double loop.
    projected[:, :n_vectors] = np.dot(doc, random.T)
    return projected

### case 1: different thresholds
"Different thresholds" means that every document and every query uses the median of its own vector as its threshold. This implies that there are 5371 thresholds for the documents and 3237 thresholds for the queries.

In [21]:
# compute threshold and get final vector of query
def final_q1(new_doc,n):
    """Binarise each query vector against its own median.

    Args:
        new_doc: 2-D array of projected vectors, one row per query.
        n: number of leading columns to binarise.

    Returns:
        Float array of shape ``(len(new_doc), n)`` holding 1 where the value
        is strictly greater than the median of its row and 0 otherwise. The
        row count follows the input instead of the previously hard-coded 3237.

    Raises:
        IndexError: if ``new_doc`` has fewer than ``n`` columns.
    """
    new_doc = np.asarray(new_doc)
    if n > new_doc.shape[1]:
        raise IndexError("new_doc has %d columns, fewer than n=%d" % (new_doc.shape[1], n))
    # The median is taken over the whole row, as before; keepdims lets it
    # broadcast against the row in one vectorised comparison.
    thresholds = np.median(new_doc, axis=1, keepdims=True)
    return (new_doc[:, :n] > thresholds).astype(float)



In [22]:
# compute threshold and get final vector of document
def final_doc1(new_doc,n):
    """Binarise each document vector against its own median.

    Args:
        new_doc: 2-D array of projected vectors, one row per document.
        n: number of leading columns to binarise.

    Returns:
        Float array of shape ``(len(new_doc), n)`` holding 1 where the value
        is strictly greater than the median of its row and 0 otherwise. The
        row count follows the input instead of the previously hard-coded 5371.

    Raises:
        IndexError: if ``new_doc`` has fewer than ``n`` columns.
    """
    new_doc = np.asarray(new_doc)
    if n > new_doc.shape[1]:
        raise IndexError("new_doc has %d columns, fewer than n=%d" % (new_doc.shape[1], n))
    # The median is taken over the whole row, as before; keepdims lets it
    # broadcast against the row in one vectorised comparison.
    thresholds = np.median(new_doc, axis=1, keepdims=True)
    return (new_doc[:, :n] > thresholds).astype(float)

### case 2: one global threshold
"Global threshold" means that there is a single threshold shared by all documents and another single threshold shared by all queries.

In [23]:
# compute threshold and get final vector of documents
def final_doc(new_doc,n):
    """Binarise all document vectors against one global median.

    Args:
        new_doc: 2-D array of projected vectors, one row per document.
        n: number of leading columns to binarise.

    Returns:
        Float array of shape ``(len(new_doc), n)`` holding 1 where the value
        is strictly greater than the median of the whole of ``new_doc`` and 0
        otherwise. The row count follows the input instead of the previously
        hard-coded 5371.

    Raises:
        IndexError: if ``new_doc`` has fewer than ``n`` columns.
    """
    new_doc = np.asarray(new_doc)
    if n > new_doc.shape[1]:
        raise IndexError("new_doc has %d columns, fewer than n=%d" % (new_doc.shape[1], n))
    threshold = np.median(new_doc)
    return (new_doc[:, :n] > threshold).astype(float)



In [24]:
# compute threshold and get final vector of query
def final_q(new_doc,n):
    """Binarise all query vectors against one global median.

    Args:
        new_doc: 2-D array of projected vectors, one row per query.
        n: number of leading columns to binarise.

    Returns:
        Float array of shape ``(len(new_doc), n)`` holding 1 where the value
        is strictly greater than the median of the whole of ``new_doc`` and 0
        otherwise. The row count follows the input instead of the previously
        hard-coded 3237.

    Raises:
        IndexError: if ``new_doc`` has fewer than ``n`` columns.
    """
    new_doc = np.asarray(new_doc)
    if n > new_doc.shape[1]:
        raise IndexError("new_doc has %d columns, fewer than n=%d" % (new_doc.shape[1], n))
    threshold = np.median(new_doc)
    return (new_doc[:, :n] > threshold).astype(float)

### * exported random projection vector results for quick testing

In [None]:
# If you load the exported vectors below, you do not need to run the vector generation code.

#new_doc1=pd.read_csv("new_doc1.csv")
#new_doc1=new_doc1.values
#new_q1=pd.read_csv("new_q1.csv")
#new_q1=new_q1.values

#new_doc2=pd.read_csv("new_doc2.csv")
#new_doc2=new_doc2.values
#new_q2=pd.read_csv("new_q2.csv")
#new_q2=new_q2.values

#new_doc3=pd.read_csv("new_doc3.csv")
#new_doc3=new_doc3.values
#new_q3=pd.read_csv("new_q3.csv")
#new_q3=new_q3.values

#new_doc4=pd.read_csv("new_doc4.csv")
#new_doc4=new_doc4.values
#new_q4=pd.read_csv("new_q4.csv")
#new_q4=new_q4.values

#new_doc5=pd.read_csv("new_doc5.csv")
#new_doc5=new_doc5.values
#new_q5=pd.read_csv("new_q5.csv")
#new_q5=new_q5.values

#new_doc6=pd.read_csv("new_doc6.csv")
#new_doc6=new_doc6.values
#new_q6=pd.read_csv("new_q6.csv")
#new_q6=new_q6.values

#new_doc7=pd.read_csv("new_doc7.csv")
#new_doc7=new_doc7.values
#new_q7=pd.read_csv("new_q7.csv")
#new_q7=new_q7.values

#new_doc8=pd.read_csv("new_doc8.csv")
#new_doc8=new_doc8.values
#new_q8=pd.read_csv("new_q8.csv")
#new_q8=new_q8.values


### eight final vector length :10, 100, 300, 500, 1000, 2000, 2500, 3000 

## 1.vector length: 10 

In [31]:
# Vector generation (length 10): draw a random matrix from the document
# tf-idf statistics and project the documents with it, then draw a second
# random matrix from the query tf-idf statistics and project the queries.
# NOTE(review): documents and queries are projected with two independently
# drawn matrices, so their projected coordinates may not be comparable --
# confirm whether a single shared matrix was intended.
random1=random(d_tfidf,10)
new_doc1=new_doc(d_tfidf,random1,10)

random_1=random(q_tfidf,10)
new_q1=new_q(q_tfidf,random_1,10)

### 1.1 different thresholds 

#### cosine similarity

In [28]:
d_tfidf1=final_doc1(new_doc1,10)
q_tfidf1=final_q1(new_q1,10)

In [29]:
sim_all=consine(q_tfidf1,d_tfidf1)
# Keep the result under its own name: assigning it to `df_similarity_final`
# would overwrite the function of that name and break every later call.
df_similarity_final1=df_similarity_final(sim_all)
correct_input=change_dataset_oder(df_similarity_final1)

299.25027298927307


In [30]:
print(MAP(correct_input))

0.008384934726485903


In [31]:
print(mean_ndcg(correct_input))

0.249905719421


#### hamming similarity

In [42]:
sim_all=sim(q_tfidf1,d_tfidf1)
# Keep the result under its own name: assigning it to `df_similarity_final`
# would overwrite the function of that name and break every later call.
df_similarity_final1=df_similarity_final(sim_all)
correct_input=change_dataset_oder(df_similarity_final1)

87.69659113883972


In [43]:
print(MAP(correct_input))

0.008384986203085622


In [44]:
print(mean_ndcg(correct_input))

0.249905905643


### 1.2 one global threshold 

#### cosine similarity

In [45]:
d_tfidf1_=final_doc(new_doc1,10)
q_tfidf1_=final_q(new_q1,10)

In [63]:
sim_all_=consine(q_tfidf1_,d_tfidf1_)
df_similarity_final_=df_similarity_final(sim_all_)
correct_input_=change_dataset_oder(df_similarity_final_)

302.709450006485


In [58]:
print(MAP(correct_input_))

0.00815880780272387


In [59]:
print(mean_ndcg(correct_input_))

0.248378014527


####  hamming similarity

In [60]:
sim_all_=sim(q_tfidf1_,d_tfidf1_)
df_similarity_final_=df_similarity_final(sim_all_)
correct_input_=change_dataset_oder(df_similarity_final_)

83.55536365509033


In [61]:
print(MAP(correct_input_))

0.00806652771459403


In [62]:
print(mean_ndcg(correct_input_))

0.248159418293


## 2. vector length: 100

In [39]:
# Vector generation (length 100): project documents and queries onto random
# matrices drawn from their respective tf-idf statistics.
# NOTE(review): the document and query matrices are drawn independently --
# confirm whether a single shared matrix was intended.
random2=random(d_tfidf,100)
new_doc2=new_doc(d_tfidf,random2,100)

random_2=random(q_tfidf,100)
new_q2=new_q(q_tfidf,random_2,100)

### 2.1 different thresholds

#### cosine similarity

In [194]:
d_tfidf2=final_doc1(new_doc2,100)
q_tfidf2=final_q1(new_q2,100)

In [195]:
sim_all2=consine(q_tfidf2,d_tfidf2)
df_similarity_final2=df_similarity_final(sim_all2)
correct_input2=change_dataset_oder(df_similarity_final2)

288.9096212387085


In [200]:
print(MAP(correct_input2))

0.014539957646979858


In [201]:
print(mean_ndcg(correct_input2))

0.266425484529


#### hamming similarity

In [202]:
sim_all2=sim(q_tfidf2,d_tfidf2)
df_similarity_final2=df_similarity_final(sim_all2)
correct_input2=change_dataset_oder(df_similarity_final2)

96.97552394866943


In [204]:
print(MAP(correct_input2))

0.014539957646979858


In [205]:
print(mean_ndcg(correct_input2))

0.266425484529


### 2.2 one global threshold

#### cosine similarity

In [65]:
d_tfidf2_=final_doc(new_doc2,100)
q_tfidf2_=final_q(new_q2,100)

In [66]:
sim_all2_=consine(q_tfidf2_,d_tfidf2_)
df_similarity_final2_=df_similarity_final(sim_all2_)
correct_input2_=change_dataset_oder(df_similarity_final2_)

307.42424297332764


In [67]:
print(MAP(correct_input2_))

0.00993482942938619


In [68]:
print(mean_ndcg(correct_input2_))

0.254331175727


#### hamming similarity

In [69]:
sim_all2_=sim(q_tfidf2_,d_tfidf2_)
df_similarity_final2_=df_similarity_final(sim_all2_)
correct_input2_=change_dataset_oder(df_similarity_final2_)

87.12870693206787


In [70]:
print(MAP(correct_input2_))

0.008991434195971933


In [71]:
print(mean_ndcg(correct_input2_))

0.252442204871


## 3. Vector length: 300

In [56]:
# Vector generation (length 300): project documents and queries onto random
# matrices drawn from their respective tf-idf statistics.
# NOTE(review): the document and query matrices are drawn independently --
# confirm whether a single shared matrix was intended.
random3=random(d_tfidf,300)
new_doc3=new_doc(d_tfidf,random3,300)

random_3=random(q_tfidf,300)
new_q3=new_q(q_tfidf,random_3,300)

### 3.1 different thresholds 

#### cosine similarity 

In [152]:
d_tfidf3=final_doc1(new_doc3,300)
q_tfidf3=final_q1(new_q3,300)

In [153]:
sim_all3=consine(q_tfidf3,d_tfidf3)
df_similarity_final3=df_similarity_final(sim_all3)
correct_input3=change_dataset_oder(df_similarity_final3)

299.6170370578766


In [154]:
print(MAP(correct_input3))

0.027158705760747217


In [155]:
print(mean_ndcg(correct_input3))

0.292317063566


#### hamming similarity 

In [156]:
sim_all3=sim(q_tfidf3,d_tfidf3)
df_similarity_final3=df_similarity_final(sim_all3)
correct_input3=change_dataset_oder(df_similarity_final3)

94.5193841457367


In [157]:
print(MAP(correct_input3))

0.027158705760747217


In [158]:
print(mean_ndcg(correct_input3))

0.292317063566


### 3.2 one global threshold

#### cosine similarity 

In [73]:
d_tfidf3_=final_doc(new_doc3,300)
q_tfidf3_=final_q(new_q3,300)

In [74]:
sim_all3_=consine(q_tfidf3_,d_tfidf3_)
df_similarity_final3_=df_similarity_final(sim_all3_)
correct_input3_=change_dataset_oder(df_similarity_final3_)

311.10126090049744


In [75]:
print(MAP(correct_input3_))

0.009968235329121656


In [76]:
print(mean_ndcg(correct_input3_))

0.256161542556


#### hamming similarity

In [77]:
sim_all3_=sim(q_tfidf3_,d_tfidf3_)
df_similarity_final3_=df_similarity_final(sim_all3_)
correct_input3_=change_dataset_oder(df_similarity_final3_)

99.83816075325012


In [78]:
print(MAP(correct_input3_))

0.00918976746705226


In [79]:
print(mean_ndcg(correct_input3_))

0.253025328109


## 4. vector length: 500

In [78]:
# Vector generation (length 500): project documents and queries onto random
# matrices drawn from their respective tf-idf statistics.
# NOTE(review): the document and query matrices are drawn independently --
# confirm whether a single shared matrix was intended.
random4=random(d_tfidf,500)
new_doc4=new_doc(d_tfidf,random4,500)

random_4=random(q_tfidf,500)
new_q4=new_q(q_tfidf,random_4,500)

### 4.1 different thresholds

#### cosine similarity 

In [159]:
d_tfidf4=final_doc1(new_doc4,500)
q_tfidf4=final_q1(new_q4,500)

In [160]:
sim_all4=consine(q_tfidf4,d_tfidf4)
df_similarity_final4=df_similarity_final(sim_all4)
correct_input4=change_dataset_oder(df_similarity_final4)

304.8833122253418


In [161]:
print(MAP(correct_input4))

0.035462716716465614


In [162]:
print(mean_ndcg(correct_input4))

0.306962214323


#### hamming similarity

In [163]:
sim_all4=sim(q_tfidf4,d_tfidf4)
df_similarity_final4=df_similarity_final(sim_all4)
correct_input4=change_dataset_oder(df_similarity_final4)

104.0230188369751


In [164]:
print(MAP(correct_input4))

0.035462716716465614


In [165]:
print(mean_ndcg(correct_input4))

0.306962214323


### 4.2 one global threshold

#### cosine similarity

In [82]:
d_tfidf4_=final_doc(new_doc4,500)
q_tfidf4_=final_q(new_q4,500)

In [83]:
sim_all4_=consine(q_tfidf4_,d_tfidf4_)
df_similarity_final4_=df_similarity_final(sim_all4_)
correct_input4_=change_dataset_oder(df_similarity_final4_)

303.65888690948486


In [84]:
print(MAP(correct_input4_))

0.010578638703937831


In [85]:
print(mean_ndcg(correct_input4_))

0.257881808231


#### hamming similarity 

In [86]:
sim_all4_=sim(q_tfidf4_,d_tfidf4_)
df_similarity_final4_=df_similarity_final(sim_all4_)
correct_input4_=change_dataset_oder(df_similarity_final4_)

110.7463538646698


In [87]:
print(MAP(correct_input4_))

0.00929361583203649


In [88]:
print(mean_ndcg(correct_input4_))

0.253425335687


## 5. vector length: 1000

In [86]:
# Vector generation (length 1000): project documents and queries onto random
# matrices drawn from their respective tf-idf statistics.
# NOTE(review): the document and query matrices are drawn independently --
# confirm whether a single shared matrix was intended.
random5=random(d_tfidf,1000)
new_doc5=new_doc(d_tfidf,random5,1000)

random_5=random(q_tfidf,1000)
new_q5=new_q(q_tfidf,random_5,1000)

### 5.1 different thresholds 

#### cosine similarity

In [166]:
d_tfidf5=final_doc1(new_doc5,1000)
q_tfidf5=final_q1(new_q5,1000)

In [167]:
sim_all5=consine(q_tfidf5,d_tfidf5)
df_similarity_final5=df_similarity_final(sim_all5)
correct_input5=change_dataset_oder(df_similarity_final5)

323.89371609687805


In [168]:
print(MAP(correct_input5))

0.046536753334454006


In [169]:
print(mean_ndcg(correct_input5))

0.3284382807


#### hamming similarity 

In [170]:
sim_all5=sim(q_tfidf5,d_tfidf5)
df_similarity_final5=df_similarity_final(sim_all5)
correct_input5=change_dataset_oder(df_similarity_final5)

126.13582587242126


In [171]:
print(MAP(correct_input5))

0.046536753334454006


In [172]:
print(mean_ndcg(correct_input5))

0.3284382807


### 5.2 one global threshold

####  cosine similarity

In [90]:
d_tfidf5_=final_doc(new_doc5,1000)
q_tfidf5_=final_q(new_q5,1000)

In [91]:
sim_all5_=consine(q_tfidf5_,d_tfidf5_)
df_similarity_final5_=df_similarity_final(sim_all5_)
correct_input5_=change_dataset_oder(df_similarity_final5_)

354.75254106521606


In [92]:
print(MAP(correct_input5_))

0.012005269341050636


In [93]:
print(mean_ndcg(correct_input5_))

0.26089410365


#### hamming similarity

In [94]:
sim_all5_=sim(q_tfidf5_,d_tfidf5_)
df_similarity_final5_=df_similarity_final(sim_all5_)
correct_input5_=change_dataset_oder(df_similarity_final5_)

141.83697700500488


In [95]:
print(MAP(correct_input5_))

0.009432204552807551


In [96]:
print(mean_ndcg(correct_input5_))

0.253766278767


## 6. vector length: 2000

In [108]:
# Vector generation (length 2000): project documents and queries onto random
# matrices drawn from their respective tf-idf statistics.
# NOTE(review): the document and query matrices are drawn independently --
# confirm whether a single shared matrix was intended.
random6=random(d_tfidf,2000)
new_doc6=new_doc(d_tfidf,random6,2000)

random_6=random(q_tfidf,2000)
new_q6=new_q(q_tfidf,random_6,2000)

### 6.1 different thresholds 

#### cosine similarity 

In [104]:
d_tfidf6=final_doc1(new_doc6,2000)
q_tfidf6=final_q1(new_q6,2000)

In [174]:
sim_all6=consine(q_tfidf6,d_tfidf6)
df_similarity_final6=df_similarity_final(sim_all6)
correct_input6=change_dataset_oder(df_similarity_final6)

432.96828722953796


In [106]:
print(MAP(correct_input6))

0.057554004075443345


In [107]:
print(mean_ndcg(correct_input6))

0.348192680832


#### hamming similarity 

In [108]:
sim_all6=sim(q_tfidf6,d_tfidf6)
df_similarity_final6=df_similarity_final(sim_all6)
correct_input6=change_dataset_oder(df_similarity_final6)

184.89722108840942


In [109]:
print(MAP(correct_input6))

0.057554004075443345


In [110]:
print(mean_ndcg(correct_input6))

0.348192680832


### 6.2 one global threshold

#### cosine similarity

In [111]:
d_tfidf6_=final_doc(new_doc6,2000)
q_tfidf6_=final_q(new_q6,2000)

In [113]:
sim_all6_=consine(q_tfidf6_,d_tfidf6_)
df_similarity_final6_=df_similarity_final(sim_all6_)
correct_input6_=change_dataset_oder(df_similarity_final6_)

404.04836201667786


In [114]:
print(MAP(correct_input6_))

0.015407917122855545


In [115]:
print(mean_ndcg(correct_input6_))

0.266010056692


#### hamming similarity 

In [116]:
sim_all6_=sim(q_tfidf6_,d_tfidf6_)
df_similarity_final6_=df_similarity_final(sim_all6_)
correct_input6_=change_dataset_oder(df_similarity_final6_)

175.9710772037506


In [117]:
print(MAP(correct_input6_))

0.009606074332417065


In [118]:
print(mean_ndcg(correct_input6_))

0.254171549029


## 7. vector length:3000

In [152]:
# Vector generation (length 3000): project documents and queries onto random
# matrices drawn from their respective tf-idf statistics.
# NOTE(review): the document and query matrices are drawn independently --
# confirm whether a single shared matrix was intended.
random7_=random(d_tfidf,3000)
new_doc7_=new_doc(d_tfidf,random7_,3000)

random_7_=random(q_tfidf,3000)
new_q7_=new_q(q_tfidf,random_7_,3000)

### 7.1 different thresholds

#### cosine similarity 

In [122]:
d_tfidf7=final_doc1(new_doc7_,3000)
q_tfidf7=final_q1(new_q7_,3000)

In [123]:
sim_all7=consine(q_tfidf7,d_tfidf7)
df_similarity_final7=df_similarity_final(sim_all7)
correct_input7=change_dataset_oder(df_similarity_final7)

406.38906812667847


In [124]:
print(MAP(correct_input7))

0.008547454780365291


In [125]:
print(mean_ndcg(correct_input7))

0.250171164203


#### hamming similarity 

In [126]:
sim_all7=sim(q_tfidf7,d_tfidf7)
df_similarity_final7=df_similarity_final(sim_all7)
correct_input7=change_dataset_oder(df_similarity_final7)

205.90796995162964


In [127]:
print(MAP(correct_input7))

0.008547454780365291


In [128]:
print(mean_ndcg(correct_input7))

0.250171164203


### 7.2 one global threshold

#### cosine similarity

In [129]:
d_tfidf7_=final_doc(new_doc7_,3000)
q_tfidf7_=final_q(new_q7_,3000)

In [130]:
sim_all7_=consine(q_tfidf7_,d_tfidf7_)
df_similarity_final7_=df_similarity_final(sim_all7_)
correct_input7_=change_dataset_oder(df_similarity_final7_)

394.8771526813507


In [131]:
print(MAP(correct_input7_))

0.0087914308832336


In [132]:
print(mean_ndcg(correct_input7_))

0.250995798842


#### hamming similarity 

In [134]:
sim_all7_=sim(q_tfidf7_,d_tfidf7_)
df_similarity_final7_=df_similarity_final(sim_all7_)
correct_input7_=change_dataset_oder(df_similarity_final7_)

204.52503299713135


In [135]:
print(MAP(correct_input7_))

0.009003417099736746


In [136]:
print(mean_ndcg(correct_input7_))

0.252553727498


## 8. vector length:2500

In [299]:
# Vector generation (length 2500): project documents and queries onto random
# matrices drawn from their respective tf-idf statistics.
# NOTE(review): the document and query matrices are drawn independently --
# confirm whether a single shared matrix was intended.
random8=random(d_tfidf,2500)
new_doc8=new_doc(d_tfidf,random8,2500)

random_8=random(q_tfidf,2500)
new_q8=new_q(q_tfidf,random_8,2500)

### 8.1 different thresholds 

#### cosine similarity 

In [138]:
d_tfidf8=final_doc1(new_doc8,2500)
q_tfidf8=final_q1(new_q8,2500)

In [139]:
sim_all8=consine(q_tfidf8,d_tfidf8)
df_similarity_final8=df_similarity_final(sim_all8)
correct_input8=change_dataset_oder(df_similarity_final8)

375.99183201789856


In [140]:
print(MAP(correct_input8))

0.009049736514169775


In [141]:
print(mean_ndcg(correct_input8))

0.252295462847


#### hamming similarity

In [142]:
sim_all8=sim(q_tfidf8,d_tfidf8)
df_similarity_final8=df_similarity_final(sim_all8)
correct_input8=change_dataset_oder(df_similarity_final8)

189.8171226978302


In [143]:
print(MAP(correct_input8))

0.009049736514169775


In [144]:
print(mean_ndcg(correct_input8))

0.252295462847


### 8.2 one global threshold

#### cosine similarity 

In [145]:
d_tfidf8_=final_doc(new_doc8,2500)
q_tfidf8_=final_q(new_q8,2500)

In [146]:
sim_all8_=consine(q_tfidf8_,d_tfidf8_)
df_similarity_final8_=df_similarity_final(sim_all8_)
correct_input8_=change_dataset_oder(df_similarity_final8_)

380.0483572483063


In [147]:
print(MAP(correct_input8_))

0.008889530267905756


In [148]:
print(mean_ndcg(correct_input8_))

0.251584940304


#### hamming similarity 

In [149]:
sim_all8_=sim(q_tfidf8_,d_tfidf8_)
df_similarity_final8_=df_similarity_final(sim_all8_)
correct_input8_=change_dataset_oder(df_similarity_final8_)

190.58933401107788


In [150]:
print(MAP(correct_input8_))

0.009064333859309423


In [151]:
print(mean_ndcg(correct_input8_))

0.252618256755
