## Pre-clustering with single_path

### tf, df, dictionary

In [1]:
from collections import Counter

def dic_(allDocument): # alldocument=collection: list
    long_str = " ".join(allDocument)
    b=list(set(list(long_str.split())))
    return sorted(b)

def tf_(doc_que): # term frequency
    counts = Counter(list(doc_que.split()))
    return dict(counts)

def df_(term, allDocuments):  # df: no. of occurance of a term in whole collection
    dic=dict.fromkeys(term, 0)
    for i in allDocuments:
        n=0
        for word in term:
            if word in i.split():         
                dic[word]=dic[word]+1
            n+=1  
    return dic  

### tf-idf

In [2]:
import math

def tfidf_(query_s,c,df,allDocument): # query_s: Series (df['col']), allDocument:list

    all_tfidf=[]
    for query in query_s:
        q_tf=tf_(query)
        max_tf=max(q_tf.values())
        tfidf=[]
        for word in c:
            if word not in q_tf: # query item does not shown in doc collection
                    value=0
            else:
                value=(1+math.log10(q_tf.get(word)))/(1+math.log10(max_tf))*math.log10(len(allDocument)/df.get(word))
            tfidf.append(value)
        all_tfidf.append(tfidf)
    return np.array(all_tfidf) #vector

### cosine similarity

In [3]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a,b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

### Import Data

In [4]:
import pandas as pd
import numpy as np

df_doc=pd.read_excel('df_doc_pre.xlsx')
df_query=pd.read_excel('df_query_pre.xlsx')

doc_col=df_doc.loc[: , "Text_tok"].tolist() #collection: list

dic_df=pd.read_csv('dictionary.csv')
doc_fre_df=pd.read_csv('document_frequency.csv')
dic=dic_df['0'].tolist()
doc_fre=pd.Series(doc_fre_df.df.values,index=doc_fre_df.term).to_dict()

### tfidf

In [5]:
d_tfidf=tfidf_(df_doc["Text_tok"],dic,doc_fre,doc_col) # doc tf-idf
q_tfidf=tfidf_(df_query["Text_tok"],dic,doc_fre,doc_col) # query tf-idf

### doc clustering

In [6]:
import math
import numpy as np
import operator
import operator
import time

def pre_clustering(d_tfidf,ran_seed,leader_size):
    
    # randomly select k doc leaders
    np.random.seed(seed=ran_seed)
    idx = np.random.randint(len(df_doc), size=leader_size)
    leader=d_tfidf[idx,:]

    # assign each doc in nearest clusters
    cluster_label=[]
    for doc in range(len(d_tfidf)):
        similarity=[]
        for l in range(len(leader)):
            sim=cosine_similarity(d_tfidf[doc],leader[l])
            similarity.append(sim)
        cluster_label.append(np.argmax(similarity))
    df_d_tfidf = pd.DataFrame(d_tfidf)
    df_d_tfidf['label']=cluster_label
    return leader, df_d_tfidf

In [7]:
import math
import operator
import time

def cosSimilarityImp(q_tfidf,leader,df_d_tfidf):
    
    start = time.time()
    sim_all={}
    for q in range(len(q_tfidf)):
        leader_sim={}
        sim={}
        sim_all[q]=sim
        
        # compare similarity between query and doc leaders
        for l in range(len(leader)):
            leader_sim[l]=cosine_similarity(leader[l],q_tfidf[q])
        
        # access the cloest leader's cluster
        closest_cluster_label=max(leader_sim.items(), key=operator.itemgetter(1))[0]
        closest_cluster=df_d_tfidf.loc[df_d_tfidf['label'] == closest_cluster_label].drop(['label'],axis=1)
        
        # calculate similarity amongth docs in the cloest cluster
        for d in closest_cluster.index:
            sim[d]=cosine_similarity(closest_cluster.loc[d],q_tfidf[q])

    end = time.time()
    print('similarity time: ',end - start)

    # ranking doc by similartiy
    # covert format to a table
    df_similarity= pd.DataFrame(data=sim_all)
    df_similarity['d_index']=df_similarity.index
    df1 = df_similarity.reset_index(drop=True)
    df2 = pd.melt(df1, id_vars=["d_index"], var_name="q_index", value_name="similarity")
    df2=df2.sort_values(by=['q_index','similarity'],ascending=[True,False])

    df_query['q_index']=df_query.index
    df_doc['d_index']=df_doc.index
    df2=pd.merge(df2,df_query[['Query id','q_index']],on='q_index',how='left')
    df2=pd.merge(df2,df_doc[['Doc id','d_index']],on='d_index',how='left')
    df_similarity_final=df2[['Query id', 'Doc id', 'similarity']]
    df_similarity_final=df_similarity_final.dropna()

    # prepare format for meaturement
    rel_label = pd.read_excel("all2-1-0.qrel.xlsx")
    rel_similarity=pd.merge(df_similarity_final[['Query id','Doc id']],rel_label, on=['Query id','Doc id'], how='left')
    rel_similarity=rel_similarity.fillna(value=0)
    rel_similarity=rel_similarity.drop(['Doc id'],axis=1)
    s = rel_similarity.groupby('Query id')['Rel_level'].apply(lambda x: x.tolist())
    correct_input=s.values

    return correct_input

#### MAP

In [8]:
import numpy as np 
# presicion 
def precision(y_true):
#y_true: this list reordered y_true list
    p=[]
    a=0
    for i in range(len(y_true)):
        if y_true[i]==1:
            a+=1
            precision=a/(i+1)
            p.append(precision)
    #print(p)
    return p

# average precision
def AP(y_true):
    p=precision(y_true)
    if len(p)!=0:
        AP=sum(p)/len(p)
    else:
        AP=0
    #print(AP)
    return AP


def MAP(list_true):# query list 
    ap=[]
    for i in range(len(list_true)):
        ap.append(AP(list_true[i]))
    total=sum(ap)
    #print(ap)
    return total/(len(list_true))

#### nDCG1

In [9]:
import numpy as np 
def ndcg1(correct):
    if sum(correct)==0:
        return 0
    else:
        dcg=0
        for i in range(len(correct)):
            gain=2**correct[i]-1
            discounts=np.log2(i+2)
            dcg=dcg+gain/discounts
    
        #idcg
        order = np.argsort(correct)
        sort_correct = np.take(correct, order[::-1])
        idcg=0
        for i in range(len(sort_correct)):
            gain=2**sort_correct[i]-1
            discounts=np.log2(i+2)
            idcg=idcg+gain/discounts
        #print(dcg)
        #print(idcg)
        return dcg/idcg

def mean_ndcg1(correct_list):
    b=0
    for correct in correct_list:
        a=ndcg1(correct)
        b=b+a
    return b/len(correct_list)

### Clustering Tuning

In [10]:
leader_size=round(math.sqrt(len(df_doc)))
print ('leader size:',leader_size)
for i in range(0,10):
    print('seed:',i)
    cluster=pre_clustering(d_tfidf,i,leader_size)
    result=cosSimilarityImp(q_tfidf,cluster[0], cluster[1])
    print('MAP:',MAP(result)) 
    print('nDCG:',mean_ndcg1(result))
    print()

leader size: 73
seed: 0
similarity time:  127.22980046272278
MAP: 0.12213382215123987
nDCG: 0.255696736928

seed: 1
similarity time:  140.3204002380371
MAP: 0.1349435507081836
nDCG: 0.281596613068

seed: 2
similarity time:  121.02650022506714
MAP: 0.13353519383144877
nDCG: 0.270695401174

seed: 3
similarity time:  160.1100001335144
MAP: 0.13324585229511968
nDCG: 0.286548144426

seed: 4
similarity time:  122.78949999809265
MAP: 0.13338996447708998
nDCG: 0.27539129887

seed: 5
similarity time:  135.7315001487732
MAP: 0.13807766274347116
nDCG: 0.28411638356

seed: 6
similarity time:  113.21399974822998
MAP: 0.12625873150058353
nDCG: 0.266347565774

seed: 7
similarity time:  129.4967999458313
MAP: 0.1262757659116349
nDCG: 0.258539190864

seed: 8
similarity time:  119.04699969291687
MAP: 0.13523005144959838
nDCG: 0.273715481787

seed: 9
similarity time:  130.76550006866455
MAP: 0.136884728870111
nDCG: 0.276464758563



In [11]:
leader_size=60
print ('leader size:',leader_size)
for i in range(0,10):
    print('seed:',i)
    cluster=pre_clustering(d_tfidf,i,leader_size)
    result=cosSimilarityImp(q_tfidf,cluster[0], cluster[1])
    print('MAP:',MAP(result)) 
    print('nDCG:',mean_ndcg1(result))
    print()

leader size: 60
seed: 0
similarity time:  146.79850029945374
MAP: 0.1289793572917648
nDCG: 0.271957723819

seed: 1
similarity time:  161.9119999408722
MAP: 0.13729110280673482
nDCG: 0.289487553414

seed: 2
similarity time:  142.19200015068054
MAP: 0.13075629364057956
nDCG: 0.269584632975

seed: 3
similarity time:  172.69200015068054
MAP: 0.13426807434952176
nDCG: 0.291898533734

seed: 4
similarity time:  141.3510000705719
MAP: 0.13127186311128997
nDCG: 0.271995830929

seed: 5
similarity time:  139.9500002861023
MAP: 0.13865997054371132
nDCG: 0.284499770374

seed: 6
similarity time:  141.6190001964569
MAP: 0.12681425648750969
nDCG: 0.275090256543

seed: 7
similarity time:  133.19999980926514
MAP: 0.12434432015813864
nDCG: 0.261795457316

seed: 8
similarity time:  137.21299982070923
MAP: 0.13026761269351653
nDCG: 0.272355668592

seed: 9
similarity time:  141.07800006866455
MAP: 0.13128755208197948
nDCG: 0.274356030634



#### According several testing by randomly selecting 10 times (seed 0-10), random seed 5 always gets best performance on both MAP and nDCG. Hence folllwing leader sizes just show resultes where random seed=5

In [12]:
leader_size=[70,80,90,100]
for i in leader_size:
    print('leader size:',i)
    cluster=pre_clustering(d_tfidf,5,i)
    result=cosSimilarityImp(q_tfidf,cluster[0], cluster[1])
    print('MAP:',MAP(result)) 
    print('nDCG:',mean_ndcg1(result))
    print()

leader size: 70
similarity time:  141.71440076828003
MAP: 0.13805103992517584
nDCG: 0.284927751445

leader size: 80
similarity time:  121.0858006477356
MAP: 0.13905014154891845
nDCG: 0.281681723324

leader size: 90
similarity time:  113.0476005077362
MAP: 0.14044409830472898
nDCG: 0.283476843559

leader size: 100
similarity time:  105.67640042304993
MAP: 0.143367708954208
nDCG: 0.285419378428



In [13]:
leader_size=[110,120,130,140,150,160,170,180,190,200]
for i in leader_size:
    print('leader size:',i)
    cluster=pre_clustering(d_tfidf,5,i)
    result=cosSimilarityImp(q_tfidf,cluster[0], cluster[1])
    print('MAP:',MAP(result)) 
    print('nDCG:',mean_ndcg1(result))
    print()

leader size: 110
similarity time:  103.31860041618347
MAP: 0.1470512678039042
nDCG: 0.287821564374

leader size: 120
similarity time:  91.81780076026917
MAP: 0.14466108633093622
nDCG: 0.27836094075

leader size: 130
similarity time:  89.41680073738098
MAP: 0.14875273281095916
nDCG: 0.285419812568

leader size: 140
similarity time:  85.92140054702759
MAP: 0.14383869258857993
nDCG: 0.278566136649

leader size: 150
similarity time:  94.16780018806458
MAP: 0.14646628526442337
nDCG: 0.280998590601

leader size: 160
similarity time:  82.15600061416626
MAP: 0.1485848039624784
nDCG: 0.281915978697

leader size: 170
similarity time:  85.99080014228821
MAP: 0.15444079854033713
nDCG: 0.287963616946

leader size: 180
similarity time:  77.13320064544678
MAP: 0.15613003085810598
nDCG: 0.288534829529

leader size: 190
similarity time:  76.49020051956177
MAP: 0.15754416557508139
nDCG: 0.290814047674

leader size: 200
similarity time:  75.59540033340454
MAP: 0.1586135731121029
nDCG: 0.291161126485



In [14]:
leader_size=[210,220,230,240,250,260,270,280,290,300]
for i in leader_size:
    print('leader size:',i)
    cluster=pre_clustering(d_tfidf,5,i)
    result=cosSimilarityImp(q_tfidf,cluster[0], cluster[1])
    print('MAP:',MAP(result)) 
    print('nDCG:',mean_ndcg1(result))
    print()

leader size: 210
similarity time:  74.89780020713806
MAP: 0.1582693973645075
nDCG: 0.289886740245

leader size: 220
similarity time:  72.72300028800964
MAP: 0.15804443930767945
nDCG: 0.287432426277

leader size: 230
similarity time:  71.86300039291382
MAP: 0.15887640931101688
nDCG: 0.28734083912

leader size: 240
similarity time:  69.05020046234131
MAP: 0.1617518084002558
nDCG: 0.290836234403

leader size: 250
similarity time:  73.5400002002716
MAP: 0.16201863658825524
nDCG: 0.292049485657

leader size: 260
similarity time:  72.32980012893677
MAP: 0.1634345442103171
nDCG: 0.292123967807

leader size: 270
similarity time:  71.19659996032715
MAP: 0.1642988781011992
nDCG: 0.294825693516

leader size: 280
similarity time:  71.55020046234131
MAP: 0.16372902737635864
nDCG: 0.295395115316

leader size: 290
similarity time:  67.16999983787537
MAP: 0.16236967730495142
nDCG: 0.286064486353

leader size: 300
similarity time:  67.04900002479553
MAP: 0.16370248149143202
nDCG: 0.286583929385



In [15]:
leader_size=[310,320,330,340,350,360,370,380,390,400]
for i in leader_size:
    print('leader size:',i)
    cluster=pre_clustering(d_tfidf,5,i)
    result=cosSimilarityImp(q_tfidf,cluster[0], cluster[1])
    print('MAP:',MAP(result)) 
    print('nDCG:',mean_ndcg1(result))
    print()

leader size: 310
similarity time:  66.7170000076294
MAP: 0.16317170324500974
nDCG: 0.285673307332

leader size: 320
similarity time:  67.15899991989136
MAP: 0.1620768555177003
nDCG: 0.284783203314

leader size: 330
similarity time:  72.75140023231506
MAP: 0.16238091326897536
nDCG: 0.284918787628

leader size: 340
similarity time:  66.04580044746399
MAP: 0.1610115262904186
nDCG: 0.283931420737

leader size: 350
similarity time:  66.87140035629272
MAP: 0.16092495578398236
nDCG: 0.283896375867

leader size: 360
similarity time:  67.76360034942627
MAP: 0.16220567353906876
nDCG: 0.287335571432

leader size: 370
similarity time:  70.21800017356873
MAP: 0.1624075345563928
nDCG: 0.286754882049

leader size: 380
similarity time:  70.42580032348633
MAP: 0.16258939217715676
nDCG: 0.286265221808

leader size: 390
similarity time:  67.93600010871887
MAP: 0.16406954929865916
nDCG: 0.286451156625

leader size: 400
similarity time:  68.15900039672852
MAP: 0.16374104206812654
nDCG: 0.285050192658



In [16]:
leader_size=[500,600,700,800,900]
for i in leader_size:
    print('leader size:',i)
    cluster=pre_clustering(d_tfidf,5,i)
    result=cosSimilarityImp(q_tfidf,cluster[0], cluster[1])
    print('MAP:',MAP(result)) 
    print('nDCG:',mean_ndcg1(result))
    print()

leader size: 500
similarity time:  73.0264003276825
MAP: 0.1730123956244614
nDCG: 0.297407110871

leader size: 600
similarity time:  77.23060059547424
MAP: 0.18996214310238826
nDCG: 0.314291608949

leader size: 700
similarity time:  90.51080083847046
MAP: 0.1899768685810404
nDCG: 0.31189783066

leader size: 800
similarity time:  92.22960042953491
MAP: 0.18928394248608627
nDCG: 0.311180117751

leader size: 900
similarity time:  101.96960067749023
MAP: 0.1881944374961413
nDCG: 0.303194067397



In [17]:
leader_size=[1000,2000,3000,4000,5000]
for i in leader_size:
    print('leader size:',i)
    cluster=pre_clustering(d_tfidf,5,i)
    result=cosSimilarityImp(q_tfidf,cluster[0], cluster[1])
    print('MAP:',MAP(result)) 
    print('nDCG:',mean_ndcg1(result))
    print()

leader size: 1000
similarity time:  105.09270071983337
MAP: 0.18664047565763092
nDCG: 0.303532309349

leader size: 2000
similarity time:  190.89420104026794
MAP: 0.21611770194783586
nDCG: 0.347978502737

leader size: 3000
similarity time:  290.0717010498047
MAP: 0.23696130530611353
nDCG: 0.374775525992

leader size: 4000
similarity time:  387.80940103530884
MAP: 0.23872536544223358
nDCG: 0.381866978758

leader size: 5000
similarity time:  464.80210280418396
MAP: 0.23737836572173973
nDCG: 0.379830011557



In [18]:
leader_size=[10,20,30,40,50]
for i in leader_size:
    print('leader size:',i)
    cluster=pre_clustering(d_tfidf,5,i)
    result=cosSimilarityImp(q_tfidf,cluster[0], cluster[1])
    print('MAP:',MAP(result)) 
    print('nDCG:',mean_ndcg1(result))
    print()

leader size: 10
similarity time:  831.9860045909882
MAP: 0.12810915416478694
nDCG: 0.342973212443

leader size: 20
similarity time:  347.59200167655945
MAP: 0.13334874095601448
nDCG: 0.321845271296

leader size: 30
similarity time:  252.3130009174347
MAP: 0.13892417795463372
nDCG: 0.311750251888

leader size: 40
similarity time:  230.63340067863464
MAP: 0.14358346922999604
nDCG: 0.309598759347

leader size: 50
similarity time:  198.74700093269348
MAP: 0.14040385718822582
nDCG: 0.291477312661

