# Time Testing for all models

### tf, df, dictionary

In [1]:
from collections import Counter

def dic_(allDocument): # alldocument=collection: list
    long_str = " ".join(allDocument)
    b=list(set(list(long_str.split())))
    return sorted(b)

def tf_(doc_que):
    counts = Counter(list(doc_que.split()))
    return dict(counts)

def df_(term, allDocuments):  # df: no. of occurance of a term in whole collection
    dic=dict.fromkeys(term, 0)
    for i in allDocuments:
        n=0
        for word in term:
            if word in i.split():         
                dic[word]=dic[word]+1
            n+=1  
    return dic  

### tf-idf

In [2]:
import math

def tfidf_(query_s,c,df,allDocument): # query_s: Series (df['col']), allDocument:list
    
    all_tfidf=[]
    all_tf=[]
    for query in query_s:
        q_tf=tf_(query)
        all_tf.append(q_tf)
        mac=q_tf.values()
        max_tf=max(q_tf.values())
        tfidf=[]
        for word in c:
            if word not in q_tf: # query item does not shown in doc collection
                    value=0
            else:
                value=(1+math.log10(q_tf.get(word)))/(1+math.log10(max_tf))*math.log10(len(allDocument)/df.get(word))
           
            tfidf.append(value)
        all_tfidf.append(tfidf)
    return np.array(all_tfidf), all_tf #vector

### cosine similarity

In [3]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a,b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

### Import Data

In [4]:
import pandas as pd
import numpy as np

df_doc=pd.read_excel('df_doc_pre.xlsx')
df_query=pd.read_excel('df_query_pre.xlsx')

doc_col=df_doc.loc[: , "Text_tok"].tolist() #collection: list

dic_df=pd.read_csv('dictionary.csv')
doc_fre_df=pd.read_csv('document_frequency.csv')
dic=dic_df['0'].tolist()
doc_fre=pd.Series(doc_fre_df.df.values,index=doc_fre_df.term).to_dict()

### .tfidf()

In [5]:
q_tfidf,q_tf=tfidf_(df_query["Text_tok"],dic,doc_fre,doc_col) # query tf-idf
d_tfidf,d_tf=tfidf_(df_doc["Text_tok"],dic,doc_fre,doc_col) # doc tf-idf

In [6]:
# d_tfidf=tfidf_(df_doc["Text_tok"],dic,doc_fre,doc_col) # doc tf-idf
# q_tfidf=tfidf_(df_query["Text_tok"],dic,doc_fre,doc_col) # query tf-idf

###  Sample query

In [7]:
def random_query(ran_seed,q_tfidf):
    np.random.seed(seed=ran_seed)
    idx = np.random.randint(len(q_tfidf), size=1)
    q_random=q_tfidf[idx]
    return q_random

In [8]:
def random_query_TI(ran_seed,df_query):
    np.random.seed(seed=ran_seed)
    idx = np.random.randint(len(q_tfidf), size=1)
    q_random=df_query.loc[idx]
    return q_random

## Basic Model

In [9]:
import math
import operator
import time

def cosSimilarityTime_Basic(q_tfidf,d_tfidf):
    
    start = time.time()
    sim_all={}
    for q in range(len(q_tfidf)):
        sim={}
        sim_all[q]=sim

        for d in range(len(d_tfidf)):
            sim[d]=cosine_similarity(d_tfidf[d],q_tfidf[q])
    
    df_similarity= pd.DataFrame(data=sim_all)
    df_similarity['d_index']=df_similarity.index
    df1 = df_similarity.reset_index(drop=True)
    df2 = pd.melt(df1, id_vars=["d_index"], var_name="q_index", value_name="similarity")
    df2=df2.sort_values(by=['q_index','similarity'],ascending=[True,False])

    df_query['q_index']=df_query.index
    df_doc['d_index']=df_doc.index
    df2=pd.merge(df2,df_query[['Query id','q_index']],on='q_index',how='left')
    df2=pd.merge(df2,df_doc[['Doc id','d_index']],on='d_index',how='left')
    df_similarity_final=df2[['Query id', 'Doc id', 'similarity']]
    df_similarity_final=df_similarity_final.dropna()
    
    end = time.time()

    return end - start

## Tiered Index

In [10]:
from pandas import Series, DataFrame
import time

a=pd.DataFrame(d_tf).T

def top_N(K, dataframe, tiers, n, term_key):
# K: the number of top documents
    quantile=1
    for i in range(tiers):
        quantile=quantile-1/tiers
        if quantile<0:
            quantile=0
        quantile_value_list=dataframe.quantile(quantile, axis=1)
        s=dataframe.sub(quantile_value_list,axis=0)
        s=s.where(s>0,0) 
        s=s.where(s<=0,1) 
        columns_sum=s.sum()
        top_doc_list=[i for i in range(len(columns_sum)) if columns_sum[i] >= n*len(term_key)]

        if len(top_doc_list)>=K:
            break
    return top_doc_list

def tiered_index(K, query_tf, tiers, n):
    
    top_list=[]
    for i in range(len(query_tf)):
        term_key=list(query_tf[i].keys())
        c=pd.DataFrame(a,index=term_key).fillna(value=0)
        List=top_N(K, c, tiers, n, term_key)
        top_list.append(List)
   
    return top_list

In [11]:
def cosSimilarityTime_TieredIndex(q_tfidf,d_tfidf,top_k_list):
    start = time.time()
    sim_all={}
    for q in range(len(q_tfidf)):
        sim={}
        sim_all[q]=sim
        new_d_tfidf=[]
        for index in top_k_list[q]:
            new_d_tfidf.append(d_tfidf[index])

        for d in range(len(new_d_tfidf)):
            sim[d+1]=cosine_similarity(new_d_tfidf[d],q_tfidf[q])
    
    
    df_similarity= pd.DataFrame(data=sim_all)
    df_similarity['d_index1']=df_similarity.index
    df1 = df_similarity.reset_index(drop=True)
    df2 = pd.melt(df1, id_vars=["d_index1"], var_name="q_index", value_name="similarity")
    df2=df2.dropna()
    df2['d_index']=sum(top_k_list, [])
    df2 = df2[["d_index","q_index","similarity"]]
    df2=df2.sort_values(by=['q_index','similarity'],ascending=[True,False])
    df_query['q_index']=df_query.index
    df_doc['d_index']=df_doc.index
    df2=pd.merge(df2,df_query[['Query id','q_index']],on='q_index',how='left')
    df2=pd.merge(df2,df_doc[['Doc id','d_index']],on='d_index',how='left')
    df_similarity_final=df2[['Query id', 'Doc id', 'similarity']]
    
    end = time.time()
    
    return end-start

## SVM with Pre-Clustering

In [12]:
import numpy as np

def initialize_centroids(points, k, ran_seed):
    np.random.seed(seed=ran_seed)
    centroids = points.copy()
    np.random.shuffle(centroids)
    return centroids[:k]

def closest_centroid(points, centroids):
    cluster_label=[]
    for doc in range(len(points)):
        similarity=[]
        for l in range(len(centroids)):
            sim=cosine_similarity(points[doc],centroids[l])
            similarity.append(sim)
        cluster_label.append(np.argmax(similarity))
    return np.asarray(cluster_label)

def move_centroids(points, closest, centroids):
    mean=[]
    for k in range(centroids.shape[0]):
        if len(d_tfidf[closest==k])==0:
            mean.append([0.5])
        else:
            mean.append(d_tfidf[closest==k].mean(axis=0))
        move_centroid= np.zeros([len(mean),len(max(mean,key = lambda x: len(x)))])
        for i,j in enumerate(mean):
            move_centroid[i][0:len(j)] = j
    return move_centroid
    #return np.array([points[closest==k].mean(axis=0) for k in range(centroids.shape[0])])
    
def k_means(points, k,ran_seed):
    leaders=initialize_centroids(points, k, ran_seed)
    move=True
    while move != False:
        leaders_new=move_centroids(points, closest_centroid(points, leaders), leaders)
        for i in range(len(leaders)):
            if cosine_similarity(leaders[i],leaders_new[i])>0.99999:
                move=False
            else:
                move=True
        leaders=leaders_new
    return leaders, closest_centroid(points, leaders) # cluster label

def pre_clustering(doc,k,ran_seed):
    cluster=k_means(doc, k, ran_seed)
    leader=cluster[0]
    cluster_label=cluster[1]
    df_d_tfidf = pd.DataFrame(doc)
    df_d_tfidf['label']=cluster_label
    return leader, df_d_tfidf

In [13]:
import math
import operator
import time

def cosSimilarityTime_kmeans(q_tfidf,leader,df_d_tfidf):
    
    start = time.time()
    sim_all={}
    for q in range(len(q_tfidf)):
        leader_sim={}
        sim={}
        sim_all[q]=sim

        for l in range(len(leader)):
            leader_sim[l]=cosine_similarity(leader[l],q_tfidf[q])

        closest_cluster_label=max(leader_sim.items(), key=operator.itemgetter(1))[0]
        closest_cluster=df_d_tfidf.loc[df_d_tfidf['label'] == closest_cluster_label].drop(['label'],axis=1)

        for d in closest_cluster.index:
            sim[d]=cosine_similarity(closest_cluster.loc[d],q_tfidf[q])


    df_similarity= pd.DataFrame(data=sim_all)
    df_similarity['d_index']=df_similarity.index
    df1 = df_similarity.reset_index(drop=True)
    df2 = pd.melt(df1, id_vars=["d_index"], var_name="q_index", value_name="similarity")
    df2=df2.sort_values(by=['q_index','similarity'],ascending=[True,False])

    df_query['q_index']=df_query.index
    df_doc['d_index']=df_doc.index
    df2=pd.merge(df2,df_query[['Query id','q_index']],on='q_index',how='left')
    df2=pd.merge(df2,df_doc[['Doc id','d_index']],on='d_index',how='left')
    df_similarity_final=df2[['Query id', 'Doc id', 'similarity']]
    df_similarity_final=df_similarity_final.dropna()
    
    end = time.time()

    return end - start

In [14]:
cluster=pre_clustering(d_tfidf,300,0)

## Random Projection

In [15]:
df_doc_rp=pd.read_csv('d_tfidf2000.csv')
df_query_rp=pd.read_csv('q_tfidf2000.csv')

df_doc_rp=df_doc_rp.drop(['Unnamed: 0'],axis=1)
df_query_rp=df_query_rp.drop(['Unnamed: 0'],axis=1)

d_tfidf_rp=df_doc_rp.as_matrix(columns=None)
q_tfidf_rp=df_query_rp.as_matrix(columns=None)

#df_query_origin=pd.read_excel('df_query_pre.xlsx')
df_query_rp=pd.merge(df_query_rp,df_query[['Query id']], left_index=True, right_index=True,how='left')
#df_doc_origin=pd.read_excel('df_doc_pre.xlsx')
df_doc_rp=pd.merge(df_doc_rp,df_doc[['Doc id']], left_index=True, right_index=True,how='left')

In [16]:
import time

def hammingSimilarity(a,b):
    similarity = np.sum(a == b)
    return similarity

def hammingSimilarityTime_RP(q_tfidf,d_tfidf):
    
    start = time.time()

    sim_all={}
    for q in range(len(q_tfidf)):
        sim={}
        sim_all[q]=sim
    
        for d in range(len(d_tfidf)):
            sim[d]=hammingSimilarity(d_tfidf[d],q_tfidf[q])
        
    
    df_similarity= pd.DataFrame(data=sim_all)
    df_similarity['d_index']=df_similarity.index
    df1 = df_similarity.reset_index(drop=True)
    df2 = pd.melt(df1, id_vars=["d_index"], var_name="q_index", value_name="similarity")
    df2=df2.sort_values(by=['q_index','similarity'],ascending=[True,False])
    df_query_rp['q_index']=df_query_rp.index
    df_doc_rp['d_index']=df_doc_rp.index
    df2=pd.merge(df2,df_query_rp[['Query id','q_index']],on='q_index',how='left')
    df2=pd.merge(df2,df_doc_rp[['Doc id','d_index']],on='d_index',how='left')
    df_similarity_final=df2[['Query id', 'Doc id', 'similarity']]
    
    end = time.time()

    return end - start

## Time Summary

In [18]:
import numpy as np

print ("Average ranking time per query:")

# Basic Model
seed=list(range(0,200))
time_pool=[]
for i in seed:
    qequey_for_time=random_query(i,q_tfidf)
    query_time=cosSimilarityTime_Basic(qequey_for_time,d_tfidf)
    time_pool.append(query_time)
    
print ("Basic Model:",np.mean(time_pool))

# Tiered Index
seed=list(range(0,200))
time_pool=[]
for i in seed:
    qequey_for_time=random_query_TI(i,df_query)
    q_tfidf,q_tf=tfidf_(qequey_for_time["Text_tok"],dic,doc_fre,doc_col) # query tf-idf  
    
    top_k_list=tiered_index(50, q_tf, 3, 0.06)
    query_time=cosSimilarityTime_TieredIndex(q_tfidf,d_tfidf,top_k_list)
    time_pool.append(query_time)
    
print ("Tiered Index:",np.mean(time_pool))

# Pre-Clustering
seed=list(range(0,200))
time_pool=[]
for i in seed:
    qequey_for_time=random_query(i,q_tfidf)
    query_time=cosSimilarityTime_kmeans(qequey_for_time,cluster[0],cluster[1])
    time_pool.append(query_time)
    
print ("Pre-Clustering:",np.mean(time_pool))

# Random Projection
seed=list(range(0,200))
time_pool=[]
for i in seed:
    qequey_for_time=random_query(i,q_tfidf_rp)
    query_time=hammingSimilarityTime_RP(qequey_for_time,q_tfidf_rp)
    time_pool.append(query_time)
    
print ("Random Projection:",np.mean(time_pool))

Average ranking time per query:
Basic Model: 0.261049962044
Tiered Index: 0.0296452057362
Pre-Clustering: 0.0345848608017
Random Projection: 0.0918842077255
