In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

import os
import gensim

import warnings
warnings.filterwarnings("ignore")

In [2]:
q_tr=pd.read_table('queries.doctrain.tsv', header=None)
q_tr.columns=['query_id','query_text']

#long queries
q_tr['len']=q_tr['query_text'].apply(lambda x: 1 if len(x) > 110 else np.nan)

#short queries
#q_tr['len']=q_tr['query_text'].apply(lambda x: 1 if len(x) > 25 and len(x) < 35 else np.nan)

q_tr=q_tr.dropna()
queries=q_tr.reset_index(drop=True).tail(400)

In [3]:
top100=pd.read_table('msmarco-doctrain-top100', delimiter=' ', header=None)
top100.columns=['query_id','f2','doc_id','rank','f5','f6']
ranked100=top100[top100['query_id'].isin(queries['query_id'].unique())].reset_index(drop=True)

In [4]:
rel=list(range(1,51))
ranked100['rel']=ranked100['rank'].apply(lambda x: 1 if x in rel else np.nan)
related_top50=ranked100.dropna()

In [5]:
all_texts=dd.read_table('msmarco-docs.tsv', blocksize=100e6, header=None)
all_texts.columns=['doc_id','f2','title','text']

In [6]:
def create_corpus(res):
    unique_doc_id=res['doc_id'].unique()
    condition=all_texts['doc_id'].isin(unique_doc_id)
    corpus=all_texts[condition].reset_index(drop=True)
    corpus=corpus.drop(columns='f2')
    return corpus

testing_corpus=create_corpus(related_top50).compute()

In [7]:
training_corpus = all_texts.head(35000).values.tolist()
clean_training_corpus = [x for x in training_corpus if str(x[3]) != 'nan']
training_texts = [x[3] for x in clean_training_corpus]

In [8]:
train_data = [gensim.utils.simple_preprocess(x) for x in training_texts]

In [9]:
model = gensim.models.Word2Vec(train_data, size=200, min_count=3, workers=4)

In [10]:
def get_embedding(tokens):
    embeddings = []
    if len(tokens) > 0:
        for token in tokens:
            if token in model.wv.vocab:
                embeddings.append(model.wv.word_vec(token))
            else:
                embeddings.append(np.random.rand(200))
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(200)

In [11]:
testing_corpus['vector']=testing_corpus['text'].apply(lambda x: get_embedding(gensim.utils.simple_preprocess(x)))

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
def ranking_ir(query):

    vector=get_embedding(gensim.utils.simple_preprocess(query))

    documents=testing_corpus[['doc_id', 'text']].copy()
    documents['similarity']=testing_corpus['vector'].apply(lambda x: cosine_similarity(np.array(vector).reshape(1, -1),np.array(x).reshape(1, -1)).item())
    documents.sort_values(by='similarity', ascending=False, inplace=True)
  
    return documents.head(20).reset_index(drop=True)

In [13]:
query_example = 'which of the following were terms of the treaty of versailles?germany was occupied by allied troops.germany paid reparations.germany accepted sole responsibility for world war i.german territory was reduced in size.'
top5_example = ranking_ir(query_example).head(5)
top5_example

Unnamed: 0,doc_id,text,similarity
0,D2483074,40 terms daw2034Chapter 2: Britain and its Col...,0.919253
1,D1984850,52 terms minime1237APUSH Chapter 2: Beginnings...,0.915846
2,D1928899,The presidents during the last quarter ofthe n...,0.913805
3,D1581173,333 terms bayleekivett2Exam 1 Study Guide Lear...,0.911971
4,D3245559,1 Which of the following did NOT shape the cha...,0.905137


In [14]:
def get_average_precision(query_id):
    query = queries['query_text'].loc[queries['query_id'] == query_id].values[0]
    top20 = ranking_ir(query)['doc_id'].values

    related_docs = related_top50['doc_id'].loc[related_top50['query_id'] == query_id].values
    is_related = [int(x in related_docs) for x in top20]

    precision = []
    for i in range(0,20):
        if is_related[i]:
            precision.append(np.sum(is_related[:i+1])/(i + 1))
    return np.mean(precision)

In [15]:
MAP=queries['query_id'].apply(lambda x: get_average_precision(x)).mean()
MAP

0.7120611392442039