In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd

import os
import gensim

import warnings
warnings.filterwarnings("ignore")

In [2]:
q_tr=pd.read_table('queries.doctrain.tsv', header=None)
q_tr.columns=['query_id','query_text']

#long queries
q_tr['len']=q_tr['query_text'].apply(lambda x: 1 if len(x) > 110 else np.nan)

#short queries
#q_tr['len']=q_tr['query_text'].apply(lambda x: 1 if len(x) > 25 and len(x) < 35 else np.nan)

q_tr=q_tr.dropna()
queries=q_tr.reset_index(drop=True).tail(400)

In [3]:
top100=pd.read_table('msmarco-doctrain-top100', delimiter=' ', header=None)
top100.columns=['query_id','f2','doc_id','rank','f5','f6']
ranked100=top100[top100['query_id'].isin(queries['query_id'].unique())].reset_index(drop=True)

In [4]:
rel=list(range(1,51))
ranked100['rel']=ranked100['rank'].apply(lambda x: 1 if x in rel else np.nan)
related_top50=ranked100.dropna()

In [5]:
all_texts=dd.read_table('msmarco-docs.tsv', blocksize=100e6, header=None)
all_texts.columns=['doc_id','f2','title','text']

In [6]:
def create_corpus(res):
    unique_doc_id=res['doc_id'].unique()
    condition=all_texts['doc_id'].isin(unique_doc_id)
    corpus=all_texts[condition].reset_index(drop=True)
    corpus=corpus.drop(columns='f2')
    return corpus

training_corpus=create_corpus(related_top50).compute()

In [7]:
data = training_corpus.values.tolist()
cleaned_data = [x for x in data if str(x[2]) != 'nan']
ids = [x[0] for x in cleaned_data]
texts = [x[2] for x in cleaned_data]
enum_data = enumerate(texts)

In [8]:
def read_corpus(enum_data):
    for i, line in enum_data:
        tokens = gensim.utils.simple_preprocess(line)
        yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [9]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=70, min_count=3, epochs=40)

In [10]:
corpus = list(read_corpus(enum_data))
model.build_vocab(corpus)

In [11]:
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [12]:
def get_most_similar(query):
    inferred_vector = model.infer_vector(gensim.utils.simple_preprocess(query))
    return model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

top5Example = [(ids[x[0]], texts[x[0]]) for x in get_most_similar('which of the following were terms of the treaty of versailles?germany was occupied by allied troops.germany paid reparations.germany accepted sole responsibility for world war i.german territory was reduced in size.')[:5]]
top5Example

[('D3135781',
  'Question and answer Q: The microscopic organisms at the base of the marine food chain are known as A. plant life. B. terrestrial animals. C. primitive life. D. phytoplankton. A: The microscopic organisms at the base of the marine food chain are known as Phytoplankton. Get an answer Search for an answer or ask Weegy (Free)The microscopic organisms at the base of the marine food chain are known as A. plant life. B. terrestrial animals. C. primitive life. D. phytoplankton. '),
 ('D3135779',
  'High School Biology 5 points The microscopic organisms at the base of the marine food chain are known as a. primitive life. b. plant life. c. terrestrial animals. d. phytoplankton. Ask for details Follow Reportby Bosq Kemilosbeccarl 02/26/2016Only registered members have access to verified answers Join now Answers Brainly User They are known as phytoplankton. Comments Report4.8 12 votes Thanks 18 '),
 ('D2854281',
  'In colonial America, an aim of British mercantile policy was to fo

In [13]:
def get_average_precision(query_id):
    query = queries['query_text'].loc[queries['query_id'] == query_id].values[0]
    most_similar = get_most_similar(query)
    top20 = [ids[x[0]] for x in most_similar[:20]]

    related_docs = related_top50['doc_id'].loc[related_top50['query_id'] == query_id].values
    is_related = [int(x in related_docs) for x in top20]

    precision = []
    for i in range(0,20):
        if is_related[i]:
            precision.append(np.sum(is_related[:i+1])/(i + 1))
    return np.mean(precision)

In [14]:
MAP=queries['query_id'].apply(lambda x: get_average_precision(x)).mean()
MAP

0.5593255918600186