In [None]:
import copy
import logging
import pickle
import sys
import time

import gensim
import numpy as np
import pyndri
import pyndri.compat
import scipy.spatial.distance as ssd
from nltk.stem.porter import PorterStemmer

In [None]:
# Train a word2vec model on the indexed collection and store it on disk
def calculate_word2vec():
    """Train a gensim Word2Vec model from the global ``index`` and save it.

    The sentences come from ``pyndri.compat.IndriSentences`` built over the
    module-level ``index`` and the dictionary extracted from it. The model is
    trained with ``min_count=1`` and written to ``'W2V/w2v'``. Nothing is
    returned.
    """
    corpus = pyndri.compat.IndriSentences(index, pyndri.extract_dictionary(index))
    gensim.models.Word2Vec(corpus, min_count=1).save('W2V/w2v')

# Load the word2vec model that calculate_word2vec stored on disk
def load_word2vec():
    """Return the gensim Word2Vec model loaded from ``'W2V/w2v'``."""
    return gensim.models.Word2Vec.load('W2V/w2v')

In [None]:
# Look up the vector of a single word, falling back to its Porter stem
def get_vector_for_word(word,word_vectors):
    """Return the vector representation of ``word``.

    INPUT
    - word: the term to look up, like 'joris'; it is passed through ``str``
      before the lookup
    - word_vectors: a mapping that returns the vector of a word when indexed
    RETURNS
    - ``word_vectors[word]`` when that lookup succeeds; if it is not
      available, the vector of the Porter-stemmed word instead
    RAISES
    - KeyError from the second lookup is not caught, so a word whose stem is
      missing as well propagates that error to the caller
    """
    token = str(word)
    try:
        return word_vectors[token]
    except KeyError:
        # Surface form is unknown: retry with the stemmed form.
        stem = PorterStemmer().stem(token)
        return word_vectors[stem]
# Example: get_vector_for_word('panamas', word_vectors)

In [None]:
# Function that determines the average vector of a text
def get_average_vector(text,word_vectors):
    """Return the element-wise mean of the word vectors of ``text``.

    INPUT
    - text: a sequence of strings, for instance ['python','wizard']
    - word_vectors: a word_vectors model (indexed by word via
      get_vector_for_word)
    RETURNS
    - the average vector of all terms in ``text``. For an empty ``text`` a
      zero vector is returned whose length is ``word_vectors.vector_size``
      when the model exposes that attribute, and 100 otherwise.
    """
    if len(text) == 0: # if the document is empty
        # Follow the model's dimensionality instead of assuming 100, so the
        # zero vector stays shape-compatible with real averages; objects
        # without `vector_size` (e.g. a plain dict) keep the old size of 100.
        return np.zeros(getattr(word_vectors, 'vector_size', 100))

    terms = iter(text)
    # Copy the first vector: the in-place accumulation below must never write
    # into the embedding stored inside the model.
    total = copy.copy(get_vector_for_word(next(terms), word_vectors))
    for term in terms:
        total += get_vector_for_word(term, word_vectors)
    return total / len(text)

In [None]:
# Preprocessing step: average word vector of every document in the collection
def get_doc2vec():
    """Build a dict that maps a document number to its average word vector.

    Walks the ids 1..num_documents (inclusive), fetches each document with
    ``get_document(int2ext_ids[i])`` and averages its word vectors through
    ``get_average_vector`` using the global ``word_vectors``. As a result
    ``dict[1]`` is the average word vector of document 1.
    """
    return {
        doc_id: get_average_vector(get_document(int2ext_ids[doc_id]), word_vectors)
        for doc_id in range(1, num_documents + 1)
    }

In [None]:
# Compute the doc2vec dictionary and write it to a pickle file
def save_doc2vec():
    """Pickle the result of ``get_doc2vec()`` to 'doc2vec/doc2vec_dict.pickle'.

    The dictionary is computed before the file is opened, then dumped with
    ``pickle.HIGHEST_PROTOCOL``. Nothing is returned.
    """
    vectors_by_doc = get_doc2vec()
    with open('doc2vec/doc2vec_dict.pickle', 'wb') as out_file:
        pickle.dump(vectors_by_doc, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Function that loads the pickle doc2vec dictionary
def load_doc2vec():
    """Load and return the object pickled in 'doc2vec/doc2vec_dict.pickle'.

    RETURNS
    - the unpickled object; save_doc2vec stores the dictionary produced by
      get_doc2vec in this file
    RAISES
    - whatever ``open`` / ``pickle.load`` raise, e.g. when the file is missing
    """
    # NOTE: pickle.load can execute arbitrary code; only load a file that this
    # notebook wrote itself, never one from an untrusted source.
    # The `with` block closes the handle even when unpickling fails.
    with open('doc2vec/doc2vec_dict.pickle','rb') as handle:
        return pickle.load(handle)

# Load the precomputed document vectors into the module-level `doc2vec` when
# this cell runs. This reads 'doc2vec/doc2vec_dict.pickle', the file that
# save_doc2vec writes, so that file must already exist.
doc2vec = load_doc2vec()

In [None]:
# Function that removes punctuation, and lowers a query
def remove_punctuation(query,token2id):
    """Normalise a tokenised query: strip punctuation, lowercase, expand aliases.

    INPUT
    - query: a list of strings, for instance ['Airbus','Subsidies']
    - token2id: a dictionary converting words to ids; only its keys are used,
      to drop words that are not in the vocabulary
    RETURNS
    - the same query lowercased without punctuation, for instance
      ['airbus','subsidies']. Additionally:
      * a lone '&' becomes 'and'; any other lone punctuation mark is dropped
      * 'vs' / 'vs.' become 'versus'
      * 'us', 'U.S.', "U.S.'s" and 'U.' become 'united', 'states'; so does
        every word that reduces to 'us' after cleaning (e.g. 'US')
      * '-', "'" and '/' split a word in two, other punctuation is removed
      * words of length 1 or words missing from token2id are discarded
    """
    punctuation = frozenset('"()&-\'./?')
    # Punctuation that separates two words ("anti-trust" -> "anti", "trust").
    separators = frozenset("-'/")
    us_spellings = ('us', 'U.S.', "U.S.'s", 'U.')

    tokens = []
    for word in query:
        if len(word) == 1 and word in punctuation:
            if word == '&':
                tokens.append('and')
            continue
        if word in ('vs', 'vs.'):
            tokens.append('versus')
            continue
        if word in us_spellings:
            tokens.extend(('united', 'states'))
            continue

        current = ""
        for letter in word:
            if letter not in punctuation:
                current += letter.lower()
            elif letter in separators:
                tokens.append(current)
                current = ""
        # Flush whatever follows the last separator (possibly an empty
        # string, which the vocabulary filter below removes).
        tokens.append(current)

    # Replace every 'us' that only appeared after lowercasing / stripping.
    # Looking it up with list.index would only rewrite the first occurrence.
    expanded = []
    for token in tokens:
        if token == 'us':
            expanded.extend(('united', 'states'))
        else:
            expanded.append(token)

    return [token for token in expanded if token in token2id and len(token) > 1]

In [None]:
# Function that calculates the top1000 document ranking for a query
def average_vector_scores(query_id,token2id,word_vectors):
    """Rank the documents for one query by cosine distance of average vectors.

    INPUT
    - query_id: a query id (integer); the query text is read from the global
      ``queries`` under the key ``str(query_id)``
    - token2id: a dictionary of tokens to IDs, used to clean the query
    - word_vectors: a dictionary of word_vectors
    RETURNS
    - a list of at most 1000 tuples (external_doc_id, score), i.e.
      [(APYYYYY-YY,scoreY),(APXXXXX-XX,scoreX),...]. ``score`` is the cosine
      distance between the average query vector and the average document
      vector taken from the global ``doc2vec``, so a lower score is a better
      match: the first element is the best result, the last the worst.
    """
    query = remove_punctuation(queries[str(query_id)].split(),token2id)
    query_vec = get_average_vector(query,word_vectors)

    ranking = []
    for i in range(1,num_documents+1):
        score = ssd.cosine(query_vec, doc2vec[i])
        ranking.append((int2ext_ids[i], score))

    # The distance may be NaN when one of the vectors is all zeros (which
    # get_average_vector returns for empty text). NaN breaks the ordering
    # list.sort relies on, so such entries are explicitly ranked last.
    ranking.sort(key=lambda pair: (bool(np.isnan(pair[1])), pair[1]))

    return ranking[:1000]