In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from collections import defaultdict
import pickle
from google_trans_new import google_trans_new

In [2]:
def queryPreprocess(query, nlp_en, nlp_de, translator):
    """Translate a query to English and German and lemmatize both versions.

    Parameters
    ----------
    query : text handed to ``translator.translate``.
    nlp_en, nlp_de : callables that turn text into an iterable of tokens
        exposing a ``lemma_`` attribute (the notebook passes spaCy pipelines).
    translator : object with ``translate(text, lang_tgt=...)``.

    Returns
    -------
    tuple
        ``(german_tokens, english_tokens)`` -- note the German list comes
        first. Each list holds lower-cased lemmas, keeping only those for
        which ``str.isalnum()`` is true.
    """
    def lemma_tokens(doc):
        # Lower-case every lemma, then drop anything that is not purely alphanumeric.
        lowered = (token.lemma_.lower() for token in doc)
        return [lemma for lemma in lowered if lemma.isalnum()]

    english_text = translator.translate(query, lang_tgt='en')
    german_text = translator.translate(query, lang_tgt='de')

    english_tokens = lemma_tokens(nlp_en(english_text))
    german_tokens = lemma_tokens(nlp_de(german_text))
    return german_tokens, english_tokens

In [3]:
#Get closest decomposed words for a given word from vocab
def closestWords(word, wordMap):
    """Decompose ``word`` into pieces that all occur in ``wordMap``.

    The word is lower-cased first. If the whole word is a key of ``wordMap``
    it is returned as a one-element list; otherwise the word is scanned for
    substrings that are keys of ``wordMap`` and the best-scoring sequence of
    pieces is returned. A sequence is scored as
    ``total characters covered / (number of pieces + 1)``, so long pieces are
    preferred over many short ones. Characters that match nothing are skipped.

    Parameters
    ----------
    word : str
    wordMap : container supporting ``in`` for strings (e.g. a word -> id dict).

    Returns
    -------
    list of str
        Always a list (possibly empty), so callers can iterate over tokens.
    """
    word = word.lower()
    if word in wordMap:
        # Return a list here too: a bare string would be iterated
        # character by character by callers that loop over the result.
        return [word]

    memo = {}

    def recursive(w):
        if w in wordMap:
            return [w]
        if w == "":
            return []
        # Check the memo *before* recursing; passing the recursive call as the
        # default of dict.get() evaluates it every time and is exponential.
        if w in memo:
            return memo[w]

        candidates = [[]]
        for i in range(len(w)):
            # j goes up to len(w) inclusive so that a piece may end on the
            # final character of w (w[i:len(w)]).
            for j in range(i + 1, len(w) + 1):
                piece = w[i:j]
                if piece in wordMap:
                    candidates.append([piece] + recursive(w[j:]))
        memo[w] = max(
            candidates,
            key=lambda parts: sum(len(p) for p in parts) / (len(parts) + 1),
        )
        return memo[w]

    return recursive(word)

def queryIds(queryList, word2id):
    """Map query tokens to vocabulary ids.

    Tokens found in ``word2id`` are mapped directly. Any other token is passed
    to ``closestWords`` and the ids of the returned pieces are used instead.

    Parameters
    ----------
    queryList : iterable of str
    word2id : dict mapping word -> id

    Returns
    -------
    list
        Ids in query order; tokens that cannot be mapped contribute nothing.
    """
    queryIdList = []
    for each in queryList:
        if each in word2id:
            queryIdList.append(word2id[each])
            continue

        closest = closestWords(each, word2id)
        # closestWords may hand back a bare string when the lower-cased token
        # is itself in the vocabulary; wrap it so we do not iterate characters.
        if isinstance(closest, str):
            closest = [closest]
        # Skip anything without an id so no None ends up in the result.
        queryIdList.extend(word2id[tok] for tok in closest if tok in word2id)
    return queryIdList

In [4]:
import math
def retrievalModel(lang, queryList, collection_frequencies, inverted_indices, i2u, mu=200):
    """Score and rank the documents that contain at least one query term.

    For every candidate document ``d`` and query term ``q`` the per-term value
    is ``log2((posting[1] + mu * cf(q) / C) / (posting[2] + mu))`` where
    ``cf(q)`` is ``collection_frequencies[q]`` and ``C`` is the sum of all
    collection frequencies; a document without a posting for ``q`` contributes
    only the ``mu * cf(q) / C`` part in the numerator. The document score is
    the sum over query terms. This has the form of Dirichlet-smoothed query
    likelihood, assuming ``posting[1]`` is the term frequency and
    ``posting[2]`` the document length -- TODO confirm against the indexer.

    Parameters
    ----------
    lang : str
        When equal to ``"de"``, characters 8-9 of each URL are replaced by
        ``"de"`` (e.g. ``https://en.`` -> ``https://de.``).
    queryList : list of term ids. Ids without postings or without a positive
        collection frequency are ignored.
    collection_frequencies : dict term id -> number.
    inverted_indices : dict term id -> iterable of postings, each indexable as
        ``posting[0]`` (document id), ``posting[1]``, ``posting[2]``.
    i2u : dict document id -> URL string.
    mu : smoothing weight; should be positive, otherwise a document lacking a
        query term produces ``log(0)``.

    Returns
    -------
    list of (url, score) tuples sorted by score, best first. Empty when no
    query term can be scored.
    """
    c_mod = sum(collection_frequencies.values())
    if c_mod <= 0:
        return []

    # Keep only terms we can actually score, remembering each term's postings
    # and its own smoothing mass mu * P(term | collection).
    term_postings = []
    smoothing = []
    for id_ in queryList:
        postings = inverted_indices.get(id_)
        frequency = collection_frequencies.get(id_)
        if not postings or not frequency:
            continue
        term_postings.append(postings)
        smoothing.append(mu * frequency / c_mod)
    if not term_postings:
        return []

    candidates = {}
    candidate_denominators = {}
    for postings in term_postings:
        for doc in postings:
            if doc[0] not in candidates:
                # One slot per query term, each starting at that term's smoothing mass.
                candidates[doc[0]] = list(smoothing)
            candidate_denominators[doc[0]] = doc[2] + mu

    # Add the per-document value of term i to slot i, using that term's own postings.
    for i, postings in enumerate(term_postings):
        for doc in postings:
            candidates[doc[0]][i] = candidates[doc[0]][i] + doc[1]

    candidate_scores = {}
    for key, numerators in candidates.items():
        denom = candidate_denominators[key]
        url = i2u.get(key)
        if lang == "de":
            url = url[:8] + "de" + url[10:]
        candidate_scores[url] = sum(math.log(value / denom, 2) for value in numerators)

    return sorted(candidate_scores.items(), reverse=True, key=lambda x: x[1])
    
    
    

In [35]:
def getFinalRankedList(query, nlp_en, nlp_de, translator, indexMap, mu=500, top_k=20):
    """Run the query against the German and English indices and merge the results.

    The query is preprocessed with ``queryPreprocess``, each token list is
    converted to ids with ``queryIds`` and ranked with ``retrievalModel``.
    The best ``top_k`` hits of each language are concatenated, re-sorted by
    score (descending) and cut to ``top_k``.

    Parameters
    ----------
    query : str
    nlp_en, nlp_de, translator : forwarded to ``queryPreprocess``.
    indexMap : dict with the keys ``word2id_de``, ``word2id_eng``,
        ``collection_frequencies_de``, ``collection_frequencies_eng``,
        ``inverted_indices_de``, ``inverted_indices_eng``, ``i2u_de``,
        ``i2u_eng``.
    mu : smoothing weight forwarded to ``retrievalModel`` (default 500, the
        value previously hard-coded here).
    top_k : number of results kept per language and in the merged list
        (default 20, the value previously hard-coded here).

    Returns
    -------
    list of (url, score) tuples, best first, at most ``top_k`` long.
    """
    word2id_de = indexMap.get('word2id_de')
    word2id_eng = indexMap.get("word2id_eng")
    collection_frequencies_de = indexMap.get('collection_frequencies_de')
    collection_frequencies_eng = indexMap.get("collection_frequencies_eng")
    inverted_indices_de = indexMap.get("inverted_indices_de")
    inverted_indices_eng = indexMap.get("inverted_indices_eng")
    i2u_de = indexMap.get("i2u_de")
    i2u_eng = indexMap.get("i2u_eng")

    # queryPreprocess returns the German tokens first, then the English ones.
    germanQuery, englishQuery = queryPreprocess(query, nlp_en, nlp_de, translator)
    ranked_list1 = retrievalModel("de", queryIds(germanQuery, word2id_de), collection_frequencies_de, inverted_indices_de, i2u_de, mu)
    ranked_list2 = retrievalModel("en", queryIds(englishQuery, word2id_eng), collection_frequencies_eng, inverted_indices_eng, i2u_eng, mu)

    final_ranked_list = ranked_list1[:top_k] + ranked_list2[:top_k]
    final_ranked_list.sort(reverse=True, key=lambda x: x[1])
    return final_ranked_list[:top_k]

In [6]:
# Load the English and German spaCy pipelines (in that order) and create the
# translator object that queryPreprocess calls .translate() on.
nlp_en, nlp_de = (
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "de_core_news_sm")
)
translator = google_trans_new.google_translator()

In [7]:
# Load the English index artefacts. Each file is opened in a `with` block so
# the handle is closed as soon as the object has been read.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("inverted_indices_eng_v2.pickle", "rb") as f:
    inverted_indices_eng = pickle.load(f)
with open("collection_frequencies_eng_v2.pickle", "rb") as f:
    collection_frequencies_eng = pickle.load(f)
with open("word2id_eng_v2.pickle", "rb") as f:
    word2id_eng = pickle.load(f)
with open("id2word_eng_v2.pickle", "rb") as f:
    id2word_eng = pickle.load(f)
with open("u2i_eng_v2.pickle", "rb") as f:
    u2i_eng = pickle.load(f)
with open("i2u_eng_v2.pickle", "rb") as f:
    i2u_eng = pickle.load(f)

In [8]:
# Load the German index artefacts one file at a time, in a fixed order, and
# bind them to their notebook-level names afterwards.
_loaded_de = []
for _pickle_path in (
    "inverted_indices_de_v2.pickle",
    "collection_frequencies_de_v2.pickle",
    "word2id_de_v2.pickle",
    "id2word_de_v2.pickle",
    "u2i_de_v2.pickle",
    "i2u_de_v2.pickle",
):
    with open(_pickle_path, "rb") as f:
        _loaded_de.append(pickle.load(f))

(
    inverted_indices_de,
    collection_frequencies_de,
    word2id_de,
    id2word_de,
    u2i_de,
    i2u_de,
) = _loaded_de
# Drop the temporaries so only the six index names remain defined.
del _loaded_de, _pickle_path

In [9]:
# Bundle the structures getFinalRankedList looks up by key: German entries
# first, then English ones.
indexMap = {
    "inverted_indices_de": inverted_indices_de,
    "collection_frequencies_de": collection_frequencies_de,
    "word2id_de": word2id_de,
    "i2u_de": i2u_de,
    "inverted_indices_eng": inverted_indices_eng,
    "collection_frequencies_eng": collection_frequencies_eng,
    "word2id_eng": word2id_eng,
    "i2u_eng": i2u_eng,
}

In [38]:
# Example query; the returned (url, score) list is the cell's displayed output.
getFinalRankedList(
    query="metrics",
    nlp_en=nlp_en,
    nlp_de=nlp_de,
    translator=translator,
    indexMap=indexMap,
)

[('https://en.wikipedia.org/wiki/Metric"', -5.271837972531778),
 ('https://en.wikipedia.org/wiki/Diffeomorphometry"', -5.875483937107808),
 ('https://en.wikipedia.org/wiki/Metric tensor (general relativity)"',
  -6.08490650544641),
 ('https://en.wikipedia.org/wiki/Weyl metrics"', -6.113600533061508),
 ('https://en.wikipedia.org/wiki/Łukaszyk–Karmowski metric"',
  -6.134071408205713),
 ('https://en.wikipedia.org/wiki/Vaidya metric"', -6.224816022580784),
 ('https://en.wikipedia.org/wiki/Metric dimension (graph theory)"',
  -6.230188171253995),
 ('https://en.wikipedia.org/wiki/Probabilistic metric space"',
  -6.277158364152808),
 ('https://en.wikipedia.org/wiki/Fubini–Study metric"', -6.33042600957874),
 ('https://en.wikipedia.org/wiki/Raising and lowering indices"',
  -6.37909341899262),
 ('https://en.wikipedia.org/wiki/Riemannian manifold"', -6.408979238147976),
 ('https://en.wikipedia.org/wiki/Metric tensor"', -6.434048640652896),
 ('https://en.wikipedia.org/wiki/Line element"', -6.44