In [1]:
import numpy as np
from fasttext import FastVector
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import os, glob, re, sys, random, unicodedata, collections
from tqdm import tqdm
from functools import reduce
import nltk
from collections import Counter
from googletrans import Translator
translator = Translator()

from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
%matplotlib inline

In [2]:
eng_dictionary = FastVector(vector_file='wiki.en.vec')

reading word vectors from wiki.en.vec


In [3]:
ger_dictionary = FastVector(vector_file='wiki.de.vec')

reading word vectors from wiki.de.vec


In [4]:
fre_dictionary = FastVector(vector_file='wiki.fr.vec')

reading word vectors from wiki.fr.vec


In [5]:
# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Utility function to normalize the rows of a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2==0] = 1
    return a / np.expand_dims(l2, axis)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Source and target dictionaries are the FastVector objects of
    source/target languages. bilingual_dictionary is a list of 
    translation pair tuples [(source_word, target_word), ...].
    """
    source_matrix = []
    target_matrix = []

    for (source, target) in bilingual_dictionary:
        if source in source_dictionary and target in target_dictionary:
            source_matrix.append(source_dictionary[source])
            target_matrix.append(target_dictionary[target])

    # return training matrices
    return np.array(source_matrix), np.array(target_matrix)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)

In [6]:
eng_words = set(eng_dictionary.word2id.keys())
ger_words = set(ger_dictionary.word2id.keys())
fre_words = set(fre_dictionary.word2id.keys())
overlap = list(eng_words & ger_words)
overlap_fr_en = list(eng_words & fre_words)
bilingual_dictionary = [(entry, entry) for entry in overlap]
bilingual_dictionary_fr_en = [(entry, entry) for entry in overlap_fr_en]

In [7]:
eng_vector = eng_dictionary["cow"]
fre_vector = fre_dictionary["vache"]
ger_vector = ger_dictionary["kuh"]
print(FastVector.cosine_similarity(ger_vector, eng_vector))
print(FastVector.cosine_similarity(fre_vector, eng_vector))
print(FastVector.cosine_similarity(fre_vector, ger_vector))

-0.021914206256188045
0.04893054014859706
-0.019676202184651607


In [8]:
source_matrix, target_matrix = make_training_matrices(ger_dictionary, eng_dictionary, bilingual_dictionary)
transform = learn_transformation(source_matrix, target_matrix)

In [9]:
ger_dictionary.apply_transform(transform)

In [10]:
eng_vector = eng_dictionary["cow"]
fre_vector = fre_dictionary["vache"]
ger_vector = ger_dictionary["kuh"]
print(FastVector.cosine_similarity(ger_vector, eng_vector))
print(FastVector.cosine_similarity(fre_vector, eng_vector))
print(FastVector.cosine_similarity(fre_vector, ger_vector))

0.5374997100024711
0.04893054014859706
0.01941941296655649


In [11]:
source_matrix_fr, target_matrix_fr = make_training_matrices(fre_dictionary, eng_dictionary, bilingual_dictionary_fr_en)
transform = learn_transformation(source_matrix_fr, target_matrix_fr)


In [12]:
fre_dictionary.apply_transform(transform)

In [13]:
eng_vector = eng_dictionary["cow"]
fre_vector = fre_dictionary["vache"]
ger_vector = ger_dictionary["kuh"]
print(FastVector.cosine_similarity(ger_vector, eng_vector))
print(FastVector.cosine_similarity(fre_vector, eng_vector))
print(FastVector.cosine_similarity(fre_vector, ger_vector))

0.5374997100024711
0.5894027260080459
0.509233249968698


In [14]:
eng_vector = eng_dictionary["epidemiology"]
fre_vector = fre_dictionary["epidemiologie"]
ger_vector = ger_dictionary["epidemiologie"]
print(FastVector.cosine_similarity(ger_vector, eng_vector))
print(FastVector.cosine_similarity(fre_vector, eng_vector))
print(FastVector.cosine_similarity(fre_vector, ger_vector))

0.7544904458022383
0.6913910806895412
0.7115484600503237


In [15]:
dgg = pd.DataFrame()
with pd.read_json('livivo_medline_00.jsonl', lines=True, chunksize=10,nrows = 20) as readerq:
    for chunkq in readerq:
        dgg = dgg.append(chunkq)

In [16]:
dff = pd.DataFrame()
with pd.read_json('livivo_medline_00.jsonl', lines=True, chunksize=10000,nrows = 2000000) as reader:
    for chunk in reader:
        dff = dff.append(chunk[['DBRECORDID','TITLE','ABSTRACT','LANGUAGE']])



In [17]:
for i in dff.columns:
    dff[i]=dff[i].apply(lambda x: x[0] if isinstance(x, list) else x)

In [18]:
dff=dff[dff['TITLE'].notna()]

In [19]:
df_ger = dff.query('LANGUAGE == "ger"')
df_eng = dff.query('LANGUAGE == "eng"')
df_fre = dff.query('LANGUAGE == "fre"')


In [20]:
eng_vector_t = eng_dictionary["medication"]

In [21]:
fre_vector_t = fre_dictionary["médicament"]

In [22]:
ger_vector1 = ger_dictionary["sind"]
ger_vector2 = ger_dictionary["neue"]
ger_vector3 = ger_dictionary["medikamente"]
ger_vector4 = ger_dictionary["zu"]
ger_vector5 = ger_dictionary["teuer"]


In [23]:
ger_vector_t = ger_vector1+ger_vector2+ger_vector3+ger_vector4+ger_vector5

In [24]:
print(FastVector.cosine_similarity(eng_vector_t, ger_vector_t))

0.47819758667529644


In [25]:
print(FastVector.cosine_similarity(fre_vector_t, ger_vector_t))

0.4231839768240091


In [26]:
def tokenize_text(text,lang):
    """Make all necessary preprocessing of text: strip accents and punctuation, remove the words only contains digit
    remove \n, tokenize our text, convert to lower case, remove stop words and 
    words with less than 2 chars.

    Parameters:
    text (str): Input text

    Returns:
    str: cleaned tokenized text

   """    
    WORD_MIN_LENGTH = 2
    STOP_WORDS = nltk.corpus.stopwords.words(lang)
    text = re.sub(re.compile('\n'),' ',text)
    words = word_tokenize(text)
    words = [word.lower() for word in words]
    words = [word for word in words if word not in STOP_WORDS and len(word) >= WORD_MIN_LENGTH]
    words = [word for word in words if word.isdigit()==False]
    return words

In [27]:
tokenized_ger=(df_ger["TITLE"].apply(lambda x: tokenize_text(x,'german')))
tokenized_eng=(df_eng["TITLE"].apply(lambda x: tokenize_text(x,'english')))
tokenized_fre=(df_fre["TITLE"].apply(lambda x: tokenize_text(x,'french')))

In [28]:
all_dict = {"de": ger_dictionary,
            "en": eng_dictionary,
            "fr": fre_dictionary}

In [39]:
queries = ["herzkatheter","echokardiographie","krankenhausplanung","ambient AND assisted AND living AND nursing","low AND carb","épidémiologie","alzheimer AND demenz","fatigue"]

In [30]:
def inverted_index(words):
    """Create a inverted index of words (tokens or terms) from a list of terms

    Parameters:
    words (list of str): tokenized document text

    Returns:
    Inverted index of document (dict)

   """
    inverted = {}
    for index, word in enumerate(words):
        locations = inverted.setdefault(word, [])
        locations.append(index)
    return inverted


def inverted_index_add(inverted, doc_id, doc_index):
    """Insert document id into Inverted Index

    Parameters:
    inverted (dict): Inverted Index
    doc_id (int): Id of document been added
    doc_index (dict): Inverted Index of a specific document.

    Returns:
    Inverted index of document (dict)

   """
    for word in doc_index.keys():
        locations = doc_index[word]
        indices = inverted.setdefault(word, {})
        indices[doc_id] = locations
    return inverted


In [31]:
inverted_doc_indexes_english = {}
files_with_index_english = []
files_with_tokens_english = {}
for i in tokenized_eng.index:
    #Clean and Tokenize text of each document
    words = tokenized_eng[i]
    #Store tokens
    files_with_tokens_english[i] = words

    doc_index = inverted_index(words)
    inverted_index_add(inverted_doc_indexes_english, i, doc_index)
    files_with_index_english.append(i)
########################################
DF_english = {}
for word in inverted_doc_indexes_english.keys():
    DF_english[word] = len ([doc for doc in inverted_doc_indexes_english[word]])

total_vocab_size_english = len(DF_english)
print(total_vocab_size_english)
########################################
idf_english = {} # Our data structure to store Tf-Idf weights

N = len(files_with_tokens_english)

for doc_id in tokenized_eng.index:
    tokens= tokenized_eng[doc_id]
    counter = Counter(tokens)
    words_count = len(tokens)
    
    for token in np.unique(tokens):        
        # Calculate Idf
        if token in DF_english:
            df = DF_english[token]
        else:
            df = 0
        idf = np.log((N+1)/(df+1))
        
        # Calculate Tf-idf        
        idf_english[token] = idf

388570


In [32]:
inverted_doc_indexes_german = {}
files_with_index_german = []
files_with_tokens_german = {}
for i in tokenized_ger.index:
    #Clean and Tokenize text of each document
    words = tokenized_ger[i]
    #Store tokens
    files_with_tokens_german[i] = words

    doc_index = inverted_index(words)
    inverted_index_add(inverted_doc_indexes_german, i, doc_index)
    files_with_index_german.append(i)

################################
DF_german = {}
for word in inverted_doc_indexes_german.keys():
    DF_german[word] = len ([doc for doc in inverted_doc_indexes_german[word]])

total_vocab_size_german = len(DF_german)
print(total_vocab_size_german)
######################
idf_german = {} # Our data structure to store Tf-Idf weights

N = len(files_with_tokens_german)

for doc_id in tokenized_ger.index:
    tokens= tokenized_ger[doc_id]
    counter = Counter(tokens)
    words_count = len(tokens)
    
    for token in np.unique(tokens):        
        # Calculate Idf
        if token in DF_german:
            df = DF_german[token]
        else:
            df = 0
        idf = np.log((N+1)/(df+1))
        
        # Calculate Tf-idf        
        idf_german[token] = idf

139895


In [33]:
inverted_doc_indexes_french = {}
files_with_index_french = []
files_with_tokens_french = {}
for i in tokenized_fre.index:
    #Clean and Tokenize text of each document
    words = tokenized_fre[i]
    #Store tokens
    files_with_tokens_french[i] = words

    doc_index = inverted_index(words)
    inverted_index_add(inverted_doc_indexes_french, i, doc_index)
    files_with_index_french.append(i)

################################
DF_french = {}
for word in inverted_doc_indexes_french.keys():
    DF_french[word] = len ([doc for doc in inverted_doc_indexes_french[word]])

total_vocab_size_french = len(DF_french)
print(total_vocab_size_french)
######################
idf_french = {} # Our data structure to store Tf-Idf weights

N = len(files_with_tokens_french)

for doc_id in tokenized_fre.index:
    tokens= tokenized_fre[doc_id]
    counter = Counter(tokens)
    words_count = len(tokens)
    
    for token in np.unique(tokens):        
        # Calculate Idf
        if token in DF_french:
            df = DF_french[token]
        else:
            df = 0
        idf = np.log((N+1)/(df+1))
        
        # Calculate Tf-idf        
        idf_french[token] = idf

61734


In [34]:
#BWE-AGG
for queryy in queries:

    q_lang = translator.detect(queryy).lang
    vec_list_query = [all_dict[q_lang][item]
                      for item in queryy.lower().split() if item in all_dict[q_lang]]
    vecsum_query = np.sum(vec_list_query, axis=0)

    cosine_sim_ger = []
    cosine_sim_fre = []
    cosine_sim_eng = []
    ind_ger = []
    ind_fre = []
    ind_eng = []

    for ix, token_docs in enumerate(tokenized_ger):
        ind_ger.append(tokenized_ger.index[ix])
        vec_list_doc = [ger_dictionary[item]
                        for item in token_docs if item in ger_dictionary]
        vecsum_doc = np.sum(vec_list_doc, axis=0)
        cosine_sim_ger.append(
            FastVector.cosine_similarity(vecsum_query, vecsum_doc))

    for ix, token_docs in enumerate(tokenized_eng):
        ind_eng.append(tokenized_eng.index[ix])
        vec_list_doc = [eng_dictionary[item]
                        for item in token_docs if item in eng_dictionary]
        vecsum_doc = np.sum(vec_list_doc, axis=0)
        cosine_sim_eng.append(
            FastVector.cosine_similarity(vecsum_query, vecsum_doc))

    for ix, token_docs in enumerate(tokenized_fre):
        ind_fre.append(tokenized_fre.index[ix])
        vec_list_doc = [fre_dictionary[item]
                        for item in token_docs if item in fre_dictionary]
        vecsum_doc = np.sum(vec_list_doc, axis=0)
        cosine_sim_fre.append(
            FastVector.cosine_similarity(vecsum_query, vecsum_doc))

    indexx = []

    cosine_sim1_ger = [x if isinstance(
        x, float) else 0 for x in cosine_sim_ger]
    cosine_sim1_fre = [x if isinstance(
        x, float) else 0 for x in cosine_sim_fre]
    cosine_sim1_eng = [x if isinstance(
        x, float) else 0 for x in cosine_sim_eng]
    
    indexx.append([(_, x) for _, x in sorted(
        zip(cosine_sim1_ger, ind_ger))][::-1][:20])
    indexx.append([(_, x) for _, x in sorted(
        zip(cosine_sim1_fre, ind_fre))][::-1][:20])
    indexx.append([(_, x) for _, x in sorted(
        zip(cosine_sim1_eng, ind_eng))][::-1][:20])

    scores = [oi for oi, ii in sorted(sum(indexx, []))][::-1]
    tmp_df = dff.loc[[ii for oi, ii in sorted(sum(indexx, []))][::-1]]
    tmp_df['cosine_scores'] = scores
    tmp_df.to_csv("results/{}.csv".format(queryy))
    
    


  return np.dot(vec_a, vec_b) / \


In [35]:
#BWE-AGG-IDF
for queryy in queries:

    q_lang = translator.detect(queryy).lang
    vec_list_query = [all_dict[q_lang][item]
                      for item in queryy.lower().split() if item in all_dict[q_lang]]
    vecsum_query = np.sum(vec_list_query, axis=0)

    cosine_sim_ger = []
    cosine_sim_fre = []
    cosine_sim_eng = []
    ind_ger = []
    ind_fre = []
    ind_eng = []

    for ix, token_docs in enumerate(tokenized_ger):
        ind_ger.append(tokenized_ger.index[ix])
        vec_list_doc = [idf_german[item]*ger_dictionary[item]
                        for item in token_docs if item in ger_dictionary]
        vecsum_doc = np.sum(vec_list_doc, axis=0)
        cosine_sim_ger.append(
            FastVector.cosine_similarity(vecsum_query, vecsum_doc))

    for ix, token_docs in enumerate(tokenized_eng):
        ind_eng.append(tokenized_eng.index[ix])
        vec_list_doc = [idf_english[item]*eng_dictionary[item]
                        for item in token_docs if item in eng_dictionary]
        vecsum_doc = np.sum(vec_list_doc, axis=0)
        cosine_sim_eng.append(
            FastVector.cosine_similarity(vecsum_query, vecsum_doc))

    for ix, token_docs in enumerate(tokenized_fre):
        ind_fre.append(tokenized_fre.index[ix])
        vec_list_doc = [idf_french[item]*fre_dictionary[item]
                        for item in token_docs if item in fre_dictionary]
        vecsum_doc = np.sum(vec_list_doc, axis=0)
        cosine_sim_fre.append(
            FastVector.cosine_similarity(vecsum_query, vecsum_doc))

    indexx = []

    cosine_sim1_ger = [x if isinstance(
        x, float) else 0 for x in cosine_sim_ger]
    cosine_sim1_fre = [x if isinstance(
        x, float) else 0 for x in cosine_sim_fre]
    cosine_sim1_eng = [x if isinstance(
        x, float) else 0 for x in cosine_sim_eng]

    indexx.append([(_, x) for _, x in sorted(
        zip(cosine_sim1_ger, ind_ger))][::-1][:20])
    indexx.append([(_, x) for _, x in sorted(
        zip(cosine_sim1_fre, ind_fre))][::-1][:20])
    indexx.append([(_, x) for _, x in sorted(
        zip(cosine_sim1_eng, ind_eng))][::-1][:20])
    scores = [oi for oi, ii in sorted(sum(indexx, []))][::-1]
    tmp_dff = dff.loc[[ii for oi, ii in sorted(sum(indexx, []))][::-1]]
    tmp_dff['cosine_scores'] = scores
    tmp_dff.to_csv("results/{}_idf.csv".format(queryy))


POOLING

In [None]:
for queryy in queries:
    new = pd.read_csv("results/{}.csv".format(queryy))
    new_idf = pd.read_csv("results/{}_idf.csv".format(queryy))
    final = pd.concat([new, new_idf])
    final.drop_duplicates(subset=["DBRECORDID"], inplace=True)
    final.to_csv("results/{}_pooled.csv".format(queryy))


EVALUATION

In [46]:
from sklearn.metrics import ndcg_score

bwe_scores = []
bwe_idf_scores = []
bwe_pooled_scores = []

for queryy in queries:
    eval = pd.read_csv("ranked_documents/{}_withpoints.csv".format(queryy))
    new = pd.read_csv("results/{}.csv".format(queryy))
    new_idf = pd.read_csv("results/{}_idf.csv".format(queryy))

    bwe = pd.merge(new,eval[["DBRECORDID","Relevance_Point"]],on="DBRECORDID",how='inner')
    bwe_idf = pd.merge(new_idf, eval[["DBRECORDID", "Relevance_Point"]], on="DBRECORDID", how='inner')

    ranked_relevance = np.asarray([bwe["Relevance_Point"]])
    ideal_relevance = np.asarray([(bwe_idf["Relevance_Point"])[::-1]])

    ranked_relevance_idf = np.asarray([bwe_idf["Relevance_Point"]])
    ideal_relevance_idf = np.asarray([(bwe_idf["Relevance_Point"])[::-1]])

    ranked_relevance_pooled = np.asarray([eval["Relevance_Point"]])
    ideal_relevance_pooled = np.asarray([(eval["Relevance_Point"])[::-1]])

    bwe_scores.append(ndcg_score(ideal_relevance, ranked_relevance))
    bwe_idf_scores.append(ndcg_score(ideal_relevance_idf, ranked_relevance_idf))
    bwe_pooled_scores.append(ndcg_score(ideal_relevance_pooled, ranked_relevance_pooled))


    print("BWE - nDCG : ({})".format(queryy),ndcg_score(ideal_relevance, ranked_relevance))
    print("BWE-idf - nDCG : ({})".format(queryy),ndcg_score(ideal_relevance_idf, ranked_relevance_idf))
    print("pooled - nDCG : ({})".format(queryy),ndcg_score(ideal_relevance_pooled, ranked_relevance_pooled))

print("#######################")
print("bwe nDCG score: ",np.mean(bwe_scores))
print("bwe-idf nDCG score: ", np.mean(bwe_idf_scores))
print("pooled nDCG score: ", np.mean(bwe_pooled_scores))



BWE - nDCG : (herzkatheter) 0.7506067814040828
BWE-idf - nDCG : (herzkatheter) 0.753409911253676
pooled - nDCG : (herzkatheter) 0.8206427022477627
BWE - nDCG : (echokardiographie) 0.8244076441144708
BWE-idf - nDCG : (echokardiographie) 0.8361264214973626
pooled - nDCG : (echokardiographie) 0.8854850021042235
BWE - nDCG : (krankenhausplanung) 0.7860122556120882
BWE-idf - nDCG : (krankenhausplanung) 0.7876014140033109
pooled - nDCG : (krankenhausplanung) 0.8329766684376249
BWE - nDCG : (ambient AND assisted AND living AND nursing) 0.7275962057146819
BWE-idf - nDCG : (ambient AND assisted AND living AND nursing) 0.7134937797384164
pooled - nDCG : (ambient AND assisted AND living AND nursing) 0.7707389881016127
BWE - nDCG : (low AND carb) 0.7164431873396453
BWE-idf - nDCG : (low AND carb) 0.7373164302628251
pooled - nDCG : (low AND carb) 0.806093843600604
BWE - nDCG : (épidémiologie) 0.9061277547972217
BWE-idf - nDCG : (épidémiologie) 0.9033786592127004
pooled - nDCG : (épidémiologie) 0.92

bwe nDCG score:  0.7732481328541774

bwe-idf nDCG score:  0.7729077391052603

pooled nDCG score:  0.8387090052998385