In [1]:
import numpy as np
from fasttext import FastVector
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import os, glob, re, sys, random, unicodedata, collections
from tqdm import tqdm
from functools import reduce
import nltk
from collections import Counter

from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
%matplotlib inline

In [2]:
# Load the English word vectors from the local file wiki.en.vec into a FastVector.
# NOTE(review): presumably the pre-trained fastText Wikipedia vectors — confirm source/version.
eng_dictionary = FastVector(vector_file='wiki.en.vec')

reading word vectors from wiki.en.vec


In [3]:
# Load the German word vectors from the local file wiki.de.vec into a FastVector.
# NOTE(review): presumably the pre-trained fastText Wikipedia vectors — confirm source/version.
ger_dictionary = FastVector(vector_file='wiki.de.vec')

reading word vectors from wiki.de.vec


In [4]:
# Load the French word vectors from the local file wiki.fr.vec into a FastVector.
# NOTE(review): presumably the pre-trained fastText Wikipedia vectors — confirm source/version.
fre_dictionary = FastVector(vector_file='wiki.fr.vec')

reading word vectors from wiki.fr.vec


In [5]:
# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Scale the vectors of ``a`` that lie along ``axis`` to unit ``order``-norm.

    Vectors whose norm is zero are divided by 1 instead of 0, so they come
    back unchanged rather than as NaNs.
    """
    norms = np.linalg.norm(a, order, axis)
    norms = np.atleast_1d(norms)
    # Swap zero norms for 1 so the division below stays well defined.
    norms[norms == 0] = 1
    divisor = np.expand_dims(norms, axis)
    return a / divisor

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build row-aligned source/target matrices from a bilingual dictionary.

    source_dictionary and target_dictionary are the FastVector objects of
    the source/target languages; bilingual_dictionary is a list of
    translation pair tuples [(source_word, target_word), ...].
    Pairs are kept only when both words are present in their respective
    dictionary, so row i of both returned arrays belongs to the same pair.
    """
    usable_pairs = [
        (src_word, tgt_word)
        for src_word, tgt_word in bilingual_dictionary
        if src_word in source_dictionary and tgt_word in target_dictionary
    ]
    source_rows = [source_dictionary[src_word] for src_word, _ in usable_pairs]
    target_rows = [target_dictionary[tgt_word] for _, tgt_word in usable_pairs]
    return np.array(source_rows), np.array(target_rows)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn the orthogonal map that aligns the source vectors to the target vectors.

    source_matrix and target_matrix are numpy arrays of shape
    (dictionary_length, embedding_dimension) whose rows are paired word
    vectors from the bilingual dictionary. With normalize_vectors=True each
    row is first scaled to unit length.
    """
    if normalize_vectors:
        source_matrix, target_matrix = normalized(source_matrix), normalized(target_matrix)

    # SVD of the (dim x dim) product of the paired vectors.
    cross_product = source_matrix.T @ target_matrix
    left, _singular_values, right_h = np.linalg.svd(cross_product)

    # Dropping the singular values leaves an orthogonal matrix.
    return left @ right_h

In [6]:
# Vocabulary of each embedding model as a set of words.
eng_words, ger_words, fre_words = (
    set(model.word2id.keys())
    for model in (eng_dictionary, ger_dictionary, fre_dictionary)
)

# Words spelled identically in English and the other language serve as a
# pseudo bilingual dictionary: each shared string is paired with itself.
overlap = list(eng_words & ger_words)
overlap_fr_en = list(eng_words & fre_words)
bilingual_dictionary = list(zip(overlap, overlap))
bilingual_dictionary_fr_en = list(zip(overlap_fr_en, overlap_fr_en))

In [7]:
# Baseline: cosine similarity of the "cow" translations in the embeddings as loaded.
eng_vector, fre_vector, ger_vector = eng_dictionary["cow"], fre_dictionary["vache"], ger_dictionary["kuh"]
for vec_a, vec_b in ((ger_vector, eng_vector), (fre_vector, eng_vector), (fre_vector, ger_vector)):
    print(FastVector.cosine_similarity(vec_a, vec_b))

-0.021914206256188045
0.04893054014859706
-0.019676202184651607


In [8]:
# Build paired German/English training matrices from the identical-string word
# pairs in bilingual_dictionary, then learn the orthogonal map that aligns the
# German vectors to the English ones.
source_matrix, target_matrix = make_training_matrices(ger_dictionary, eng_dictionary, bilingual_dictionary)
transform = learn_transformation(source_matrix, target_matrix)

In [9]:
# Apply the learned German->English map to the German embeddings.
# NOTE(review): the result is not assigned, so this presumably modifies
# ger_dictionary in place; re-running the cell would then apply the map twice — confirm.
ger_dictionary.apply_transform(transform)

In [78]:
# Repeat the "cow" similarity check after the German alignment.
# NOTE(review): the execution count (78) is out of sequence and the recorded
# French scores equal those of the later post-French-alignment cell, which
# suggests this cell was last run after both alignments — confirm with Restart & Run All.
eng_vector, fre_vector, ger_vector = eng_dictionary["cow"], fre_dictionary["vache"], ger_dictionary["kuh"]
for vec_a, vec_b in ((ger_vector, eng_vector), (fre_vector, eng_vector), (fre_vector, ger_vector)):
    print(FastVector.cosine_similarity(vec_a, vec_b))

0.5374997100024727
0.5894027260080437
0.5092332499687048


In [11]:
# Same procedure for French: paired French/English matrices from the
# identical-string pairs, then the orthogonal French->English map.
# Note that this rebinds `transform`, replacing the German map learned earlier.
source_matrix_fr, target_matrix_fr = make_training_matrices(fre_dictionary, eng_dictionary, bilingual_dictionary_fr_en)
transform = learn_transformation(source_matrix_fr, target_matrix_fr)


In [12]:
# Apply the learned French->English map to the French embeddings.
# NOTE(review): the result is not assigned, so this presumably modifies
# fre_dictionary in place; re-running the cell would then apply the map twice — confirm.
fre_dictionary.apply_transform(transform)

In [13]:
# "cow" similarity check once more, now that the French embeddings have been mapped too.
eng_vector, fre_vector, ger_vector = eng_dictionary["cow"], fre_dictionary["vache"], ger_dictionary["kuh"]
for vec_a, vec_b in ((ger_vector, eng_vector), (fre_vector, eng_vector), (fre_vector, ger_vector)):
    print(FastVector.cosine_similarity(vec_a, vec_b))

0.5374997100024727
0.5894027260080437
0.5092332499687048


In [14]:
# Second sanity check with another translation triple: city / ville / stadt.
eng_vector, fre_vector, ger_vector = eng_dictionary["city"], fre_dictionary["ville"], ger_dictionary["stadt"]
for vec_a, vec_b in ((ger_vector, eng_vector), (fre_vector, eng_vector), (fre_vector, ger_vector)):
    print(FastVector.cosine_similarity(vec_a, vec_b))

0.6232559990666015
0.6797876352842969
0.6651148554627178


In [111]:
# Read the first 2,000,000 records of the JSON-lines dump in 10,000-row chunks,
# keeping only the four columns used below.
# The chunks are collected in a list and concatenated once: DataFrame.append
# inside a loop re-copies the accumulated frame on every iteration (quadratic)
# and was removed in pandas 2.0.
SELECTED_COLUMNS = ['DBRECORDID', 'TITLE', 'ABSTRACT', 'LANGUAGE']
with pd.read_json('livivo_medline_00.jsonl', lines=True, chunksize=10000, nrows=2000000) as reader:
    chunks = [chunk[SELECTED_COLUMNS] for chunk in reader]
# pd.concat raises on an empty list, so fall back to an empty frame with the expected columns.
dff = pd.concat(chunks) if chunks else pd.DataFrame(columns=SELECTED_COLUMNS)
del chunks  # release the per-chunk copies



In [112]:
def _first_if_list(value):
    """Return the first element of a list (None for an empty list); pass other values through."""
    if isinstance(value, list):
        # An empty list has no first element; None is treated as missing by pandas.
        return value[0] if value else None
    return value


# Some fields arrive as lists; keep only their first entry so that every cell
# holds a single value.
for column in dff.columns:
    dff[column] = dff[column].apply(_first_if_list)

In [113]:
# Keep only the records that have a title (rows whose TITLE is missing are dropped).
dff=dff[dff['TITLE'].notna()]

In [114]:
# Split the corpus into its German, English and French records by LANGUAGE code.
df_ger, df_eng, df_fre = (
    dff.query(expression)
    for expression in ('LANGUAGE == "ger"', 'LANGUAGE == "eng"', 'LANGUAGE == "fre"')
)


In [115]:
# English single-word vector used as a reference in the phrase-similarity check.
eng_vector_t = eng_dictionary["medication"]

In [116]:
# French single-word vector used as a reference in the phrase-similarity check.
fre_vector_t = fre_dictionary["médicament"]

In [117]:
# Word vectors for the German phrase "sind neue medikamente zu teuer", one per word.
ger_vector1, ger_vector2, ger_vector3, ger_vector4, ger_vector5 = (
    ger_dictionary[word]
    for word in ("sind", "neue", "medikamente", "zu", "teuer")
)


In [118]:
# Represent the German phrase as the unweighted sum of its five word vectors.
ger_vector_t = ger_vector1+ger_vector2+ger_vector3+ger_vector4+ger_vector5

In [119]:
# Cosine similarity between the English word "medication" and the summed German phrase vector.
print(FastVector.cosine_similarity(eng_vector_t, ger_vector_t))

0.47819758667529366


In [120]:
# Cosine similarity between the French word "médicament" and the summed German phrase vector.
print(FastVector.cosine_similarity(fre_vector_t, ger_vector_t))

0.42318397682401204


In [123]:
# Stop-word sets keyed by language name. Loading the NLTK list once per language
# (instead of once per call) and storing it as a frozenset makes the membership
# test O(1) when this function is applied to every title.
_STOP_WORD_CACHE = {}


def tokenize_text(text,lang):
    """Tokenize a text and drop uninformative tokens.

    Steps: replace newlines with spaces, split with ``word_tokenize``,
    lower-case every token, then drop stop words of ``lang``, tokens shorter
    than 2 characters and tokens consisting only of digits.
    Accents are not stripped; punctuation is only removed when the token is
    a single character.

    Parameters:
    text (str): Input text
    lang (str): NLTK stop-word language name, e.g. 'german', 'english', 'french'

    Returns:
    list of str: cleaned, lower-cased tokens in their original order
    """
    WORD_MIN_LENGTH = 2
    stop_words = _STOP_WORD_CACHE.get(lang)
    if stop_words is None:
        stop_words = frozenset(nltk.corpus.stopwords.words(lang))
        _STOP_WORD_CACHE[lang] = stop_words

    # NOTE(review): word_tokenize is called without a language argument, so it
    # uses its default tokenizer model for every language — confirm this is intended.
    lowered = (token.lower() for token in word_tokenize(text.replace('\n', ' ')))
    return [
        token for token in lowered
        if len(token) >= WORD_MIN_LENGTH
        and token not in stop_words
        and not token.isdigit()
    ]

In [124]:
# Tokenize the titles of each language subset with that language's stop-word list.
tokenized_ger = df_ger["TITLE"].apply(tokenize_text, args=('german',))
tokenized_eng = df_eng["TITLE"].apply(tokenize_text, args=('english',))
tokenized_fre = df_fre["TITLE"].apply(tokenize_text, args=('french',))

In [125]:
# Query vector: sum of the German vectors of the query terms found in the vocabulary.
vec_list_query = []
for item in ["herzkatheter"]:
    if item in ger_dictionary:
        vec_list_query.append(ger_dictionary[item])
vecsum_query = np.sum(vec_list_query, axis=0)


In [126]:
# Parallel accumulators for the unweighted ranking: score i belongs to document id i.
cosine_sim, ind = [], []

In [127]:

# Score every German title against the query: a title is represented by the
# sum of the vectors of its in-vocabulary tokens.
# `ind` and `cosine_sim` are created in a separate cell, so re-running only this
# cell appends the same documents a second time.
for doc_id, token_docs in tokenized_ger.items():
    ind.append(doc_id)
    vec_list_doc = [ger_dictionary[item]
                    for item in token_docs if item in ger_dictionary]
    if not vec_list_doc:
        # np.sum of an empty list is the scalar 0.0; passing that on to the
        # cosine similarity would divide by a zero norm (cf. the RuntimeWarning
        # fragment recorded under this cell). Score such titles 0 directly.
        cosine_sim.append(0.0)
        continue
    vecsum_doc = np.sum(vec_list_doc, axis=0)
    cosine_sim.append(
        FastVector.cosine_similarity(vecsum_query, vecsum_doc))



  return np.dot(vec_a, vec_b) / \


In [128]:

# Score every English title against the query: a title is represented by the
# sum of the vectors of its in-vocabulary tokens.
# `ind` and `cosine_sim` are created in a separate cell, so re-running only this
# cell appends the same documents a second time.
for doc_id, token_docs in tokenized_eng.items():
    ind.append(doc_id)
    vec_list_doc = [eng_dictionary[item]
                    for item in token_docs if item in eng_dictionary]
    if not vec_list_doc:
        # np.sum of an empty list is the scalar 0.0; passing that on to the
        # cosine similarity would divide by a zero norm. Score such titles 0 directly.
        cosine_sim.append(0.0)
        continue
    vecsum_doc = np.sum(vec_list_doc, axis=0)
    cosine_sim.append(
        FastVector.cosine_similarity(vecsum_query, vecsum_doc))


In [129]:

# Score every French title against the query: a title is represented by the
# sum of the vectors of its in-vocabulary tokens.
# `ind` and `cosine_sim` are created in a separate cell, so re-running only this
# cell appends the same documents a second time.
for doc_id, token_docs in tokenized_fre.items():
    ind.append(doc_id)
    vec_list_doc = [fre_dictionary[item]
                    for item in token_docs if item in fre_dictionary]
    if not vec_list_doc:
        # np.sum of an empty list is the scalar 0.0; passing that on to the
        # cosine similarity would divide by a zero norm. Score such titles 0 directly.
        cosine_sim.append(0.0)
        continue
    vecsum_doc = np.sum(vec_list_doc, axis=0)
    cosine_sim.append(
        FastVector.cosine_similarity(vecsum_query, vecsum_doc))


In [130]:
# Replace unusable scores with 0 so the list can be sorted.
# A score is usable only if it is a finite scalar: array-valued results are
# zeroed as before, NaN/inf scalars are now zeroed too (NaN breaks the ordering
# of sorted()), and NumPy scalars that are not Python floats (e.g. float32) are
# kept instead of being silently discarded.
cosine_sim1 = [float(x) if np.ndim(x) == 0 and np.isfinite(x) else 0
               for x in cosine_sim]


In [131]:
# Show the 20 records with the highest unweighted similarity, best first.
ranked_pairs = sorted(zip(cosine_sim1, ind))
top_doc_ids = [doc_id for _, doc_id in ranked_pairs[::-1][:20]]
dff.loc[top_doc_ids]


Unnamed: 0,DBRECORDID,TITLE,ABSTRACT,LANGUAGE
1159864,M997705,Ein Prozessrechnersystem für Herzkatheterlabor...,In cooperation with the Department of Electron...,ger
636928,M461773,Komplikationen venöser Verweilkatheter im Thor...,The increasing use of central venous catheters...,ger
1707523,M1546486,Komplikationen zentralvenöser Katheter bei Pat...,,ger
1123132,M960927,Lebervenographie mit Ballonkatheter nach porto...,,ger
324162,M142043,Präoperative Nierenarterienblockade mit Ballon...,Renal artery occlusion with a balloon catheter...,ger
1596884,M1435805,Perkutane Silastic-Katheter bei Neu- und Frühg...,Background and methods!#!Central catheters are...,ger
162129,M28321727,Wann zum Herzkatheter überweisen?,,ger
1130001,M967798,Lagebestimmung zentralvenöser Katheter durch i...,It is reported on the electrocardiographical m...,ger
1104242,M942036,Therapie bei Duraperforation mit Katheter währ...,A rare complication following a dural tap in l...,ger
1234511,M1072590,Thrombektomie--chirurgische Therapie venöser T...,,ger


In [132]:
# Select the records whose DBRECORDID is in a fixed, hand-listed set of ids.
# NOTE(review): presumably the records judged relevant for the "herzkatheter" query
# (a reference set for evaluating the rankings) — confirm where the list comes from.
test_df = dff.query('DBRECORDID==["NLM100935395", "M28321727", "M22396211", "M26182251", "M27855450", "M27371084", "M17668771", "M15526640", "M12116571", "M17619416", "M15373101", "M17327990", "M17036952", "M12619228", "M17036953", "M10766551", "M15832754", "M12661441", "M11862798", "M12014270", "M9274293", "M15526639", "M15295687", "M14534866", "M14534865", "M11847881", "M17287946", "M11367987", "M16685628", "M15884497", "M6768988", "M11349619", "M4408140", "M3699648", "M10719459", "M5033568", "M4341201", "M5994901", "M5171652", "M7670011", "M572445", "M13674380", "M822301", "M13095574", "M31938804", "M5811151", "M530728", "M530727", "M530726", "M6015202", "M172696", "M13153044", "M4779158", "M5537878", "M7268345", "M5381164", "M13325698", "M530725", "M14128579", "M14271658", "M14943165", "M28382405", "NLM101607162", "M27903031", "M29623382", "NLM101237723", "M31828657", "M32865509", "M28711420", "M18726641", "M31379065", "M23212601", "M22160954", "M25612278", "M32291683", "M33055148", "M31191628", "M33518405", "M29335976", "M13137320", "NLM100888962", "M31227510", "M32983409", "M30874757", "M18447284", "M32737488", "M11565209", "M32165421", "M32350102", "M22454204", "M30928352", "M32543896", "M31133083", "M31211489", "M30253970", "M32511272", "M17290313", "M32127268", "M17334121", "M32964247"]')


In [133]:
# Display the records selected by id above.
test_df

Unnamed: 0,DBRECORDID,TITLE,ABSTRACT,LANGUAGE
21456,M28382405,Notfallmanagement im Herzkatheterlabor : Wenn ...,The establishment of primary percutaneous inte...,ger
162129,M28321727,Wann zum Herzkatheter überweisen?,,ger
354635,M172696,Herzkatheter bei Fallot' Tetralogie. Bericht e...,,ger
705860,M530725,Das Risiko der Herzkatheter-Untersuchung. Eine...,A retrospective study is undertaken in order t...,ger
705861,M530726,Das Risiko der Herzkatheter-Untersuchung. Eine...,The author reviewed the complications of 700 h...,ger
705862,M530728,Das Risiko der Herzkatheter-Untersuchung. Eine...,The review of 700 heart catheterizations in in...,ger
705863,M530727,Das Risiko der Herzkatheter-Untersuchung. Eine...,"Disturbances of heart rhythm, observed during ...",ger
747551,M572445,Heparinisierung während transvenöser Herzkathe...,,ger
984828,M822301,Einfluss von Herzkatheter-untersuchung und Ang...,"The serum activities of LDH, alpha-HBDH, CK, G...",ger


In [134]:
def inverted_index(words):
    """Map every term of one document to the positions at which it occurs.

    Parameters:
    words (list of str): tokenized document text

    Returns:
    dict: term -> list of 0-based positions, in increasing order

    """
    positions_by_term = {}
    for position, term in enumerate(words):
        if term not in positions_by_term:
            positions_by_term[term] = []
        positions_by_term[term].append(position)
    return positions_by_term


def inverted_index_add(inverted, doc_id, doc_index):
    """Merge one document's positional index into the corpus-wide inverted index.

    Parameters:
    inverted (dict): corpus index, term -> {doc_id -> positions}; updated in place
    doc_id (int): id of the document being added
    doc_index (dict): positional index of that document, term -> positions

    Returns:
    dict: the same ``inverted`` object, after the update

    """
    for term, positions in doc_index.items():
        # An existing entry for the same doc_id is overwritten.
        inverted.setdefault(term, {})[doc_id] = positions
    return inverted


In [135]:
# --- Positional inverted index over the English titles ---
inverted_doc_indexes_english = {}   # term -> {doc_id -> [token positions]}
files_with_index_english = []       # document ids in indexing order
files_with_tokens_english = {}      # doc_id -> token list
for doc_id, words in tokenized_eng.items():
    files_with_tokens_english[doc_id] = words
    inverted_index_add(inverted_doc_indexes_english, doc_id, inverted_index(words))
    files_with_index_english.append(doc_id)

# --- Document frequency: number of titles in which each term occurs ---
DF_english = {word: len(postings)
              for word, postings in inverted_doc_indexes_english.items()}

total_vocab_size_english = len(DF_english)
print(total_vocab_size_english)

# --- Smoothed inverse document frequency: log((N + 1) / (df + 1)) ---
# IDF depends only on the document frequency, so it is computed once per term
# from DF_english rather than being recomputed for every token of every title.
# Only the IDF weight is stored here; no term frequency is involved.
N = len(files_with_tokens_english)
idf_english = {word: np.log((N + 1) / (doc_freq + 1))
               for word, doc_freq in DF_english.items()}

388570


In [136]:
# --- Positional inverted index over the German titles ---
inverted_doc_indexes_german = {}   # term -> {doc_id -> [token positions]}
files_with_index_german = []       # document ids in indexing order
files_with_tokens_german = {}      # doc_id -> token list
for doc_id, words in tokenized_ger.items():
    files_with_tokens_german[doc_id] = words
    inverted_index_add(inverted_doc_indexes_german, doc_id, inverted_index(words))
    files_with_index_german.append(doc_id)

# --- Document frequency: number of titles in which each term occurs ---
DF_german = {word: len(postings)
             for word, postings in inverted_doc_indexes_german.items()}

total_vocab_size_german = len(DF_german)
print(total_vocab_size_german)

# --- Smoothed inverse document frequency: log((N + 1) / (df + 1)) ---
# IDF depends only on the document frequency, so it is computed once per term
# from DF_german rather than being recomputed for every token of every title.
# Only the IDF weight is stored here; no term frequency is involved.
N = len(files_with_tokens_german)
idf_german = {word: np.log((N + 1) / (doc_freq + 1))
              for word, doc_freq in DF_german.items()}

139895


In [137]:
# --- Positional inverted index over the French titles ---
inverted_doc_indexes_french = {}   # term -> {doc_id -> [token positions]}
files_with_index_french = []       # document ids in indexing order
files_with_tokens_french = {}      # doc_id -> token list
for doc_id, words in tokenized_fre.items():
    files_with_tokens_french[doc_id] = words
    inverted_index_add(inverted_doc_indexes_french, doc_id, inverted_index(words))
    files_with_index_french.append(doc_id)

# --- Document frequency: number of titles in which each term occurs ---
DF_french = {word: len(postings)
             for word, postings in inverted_doc_indexes_french.items()}

total_vocab_size_french = len(DF_french)
print(total_vocab_size_french)

# --- Smoothed inverse document frequency: log((N + 1) / (df + 1)) ---
# IDF depends only on the document frequency, so it is computed once per term
# from DF_french rather than being recomputed for every token of every title.
# Only the IDF weight is stored here; no term frequency is involved.
N = len(files_with_tokens_french)
idf_french = {word: np.log((N + 1) / (doc_freq + 1))
              for word, doc_freq in DF_french.items()}

61734


In [138]:
# Parallel accumulators for the IDF-weighted ranking: score i belongs to document id i.
cosine_sim2, ind2 = [], []


In [139]:
# Score every German title against the query, weighting each token vector by
# the token's IDF before summing.
# `ind2` and `cosine_sim2` are created in a separate cell, so re-running only
# this cell appends the same documents a second time.
for doc_id, token_docs in tokenized_ger.items():
    ind2.append(doc_id)
    vec_list_doc2 = [idf_german[item]*ger_dictionary[item]
                     for item in token_docs if item in ger_dictionary]
    if not vec_list_doc2:
        # np.sum of an empty list is the scalar 0.0; passing that on to the
        # cosine similarity would divide by a zero norm (cf. the RuntimeWarning
        # fragment recorded under this cell). Score such titles 0 directly.
        cosine_sim2.append(0.0)
        continue
    vecsum_doc2 = np.sum(vec_list_doc2, axis=0)
    cosine_sim2.append(
        FastVector.cosine_similarity(vecsum_query, vecsum_doc2))


  return np.dot(vec_a, vec_b) / \


In [140]:
# Score every English title against the query, weighting each token vector by
# the token's IDF before summing.
# `ind2` and `cosine_sim2` are created in a separate cell, so re-running only
# this cell appends the same documents a second time.
for doc_id, token_docs in tokenized_eng.items():
    ind2.append(doc_id)
    vec_list_doc2 = [idf_english[item]*eng_dictionary[item]
                     for item in token_docs if item in eng_dictionary]
    if not vec_list_doc2:
        # np.sum of an empty list is the scalar 0.0; passing that on to the
        # cosine similarity would divide by a zero norm. Score such titles 0 directly.
        cosine_sim2.append(0.0)
        continue
    vecsum_doc2 = np.sum(vec_list_doc2, axis=0)
    cosine_sim2.append(
        FastVector.cosine_similarity(vecsum_query, vecsum_doc2))


In [142]:
# Score every French title against the query, weighting each token vector by
# the token's IDF before summing.
# `ind2` and `cosine_sim2` are created in a separate cell, so re-running only
# this cell appends the same documents a second time.
for doc_id, token_docs in tokenized_fre.items():
    ind2.append(doc_id)
    vec_list_doc2 = [idf_french[item]*fre_dictionary[item]
                     for item in token_docs if item in fre_dictionary]
    if not vec_list_doc2:
        # np.sum of an empty list is the scalar 0.0; passing that on to the
        # cosine similarity would divide by a zero norm. Score such titles 0 directly.
        cosine_sim2.append(0.0)
        continue
    vecsum_doc2 = np.sum(vec_list_doc2, axis=0)
    cosine_sim2.append(
        FastVector.cosine_similarity(vecsum_query, vecsum_doc2))


In [143]:
# Replace unusable scores with 0 so the list can be sorted.
# A score is usable only if it is a finite scalar: array-valued results are
# zeroed as before, NaN/inf scalars are now zeroed too (NaN breaks the ordering
# of sorted()), and NumPy scalars that are not Python floats (e.g. float32) are
# kept instead of being silently discarded.
cosine_sim3 = [float(x) if np.ndim(x) == 0 and np.isfinite(x) else 0
               for x in cosine_sim2]


In [144]:
# Show the 20 records with the highest IDF-weighted similarity, best first.
ascending_pairs = sorted(zip(cosine_sim3, ind2))
best_first_ids = [doc_id for _score, doc_id in reversed(ascending_pairs)]
dff.loc[best_first_ids[:20]]

Unnamed: 0,DBRECORDID,TITLE,ABSTRACT,LANGUAGE
1159864,M997705,Ein Prozessrechnersystem für Herzkatheterlabor...,In cooperation with the Department of Electron...,ger
636928,M461773,Komplikationen venöser Verweilkatheter im Thor...,The increasing use of central venous catheters...,ger
1084060,M921842,Die Behandlung der Karotis-Kavernosus-Fistel m...,After a critical survey of the methods of trea...,ger
307270,M124680,Karotis-Kavernosus-Fistel - Behandlung mit ein...,A relatively simple method for the intra-vascu...,ger
162129,M28321727,Wann zum Herzkatheter überweisen?,,ger
1707523,M1546486,Komplikationen zentralvenöser Katheter bei Pat...,,ger
1123132,M960927,Lebervenographie mit Ballonkatheter nach porto...,,ger
1596884,M1435805,Perkutane Silastic-Katheter bei Neu- und Frühg...,Background and methods!#!Central catheters are...,ger
324162,M142043,Präoperative Nierenarterienblockade mit Ballon...,Renal artery occlusion with a balloon catheter...,ger
1104242,M942036,Therapie bei Duraperforation mit Katheter währ...,A rare complication following a dural tap in l...,ger


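TODO: implement two approaches for handling the language of the query.

1. Translate the query to English and look it up in the English embeddings.
2. Detect the language of the query and look it up in that language's embeddings.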

In [None]:
# %pip install googletrans==4.0.0-rc1
from googletrans import Translator
translator = Translator()

# Compile all Word Embeddings in a dictionary, keyed by the language codes
# looked up from the detector's result below.
all_dict = {"de": ger_dictionary,
            "en": eng_dictionary,
            "fr": fre_dictionary}

# all_dict = {"de": {"stadt":1}, # For testing
#             "en": {"city":2},
#             "fr": {"ville":3}}


def _query_vector(dictionary, text):
    """Sum the vectors of the lower-cased, whitespace-split tokens of text found in dictionary.

    Raises KeyError when none of the tokens is in the vocabulary, so a
    capitalised or multi-word query no longer fails on a raw lookup.
    """
    vectors = [dictionary[token] for token in text.lower().split() if token in dictionary]
    if not vectors:
        raise KeyError(f"no token of {text!r} is in the embedding vocabulary")
    return np.sum(vectors, axis=0)


q = "stadt"

# Approach 1: translate the query to English and embed the translation.
q_eng = translator.translate(q, dest="en").text
q_eng_vector = _query_vector(eng_dictionary, q_eng)

# Approach 2: detect the query language and embed the query in that language's space.
q_lang = translator.detect(q).lang
if q_lang not in all_dict:
    raise KeyError(f"detected language {q_lang!r} has no embeddings; "
                   f"supported: {sorted(all_dict)}")
q_lang_vector = _query_vector(all_dict[q_lang], q)


de
