In [2]:
import numpy as np
from fasttext import FastVector
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import os, glob, re, sys, random, unicodedata, collections
from tqdm import tqdm
from functools import reduce
import nltk
from collections import Counter

from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
%matplotlib inline

In [3]:
# Load the English word vectors from the file wiki.en.vec into a FastVector
# object (the loader prints a "reading word vectors from ..." progress line).
eng_dictionary = FastVector(vector_file='wiki.en.vec')

reading word vectors from wiki.en.vec


In [4]:
# Load the German word vectors from the file wiki.de.vec into a FastVector
# object (the loader prints a "reading word vectors from ..." progress line).
ger_dictionary = FastVector(vector_file='wiki.de.vec')

reading word vectors from wiki.de.vec


In [5]:
# Load the French word vectors from the file wiki.fr.vec into a FastVector
# object (the loader prints a "reading word vectors from ..." progress line).
fre_dictionary = FastVector(vector_file='wiki.fr.vec')

reading word vectors from wiki.fr.vec


In [6]:
# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Scale `a` so that every vector along `axis` has unit `order`-norm.

    Vectors whose norm is zero are divided by 1, i.e. returned unchanged,
    so the result never contains NaN from a 0/0 division.
    """
    norms = np.atleast_1d(np.linalg.norm(a, order, axis))
    # Substitute 1 for zero norms so all-zero vectors pass through untouched.
    safe_norms = np.where(norms == 0, 1, norms)
    divisor = np.expand_dims(safe_norms, axis)
    return a / divisor

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build row-aligned training matrices from a list of translation pairs.

    source_dictionary / target_dictionary are the FastVector objects of the
    source and target languages (anything supporting `word in d` and
    `d[word]` works). bilingual_dictionary is a list of translation pair
    tuples [(source_word, target_word), ...].

    Pairs in which either word is missing from its dictionary are skipped.
    Returns (source_matrix, target_matrix): row i of each array holds the
    vectors of the i-th retained pair.
    """
    # Keep only the pairs for which both words have a vector.
    usable_pairs = [
        (src_word, tgt_word)
        for src_word, tgt_word in bilingual_dictionary
        if src_word in source_dictionary and tgt_word in target_dictionary
    ]

    source_rows = [source_dictionary[src_word] for src_word, _ in usable_pairs]
    target_rows = [target_dictionary[tgt_word] for _, tgt_word in usable_pairs]

    return np.array(source_rows), np.array(target_rows)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True, chunk_size=65536):
    """
    Learn the orthogonal matrix that best maps source vectors onto target vectors.

    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary (row i of one corresponds
    to row i of the other).

    With M = source^T . target and its SVD M = U . diag(s) . Vh, the function
    returns U . Vh (the orthogonal Procrustes solution), so that
    `source_vectors @ result` lies close to the target vectors.

    Parameters
    ----------
    source_matrix, target_matrix : array-like, 2-D, same number of rows.
    normalize_vectors : bool
        If True, every row is scaled to unit length before it contributes.
    chunk_size : int
        Number of rows processed at a time. M is accumulated chunk by chunk so
        that normalisation never materialises full-size copies of both
        matrices; this keeps peak memory low for very large dictionaries.
        The result does not depend on it beyond floating-point rounding.

    Raises
    ------
    ValueError
        If the inputs are not 2-D, have different row counts, are empty, or
        if chunk_size is smaller than 1.
    """
    source_matrix = np.asarray(source_matrix)
    target_matrix = np.asarray(target_matrix)

    if source_matrix.ndim != 2 or target_matrix.ndim != 2:
        raise ValueError(
            "source_matrix and target_matrix must be 2-D, got shapes "
            "%s and %s (is the bilingual dictionary empty?)"
            % (source_matrix.shape, target_matrix.shape)
        )
    if source_matrix.shape[0] != target_matrix.shape[0]:
        raise ValueError(
            "source_matrix and target_matrix must have the same number of rows, "
            "got %d and %d" % (source_matrix.shape[0], target_matrix.shape[0])
        )
    if source_matrix.shape[0] == 0:
        raise ValueError("cannot learn a transformation from zero training pairs")
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # Accumulate in the inputs' floating dtype; integer inputs fall back to float64.
    dtype = np.result_type(source_matrix.dtype, target_matrix.dtype)
    if not np.issubdtype(dtype, np.inexact):
        dtype = np.float64
    product = np.zeros((source_matrix.shape[1], target_matrix.shape[1]), dtype=dtype)

    n_rows = source_matrix.shape[0]
    for start in range(0, n_rows, chunk_size):
        source_chunk = source_matrix[start:start + chunk_size]
        target_chunk = target_matrix[start:start + chunk_size]

        # optionally normalize the training vectors (only this chunk is copied)
        if normalize_vectors:
            source_chunk = normalized(source_chunk)
            target_chunk = normalized(target_chunk)

        product += np.matmul(source_chunk.transpose(), target_chunk)

    # perform the SVD; numpy returns the right factor already transposed (Vh)
    U, _singular_values, Vh = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, Vh)

In [7]:
# Vocabulary of each language as a set of word strings.
eng_words, ger_words, fre_words = (
    set(vectors.word2id.keys())
    for vectors in (eng_dictionary, ger_dictionary, fre_dictionary)
)

# Words spelled identically in English and in the other language.
overlap = list(eng_words.intersection(ger_words))
overlap_fr_en = list(eng_words.intersection(fre_words))

# Use each shared spelling as a (source_word, target_word) training pair,
# i.e. every such word is treated as its own translation.
bilingual_dictionary = list(zip(overlap, overlap))
bilingual_dictionary_fr_en = list(zip(overlap_fr_en, overlap_fr_en))

In [8]:
# Baseline check on the word "cow" in the three languages, run before any
# apply_transform call appears in the notebook.
eng_vector, fre_vector, ger_vector = (
    eng_dictionary["cow"],
    fre_dictionary["vache"],
    ger_dictionary["kuh"],
)
# One value per line, in the order: ger-eng, fre-eng, fre-ger.
print(
    FastVector.cosine_similarity(ger_vector, eng_vector),
    FastVector.cosine_similarity(fre_vector, eng_vector),
    FastVector.cosine_similarity(fre_vector, ger_vector),
    sep="\n",
)

-0.021914206256188045
0.04893054014859706
-0.019676202184651607


In [9]:
# Stack the vectors of the identically spelled German/English words and learn
# the matrix that maps German vectors onto the English ones.
# NOTE(review): the output recorded for this cell is a MemoryError while
# allocating a (772912, 300) float64 array (1.73 GiB), yet later cells use
# `transform`, so the notebook was evidently re-run out of order - confirm this
# cell succeeds under Restart & Run All on the target machine.
source_matrix, target_matrix = make_training_matrices(ger_dictionary, eng_dictionary, bilingual_dictionary)
transform = learn_transformation(source_matrix, target_matrix)

MemoryError: Unable to allocate 1.73 GiB for an array with shape (772912, 300) and data type float64

In [13]:
# Apply the learned German->English matrix to the German dictionary.
# NOTE(review): presumably this modifies ger_dictionary in place (the "kuh"/"cow"
# similarity printed afterwards changes), so re-running this cell would apply
# the transform a second time - confirm and run it exactly once per kernel.
ger_dictionary.apply_transform(transform)

In [14]:
# Same "cow" check, repeated after the German dictionary has been transformed;
# no transform has been applied to the French dictionary at this point.
eng_vector, fre_vector, ger_vector = (
    eng_dictionary["cow"],
    fre_dictionary["vache"],
    ger_dictionary["kuh"],
)
# One value per line, in the order: ger-eng, fre-eng, fre-ger.
print(
    FastVector.cosine_similarity(ger_vector, eng_vector),
    FastVector.cosine_similarity(fre_vector, eng_vector),
    FastVector.cosine_similarity(fre_vector, ger_vector),
    sep="\n",
)

0.5374997100024734
0.04893054014859706
0.019419412966559853


In [15]:
# Same procedure for French: stack the vectors of the identically spelled
# French/English words and learn the matrix mapping French onto English.
# Note that this rebinds `transform`, replacing the German->English matrix.
source_matrix_fr, target_matrix_fr = make_training_matrices(fre_dictionary, eng_dictionary, bilingual_dictionary_fr_en)
transform = learn_transformation(source_matrix_fr, target_matrix_fr)


In [16]:
# Apply the learned French->English matrix to the French dictionary.
# NOTE(review): presumably this modifies fre_dictionary in place (the
# "vache"/"cow" similarity printed afterwards changes), so re-running this cell
# would apply the transform a second time - confirm and run it exactly once.
fre_dictionary.apply_transform(transform)

In [17]:
# Same "cow" check, repeated after both the German and the French dictionary
# have been transformed towards English.
eng_vector, fre_vector, ger_vector = (
    eng_dictionary["cow"],
    fre_dictionary["vache"],
    ger_dictionary["kuh"],
)
# One value per line, in the order: ger-eng, fre-eng, fre-ger.
print(
    FastVector.cosine_similarity(ger_vector, eng_vector),
    FastVector.cosine_similarity(fre_vector, eng_vector),
    FastVector.cosine_similarity(fre_vector, ger_vector),
    sep="\n",
)

0.5374997100024734
0.5894027260080404
0.5092332499687042


In [18]:
# Second spot check with a different word: "city" / "ville" / "stadt".
eng_vector, fre_vector, ger_vector = (
    eng_dictionary["city"],
    fre_dictionary["ville"],
    ger_dictionary["stadt"],
)
# One value per line, in the order: ger-eng, fre-eng, fre-ger.
print(
    FastVector.cosine_similarity(ger_vector, eng_vector),
    FastVector.cosine_similarity(fre_vector, eng_vector),
    FastVector.cosine_similarity(fre_vector, ger_vector),
    sep="\n",
)

0.6232559990665913
0.6797876352842949
0.6651148554627233


In [19]:
# Read the first 500000 records of the JSON-lines dump in chunks of 10000 rows,
# keeping only the four columns used below.
# The chunks are collected in a list and concatenated once: growing a frame
# with DataFrame.append inside the loop copies all previous rows on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
columns_to_keep = ['DBRECORDID', 'TITLE', 'ABSTRACT', 'LANGUAGE']
with pd.read_json('livivo_medline_00.jsonl', lines=True, chunksize=10000, nrows=500000) as reader:
    chunks = [chunk[columns_to_keep] for chunk in reader]

# pd.concat raises on an empty list, so fall back to an empty frame that still
# has the expected columns when the file yields no rows.
df = pd.concat(chunks) if chunks else pd.DataFrame(columns=columns_to_keep)



In [20]:
def _first_if_list(value):
    """Return the first element of a list value; pass any other value through."""
    if isinstance(value, list):
        return value[0]
    return value


# Replace list-valued cells by their first element, column by column.
for column in df.columns:
    df[column] = df[column].apply(_first_if_list)

In [21]:
# Keep only the records whose LANGUAGE is one of "eng", "ger" or "fre"
# (in DataFrame.query, `column == [list]` is a membership test).
df1 = df.query('LANGUAGE == ["eng","ger","fre"]')

In [22]:
# Number of records per language in the filtered frame.
print((df1.LANGUAGE).value_counts())

eng    410066
ger     21744
fre     16052
Name: LANGUAGE, dtype: int64


In [67]:
# English vector for "medication", compared below with a German phrase vector.
# NOTE(review): this cell's execution count (67) is out of sequence with its
# neighbours - confirm the notebook runs top to bottom on a fresh kernel.
eng_vector_t = eng_dictionary["medication"]

In [24]:
# French vector for "médicament", compared below with a German phrase vector.
fre_vector_t = fre_dictionary["médicament"]

In [25]:
# Vectors of the five words of the German phrase
# "sind neue medikamente zu teuer", one variable per word in phrase order.
ger_vector1, ger_vector2, ger_vector3, ger_vector4, ger_vector5 = (
    ger_dictionary[word]
    for word in ("sind", "neue", "medikamente", "zu", "teuer")
)


In [26]:
# Represent the whole German phrase by the plain sum of its word vectors.
ger_vector_t = ger_vector1+ger_vector2+ger_vector3+ger_vector4+ger_vector5

In [27]:
# Cosine similarity between English "medication" and the summed German phrase vector.
print(FastVector.cosine_similarity(eng_vector_t, ger_vector_t))

0.4781975866752953


In [28]:
# Cosine similarity between French "médicament" and the summed German phrase vector.
print(FastVector.cosine_similarity(fre_vector_t, ger_vector_t))

0.42318397682401143


In [29]:
# Stop-word sets already loaded from NLTK, keyed by language name, so the
# corpus file is read once per language instead of once per tokenized text.
_STOP_WORDS_BY_LANGUAGE = {}


def tokenize_text(text, language='english', word_min_length=2):
    """Tokenize a text and drop tokens that carry little content.

    Steps: replace newlines by spaces, split into tokens with NLTK's
    word_tokenize, lower-case every token, then discard stop words, tokens
    shorter than `word_min_length` characters and tokens made only of digits.

    Accents are kept as they are. Punctuation is not stripped explicitly:
    single-character punctuation tokens disappear only through the minimum
    length filter, longer ones (e.g. "...") are kept.

    Parameters:
    text (str): Input text
    language (str): Language name passed to both the NLTK stop-word corpus
        and word_tokenize (e.g. 'english', 'german', 'french'); the
        corresponding NLTK resources must be installed.
    word_min_length (int): Minimum number of characters a token must have.

    Returns:
    list of str: cleaned, lower-cased tokens in their original order
    """
    if language not in _STOP_WORDS_BY_LANGUAGE:
        # frozenset gives constant-time membership tests in the filter below.
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(nltk.corpus.stopwords.words(language))
    stop_words = _STOP_WORDS_BY_LANGUAGE[language]

    tokens = word_tokenize(text.replace('\n', ' '), language=language)
    lowered = (token.lower() for token in tokens)
    return [
        token
        for token in lowered
        if token not in stop_words
        and len(token) >= word_min_length
        and not token.isdigit()
    ]

In [30]:
# Select the records whose DBRECORDID is in the explicit list of ids below.
# NOTE(review): presumably a hand-picked evaluation set - confirm where the ids
# come from. Ids outside the rows loaded into `df` are silently absent.
test_df = df.query('DBRECORDID==["NLM100935395", "M28321727", "M22396211", "M26182251", "M27855450", "M27371084", "M17668771", "M15526640", "M12116571", "M17619416", "M15373101", "M17327990", "M17036952", "M12619228", "M17036953", "M10766551", "M15832754", "M12661441", "M11862798", "M12014270", "M9274293", "M15526639", "M15295687", "M14534866", "M14534865", "M11847881", "M17287946", "M11367987", "M16685628", "M15884497", "M6768988", "M11349619", "M4408140", "M3699648", "M10719459", "M5033568", "M4341201", "M5994901", "M5171652", "M7670011", "M572445", "M13674380", "M822301", "M13095574", "M31938804", "M5811151", "M530728", "M530727", "M530726", "M6015202", "M172696", "M13153044", "M4779158", "M5537878", "M7268345", "M5381164", "M13325698", "M530725", "M14128579", "M14271658", "M14943165", "M28382405", "NLM101607162", "M27903031", "M29623382", "NLM101237723", "M31828657", "M32865509", "M28711420", "M18726641", "M31379065", "M23212601", "M22160954", "M25612278", "M32291683", "M33055148", "M31191628", "M33518405", "M29335976", "M13137320", "NLM100888962", "M31227510", "M32983409", "M30874757", "M18447284", "M32737488", "M11565209", "M32165421", "M32350102", "M22454204", "M30928352", "M32543896", "M31133083", "M31211489", "M30253970", "M32511272", "M17290313", "M32127268", "M17334121", "M32964247"]')

In [31]:
# LANGUAGE value of every selected record, in the row order of test_df.
lang_list = list(test_df["LANGUAGE"])

In [32]:
# Drop, in place, every row of df that has a missing value in any column.
# NOTE(review): this also discards records that only lack e.g. an ABSTRACT,
# although only TITLE is tokenized in the next cell - confirm that is intended.
df.dropna(inplace=True)

In [33]:
# Token list for every title; the result is a Series aligned with df's index.
tokenized = df["TITLE"].apply(tokenize_text)

In [57]:
# Embed the French query as the sum of the vectors of its in-vocabulary words.
vec_list_query = [fre_dictionary[item] for item in ["cathéter","cardiaque"] if item in fre_dictionary]
if not vec_list_query:
    # np.sum([]) would silently yield the scalar 0.0 and every similarity
    # computed below would be meaningless, so stop early instead.
    raise ValueError("none of the query words is present in fre_dictionary")
vecsum_query = np.sum(vec_list_query,axis=0)

cosine_sim = []

# Score the first 1000 tokenized titles against the query.
# NOTE(review): every title is looked up in ger_dictionary regardless of the
# record's LANGUAGE, and the tokens were filtered with English stop words -
# confirm this is intended for non-German titles.
for token_docs in tokenized[0:1000]:
    vec_list_doc = [ger_dictionary[item] for item in token_docs if item in ger_dictionary ]
    if vec_list_doc:
        vecsum_doc = np.sum(vec_list_doc,axis=0)
        cosine_sim.append(FastVector.cosine_similarity(vecsum_query, vecsum_doc))
    else:
        # No token of this title has a vector: treat the document as the zero
        # vector and define its similarity as 0.0, which keeps cosine_sim a
        # flat list of floats aligned one-to-one with the titles.
        vecsum_doc = np.zeros_like(vecsum_query)
        cosine_sim.append(0.0)


In [66]:
# The 100 highest similarity scores, in descending order.
# NOTE(review): sorting the values discards which title each score belongs to;
# np.argsort(cosine_sim)[::-1][:100] would give the matching positions.
np.sort(cosine_sim)[::-1][0:100]

array([0.65872948, 0.63541166, 0.63005044, 0.6272868 , 0.62708371,
       0.62147812, 0.61883629, 0.61870817, 0.61840581, 0.61742741,
       0.61655217, 0.61428238, 0.61355099, 0.61244279, 0.61210313,
       0.61053886, 0.60539011, 0.60487994, 0.60420434, 0.60242116,
       0.60216802, 0.60201972, 0.60105402, 0.59974198, 0.5991122 ,
       0.59792045, 0.5974014 , 0.59728187, 0.59698855, 0.59641539,
       0.59561697, 0.59438333, 0.59345241, 0.59284673, 0.592762  ,
       0.59230708, 0.59221347, 0.59207745, 0.59185406, 0.59180559,
       0.59168474, 0.59105947, 0.59103073, 0.59046134, 0.58860881,
       0.5879767 , 0.58554408, 0.58511775, 0.58501425, 0.58418249,
       0.5840152 , 0.5830802 , 0.58305672, 0.58253369, 0.58242034,
       0.58237253, 0.58221332, 0.58159155, 0.58149309, 0.58142414,
       0.58122519, 0.58115302, 0.57968046, 0.57917339, 0.57815572,
       0.57805633, 0.57755932, 0.57604149, 0.5758082 , 0.57576446,
       0.57522375, 0.57477377, 0.57393907, 0.57283635, 0.57262

TODO: implement and compare two approaches for handling the query:

1. Translate the query to English.
2. Detect the language of the query and use that language.