In [1]:
from gensim.models import Word2Vec
import gensim.downloader

In [2]:
# List the names of the pretrained models that gensim's downloader exposes.
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [3]:
# Load the pretrained 'glove-wiki-gigaword-300' word vectors through gensim's
# downloader (one of the model names printed by the previous cell).
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-300')



In [5]:
# Words closest to 'twitter' in the embedding space, each paired with its similarity score.
glove_vectors.most_similar('twitter')

[('facebook', 0.8349667191505432),
 ('myspace', 0.7156519293785095),
 ('youtube', 0.7003148198127747),
 ('tweets', 0.6613802909851074),
 ('blog', 0.6582605242729187),
 ('blogging', 0.6539378762245178),
 ('blogs', 0.6365774869918823),
 ('instagram', 0.6170562505722046),
 ('tumblr', 0.6059106588363647),
 ('tweet', 0.5980194807052612)]

In [6]:
twitter_vector = glove_vectors.get_vector('twitter')

In [7]:
twitter_vector

array([-5.7816e-01,  1.7040e-01,  4.5315e-01, -2.9551e-01,  3.9779e-01,
       -1.1818e-01, -3.5418e-02, -4.5607e-01,  5.8318e-02, -5.5856e-01,
        2.5115e-01,  5.5846e-01,  6.5725e-01, -5.4000e-01,  1.1624e-01,
       -1.6971e-01, -3.0415e-01,  1.9348e-01,  7.9338e-02,  4.7702e-01,
        5.4305e-01,  4.8144e-01, -4.9827e-01, -4.4301e-01, -8.5195e-01,
       -7.3448e-01, -1.5931e-01,  5.0877e-01,  6.5791e-01,  2.1944e-01,
       -2.2480e-01,  1.5808e-01, -1.8122e-01,  7.7452e-01, -4.3319e-01,
       -7.0192e-01, -2.4444e-01, -6.1157e-02,  4.4814e-01, -3.8710e-01,
       -5.3779e-01,  1.1006e-01, -1.7303e-01,  6.7574e-01, -1.6612e-01,
       -4.2205e-03,  6.3733e-02,  5.4731e-02,  1.7517e-02,  3.0798e-01,
       -2.4405e-01,  2.7635e-01,  2.1865e-01, -3.3787e-01,  7.6142e-02,
        1.7089e-01,  3.5796e-01, -3.8512e-01, -2.2535e-01, -9.1429e-01,
        1.0118e-01, -3.5844e-01, -2.4257e-02,  1.2927e-01, -7.4470e-02,
       -3.8815e-01,  3.9420e-01,  6.0092e-01,  2.1004e-01,  3.12

In [8]:
apple_vector = glove_vectors.get_vector('apple')

In [9]:
orange_vector = glove_vectors.get_vector('orange')

In [10]:
from scipy import spatial

In [11]:
# scipy returns the cosine *distance*; 1 - distance is the cosine similarity.
1 - spatial.distance.cosine(orange_vector, apple_vector) # orange vs apple

0.32060176134109497

In [12]:
animal_vector = glove_vectors.get_vector('animal')

In [13]:
1 - spatial.distance.cosine(orange_vector, animal_vector) # orange vs animal

0.11694557219743729

In [14]:
juice_vector = glove_vectors.get_vector('juice')

In [15]:
1 - spatial.distance.cosine(orange_vector, juice_vector) # orange vs juice

0.47728532552719116

In [16]:
1 - spatial.distance.cosine(apple_vector, juice_vector) # apple vs juice

0.441051185131073

In [17]:
1 - spatial.distance.cosine(animal_vector, juice_vector) # animal vs juice

0.09790198504924774

In [18]:
1 - spatial.distance.cosine(twitter_vector, juice_vector) # twitter vs juice

0.0735817700624466

In [19]:
tweet_vector = glove_vectors.get_vector('tweet')

In [20]:
1 - spatial.distance.cosine(twitter_vector, tweet_vector) # twitter vs tweet

0.5980194211006165

In [22]:
# Cosine similarity between the embeddings of 'dog' and 'cat'.
dog_vector = glove_vectors.get_vector('dog')
cat_vector = glove_vectors.get_vector('cat')

1 - spatial.distance.cosine(dog_vector, cat_vector)

0.6816746592521667

In [23]:
# Four sample sentences for the sentence-similarity demo:
# indices 0 and 1 are about computing hardware, indices 2 and 3 are about football.
sentences = [
    "The MacBook Pro is a line of Macintosh notebook computers by Apple Inc.",
    "Pentium is a brand used for a series of x86 architecture-compatible microprocessors produced by Intel.",
    "American football, referred to simply as football in the United States and Canada.",
    "The Serie A is a professional league competition for football clubs located at the top of the Italian football league system."
]

In [24]:
import string
import spacy
from nltk.corpus import stopwords
import re

# NLTK's English stopword list; data_cleaner filters tokens against it.
english_stopwords = stopwords.words('english')
# spaCy English pipeline; data_cleaner reads each token's lemma from it.
nlp = spacy.load('en_core_web_sm')
# NOTE(review): `punctuation` is not referenced by any cell visible here;
# data_cleaner uses string.punctuation directly.
punctuation = set(string.punctuation)

def data_cleaner(sentence):
    """Normalize a sentence into a list of cleaned, lemmatized tokens.

    Steps, in order: lowercase, replace ASCII punctuation with spaces,
    lemmatize with the module-level spaCy pipeline ``nlp``, drop words found
    in ``english_stopwords``, strip digit characters, split on whitespace.

    Args:
        sentence: The raw text to clean (a ``str``).

    Returns:
        list[str]: The remaining tokens. Tokens made only of digits vanish;
        tokens containing digits keep their non-digit characters
        (e.g. ``"x86"`` becomes ``"x"``).
    """
    # Map every ASCII punctuation character to a space in a single pass.
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    text = sentence.lower().translate(punctuation_to_space)

    lemmatized = " ".join(token.lemma_ for token in nlp(text))

    # Set membership is O(1) per word instead of a scan over the stopword list.
    stop_words = set(english_stopwords)
    kept = " ".join(word for word in lemmatized.split() if word not in stop_words)

    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning from Python 3.12).
    without_digits = re.sub(r"\d", "", kept)

    return without_digits.split()

In [25]:
import numpy as np
def avg_vector(sentence):
    """Return the mean embedding of the in-vocabulary words of ``sentence``.

    Words missing from ``glove_vectors`` are ignored and do not count toward
    the average.

    Args:
        sentence: Iterable of word strings (e.g. the output of ``data_cleaner``).

    Returns:
        numpy.ndarray: float64 vector of length ``glove_vectors.vector_size``.
        It is all zeros when no word of ``sentence`` is in the vocabulary,
        which includes the empty sentence.
    """
    # Take the dimensionality from the loaded model instead of hard-coding 300,
    # so the function keeps working if a different embedding model is loaded.
    dim = glove_vectors.vector_size
    total = np.zeros(dim)
    known_words = 0
    for word in sentence:
        if word in glove_vectors.key_to_index:
            total += glove_vectors.get_vector(word)
            known_words += 1
    if known_words == 0:
        return np.zeros(dim)

    return total / known_words

# calculate the average vector for each sentence

In [26]:
# Build one averaged embedding per sample sentence, in the same order as `sentences`.
vectors = []

for sentence in sentences:
    # Clean/tokenize first, then average the word vectors of the tokens.
    vectors.append(avg_vector(data_cleaner(sentence)))

In [27]:
vectors

[array([ 0.08827938,  0.2051965 ,  0.08929362, -0.371945  ,  0.104481  ,
        -0.2298181 , -0.22203688,  0.01610125,  0.42278187, -0.75414499,
        -0.19915662,  0.09173176,  0.4875765 , -0.47093149,  0.33541813,
         0.2786028 , -0.169387  ,  0.019512  , -0.068304  , -0.12165512,
         0.136586  ,  0.15210925, -0.22464688, -0.12444638,  0.0528445 ,
        -0.12840512,  0.03025537,  0.00899975,  0.38228625, -0.18794675,
         0.12620875, -0.092196  ,  0.13638962,  0.23949725, -0.51634549,
        -0.26008625, -0.22530326, -0.18710937, -0.09517774, -0.19937662,
         0.15906375,  0.35603249, -0.25223375,  0.2190975 ,  0.02374388,
         0.09801825,  0.235172  ,  0.09902125,  0.196265  , -0.16661777,
         0.36977375, -0.05864538,  0.23165338,  0.07521562,  0.00184699,
        -0.08794488,  0.07157625,  0.01529837, -0.01192125, -0.040429  ,
         0.05003775,  0.06012   , -0.1173645 ,  0.24027401, -0.08571662,
         0.30950625, -0.48429875,  0.11095763, -0.0

In [28]:
1 - spatial.distance.cosine(vectors[0], vectors[1]) # MacBook Pro vs Intel

0.6553946566063276

In [29]:
1 - spatial.distance.cosine(vectors[2], vectors[3]) # American football vs Serie A

0.6928358646640873

In [30]:
1 - spatial.distance.cosine(vectors[0], vectors[2]) # MacBook Pro vs American football

0.20898472500966325

In [31]:
1 - spatial.distance.cosine(vectors[1], vectors[2]) # Intel vs American football

0.2559275422687064

In [32]:
1 - spatial.distance.cosine(vectors[0], vectors[3]) # MacBook Pro vs Serie A

0.2382190857623825

In [33]:
1 - spatial.distance.cosine(vectors[1], vectors[3]) # Intel vs Serie A

0.23785006511457152

## ESERCIZIO

## Utilizzando il dataset visto nella lezione sul topic modeling, individuare il documento del dataset più simile a un altro documento, scelto a piacere, dello stesso dataset.

In [34]:
import pandas as pd

In [35]:
# Load the research-article dataset; the path is relative to the working directory.
dataset = pd.read_csv('datasets/Lezione_7-Topic_modeling/dataset_Research_Article.csv')

In [36]:
dataset

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
20967,20968,Contemporary machine learning: a guide for pra...,Machine learning is finding increasingly bro...,1,1,0,0,0,0
20968,20969,Uniform diamond coatings on WC-Co hard alloy c...,Polycrystalline diamond coatings have been g...,0,1,0,0,0,0
20969,20970,Analysing Soccer Games with Clustering and Con...,We present a new approach for identifying si...,1,0,0,0,0,0
20970,20971,On the Efficient Simulation of the Left-Tail o...,The sum of Log-normal variates is encountere...,0,0,1,1,0,0


In [37]:
# One averaged embedding per article TITLE (the ABSTRACT column is not used).
# NOTE: this rebinds `vectors`, replacing the sample-sentence vectors computed earlier.
vectors = [avg_vector(data_cleaner(doc)) for doc in dataset['TITLE']]

In [43]:
def most_similar(vectors, index):
    """Find the vector most cosine-similar to ``vectors[index]``.

    All-zero vectors are skipped, because cosine similarity is undefined for
    them. The query vector itself is never a candidate.

    Args:
        vectors: Sequence of 1-D numpy arrays of equal length.
        index: Position in ``vectors`` of the query vector.

    Returns:
        tuple: ``(similarity, index_similar_doc)``, the best cosine similarity
        found and the position of the vector that achieved it. ``(0, 0)`` is
        returned when no candidate has a similarity greater than 0, or when
        the query vector itself is all zeros.
    """
    similarity = 0
    index_similar_doc = 0
    query = vectors[index]

    # A zero query has no direction, so every comparison would be NaN.
    if not np.any(query):
        return similarity, index_similar_doc

    for i, candidate in enumerate(vectors):
        # np.any is the right "not the zero vector" test. The former
        # `.all() != np.zeros(300).all()` check required every component to be
        # non-zero and so dropped valid vectors containing a single 0.0.
        if i == index or not np.any(candidate):
            continue
        score = 1 - spatial.distance.cosine(query, candidate)
        if score > similarity:
            index_similar_doc = i
            similarity = score

    return similarity, index_similar_doc

In [61]:
# Query document: entry 908 of the title vectors; find the closest other title.
index = 908
similarity,index_similar_doc = most_similar(vectors,index)

In [62]:
index_similar_doc

11741

In [63]:
similarity

0.7346123395584204

In [64]:
dataset['TITLE'][index]

'Forecasting the Impact of Stellar Activity on Transiting Exoplanet Spectra'

In [65]:
dataset['TITLE'][index_similar_doc]

'Distinguishing the albedo of exoplanets from stellar activity'