##### Master Degree in Computer Science and Data Science for Economics

# Word2Vec resources
## Example of using W2V to check for semantic shifts

### Alfio Ferrara


In [1]:
from gensim.models import Word2Vec

## Main functionalities of `gensim` implementation

In [63]:
import pymongo
from nltk.tokenize import word_tokenize
from scipy.spatial import distance

In [6]:
db = pymongo.MongoClient()['cousine']
recipes = db['foodcom']

q = {}
recipe_corpus = []
size = recipes.count_documents(q)
limit = 50_000

for recipe in recipes.find(q).limit(limit):
    try:
        recipe_corpus.append(word_tokenize(recipe['description'].lower()))
    except TypeError:
        pass 
    except AttributeError:
        pass 

In [7]:
print(recipe_corpus[0])

['we', 'love', 'grits', ',', 'this', 'is', 'another', 'good', 'way', 'to', 'serve', 'them', '.', 'a', 'great', 'alternative', 'to', 'a', 'baked', 'potato', 'when', 'served', 'with', 'grilled', 'steak', 'or', 'chicken', '.', 'i', 'belive', 'this', 'recipe', 'could', 'be', 'made', 'with', 'instant', 'grits.the', '2', '1/2', 'hours', 'for', 'refrigeration', 'is', 'not', 'include', 'in', 'time', '.', 'the', 'recipe', 'comes', 'from', 'tast', 'of', 'home', "'s", 'light', 'and', 'tasty', '.']


In [8]:
recipe_model = Word2Vec(sentences=recipe_corpus, vector_size=300, window=5, 
                        min_count=1, workers=8, epochs=25)

### Similarity

In [9]:
recipe_model.wv.most_similar('pasta')

[('noodles', 0.5552150011062622),
 ('fettuccine', 0.5019420385360718),
 ('spaghetti', 0.48538538813591003),
 ('lasagna', 0.470816045999527),
 ('greens', 0.4468240737915039),
 ('broccoli', 0.4334532618522644),
 ('couscous', 0.4136292338371277),
 ('polenta', 0.4132252335548401),
 ('steamed', 0.40818169713020325),
 ('linguine', 0.4080011546611786)]

## Compositionality

In [30]:
dm = recipe_model.wv.doesnt_match(['pasta', 'spaghetti', 'noodles', 'apple'])
common = recipe_model.wv.get_mean_vector(['pasta', 'spaghetti', 'noodles', 'risotto'])
common_word = recipe_model.wv.similar_by_vector(common)
analogy = recipe_model.wv.most_similar(positive=['pizza', 'steak'], negative=['tomato'])

In [31]:
print(f"Doesn't match: {dm}")
print(f"Common terms: {common_word}")
print(f"Analogy: {analogy}")

Doesn't match: apple
Common terms: [('noodles', 0.7980520725250244), ('pasta', 0.7943751811981201), ('spaghetti', 0.7324199080467224), ('lasagna', 0.6158578395843506), ('risotto', 0.6023666858673096), ('polenta', 0.5509158372879028), ('fettuccine', 0.5406956672668457), ('linguine', 0.5256999731063843), ('penne', 0.5146601796150208), ('couscous', 0.4943198263645172)]
Analogy: [('steaks', 0.39486193656921387), ('grill', 0.368589848279953), ('bbq', 0.36669859290122986), ('burger', 0.35022082924842834), ('lasagna', 0.33507680892944336), ('ribs', 0.3340110778808594), ('grilled', 0.32294386625289917), ('roast', 0.3165149688720703), ('fork', 0.3116554319858551), ('sirloin', 0.31139659881591797)]


## Compare models vectors to measure a shift in meaning

In [37]:
db_tv = pymongo.MongoClient()['tmdb']
tvseries = db_tv['tvseries']

q = {}
tv_corpus = []
size = tvseries.count_documents(q)
limit = 50_000

for tvs in tvseries.find(q).limit(limit):
    try:
        tv_corpus.append(word_tokenize(tvs['overview'].lower()))
    except TypeError:
        pass 
    except AttributeError:
        pass 

In [40]:
tv_corpus[0][:6]

['walter', 'white', ',', 'a', 'new', 'mexico']

In [45]:
tv_model = Word2Vec(sentences=tv_corpus, vector_size=100, window=5, 
                        min_count=1, workers=8, epochs=50)

In [46]:
tv_model.wv.most_similar('brother')

[('redeem', 0.9673093557357788),
 ('loveless', 0.9648919701576233),
 ('mission', 0.9624361395835876),
 ('elder', 0.961053729057312),
 ('parents', 0.9608786702156067),
 ('problematic', 0.959640622138977),
 ('keung', 0.9570775628089905),
 ('superheroes', 0.9566478729248047),
 ('enhancing', 0.9562162160873413),
 ('peeping', 0.9530292749404907)]

In [47]:
recipe_model.wv.most_similar('brother')

[('boyfriend', 0.7582463026046753),
 ('sister', 0.7582341432571411),
 ('daughter', 0.7382907271385193),
 ('fiance', 0.7378897070884705),
 ('dad', 0.7274300456047058),
 ('niece', 0.7191648483276367),
 ('wife', 0.7175347208976746),
 ('dd', 0.7085770964622498),
 ('father', 0.7035520076751709),
 ('son', 0.6993942260742188)]

## Measuring semantic shifts

In [50]:
italian_q = {'search_terms': 'italian'}
chinese_q = {'search_terms': 'chinese'}
limit = 5_000
italian_corpus = []
chinese_corpus = []

for q, c in [(italian_q, italian_corpus), (chinese_q, chinese_corpus)]:
    for doc in recipes.find(q).limit(limit):
        try:
            tokens = word_tokenize(doc['description'].lower())
            c.append(tokens)
        except AttributeError:
            pass

print(f"Italian corpus: {len(italian_corpus)}, Chinese corpus: {len(chinese_corpus)}")

Italian corpus: 4920, Chinese corpus: 4871


In [51]:
main_corpus = italian_corpus + chinese_corpus
m0 = Word2Vec(sentences=main_corpus, vector_size=100, window=5, 
                        min_count=1, workers=8, epochs=50)

In [52]:
m0.wv.most_similar('pasta')

[('spaghetti', 0.5600408911705017),
 ('penne', 0.45347607135772705),
 ('linguine', 0.4507941007614136),
 ('fish', 0.4485734701156616),
 ('hair', 0.440589964389801),
 ('fettuccine', 0.4274340867996216),
 ('meat', 0.42419877648353577),
 ('tomato', 0.41989225149154663),
 ('risotto', 0.4197191894054413),
 ('ziti', 0.41755545139312744)]

### Fine tune the global model to specific sub-corpora

In [53]:
import copy

In [54]:
m_it = copy.deepcopy(m0)
m_ch = copy.deepcopy(m0)

In [57]:
m_it.train(italian_corpus, total_examples=m0.corpus_count, epochs=m0.epochs)
m_ch.train(chinese_corpus, total_examples=m0.corpus_count, epochs=m0.epochs)

(7021719, 9992950)

In [60]:
m_it.wv.most_similar('sauce')

[('soup', 0.5136364698410034),
 ('sauces', 0.4332062304019928),
 ('dish', 0.430277943611145),
 ('chicken-you', 0.416638046503067),
 ('chunky', 0.4089588522911072),
 ('marinade/sauce', 0.40361347794532776),
 ('broth', 0.40259942412376404),
 ('meatballs', 0.3962996304035187),
 ('paste', 0.3920050859451294),
 ('time-tested', 0.38308626413345337)]

In [61]:
m_ch.wv.most_similar('sauce')

[('paste', 0.4618358314037323),
 ('wedges', 0.42947617173194885),
 ('paste/sauce', 0.4207182824611664),
 ('soup', 0.40637680888175964),
 ('chunky', 0.3986596167087555),
 ('marinade', 0.38860464096069336),
 ('dark', 0.38331976532936096),
 ("'dark", 0.378656804561615),
 ('time-tested', 0.3783089816570282),
 ('rich/thick', 0.3740333616733551)]

In [72]:
word = 'spaghetti'
v0, vit, vch = m0.wv.get_vector(word), m_it.wv.get_vector(word), m_ch.wv.get_vector(word)

In [73]:
print(f"Moving to IT: {distance.cosine(vit, v0)}")
print(f"Moving to CH: {distance.cosine(vch, v0)}")

Moving to IT: 0.09616337660618668
Moving to CH: 0.055406030742616186
