##### Master Degree in Computer Science and Data Science for Economics

# Word2Vec resources
## Example of using W2V to check for semantic shifts

### Alfio Ferrara


In [1]:
from gensim.models import Word2Vec

## Main functionalities of `gensim` implementation

In [2]:
import pymongo
from nltk.tokenize import word_tokenize
from scipy.spatial import distance

In [3]:
db = pymongo.MongoClient()['cousine']
recipes = db['foodcom']

q = {}
recipe_corpus = []
size = recipes.count_documents(q)
limit = 50_000

for recipe in recipes.find(q).limit(limit):
    try:
        recipe_corpus.append(word_tokenize(recipe['description'].lower()))
    except TypeError:
        pass 
    except AttributeError:
        pass 

In [4]:
print(recipe_corpus[0])

['we', 'love', 'grits', ',', 'this', 'is', 'another', 'good', 'way', 'to', 'serve', 'them', '.', 'a', 'great', 'alternative', 'to', 'a', 'baked', 'potato', 'when', 'served', 'with', 'grilled', 'steak', 'or', 'chicken', '.', 'i', 'belive', 'this', 'recipe', 'could', 'be', 'made', 'with', 'instant', 'grits.the', '2', '1/2', 'hours', 'for', 'refrigeration', 'is', 'not', 'include', 'in', 'time', '.', 'the', 'recipe', 'comes', 'from', 'tast', 'of', 'home', "'s", 'light', 'and', 'tasty', '.']


In [5]:
recipe_model = Word2Vec(sentences=recipe_corpus, vector_size=300, window=5, 
                        min_count=1, workers=8, epochs=25)

### Similarity

In [6]:
recipe_model.wv.most_similar('dinner')

[('supper', 0.6715042591094971),
 ('meal', 0.5441684126853943),
 ('brunch', 0.4939667582511902),
 ('appetizer', 0.4601024091243744),
 ('lunch', 0.4385051131248474),
 ('entertaining', 0.43703317642211914),
 ('dinners', 0.43435272574424744),
 ('entree', 0.43312951922416687),
 ('picnic', 0.42357006669044495),
 ('gathering', 0.42194387316703796)]

## Compositionality

In [7]:
dm = recipe_model.wv.doesnt_match(['pasta', 'spaghetti', 'noodles', 'apple'])
common = recipe_model.wv.get_mean_vector(['pasta', 'spaghetti', 'noodles', 'risotto'])
common_word = recipe_model.wv.similar_by_vector(common)
analogy = recipe_model.wv.most_similar(positive=['pizza', 'steak'], negative=['tomato'])

In [8]:
print(f"Doesn't match: {dm}")
print(f"Common terms: {common_word}")
print(f"Analogy: {analogy}")

Doesn't match: apple
Common terms: [('noodles', 0.7999148368835449), ('pasta', 0.7884016633033752), ('spaghetti', 0.7322816252708435), ('risotto', 0.5933534502983093), ('lasagna', 0.5874766111373901), ('polenta', 0.5733187198638916), ('linguine', 0.5356707572937012), ('penne', 0.5232851505279541), ('fettuccine', 0.5186668634414673), ('couscous', 0.516014039516449)]
Analogy: [('steaks', 0.3971555233001709), ('burger', 0.35433799028396606), ('grill', 0.3518954813480377), ('lasagna', 0.34866949915885925), ('ribs', 0.33822542428970337), ('roast', 0.3349013328552246), ('bbq', 0.3304198980331421), ('flank', 0.3288681209087372), ('15-inch', 0.32589617371559143), ('grilled', 0.32188883423805237)]


## Compare models vectors to measure a shift in meaning

In [9]:
db_tv = pymongo.MongoClient()['tmdb']
tvseries = db_tv['tvseries']

q = {}
tv_corpus = []
size = tvseries.count_documents(q)
limit = 50_000

for tvs in tvseries.find(q).limit(limit):
    try:
        tv_corpus.append(word_tokenize(tvs['overview'].lower()))
    except TypeError:
        pass 
    except AttributeError:
        pass 

In [10]:
tv_corpus[0][:6]

['walter', 'white', ',', 'a', 'new', 'mexico']

In [15]:
tv_model = Word2Vec(sentences=tv_corpus, vector_size=300, window=5, 
                        min_count=1, workers=8, epochs=50)

In [16]:
tv_model.wv.most_similar('dinner')

[('sect', 0.9945293068885803),
 ('ultimate', 0.9929512739181519),
 ('west', 0.9923814535140991),
 ('shine', 0.9923009872436523),
 ('greedy', 0.9919392466545105),
 ('head', 0.9914314150810242),
 ('never-before-told', 0.991184651851654),
 ('holds', 0.9907960891723633),
 ('leaving', 0.9907811880111694),
 ('secrets', 0.9905073642730713)]

In [17]:
recipe_model.wv.most_similar('brother')

[('sister', 0.7628988027572632),
 ('daughter', 0.7540379166603088),
 ('boyfriend', 0.7494955658912659),
 ('dad', 0.7205260992050171),
 ('father', 0.7112997174263),
 ('brother-in-law', 0.7054951786994934),
 ('wife', 0.7037829160690308),
 ('fiance', 0.7002346515655518),
 ('dd', 0.6990201473236084),
 ('niece', 0.6981290578842163)]

## Measuring semantic shifts

In [18]:
italian_q = {'search_terms': 'italian'}
chinese_q = {'search_terms': 'chinese'}
limit = 5_000
italian_corpus = []
chinese_corpus = []

for q, c in [(italian_q, italian_corpus), (chinese_q, chinese_corpus)]:
    for doc in recipes.find(q).limit(limit):
        try:
            tokens = word_tokenize(doc['description'].lower())
            c.append(tokens)
        except AttributeError:
            pass

print(f"Italian corpus: {len(italian_corpus)}, Chinese corpus: {len(chinese_corpus)}")

Italian corpus: 4920, Chinese corpus: 4871


In [19]:
main_corpus = italian_corpus + chinese_corpus
m0 = Word2Vec(sentences=main_corpus, vector_size=100, window=5, 
                        min_count=1, workers=8, epochs=50)

In [20]:
m0.wv.most_similar('dinner')

[('meal', 0.6804056167602539),
 ('supper', 0.6573017835617065),
 ('lunch', 0.5930486917495728),
 ('dinners', 0.5533140897750854),
 ('night', 0.5517100691795349),
 ('snack', 0.5194377303123474),
 ('starter', 0.4874492585659027),
 ('cocktail', 0.48230254650115967),
 ('picnics', 0.47330382466316223),
 ('guests', 0.4727531969547272)]

### Fine tune the global model to specific sub-corpora

In [21]:
import copy

In [22]:
m_it = copy.deepcopy(m0)
m_ch = copy.deepcopy(m0)

In [23]:
m_it.train(italian_corpus, total_examples=m0.corpus_count, epochs=m0.epochs)
m_ch.train(chinese_corpus, total_examples=m0.corpus_count, epochs=m0.epochs)

(7022315, 9992950)

In [24]:
m_it.wv.most_similar('dinner')

[('meal', 0.6211058497428894),
 ('supper', 0.5325251221656799),
 ('snack', 0.5054898262023926),
 ('cocktail', 0.4965253472328186),
 ('hostess', 0.48585087060928345),
 ('take-alongs', 0.44097140431404114),
 ('multitasking', 0.4398477375507355),
 ('lunch', 0.43938422203063965),
 ('night', 0.4361981153488159),
 ('starter', 0.43114975094795227)]

In [25]:
m_ch.wv.most_similar('dinner')

[('meal', 0.6799939274787903),
 ('supper', 0.5672358274459839),
 ('lunch', 0.4771226942539215),
 ('snack', 0.4711977243423462),
 ('dinners', 0.4625163674354553),
 ('night', 0.43596264719963074),
 ('entertaining', 0.41720816493034363),
 ('multitasking', 0.40387624502182007),
 ('hostess', 0.40293624997138977),
 ('occasion', 0.3939659595489502)]

In [26]:
word = 'spaghetti'
v0, vit, vch = m0.wv.get_vector(word), m_it.wv.get_vector(word), m_ch.wv.get_vector(word)

In [27]:
print(f"Moving to IT: {distance.cosine(vit, v0)}")
print(f"Moving to CH: {distance.cosine(vch, v0)}")
print(f"Moving from IT to CH: {distance.cosine(vch, vit)}")

Moving to IT: 0.06417824268192884
Moving to CH: 0.05927230830607866
Moving from IT to CH: 0.13013635178913807
