##### Master Degree in Computer Science and Data Science for Economics

# Word2Vec resources
## Example of using W2V to check for semantic shifts

### Alfio Ferrara


In [1]:
from gensim.models import Word2Vec

## Main functionalities of `gensim` implementation

In [2]:
import pymongo
from nltk.tokenize import word_tokenize
from scipy.spatial import distance

In [3]:
db = pymongo.MongoClient()['cousine']
recipes = db['foodcom']

q = {}
recipe_corpus = []
size = recipes.count_documents(q)
limit = 50_000

for recipe in recipes.find(q).limit(limit):
    try:
        recipe_corpus.append(word_tokenize(recipe['description'].lower()))
    except TypeError:
        pass 
    except AttributeError:
        pass 

In [4]:
print(recipe_corpus[0])

['we', 'love', 'grits', ',', 'this', 'is', 'another', 'good', 'way', 'to', 'serve', 'them', '.', 'a', 'great', 'alternative', 'to', 'a', 'baked', 'potato', 'when', 'served', 'with', 'grilled', 'steak', 'or', 'chicken', '.', 'i', 'belive', 'this', 'recipe', 'could', 'be', 'made', 'with', 'instant', 'grits.the', '2', '1/2', 'hours', 'for', 'refrigeration', 'is', 'not', 'include', 'in', 'time', '.', 'the', 'recipe', 'comes', 'from', 'tast', 'of', 'home', "'s", 'light', 'and', 'tasty', '.']


In [5]:
recipe_model = Word2Vec(sentences=recipe_corpus, vector_size=300, window=5, 
                        min_count=1, workers=8, epochs=25)

### Similarity

In [6]:
recipe_model.wv.most_similar('dinner')

[('supper', 0.6802205443382263),
 ('meal', 0.5461012125015259),
 ('brunch', 0.47505271434783936),
 ('appetizer', 0.4526340365409851),
 ('entree', 0.45230329036712646),
 ('entertaining', 0.4501704275608063),
 ('dinners', 0.429585725069046),
 ('gathering', 0.4263454079627991),
 ('lunch', 0.42482757568359375),
 ('meals', 0.41012176871299744)]

## Compositionality

In [7]:
dm = recipe_model.wv.doesnt_match(['pasta', 'spaghetti', 'noodles', 'apple'])
common = recipe_model.wv.get_mean_vector(['pasta', 'spaghetti', 'noodles', 'risotto'])
common_word = recipe_model.wv.similar_by_vector(common)
analogy = recipe_model.wv.most_similar(positive=['pizza', 'steak'], negative=['tomato'])

In [8]:
print(f"Doesn't match: {dm}")
print(f"Common terms: {common_word}")
print(f"Analogy: {analogy}")

Doesn't match: apple
Common terms: [('pasta', 0.7843869924545288), ('noodles', 0.783257782459259), ('spaghetti', 0.7344835996627808), ('risotto', 0.601172924041748), ('lasagna', 0.5741502642631531), ('polenta', 0.5542970299720764), ('linguine', 0.5229288935661316), ('fettuccine', 0.5191859602928162), ('penne', 0.518997848033905), ('couscous', 0.508514404296875)]
Analogy: [('grill', 0.38019317388534546), ('bbq', 0.35111916065216064), ('burger', 0.34664222598075867), ('steaks', 0.340492844581604), ('lasagna', 0.3275046646595001), ('reuben', 0.32319068908691406), ('ribs', 0.3157573342323303), ('broiling', 0.3092014491558075), ('fajitas', 0.30499404668807983), ('indoor', 0.30170127749443054)]


## Compare models vectors to measure a shift in meaning

In [9]:
db_tv = pymongo.MongoClient()['tmdb']
tvseries = db_tv['tvseries']

q = {}
tv_corpus = []
size = tvseries.count_documents(q)
limit = 50_000

for tvs in tvseries.find(q).limit(limit):
    try:
        tv_corpus.append(word_tokenize(tvs['overview'].lower()))
    except TypeError:
        pass 
    except AttributeError:
        pass 

In [10]:
tv_corpus[0][:6]

['walter', 'white', ',', 'a', 'new', 'mexico']

In [11]:
tv_model = Word2Vec(sentences=tv_corpus, vector_size=100, window=5, 
                        min_count=1, workers=8, epochs=50)

In [12]:
tv_model.wv.most_similar('dinner')

[('ultimate', 0.9939123392105103),
 ('phone', 0.9924060702323914),
 ('scoops', 0.9913731217384338),
 ('maria', 0.9903990030288696),
 ('clarity', 0.9899325966835022),
 ('miami', 0.9899277091026306),
 ('skywalker', 0.9898713827133179),
 ('mace', 0.9897506237030029),
 ('mei', 0.9896987676620483),
 ('fan', 0.9895042777061462)]

In [13]:
recipe_model.wv.most_similar('brother')

[('sister', 0.7777917981147766),
 ('daughter', 0.7555021643638611),
 ('boyfriend', 0.7444507479667664),
 ('niece', 0.7425244450569153),
 ('wife', 0.7346354722976685),
 ('dd', 0.7327642440795898),
 ('dad', 0.7315940856933594),
 ('fiance', 0.728138267993927),
 ('son', 0.7229806780815125),
 ('brother-in-law', 0.7174952626228333)]

## Measuring semantic shifts

In [14]:
italian_q = {'search_terms': 'italian'}
chinese_q = {'search_terms': 'chinese'}
limit = 5_000
italian_corpus = []
chinese_corpus = []

for q, c in [(italian_q, italian_corpus), (chinese_q, chinese_corpus)]:
    for doc in recipes.find(q).limit(limit):
        try:
            tokens = word_tokenize(doc['description'].lower())
            c.append(tokens)
        except AttributeError:
            pass

print(f"Italian corpus: {len(italian_corpus)}, Chinese corpus: {len(chinese_corpus)}")

Italian corpus: 4920, Chinese corpus: 4871


In [15]:
main_corpus = italian_corpus + chinese_corpus
m0 = Word2Vec(sentences=main_corpus, vector_size=100, window=5, 
                        min_count=1, workers=8, epochs=50)

In [16]:
m0.wv.most_similar('dinner')

[('meal', 0.658794105052948),
 ('supper', 0.6461277008056641),
 ('lunch', 0.5710224509239197),
 ('night', 0.5304909944534302),
 ('multitasking', 0.5162426829338074),
 ('snack', 0.5153259038925171),
 ('brunch', 0.5014334321022034),
 ('dinners', 0.49514809250831604),
 ('cocktail', 0.4790607988834381),
 ('take-alongs', 0.47777262330055237)]

### Fine tune the global model to specific sub-corpora

In [17]:
import copy

In [18]:
m_it = copy.deepcopy(m0)
m_ch = copy.deepcopy(m0)

In [19]:
m_it.train(italian_corpus, total_examples=m0.corpus_count, epochs=m0.epochs)
m_ch.train(chinese_corpus, total_examples=m0.corpus_count, epochs=m0.epochs)

(7022862, 9992950)

In [20]:
m_it.wv.most_similar('dinner')

[('meal', 0.5930757522583008),
 ('supper', 0.546654224395752),
 ('snack', 0.5166825652122498),
 ('cocktail', 0.4984075427055359),
 ('multitasking', 0.4908176362514496),
 ('hostess', 0.46310943365097046),
 ('take-alongs', 0.45842233300209045),
 ('brunch', 0.45402300357818604),
 ('lunch', 0.45054692029953003),
 ('picnics', 0.4429490268230438)]

In [21]:
m_ch.wv.most_similar('dinner')

[('meal', 0.6648055911064148),
 ('supper', 0.5536285638809204),
 ('multitasking', 0.49264785647392273),
 ('night', 0.48381927609443665),
 ('lunch', 0.48119181394577026),
 ('snack', 0.46922823786735535),
 ('crowd', 0.43798351287841797),
 ('entree', 0.41686946153640747),
 ('entertaining', 0.4125153720378876),
 ('unexpected', 0.40737923979759216)]

In [22]:
word = 'spaghetti'
v0, vit, vch = m0.wv.get_vector(word), m_it.wv.get_vector(word), m_ch.wv.get_vector(word)

In [23]:
print(f"Moving to IT: {distance.cosine(vit, v0)}")
print(f"Moving to CH: {distance.cosine(vch, v0)}")
print(f"Moving from IT to CH: {distance.cosine(vch, vit)}")

Moving to IT: 0.05137380935261615
Moving to CH: 0.05520601573640993
Moving from IT to CH: 0.09825096112418708
