##### Master Degree in Computer Science and Data Science for Economics

# Word2Vec resources
## Example of using W2V to check for semantic shifts

### Alfio Ferrara


In [1]:
from gensim.models import Word2Vec

## Main functionalities of `gensim` implementation

In [3]:
import pymongo
from nltk.tokenize import word_tokenize
from scipy.spatial import distance

In [4]:
db = pymongo.MongoClient()['cousine']
recipes = db['foodcom']

q = {}
recipe_corpus = []
size = recipes.count_documents(q)
limit = 50_000

for recipe in recipes.find(q).limit(limit):
    try:
        recipe_corpus.append(word_tokenize(recipe['description'].lower()))
    except TypeError:
        pass 
    except AttributeError:
        pass 

In [5]:
print(recipe_corpus[0])

['we', 'love', 'grits', ',', 'this', 'is', 'another', 'good', 'way', 'to', 'serve', 'them', '.', 'a', 'great', 'alternative', 'to', 'a', 'baked', 'potato', 'when', 'served', 'with', 'grilled', 'steak', 'or', 'chicken', '.', 'i', 'belive', 'this', 'recipe', 'could', 'be', 'made', 'with', 'instant', 'grits.the', '2', '1/2', 'hours', 'for', 'refrigeration', 'is', 'not', 'include', 'in', 'time', '.', 'the', 'recipe', 'comes', 'from', 'tast', 'of', 'home', "'s", 'light', 'and', 'tasty', '.']


In [6]:
recipe_model = Word2Vec(sentences=recipe_corpus, vector_size=300, window=5, 
                        min_count=1, workers=8, epochs=25)

### Similarity

In [20]:
recipe_model.wv.most_similar('dinner')

[('supper', 0.6818049550056458),
 ('meal', 0.5486781001091003),
 ('brunch', 0.478179007768631),
 ('entertaining', 0.4659525454044342),
 ('entree', 0.45991620421409607),
 ('appetizer', 0.45308664441108704),
 ('dinners', 0.44084885716438293),
 ('lunch', 0.41908538341522217),
 ('picnic', 0.4138370752334595),
 ('gathering', 0.41037610173225403)]

## Compositionality

In [8]:
dm = recipe_model.wv.doesnt_match(['pasta', 'spaghetti', 'noodles', 'apple'])
common = recipe_model.wv.get_mean_vector(['pasta', 'spaghetti', 'noodles', 'risotto'])
common_word = recipe_model.wv.similar_by_vector(common)
analogy = recipe_model.wv.most_similar(positive=['pizza', 'steak'], negative=['tomato'])

In [9]:
print(f"Doesn't match: {dm}")
print(f"Common terms: {common_word}")
print(f"Analogy: {analogy}")

Doesn't match: apple
Common terms: [('pasta', 0.7942464351654053), ('noodles', 0.7896058559417725), ('spaghetti', 0.7262192368507385), ('lasagna', 0.6067780256271362), ('risotto', 0.5812796950340271), ('polenta', 0.5663238167762756), ('fettuccine', 0.5209885835647583), ('couscous', 0.5188214778900146), ('penne', 0.5112525820732117), ('steamed', 0.49540385603904724)]
Analogy: [('steaks', 0.381004273891449), ('grill', 0.37874242663383484), ('ribs', 0.36033061146736145), ('burger', 0.35143524408340454), ('roast', 0.3302801549434662), ('flank', 0.3176497519016266), ('kransky', 0.3118528425693512), ('bbq', 0.3111320734024048), ('sandwich', 0.31009718775749207), ('rack', 0.30922338366508484)]


## Compare models vectors to measure a shift in meaning

In [10]:
db_tv = pymongo.MongoClient()['tmdb']
tvseries = db_tv['tvseries']

q = {}
tv_corpus = []
size = tvseries.count_documents(q)
limit = 50_000

for tvs in tvseries.find(q).limit(limit):
    try:
        tv_corpus.append(word_tokenize(tvs['overview'].lower()))
    except TypeError:
        pass 
    except AttributeError:
        pass 

In [11]:
tv_corpus[0][:6]

['walter', 'white', ',', 'a', 'new', 'mexico']

In [12]:
tv_model = Word2Vec(sentences=tv_corpus, vector_size=100, window=5, 
                        min_count=1, workers=8, epochs=50)

In [19]:
tv_model.wv.most_similar('dinner')

[('ultimate', 0.9926475286483765),
 ('watch', 0.9902042150497437),
 ('sect', 0.9894689917564392),
 ('robotechnology', 0.987846314907074),
 ('tenured', 0.9876978397369385),
 ('emmy', 0.9876712560653687),
 ('fort', 0.9876428246498108),
 ('discussion', 0.9875051975250244),
 ('head', 0.9873998165130615),
 ('phone', 0.9871098399162292)]

In [47]:
recipe_model.wv.most_similar('brother')

[('boyfriend', 0.7582463026046753),
 ('sister', 0.7582341432571411),
 ('daughter', 0.7382907271385193),
 ('fiance', 0.7378897070884705),
 ('dad', 0.7274300456047058),
 ('niece', 0.7191648483276367),
 ('wife', 0.7175347208976746),
 ('dd', 0.7085770964622498),
 ('father', 0.7035520076751709),
 ('son', 0.6993942260742188)]

## Measuring semantic shifts

In [21]:
italian_q = {'search_terms': 'italian'}
chinese_q = {'search_terms': 'chinese'}
limit = 5_000
italian_corpus = []
chinese_corpus = []

for q, c in [(italian_q, italian_corpus), (chinese_q, chinese_corpus)]:
    for doc in recipes.find(q).limit(limit):
        try:
            tokens = word_tokenize(doc['description'].lower())
            c.append(tokens)
        except AttributeError:
            pass

print(f"Italian corpus: {len(italian_corpus)}, Chinese corpus: {len(chinese_corpus)}")

Italian corpus: 4920, Chinese corpus: 4871


In [22]:
main_corpus = italian_corpus + chinese_corpus
m0 = Word2Vec(sentences=main_corpus, vector_size=100, window=5, 
                        min_count=1, workers=8, epochs=50)

In [25]:
m0.wv.most_similar('dinner')

[('meal', 0.616359293460846),
 ('supper', 0.6161196231842041),
 ('lunch', 0.6113405823707581),
 ('night', 0.5285757780075073),
 ('snack', 0.504379153251648),
 ('brunch', 0.4708978533744812),
 ('starter', 0.4672921299934387),
 ('dinners', 0.4645257294178009),
 ('week', 0.46191006898880005),
 ('multitasking', 0.4587549865245819)]

### Fine tune the global model to specific sub-corpora

In [26]:
import copy

In [27]:
m_it = copy.deepcopy(m0)
m_ch = copy.deepcopy(m0)

In [28]:
m_it.train(italian_corpus, total_examples=m0.corpus_count, epochs=m0.epochs)
m_ch.train(chinese_corpus, total_examples=m0.corpus_count, epochs=m0.epochs)

(7020588, 9992950)

In [31]:
m_it.wv.most_similar('dinner')

[('meal', 0.5755337476730347),
 ('supper', 0.5337268710136414),
 ('snack', 0.5199846029281616),
 ('lunch', 0.4908667206764221),
 ('hostess', 0.47211456298828125),
 ('multitasking', 0.4704848527908325),
 ('starter', 0.4685384929180145),
 ('brunch', 0.44351819157600403),
 ('take-alongs', 0.4351400136947632),
 ('cocktail', 0.4314136207103729)]

In [32]:
m_ch.wv.most_similar('dinner')

[('meal', 0.6126087307929993),
 ('supper', 0.5882522463798523),
 ('lunch', 0.5128371119499207),
 ('night', 0.47236335277557373),
 ('multitasking', 0.45236438512802124),
 ('snack', 0.4518190324306488),
 ('hostess', 0.4326353669166565),
 ('entree', 0.429100900888443),
 ('crowd', 0.41184669733047485),
 ('appetiser', 0.40547966957092285)]

In [33]:
word = 'spaghetti'
v0, vit, vch = m0.wv.get_vector(word), m_it.wv.get_vector(word), m_ch.wv.get_vector(word)

In [35]:
print(f"Moving to IT: {distance.cosine(vit, v0)}")
print(f"Moving to CH: {distance.cosine(vch, v0)}")
print(f"Moving from IT to CH: {distance.cosine(vch, vit)}")

Moving to IT: 0.055795016512140916
Moving to CH: 0.058662352822629216
Moving from IT to CH: 0.12499735031046066
