In [1]:
import pandas as pd

from gensim.models.phrases import Phrases, Phraser

import logging

# Set up logging so gensim's progress can be monitored from the notebook:
# INFO level and above, each record prefixed with its level and an
# HH:MM:SS timestamp.
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
)

In [2]:
# Notebook display settings.
# Both options are addressed by their fully-qualified names: pandas accepts
# unambiguous shorthand such as 'max_colwidth', but warns that shorthand can
# stop working if a future release adds another option with a similar name.
pd.set_option('display.max_colwidth', 800)         # show long text cells (up to 800 characters)
pd.set_option('display.expand_frame_repr', False)  # do not wrap wide frames across several blocks

In [31]:
# Location of the pre-processed dialogue lines, relative to the notebook.
processed_csv_path = "data/processed_simpsons_dataset.csv"

# Load the whole file into a DataFrame; the cells below read its
# 'spoken_words' column.
df = pd.read_csv(processed_csv_path)

In [32]:
# Preview the first rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,spoken_words
0,actually little disease magazine news show natural think
1,know sure like talk touch lesson plan teach
2,life worth live
3,poll open end recess case decide thought final statement martin
4,victory party slide


In [33]:
# Tokenise every line of dialogue on whitespace: one list of tokens per row.
# Missing values are skipped first: a cell that is empty in the CSV is read
# back as NaN (a float), which has no .split() and would abort the whole
# comprehension with an AttributeError.
sent = [row.split() for row in df['spoken_words'].dropna()]

In [34]:
# Collect unigram/bigram statistics over the tokenised corpus. Entries seen
# fewer than 30 times are ignored; progress is logged every 10,000 sentences.
bigram_phrases = Phrases(sent, min_count=30, progress_per=10000)

# Freeze the statistics into a lightweight Phraser. It detects the same
# phrases as the full Phrases model but is faster and smaller, which matters
# because `sentences` is read again by the frequency count, the vocabulary
# scan and the training run.
bigram_phraser = Phraser(bigram_phrases)

# Corpus with detected bigrams merged into single tokens (e.g. 'god_sake').
sentences = bigram_phraser[sent]

INFO - 18:44:40: collecting all words and their counts
INFO - 18:44:40: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 18:44:40: PROGRESS: at sentence #10000, processed 63414 words and 52395 word types
INFO - 18:44:40: PROGRESS: at sentence #20000, processed 130387 words and 98568 word types
INFO - 18:44:40: PROGRESS: at sentence #30000, processed 192369 words and 137101 word types
INFO - 18:44:40: PROGRESS: at sentence #40000, processed 249283 words and 170989 word types
INFO - 18:44:40: PROGRESS: at sentence #50000, processed 310323 words and 206466 word types
INFO - 18:44:40: PROGRESS: at sentence #60000, processed 372264 words and 241169 word types
INFO - 18:44:41: PROGRESS: at sentence #70000, processed 434951 words and 275773 word types
INFO - 18:44:41: PROGRESS: at sentence #80000, processed 496169 words and 308714 word types
INFO - 18:44:41: collected 329886 word types from a corpus of 540709 words (unigram + bigrams) and 86834 sentences
INFO - 18:44:41: us

In [7]:
from collections import defaultdict

# Count how often each token (single word or merged bigram) occurs.
word_freq = defaultdict(int)
# The loop variables are deliberately NOT called `sent`: that name holds the
# tokenised corpus that the Phrases cell consumes, and overwriting it here
# would make that cell operate on a single sentence when it is re-run.
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
print(len(word_freq))  # number of distinct tokens

# Ten most frequent tokens, most frequent first.
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

29648


['oh', 'like', 'know', 'get', 'hey', 'think', 'right', 'look', 'want', 'come']

### The parameters:
#### **min_count** = int - Ignores all words with total absolute frequency lower than this - (2, 100)
#### **window** = int - The maximum distance between the current and predicted word within a sentence. E.g. window words on the left and window words on the right of our target - (2, 10)
#### **size** = int - Dimensionality of the feature vectors. - (50, 300)
#### **sample** = float - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential. - (0, 1e-5)
#### **alpha** = float - The initial learning rate - (0.01, 0.05)
#### **min_alpha** = float - Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00
#### **negative** = int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn. If set to 0, no negative sampling is used. - (5, 20)
#### **workers** = int - Use these many worker threads to train the model (=faster training with multicore machines)

In [8]:
import multiprocessing

from gensim.models import Word2Vec

cores = multiprocessing.cpu_count()

# Create the (still untrained) model; the vocabulary is built and the weights
# are trained in the following cells.
# One core is left free for the rest of the system, but the worker count is
# clamped to at least 1: on a single-core machine `cores - 1` would be 0 and
# no training thread would be started.
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=max(1, cores - 1))

In [9]:
from time import time

# Start the stopwatch for the vocabulary pass.
t = time()

# Scan the phrase-merged corpus once to collect the vocabulary, logging
# progress every 10,000 sentences.
w2v_model.build_vocab(sentences, progress_per=10000)

# Report the elapsed wall-clock time in minutes, rounded to two decimals.
elapsed_mins = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_mins))

INFO - 16:18:17: collecting all words and their counts
INFO - 16:18:17: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 16:18:18: PROGRESS: at sentence #10000, processed 61523 words, keeping 9464 word types
INFO - 16:18:18: PROGRESS: at sentence #20000, processed 126720 words, keeping 14262 word types
INFO - 16:18:18: PROGRESS: at sentence #30000, processed 187097 words, keeping 17334 word types
INFO - 16:18:18: PROGRESS: at sentence #40000, processed 242599 words, keeping 20033 word types
INFO - 16:18:19: PROGRESS: at sentence #50000, processed 302040 words, keeping 22449 word types
INFO - 16:18:19: PROGRESS: at sentence #60000, processed 362382 words, keeping 24710 word types
INFO - 16:18:19: PROGRESS: at sentence #70000, processed 423597 words, keeping 26800 word types
INFO - 16:18:19: PROGRESS: at sentence #80000, processed 483463 words, keeping 28662 word types
INFO - 16:18:20: collected 29648 word types from a corpus of 526714 raw words and 86834 sentence

Time to build vocab: 0.05 mins


In [10]:
# Restart the stopwatch for the training run.
t = time()

# Train for 30 epochs over the phrase-merged corpus, using the sentence count
# recorded while building the vocabulary; progress is reported every second.
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

# Report the elapsed wall-clock time in minutes, rounded to two decimals.
elapsed_mins = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))

# To persist the trained model for later sessions, uncomment:
# w2v_model.save("test_model.w2v", ignore=[])

INFO - 16:18:20: training model with 11 workers on 3332 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 16:18:21: EPOCH 1 - PROGRESS: at 41.22% examples, 81474 words/s, in_qsize 0, out_qsize 0
INFO - 16:18:22: EPOCH 1 - PROGRESS: at 81.76% examples, 80363 words/s, in_qsize 0, out_qsize 0
INFO - 16:18:23: worker thread finished; awaiting finish of 10 more threads
INFO - 16:18:23: worker thread finished; awaiting finish of 9 more threads
INFO - 16:18:23: worker thread finished; awaiting finish of 8 more threads
INFO - 16:18:23: worker thread finished; awaiting finish of 7 more threads
INFO - 16:18:23: worker thread finished; awaiting finish of 6 more threads
INFO - 16:18:23: worker thread finished; awaiting finish of 5 more threads
INFO - 16:18:23: worker thread finished; awaiting finish of 4 more threads
INFO - 16:18:23: worker thread finished; awaiting finish of 3 more threads
INFO - 16:18:23: worker thread finished; awaiting finish of 2 more threa

Time to train the model: 1.27 mins


In [None]:
from pathlib import Path

# Optional restore step: replace the in-memory model with a previously saved
# one. The load is guarded because the checkpoint only exists if the model was
# saved in an earlier session; without the guard a fresh "Run All" would stop
# here with FileNotFoundError and lose the model that was just trained.
saved_model_path = Path("test_model.w2v")
if saved_model_path.exists():
    w2v_model = Word2Vec.load(str(saved_model_path))

In [35]:
# Precompute the L2 norms of the word vectors ahead of the similarity queries below.
# NOTE(review): replace=True is understood to overwrite the raw vectors with
# their normalised form, which saves memory but rules out further training -
# confirm against the installed gensim version.
w2v_model.init_sims(replace=True)

INFO - 18:46:55: precomputing L2-norms of word weight vectors


In [12]:
# Ask the model for the 15 vocabulary entries it considers closest to 'homer'.
w2v_model.wv.most_similar(positive=["homer"], topn=15)

[('marge', 0.7590745687484741),
 ('worry', 0.738866925239563),
 ('convince', 0.738680362701416),
 ('creepy', 0.7361273765563965),
 ('rude', 0.7276503443717957),
 ('gee', 0.7274417877197266),
 ('ralphie', 0.726743221282959),
 ('chore', 0.7243647575378418),
 ('god_sake', 0.7231760025024414),
 ('snuggle', 0.7201460599899292),
 ('asleep', 0.7189117670059204),
 ('embarrassing', 0.7168461084365845),
 ('crummy', 0.7163548469543457),
 ('hammock', 0.7126702070236206),
 ('talk', 0.7114170789718628)]

In [19]:
# Same query for 'marge', leaving topn at the library default.
w2v_model.wv.most_similar(positive=["marge"])

[('homer', 0.7590745687484741),
 ('snuggle', 0.7550023794174194),
 ('rapture', 0.7539874315261841),
 ('brunch', 0.7461404800415039),
 ('homie', 0.7391209602355957),
 ('sweetheart', 0.7368350625038147),
 ('ohh', 0.7354586720466614),
 ('raccoon', 0.7350224256515503),
 ('glad', 0.7341887950897217),
 ('creepy', 0.7307769060134888)]

In [29]:
# Similarity score between 'maggie' and 'baby'.
w2v_model.wv.similarity('maggie', 'baby')

0.7178596

In [30]:
# Similarity score between 'bart' and 'baby', for comparison with the 'maggie' score.
w2v_model.wv.similarity('bart', 'baby')

0.41893536

In [26]:
# Odd-one-out: which of these three names fits least with the other two?
w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])



'milhouse'

In [24]:
# Odd-one-out among 'nelson', 'bart' and 'milhouse'.
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])

'nelson'

In [27]:
# Analogy-style query: entries close to 'woman' + 'homer' and away from 'marge' (top 3).
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)

[('married', 0.6815634965896606),
 ('admire', 0.6345378160476685),
 ('attract', 0.6280614733695984)]

In [28]:
# Analogy-style query: entries close to 'woman' + 'bart' and away from 'man' (top 3).
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.7096426486968994),
 ('pregnant', 0.6883337497711182),
 ('surprised', 0.6635551452636719)]