<a href="https://colab.research.google.com/github/princetondalmet/Word2Vec/blob/main/sim1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

In [None]:
import spacy  # For preprocessing

import logging  # gensim reports its progress through the logging module

# Emit INFO-level records as "<LEVEL> - <HH:MM:SS>: <message>" so gensim can be monitored.
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

In [None]:
# Mount Google Drive inside the Colab runtime at /content/grive.
# NOTE(review): the CSV loaded afterwards is opened through a relative path, not
# through this mount point — confirm whether the mount is actually required.
from google.colab import drive
drive.mount('/content/grive')

Mounted at /content/grive


In [None]:
# Load the dialogue CSV (path is relative to the current working directory)
# and preview the first rows: one speaker column and one spoken-text column.
df = pd.read_csv("simpsons_dataset.csv")
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [None]:
# Number of missing values in each column.
df.isna().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [None]:
# Discard every row that has a missing value, then renumber the index from 0.
df = df.dropna()
df = df.reset_index(drop=True)

# Confirm that no missing values remain.
df.isna().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [None]:
# Load spaCy's small English pipeline by its package name: the bare 'en' shortcut
# link is rejected by spaCy 3 and later, while the full package name also works on
# 2.x when the model package is installed.
# NER and the dependency parser are both disabled for speed; the cleaning step only
# reads each token's lemma and stop-word flag.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    """Lemmatize a document and drop its stop words.

    Args:
        doc: an iterable of tokens exposing ``lemma_`` and ``is_stop``
            (in this notebook, a spaCy Doc object).

    Returns:
        The remaining lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas are left.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Word2Vec learns a word's vector from its context words, so a sentence of
    # only one or two words contributes very little to training: skip it.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
# Lazily normalise each spoken line: every run of characters other than ASCII
# letters and apostrophes becomes a single space, then the text is lower-cased.
# This is a generator expression, so it can be iterated only once.
brief_cleaning = (
    re.sub("[^A-Za-z']+", ' ', str(line)).lower()
    for line in df['spoken_words']
)

In [None]:
t = time()

# Stream the pre-cleaned lines through spaCy in batches of 5000 and lemmatise each
# resulting Doc; entries are None for lines that `cleaning` considers too short.
# `n_threads` is intentionally not passed: spaCy has ignored it since v2.1 and
# v3 rejects it as an unknown keyword argument.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 1.17 mins


In [None]:
# Put the cleaned sentences in a one-column frame, then drop the None entries
# (sentences rejected as too short) and any exact duplicates.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

(85964, 1)

In [None]:
from gensim.models.phrases import Phrases, Phraser

INFO - 13:53:34: 'pattern' package not found; tag filters are not available for English


In [None]:
# Tokenise every cleaned sentence on whitespace: one list of words per sentence.
sent = list(map(str.split, df_clean['clean']))

In [None]:
# Collect unigram and bigram counts over the tokenised corpus with gensim's Phrases.
# min_count=30 is the count threshold handed to Phrases; progress is logged every
# 10000 sentences.
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 13:54:04: collecting all words and their counts
INFO - 13:54:04: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 13:54:04: PROGRESS: at sentence #10000, processed 63561 words and 52816 word types
INFO - 13:54:04: PROGRESS: at sentence #20000, processed 130943 words and 99866 word types
INFO - 13:54:04: PROGRESS: at sentence #30000, processed 192972 words and 138532 word types
INFO - 13:54:04: PROGRESS: at sentence #40000, processed 249842 words and 172659 word types
INFO - 13:54:05: PROGRESS: at sentence #50000, processed 311265 words and 208566 word types
INFO - 13:54:05: PROGRESS: at sentence #60000, processed 373588 words and 243702 word types
INFO - 13:54:05: PROGRESS: at sentence #70000, processed 436441 words and 278740 word types
INFO - 13:54:05: PROGRESS: at sentence #80000, processed 497829 words and 311886 word types
INFO - 13:54:05: collected 330804 word types from a corpus of 537160 words (unigram + bigrams) and 85964 sentences
INFO - 13:54:05: us

In [None]:
# Wrap the collected phrase statistics in gensim's Phraser; this object is what
# gets applied to the corpus afterwards.
bigram = Phraser(phrases)

INFO - 13:54:22: source_vocab length 330804
INFO - 13:54:25: Phraser built with 126 phrasegrams


In [None]:
# Apply the phrase model to the whole tokenised corpus.
# NOTE(review): detected phrases are presumably emitted as single joined tokens
# (a token such as 'homer_simpson' is queried further down) — confirm.
sentences = bigram[sent]

In [None]:
# Count how often each token occurs in the phrase-transformed corpus.
# The loop variables deliberately avoid the name `sent`: that global holds the full
# tokenised corpus that is fed to `bigram[...]`, and rebinding it to a single
# sentence here would corrupt any later re-run of that step.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1

# Number of distinct tokens seen.
len(word_freq)

30178

In [None]:
# Order the vocabulary from most to least frequent and show the ten most common tokens.
most_frequent_first = sorted(word_freq, key=word_freq.get, reverse=True)
most_frequent_first[:10]

['oh', 'like', 'know', 'get', 'hey', 'think', 'right', 'look', 'want', 'come']

In [None]:
import multiprocessing

from gensim.models import Word2Vec

In [None]:
# Number of CPU cores reported for this machine; used to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()

In [None]:
# Configure the Word2Vec model; vocabulary building and training happen in later steps.
# One core is left free for the rest of the system, but the worker count is clamped
# to at least 1: on a single-core machine `cores - 1` would be 0, leaving no thread
# to do the training.
# NOTE(review): `size` is the gensim 3.x keyword for the vector dimensionality;
# gensim 4 names it `vector_size` — adjust if the runtime ships gensim 4 or later.
w2v_model = Word2Vec(min_count=20,       # frequency threshold for keeping a word
                     window=2,           # context window passed to gensim
                     size=300,           # number of features per word vector
                     sample=6e-5,        # down-sampling threshold passed to gensim
                     alpha=0.03,         # starting learning rate
                     min_alpha=0.0007,   # final learning rate
                     negative=20,        # negative-sampling count
                     workers=max(1, cores - 1))

In [None]:
t = time()

# Scan the corpus once to build the model's vocabulary, logging progress every
# 10000 sentences.
w2v_model.build_vocab(sentences, progress_per=10000)

vocab_mins = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(vocab_mins))

INFO - 13:56:04: collecting all words and their counts
INFO - 13:56:04: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 13:56:04: PROGRESS: at sentence #10000, processed 61718 words, keeping 9558 word types
INFO - 13:56:04: PROGRESS: at sentence #20000, processed 127351 words, keeping 14506 word types
INFO - 13:56:04: PROGRESS: at sentence #30000, processed 187829 words, keeping 17619 word types
INFO - 13:56:05: PROGRESS: at sentence #40000, processed 243332 words, keeping 20385 word types
INFO - 13:56:05: PROGRESS: at sentence #50000, processed 303182 words, keeping 22878 word types
INFO - 13:56:05: PROGRESS: at sentence #60000, processed 363940 words, keeping 25200 word types
INFO - 13:56:05: PROGRESS: at sentence #70000, processed 425408 words, keeping 27401 word types
INFO - 13:56:06: PROGRESS: at sentence #80000, processed 485464 words, keeping 29275 word types
INFO - 13:56:06: collected 30178 word types from a corpus of 523700 raw words and 85964 sentence

Time to build vocab: 0.05 mins


In [None]:
t = time()

# Train for 30 epochs over the corpus; the example count comes from the vocabulary
# scan and progress is reported once per second.
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

train_mins = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(train_mins))

INFO - 13:56:22: training model with 1 workers on 3319 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 13:56:23: EPOCH 1 - PROGRESS: at 27.49% examples, 55677 words/s, in_qsize 0, out_qsize 0
INFO - 13:56:24: EPOCH 1 - PROGRESS: at 49.85% examples, 47369 words/s, in_qsize 0, out_qsize 0
INFO - 13:56:25: EPOCH 1 - PROGRESS: at 67.17% examples, 42360 words/s, in_qsize 0, out_qsize 0
INFO - 13:56:26: EPOCH 1 - PROGRESS: at 93.81% examples, 44809 words/s, in_qsize 0, out_qsize 0
INFO - 13:56:26: worker thread finished; awaiting finish of 0 more threads
INFO - 13:56:26: EPOCH - 1 : training on 523700 raw words (198820 effective words) took 4.3s, 45972 effective words/s
INFO - 13:56:27: EPOCH 2 - PROGRESS: at 29.42% examples, 60532 words/s, in_qsize 0, out_qsize 0
INFO - 13:56:28: EPOCH 2 - PROGRESS: at 63.30% examples, 61198 words/s, in_qsize 0, out_qsize 0
INFO - 13:56:29: EPOCH 2 - PROGRESS: at 95.68% examples, 61775 words/s, in_qsize 0, out_qsize 0
I

Time to train the model: 1.68 mins


In [None]:
# Precompute the L2 norms of the word vectors ahead of the similarity queries.
# NOTE(review): replace=True presumably discards the un-normalised vectors, which
# would make further training impossible — confirm against the gensim docs.
w2v_model.init_sims(replace=True)

INFO - 13:58:14: precomputing L2-norms of word weight vectors


In [None]:
# Nearest neighbours of the token 'homer_simpson' (presumably a bigram joined by the phrase model).
w2v_model.wv.most_similar(positive=["homer_simpson"])

[('recent', 0.7703971862792969),
 ('council', 0.7655794024467468),
 ('governor', 0.7636814117431641),
 ('congratulation', 0.7545962333679199),
 ('easily', 0.7522720694541931),
 ('erotic', 0.7426029443740845),
 ('committee', 0.7418639659881592),
 ('robert', 0.7410692572593689),
 ('defeat', 0.7407906651496887),
 ('pleased', 0.7378360629081726)]

In [None]:
# Nearest neighbours of the token 'marge' in the trained vector space.
w2v_model.wv.most_similar(positive=["marge"])

[('convince', 0.7846249341964722),
 ('homer', 0.7692607641220093),
 ('becky', 0.7607318162918091),
 ('rude', 0.753149151802063),
 ('raccoon', 0.7525891661643982),
 ('sorry', 0.7513182163238525),
 ('grownup', 0.7488009929656982),
 ('fault', 0.7467668056488037),
 ('spoil', 0.7458378076553345),
 ('brunch', 0.7453069090843201)]

In [None]:
# Nearest neighbours of the token 'bart' in the trained vector space.
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.8530301451683044),
 ('homework', 0.8104968070983887),
 ('surprised', 0.793811023235321),
 ('upset', 0.7883384823799133),
 ('mom', 0.7870502471923828),
 ('typical', 0.775067925453186),
 ('convince', 0.7662098407745361),
 ('substitute', 0.7661116123199463),
 ('strangle', 0.7621992230415344),
 ('hearing', 0.762141227722168)]

In [None]:
# Similarity score between the vectors for 'maggie' and 'baby'.
w2v_model.wv.similarity('maggie', 'baby')

0.7096791

In [None]:
# Similarity score between the vectors for 'bart' and 'nelson'.
w2v_model.wv.similarity('bart', 'nelson')

0.6498818

In [None]:
# Odd-one-out query: ask the model which of the three names fits least with the others.
w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'milhouse'

In [None]:
# Odd-one-out query among 'nelson', 'bart' and 'milhouse'.
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'nelson'

In [None]:
# Odd-one-out query among 'homer', 'patty' and 'selma'.
w2v_model.wv.doesnt_match(['homer', 'patty', 'selma'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'homer'

In [None]:
# Analogy-style query: 'woman' and 'homer' contribute positively, 'marge' negatively;
# only the three best matches are requested.
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)

[('admire', 0.6912828683853149),
 ('carefully', 0.6303946375846863),
 ('obvious', 0.6277869343757629)]

In [None]:
# Analogy-style query: 'woman' and 'bart' contribute positively, 'man' negatively;
# only the three best matches are requested.
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.7569124698638916),
 ('upset', 0.7464685440063477),
 ('parent', 0.7420808672904968)]