In [1]:
# Reference: Gensim Word2Vec tutorial — https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

In [2]:
import pandas as pd

# Load the Simpsons dataset from the local data directory into a DataFrame.
df = pd.read_csv("../data/simpsons_dataset.csv")

In [3]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158314 entries, 0 to 158313
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   raw_character_text  140500 non-null  object
 1   spoken_words        131855 non-null  object
dtypes: object(2)
memory usage: 2.4+ MB


In [4]:
# Preview the first few rows to see what the speaker and dialogue columns look like.
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [5]:
# Count missing values per column before cleaning.
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [6]:
# Keep only rows where every column is present, renumber the index from
# zero, then confirm that no missing values remain in any column.
df = df.dropna(axis=0, how="any").reset_index(drop=True)
df.isna().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [7]:
# Re-check the frame after cleaning: both columns should now report the same non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131853 entries, 0 to 131852
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   raw_character_text  131853 non-null  object
 1   spoken_words        131853 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB


In [8]:
# Pull the dialogue column out as a plain Python list of strings (one entry per row).
corpus = df["spoken_words"].tolist()

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# NOTE: the NLTK stop-word list, the tokenizer models used by word_tokenize and
# WordNet must already be installed locally (see nltk.download).
stop_words = set(stopwords.words('english'))
w_n_lemmatizer = WordNetLemmatizer()


def _preprocess(document):
    """Turn one raw dialogue string into a list of cleaned tokens.

    The text is lower-cased and tokenized; tokens that are English stop words
    or are not purely alphanumeric are dropped, and the remaining tokens are
    lemmatized.
    """
    return [
        w_n_lemmatizer.lemmatize(word)
        for word in word_tokenize(document.lower())
        if word not in stop_words and word.isalnum()
    ]


# Read from the DataFrame column rather than from `corpus` itself: the previous
# form rebound `corpus` from strings to token lists, so re-running the cell
# failed with AttributeError ('list' object has no attribute 'lower').
corpus = [_preprocess(document) for document in df["spoken_words"]]

corpus[0]

['actually',
 'little',
 'sometimes',
 'disease',
 'magazine',
 'news',
 'show',
 'natural',
 'think']

In [10]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [11]:
from gensim.models.phrases import Phrases, Phraser
# Fit a phrase (collocation) detector on the tokenized corpus. min_count=30
# ignores words/bigrams seen fewer than 30 times; progress_per=10000 sets how
# often progress is logged.
# NOTE(review): in the visible cells `phrases` is only printed. It is never
# applied to `corpus`, and the imported `Phraser` is unused, so the Word2Vec
# model is trained on single tokens only. Confirm whether bigram merging was intended.
phrases = Phrases(corpus, min_count=30, progress_per=10000)
print(phrases)

Phrases<376859 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>


In [12]:
import multiprocessing

from gensim.models import Word2Vec

In [13]:
# Number of CPU cores reported by the system; used to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()
cores

4

In [14]:
# Configure the Word2Vec model; vocabulary building and training happen in later cells.
w2v_model = Word2Vec(min_count=20,      # ignore words seen fewer than 20 times
                     window=2,          # context window: 2 tokens on each side
                     size=300,          # embedding dimensionality (gensim 3.x parameter name)
                     sample=6e-5,       # down-sampling threshold for very frequent words
                     alpha=0.03,        # initial learning rate
                     min_alpha=0.0007,  # learning rate decays linearly down to this value
                     negative=20,       # number of negative samples per positive example
                     # Leave one core free, but never request 0 workers:
                     # `cores - 1` is 0 on a single-core machine.
                     workers=max(1, cores - 1))

In [15]:
# Scan the tokenized corpus to build the model vocabulary (logging progress every
# 10000 documents), then display the resulting corpus_count (number of documents seen).
w2v_model.build_vocab(corpus, progress_per=10000)
w2v_model.corpus_count

131853

In [16]:
# Train for 30 passes over the corpus. total_examples reuses the document count
# stored on the model after build_vocab; report_delay=1 asks for progress
# reports at most once per second. The tuple returned by train() is shown as
# the cell output.
w2v_model.train(corpus, 
                total_examples=w2v_model.corpus_count, 
                epochs=30, 
                report_delay=1)

(7634342, 19971480)

In [17]:
# Precompute L2-normalized word vectors and, because replace=True, discard the
# original ones. This makes the model much more memory-efficient, but it can
# no longer be trained afterwards.
# NOTE(review): init_sims is deprecated in gensim 4.x — revisit if the gensim
# version is upgraded.
w2v_model.init_sims(replace=True)

In [18]:
# Words whose vectors are closest to "homer", with their similarity scores.
w2v_model.wv.most_similar(positive=["homer"])

[('marge', 0.7151126861572266),
 ('settled', 0.6487970948219299),
 ('eliza', 0.6143308877944946),
 ('simpson', 0.6072618365287781),
 ('abe', 0.6037951111793518),
 ('sorry', 0.5871919393539429),
 ('supervisor', 0.5862137079238892),
 ('yoink', 0.5815441012382507),
 ('pfft', 0.577323317527771),
 ('asking', 0.5765624046325684)]

In [19]:
# Words whose vectors are closest to "marge", with their similarity scores.
w2v_model.wv.most_similar(positive=["marge"])

[('homer', 0.7151126861572266),
 ('husband', 0.6739974617958069),
 ('nooooo', 0.6439752578735352),
 ('becky', 0.6334644556045532),
 ('fishing', 0.6333003640174866),
 ('badly', 0.6299690008163452),
 ('sharing', 0.629048228263855),
 ('homie', 0.6263099312782288),
 ('noooo', 0.6262434124946594),
 ('arranged', 0.6240975856781006)]

In [20]:
# Words whose vectors are closest to "bart", with their similarity scores.
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.7769181728363037),
 ('mom', 0.7092975974082947),
 ('eliza', 0.6429473161697388),
 ('jessica', 0.6156376004219055),
 ('maggie', 0.6118208169937134),
 ('shh', 0.6051841974258423),
 ('creepy', 0.6046707034111023),
 ('assignment', 0.6038774847984314),
 ('substitute', 0.5964770317077637),
 ('concerned', 0.5958867073059082)]

In [21]:
# Similarity score between the vectors for "eliza" and "bart".
w2v_model.wv.similarity("eliza", "bart")

0.64294726

In [22]:
# Ask the model which of the three names fits least well with the other two.
w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'milhouse'

In [23]:
# Analogy query: which word is to "woman" as "homer" is to "marge"?
# Expressed as woman + homer - marge; show the 3 best candidates.
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)

[('man', 0.5287352800369263),
 ('younger', 0.5231418609619141),
 ('impressive', 0.5067380666732788)]

In [24]:
# Analogy query: which word is to "woman" as "bart" is to "man"?
# Expressed as woman + bart - man; show the 3 best candidates.
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.5924097299575806),
 ('pregnant', 0.5832182168960571),
 ('mom', 0.5476857423782349)]

In [25]:
# Export the trained word vectors to disk in word2vec format so they can be
# post-processed outside the model (the next cell reads this file as text).
w2v_model.wv.save_word2vec_format("../data/word2vec.vec")

In [26]:
# Convert the word2vec text export into a TSV of vector components only:
# one tab-separated row per word, with the leading word column dropped.
# The file is streamed line by line instead of being read fully into memory.
with open("../data/word2vec.vec", "r", encoding="utf-8") as vec_file, \
        open("../data/word2vec.tsv", "w", encoding="utf-8") as tsv_file:
    # The first line of the export is a header, not a vector; skip it.
    next(vec_file, None)
    for line in vec_file:
        fields = line.rstrip("\n").split(" ")
        if len(fields) < 2:
            # Blank line (e.g. the one after the final newline): the previous
            # read().split("\n") form wrote a spurious empty row for it.
            continue
        tsv_file.write("\t".join(fields[1:]) + "\n")