In [1]:
import re    # for preprocessing
import pandas as pd    # for data handling
from time import time    # to time our operations
from collections import defaultdict    # for word frequency

import spacy    # for preprocessing

import logging    # setting up the loggings to mointor gensim
# Define the format of the log message. It includes log level, timestamp, and the actual log message
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt='%H:%M:%S', level=logging.INFO)

In [4]:
df = pd.read_csv('data/simpsons_dataset.csv')
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158314 entries, 0 to 158313
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   raw_character_text  140500 non-null  object
 1   spoken_words        131855 non-null  object
dtypes: object(2)
memory usage: 2.4+ MB


In [5]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [22]:
df_character = df['raw_character_text'].value_counts().to_frame("count")
df_character[df_character['count']>=500]

Unnamed: 0,count
Homer Simpson,29782
Marge Simpson,14141
Bart Simpson,13759
Lisa Simpson,11489
C. Montgomery Burns,3162
Moe Szyslak,2862
Seymour Skinner,2438
Ned Flanders,2144
Grampa Simpson,1880
Milhouse Van Houten,1862


In [25]:
len(df['spoken_words'].values), df['spoken_words'].values

(131853,
 array(["No, actually, it was a little of both. Sometimes when a disease is in all the magazines and all the news shows, it's only natural that you think you have it.",
        "Where's Mr. Bergstrom?",
        "I don't know. Although I'd sure like to talk to him. He didn't touch my lesson plan. What did he teach you?",
        ..., 'Psy-cho-so-ma-tic.', 'Does that mean you were crazy?',
        'No, that means she was faking it.'], dtype=object))

In [26]:
# Check how many lines are null
print(df.isnull().sum())

# remove null values
df = df.dropna().reset_index(drop=True)

print(df.shape)

raw_character_text    0
spoken_words          0
dtype: int64
(131853, 2)


In [6]:
# In spaCy, a pipeline refers to a series of processing components that are applied to a text document sequentially. 
# Each component in the pipeline performs a specific task, such as tokenization, part-of-speech tagging, syntactic parsing, named entity recognition, and so on. 
# The output of one component serves as the input to the next component in the pipeline.
# When you load a spaCy model, it comes with a default pipeline of processing components that are applied to the text. 
# However, you can customize the pipeline by adding, removing, or modifying the components according to your specific requirements.
# The pipeline in spaCy is designed to be efficient and allows for fast processing of large volumes of text. 
# It takes advantage of the processing capabilities of spaCy's underlying machine learning models and linguistic data structures.
# You can access the current pipeline components of a loaded spaCy model using the nlp.pipe_names attribute. 
# Similarly, you can add or modify components in the pipeline using the nlp.add_pipe() or nlp.remove_pipe() methods respectively.

In [27]:
# Spacy is a popular Python library used for natural language processing (NLP) tasks. 
# It provides a lemmatization module that allows you to convert words to their base or dictionary form, known as lemmas. 
# The Spacy lemmatizer is built on linguistic rules and utilizes contextual information to determine the appropriate lemma for a given word.
# Here's an example of how to use spaCy's lemmatizer
import spacy
nlp = spacy.load(name='en_core_web_sm')      # Load the English Language Model
txt = "I am running along Tancheon which is a small and narrow river for about two hours everyday, praying Christ Jesus to work inside me and help recover from my illness."

# Tokenize text into individual words
doc = nlp(txt)

# Lemmatize each word and print lemma
for token in doc:
    print(token.text, token.lemma_, token.pos_)

I I PRON
am be AUX
running run VERB
along along ADP
Tancheon Tancheon PROPN
which which PRON
is be AUX
a a DET
small small ADJ
and and CCONJ
narrow narrow ADJ
river river NOUN
for for ADP
about about ADV
two two NUM
hours hour NOUN
everyday everyday ADV
, , PUNCT
praying pray VERB
Christ Christ PROPN
Jesus Jesus PROPN
to to PART
work work VERB
inside inside ADP
me I PRON
and and CCONJ
help help VERB
recover recover VERB
from from ADP
my my PRON
illness illness NOUN
. . PUNCT


In [28]:
print(type(doc))
print(len(doc))
print(doc)

<class 'spacy.tokens.doc.Doc'>
32
I am running along Tancheon which is a small and narrow river for about two hours everyday, praying Christ Jesus to work inside me and help recover from my illness.


In [29]:
# cleaning
# We are lemmatizing and removing the stopwords and non-alphabetic characters from each line of dialogues
# Load the English Language Model
nlp = spacy.load(name='en_core_web_lg', disable=['parser', 'ner'])     # disabling Named Entity Recognition and syntatic parser for speed

def cleaning(doc):
    # Lemmaitizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representations of a target word,
    # if a sentence is only one or two words long, the benefit for training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [30]:
# We first load the default English language model, and then we print the names of the current pipeline components using nlp.pipe_names. 
# We then add a new component called "sentencizer" before the "parser" component using nlp.add_pipe(). 
# Finally, we remove the "ner" (named entity recognition) component from the pipeline using nlp.remove_pipe().

# Print the current pipeline components
print(nlp.pipe_names)

# Add a new component called sentencizer before tagger
# nlp.add_pipe("sentencizer", before='parser')

# Remove a component from the pipeline
# nlp.remove_pipe("ner")

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']


In [31]:
# Remove non-alphabetic characters
brief_cleaning = (re.sub("[^A-Za-z']+", " ", str(row)).lower() for row in df['spoken_words'])

In [11]:
# The nlp.pipe() function in spaCy is a convenient way to process a large amount of texts efficiently using a spaCy language model. 
# It takes in a sequence of texts and applies the processing pipeline to each text in parallel.

import spacy

# Load the language model
nlp = spacy.load("en_core_web_sm")

# Define a list of texts to process
texts = ["This is the first text.", "Here comes the second text.", "And finally, the third text."]

# Process the texts using nlp.pipe()
docs = list(nlp.pipe(texts))
print(docs)
print()

# Access the processed documents
for doc in docs:
    # Perform any desired operations on each document
    print(doc.text)
    for token in doc:
        print(token.text, token.pos_, token.lemma_)
    print()

[This is the first text., Here comes the second text., And finally, the third text.]

This is the first text.
This PRON this
is AUX be
the DET the
first ADJ first
text NOUN text
. PUNCT .

Here comes the second text.
Here ADV here
comes VERB come
the DET the
second ADJ second
text NOUN text
. PUNCT .

And finally, the third text.
And CCONJ and
finally ADV finally
, PUNCT ,
the DET the
third ADJ third
text NOUN text
. PUNCT .



In [32]:
# Take advantage of spaCy.pipe() method to speed up the cleaning process
t = time()

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print(f'Time to clean up everything : {round((time() - t) / 60, 3)} mins')

Time to clean up everything : 1.558 mins


In [33]:
# Put the results in a DataFrame to remove missing values and duplicates:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(85952, 1)

In [34]:
df_clean.to_csv('data/df_clean.csv', index=False)

In [35]:
df_clean.head()

Unnamed: 0,clean
0,actually little disease magazine news show nat...
2,know sure like talk touch lesson plan teach
3,life worth live
4,poll open end recess case decide thought final...
7,victory party slide


##### Bigrams

In [36]:
# We are using Gensim Phrases package to automatically detect common phrases (bigrams) from a list of sentences.
from gensim.models.phrases import Phrases, Phraser

In [37]:
# As Phrases() takes a list of list of words as input:
sent = [row.split() for row in df_clean['clean']]

print(len(sent), len(df_clean['clean']))
print(sent[1000])

85952 85952
['hey', 'bartle', 'eeboobely', 'care', 'steak', 'rooney']


In [38]:
# Creates the relevant phrases from the list of sentences:
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 17:36:32: collecting all words and their counts
INFO - 17:36:32: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 17:36:32: PROGRESS: at sentence #10000, processed 63557 words and 52676 word types
INFO - 17:36:32: PROGRESS: at sentence #20000, processed 130938 words and 99581 word types
INFO - 17:36:32: PROGRESS: at sentence #30000, processed 192957 words and 138111 word types
INFO - 17:36:32: PROGRESS: at sentence #40000, processed 249830 words and 172098 word types
INFO - 17:36:32: PROGRESS: at sentence #50000, processed 311271 words and 207908 word types
INFO - 17:36:32: PROGRESS: at sentence #60000, processed 373578 words and 242908 word types
INFO - 17:36:32: PROGRESS: at sentence #70000, processed 436426 words and 277808 word types
INFO - 17:36:32: PROGRESS: at sentence #80000, processed 497900 words and 310884 word types
INFO - 17:36:33: collected 329570 token types (unigram + bigrams) from a corpus of 537081 words and 85952 sentences
INFO - 17:36:33: m

In [39]:
# The goal of Phraser() is to cut down memory consumption of Phrases(), by discarding model state not strictly needed for the bigram detection task:
bigram = Phraser(phrases)

INFO - 17:36:39: exporting phrases from Phrases<329570 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 17:36:39: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<127 phrases, min_count=30, threshold=10.0> from Phrases<329570 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.52s', 'datetime': '2023-06-04T17:36:39.608105', 'gensim': '4.3.1', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [43]:
print(dir(bigram))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_apply', '_load_specials', '_save_specials', '_smart_save', 'add_lifecycle_event', 'analyze_sentence', 'connector_words', 'delimiter', 'find_phrases', 'lifecycle_events', 'load', 'min_count', 'phrasegrams', 'save', 'score_candidate', 'scoring', 'threshold']


In [47]:
for bg in bigram[sent]:
    print(bg)

['actually', 'little', 'disease', 'magazine', 'news', 'show', 'natural', 'think']
['know', 'sure', 'like', 'talk', 'touch', 'lesson', 'plan', 'teach']
['life', 'worth', 'live']
['poll', 'open', 'end', 'recess', 'case', 'decide', 'thought', 'final', 'statement', 'martin']
['victory', 'party', 'slide']
['mr', 'bergstrom', 'mr', 'bergstrom']
['hey', 'hey', 'move', 'morning', 'new', 'job', 'take', 'copernicus', 'costume']
['think', 'take', 'train', 'capital_city']
['train', 'like', 'traditional', 'environmentally', 'sound']
['yes', 'backbone', 'country', 'leland', 'stanford', 'drive', 'golden', 'spike', 'promontory', 'point']
['hey', 'thank', 'vote', 'man']
['vote', 'voting', 'geek']
['get', 'right', 'thank', 'vote', 'girl']
['sweat', 'long', 'couple', 'people', 'right', 'milhouse']
['martin', 'martin', 'like', 'recount']
['want', 'sure', 'martin', 'martin']
['way', 'mister', 'president']
['board', 'track', 'afternoon', 'delight', 'come', 'shelbyville', 'parkville']
['mr', 'bergstrom', 'he

In [49]:
# Transform the corpus based on the bigrams detected
sentences = bigram[sent]

print(len(sentences))
print(sentences[1000])

85952
['hey', 'bartle', 'eeboobely', 'care', 'steak', 'rooney']


In [50]:
# Most Frequent Words
# Mainly a sanity check of the effectiveness of the lemmatization, removal of stopwords, and the addition of bigrams.
word_freq = defaultdict(int)

for sent in sentences:
    for i in sent:
        word_freq[i] += 1

print(len(word_freq))

29676


In [51]:
word_freq

defaultdict(int,
            {'actually': 422,
             'little': 2101,
             'disease': 45,
             'magazine': 122,
             'news': 249,
             'show': 214,
             'natural': 77,
             'think': 3593,
             'know': 4821,
             'sure': 1198,
             'like': 5598,
             'talk': 937,
             'touch': 192,
             'lesson': 162,
             'plan': 302,
             'teach': 324,
             'life': 1222,
             'worth': 141,
             'live': 768,
             'poll': 20,
             'open': 420,
             'end': 464,
             'recess': 10,
             'case': 215,
             'decide': 134,
             'thought': 120,
             'final': 105,
             'statement': 21,
             'martin': 120,
             'victory': 30,
             'party': 421,
             'slide': 48,
             'mr': 797,
             'bergstrom': 17,
             'hey': 3620,
             'move': 165,
     

In [52]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['oh', 'like', 'know', 'get', 'hey', 'think', 'come', 'right', 'look', 'want']

In [53]:
import operator
word_freq_by_key = sorted(word_freq.items(), key=operator.itemgetter(0), reverse=False)[:10]
print(word_freq_by_key)

[("'", 2779), ("'_n", 53), ("'_til", 82), ("'_tis", 35), ("'bout", 168), ("'cause", 470), ("'cuz", 7), ("a'hind", 1), ("a'ight", 2), ("a'twain", 1)]


In [54]:
word_freq_by_value = sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)[:10]
print(word_freq_by_value)

[('oh', 6453), ('like', 5598), ('know', 4821), ('get', 4213), ('hey', 3620), ('think', 3593), ('come', 3583), ('right', 3411), ('look', 3382), ('want', 3181)]


##### Training the model

In [55]:
# Gensim Word2Vec Implementation : We use Gensim implementation of word2vec.
# gensim.models.word2vec module implements the word2vec family of algorithms, using highly optimized C routines, data streaming and Pythonic interfaces.
# The word2vec algorithms include skip-gram and CBOW models, using either hierarchical softmax or negative sampling: 
# Thomas et al: Efficient Estimation of World Representations in vector space
# There are more ways to train word vectors in Gensim than just Word2Vec. See also Doc2Vec, FastText

In [56]:
import multiprocessing
from gensim.models import Word2Vec

In [57]:
# Why I separate the training of the model in 3 steps:
# I prefer to separate the training in 3 distictive steps for clarity and monitoring.
# 1. Word2Vec() : In this step, I set up the parameters of the model one-by-one.I do not supply the parameter sentences, and therefore leave the model uninitialized, purposefully.
# 2. .build_vocab() : Here it builds the vocabulary from a sequence of sentences and thus initialized the model. With the loggings, I can follow the progress and even more 
#  important, the effect of min_count and sample on the word corpus. I noticed that these two parameters, and in particular sample, have a great influence over the performance
#  of a model. Displaying both allows for a more accurate and an easier management of their influence.
# 3. .train() : Finally, trains the model. The loggings here are mainly useful for monitoring, making sure that no threads are executed instantaneously.

In [58]:
cores = multiprocessing.cpu_count()   # Count the number of cores in a computer
print(cores)

16


In [59]:
# The parameters
# min_count=int : ignores all words with total absolute frequency lower than this - (2, 100)
# window=int : The maximum distance between the current and predicted word within a sentence. E.g. window words on the left and widow words on the left of our target - (2, 10)
# vector_size=int : Dimensionality of the feature vectors - (50, 300)
# sample=float : The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential - (0, 1e-5)
# alpha=float : The initial learning rate - (0.01, 0.05)
# min_alpha=flaot : Learning rate will linearly drop to min_lapha as trainig progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00
# negative=int=int : If >0, negative sampling will be used, the int for negative specifies how many "noise words" should be drown. If set to 0, no negative sampling is used - (5,20)
# workers=int : Use these many worker threads to train the model(=faster training with multicore machines)

In [60]:
w2v_model = Word2Vec(min_count=20, window=2, vector_size=300, sample=6e-5, alpha=0.03, min_alpha=0.007, negative=20, workers=cores-1)

INFO - 17:43:06: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2023-06-04T17:43:06.481762', 'gensim': '4.3.1', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}


In [61]:
# Buiding the Vocabulary Table
# Word2Vec requires us to build the vocabulary table(simply digesting all the words and filtering out the unique words, and doing some basic counts on them)

t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print(f"Time to build vocab : {round((time() - t) / 60, 2)}")

INFO - 17:43:19: collecting all words and their counts
INFO - 17:43:19: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 17:43:20: PROGRESS: at sentence #10000, processed 61697 words, keeping 9484 word types
INFO - 17:43:20: PROGRESS: at sentence #20000, processed 127319 words, keeping 14371 word types
INFO - 17:43:20: PROGRESS: at sentence #30000, processed 187773 words, keeping 17431 word types
INFO - 17:43:20: PROGRESS: at sentence #40000, processed 243269 words, keeping 20121 word types
INFO - 17:43:20: PROGRESS: at sentence #50000, processed 303136 words, keeping 22566 word types
INFO - 17:43:20: PROGRESS: at sentence #60000, processed 363877 words, keeping 24826 word types
INFO - 17:43:20: PROGRESS: at sentence #70000, processed 425332 words, keeping 26985 word types
INFO - 17:43:20: PROGRESS: at sentence #80000, processed 485473 words, keeping 28817 word types
INFO - 17:43:20: collected 29676 word types from a corpus of 523553 raw words and 85952 sentence

Time to build vocab : 0.01


In [62]:
# Training the model
# total_examples=int : Count of sentences
# epochs=int : Number of iterations(epochs) over the corpus - [10,20,30]

t = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print(f"Time to train the model : {round((time() - t) / 60, 2)}")

INFO - 17:43:29: Word2Vec lifecycle event {'msg': 'training model with 15 workers on 3314 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2023-06-04T17:43:29.335218', 'gensim': '4.3.1', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'train'}
INFO - 17:43:30: EPOCH 0 - PROGRESS: at 67.20% examples, 132664 words/s, in_qsize 16, out_qsize 2
INFO - 17:43:30: EPOCH 0: training on 523553 raw words (198924 effective words) took 1.1s, 188330 effective words/s
INFO - 17:43:31: EPOCH 1: training on 523553 raw words (198866 effective words) took 0.8s, 258098 effective words/s
INFO - 17:43:32: EPOCH 2: training on 523553 raw words (199444 effective words) took 0.9s, 215226 effective words/s
INFO - 17:43:32: EPOCH 3: training on 523553 raw words (198873 effective words) took 0.7s, 290281 effective words/s
INFO - 17:43:33: EPOCH 4 - PROGRESS: at 61.42% ex

Time to train the model : 0.41


In [63]:
print(dir(w2v_model.wv))

['__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_load_specials', '_log_evaluate_word_analogies', '_save_specials', '_smart_save', '_upconvert_old_d2vkv', '_upconvert_old_vocab', 'add_lifecycle_event', 'add_vector', 'add_vectors', 'allocate_vecattrs', 'closer_than', 'cosine_similarities', 'distance', 'distances', 'doesnt_match', 'evaluate_word_analogies', 'evaluate_word_pairs', 'expandos', 'fill_norms', 'get_index', 'get_mean_vector', 'get_normed_vectors', 'get_vecattr', 'get_vector', 'has_index_for', 'index2entity', 'index2word', 'index_to_key', 'init_sims', 'intersect_word2vec_format', 'key_to_index', 'load', 'load_w

In [67]:
print(len(w2v_model.wv))
print(w2v_model.wv[0].shape)
print(w2v_model.wv[0])

3314
(300,)
[ 0.07386999 -0.0111486   0.12890242  0.22857782 -0.13041826 -0.35621813
  0.5523982   0.5283274   0.04929522 -0.1125346   0.04653757 -0.29387367
 -0.05917864  0.05309346 -0.23404907 -0.2678345  -0.1265864   0.34509256
 -0.01761004  0.0546715   0.05871229 -0.27196687  0.1994948  -0.02533029
 -0.15592588  0.01947534  0.12811655  0.0390083  -0.13786127 -0.13906781
  0.18279578 -0.21167156  0.07319624  0.02908345  0.14895292 -0.07945303
  0.0316951  -0.19491099  0.15323257 -0.10824042  0.04829989  0.16291158
 -0.07684088 -0.09510821  0.17052369  0.0853541   0.07380101  0.17874272
 -0.10967975 -0.06392822 -0.2608806  -0.43607974 -0.17382431  0.04055748
 -0.08523371  0.24013625  0.11140185 -0.04355928  0.1073567   0.13294493
 -0.11049364 -0.07080015 -0.03206125  0.21452856  0.09940255  0.01662943
 -0.06508102  0.11530242 -0.0257541   0.09735345  0.02698535  0.32609195
 -0.01562629 -0.1646125   0.07538219  0.29293466  0.02480999  0.14327936
 -0.16268529  0.03518934 -0.08289132 -0

##### Exploring the model

In [35]:
w2v_model.wv.most_similar(positive=["homer"])

[('bongo', 0.5577720403671265),
 ('marge', 0.5275826454162598),
 ('hammock', 0.5179413557052612),
 ('chloe', 0.49749520421028137),
 ('bartender', 0.49741706252098083),
 ('brad', 0.49635228514671326),
 ('depressed', 0.48891308903694153),
 ('good_friend', 0.48294949531555176),
 ('bitch', 0.4828416109085083),
 ('sorry', 0.4825747311115265)]

In [36]:
w2v_model.wv.most_similar(positive=["homer_simpson"])

[('easily', 0.567703127861023),
 ('pleased', 0.5316898822784424),
 ('select', 0.5197466611862183),
 ('montgomery_burn', 0.5164961814880371),
 ('local', 0.5141699910163879),
 ('recent', 0.507243812084198),
 ('congratulation', 0.5032428503036499),
 ('sir', 0.501966118812561),
 ('sector', 0.4935298562049866),
 ('kennedy', 0.4724850356578827)]

In [37]:
w2v_model.wv.most_similar(positive=["marge"])

[('homer', 0.5275826454162598),
 ('becky', 0.5250788331031799),
 ('brownie', 0.4997222125530243),
 ('badly', 0.49702927470207214),
 ('grownup', 0.49249887466430664),
 ('humiliate', 0.48835721611976624),
 ('abe', 0.48297786712646484),
 ('sorry', 0.48102471232414246),
 ('relax', 0.4810197353363037),
 ('know', 0.4768749475479126)]

In [38]:
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.7042917013168335),
 ('hearing', 0.6031599640846252),
 ('mom', 0.5835254788398743),
 ('mom_dad', 0.5731374621391296),
 ('homework', 0.5605311393737793),
 ('maggie', 0.5395596027374268),
 ('convince', 0.537165105342865),
 ('punish', 0.5324351191520691),
 ('dr_hibbert', 0.5290904641151428),
 ('janey', 0.5288304090499878)]

##### Similarities

In [40]:
w2v_model.wv.similarity("maggie", "baby")

0.5721383

In [41]:
w2v_model.wv.similarity("bart", "nelson")

0.41649213

In [42]:
w2v_model.wv.doesnt_match(["jimbo", "milhouse", "kearney"])



'jimbo'

In [43]:
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])

'nelson'

In [44]:
w2v_model.wv.doesnt_match(["homer", "party", "selma"])

'party'

##### Analogy Difference

In [45]:
# Which word is to woman as homer is to marge?
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"])

[('admire', 0.43360963463783264),
 ('ought', 0.4146246314048767),
 ('married', 0.4041617214679718),
 ('bald', 0.3781508803367615),
 ('adopt', 0.3774721324443817),
 ('attract', 0.37559741735458374),
 ('wonder', 0.3504493832588196),
 ('nasty', 0.34788036346435547),
 ('brad', 0.34443116188049316),
 ('apparently', 0.3440438210964203)]

In [46]:
# Which word is to woman as bart is to man?
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.546580970287323),
 ('anyhoo', 0.5054417848587036),
 ('mom', 0.5031613111495972)]