<a href="https://colab.research.google.com/github/preetamjumech/gensim_word2vec/blob/main/Preetam_Saha_29_08_2022_Gensim_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import pandas as pd
import time
from collections import defaultdict
import spacy

In [7]:
df = pd.read_csv("simpsons_dataset.csv")
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [8]:
df.shape

(158314, 2)

In [9]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [10]:
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [12]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [14]:
#Removes non-alphabetic characters:

brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

In [19]:
#Taking advantage of spaCy .pipe() attribute to speed-up the cleaning process:

txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]


In [20]:
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(85954, 1)

In [21]:
#Bigrams:
#The main reason we do this is to catch words like "mr_burns" or "bart_simpson" !

from gensim.models.phrases import Phrases, Phraser

In [22]:
#As Phrases() takes a list of list of words as input:
sent = [row.split() for row in df_clean['clean']]

In [23]:
#Creates the relevant phrases from the list of sentences:
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [24]:
bigram = Phraser(phrases)

In [25]:
sentences = bigram[sent]

In [26]:
#Most Frequent Words
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

29674

In [27]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['oh', 'like', 'know', 'get', 'hey', 'think', 'come', 'right', 'look', 'want']

In [28]:
#Training the model
import multiprocessing

from gensim.models import Word2Vec

In [30]:
cores = multiprocessing.cpu_count() 

In [31]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [33]:
#Building the Vocabulary Table:
#Word2Vec requires us to build the vocabulary table (simply digesting all the words and filtering out the unique words, 
#and doing some basic counts on them):


w2v_model.build_vocab(sentences, progress_per=10000)


In [34]:
#Training the model
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(5983354, 15708360)

In [35]:
#As we do not plan to train the model any further, we are calling init_sims(), which will make the model much more memory-efficient:
w2v_model.init_sims(replace=True)

In [36]:
w2v_model.wv.most_similar(positive=["homer"])

[('marge', 0.8066062927246094),
 ('gee', 0.7934151887893677),
 ('sweetheart', 0.7930111885070801),
 ('depressed', 0.7834609746932983),
 ('hammock', 0.7532167434692383),
 ('bongo', 0.7508918642997742),
 ('creepy', 0.7460632920265198),
 ('snuggle', 0.7455220222473145),
 ('crummy', 0.7410843372344971),
 ('talk', 0.7410291433334351)]

In [37]:
w2v_model.wv.most_similar(positive=["homer_simpson"])

[('recent', 0.7935061454772949),
 ('council', 0.7912805080413818),
 ('pleased', 0.7870582342147827),
 ('select', 0.7810615301132202),
 ('congratulation', 0.780486524105072),
 ('sir', 0.7710354924201965),
 ('burns', 0.7689359188079834),
 ('robert', 0.7590256929397583),
 ('governor', 0.7540391087532043),
 ('estate', 0.7442103624343872)]

In [38]:
w2v_model.wv.most_similar(positive=["marge"])

[('homer', 0.8066062927246094),
 ('married', 0.7797975540161133),
 ('snuggle', 0.7620571851730347),
 ('brunch', 0.7560261487960815),
 ('sorry', 0.7482945919036865),
 ('sure', 0.7462934255599976),
 ('grownup', 0.7448376417160034),
 ('depressed', 0.7443678379058838),
 ('darling', 0.7439874410629272),
 ('homie', 0.7398719191551208)]

In [39]:
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.8210806846618652),
 ('convince', 0.7773534655570984),
 ('jealous', 0.7677967548370361),
 ('dr_hibbert', 0.7663052678108215),
 ('upset', 0.7648710012435913),
 ('mom', 0.7639108896255493),
 ('creepy', 0.7622742056846619),
 ('badly', 0.762080192565918),
 ('grownup', 0.7511234283447266),
 ('hopeless', 0.7468916773796082)]

In [41]:
w2v_model.wv.similarity('maggie', 'baby')

0.6523667

In [43]:
w2v_model.wv.similarity('bart', 'nelson')

0.63578844

In [44]:
w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'milhouse'

In [45]:
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])

'nelson'

In [46]:
w2v_model.wv.doesnt_match(['homer', 'patty', 'selma'])

'homer'

In [47]:
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)

[('rude', 0.6938589215278625),
 ('married', 0.6527776718139648),
 ('see', 0.6511276960372925)]

In [48]:
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.7277119755744934),
 ('parent', 0.7018523812294006),
 ('pregnant', 0.6989172101020813)]