In [55]:
import numpy as np
import pandas as pd
from time import time
import re, string, unicodedata
from collections import defaultdict
import multiprocessing

# cleaning and preprocessing
from contractions import CONTRACTION_MAP
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, sent_tokenize

# bigrams
from gensim.models.phrases import Phrases, Phraser

# Word2Vec
from gensim.models import Word2Vec

# logging to monitor gensim
import logging
# Emit INFO-level records as "LEVEL - HH:MM:SS: message"; this is what produces the
# gensim progress lines shown under the phrase-detection and Word2Vec cells.
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

## Dataset

In [3]:
# Load the raw dialogue lines from data.world, then preview the first rows.
df = pd.read_csv('https://query.data.world/s/nu4cetyeefkjloxph4ytsbis5zejjg')
df.head()

Unnamed: 0,id,season,episode,scene,line_text,speaker,deleted
0,1,1,1,1,All right Jim. Your quarterlies look very good...,Michael,False
1,2,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,False
2,3,1,1,1,So you've come to the master for guidance? Is ...,Michael,False
3,4,1,1,1,"Actually, you called me in here, but yeah.",Jim,False
4,5,1,1,1,"All right. Well, let me show you how it's done.",Michael,False


In [4]:
# (rows, columns) of the raw frame as loaded.
df.shape

(59909, 7)

In [5]:
# filtering columns
COLUMNS = ["speaker", "line_text"]
# Take an explicit copy: new columns are added to this frame in later cells, and a
# plain column selection may still be tracked by pandas as a slice of the raw frame
# (which is what triggers SettingWithCopyWarning on assignment).
df = df[COLUMNS].copy()
df.tail(2)

Unnamed: 0,speaker,line_text
59907,Jim,I sold paper at this company for 12 years. My ...
59908,Pam,I thought it was weird when you picked us to m...


In [6]:
# null check
# Count missing values per column; both kept columns are expected to be complete.
df.isnull().sum()

speaker      0
line_text    0
dtype: int64

## Cleaning

The cleaning pipeline consists of:

- Expanding contractions (y'all -> you all)
- Lowercasing
- Removing punctuation, stopwords, special characters and other non-alphanumeric symbols
- Lemmatization

In [9]:
# Contractions helper function
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in ``text`` and strip any remaining apostrophes.

    Every occurrence of a key of ``contraction_mapping`` (matched
    case-insensitively) is replaced by its mapped expansion. The first
    character of the matched text is kept, so the original capitalisation
    survives ("Y'all" -> "You all").

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict, optional
        Maps a contraction to its expanded form. Defaults to
        ``CONTRACTION_MAP``.

    Returns
    -------
    str
        ``text`` with known contractions expanded and every remaining ``'``
        removed (so a possessive such as "Jim's" becomes "Jims").
    """
    # Longest keys first so that e.g. "can't've" is not shadowed by "can't":
    # regex alternation picks the first alternative that matches, not the longest.
    # Empty keys are skipped because they would match the empty string everywhere.
    ordered_keys = sorted((key for key in contraction_mapping if key),
                          key=len, reverse=True)

    if ordered_keys:
        # Lower-cased view of the mapping. The pattern is case-insensitive, so
        # "i'm" can match a key spelled "I'm"; without this view that lookup
        # would return None and the slice below would raise TypeError.
        lowercase_mapping = {key.lower(): expansion
                             for key, expansion in contraction_mapping.items()}

        # Keys are escaped so they are always treated as literal text.
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (contraction_mapping.get(match)
                                    or lowercase_mapping.get(match.lower()))
            if not expanded_contraction:
                # No usable expansion: leave the matched text as it is.
                return match
            # Keep the matched text's first character to preserve its case.
            return match[0] + expanded_contraction[1:]

        text = contractions_pattern.sub(expand_match, text)

    # Drop whatever apostrophes are left (possessives, unknown contractions).
    return re.sub("'", "", text)

# Quick sanity check on a sentence with three contractions, one of them capitalised.
expand_contractions("Y'all can't expand contractions I'd think")

'You all cannot expand contractions I would think'

In [11]:
# Expand contractions in every line of dialogue.
df['contracted'] = df['line_text'].apply(expand_contractions)

In [13]:
# Tokenizing
def tokenize(text):
    """Split ``text`` into word tokens.

    A token is a maximal run of word characters (letters, digits, underscore);
    everything else acts as a separator. Text with no word characters yields
    an empty list, so no empty-string tokens are ever produced.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        The tokens in their original order and case.
    """
    # Raw string avoids the invalid "\W" escape warning; findall() on \w+ is the
    # same as splitting on \W+ minus the empty strings split() leaves at the
    # edges (e.g. after a trailing "...").
    return re.findall(r'\w+', text)

In [14]:
# Tokenize the contraction-free text; one list of tokens per row.
df['tokens'] = df['contracted'].apply(tokenize).values

In [24]:
def remove_non_ascii(words):
    """Return the tokens with every non-ASCII character stripped.

    Each token is NFKD-normalised first, so an accented letter is reduced to
    its base letter ("é" -> "e"); characters with no ASCII form are dropped.
    The output has the same length and order as ``words``.
    """
    def _to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]

def to_lowercase(words):
    """Return a new list with every token converted to lowercase."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that end up empty.

    Any character that is neither a word character nor whitespace is removed.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

# English stop words, loaded from NLTK on first use and then reused.
_STOPWORD_CACHE = {}


def remove_stopwords(words, stop_words=None):
    """Remove stop words from a list of tokenized words.

    Parameters
    ----------
    words : list of str
        Tokens to filter. The comparison is exact, so tokens should already
        be lowercased when the stop-word list is lowercase.
    stop_words : iterable of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    list of str
        The tokens of ``words`` that are not stop words, in original order.
    """
    if stop_words is None:
        # Load the NLTK list once and keep it as a set; fetching it for every
        # single word (and scanning it as a list) dominates the cleaning time.
        if 'english' not in _STOPWORD_CACHE:
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']
    else:
        stop_words = frozenset(stop_words)

    return [word for word in words if word not in stop_words]

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter of
    the resulting tag selects adjective, noun, verb or adverb. Any other tag
    falls back to noun.
    """
    # The word is tagged in isolation, so the tagger sees no sentence context.
    treebank_tag = nltk.pos_tag([word])[0][1]
    leading = treebank_tag[0].upper()

    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if leading in wordnet_by_initial:
        return wordnet_by_initial[leading]
    return wordnet.NOUN

def lemmatize_verbs(words):
    """Lemmatize every token in ``words``.

    Despite the name, this is not limited to verbs: each token is lemmatized
    with the part of speech that ``get_wordnet_pos`` returns for it.
    Returns a new list with the same length and order.
    """
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(token, get_wordnet_pos(token)) for token in words]

In [25]:
def normalize(words):
    """Run the full token-cleaning pipeline on one tokenized line.

    Stages, in order: strip non-ASCII characters, lowercase, remove
    punctuation (dropping emptied tokens), remove stop words, lemmatize.
    Returns the cleaned list of tokens.
    """
    stages = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        remove_stopwords,
        lemmatize_verbs,
    )
    for stage in stages:
        words = stage(words)
    return words

In [28]:
# Clean every tokenized line; each row becomes a list of normalized tokens.
df['normalize'] = df['tokens'].apply(normalize)

In [29]:
# Compare the raw text, expanded text, tokens and normalized tokens for the first rows.
df.head()

Unnamed: 0,speaker,line_text,contracted,tokens,normalize
0,Michael,All right Jim. Your quarterlies look very good...,All right Jim. Your quarterlies look very good...,"[All, right, Jim, Your, quarterlies, look, ver...","[right, jim, quarterly, look, good, thing, lib..."
1,Jim,"Oh, I told you. I couldn't close it. So...","Oh, I told you. I could not close it. So...","[Oh, I, told, you, I, could, not, close, it, S...","[oh, told, could, close]"
2,Michael,So you've come to the master for guidance? Is ...,So you have come to the master for guidance? I...,"[So, you, have, come, to, the, master, for, gu...","[come, master, guidance, say, grasshopper]"
3,Jim,"Actually, you called me in here, but yeah.","Actually, you called me in here, but yeah.","[Actually, you, called, me, in, here, but, yea...","[actually, call, yeah]"
4,Michael,"All right. Well, let me show you how it's done.","All right. Well, let me show you how it is done.","[All, right, Well, let, me, show, you, how, it...","[right, well, let, show, do]"


In [30]:
# Stitching together
# Join each row's normalized tokens back into one space-separated string.
tokens = [' '.join(row_tokens) for row_tokens in df['normalize']]

df['clean_text'] = tokens

In [31]:
# Check that clean_text is the normalized token list joined with single spaces.
df.head()

Unnamed: 0,speaker,line_text,contracted,tokens,normalize,clean_text
0,Michael,All right Jim. Your quarterlies look very good...,All right Jim. Your quarterlies look very good...,"[All, right, Jim, Your, quarterlies, look, ver...","[right, jim, quarterly, look, good, thing, lib...",right jim quarterly look good thing library
1,Jim,"Oh, I told you. I couldn't close it. So...","Oh, I told you. I could not close it. So...","[Oh, I, told, you, I, could, not, close, it, S...","[oh, told, could, close]",oh told could close
2,Michael,So you've come to the master for guidance? Is ...,So you have come to the master for guidance? I...,"[So, you, have, come, to, the, master, for, gu...","[come, master, guidance, say, grasshopper]",come master guidance say grasshopper
3,Jim,"Actually, you called me in here, but yeah.","Actually, you called me in here, but yeah.","[Actually, you, called, me, in, here, but, yea...","[actually, call, yeah]",actually call yeah
4,Michael,"All right. Well, let me show you how it's done.","All right. Well, let me show you how it is done.","[All, right, Well, let, me, show, you, how, it...","[right, well, let, show, do]",right well let show do


In [32]:
# Cleaning only adds columns, so the row count should match the raw load.
df.shape

(59909, 6)

In [33]:
# null check
# Confirms no derived column contains NaN. A row whose tokens were all removed holds
# an empty list or empty string, which isnull() does not count as missing.
df.isnull().sum()

speaker       0
line_text     0
contracted    0
tokens        0
normalize     0
clean_text    0
dtype: int64

## Bigrams

In [36]:
# Phrases() is fed one list of tokens per sentence, so pass the rows of column
# 'normalize' as a plain Python list of token lists.
sent = list(df['normalize'])

In [39]:
# Create relevant phrases from list of sentences
# NOTE(review): min_count=25 presumably means word pairs seen fewer than 25 times are
# not considered as phrases — confirm against the gensim Phrases documentation.
# progress_per=5000 matches the log output: one PROGRESS line every 5000 sentences.
phrases = Phrases(sent, min_count=25, progress_per=5000)

INFO - 10:47:55: collecting all words and their counts
INFO - 10:47:55: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 10:47:55: PROGRESS: at sentence #5000, processed 28759 words and 22892 word types
INFO - 10:47:55: PROGRESS: at sentence #10000, processed 56658 words and 41058 word types
INFO - 10:47:55: PROGRESS: at sentence #15000, processed 83948 words and 57678 word types
INFO - 10:47:55: PROGRESS: at sentence #20000, processed 113156 words and 74958 word types
INFO - 10:47:55: PROGRESS: at sentence #25000, processed 142341 words and 91066 word types
INFO - 10:47:55: PROGRESS: at sentence #30000, processed 169750 words and 105143 word types
INFO - 10:47:55: PROGRESS: at sentence #35000, processed 197084 words and 119044 word types
INFO - 10:47:55: PROGRESS: at sentence #40000, processed 224977 words and 133115 word types
INFO - 10:47:55: PROGRESS: at sentence #45000, processed 253953 words and 147463 word types
INFO - 10:47:55: PROGRESS: at sentence #50000, p

In [40]:
# Build a Phraser from the fitted Phrases model; the log reports how many phrasegrams it holds.
bigram = Phraser(phrases)

INFO - 10:48:33: source_vocab length 191100
INFO - 10:48:35: Phraser built with 86 phrasegrams


In [41]:
# Apply the bigram model to the corpus; detected pairs show up later as single
# tokens joined by "_" (e.g. 'michael_scott').
sentences = bigram[sent]

## Most Frequent Words

In [50]:
# Count how often each token (unigram or detected bigram) occurs in the corpus.
# The loop variables are deliberately NOT called `sent`: that name holds the full
# list of tokenized sentences, and rebinding it here would leave it pointing at
# the last sentence only, which breaks any re-run of the bigram cell.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1

# Number of distinct tokens.
len(word_freq)

16436

In [52]:
# most freq
# Sort the tokens (dict keys) by their count, highest first, and keep the top ten.
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['go', 'get', 'know', 'oh', 'like', 'yeah', 'okay', 'well', 'right', 'michael']

In [54]:
# least freq
# Ten tokens with the lowest counts. sorted() is stable, so tokens tied on count
# appear in the order they were first inserted into word_freq.
sorted(word_freq, key=word_freq.get, reverse=False)[:10]

['wastepaper',
 'spencer',
 'whass',
 'rodham',
 'ringie',
 'dingie',
 'packman',
 'godzillary',
 'unnecessarily',
 'daniqua']

## Training Word2Vec

In [56]:
# cores
# CPU count reported by the machine; used to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()
print(cores)

8


#### Word2Vec()

Set up the parameters.

The model is deliberately left uninitialized here: the *sentences* parameter is not supplied, so the vocabulary is built and the model is trained in separate, explicit steps.

In [57]:
# Hyper-parameters only; no corpus is passed, so nothing is trained yet.
# NOTE(review): `size` is the gensim 3.x keyword (renamed in gensim 4) — confirm the
# installed version before upgrading.
w2v_model = Word2Vec(min_count=10,       # minimum token frequency to enter the vocabulary
                     window=2,           # context window on each side of the target token
                     size=300,           # dimensionality of the word vectors
                     sample=6e-5,        # down-sampling threshold for frequent tokens
                     alpha=0.03,         # initial learning rate
                     min_alpha=0.0007,   # learning rate floor reached at the end of training
                     negative=20,        # number of negative samples per positive example
                     # Leave one core free, but never drop to 0 workers on a single-core machine.
                     workers=max(1, cores - 1))

#### .build_vocab()

Builds the vocabulary from the sequence of sentences and initializes the model.

In [58]:
# Scan the corpus once to build the vocabulary, and report how long it took.
t = time()

w2v_model.build_vocab(
    sentences,
    progress_per=5000,
)

elapsed_minutes = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_minutes))

INFO - 11:03:48: collecting all words and their counts
INFO - 11:03:48: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 11:03:48: PROGRESS: at sentence #5000, processed 28172 words, keeping 4186 word types
INFO - 11:03:48: PROGRESS: at sentence #10000, processed 55444 words, keeping 6148 word types
INFO - 11:03:48: PROGRESS: at sentence #15000, processed 82180 words, keeping 7753 word types
INFO - 11:03:48: PROGRESS: at sentence #20000, processed 110781 words, keeping 9218 word types
INFO - 11:03:48: PROGRESS: at sentence #25000, processed 139323 words, keeping 10460 word types
INFO - 11:03:48: PROGRESS: at sentence #30000, processed 166056 words, keeping 11344 word types
INFO - 11:03:48: PROGRESS: at sentence #35000, processed 192812 words, keeping 12262 word types
INFO - 11:03:48: PROGRESS: at sentence #40000, processed 220161 words, keeping 13147 word types
INFO - 11:03:49: PROGRESS: at sentence #45000, processed 248629 words, keeping 13932 word types
INFO -

Time to build vocab: 0.02 mins


#### Training model

In [59]:
# Train for 30 epochs over the corpus counted by build_vocab(), and report the wall time.
t = time()

w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

elapsed_minutes = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_minutes))

INFO - 11:06:52: training model with 7 workers on 3300 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 11:06:53: EPOCH 1 - PROGRESS: at 51.35% examples, 62215 words/s, in_qsize 0, out_qsize 0
INFO - 11:06:54: worker thread finished; awaiting finish of 6 more threads
INFO - 11:06:54: worker thread finished; awaiting finish of 5 more threads
INFO - 11:06:54: worker thread finished; awaiting finish of 4 more threads
INFO - 11:06:54: worker thread finished; awaiting finish of 3 more threads
INFO - 11:06:54: worker thread finished; awaiting finish of 2 more threads
INFO - 11:06:54: worker thread finished; awaiting finish of 1 more threads
INFO - 11:06:54: worker thread finished; awaiting finish of 0 more threads
INFO - 11:06:54: EPOCH - 1 : training on 334617 raw words (124548 effective words) took 1.6s, 80014 effective words/s
INFO - 11:06:55: EPOCH 2 - PROGRESS: at 86.88% examples, 103042 words/s, in_qsize 0, out_qsize 0
INFO - 11:06:55: worker thread

INFO - 11:07:06: EPOCH 12 - PROGRESS: at 48.15% examples, 58506 words/s, in_qsize 0, out_qsize 0
INFO - 11:07:07: worker thread finished; awaiting finish of 6 more threads
INFO - 11:07:07: worker thread finished; awaiting finish of 5 more threads
INFO - 11:07:07: worker thread finished; awaiting finish of 4 more threads
INFO - 11:07:07: worker thread finished; awaiting finish of 3 more threads
INFO - 11:07:07: worker thread finished; awaiting finish of 2 more threads
INFO - 11:07:07: worker thread finished; awaiting finish of 1 more threads
INFO - 11:07:07: worker thread finished; awaiting finish of 0 more threads
INFO - 11:07:07: EPOCH - 12 : training on 334617 raw words (124900 effective words) took 1.6s, 79312 effective words/s
INFO - 11:07:08: EPOCH 13 - PROGRESS: at 83.93% examples, 104248 words/s, in_qsize 0, out_qsize 0
INFO - 11:07:08: worker thread finished; awaiting finish of 6 more threads
INFO - 11:07:08: worker thread finished; awaiting finish of 5 more threads
INFO - 11:0

INFO - 11:07:20: worker thread finished; awaiting finish of 6 more threads
INFO - 11:07:20: worker thread finished; awaiting finish of 5 more threads
INFO - 11:07:20: worker thread finished; awaiting finish of 4 more threads
INFO - 11:07:20: worker thread finished; awaiting finish of 3 more threads
INFO - 11:07:20: worker thread finished; awaiting finish of 2 more threads
INFO - 11:07:20: worker thread finished; awaiting finish of 1 more threads
INFO - 11:07:20: worker thread finished; awaiting finish of 0 more threads
INFO - 11:07:20: EPOCH - 23 : training on 334617 raw words (124703 effective words) took 1.1s, 108991 effective words/s
INFO - 11:07:21: EPOCH 24 - PROGRESS: at 83.93% examples, 103847 words/s, in_qsize 0, out_qsize 0
INFO - 11:07:22: worker thread finished; awaiting finish of 6 more threads
INFO - 11:07:22: worker thread finished; awaiting finish of 5 more threads
INFO - 11:07:22: worker thread finished; awaiting finish of 4 more threads
INFO - 11:07:22: worker thread f

Time to train the model: 0.61 mins


In [60]:
# make model memory efficient
# NOTE(review): per the gensim 3.x docs, replace=True keeps only the L2-normalised
# vectors, so the model presumably cannot be trained further after this call —
# confirm before reusing it for more training.
w2v_model.init_sims(replace=True)

INFO - 11:08:09: precomputing L2-norms of word weight vectors


## Exploring the model

### Most similar

In [65]:
# Tokens most similar to 'michael'. Several neighbours ('stare', 'walk', 'exit')
# read like stage directions rather than related characters.
w2v_model.wv.most_similar(positive=['michael'])

[('stare', 0.9281713962554932),
 ('hang_phone', 0.9271564483642578),
 ('leaf', 0.9229561686515808),
 ('annex', 0.9186846017837524),
 ('jim', 0.9086365103721619),
 ('quietly', 0.9002681970596313),
 ('excuse', 0.8983709812164307),
 ('smile', 0.8971234560012817),
 ('walk', 0.8954806327819824),
 ('exit', 0.8950591087341309)]

In [64]:
# Same query for the bigram token 'michael_scott'; neighbours are mostly other
# full names and company terms.
w2v_model.wv.most_similar(positive=['michael_scott'])

[('dwight_schrute', 0.9417242407798767),
 ('paper_company', 0.9340528249740601),
 ('dunder_mifflin', 0.9315226078033447),
 ('introduce', 0.905677318572998),
 ('regional_manager', 0.9046109914779663),
 ('jim_halpert', 0.9032676815986633),
 ('david_wallace', 0.8575462102890015),
 ('robert_california', 0.8443684577941895),
 ('mr', 0.8369607925415039),
 ('howard', 0.8282136917114258)]

In [73]:
# Tokens most similar to 'jim'; 'pam' ranks first.
w2v_model.wv.most_similar(positive=['jim'])

[('pam', 0.9532161951065063),
 ('stare', 0.9458998441696167),
 ('quietly', 0.9405232667922974),
 ('whisper', 0.9315623044967651),
 ('annex', 0.9296285510063171),
 ('nod', 0.9281949996948242),
 ('smile', 0.9257539510726929),
 ('shake_head', 0.9235219955444336),
 ('leaf', 0.9179366230964661),
 ('erin', 0.9167506694793701)]

In [82]:
# Tokens most similar to a location/department word rather than a character name.
w2v_model.wv.most_similar(positive=['warehouse'])

[('nate', 0.8701183199882507),
 ('friendly', 0.863349437713623),
 ('upper', 0.8486036062240601),
 ('brand', 0.8473314642906189),
 ('vp', 0.8464341163635254),
 ('suspect', 0.8459171056747437),
 ('injury', 0.8451924324035645),
 ('director', 0.8450943231582642),
 ('temporary', 0.8407367467880249),
 ('claim', 0.8386325836181641)]

In [83]:
# Tokens most similar to the bigram token 'andy_bernard'.
w2v_model.wv.most_similar(positive=['andy_bernard'])

[('dunder', 0.9334063529968262),
 ('mill', 0.9275669455528259),
 ('chief', 0.9271690845489502),
 ('regard', 0.9200038909912109),
 ('former', 0.9177340269088745),
 ('ceremony', 0.9167332649230957),
 ('lawyer', 0.9143906831741333),
 ('cappella', 0.9129273295402527),
 ('associate', 0.9128299951553345),
 ('retail', 0.9127031564712524)]

In [91]:
# Tokens most similar to 'oscar'; possessive forms such as 'kevins' appear because
# apostrophes were stripped during cleaning.
w2v_model.wv.most_similar(positive=['oscar'])

[('shy', 0.9073845148086548),
 ('approach', 0.9008233547210693),
 ('phyllis', 0.899946928024292),
 ('kevin', 0.8996772766113281),
 ('pat', 0.8965698480606079),
 ('kevins', 0.8955251574516296),
 ('open_door', 0.8949199914932251),
 ('shake_head', 0.8947243690490723),
 ('mary', 0.8944340944290161),
 ('deangelos', 0.8943274617195129)]

### Similarities

In [93]:
# Pairwise similarity score between two tokens (same value as 'pam' in the 'jim' neighbour list).
w2v_model.wv.similarity("jim", "pam")

0.95321625

In [102]:
# Similarity between a character and a place-related word.
w2v_model.wv.similarity("michael", "conference")

0.75328887

In [103]:
# Similarity between the tokens 'dwight' and 'angela'.
w2v_model.wv.similarity("dwight", "angela")

0.8048027

In [104]:
# Similarity between the tokens 'jim' and 'dwight'.
w2v_model.wv.similarity("jim", "dwight")

0.8841456

In [116]:
# Similarity between a bigram token and a unigram token.
w2v_model.wv.similarity("andy_bernard", "cornell")

0.7957406

### Odd-One-Out

In [105]:
# Ask the model which of the three tokens fits least with the other two.
w2v_model.wv.doesnt_match(['jim', 'dwight', 'creed'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'creed'

In [106]:
# Odd one out among 'jim', 'dwight' and 'pam'.
w2v_model.wv.doesnt_match(['jim', 'dwight', 'pam'])

'dwight'

In [108]:
# Odd one out among 'michael', 'dwight' and 'angela'.
w2v_model.wv.doesnt_match(['michael', 'dwight', 'angela'])

'angela'

In [114]:
# Odd one out when two character names are mixed with a non-character token.
w2v_model.wv.doesnt_match(['andy', 'erin', 'cornell'])

'cornell'

### t-SNE