In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords

In [2]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    return text[0:900000]


# Import all the Austen in the Project Gutenberg corpus.
austen = ""
for novel in ['persuasion','emma','sense']:
    work = gutenberg.raw('austen-' + novel + '.txt')
    austen = austen + work

# Clean the data.
austen_clean = text_cleaner(austen)

In [3]:
# Parse the data. This can take some time.
import en_core_web_sm
nlp = en_core_web_sm.load()
austen_doc = nlp(austen_clean)

In [4]:
# Organize the parsed doc into sentences, while filtering out punctuation
# and stop words, and converting words to lower case lemmas.
sentences = []
for sentence in austen_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)


print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(austen_clean)))

['daughter', 'eld', 'give', 'thing', 'tempt']
We have 8323 sentences and 900000 tokens.


In [5]:
import gensim
from gensim.models import word2vec

model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

done!


In [6]:
# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('goddard', 0.9360855221748352), ('harville', 0.9300448894500732), ('musgrove', 0.9291076064109802), ('clay', 0.9229669570922852), ('benwick', 0.9154080152511597), ('wentworth', 0.9048967957496643), ('colonel', 0.8477964401245117), ('weston', 0.8383522033691406), ('god', 0.8213027715682983), ('navy', 0.817534327507019)]
0.91653967


  # This is added back by InteractiveShellApp.init_path()
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


marriage


## Drill 0

Take a few minutes to modify the hyperparameters of this model and see how its answers change. Can you wrangle any improvements?

In [7]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=10,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=30,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

done!


In [8]:
# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('clay', 0.8332077860832214), ('musgrove', 0.8324244022369385), ('benwick', 0.8230276107788086), ('excessively', 0.8155937194824219), ('harville', 0.785472571849823), ('shirley', 0.7738784551620483), ('god', 0.7603787779808044), ('hall', 0.7532171010971069), ('wentworth', 0.7360179424285889), ('navy', 0.7225682139396667)]
0.808252


  # This is added back by InteractiveShellApp.init_path()


marriage


In [9]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=10,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-2 ,  # Penalize frequent words.
    size=30,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

done!


In [11]:
# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('croft', 0.6666483283042908), ('clay', 0.6550388336181641), ('musgrove', 0.6068781614303589), ('harville', 0.5755938291549683), ('hall', 0.5471861362457275), ('join', 0.5249279737472534), ('christmas', 0.5094653367996216), ('henrietta', 0.4894123077392578), ('sister', 0.4886399805545807), ('put', 0.48214367032051086)]
0.56615853


  # This is added back by InteractiveShellApp.init_path()
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


marriage


In [12]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=400,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('musgrove', 0.9157456159591675), ('goddard', 0.904461145401001), ('harville', 0.903589129447937), ('benwick', 0.8948531150817871), ('clay', 0.8657619953155518), ('wentworth', 0.859317421913147), ('weston', 0.8532475233078003), ('navy', 0.8468574285507202), ('hall', 0.8434076309204102), ('colonel', 0.8427942991256714)]
0.89741284




dinner


## Drill 1: Word2Vec on 100B+ words
As we mentioned, word2vec really works best on a big corpus, but it can take half a day to clean such a corpus and run word2vec on it. Fortunately, there are word2vec models available that have already been trained on really big corpora. They are big files, but you can download a pretrained model of your choice here. At minimum, the ones built with word2vec (check the "Architecture" column) should load smoothly using an appropriately modified version of the code below, and you can play to your heart's content.

Because the models are so large, however, you may run into memory problems or crash the kernel. If you can't get a pretrained model to run locally, check out this interactive web app of the Google News model instead.

However you access it, play around with a pretrained model. Is there anything interesting you're able to pull out about analogies, similar words, or words that don't match? Write up a quick note about your tinkering and discuss it with your mentor during your next session.

I could not get a pretrained model to run locally, so I looked at the interactive web app of the Google News model. When looking at similar words, most of the words entered came up with similar words, but when I entered U.S.A, it returned a set of terms such as "trademark lawsuit" that might come after U.S.A. For other countries, this does not seem to happen and either similar countries or cities in that country generally come up. For analogies, this generally worked best when they were straightforward; something like Boston is to Massachusetts as Austin is to blank worked better than dress is to woman as suit is to blank, which returned lawsuit, which is not what we would expect. For words that don't match, putting in a fourth word that was more similar to the other three caused incorrect results, such as 'ice water juice steam', which I expected to return juice, but returned ice.