In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
!python -m spacy download en

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
[x] Couldn't link model to 'en'
Creating a symlink in spacy/data failed. Make sure you have the required
permissions and try re-running the command as admin, or use a virtualenv. You
can still import the model as a module and call its load() method, or create the
symlink manually.
C:\Users\sajee\AppData\Local\Continuum\anaconda3\lib\site-packages\en_core_web_sm
-->
C:\Users\sajee\AppData\Local\Continuum\anaconda3\lib\site-packages\spacy\data\en
[!] Download successful but linking failed
Creating a shortcut link for 'en' didn't work (maybe you don't have admin
permissions?), but you can still load the model via its full package name: nlp =
spacy.load('en_core_web_sm')


You do not have sufficient privilege to perform this operation.


In [2]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    return text[0:900000]


# Import all the Austen in the Project Gutenberg corpus.
austen = ""
for novel in ['persuasion','emma','sense']:
    work = gutenberg.raw('austen-' + novel + '.txt')
    austen = austen + work

# Clean the data.
austen_clean = text_cleaner(austen)

In [3]:
# Parse the data. This can take some time.
nlp = spacy.load('en_core_web_sm')
austen_doc = nlp(austen_clean)

In [4]:
# Organize the parsed doc into sentences, while filtering out punctuation
# and stop words, and converting words to lower case lemmas.
sentences = []
for sentence in austen_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)


print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(austen_clean)))

['lady', 'russell', 'steady', 'age', 'character', 'extremely', 'provide', 'thought', 'second', 'marriage', 'need', 'apology', 'public', 'apt', 'unreasonably', 'discontent', 'woman', 'marry', 'sir', 'walter', 'continue', 'singleness', 'require', 'explanation']
We have 9298 sentences and 900000 tokens.


In [5]:
import gensim
from gensim.models import word2vec

model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

  "C extension not loaded, training will be slow. "


done!


In [6]:
# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('clay', 0.9573068618774414), ('goddard', 0.9536917209625244), ('musgrove', 0.949884831905365), ('harville', 0.9459196329116821), ('benwick', 0.9432364702224731), ('charles', 0.9112398624420166), ('wentworth', 0.902734637260437), ('hall', 0.9018844962120056), ('croft', 0.8926742076873779), ('colonel', 0.8894760012626648)]
0.901502


  # This is added back by InteractiveShellApp.init_path()


dinner


### Changed min_count from 10 to 6, increased size from 300 to 350, and increased window from 6 to 9:

In [7]:
model_new_parameters = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=6,  # Minimum word count threshold.
    window=9,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=350,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

  "C extension not loaded, training will be slow. "


done!


In [8]:
# List of words in model.
vocab_new_parameters = model_new_parameters.wv.vocab.keys()

print(model_new_parameters.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model_new_parameters.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model_new_parameters.doesnt_match("breakfast marriage dinner lunch".split()))

[('goddard', 0.9710274934768677), ('jealousy', 0.956786572933197), ('croft', 0.9492164850234985), ('musgrove', 0.9465190172195435), ('clay', 0.9457922577857971), ('hall', 0.9451195001602173), ('cole', 0.9381397366523743), ('repeatedly', 0.9370145797729492), ('acquire', 0.9366686940193176), ('sufficient', 0.9335703253746033)]
0.75098747


  # This is added back by InteractiveShellApp.init_path()


breakfast


After changing some of the model's hyperparameters, the model didn't improve.  Unlike the original model where some of the answers could have been used in an analogy of man:"word" as woman:lady, none of the answers given by the model with the new hyperparameters don't make sense in the analogy to the word "man", as we would have hoped.

Also, unlike the previous model, the model with the new hyperparameters failed at differentiating the word "marriage" from the list of words we had given about different meal times.

### Changed min_count from 10 to 14, and decreased window from 6 to 2: 

In [9]:
model_new_parameters2 = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=14,  # Minimum word count threshold.
    window=2,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

  "C extension not loaded, training will be slow. "


done!


In [10]:
# List of words in model.
vocab_new_parameters2 = model_new_parameters2.wv.vocab.keys()

print(model_new_parameters2.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model_new_parameters2.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model_new_parameters2.doesnt_match("breakfast marriage dinner lunch".split()))

[('goddard', 0.9588110446929932), ('weston', 0.9223220348358154), ('clay', 0.9200819730758667), ('smith', 0.8834136724472046), ('musgrove', 0.8788282871246338), ('hall', 0.8765681982040405), ('croft', 0.8623643517494202), ('colonel', 0.8536821603775024), ('harville', 0.8498642444610596), ('cole', 0.8123654127120972)]
0.6091271


  # This is added back by InteractiveShellApp.init_path()


dinner


I changed the hyperparameters again, but this time I decreased window from 6 to 2, and changed min_count from 10 to 14.  I hoped that by decreasing window from 6 to 2, the model would work better by focusing only on the words immediately next to the target word, but the model actually performed worse.  

### Used negative sampling instead of hierarchical softmax:

In [11]:
model_neg_sampling = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=0,           # Using negartive sampling.
    negative=6
)

print('done!')

  "C extension not loaded, training will be slow. "


done!


In [12]:
# List of words in model.
vocab_negative_sampling = model_neg_sampling.wv.vocab.keys()

print(model_neg_sampling.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model_neg_sampling.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model_neg_sampling.doesnt_match("breakfast marriage dinner lunch".split()))

[('enter', 0.9993537664413452), ('highly', 0.9993413686752319), ('wish', 0.9993402361869812), ('friend', 0.9993385076522827), ('meet', 0.9993383884429932), ('anxiety', 0.9993330240249634), ('consciousness', 0.9993325471878052), ('age', 0.9993311166763306), ('listen', 0.9993295669555664), ('appearance', 0.9993287324905396)]
0.9996066


  # This is added back by InteractiveShellApp.init_path()


breakfast


With Negative Sampling the model was better at catching the similarity between words such as "Mr." and "Mrs.", but it still failed to differentiate the word "marriage" from the other words that represented meal times.