In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn

import matplotlib.pyplot as plt
import seaborn as sns
import re


In [3]:

from nltk.corpus import gutenberg, stopwords

In [4]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    return text

In [5]:
# Import all the Austen in the Project Gutenberg corpus.

sense = gutenberg.raw('austen-sense.txt')
persuasion = gutenberg.raw('austen-persuasion.txt')
emma = gutenberg.raw('austen-emma.txt')


# Clean the data.
sense_clean = text_cleaner(sense)
persuasion_clean = text_cleaner(persuasion)

emma_clean = text_cleaner(emma)

In [6]:
print(len(sense_clean))
print(len(persuasion_clean))
print(len(emma_clean))

666583
462818
876869


In [7]:
sense_1 = sense_clean[0:300000]
sense_2 = sense_clean[300000:]

persuasion_1 = persuasion_clean[0:200000]
persuasion_2 = persuasion_clean[200000:]

emma_1 = emma_clean[0:300000]
emma_2 = emma_clean[300000:600000]
emma_3 = emma_clean[600000:]

print(len(sense_1))
print(len(sense_2))

print(len(persuasion_1))
print(len(persuasion_2))

print(len(emma_1))
print(len(emma_2))
print(len(emma_3))

300000
366583
200000
262818
300000
300000
276869


In [1]:
import spacy
nlp = spacy.load('en')

In [8]:
sense_1_doc = nlp(sense_1)

In [9]:
sense_2_doc = nlp(sense_2)

In [10]:
persuasion_1_doc = nlp(persuasion_1)

In [11]:
persuasion_2_doc = nlp(persuasion_2)

In [12]:
emma_1_doc = nlp(emma_1)

In [13]:
emma_2_doc = nlp(emma_2)

In [14]:
emma_3_doc = nlp(emma_3)

In [15]:
all_sentences = []
for sentence in sense_1_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    all_sentences.append(sentence)
    

In [16]:
for sentence in sense_2_doc.sents:
    sentence2 = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    all_sentences.append(sentence2)

In [18]:
for sentence in persuasion_1_doc.sents:
    sentence3 = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    all_sentences.append(sentence3)

In [19]:
for sentence in persuasion_2_doc.sents:
    sentence4 = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    all_sentences.append(sentence4)

In [20]:
for sentence in emma_1_doc.sents:
    sentence5 = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    all_sentences.append(sentence5)

In [21]:
for sentence in emma_2_doc.sents:
    sentence6 = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    all_sentences.append(sentence6)

In [22]:
for sentence in emma_3_doc.sents:
    sentence7 = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    all_sentences.append(sentence7)

In [23]:
print(all_sentences[20])
print('We have {} sentences and {} tokens.'.format(len(all_sentences), (len(sense_clean) + len(persuasion_clean) + len(emma_clean))))

['-pron-', 'survive', '-pron-', 'uncle', 'no', 'longer', 'and', 'ten', 'thousand', 'pound', 'include', 'the', 'late', 'legacy', 'be', 'all', 'that', 'remain', 'for', '-pron-', 'widow', 'and', 'daughter']
We have 18161 sentences and 2006270 tokens.


In [24]:
# conda install -c anaconda gensim
import gensim
from gensim.models import word2vec




In [25]:

model = word2vec.Word2Vec(
    all_sentences,
    workers=2,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

done!


In [26]:
# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.wv.doesnt_match("breakfast marriage dinner lunch".split()))

[('people', 0.542380690574646), ('child', 0.5123438835144043), ('wait', 0.49683427810668945), ('person', 0.48106372356414795), ('lay', 0.4491744339466095), ('daughter', 0.44877469539642334), ('sake', 0.44300079345703125), ('invite', 0.4413103461265564), ('chamber', 0.440081924200058), ('set', 0.43497589230537415)]
0.560183042806
0.275718331033
marriage


# Drill

In [28]:
model = word2vec.Word2Vec(
    all_sentences,
    workers=2,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=2,  # Minimum word count threshold.
    window=3,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

done!
[('famous', 0.5893282890319824), ('people', 0.5829415321350098), ('person', 0.5615419149398804), ('gilberts', 0.5614441633224487), ('cape', 0.544938325881958), ('straightforward', 0.5141352415084839), ('clownish', 0.5114244222640991), ('physician', 0.5096943378448486), ('address', 0.5021142959594727), ('kellynch', 0.49815690517425537)]
0.609513507993
0.574171428229




marriage


In [29]:
model = word2vec.Word2Vec(
    all_sentences,
    workers=2,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=2,  # Minimum word count threshold.
    window=3,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=0           # Use hierarchical softmax.
)

print('done!')

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

done!
[('grandson', 0.7622113227844238), ('iv', 0.758206844329834), ('middleton', 0.7399743795394897), ('henry', 0.7323991060256958), ('patronage', 0.7281194925308228), ('brown', 0.7158392667770386), ('richard', 0.7148146629333496), ('bella', 0.7125517129898071), ('abruptness', 0.7116174697875977), ('pointer', 0.710627555847168)]
0.954916424423
0.945722839831




marriage


In [34]:
model = word2vec.Word2Vec(
    all_sentences,
    workers=2,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=2,  # Minimum word count threshold.
    window=3,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-1 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

done!
[('person', 0.5112453699111938), ('physician', 0.5019273161888123), ('people', 0.4616897404193878), ('gilberts', 0.46049851179122925), ('offence', 0.44991835951805115), ('party', 0.44036799669265747), ('crime', 0.44010987877845764), ('gentleman', 0.43712136149406433), ('subject', 0.4238295555114746), ('negotiation', 0.4132401645183563)]
0.276240473093
0.543629915672




marriage


# Conclusion
With changes in the hyperparameters of this model, finding the odd one out has performed consistently accurate than others.
We got result as "marriage" which is dissimilar to "breakfast", "lunch", and "dinner". 
For analogy while some words given above might possibly fill in the analogy lady:woman::man:?, most answers likely make little sense. 

Larger windows tend to capture more topic/domain information, while Smaller windows tend to capture more about word itself. So When decreased the window size from 6 to 3 and changed Minimum word count threshold from 10 to 2 , we got 4% improvement for finding similiarity between word "loud" and "aloud". But this incraesed the score from 27% to 57% for finding similarity between 'mr' and 'mrs'. 

# Drill 1

In [36]:
# Load Google's pre-trained Word2Vec model.
#model = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)

# Ran into memory problems - used the web interactive

In [None]:
# Analogies:

cat  is to kitten as dog is to ? Ans was puppy 
[["puppy",0.769972562789917],["pup",0.6861710548400879],["pit_bull",0.6776559352874756],["dogs",0.6770986318588257],["Rottweiler",0.66466224193573]]


day is to night as black is to ? Ans was white

 [["white",0.5813108682632446],["Responded_Letterman_How",0.4426087737083435],["raspberries_inhibited",0.4368473291397095],["Shilah_Phillips",0.42732203006744385],["sang_Learnin",0.4253648519515991]]

day is to sun as night is to ?
ans  [["Robert_Tychkowski_Edmonton",0.5355181694030762],["CARA_EASTWOOD",0.5136322975158691],["STEPHAN_FRAZIER",0.5134779214859009],["hijacked_airliner_belongs",0.5006062984466553],["columnist_Bill_Zwecker",0.5005240440368652]]

    
# Closest words:

bread 
[["butter",0.6417260766029358],["rye_sourdough",0.6290417909622192],["breads",0.6243128776550293],["loaf",0.6184971332550049],["flour",0.6152125597000122]]

rain
 [["heavy_rain",0.8421464562416077],["downpour",0.7967616319656372],["rains",0.7827130556106567],["torrential_rain",0.7578904628753662],["Rain",0.7476006746292114]]
    


# Words does not fit-
apple orange strawberry egg
-- output was egg
blue red green photo
-- output was photo

pen paper ink ball
-- ouput was ball

# Conclusion

In general, the model is very good at finding phrase that doesnot fit, and also pretty good at finding reasonable similarities. but struggles with finding analogies. 