In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords

In [1]:
pip install gensim

Collecting gensim
  Downloading https://files.pythonhosted.org/packages/09/ed/b59a2edde05b7f5755ea68648487c150c7c742361e9c8733c6d4ca005020/gensim-3.8.1-cp37-cp37m-win_amd64.whl (24.2MB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/0c/09/735f2786dfac9bbf39d244ce75c0313d27d4962e71e0774750dc809f2395/smart_open-1.9.0.tar.gz (70kB)
Collecting boto3 (from smart-open>=1.8.1->gensim)
  Downloading https://files.pythonhosted.org/packages/69/80/43311e9948169a65168cb9ce7eabf66d013c0ec099d4e2c3b72bd65eabbc/boto3-1.10.25-py2.py3-none-any.whl (128kB)
Collecting s3transfer<0.3.0,>=0.2.0 (from boto3->smart-open>=1.8.1->gensim)
  Downloading https://files.pythonhosted.org/packages/16/8a/1fc3dba0c4923c2a76e1ff0d52b305c44606da63f718d14d3231e21c51b0/s3transfer-0.2.1-py2.py3-none-any.whl (70kB)
Collecting jmespath<1.0.0,>=0.7.1 (from boto3->smart-open>=1.8.1->gensim)
  Downloading https://files.pythonhosted.org/packages/83/94/7179c3832a6d45b266ddb2aac329e

In [3]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    return text[0:900000]


# Import all the Austen in the Project Gutenberg corpus.
austen = ""
for novel in ['persuasion','emma','sense']:
    work = gutenberg.raw('austen-' + novel + '.txt')
    austen = austen + work

# Clean the data.
austen_clean = text_cleaner(austen)

In [17]:
# Parse the data. This can take some time.
nlp = spacy.load('en_core_web_sm')
austen_doc = nlp(austen_clean)

In [8]:
# Organize the parsed doc into sentences, while filtering out punctuation
# and stop words, and converting words to lower case lemmas.
sentences = []
for sentence in austen_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)


print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(austen_clean)))

['daughter', 'eld', 'give', 'thing', 'tempt']
We have 8146 sentences and 900000 tokens.


In [23]:
import gensim
from gensim.models import word2vec

model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done')

done


In [21]:
# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('goddard', 0.9232902526855469), ('musgrove', 0.9064273834228516), ('clay', 0.9050904512405396), ('harville', 0.8854701519012451), ('benwick', 0.8763200044631958), ('colonel', 0.8345077037811279), ('hall', 0.8329727649688721), ('wentworth', 0.8052282333374023), ('christmas', 0.8037371635437012), ('smith', 0.793175995349884)]
0.9294944


  # This is added back by InteractiveShellApp.init_path()
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


marriage


## Drill 0

### Increasing workers to 6

In [85]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=6,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('clay', 0.954087495803833), ('musgrove', 0.9431738257408142), ('benwick', 0.9208430647850037), ('harville', 0.9149641394615173), ('goddard', 0.8874160051345825), ('smith', 0.8836383819580078), ('wentworth', 0.8831669688224792), ('hall', 0.8781406879425049), ('colonel', 0.8536697626113892), ('croft', 0.8046360015869141)]
-----------------------------------------------------------------------------------
0.9553017
-----------------------------------------------------------------------------------




marriage


Not a big change !!!! similiarity between 'mr and 'mrs' increased.

### Decrease workers to 2

In [86]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=2,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('goddard', 0.9559103846549988), ('benwick', 0.9286251068115234), ('weston', 0.9163435697555542), ('colonel', 0.9051037430763245), ('harville', 0.9045014977455139), ('clay', 0.8930296897888184), ('wentworth', 0.8789148330688477), ('musgrove', 0.8775738477706909), ('smith', 0.8560359477996826), ('throat', 0.8381198644638062)]
-----------------------------------------------------------------------------------
0.9156404
-----------------------------------------------------------------------------------




marriage


Not a big change !!!! similiarity between 'mr and 'mrs' decreased. Unrelated words like 'throat' is included. <br>
It is better to go with workers = 6

### Increase min_count to 15

In [87]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=6,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=15,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('room', 0.9255969524383545), ('hall', 0.8896030783653259), ('kellynch', 0.8866817951202393), ('anne', 0.8844233751296997), ('louisa', 0.8835333585739136), ('sister', 0.8664480447769165), ('mary', 0.8648064732551575), ('musgrove', 0.8647823929786682), ('captain', 0.861393928527832), ('dalrymple', 0.8579645156860352)]
-----------------------------------------------------------------------------------
0.8143915
-----------------------------------------------------------------------------------




breakfast


Alot of unrelated words / breakfast was th non match and not marriage. It is not a good idea

### Decrease min_count to 5

In [88]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=6,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=15,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('louisa', 0.9397026896476746), ('hall', 0.939234733581543), ('musgrove', 0.9292339086532593), ('anne', 0.9248431921005249), ('charles', 0.9247894287109375), ('croft', 0.9185818433761597), ('room', 0.9124189019203186), ('clay', 0.9028815031051636), ('prefer', 0.8952974081039429), ('kellynch', 0.8920326232910156)]
-----------------------------------------------------------------------------------
0.7119586
-----------------------------------------------------------------------------------




breakfast


Slightly better than min_count = 15 but worse than word_count = 10. <br>
The best approach is to keep min_count = 10

### Increase window to 8

In [89]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=6,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=8,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('benwick', 0.9173084497451782), ('colonel', 0.890707790851593), ('musgrove', 0.8846743106842041), ('clay', 0.883734941482544), ('harville', 0.8778499364852905), ('goddard', 0.844180166721344), ('wentworth', 0.826752245426178), ('excessively', 0.8214936256408691), ('hall', 0.8111519813537598), ('smith', 0.8066307902336121)]
-----------------------------------------------------------------------------------
0.9589726
-----------------------------------------------------------------------------------




dinner


Dinner !!! not marriage . including words like execessively. 

### Decrease window to 4

In [99]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=6,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=4,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('goddard', 0.9319785833358765), ('clay', 0.91188645362854), ('musgrove', 0.8959094285964966), ('harville', 0.8893753290176392), ('benwick', 0.8730003237724304), ('colonel', 0.8387373685836792), ('estate', 0.8371177315711975), ('hall', 0.8339253664016724), ('god', 0.8147356510162354), ('smith', 0.8111018538475037)]
-----------------------------------------------------------------------------------
0.91297674
-----------------------------------------------------------------------------------




dinner


Not better as well. Keeping window = 6 is tyhe best option

### Sg = 1

In [100]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=6,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=4,      # Number of words around target word to consider.
    sg=1,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('decide', 0.6912325620651245), ('prefer', 0.68659907579422), ('listen', 0.6706044673919678), ('match', 0.6668838858604431), ('colonel', 0.6639150381088257), ('louisa', 0.6463339328765869), ('recommendation', 0.6383770108222961), ('henrietta', 0.636981725692749), ('except', 0.6302949786186218), ('cease', 0.6277197599411011)]
-----------------------------------------------------------------------------------
0.63978344
-----------------------------------------------------------------------------------




marriage


Very bad idea !!!!!

### Increase sample to 1e-2

In [103]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=6,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=4,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-2 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('clay', 0.6705139875411987), ('croft', 0.6131385564804077), ('hall', 0.5588861703872681), ('harville', 0.5258289575576782), ('musgrove', 0.5109976530075073), ('wallis', 0.5086723566055298), ('people', 0.47978276014328003), ('colonel', 0.46926355361938477), ('benwick', 0.40738463401794434), ('goddard', 0.39646345376968384)]
-----------------------------------------------------------------------------------
0.55075586
-----------------------------------------------------------------------------------




breakfast


Not a good idea

### decrease sample to 1e-4

In [104]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=6,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=4,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-4 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('say', 0.9996386170387268), ('charles', 0.9996241331100464), ('go', 0.9996241331100464), ('mind', 0.9996230602264404), ('mrs', 0.9996224045753479), ('come', 0.9996166229248047), ('family', 0.999616265296936), ('place', 0.9996160268783569), ('time', 0.999605119228363), ('mr', 0.9996025562286377)]
-----------------------------------------------------------------------------------
0.9998519
-----------------------------------------------------------------------------------




breakfast


Another bad idea. Keeping sample at 1e-3 is the best option

### Increase size to 500

In [105]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=6,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=4,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=500,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('clay', 0.9607342481613159), ('musgrove', 0.9415855407714844), ('harville', 0.9390192031860352), ('goddard', 0.9287592768669128), ('benwick', 0.9260991811752319), ('hall', 0.896354079246521), ('croft', 0.8887282013893127), ('colonel', 0.881657600402832), ('wentworth', 0.8797074556350708), ('smith', 0.8742300271987915)]
-----------------------------------------------------------------------------------
0.9251984
-----------------------------------------------------------------------------------




marriage


Not bad 

### Decrease size to 100

In [106]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=6,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=4,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=100,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('clay', 0.8901311159133911), ('goddard', 0.858863353729248), ('colonel', 0.8431275486946106), ('musgrove', 0.805325984954834), ('harville', 0.7979613542556763), ('benwick', 0.7938932180404663), ('hall', 0.7882579565048218), ('excessively', 0.7104836106300354), ('croft', 0.7015020847320557), ('wallis', 0.6915990114212036)]
-----------------------------------------------------------------------------------
0.8419894
-----------------------------------------------------------------------------------




dinner


It gets worse with decreasing size. Changing size to 500 slightly improves the model

### hs = 0

In [107]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=6,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=4,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=500,      # Word vector length.
    hs=0           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('kellynch', 0.999703049659729), ('equal', 0.9997026324272156), ('bath', 0.9997007846832275), ('mind', 0.9997005462646484), ('meet', 0.9996975064277649), ('manner', 0.9996971487998962), ('feel', 0.9996962547302246), ('look', 0.9996957778930664), ('excellent', 0.9996954202651978), ('pretty', 0.9996941089630127)]
-----------------------------------------------------------------------------------
0.99970305
-----------------------------------------------------------------------------------




breakfast


Bad idea !!!!!!

### the Best Available Possible Model

In [109]:
# Tinker with hyperparameters here.
model = word2vec.Word2Vec(
    sentences,
    workers=6,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=4,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=100,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))
print('-----------------------------------------------------------------------------------')
# Similarity is calculated using the cosine, so again 1 is total
# simlarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))
print('-----------------------------------------------------------------------------------')
# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('clay', 0.8789665102958679), ('harville', 0.8584632277488708), ('musgrove', 0.849410355091095), ('colonel', 0.8391425609588623), ('benwick', 0.8272191882133484), ('goddard', 0.8113318681716919), ('hall', 0.7921149730682373), ('excessively', 0.7917721271514893), ('croft', 0.778504490852356), ('wentworth', 0.7624565362930298)]
-----------------------------------------------------------------------------------
0.8638252
-----------------------------------------------------------------------------------




marriage


## Drill 1

In [110]:
# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format ('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [115]:
#Let's find out the size of the word vectors (aka the number of hidden units used in the model)
model.word_vec('Hello').shape[0]

300

We have vector of length 300 for each word

In [116]:
# Play around with your pretrained model here.
print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

  


[('fella', 0.6031545400619507), ('gentleman', 0.5849649906158447), ('chap', 0.5543248653411865), ('gent', 0.543907880783081), ('guy', 0.5265033841133118), ('lad', 0.5139425992965698), ('feller', 0.5072450041770935), ('bloke', 0.49030160903930664), ('rascal', 0.4873698949813843), ('ladies', 0.47617611289024353)]


In [117]:
print(model.wv.similarity('mr', 'mrs'))

  """Entry point for launching an IPython kernel.


0.66098833


In [118]:
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


marriage


In [119]:
print(model.wv.most_similar('Libya'))

  """Entry point for launching an IPython kernel.


[('Libyan', 0.8276487588882446), ('Qaddafi', 0.7149137854576111), ('Gadhafi', 0.7069592475891113), ('Libyans', 0.6999526619911194), ('Gaddafi', 0.6969988942146301), ('Col_Gaddafi', 0.6904028654098511), ('Kadhafi', 0.6901055574417114), ('Gadhafi_regime', 0.6888466477394104), ('Qadhafi', 0.6864583492279053), ('Gaddafi_regime', 0.684234082698822)]


In [120]:
print(model.wv.most_similar('Lazio'))

  """Entry point for launching an IPython kernel.


[('Sampdoria', 0.808428168296814), ('Juventus', 0.7894195914268494), ('Fiorentina', 0.7867385745048523), ('Udinese', 0.7749403715133667), ('Juve', 0.7597488164901733), ('AS_Roma', 0.7585214376449585), ('Cagliari', 0.7436245083808899), ('Empoli', 0.7397139072418213), ('Reggina', 0.7319977283477783), ('Chievo', 0.726660966873169)]


In [121]:
print(model.wv.most_similar('Messi'))

  """Entry point for launching an IPython kernel.


[('Lionel_Messi', 0.840912938117981), ('Xavi', 0.8333092927932739), ('Iniesta', 0.8275967836380005), ('Ronaldinho', 0.8265319466590881), ('Ronaldo', 0.8209547400474548), ("Eto'o", 0.8086107969284058), ('Forlan', 0.8052786588668823), ('Leo_Messi', 0.7813161611557007), ('Higuain', 0.7786868810653687), ('Puyol', 0.7752934694290161)]


In [124]:
# Play around with your pretrained model here.
print(model.wv.most_similar('Argentina'))

  


[('Uruguay', 0.7746641635894775), ('Brazil', 0.7405785322189331), ('Chile', 0.7342544794082642), ('Argentine', 0.7306740880012512), ('Paraguay', 0.7187186479568481), ('Ecuador', 0.6962069869041443), ('Argentines', 0.676395058631897), ('Argentinian', 0.6565613746643066), ('Agustin_Velotti', 0.6532235145568848), ('Argentinean', 0.6501435041427612)]


In [136]:
print(model.doesnt_match("Batistuta Maradona Messi Romario".split()))

Romario


In [137]:
print(model.doesnt_match("Libya Tunisia Egypt Chad".split()))

Chad


In [138]:
print(model.wv.similarity('Messi', 'Ronaldo'))

  """Entry point for launching an IPython kernel.


0.8209548


In [139]:
print(model.wv.similarity('messi', 'ronaldo'))

  """Entry point for launching an IPython kernel.


0.78799146


In [141]:
print(model.wv.similarity('Obama', 'Trump'))

  """Entry point for launching an IPython kernel.


0.3842004


In [143]:
print(model.wv.most_similar('Trump'))

  """Entry point for launching an IPython kernel.


[('Donald_Trump', 0.8103920817375183), ('impersonator_entertained', 0.5942257046699524), ('Ivanka_Trump', 0.5924582481384277), ('Ivanka', 0.560720682144165), ('mogul_Donald_Trump', 0.5592452883720398), ('Trump_Tower', 0.5485552549362183), ('Kepcher', 0.5468589067459106), ('billionaire_Donald_Trump', 0.5447269678115845), ('Trumpster', 0.5412819385528564), ('tycoon_Donald_Trump', 0.5383971929550171)]


In [147]:
print(model.wv.most_similar('Raleigh'))

  """Entry point for launching an IPython kernel.


[('Raleigh_NC', 0.7883150577545166), ('Winston_Salem', 0.7826129794120789), ('Greensboro', 0.7722611427307129), ('Asheville', 0.7606959342956543), ('Gastonia', 0.7328293323516846), ('Rocky_Mount', 0.7305831909179688), ('Chapel_Hill', 0.7248146533966064), ('Statesville', 0.7202255725860596), ('Spartanburg', 0.7052722573280334), ('Greenville', 0.702080249786377)]
