# Intro and setup

This notebook demonstrates the use of word2vec in Python using the  [gensim libraries](https://github.com/RaRe-Technologies/gensim).  Information is available on the [gensim website](https://radimrehurek.com/gensim/index.html) along with tutorials and the [API](https://radimrehurek.com/gensim/apiref.html). 

You can install them to your local machine using the command:
```
pip install --upgrade gensim
```

In [1]:
# turn off pretty printing to get horizontal display - optional, but I'm saving space for display
%pprint

Pretty printing has been turned OFF


In [2]:
import os

# Data

In [3]:
with open('shakespeare.txt', 'r') as f:
    raw_data = f.read()

In [4]:
type(raw_data)

<class 'str'>

In [5]:
# what does it look like?
raw_data[:1000]

"A MIDSUMMER-NIGHT'S DREAM\n\nNow , fair Hippolyta , our nuptial hour \nDraws on apace : four happy days bring in \nAnother moon ; but O ! methinks how slow \nThis old moon wanes ; she lingers my desires ,\nLike to a step dame , or a dowager \nLong withering out a young man's revenue .\n\nFour days will quickly steep themselves in night ;\nFour nights will quickly dream away the time ;\nAnd then the moon , like to a silver bow \nNew-bent in heaven , shall behold the night \nOf our solemnities .\n\nGo , Philostrate ,\nStir up the Athenian youth to merriments ;\nAwake the pert and nimble spirit of mirth ;\nTurn melancholy forth to funerals ;\nThe pale companion is not for our pomp .\n\nHippolyta , I woo'd thee with my sword ,\nAnd won thy love doing thee injuries ;\nBut I will wed thee in another key ,\nWith pomp , with triumph , and with revelling .\n\n\nHappy be Theseus , our renowned duke !\n\nThanks , good Egeus : what's the news with thee ?\n\nFull of vexation come I , with complain

In [6]:
words = raw_data.split()

In [7]:
len(words)

980637

In [8]:
# how many unique words?
len(set(words))

33505

In [9]:
# the kinds of words you might expect from shakespeare
list(set(words))[:100]

['indifferent', 'moody-mad', 'proposes', 'disannul', "parent's", 'touchstone', 'boors', "Dove-feather'd", 'fifth', 'hodge-pudding', "brew'd", 'first', 'Somebody', 'injunction', 'Gallian', 'honourable-dangerous', 'Thrice', 'fringed', 'neighbour-stained', 'thaws', 'sweatest', "ravin'd", 'pretext', 'overcast', 'commixtion', 'Ancient', 'witty', 'carters', 'Towards', 'patines', 'easy-yielding', 'Ethiopian', 'Puff', 'chest', 'encamped', "prun'st", 'wickedness', 'shorter', 'Esteem', 'destroy', "moreo'er", 'forefended', 'missives', "Gabriel's", "money's", 'Virginity', 'affecting', 'dauntless', 'contemptuously', 'king-cardinal', 'Bosom', 'Wax', 'dresser', 'Vaudemont', "park'd", "Cynthia's", "Perform'd", 'northward', 'delivered', 'Took', "disclaim'd", 'workings', 'bedward', 'Startles', 'lamentation', "deriv'd", "Mail'd", 'brief', "maim'd", 'incur', 'sedges', 'unquietly', 'Augment', 'fealty', 'another', 'warming-pan', 'squeal', 'asketh', 'napkins', 'marking', 'smell', 'places', "'be", 'purse-taki

# Gensim

In [10]:
import gensim

## Setup data
`gensim.models` takes a corpus broken into sentences.  I'm using the `Text8Corpus` iterator that comes as part of the `word2vec` class.  You can use any other data as long as you create an iterable to yield sentences.

In [11]:
from gensim.summarization.textcleaner import split_sentences
model = gensim.models.Word2Vec(
    [[str(word) for word in sentence.split()] for sentence in split_sentences(raw_data)],
    size=150,
    window=10,
    min_count=2,
    workers=-1,
    iter=10)

## run model

In [12]:
# save it as binary
model.save('demo-model')

In [13]:
print(model)

Word2Vec(vocab=17786, size=150, alpha=0.025)


## Vocabulary

In [14]:
# get list of word vectors
model_words = list(model.wv.vocab)

In [15]:
len(model_words)

17786

In [16]:
# get sorted list of word vectors
words_indexes = list(model.wv.index2word)

In [17]:
len(words_indexes)

17786

In [18]:
words[:20]

['A', "MIDSUMMER-NIGHT'S", 'DREAM', 'Now', ',', 'fair', 'Hippolyta', ',', 'our', 'nuptial', 'hour', 'Draws', 'on', 'apace', ':', 'four', 'happy', 'days', 'bring', 'in']

In [19]:
# check the index for a word
model.wv.vocab['one'].index

81

## Vectors

In [20]:
model.wv?

In [21]:
model.wv.get_vector('one')

array([-1.8152921e-03,  5.6478905e-04, -7.8418496e-04, -3.6449387e-04,
        1.6285548e-03, -5.0214748e-04,  2.1059131e-03, -2.0875765e-03,
       -3.3218879e-03,  2.1152066e-03,  1.4569993e-03,  2.9110848e-03,
        3.9236867e-04,  2.3824876e-04, -1.7436886e-03,  1.1234558e-03,
       -1.3874199e-03,  3.9341021e-04,  3.0808896e-03, -3.2899153e-04,
        7.5272325e-04,  2.4378495e-03,  1.1028055e-03,  1.6816900e-03,
        2.4801707e-03,  4.6861434e-04, -9.6394437e-08,  1.5737962e-03,
       -2.4646814e-03,  8.8448462e-04, -2.5674489e-03,  1.2114829e-03,
       -2.6967127e-03, -8.7054499e-04,  1.8452557e-03,  2.0516629e-03,
        2.2942019e-03,  1.0021572e-03,  1.5452539e-03, -3.0509108e-03,
       -2.5775989e-03,  1.3247338e-03, -4.2071668e-05,  1.8963421e-04,
       -2.0919945e-03,  3.1433594e-03, -3.1250105e-03, -1.2731816e-03,
       -1.8060119e-03, -3.2136030e-03,  3.8091379e-04, -1.3472907e-03,
        2.6351626e-03, -2.1840718e-03, -1.9669659e-03,  1.1153348e-03,
      

In [22]:
len(model.wv.get_vector('one'))

150

### Distance from mean
<a id="distance-from-mean"></a>

In [23]:
model.wv.doesnt_match?

In [24]:
# find word in list that is farthest from the mean
model.wv.doesnt_match("breakfast cereal dinner lunch".split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'breakfast'

In [1]:
model.wv.doesnt_match("cook janitor pilot sport teacher".split())

NameError: name 'model' is not defined

In [26]:
model.wv.doesnt_match("kill dead knife love".split())

'love'

In [27]:
model.wv.doesnt_match("insect animal cat tree".split())

'tree'

In [28]:
model.wv.doesnt_match("dog cat parrot lion".split())

'dog'

## Similarity
<a id="similarity"></a>

### Cosine similarity

In [29]:
model.wv.similarity?

In [30]:
model.wv.similarity('angry', 'happy')

-0.04043906

In [31]:
model.wv.similarity('woman', 'tree')

0.013431083

In [32]:
model.wv.similarity('tree', 'shrub')

0.13484123

In [33]:
model.wv.similarity('tree', 'bush')

-0.021015963

In [34]:
# distance is just the opposite of similarity
model.wv.distance('woman', 'tree')

0.9865689165890217

In [35]:
model.wv.distance('woman', 'man') + model.wv.similarity('woman', 'man')

1.0

In [36]:
# closest by cosine similarity
model.wv.similar_by_word('woman', topn=10)

[('spread', 0.35272103548049927), ('funeral', 0.31335967779159546), ('Lysander', 0.31107717752456665), ('un', 0.2948065400123596), ('mads', 0.29344886541366577), ('chat', 0.28563278913497925), ('Begin', 0.2839564085006714), ('corpse', 0.2800891399383545), ("reason's", 0.27878642082214355), ('trice', 0.27630355954170227)]

In [37]:
# closest by cosine similarity
model.wv.similar_by_word('she', topn=10)

[('precedent', 0.3183187246322632), ('Southam', 0.3044399619102478), ('menace', 0.29369544982910156), ('Lestrale', 0.2888152003288269), ('fist', 0.2856958508491516), ('hale', 0.28095510601997375), ('musical', 0.2804621458053589), ('dern', 0.27870047092437744), ('disgraced', 0.27278465032577515), ('disprove', 0.26655182242393494)]

In [38]:
model.wv.most_similar?

In [39]:
model.wv.most_similar(positive=['woman'], topn=10)

[('spread', 0.35272103548049927), ('funeral', 0.31335967779159546), ('Lysander', 0.31107717752456665), ('un', 0.2948065400123596), ('mads', 0.29344886541366577), ('chat', 0.28563278913497925), ('Begin', 0.2839564085006714), ('corpse', 0.2800891399383545), ("reason's", 0.27878642082214355), ('trice', 0.27630355954170227)]

In [40]:
model.wv.most_similar(negative=['woman'], topn=10)

[('torches', 0.3494153916835785), ('thighs', 0.29718780517578125), ('peal', 0.29053595662117004), ('regal', 0.28637075424194336), ("husband's", 0.27862781286239624), ('Excitements', 0.2690203785896301), ('Bouciqualt', 0.26698148250579834), ('runagate', 0.26553893089294434), ("disgrac'd", 0.26328760385513306), ('Instruct', 0.26045405864715576)]

In [41]:
model.wv.most_similar(positive=['woman', 'king'], topn=10)

[('cowards', 0.34229138493537903), ('downfall', 0.319137841463089), ('trice', 0.3064131438732147), ('losers', 0.2898869514465332), ("o'erpast", 0.28871944546699524), ('platform', 0.28868529200553894), ('hide', 0.2798108458518982), ('worthier', 0.2693411707878113), ('Nought', 0.2640897035598755), ('snail', 0.2635517418384552)]

In [42]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=10)

[('platform', 0.32144904136657715), ('nimbly', 0.31101977825164795), ('downfall', 0.2975620925426483), ('trice', 0.2919257879257202), ("o'erpast", 0.28407901525497437), ('worthier', 0.2759639024734497), ('hide', 0.26736924052238464), ('Fluellen', 0.2651178240776062), ('cripple', 0.26006942987442017), ('sharply', 0.25976935029029846)]

### Multiplicative combination

In [43]:
model.wv.most_similar_cosmul?

In [44]:
model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'], topn=10)

[('nimbly', 0.9265996813774109), ('platform', 0.888831615447998), ('downfall', 0.8324941992759705), ("despis'd", 0.8318725228309631), ('trice', 0.8252105712890625), ("o'erpast", 0.8192151784896851), ('appeal', 0.8181260228157043), ('acres', 0.817109227180481), ('worthier', 0.8125076293945312), ('flax', 0.8101872801780701)]

In [45]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=10)

[('platform', 0.32144904136657715), ('nimbly', 0.31101977825164795), ('downfall', 0.2975620925426483), ('trice', 0.2919257879257202), ("o'erpast", 0.28407901525497437), ('worthier', 0.2759639024734497), ('hide', 0.26736924052238464), ('Fluellen', 0.2651178240776062), ('cripple', 0.26006942987442017), ('sharply', 0.25976935029029846)]

In [46]:
model.wv.most_similar_cosmul(positive=['woman', 'king'], topn=10)

[('cowards', 0.3847014605998993), ('downfall', 0.37490156292915344), ('trice', 0.36851873993873596), ('losers', 0.36229458451271057), ('platform', 0.3616563081741333), ("o'erpast", 0.3615654706954956), ('hide', 0.35795024037361145), ('worthier', 0.35360199213027954), ('Nought', 0.3511848449707031), ('offender', 0.350872665643692)]

In [47]:
model.wv.most_similar(positive=['woman', 'king'], topn=10)

[('cowards', 0.34229138493537903), ('downfall', 0.319137841463089), ('trice', 0.3064131438732147), ('losers', 0.2898869514465332), ("o'erpast", 0.28871944546699524), ('platform', 0.28868529200553894), ('hide', 0.2798108458518982), ('worthier', 0.2693411707878113), ('Nought', 0.2640897035598755), ('snail', 0.2635517418384552)]