## Loading model
Here we load a pretrained word2vec model published by Google.

Google's full pre-trained model is available at https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit (1.5 GB).

A smaller model (345 MB unpacked), which is the one we'll be using, is available at https://github.com/eyaler/word2vec-slim/blob/master/GoogleNews-vectors-negative300-SLIM.bin.gz — download it and unpack the `.gz` archive before running the cell below.

In [1]:
import gensim

# Unpacked SLIM GoogleNews vectors, stored in the binary word2vec format.
MODEL_PATH = 'GoogleNews-vectors-negative300-SLIM.bin'

# Load the pretrained vectors into a KeyedVectors object used by every cell below.
model = gensim.models.KeyedVectors.load_word2vec_format(
    MODEL_PATH,
    binary=True,
)

## Vocabulary

In [2]:
# Get the list of words (vocabulary keys) known to the model.
# `model` is already a KeyedVectors, so the deprecated `.wv` indirection is dropped.
# NOTE(review): gensim >= 4 reportedly renames `vocab` to `key_to_index` — confirm
# against the installed version before upgrading.
model_words = list(model.vocab)

  


In [3]:
# number of entries in the vocabulary word list
len(model_words)

299567

In [4]:
# Get the list of words ordered by vocabulary index (position i holds word i).
# `model` is already a KeyedVectors, so the deprecated `.wv` indirection is dropped.
# NOTE(review): gensim >= 4 reportedly renames `index2word` to `index_to_key` —
# confirm against the installed version before upgrading.
words_indexes = list(model.index2word)

  


In [5]:
# number of entries in the index-ordered word list
len(words_indexes)

299567

In [6]:
# first 20 entries of the index-ordered word list
words_indexes[:20]

['in',
 'for',
 'that',
 'is',
 'on',
 'The',
 'with',
 'said',
 'was',
 'the',
 'at',
 'not',
 'as',
 'it',
 'be',
 'from',
 'by',
 'are',
 'I',
 'have']

In [7]:
# Look up the vocabulary index of a single word.
# Accessed directly on the KeyedVectors object; the `.wv` alias is deprecated.
# NOTE(review): under gensim >= 4 this lookup is reportedly `model.key_to_index['one']`
# — confirm against the installed version before upgrading.
model.vocab['one'].index

  


40

## Vectors

In [8]:
model.wv?

In [9]:
# Embedding vector for the word 'one' (called directly; `.wv` is a deprecated alias).
model.get_vector('one')

  """Entry point for launching an IPython kernel.


array([ 0.03171886, -0.10109327,  0.10855653,  0.11534131,  0.07632881,
        0.0052158 ,  0.05122511, -0.02154169,  0.10923501,  0.06920478,
        0.03612897, -0.1031287 ,  0.04477957,  0.08209588, -0.01840372,
        0.06275924,  0.00669997, -0.08582751, -0.01984549,  0.00502498,
       -0.04477957,  0.0563137 , -0.06072381, -0.02832647,  0.01306071,
        0.05733142, -0.05529598,  0.05495674,  0.00767529,  0.0580099 ,
        0.00128275,  0.05292131,  0.03358468,  0.0069544 ,  0.00631833,
        0.00538542,  0.01704677, -0.02111764,  0.03188848, -0.01848853,
        0.07124022, -0.00966832,  0.0121278 ,  0.04410109,  0.04172641,
        0.02747837,  0.06852631,  0.00784491, -0.06038457, -0.01781006,
       -0.03494163,  0.03494163,  0.01009236,  0.00155838, -0.03680745,
       -0.00602149,  0.02120245,  0.00886262,  0.03952136,  0.05461751,
        0.02120245,  0.06716935,  0.00268211, -0.14315893, -0.05258207,
       -0.00597909, -0.04477957,  0.00589428, -0.04087832,  0.08

In [10]:
# Length of the vector for 'one', i.e. the embedding dimensionality.
# Called directly on the KeyedVectors object; `.wv` is a deprecated alias.
len(model.get_vector('one'))

  """Entry point for launching an IPython kernel.


300

### Distance from mean
<a id="distance-from-mean"></a>

In [11]:
model.wv.doesnt_match?

In [12]:
# Find the word in the list that is farthest from the mean of the list's vectors.
# Called directly on the KeyedVectors object; `.wv` is a deprecated alias.
model.doesnt_match("breakfast cereal dinner lunch".split())

  
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'cereal'

In [13]:
# Odd one out among occupations plus 'sport' (no deprecated `.wv` indirection).
model.doesnt_match("cook janitor pilot sport teacher".split())

  """Entry point for launching an IPython kernel.


'sport'

In [14]:
# Odd one out among 'kill', 'dead', 'knife', 'love' (no deprecated `.wv` indirection).
model.doesnt_match("kill dead knife love".split())

  """Entry point for launching an IPython kernel.


'love'

In [15]:
# Odd one out among 'insect', 'animal', 'cat', 'tree' (no deprecated `.wv` indirection).
model.doesnt_match("insect animal cat tree".split())

  """Entry point for launching an IPython kernel.


'tree'

In [16]:
# Odd one out among four animals (no deprecated `.wv` indirection).
model.doesnt_match("dog cat parrot lion".split())

  """Entry point for launching an IPython kernel.


'lion'

## Similarity
<a id="similarity"></a>

### Cosine similarity

In [17]:
model.wv.similarity?

In [18]:
# Cosine similarity between 'angry' and 'happy' (no deprecated `.wv` indirection).
model.similarity('angry', 'happy')

  """Entry point for launching an IPython kernel.


0.37493223

In [19]:
# Cosine similarity between 'woman' and 'tree' (no deprecated `.wv` indirection).
model.similarity('woman', 'tree')

  """Entry point for launching an IPython kernel.


0.24068612

In [20]:
# Cosine similarity between 'tree' and 'shrub' (no deprecated `.wv` indirection).
model.similarity('tree', 'shrub')

  """Entry point for launching an IPython kernel.


0.598446

In [21]:
# Cosine similarity between 'tree' and 'bush' (no deprecated `.wv` indirection).
model.similarity('tree', 'bush')

  """Entry point for launching an IPython kernel.


0.44554722

In [22]:
# Distance is the complement of similarity: distance = 1 - similarity.
# Called directly on the KeyedVectors object; `.wv` is a deprecated alias.
model.distance('woman', 'tree')

  


0.7593138813972473

In [23]:
# Sanity check: distance and similarity of the same pair sum to 1.
# Both calls go directly through the KeyedVectors object; `.wv` is a deprecated alias.
model.distance('woman', 'man') + model.similarity('woman', 'man')

  """Entry point for launching an IPython kernel.


1.0

In [24]:
# Ten words closest to 'woman' by cosine similarity.
# Called directly on the KeyedVectors object; `.wv` is a deprecated alias.
model.similar_by_word('woman', topn=10)

  


[('man', 0.7664012312889099),
 ('girl', 0.7494640946388245),
 ('teenager', 0.631708562374115),
 ('lady', 0.6288787126541138),
 ('mother', 0.607630729675293),
 ('policewoman', 0.6069462299346924),
 ('boy', 0.5975908041000366),
 ('Woman', 0.5770983099937439),
 ('she', 0.5641393661499023),
 ('WOMAN', 0.5480420589447021)]

In [36]:
# Ten words closest to 'cat' by cosine similarity.
# Called directly on the KeyedVectors object; `.wv` is a deprecated alias.
model.similar_by_word('cat', topn=10)

  


[('cats', 0.8099379539489746),
 ('dog', 0.7609457969665527),
 ('kitten', 0.7464984059333801),
 ('feline', 0.7326234579086304),
 ('beagle', 0.7150583863258362),
 ('puppy', 0.7075453400611877),
 ('pup', 0.6934291124343872),
 ('pet', 0.6891531348228455),
 ('felines', 0.6755931377410889),
 ('chihuahua', 0.6709762811660767)]

In [26]:
model.wv.most_similar?

In [37]:
# Top 10 matches with 'cat' as the only positive example (no deprecated `.wv` indirection).
model.most_similar(positive=['cat'], topn=10)

  """Entry point for launching an IPython kernel.


[('cats', 0.8099379539489746),
 ('dog', 0.7609457969665527),
 ('kitten', 0.7464984059333801),
 ('feline', 0.7326234579086304),
 ('beagle', 0.7150583863258362),
 ('puppy', 0.7075453400611877),
 ('pup', 0.6934291124343872),
 ('pet', 0.6891531348228455),
 ('felines', 0.6755931377410889),
 ('chihuahua', 0.6709762811660767)]

In [38]:
# Top 10 matches with 'cat' supplied only as a negative example (no deprecated `.wv` indirection).
model.most_similar(negative=['cat'], topn=10)

  """Entry point for launching an IPython kernel.


[('Syndicale', 0.19090351462364197),
 ('involvedin', 0.19022290408611298),
 ('MIDSTREAM', 0.18107298016548157),
 ('AIPS', 0.1751096248626709),
 ('JAF', 0.16483020782470703),
 ('Nator', 0.1642889678478241),
 ('SoI', 0.16299599409103394),
 ('GIULIO', 0.16063831746578217),
 ('FGS', 0.15692923963069916),
 ('YEM', 0.15415330231189728)]

In [52]:
# Top 10 matches for the combination of 'cat' and 'baby' (no deprecated `.wv` indirection).
model.most_similar(positive=['cat', 'baby'], topn=10)

  """Entry point for launching an IPython kernel.


[('kitten', 0.78639817237854),
 ('puppy', 0.7523137331008911),
 ('pup', 0.7289223670959473),
 ('puppies', 0.7061880826950073),
 ('kittens', 0.6959831714630127),
 ('newborn', 0.6925413608551025),
 ('cats', 0.6913950443267822),
 ('dog', 0.6681196689605713),
 ('infant', 0.6670407056808472),
 ('babies', 0.6643195152282715)]

In [69]:
# Analogy-style query: 'Google' + 'mail' - 'internet' (no deprecated `.wv` indirection).
model.most_similar(positive=['Google', 'mail'], negative=['internet'], topn=10)

  """Entry point for launching an IPython kernel.


[('USPS', 0.4894232153892517),
 ('mailbox', 0.4463164508342743),
 ('mailing', 0.4420804977416992),
 ('Yahoo', 0.43995609879493713),
 ('UPS', 0.43590471148490906),
 ('mailboxes', 0.41918665170669556),
 ('FedEx', 0.4184463620185852),
 ('mailings', 0.41606035828590393),
 ('GMail', 0.41539356112480164),
 ('Gmail', 0.41207629442214966)]

### Multiplicative combination

In [31]:
model.wv.most_similar_cosmul?

In [32]:
# Multiplicative-combination query for 'woman' + 'king' - 'man' (no deprecated `.wv` indirection).
model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'], topn=10)

  """Entry point for launching an IPython kernel.


[('queen', 0.9314122200012207),
 ('monarch', 0.858533501625061),
 ('princess', 0.8476566076278687),
 ('queens', 0.8099815249443054),
 ('monarchy', 0.801961362361908),
 ('prince', 0.8009798526763916),
 ('empress', 0.7958388328552246),
 ('throne', 0.7853889465332031),
 ('sultan', 0.7844569683074951),
 ('royal', 0.7835602760314941)]

In [33]:
# Same 'woman' + 'king' - 'man' query via most_similar, for comparison with the cosmul result.
# Called directly on the KeyedVectors object; `.wv` is a deprecated alias.
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=10)

  """Entry point for launching an IPython kernel.


[('queen', 0.7118192911148071),
 ('monarch', 0.6189674735069275),
 ('princess', 0.5902431607246399),
 ('prince', 0.5377322435379028),
 ('kings', 0.5236844420433044),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454),
 ('throne', 0.5005807280540466),
 ('royal', 0.4938204884529114)]

In [34]:
# Multiplicative-combination query with only positive examples (no deprecated `.wv` indirection).
model.most_similar_cosmul(positive=['woman', 'king'], topn=10)

  """Entry point for launching an IPython kernel.


[('queen', 0.5432848334312439),
 ('man', 0.5429146885871887),
 ('princess', 0.5300394296646118),
 ('prince', 0.5130366683006287),
 ('girl', 0.5127540826797485),
 ('monarch', 0.5109155178070068),
 ('boy', 0.5006506443023682),
 ('lady', 0.4842751622200012),
 ('teenager', 0.4837571084499359),
 ('goddess', 0.4713200628757477)]

In [35]:
# Same positive-only query via most_similar, for comparison with the cosmul result.
# Called directly on the KeyedVectors object; `.wv` is a deprecated alias.
model.most_similar(positive=['woman', 'king'], topn=10)

  """Entry point for launching an IPython kernel.


[('man', 0.6628609299659729),
 ('queen', 0.643856406211853),
 ('girl', 0.6136074066162109),
 ('princess', 0.6087510585784912),
 ('monarch', 0.5900576710700989),
 ('prince', 0.5896846055984497),
 ('boy', 0.5665285587310791),
 ('lady', 0.5445605516433716),
 ('teenager', 0.5442259907722473),
 ('ruler', 0.5134526491165161)]