In [40]:
# importing packages

import spacy
from scipy import spatial

In [63]:
# Load the spaCy English model used throughout this notebook, then show
# which pipeline components it contains.

MODEL_NAME = 'en_core_web_md'
nlp = spacy.load(MODEL_NAME)
nlp.pipe_names

['tagger', 'parser', 'ner']

In [64]:
# Reference figures noted by the author:
#   md model --> 685k keys, 20k unique vectors (300 dimensions)
#   lg model --> 685k keys, 685k unique vectors (300 dimensions)
# NOTE(review): the first number printed is len(nlp.vocab), which presumably
# counts vocabulary entries rather than the vector "keys" quoted above --
# confirm against the spaCy Vocab/Vectors docs.
vocab_size = len(nlp.vocab)
print(vocab_size)
vector_table_size = len(nlp.vocab.vectors)
print(vector_table_size)

1340241
20000


In [8]:
# The shape of a word's vector tells us how many dimensions the embedding has.

lion_doc = nlp(u'lion')
lion_doc.vector.shape

(300,)

In [15]:
# Print the similarity score for every ordered pair of tokens
# (each token is also compared with itself).
tokens = nlp(u'cat lion pet')

token_pairs = ((left, right) for left in tokens for right in tokens)
for left, right in token_pairs:
    print(left.text, right.text, left.similarity(right))

cat cat 1.0
cat lion 0.52654374
cat pet 0.7505456
lion cat 0.52654374
lion lion 1.0
lion pet 0.39923766
pet cat 0.7505456
pet lion 0.39923766
pet pet 1.0


In [39]:
# Spot-check a few tokens: for each one print its text, whether it has a
# vector, the norm of that vector, and whether it is flagged out-of-vocabulary.

tokens = nlp(u'dog cat sharma abhishek')
for token in tokens:
    fields = (token.text, token.has_vector, token.vector_norm, token.is_oov)
    print(*fields)

dog True 7.0336733 False
cat True 6.6808186 False
sharma True 7.524998 False
abhishek True 7.3687935 False


In [18]:
# Goal: evaluate the word analogy king - man + woman (expected to land near "queen").
# Fetch the vector of each of the three input words.

king, man, woman = (nlp(term).vector for term in (u'king', u'man', u'woman'))

In [19]:
# Build the analogy vector: remove "man" from "king", then add "woman".

royal_offset = king - man
new_vector = royal_offset + woman
new_vector

In [41]:
# Cosine similarity helper built on SciPy's cosine *distance*.

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two 1-D vectors.

    ``scipy.spatial.distance.cosine`` yields a distance, so it is subtracted
    from 1 to convert it back into a similarity score.

    Args:
        vec1: First vector (array-like).
        vec2: Second vector (array-like); expected to have the same length
            as ``vec1``.

    Returns:
        ``1 - spatial.distance.cosine(vec1, vec2)``.
    """
    return 1 - spatial.distance.cosine(vec1, vec2)

In [51]:
# Score every vocabulary entry against the analogy vector.
# Only entries that have a vector and are alphabetic and lowercase are kept;
# each result is a (similarity, text) pair so the list can be sorted by score.

similarities = [
    (cosine_similarity(new_vector, lexeme.vector), lexeme.text)
    for lexeme in nlp.vocab
    if lexeme.has_vector and lexeme.is_alpha and lexeme.is_lower
]

In [55]:
# Show the 10 highest-scoring words for king - man + woman.
# The input words are not filtered out, so in the recorded output "king"
# itself ranks first and the expected answer, "queen", comes right after it.

top_matches = sorted(similarities, reverse=True)[:10]
for _score, match in top_matches:
    print(match)

king
queen
sultan
prince
highness
commoner
sultans
princes
maharajas
kumbia
