In [1]:
import spacy
from scipy import spatial

In [2]:
# Load spaCy's medium English model and show which components make up its
# processing pipeline.
nlp = spacy.load("en_core_web_md")
nlp.pipe_names

['tagger', 'parser', 'ner']

In [3]:
# Reference sizes for the English models (both use 300-dimensional vectors):
#   md -> 685k keys mapped onto 20k unique vectors
#   lg -> 685k keys mapped onto 685k unique vectors
# Print the vocab size first, then the size of the vector table.
print(len(nlp.vocab), len(nlp.vocab.vectors), sep="\n")

1340241
20000


In [4]:
# The shape of a word's vector tells us the embedding width (300 dimensions here).
nlp("lion").vector.shape

(300,)

In [5]:
# Compare every token with every token (itself included) and print the
# pair together with its similarity score.
tokens = nlp("cat lion pet")

for left in tokens:
    for right in tokens:
        print(left.text, right.text, left.similarity(right))

cat cat 1.0
cat lion 0.5265438
cat pet 0.7505457
lion cat 0.5265438
lion lion 1.0
lion pet 0.39923766
pet cat 0.7505457
pet lion 0.39923766
pet pet 1.0


In [6]:
# Spot-check a few tokens, two of them proper names: for each one print
# whether it has a vector, that vector's norm, and whether it is flagged as
# out-of-vocabulary.
tokens = nlp("dog cat sharma abhishek")
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
sharma True 7.524998 False
abhishek True 7.3687935 False


In [7]:
# Goal: evaluate the analogy king - man + woman.
# Start by fetching the vector of each of the three words.
king, man, woman = (nlp(term).vector for term in ("king", "man", "woman"))

In [8]:
# Build the analogy vector: subtract "man" from "king", then add "woman".
new_vector = (king - man) + woman
new_vector

array([ 5.14087021e-01, -2.78459996e-01,  2.42767006e-01,  4.54899669e-02,
       -2.59425014e-01, -3.19999963e-01,  3.23920012e-01, -6.71030045e-01,
       -9.98499990e-02,  1.91499996e+00, -5.68080008e-01, -2.74451017e-01,
       -1.49906695e-01,  8.01083148e-02, -2.34764010e-01, -1.10950008e-01,
       -1.02593988e-01,  8.53819966e-01, -2.68564999e-01,  3.85140002e-01,
       -1.36149988e-01,  6.35029972e-01, -7.62044966e-01, -2.52770007e-01,
       -6.75969958e-01,  3.89851004e-01, -2.89680034e-01,  1.75860003e-01,
       -5.16229987e-01,  5.21373034e-01, -1.89909995e-01,  6.73759937e-01,
        1.17550008e-01, -4.69896019e-01,  5.88999987e-01,  1.29447982e-01,
       -5.71900010e-01, -5.47450066e-01, -4.84210014e-01,  5.85503951e-02,
        4.82379973e-01, -2.86769986e-01, -2.01718003e-01, -4.74729985e-01,
        3.43068987e-01, -2.28827983e-01, -1.76439017e-01,  6.05450034e-01,
        2.07139999e-01, -2.89762974e-01, -7.63288975e-01,  4.37090009e-01,
       -2.06220001e-01, -

In [9]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two 1-D vectors.

    SciPy exposes the cosine *distance* (1 - similarity), so the value is
    flipped back here: 1.0 means the vectors point in the same direction,
    0.0 means they are orthogonal and -1.0 means they are opposite.

    Parameters
    ----------
    vec1, vec2 : array-like
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        ``1 - scipy.spatial.distance.cosine(vec1, vec2)``.
    """
    return 1 - spatial.distance.cosine(vec1, vec2)

In [10]:
# Score the vocabulary against the analogy vector: every entry that has a
# vector and is purely alphabetic and lowercase contributes one
# (similarity, text) pair.
similarities = [
    (cosine_similarity(new_vector, lexeme.vector), lexeme.text)
    for lexeme in nlp.vocab
    if lexeme.has_vector and lexeme.is_alpha and lexeme.is_lower
]

In [11]:
# Show the ten words closest to king - man + woman, best match first.
# We expect "queen" near the top; note that "king" itself is not filtered
# out of the candidates, so it can outrank "queen".
top_matches = sorted(similarities, reverse=True)[:10]
for _, candidate in top_matches:
    print(candidate)

king
queen
sultan
prince
highness
commoner
sultans
princes
maharajas
kumbia
