In [64]:
#download larger spacy english language models, that contain word vectors
#smaller models , dont contain the word vectors

In [65]:
import spacy

In [66]:
nlp = spacy.load('en_core_web_lg')

In [67]:
nlp(u"lion").vector
    #we are passing an unicode string

array([ 1.8963e-01, -4.0309e-01,  3.5350e-01, -4.7907e-01, -4.3311e-01,
        2.3857e-01,  2.6962e-01,  6.4332e-02,  3.0767e-01,  1.3712e+00,
       -3.7582e-01, -2.2713e-01, -3.5657e-01, -2.5355e-01,  1.7543e-02,
        3.3962e-01,  7.4723e-02,  5.1226e-01, -3.9759e-01,  5.1333e-03,
       -3.0929e-01,  4.8911e-02, -1.8610e-01, -4.1702e-01, -8.1639e-01,
       -1.6908e-01, -2.6246e-01, -1.5983e-02,  1.2479e-01, -3.7276e-02,
       -5.7125e-01, -1.6296e-01,  1.2376e-01, -5.5464e-02,  1.3244e-01,
        2.7519e-02,  1.2592e-01, -3.2722e-01, -4.9165e-01, -3.5559e-01,
       -3.0630e-01,  6.1185e-02, -1.6932e-01, -6.2405e-02,  6.5763e-01,
       -2.7925e-01, -3.0450e-03, -2.2400e-02, -2.8015e-01, -2.1975e-01,
       -4.3188e-01,  3.9864e-02, -2.2102e-01, -4.2693e-02,  5.2748e-02,
        2.8726e-01,  1.2315e-01, -2.8662e-02,  7.8294e-02,  4.6754e-01,
       -2.4589e-01, -1.1064e-01,  7.2250e-02, -9.4980e-02, -2.7548e-01,
       -5.4097e-01,  1.2823e-01, -8.2408e-02,  3.1035e-01, -6.33

In [68]:
#doc ans span objects also have vectors, and these vectors are originated from
#the averages from the token vectors

In [69]:
#doc itself is the average of all words

In [70]:
nlp(u"The quick brown fox jumped").vector.shape
#the doc is the avrage of all the singular vectors for all the words

(300,)

In [71]:
nlp(u"fox").vector.shape

(300,)

In [72]:
#identify the similar vectors
#by . similarity method , on the document tokens
#so we will create tokens

In [73]:
tokens = nlp(u"lion cat pet")

In [74]:
#now there is similarity between cat and pet, cats are often pets
# and lions and cat are also similar, since they are from same family

In [75]:
for token1 in tokens:
    for token2 in tokens:
        print(token1.text,token2.text, token1.similarity(token2))

lion lion 1.0
lion cat 0.52654374
lion pet 0.39923766
cat lion 0.52654374
cat cat 1.0
cat pet 0.7505456
pet lion 0.39923766
pet cat 0.7505456
pet pet 1.0


In [76]:
#lion lion, cat cat , and pet pet, have 100% similarity b/w each other

In [77]:
#cat and pet have more similarity

In [78]:
#lion and pet, have less simiilarity

In [79]:
tokens = nlp(u"like love hate")
#we know that love and hate are having dfferent meanings, how ever, they are used in similar context like
#we love a movie, or hate a movie, so in that aspect, they have some similarity

In [80]:
for token1 in tokens:
    for token2 in tokens:
        print(token1.text,token2.text, token1.similarity(token2))

like like 1.0
like love 0.65790397
like hate 0.6574652
love like 0.65790397
love love 1.0
love hate 0.6393099
hate like 0.6574652
hate love 0.6393099
hate hate 1.0


In [81]:
len(nlp.vocab.vectors)

684830

In [82]:
#they are 685 thousand unique words, in the vocabulary, that we have vectors for

In [83]:
nlp.vocab.vectors.shape

(684830, 300)

In [84]:
#685 thousand words,by 300 dimensions for each word vectora

In [85]:
#an unusual name will not have a vector

In [86]:
tokens = nlp(u"dog cat nargle")

In [87]:
for token in tokens:
    print(token.text,token.has_vector,token.vector_norm,token.is_oov)
    #is__oov, out of vocab

dog True 7.0336733 False
cat True 6.6808186 False
nargle False 0.0 True


In [88]:
#vector_norm ,sum of squares of 300 dimensions

In [89]:
#common names also may be vectorised
#we can acutally calculate a ne vector, from adding or subtracting the vectors
# famous example is king - men +women = queen

In [90]:
#so we need to calculate cosine similarity ourselves

In [91]:
from scipy import spatial 
#~cipi === scipy

cosine_similarity = lambda vec1,vec2 : 1 - spatial.distance.cosine(vec1,vec2)

In [92]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

In [93]:
#we are going to ask for king vector -man vector +woman vector ---->
#NEW_VECTOR similar to Queen,princess,highness
#we are assuming that , somewhere in that , 300 dimensions,
#there might be some understanding that,, with the (royalty) king, (gender) man,
#usually the kings are men, so subtracting (gender dimension)man from kings, 
#we might get some royalty and gender (woman)

In [94]:
new_vector = king-man+woman

In [95]:
computed_similarities = []

#For all words in my words
for word in nlp.vocab:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:#not a number
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarities.append((word,similarity))

In [96]:
computed_similarities = sorted(computed_similarities,key=lambda item:-item[1])
#- (descending order), (item[1] is similarity in (word,similarity))

In [97]:
# computed_similarities[:10]

print([t[0].text  for t in computed_similarities[:10]])

['king', 'woman', 'she', 'lion', 'who', 'fox', 'brown', 'when', 'dare', 'cat']
