In [1]:
# One-time setup: download the large English spaCy model (en_core_web_lg).
# NOTE(review): `!python` runs whichever interpreter the shell resolves first;
# confirm that it is the same environment the notebook kernel uses.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     ---------------------------------------- 1.6/400.7 MB 9.3 MB/s eta 0:00:43
     ---------------------------------------- 3.1/400.7 MB 8.4 MB/s eta 0:00:48
      --------------------------------------- 5.5/400.7 MB 9.3 MB/s eta 0:00:43
      --------------------------------------- 7.3/400.7 MB 9.2 MB/s eta 0:00:43
      --------------------------------------- 9.7/400.7 MB 9.7 MB/s eta 0:00:41
     - ------------------------------------- 11.8/400.7 MB 9.7 MB/s eta 0:00:41
     - ------------------------------------ 14.4/400.7 MB 10.1 MB/s eta 0:00:39
     - ------------------------------------ 17.0/400.7 MB 10.3 MB/s eta 0:00:38
     - ------------------------------------ 19.4/400.7 MB 10.5 MB/s eta 0:00:37
     -- ----------------------

In [2]:
import spacy

# Word vectors take up a lot of space, so the small model (en_core_web_sm) does not include them.
# To get word vectors you need to install the medium or large English model.
# We use the large one here.

nlp = spacy.load("en_core_web_lg")

In [17]:
# Parse a short sentence that ends with a made-up word.
doc = nlp("The Lion is king asfasf")

# For every token, report whether it has a vector and whether it is out-of-vocabulary.
for token in doc:
    fields = (token.text, "Vector:", token.has_vector, "Out Of Vocabulrary:", token.is_oov)
    print(*fields)

The Vector: True Out Of Vocabulrary: False
Lion Vector: True Out Of Vocabulrary: False
is Vector: True Out Of Vocabulrary: False
king Vector: True Out Of Vocabulrary: False
asfasf Vector: False Out Of Vocabulrary: True


In [8]:
# Shape of the vector attached to the first token of `doc`.
doc[0].vector.shape

(300,)

In [18]:
# Reference text that the tokens in the following cell are compared against.
# NOTE(review): nlp(...) is called on a text here, so this is presumably a spaCy Doc
# (holding the single word "cake") rather than a Token, despite the name — confirm.
base_token = nlp("cake")
# Display the shape of the reference vector.
base_token.vector.shape

(300,)

In [21]:
# Candidate words, each compared against `base_token`.
doc = nlp("cake sandwich burger car tiger human wheat")

for token in doc:
    # Scale the similarity score so it reads as a percentage.
    similarity_pct = token.similarity(base_token) * 100
    print(f"{token.text} <---> {base_token.text}     Similarity  : {similarity_pct}%")

cake <---> cake     Similarity  : 100.00000000083284%
sandwich <---> cake     Similarity  : 55.85351062188764%
burger <---> cake     Similarity  : 47.560893431220684%
car <---> cake     Similarity  : 22.67691356854236%
tiger <---> cake     Similarity  : 25.20734479510205%
human <---> cake     Similarity  : 16.999006824590037%
wheat <---> cake     Similarity  : 36.095782868704354%


In [15]:
def print_similarity(base_word: str, words_to_compare: str, pipeline=None) -> None:
    """Print how similar each token of ``words_to_compare`` is to ``base_word``.

    Both texts are run through ``pipeline``; every token of the second text is
    then compared to the processed ``base_word`` via ``token.similarity(...)``
    and one line per token is printed.

    Args:
        base_word: Text that every token is compared against.
        words_to_compare: Text whose tokens are compared one by one.
        pipeline: Callable that turns a text into an iterable of tokens, where
            the result has a ``.text`` attribute and each token has ``.text``
            and ``.similarity(other)``. Defaults to the notebook-level ``nlp``
            object, so existing two-argument calls behave exactly as before.

    Returns:
        None. The results are written to standard output.
    """
    # Resolve the default lazily so the function does not silently depend on
    # hidden notebook state when a pipeline is passed in explicitly.
    if pipeline is None:
        pipeline = nlp

    base_doc = pipeline(base_word)
    for token in pipeline(words_to_compare):
        print(f"{token.text} <-> {base_doc.text}: ", token.similarity(base_doc))

In [16]:
# Compare every token of the sentence against the word "iphone".
print_similarity("iphone", "i am having iphone")

i <-> iphone:  0.3950952469165612
am <-> iphone:  0.18108134476424292
having <-> iphone:  0.1590331817061666
iphone <-> iphone:  1.0000000285783557


In [22]:
# Look up the vocabulary vectors for the four words used in the analogy.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ("king", "man", "woman", "queen")
)

# Vector arithmetic: take "man" out of "king" and add "woman".
result = king - man + woman

In [24]:
# Shape of the `king` vector.
king.shape

(300,)

##### Cosine similarity measures the similarity between two vectors of an inner product space. It is measured by the cosine of the angle between two vectors and determines whether two vectors are pointing in roughly the same direction.

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Each vector is wrapped in a list so it is passed as a single-row 2-D input;
# this compares the `king - man + woman` result with the `queen` vector.
cosine_similarity([result], [queen])

array([[0.78808445]], dtype=float32)