In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_lg')

In [None]:
"""
Not just single words but doc and spans of docs also have vector embeddings.
    
    nlp(u'<text>').vector <-- returns a 300 dimensional vector
"""

In [10]:
this_vector = nlp(u'This').vector

In [11]:
long_vector = nlp(u"""
    This is so cool. 
    How are they creating vectors for such long sentences?
    I wish to understand what is going on inside but I want to learn
    how to implement them first
    """).vector

In [17]:
tokens = nlp(u'lion cat tiger dog pet')

In [22]:
"""
token1.similarity(token2) --> takes cosine similarity between the vector for token1 & token2
"""
for token1 in tokens:
    for token2 in tokens:
        print(f"| {token1.text:{10}} {token2.text:{10}} {token1.similarity(token2)}")
    print('-----------------------------------------')

| lion       lion       1.0
| lion       cat        0.3854507803916931
| lion       tiger      0.713609516620636
| lion       dog        0.29493075609207153
| lion       pet        0.20031584799289703
-----------------------------------------
| cat        lion       0.3854507803916931
| cat        cat        1.0
| cat        tiger      0.5670855045318604
| cat        dog        0.8220816850662231
| cat        pet        0.732966423034668
-----------------------------------------
| tiger      lion       0.713609516620636
| tiger      cat        0.5670855045318604
| tiger      tiger      1.0
| tiger      dog        0.42287227511405945
| tiger      pet        0.31030499935150146
-----------------------------------------
| dog        lion       0.29493075609207153
| dog        cat        0.8220816850662231
| dog        tiger      0.42287227511405945
| dog        dog        1.0
| dog        pet        0.7856058478355408
-----------------------------------------
| pet        lion       0.200

In [23]:
tokens = nlp(u"like love hate kill")

In [24]:
def show_similarity(doc):
    for token1 in doc:
        for token2 in doc:
            print(f"| {token1.text:{10}} {token2.text:{10}} {token1.similarity(token2)}")
        print('-----------------------------------------')           

In [25]:
show_similarity(tokens)

| like       like       1.0
| like       love       0.5212638974189758
| like       hate       0.5065140724182129
| like       kill       0.30623432993888855
-----------------------------------------
| love       like       0.5212638974189758
| love       love       1.0
| love       hate       0.5708349943161011
| love       kill       0.25415143370628357
-----------------------------------------
| hate       like       0.5065140724182129
| hate       love       0.5708349943161011
| hate       hate       1.0
| hate       kill       0.3746185302734375
-----------------------------------------
| kill       like       0.30623432993888855
| kill       love       0.25415143370628357
| kill       hate       0.3746185302734375
| kill       kill       1.0
-----------------------------------------


In [27]:
nlp.vocab.vectors.shape # 514157 many unique words with 300 dimension representation

(514157, 300)

In [31]:
"""
Not every word will have an interpretation or will be in the vocab of dictionary

    vector_norm --> L2 norm or eucledian distance
"""

'\nNot every word will have an interpretation or will be in the vocab of dictionary\n\n    vector_norm --> L2 norm or eucledian distance\n'

In [32]:
tokens = nlp(u"dog cat nargle Abhijeet")

In [33]:
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov) # oov --> out of vocabulary

dog True 75.254234 False
cat True 63.188496 False
nargle False 0.0 True
Abhijeet True 17.454887 False


In [34]:
"""
Token arithmatic
"""

'\nToken arithmatic\n'

In [35]:
from scipy import spatial

In [36]:
cosine_similarity = lambda vec1, vec2: 1 - spatial.distance.cosine(vec1, vec2)

In [60]:
king = nlp.vocab['king'].vector

In [61]:
man = nlp.vocab['man'].vector

In [62]:
woman = nlp.vocab['woman'].vector

In [40]:
# king - man + woman --> queen

In [63]:
new_vector = king-man+woman

In [64]:
computed_similarites = []

for word in nlp.vocab:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(word.vector, new_vector)
                computed_similarites.append((word.text, similarity))
print(len(computed_similarites))

135


In [65]:
computed_similarites = sorted(computed_similarites, key=lambda item: -item[1])

In [66]:
computed_similarites[:10]

[('king', 0.8489541411399841),
 ('queen', 0.6178014278411865),
 ('and', 0.3899005055427551),
 ('that', 0.38483577966690063),
 ('creating', 0.3748939335346222),
 ('where', 0.3385923206806183),
 ('them', 0.33383065462112427),
 ('she', 0.32445624470710754),
 ('they', 0.3206636309623718),
 ('implement', 0.31726282835006714)]