In [1]:
import spacy

In [2]:
# For word vectors, use medium (md) or large (lg) models, as they are not available with small (sm) models
# If not installed, install as:
# python -m spacy download en_core_web_md

# Medium model
nlp = spacy.load('en_core_web_md')

In [3]:
# Check similarity between two sentences
doc1 = nlp('I like fast food.')
doc2 = nlp('I like pizza.')

print(doc1.similarity(doc2))

0.9009145331610278


In [4]:
# Compare similarity between two tokens
doc = nlp('I like pizza and pasta')
token1 = doc[2]
token2 = doc[4]

print('Similarity between', token1.text, 'and', token2.text, 'is', token1.similarity(token2))

Similarity between pizza and pasta is 0.73695457


In [5]:
# Different objects can be compared against each other
# Doc with a token
doc = nlp('I like pizza.')
token = nlp('soap')[0]

print(doc.similarity(token))
print(token.similarity(doc))

0.32468245260362194
0.32468245260362194


In [6]:
# Span with a doc
span = nlp('I like pizza and pasta')[2:5]
doc = nlp('McDonalds sells burgers')

print(span.similarity(doc))

0.6199091710787739


## Regarding Vectors & Similarity..

* In spacy, word vectors are used per token
* Vectors for doc and span default to average of token vectors
* Cosine similarity is the default for similarity calculation, but can be adjusted

In [7]:
# Vectors can be viewed by using vectpr attribute of a token
# 300-dim vector is obtained

doc = nlp('I have a banana.')
token = doc[3]

print('token:', token)
print('vector shape:', token.vector.shape)
print('vector:', token.vector)

token: banana
vector shape: (300,)
vector: [ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825

In [8]:
# There is no objective definition of similarity, since similarity depends on application context
# The two sentences below are ranked similar because of overlapping words
# But, since they express opposite sentiments, they might be required to be shown very dissimilar

doc1 = nlp('I like cats.')
doc1 = nlp('I hate cats.')
doc1.similarity(doc2)

0.8298221282148381

In [10]:
doc = nlp('Two bananas in pyjamas')
bananas_vector = doc[1].vector
print('token:', doc[1].text)
print('vector:', doc[1].vector)

token: bananas
vector: [-2.2009e-01 -3.0322e-02 -7.9859e-02 -4.6279e-01 -3.8600e-01  3.6962e-01
 -7.7178e-01 -1.1529e-01  3.3601e-02  5.6573e-01 -2.4001e-01  4.1833e-01
  1.5049e-01  3.5621e-01 -2.1508e-01 -4.2743e-01  8.1400e-02  3.3916e-01
  2.1637e-01  1.4792e-01  4.5811e-01  2.0966e-01 -3.5706e-01  2.3800e-01
  2.7971e-02 -8.4538e-01  4.1917e-01 -3.9181e-01  4.0434e-04 -1.0662e+00
  1.4591e-01  1.4643e-03  5.1277e-01  2.6072e-01  8.3785e-02  3.0340e-01
  1.8579e-01  5.9999e-02 -4.0270e-01  5.0888e-01 -1.1358e-01 -2.8854e-01
 -2.7068e-01  1.1017e-02 -2.2217e-01  6.9076e-01  3.6459e-02  3.0394e-01
  5.6989e-02  2.2733e-01 -9.9473e-02  1.5165e-01  1.3540e-01 -2.4965e-01
  9.8078e-01 -8.0492e-01  1.9326e-01  3.1128e-01  5.5390e-02 -4.2423e-01
 -1.4082e-02  1.2708e-01  1.8868e-01  5.9777e-02 -2.2215e-01 -8.3950e-01
  9.1987e-02  1.0180e-01 -3.1299e-01  5.5083e-01 -3.0717e-01  4.4201e-01
  1.2666e-01  3.7643e-01  3.2333e-01  9.5673e-02  2.5083e-01 -6.4049e-02
  4.2143e-01 -1.9375e-01  3.

In [14]:
doc1 = nlp('It\'s a warm summer day')
doc2 = nlp('It\'s sunny outside')
doc1.similarity(doc2)

0.8789265574516525

In [15]:
doc = nlp('I like TV and books')
token1, token2 = doc[2], doc[4]
token1.similarity(token2)

0.22325331

In [17]:
doc = nlp('This was a great restaurant. Afterwards, we went to a really nice bar.')
span1 = doc[3:5]
span2 = doc[12:15]

print(span1)
print(span2)
print(span1.similarity(span2))

great restaurant
really nice bar
0.7517392
