# Install First Large Corpus

In [59]:
# !python -m spacy download en_core_web_lg

# Import Spacy

In [60]:
import spacy

## Load Large Corpus (with vectors)

In [61]:
nlp = spacy.load('en_core_web_lg')

In [62]:
# See the vector values

x = nlp(u'The quick brown fox jumps over the lazy dog')
x.vector

array([-2.24356681e-01,  2.36273333e-02, -8.95571113e-02, -2.46849991e-02,
       -1.11383367e-02,  1.67895377e-01, -1.35965556e-01,  4.54140007e-02,
        1.75733224e-01,  1.79952669e+00, -2.56371140e-01, -1.32132426e-01,
       -1.17909007e-01, -1.19132228e-01, -1.53437123e-01,  5.88577799e-02,
        4.98943403e-02,  9.97797728e-01,  1.87705532e-02, -3.12472224e-01,
       -1.79796323e-01,  1.91822220e-02, -2.03344449e-02, -2.80422211e-01,
        1.33547157e-01, -8.47513303e-02, -1.59022897e-01, -1.44049451e-01,
        1.03780672e-01, -1.91260219e-01, -1.97320044e-01,  2.90798336e-01,
        3.06795575e-02, -2.73727775e-02,  1.84221894e-01, -4.08223346e-02,
        4.02911119e-02, -7.83216804e-02, -2.88786497e-02, -2.23331116e-02,
        1.70663014e-01,  5.03979996e-02, -1.22279301e-01, -1.17678441e-01,
        1.17505684e-01,  8.05872232e-02, -1.25954449e-01,  1.26498044e-01,
        1.02427334e-01,  3.67995575e-02, -7.65517727e-02,  1.33473098e-01,
       -2.75868904e-02,  

In [63]:
# See the vector shape

y = nlp(u'The quick brown fox jumps over the lazy dog')
y.vector.shape

(300,)

In [64]:
# Token examples

tokens = nlp(u'lion cat pet')

# lion cat pet
# like love hate

In [65]:
# Check Similarities

for token1 in tokens:

  for token2 in tokens:

    print(token1.text, token2.text, token1.similarity(token2))

lion lion 1.0
lion cat 0.5265437960624695
lion pet 0.39923766255378723
cat lion 0.5265437960624695
cat cat 1.0
cat pet 0.7505456805229187
pet lion 0.39923766255378723
pet cat 0.7505456805229187
pet pet 1.0


In [83]:
# Check Current Vocabulary

len(nlp.vocab.vectors)

342918

In [82]:
# Check existence of vocabulary

tokens = nlp(u'dog cat nargle')

for token in tokens:

  print(token.text, token.has_vector, token.vector_norm, token.is_oov) # out of vocabulary (oov)

dog True 7.0336733 False
cat True 6.6808186 False
nargle False 0.0 True


In [68]:
# Vector Arithmetic

from scipy import spatial

cosine_similarity = lambda vec1, vec2: 1 - spatial.distance.cosine(vec1, vec2)

In [84]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

# king - man + woman ---> NEW VECTOR similar quean, princess, highness

new_vector = king + woman

computed_similarities = []

for word in nlp.vocab:

  if word.has_vector:

    if word.is_lower:

      if word.is_alpha:

        similarity = cosine_similarity(new_vector, word.vector)
        computed_similarities.append((word, similarity))

In [85]:
# Sort list of tuples (descending)

computed_similarities = sorted(computed_similarities, key = lambda item:-item[1])

In [86]:
# Print Top 10 similar words

print([t[0].text for t in computed_similarities[:10]])

['king', 'woman', 'man', 'she', 'he', 'who', 'when', 'lion', 'was', 'that']
