<a href="https://colab.research.google.com/github/rahul1990gupta/indic-nlp-datasets/blob/master/examples/word_embeddings_and_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from spacy.lang.hi import Hindi 
nlp = Hindi()
sent1 = 'मुझे भोजन पसंद है।'
doc = nlp(sent1)

In [4]:
doc[0].vector

array([], dtype=float32)

In [5]:
# As we can see that there are no word embeddings available for hindi words. 
# Luckily, there are word embeddings available online under fasttext project by facebook
# So, we will download them and load that in spaCy
import requests 
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz"
r = requests.get(url, allow_redirects=True)
fpath = url.split("/")[-1]
with open(fpath, "wb") as fw:
  fw.write(r.content)

In [6]:
# The file is about 1 GB in size. So, it will take some time to download. 
# Let's see how we can use external word embeddings in spaCy
# Here is a link to spacy documentation on how to do this https://spacy.io/usage/vectors-similarity#converting

! python -m spacy init-model hi ./hi_vectors_wiki_lg --vectors-loc cc.hi.300.vec.gz



⠙ Creating model...[2K[38;5;2m✔ Successfully created model[0m
⠙ Reading vectors from cc.hi.300.vec.gztcmalloc: large alloc 2251988992 bytes == 0x2dac000 @  0x7f6b1fca8001 0x7f6b1d80c765 0x7f6b1d870bb0 0x7f6b1d872a4f 0x7f6b1d909048 0x50a7f5 0x50cfd6 0x509918 0x50a64d 0x50c1f4 0x507f24 0x509c50 0x50a64d 0x50c1f4 0x507f24 0x588e91 0x59fe1e 0x50d596 0x507f24 0x509c50 0x50a64d 0x50c1f4 0x507f24 0x509c50 0x50a64d 0x50c1f4 0x507f24 0x5165a5 0x50a47f 0x50c1f4 0x507f24
1876653it [02:55, 10704.01it/s]
[2K[38;5;2m✔ Loaded vectors from cc.hi.300.vec.gz[0m
[38;5;2m✔ Sucessfully compiled vocab[0m
1876653 entries, 1876653 vectors


In [8]:
# Let's load the model now in spacy to do some work
import spacy
nlp_hi = spacy.load("./hi_vectors_wiki_lg")
doc = nlp_hi(sent1)
doc[0].vector

array([ 5.210e-02, -4.990e-02,  9.150e-02, -8.800e-03,  7.370e-02,
       -4.700e-03, -3.410e-02,  4.200e-03,  2.160e-02,  2.810e-02,
       -5.430e-02, -2.680e-02,  2.070e-02, -1.120e-02,  3.960e-02,
       -1.140e-02,  1.108e-01,  1.560e-02, -2.690e-02,  1.336e-01,
       -1.290e-02,  6.110e-02, -1.520e-02, -5.390e-02, -3.870e-02,
        2.180e-02, -4.010e-02, -4.380e-02,  4.030e-02,  6.700e-03,
       -6.400e-03, -2.020e-02,  3.660e-02, -4.700e-03,  6.030e-02,
        6.860e-02,  1.070e-02, -8.210e-02, -3.090e-02,  1.620e-02,
       -2.700e-03,  4.910e-02, -2.800e-03, -3.250e-02,  1.030e-02,
        9.310e-02, -1.380e-02, -6.380e-02, -1.900e-02, -3.220e-02,
       -9.700e-03, -5.200e-03, -7.400e-03,  9.890e-02, -1.530e-02,
       -4.380e-02,  1.590e-02,  8.700e-03, -9.600e-03, -1.080e-02,
       -3.300e-03, -2.090e-02, -7.560e-02,  6.220e-02,  3.440e-02,
       -5.700e-03, -7.590e-02, -5.490e-02, -9.000e-04, -4.920e-02,
        4.610e-02, -5.360e-02,  2.080e-02,  7.690e-02,  4.200e

In [9]:
# Looks like vectors are loaded. 
# Let's use these vectors to compare two very similar sentences

sent2 = 'मैं ऐसे भोजन की सराहना करता हूं जिसका स्वाद अच्छा हो।'
doc1 = nlp_hi(sent1)
doc2 = nlp_hi(sent2)

# Both the sent1 and sent2 are very similar, so, we expect their similarity score to be high
doc1.similarity(doc2)

0.8630901600736095

In [10]:
# similarity score implied that both the sentences are very similar
# Now, Let's find synonyms for a word using word embeddings 
doc1[1]

भोजन

In [17]:
vector1 = doc1[1].vector
result = nlp_hi.vocab.vectors.most_similar(vector1.transpose())


AxisError: ignored

In [42]:
# Uh-oh. Something is not right. Let's go to github issues
# Somebody suggested here to reshape the vector https://github.com/explosion/spaCy/issues/276
# Let's do that 
def get_similar_words(word):
  vector = word.vector
  results = nlp_hi.vocab.vectors.most_similar(vector.reshape(1, 300))

  ret = []
  for result in results:    
    try:
      candidate = nlp_hi.vocab[result[0][0]]
      ret.append(candidate.text)
    except KeyError:
      pass
    return ret



In [43]:
get_similar_words(doc[1])

['भोजन']

In [47]:
# That's not useful. Let's try some other words 
words = "सुंदर दिन माँ भाई"
doc = nlp_hi(words)

for token in doc:
  print(nlp_hi.vocab.vectors.most_similar(token.vector.reshape(1, 300)))

(array([[3568482901620635625]], dtype=uint64), array([[1220]], dtype=int32), array([[1.]], dtype=float32))
(array([[5776141810202350524]], dtype=uint64), array([[137]], dtype=int32), array([[1.]], dtype=float32))
(array([[13757753178649016088]], dtype=uint64), array([[559]], dtype=int32), array([[1.]], dtype=float32))
(array([[14413965628861974583]], dtype=uint64), array([[485]], dtype=int32), array([[1.]], dtype=float32))


In [None]:
# Hmm. seems like there is something wrong with wither spacy or word vectors. Maybe word vectors are very sparse and trained on very little 
# data 

# There is a nltk library for looking up similarity. Next time we will try that.  
