<a href="https://colab.research.google.com/github/souradipta93/NLP/blob/main/word2vec_full_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to word embedding

In [None]:
from gensim.models import Word2Vec



In [None]:
#!pip install python-Levenshtein

In [None]:
import warnings

# Silence every warning category for the rest of the session so that
# warning messages do not clutter the cell outputs.
warnings.simplefilter("ignore")

In [None]:
#Import text corpus from gensim
from gensim.test.utils import common_texts

In [None]:
# Display the sample corpus: a list of documents, each already split into word tokens
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

### Word2Vec Arguments

- `vector_size`: The number of dimensions of the embedding, i.e. the length of the dense vector that represents each token (word).
- `window`: The maximum distance between a target word and the words around it.
- `min_count`: The minimum number of occurrences a word needs to be included in training; words that occur fewer times than this are ignored.
- `workers`: The number of threads to use while training.
- `sg`: The training algorithm, either CBOW (0) or skip-gram (1).

In [None]:
# Fit a Word2Vec model whose vocabulary is built from the sample corpus.
# `sg` is not passed, so gensim's default training algorithm applies.
model = Word2Vec(
    sentences=common_texts,
    min_count=1,      # keep every word, even one that occurs only once
    window=5,         # context words considered on each side of the target
    vector_size=100,  # dimensionality of each word vector
    workers=4,        # training threads
)




In [None]:
# Show the learned vocabulary as a mapping from each word to its integer index
model.wv.key_to_index

{'system': 0,
 'graph': 1,
 'trees': 2,
 'user': 3,
 'minors': 4,
 'eps': 5,
 'time': 6,
 'response': 7,
 'survey': 8,
 'computer': 9,
 'interface': 10,
 'human': 11}

In [None]:
# Persist the trained model to the file "word2vec.model" (path is relative
# to the current working directory)
model.save("word2vec.model")

In [None]:
# Reload the saved model from disk into a separate variable (`model1`);
# the original `model` object is left untouched
model1 = Word2Vec.load('word2vec.model')


## Training model on custom text

In [None]:
# Training data: every sentence is written as plain text and split on
# whitespace, which yields one list of tokens per sentence.
sentences = [
    text.split()
    for text in (
        'this is the first class for word2vec',
        'this is the second class in NLP',
        'yet another concept',
        'one more day',
        'and the final class',
    )
]

In [None]:
# Continue training the loaded model on the new sentences.
#
# The loaded model's vocabulary only holds the words of the original corpus,
# and train() ignores words it does not know. Without extending the
# vocabulary first, every token of `sentences` is dropped: the earlier run
# reported (0, 24), i.e. 0 effective words out of 24 raw words.
model1.build_vocab(sentences, update=True)

# total_examples must be the number of sentences actually passed in; it was
# previously hard-coded to 1 although the corpus has five sentences.
model1.train(sentences, total_examples=len(sentences), epochs=1)

(0, 24)

In [None]:
# Look up the embedding of a single word as a numpy array.
# Note: this queries `model` (the originally trained model), not `model1`.
vector = model.wv.get_vector('computer')
vector

array([-0.00515774, -0.00667028, -0.0077791 ,  0.00831315, -0.00198292,
       -0.00685696, -0.0041556 ,  0.00514562, -0.00286997, -0.00375075,
        0.0016219 , -0.0027771 , -0.00158482,  0.0010748 , -0.00297881,
        0.00852176,  0.00391207, -0.00996176,  0.00626142, -0.00675622,
        0.00076966,  0.00440552, -0.00510486, -0.00211128,  0.00809783,
       -0.00424503, -0.00763848,  0.00926061, -0.00215612, -0.00472081,
        0.00857329,  0.00428458,  0.0043261 ,  0.00928722, -0.00845554,
        0.00525685,  0.00203994,  0.0041895 ,  0.00169839,  0.00446543,
        0.00448759,  0.0061063 , -0.00320303, -0.00457706, -0.00042664,
        0.00253447, -0.00326412,  0.00605948,  0.00415534,  0.00776685,
        0.00257002,  0.00811904, -0.00138761,  0.00808028,  0.0037181 ,
       -0.00804967, -0.00393476, -0.0024726 ,  0.00489447, -0.00087241,
       -0.00283173,  0.00783599,  0.00932561, -0.0016154 , -0.00516075,
       -0.00470313, -0.00484746, -0.00960562,  0.00137242, -0.00

In [None]:
# Rank the other vocabulary words by their similarity to 'system' and keep
# the ten best matches as (word, score) pairs
sim_word = model.wv.most_similar(positive=['system'], topn=10)
sim_word

[('computer', 0.21617142856121063),
 ('response', 0.09293834120035172),
 ('human', 0.07963485270738602),
 ('interface', 0.06288163363933563),
 ('survey', 0.027057476341724396),
 ('time', 0.016134709119796753),
 ('graph', -0.010839181020855904),
 ('minors', -0.02775038219988346),
 ('trees', -0.052346743643283844),
 ('eps', -0.05987627059221268)]

In [None]:
import gensim.downloader as api



In [None]:
# Load the pre-trained "glove-wiki-gigaword-100" word vectors from gensim-data.
# NOTE(review): presumably fetched over the network on first use — confirm
# before running this offline.
word_vectors = api.load("glove-wiki-gigaword-100")

In [None]:
# Combined query: 'lion' and 'tiger' contribute positively to the search,
# 'man' contributes negatively
result = word_vectors.most_similar(
    negative=['man'],
    positive=['lion', 'tiger'],
)
result

[('leopard', 0.6701600551605225),
 ('elephant', 0.551284909248352),
 ('rhinoceros', 0.5426068902015686),
 ('rhino', 0.5233815908432007),
 ('dragon', 0.52213054895401),
 ('elephants', 0.5167747139930725),
 ('turtle', 0.5037534832954407),
 ('leopards', 0.4965704083442688),
 ('unicorn', 0.4904889464378357),
 ('lillies', 0.4891042709350586)]

In [None]:
# Extract the top match. The displayed `result` lists (word, score) pairs in
# descending score order, so index 0 is the best match.
most_similar_key, similarity = result[0]  # unpack the first (word, score) pair
# Print the word together with its score rounded to four decimal places
print(f"{most_similar_key}: {similarity:.4f}")

leopard: 0.6702


In [None]:
# Same kind of positive/negative query, scored with the alternative
# "cosmul" similarity measure
result1 = word_vectors.most_similar_cosmul(
    negative=['man'],
    positive=['woman', 'king'],
)
result1

[('queen', 0.8964556455612183),
 ('monarch', 0.8495979309082031),
 ('throne', 0.8447030782699585),
 ('princess', 0.8371668457984924),
 ('elizabeth', 0.835679292678833),
 ('daughter', 0.8348594903945923),
 ('prince', 0.8230058550834656),
 ('mother', 0.815445065498352),
 ('margaret', 0.8147736191749573),
 ('father', 0.8100855946540833)]

In [None]:
# Find the odd one out: split the phrase into individual words and print the
# word that does not fit with the others
print(word_vectors.doesnt_match("breakfast cereal dinner lunch".split()))

cereal


In [None]:
# Measure the similarity between two words.
# Note: this rebinds `similarity`, which was assigned earlier when the top
# match was unpacked from `result`.
similarity = word_vectors.similarity('woman', 'man')
similarity

0.8323495

In [None]:
#conda install -c conda-forge pyemd (in anaconda prompt)
from pyemd import emd
from gensim.similarities import WmdSimilarity

In [None]:
# Compute the distance between two words and print it with one decimal place.
# NOTE(review): `emd` and `WmdSimilarity` are imported in the previous cell but
# are not used in the visible code; presumably a Word Mover's Distance example
# was intended here — confirm.
distance = word_vectors.distance("media", "movie")
print(f"{distance:.1f}")

0.5
