In [1]:
import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
word_vectors=KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)

In [3]:
print(word_vectors)

<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x00000168E73F5B70>


In [4]:
v_apple=word_vectors['apple']
v_mango=word_vectors['mango']
print(v_apple.shape)
print(v_mango.shape)

(300,)
(300,)


In [None]:
cosine_similarity([v_apple],[v_mango])

In [None]:
cosine_similarity([v_apple],[v_apple])

In [None]:
import numpy as np

# Finding the odd one out

In [20]:
def odd_one_out(words):
    # accepts list of words and returns the odd one
    # generate all the word embeddings for the given list
    all_word_vector=[word_vectors[w] for w in words]
    avg_vector=np.mean(all_word_vector,axis=0)
    min_similarity=1.0
    odd_one_out=None
    for w in words:
        sim=cosine_similarity([word_vectors[w]],[avg_vector])
        if sim<min_similarity:
            min_similarity=sim
            odd_one_out=w
        print("similarity between %s and avg vector is %.2f"%(w,sim))
    return odd_one_out

In [21]:
input_1 = ["apple","mango","juice","party","orange"] 
input_2 = ["music","dance","sleep","dancer","food"]        
input_3  = ["match","player","football","cricket","dancer"]
input_4 = ["india","paris","russia","france","germany"]

In [22]:
odd_one_out(input_1)

similarity between apple and avg vector is 0.78
similarity between mango and avg vector is 0.76
similarity between juice and avg vector is 0.71
similarity between party and avg vector is 0.36
similarity between orange and avg vector is 0.65


'party'

In [23]:
odd_one_out(input_2)

similarity between music and avg vector is 0.66
similarity between dance and avg vector is 0.81
similarity between sleep and avg vector is 0.51
similarity between dancer and avg vector is 0.72
similarity between food and avg vector is 0.52


'sleep'

In [24]:
odd_one_out(input_3)

similarity between match and avg vector is 0.58
similarity between player and avg vector is 0.68
similarity between football and avg vector is 0.72
similarity between cricket and avg vector is 0.70
similarity between dancer and avg vector is 0.53


'dancer'

In [25]:
odd_one_out(input_4)

similarity between india and avg vector is 0.81
similarity between paris and avg vector is 0.75
similarity between russia and avg vector is 0.79
similarity between france and avg vector is 0.81
similarity between germany and avg vector is 0.84


'paris'

# Word Analogies

In [26]:
def predict_words(a,b,c,word_vectors):
    # accepts a triad of words a,b,c and returns d such that a is to b : c is to d
    a,b,c=a.lower(),b.lower(),c.lower()
    max_similarity=-100
    d=None
    words=word_vectors.vocab.keys()
    wa,wb,wc=word_vectors[a],word_vectors[b],word_vectors[c]
    for w in words:
        if w in [a,b,c]:
            continue
        wv=word_vectors[w]
        sim=cosine_similarity([wb-wa],[wv-wc])
        if sim>max_similarity:
            max_similarity=sim
            d=w
    return d

In [27]:
import warnings
warnings.filterwarnings('ignore')
predict_words("man","coder","woman",word_vectors)

'coders'

# Using Most Similar Method

In [29]:
word_vectors.most_similar(positive=['woman','king'],negative=['man'],topn=1)

[('queen', 0.7118192911148071)]

# Training Your Own Word2Vec Model

- word2vec model can learn embeddings from any text corpus
- continous bag of words model
- Skip Gram model
- Algorithm looks at window of target word(Y) to provide context word(X) , the model is trained on (X,Y) pairs in a supervised manner.The algorithm was developed by Tomos Mikolov.
- Example-  I Love to have Dominos Pizza and Garlic Bread.

# Data Preparation
-  Each sentence must be tokenized, into a list of words.
-  The sentences can be text loaded into memory once,or we can build a data pipeline  which iteratively feeds data to the model.

In [30]:
import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
import nltk
from nltk.corpus import stopwords

In [47]:
stopw=set(stopwords.words('english'))
def readFile(file):
    f=open(file,'r',encoding='utf-8')
    text=f.read()
    # since word2vec accepts only sentences and words 
    text=nltk.sent_tokenize(text)
    data=[]
    for sent in text:
        words=nltk.word_tokenize(sent)
        words=[w.lower() for w in words if len(w)>2 and w not in stopw]
        data.append(words)
    print(len(text))
    return data

In [48]:
data=readFile('bollywood.txt')
print(data)

18
[['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018'], ['the', 'deepika', 'ranveer', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple'], ['from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'deepika', 'ranveer', 'wedding', 'style', 'file'], ['not', 'ambanis', 'deepika', 'ranveer', 'priyanka', 'nick'], ['man', 'proves', 'wedding', 'the', 'year', 'this', 'year', 'year', 'big', 'fat', 'lavish', 'extravagant', 'weddings'], ['from', 'isha', 'ambani', 'anand', 'piramal', 'deepika', 'padukone', 'ranveer', 'singh', 'priyanka', 'chopra', 'nick', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', '2018', 'saw', 'many', 'grand', 'weddings'], ['but', 'nothing', 'beats', 'man', 'wedding', 'the', 'year', 'award', 'social', 'media'], ['priyanka', 'also', 'shared', 'video', 'featuring', 'nick', 'jonaswas', 'also', 'celebratin

# 2. Create Model

In [88]:
from gensim.models import Word2Vec
model=Word2Vec(data,size=300,window=10,min_count=1)
print(model)
# window: distance within we will check words, min_count will tell us minimum words we want, sie will tell us
# no of words in that context

Word2Vec(vocab=116, size=300, alpha=0.025)


In [89]:
# so there are 116 unique words in a dictionary and learning rate is 0.025, size will tell the dimensionality of vector

In [90]:
# to see the words in vocab/dictionary
words=list(model.wv.vocab)
print(words)

['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018', 'the', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple', 'from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'style', 'file', 'not', 'ambanis', 'priyanka', 'nick', 'man', 'proves', 'year', 'this', 'big', 'fat', 'lavish', 'extravagant', 'weddings', 'isha', 'ambani', 'anand', 'piramal', 'chopra', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', 'saw', 'many', 'grand', 'but', 'nothing', 'beats', 'award', 'social', 'media', 'shared', 'video', 'featuring', 'jonaswas', 'celebrating', 'family', 'first', 'celebrated', 'christmas', 'london', 'pictures', 'new', 'outstanding', 'glimpses', 'celebration', 'verbier', 'switzerland', 'married', 'december', 'three', 'receptions', 'delhi', 'mumbai', 'jaggo', 'night', 'made', 'even', 'special', 'industry', 'friends', 'long', '

In [91]:
print(len(words))

116


In [92]:
print(model["deepika"].shape)

(300,)


In [110]:
word_vectors_kv = KeyedVectors.load_word2vec_format('bollywood.bin',binary=False)
word_vectors = word_vectors_kv.wv

# Create Analogies

In [111]:
def predict_words(a,b,c,word_vectors):
    # accept a triad of words and returns d such that a is to b : c is to d
    a,b,c=a.lower(),b.lower(),c.lower()
    max_similarity=-200
    d=None
    wa,wb,wc=word_vectors[a],word_vectors[b],word_vectors[c]
    options=["ranveer","deepika","padukone","singh","nick","jonas","chopra",
            "priyanka","virat","anushka","ginni"]
    for w in options:
        if w in [a,b,c]:
            continue
        wv=word_vectors[w]
        sim=cosine_similarity([wb-wa],[wv-wc])
        if sim>max_similarity:
            max_similarity=sim
            d=w
    return d

# Test Your Model

In [118]:
predict_words("nick","priyanka","virat",word_vectors)

'anushka'

In [119]:
predict_words("ranveer","deepika","priyanka",word_vectors)

'nick'

In [120]:
predict_words("ranveer","singh","deepika",word_vectors)

'padukone'

In [121]:
predict_words("deepika","padukone","priyanka",word_vectors)

'chopra'

In [122]:
predict_words("priyanka","jonas","nick",word_vectors)

'chopra'