# Finding odd word using Word2Vec

In [33]:
import numpy as np
import gensim
from gensim.models import word2vec,KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

**KeyedVectors** : This object essentially contains the mapping between words and embeddings. After training, it can be used directly to query those embeddings in various ways.

**About the model/dataset:**
This pre-trained Google News model contains embeddings for about 3 million words and phrases, learned from a news corpus of roughly 100 billion words. The model file is about 3.3 GB. Each word is embedded in a 300-dimensional space.
The embeddings are arranged so that words with similar meanings lie close to each other in this space and therefore have a high cosine similarity.

In [2]:
# Read the binary word2vec-format file into a KeyedVectors lookup object.
word_vectors = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [27]:
# Fetch the embeddings of three sample words in a single unpacking step.
v_guava, v_mango, v_india = (
    word_vectors[token] for token in ("guava", "mango", "India")
)

In [24]:
# Show the dimensionality of two of the embeddings, one shape per line.
print(v_guava.shape, v_mango.shape, sep="\n")

(300,)
(300,)


In [28]:
# cosine_similarity works on 2-D inputs, so each embedding is reshaped into
# a single-row matrix before the comparison.
print("Cosine similarity between Guava and Mango is: ",
      cosine_similarity(v_guava.reshape(1, -1), v_mango.reshape(1, -1)))
print("Cosine similarity between India and Mango is: ",
      cosine_similarity(v_india.reshape(1, -1), v_mango.reshape(1, -1)))

Cosine similarity between Guava and Mango is:  [[0.71923983]]
Cosine similarity between India and Mango is:  [[0.21009971]]


**Since Guava and Mango both belong to the same category ('Fruit'), they have a high cosine similarity.
Since India and Mango belong to different categories ('Country' and 'Fruit'), they have a low cosine similarity.**

## Finding the odd word from a list of words

In [55]:
def odd_one_out(words, vectors=None):
    '''Return the word that fits least well with the rest of ``words``.

    Each word is scored by the cosine similarity between its embedding and
    the mean embedding of the whole list. The word with the lowest score is
    the odd one out. The per-word scores and the result are also printed.

    Parameters
    ----------
    words : list of str
        Candidate words; each one is looked up as ``vectors[word]``.
    vectors : mapping from str to 1-D vector, optional
        Embedding lookup. Defaults to the notebook-level ``word_vectors``.

    Returns
    -------
    str
        The word with the lowest similarity to the mean vector (the first
        such word if several tie).

    Raises
    ------
    ValueError
        If ``words`` is empty.

    Any error raised by the lookup for a word without an embedding is
    propagated unchanged.
    '''
    if vectors is None:
        vectors = word_vectors

    words = list(words)
    if not words:
        raise ValueError("words must contain at least one word")

    # Stack the embeddings into an (n_words, dim) matrix and average them.
    embeddings = np.array([vectors[w] for w in words])
    avg_vector = embeddings.mean(axis=0)
    avg_norm = np.linalg.norm(avg_vector)

    odd_word = None
    # Start from +inf so the first word always becomes the initial candidate,
    # even when its similarity is exactly 1.0 (e.g. a single-word list).
    min_similarity = float("inf")

    for w, vec in zip(words, embeddings):
        denom = np.linalg.norm(vec) * avg_norm
        # A zero-length vector has no direction; score it 0 instead of
        # dividing by zero.
        similar = float(np.dot(vec, avg_vector) / denom) if denom else 0.0
        if similar < min_similarity:
            min_similarity = similar
            odd_word = w

        print("Similarity between %s and avg_vector is %.3f"%(w,similar))

    print("\nThe odd word is: "+odd_word)
    return odd_word
    

In [56]:
# Quick check on a single list. The words are spelled out here because
# `input_1` is only defined in a later cell, so referencing it at this point
# fails on a fresh kernel (Restart & Run All).
odd_one_out(['apple','mango','juice','party','orange','guava'])

Similarity between apple and avg_vector is 0.765
Similarity between mango and avg_vector is 0.808
Similarity between juice and avg_vector is 0.688
Similarity between party and avg_vector is 0.289
Similarity between orange and avg_vector is 0.611
Similarity between guava and avg_vector is 0.790

The odd word is: party


In [66]:
# Sample word lists; the word expected to be the odd one is noted above each.

# Odd word: party
input_1 = ["apple", "mango", "juice", "party", "orange", "guava"]

# Odd word: fruit
input_2 = ["music", "dance", "sleep", "dancer", "fruit"]

# Odd word: dancer
input_3 = ["match", "player", "football", "cricket", "dancer", "ball"]

# Odd word: paris
input_4 = ["India", "paris", "Russia", "France", "Germany", "USA"]

In [67]:
# Gather the sample lists so they can all be processed in one loop.
w_list = [
    input_1,
    input_2,
    input_3,
    input_4,
]

In [68]:
# Run the odd-word search on every sample list, echoing the list first and
# leaving blank lines between the reports.
for candidate_words in w_list:
    print(candidate_words)
    odd_one_out(candidate_words)
    print("\n\n")

['apple', 'mango', 'juice', 'party', 'orange', 'guava']
Similarity between apple and avg_vector is 0.765
Similarity between mango and avg_vector is 0.808
Similarity between juice and avg_vector is 0.688
Similarity between party and avg_vector is 0.289
Similarity between orange and avg_vector is 0.611
Similarity between guava and avg_vector is 0.790

The odd word is: party



['music', 'dance', 'sleep', 'dancer', 'fruit']
Similarity between music and avg_vector is 0.645
Similarity between dance and avg_vector is 0.794
Similarity between sleep and avg_vector is 0.490
Similarity between dancer and avg_vector is 0.723
Similarity between fruit and avg_vector is 0.431

The odd word is: fruit



['match', 'player', 'football', 'cricket', 'dancer', 'ball']
Similarity between match and avg_vector is 0.588
Similarity between player and avg_vector is 0.684
Similarity between football and avg_vector is 0.712
Similarity between cricket and avg_vector is 0.643
Similarity between dancer and avg_vecto