# Word2vec model trained on the Google News dataset

#### Shubhankar Tiwari


In [54]:
import gensim

# Load the pretrained Google News vectors. binary=True selects the binary
# word2vec format; the path is relative to the notebook's working directory.
model = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin.gz', binary=True)

### Check the word vector of a common word

In [55]:
# Look up the embedding of 'dog' and display its shape.
dog = model['dog']
dog.shape

(300,)

In [56]:
# Show the first ten components of the 'dog' vector.
dog[:10]

array([ 0.05126953, -0.02233887, -0.17285156,  0.16113281, -0.08447266,
        0.05737305,  0.05859375, -0.08251953, -0.01538086, -0.06347656],
      dtype=float32)

### Check most similar words by specifying positives and negatives

In [57]:
# Analogy query: words close to 'woman' and 'king' but far from 'man'.
model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
)

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.518113374710083),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411999702454)]

In [72]:
# Find the top-N most similar words using the multiplicative combination
# objective proposed by Omer Levy and Yoav Goldberg. Positive words still
# contribute positively towards the similarity and negative words negatively,
# but with less susceptibility to one large distance dominating the calculation.
model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

[('queen', 0.9314123392105103),
 ('monarch', 0.858533501625061),
 ('princess', 0.8476566076278687),
 ('Queen_Consort', 0.8150269985198975),
 ('queens', 0.8099815249443054),
 ('crown_prince', 0.808997631072998),
 ('royal_palace', 0.8027306795120239),
 ('monarchy', 0.801961362361908),
 ('prince', 0.800979733467102),
 ('empress', 0.7958388328552246)]

### Check most similar words for a proper noun

In [58]:
# Nearest neighbours of a first name; with no topn argument the call returns 10 results.
model.most_similar('Shubhankar')

[('Prathamesh', 0.7386659383773804),
 ('Akshatha', 0.728675127029419),
 ('Rutuja', 0.7228821516036987),
 ('Anushree', 0.7228357791900635),
 ('Subham', 0.7221089601516724),
 ('Rishika', 0.7186915874481201),
 ('Hyd_##K', 0.7139887809753418),
 ('Aakanksha', 0.7106143236160278),
 ('Rachit', 0.7093888521194458),
 ('Uthra', 0.7090432047843933)]

In [119]:
# Same kind of lookup, limited to the five closest words via topn.
model.similar_by_word('Kaia',topn=5)

[('Alana', 0.6019689440727234),
 ('Hannah', 0.582836389541626),
 ('Olivia', 0.5792908668518066),
 ('Tessa', 0.5756932497024536),
 ('Kyla', 0.5743770599365234)]

### Odd one out

In [59]:
# Ask the model which of these words does not belong with the others.
meal_words = "breakfast cereal dinner lunch".split()
model.doesnt_match(meal_words)

'cereal'

In [60]:
# The same odd-one-out query, this time on first names.
model.doesnt_match("Shubhankar Gaurav Ashlesh Eva".split())

'Eva'

### Similarity

In [61]:
# Similarity score between the vectors of 'woman' and 'man'.
model.similarity('woman', 'man')

0.7664012230995352

### Pick the most similar word

In [68]:
# From the candidate list, return the single word most similar to 'dog'.
model.most_similar_to_given('dog', ['cat','horse','deer','whale','baby'])

'cat'

In [69]:
# Returns all words that are closer to 'carnivore' than 'mammal' is to 'carnivore'.
model.words_closer_than('carnivore', 'mammal')

['carnivores', 'meat_eater']

### Get the best 10 matches for each MSBA name

In [78]:
# First names of the MSBA class. Each name is used as a query word for the
# model and as the reference list when filtering the suggested words.
msba = ['John','Prathamesh','Wenhui','Yilin','Bryce','Shuyu','Aastha','Abhijit','Abhilasha','Aditya',
        'Alexander','Aneesh','Animesh','Ankit','Ao','Aparajitha','Aravind','Archchana','Ashlesh','Astha',
        'Badarinath','Barkha','Bekzat','Benjamin','Bhanu','Bhuvan','Bingjie','Colin','Cory','Deepak','Devansh',
        'Disha','Gaurav','Gopi','Gowthami','Hanjing','Haoyang','Hemanth','Hui','Imran','Ishwarya','Jing','Jithin',
        'Justin','Kaia','Katharine','Kaustubh','Kaveri','Kevin','Kruthik','Kyle','Mayank','Millee','Monica','Moumi',
        'Mukul','Navya','Neelakanteshwar','Nicholas','Niharika','Nikhil','Peiwen','Pritheesh','Qinnan','Rachel',
        'Ramnath','Ravi','Saketh','Saumya','Saurabh','Shipra','Shubhankar','Siddart','Srihari','Sruthi','Sumit',
        'Suzanne','Tanya','Taraka','Tempo','Udit','Utkarsh','Varun','Vineet','Xiang','Yan','Yi','Yi-Fan','Yi-Hsiu',
        'Yuchen','Yuwen','Yuyang','Zeyuan']

#'Qinyu',

# Map every class name to its most similar words, as a list of
# (word, similarity) tuples returned by the model.
d = {}
for student in msba:
    try:
        d[student] = model.most_similar(student)
    except KeyError:
        # KeyedVectors raises KeyError for a word that is not in its
        # vocabulary; such names are skipped and get no entry in d.
        # Any other exception is a real problem and is allowed to surface.
        continue

### Filter matched names

Not every one of the top 10 matches for a name belongs to the MSBA class, so I filter out the matched names that are not part of the class.

In [92]:
# For each query name keep only the suggested (word, score) pairs whose word
# is itself one of the names in msba.
dc = {
    query_name: [pair for pair in neighbours if pair[0] in msba]
    for query_name, neighbours in d.items()
}
            

### Results of exact matching

In [95]:
# Suggestions for 'Mayank' that exactly match a name in msba.
dc['Mayank']

[('Gaurav', 0.7826263904571533),
 ('Nikhil', 0.7639487385749817),
 ('Deepak', 0.7515873908996582),
 ('Saurabh', 0.7394168376922607)]

In [96]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [106]:
# Minimum fuzz.ratio score (0-100 scale) for a suggested word to count as a
# fuzzy match for a class name.
FUZZY_THRESHOLD = 80

# For each query name keep the suggested (word, score) pairs whose word
# fuzzily matches at least one name in msba.
fdc = {}
for key, value in d.items():
    temp = []
    for matched_student in value:
        # any() records a suggestion at most once. Appending inside a loop over
        # msba would duplicate a suggestion that resembles several class names
        # (the list contains near-duplicates such as 'Astha' and 'Aastha').
        if any(fuzz.ratio(matched_student[0], s) > FUZZY_THRESHOLD for s in msba):
            temp.append(matched_student)
    fdc[key] = temp
            

### Results of fuzzy matching

In [108]:
# Suggestions for 'Rachel' that were kept by the fuzzy filter.
fdc['Rachel']

[('Rachael', 0.8424904346466064), ('Sarah', 0.8346818685531616)]

In [116]:
import pandas as pd
# One row per query name (orient="index"); each column holds one retained
# (word, similarity) tuple, so names with fewer matches have empty cells.
temp = pd.DataFrame.from_dict(fdc, orient="index")
temp.head()

Unnamed: 0,0,1,2,3
John,,,,
Prathamesh,"(Pritesh, 0.7546816468238831)","(Sanket, 0.7507660984992981)",,
Wenhui,,,,
Yilin,"(Yiliang, 0.5568621754646301)",,,
Bryce,"(Kyle, 0.5833208560943604)",,,


### Export data for R visualization

In [117]:
# Write the fuzzy-match table to fuzzyRaw.csv; the path is relative, so the
# file is created in the current working directory.
temp.to_csv("fuzzyRaw.csv")

## References

1. Gensim documentation: https://radimrehurek.com/gensim/models/keyedvectors.html