## Experimenting with pre-trained models
https://code.google.com/archive/p/word2vec/  
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md  

In [1]:
import time
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText
import pandas as pd

In [2]:
import multiprocessing
import psutil
# Report the machine resources available to this kernel: CPU count, then memory statistics.
cpu_total = multiprocessing.cpu_count()
print(cpu_total)
memory_stats = psutil.virtual_memory()
print(memory_stats)

28
svmem(total=134471933952, available=123613245440, percent=8.1, used=6534287360, free=123541958656, active=8167022592, inactive=261148672, buffers=0, cached=4395687936, shared=3649470464, slab=944652288)


#### Load Google Model

In [3]:
# Shell escape: list the Google News vector file (long format) to show that it exists and how large it is.
!ls -l /project/msca/kadochnikov/wordvec/GoogleNews-vectors-negative300.bin

-rwxr-xr-x 1 kadochnikov kadochnikov 3644258522 Mar  5  2015 /project/msca/kadochnikov/wordvec/GoogleNews-vectors-negative300.bin


In [4]:
# Load the Google News vectors (binary word2vec format) into `model_google`
# and report how long the load took.
# perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock intended for measuring elapsed intervals; time() can
# jump if the system clock is adjusted during the load.
start_time = time.perf_counter()

model_google = KeyedVectors.load_word2vec_format('/project/msca/kadochnikov/wordvec/GoogleNews-vectors-negative300.bin', binary=True)

print("Load time {} seconds".format(time.perf_counter() - start_time))

Load time 33.940810441970825 seconds


In [5]:
# Fetch the vector stored for 'university' and show its shape followed by
# its first ten components.
university = model_google['university']
for view in (university.shape, university[:10]):
    print(view)

(300,)
[-0.15234375 -0.11669922  0.17089844  0.2890625   0.26953125 -0.0859375
  0.16992188 -0.12304688 -0.03112793 -0.1875    ]


In [6]:
# Vocabulary check: print the vector shape when the word is known to the
# Google model, otherwise report it as out of dictionary.
word = 'chicago'
if word not in model_google:
    print('{0} is an out of dictionary word'.format(word))
else:
    print(model_google[word].shape)

(300,)


In [7]:
# Vocabulary check: print the vector shape when the word is known to the
# Google model, otherwise report it as out of dictionary.
word = 'blockchain'
if word not in model_google:
    print('{0} is an out of dictionary word'.format(word))
else:
    print(model_google[word].shape)

blockchain is an out of dictionary word


#### What are the most semantically similar words?  Do they have the same meaning?

In [8]:
# Ten nearest neighbours of 'good' in the Google model, shown as a table.
neighbours = model_google.most_similar(positive=['good'], topn=10)
df = pd.DataFrame(neighbours, columns=['Word', 'Similarity'])
df

Unnamed: 0,Word,Similarity
0,great,0.729151
1,bad,0.719005
2,terrific,0.688912
3,decent,0.683735
4,nice,0.683609
5,excellent,0.644293
6,fantastic,0.640778
7,better,0.612073
8,solid,0.580603
9,lousy,0.57642


In [9]:
# Ten nearest neighbours of 'tea' in the Google model, shown as a table.
neighbours = model_google.most_similar(positive=['tea'], topn=10)
df = pd.DataFrame(neighbours, columns=['Word', 'Similarity'])
df

Unnamed: 0,Word,Similarity
0,Tea,0.700904
1,teas,0.672738
2,shape_Angius,0.632348
3,activist_Jamie_Radtke,0.586386
4,decaffeinated_brew,0.583954
5,planter_bungalow,0.575829
6,herbal_tea,0.573117
7,coffee,0.563529
8,jasmine_tea,0.548339
9,Tea_NASDAQ_PEET,0.540254


In [10]:
# Query the Google model with 'good' as the positive term and 'bad' as the
# negative term; tabulate the ten best matches.
neighbours = model_google.most_similar(
    positive=['good'],
    negative=['bad'],
    topn=10,
)
df = pd.DataFrame(neighbours, columns=['Word', 'Similarity'])
df

Unnamed: 0,Word,Similarity
0,excellent,0.461346
1,great,0.448569
2,terrific,0.410607
3,nice,0.378358
4,fantastic,0.37617
5,decent,0.37292
6,solid,0.370124
7,wonderful,0.362087
8,tremendous,0.361636
9,always_prided_ourselves,0.353962


In [28]:
# Ten nearest neighbours of 'apple' in the Google model, shown as a table.
neighbours = model_google.most_similar(positive=['apple'], topn=10)
df = pd.DataFrame(neighbours, columns=['Word', 'Similarity'])
df

Unnamed: 0,Word,Similarity
0,apples,0.72036
1,pear,0.64507
2,fruit,0.641015
3,berry,0.630229
4,pears,0.613396
5,strawberry,0.605826
6,peach,0.602587
7,potato,0.596093
8,grape,0.593586
9,blueberry,0.586667


In [12]:
# Query the Google model with 'apple' as the positive term and 'fruit' as the
# negative term; tabulate the ten best matches.
neighbours = model_google.most_similar(
    positive=['apple'],
    negative=['fruit'],
    topn=10,
)
df = pd.DataFrame(neighbours, columns=['Word', 'Similarity'])
df

Unnamed: 0,Word,Similarity
0,Apple,0.333128
1,Appleâ_€_™,0.321516
2,Ipod,0.317913
3,designer_Jonathan_Ive,0.31395
4,ipod,0.30566
5,ipod_nano,0.305071
6,ipod_touch,0.303974
7,i_Pod,0.29602
8,asp,0.293945
9,iPod,0.293453


In [13]:
# Analogy-style query on the Google model: 'tea' and 'water' are positive
# terms, 'coffee' is the negative term; tabulate the ten best matches.
neighbours = model_google.most_similar(
    positive=['tea', 'water'],
    negative=['coffee'],
    topn=10,
)
df = pd.DataFrame(neighbours, columns=['Word', 'Similarity'])
df

Unnamed: 0,Word,Similarity
0,Water,0.491065
1,sewage_effluent,0.481098
2,Floridan_aquifer,0.478459
3,Hirakud_reservoir,0.47374
4,Cwellyn,0.47241
5,groundwater,0.470844
6,brackish_groundwater,0.469435
7,sullage,0.468796
8,lake,0.466999
9,Veeranam_lake,0.465283


In [14]:
# Analogy-style query on the Google model: 'money' and 'dollars' are positive
# terms, 'cash' is the negative term; tabulate the ten best matches.
neighbours = model_google.most_similar(
    positive=['money', 'dollars'],
    negative=['cash'],
    topn=10,
)
df = pd.DataFrame(neighbours, columns=['Word', 'Similarity'])
df

Unnamed: 0,Word,Similarity
0,taxpayer_dollars,0.571842
1,thousand_dollars,0.549534
2,monies,0.548971
3,millons,0.536003
4,gazillion_dollars,0.528186
5,billions,0.520503
6,moneys,0.508059
7,millions,0.502323
8,billons,0.500004
9,dime,0.492584


In [15]:
# Analogy-style query on the Google model: 'mercedes_benz' and 'bmw' are
# positive terms, 'ford' is the negative term.
# The columns are labelled explicitly so this table reads like the other
# most_similar tables in the notebook instead of showing bare 0 / 1 headers.
df = pd.DataFrame(
    model_google.most_similar(positive=['mercedes_benz', 'bmw'], negative=['ford'], topn=10),
    columns=['Word', 'Similarity'],
)
df

Unnamed: 0,0,1
0,Spyker_C8_Aileron_Spyder,0.526682
1,benz,0.519646
2,spyder,0.515456
3,carrera,0.514674
4,hyundai,0.513636
5,SuperVeloce,0.51006
6,Pagani_Zonda,0.505662
7,Biturbo,0.505394
8,Maybach_##S,0.502249
9,renault,0.499352


In [16]:
# Ask the Google model which of the listed words does not fit with the rest.
candidates = "helsinki stockholm munich chicago".split()
print(model_google.doesnt_match(candidates))

helsinki


In [17]:
# Similarity score between the lowercase tokens 'chicago' and 'university' (Google model).
similarity_score = model_google.similarity('chicago', 'university')
print(similarity_score)

0.024541648918563645


In [32]:
# Same comparison with capitalised tokens; 'Chicago' and 'University' are
# looked up as written, separately from their lowercase forms.
similarity_score = model_google.similarity('Chicago', 'University')
print(similarity_score)

0.14200904556124422


In [18]:
# Similarity score between 'university' and 'college' (Google model).
similarity_score = model_google.similarity('university', 'college')
print(similarity_score)

0.638526942739184


#### Load fastText Wiki Model

In [19]:
# Shell escape: list the fastText wiki.en.vec file (long format) to show that it exists and how large it is.
!ls -l /project/msca/kadochnikov/wordvec/wiki.en.vec

-rwxr-xr-x 1 kadochnikov kadochnikov 6597238061 Sep 19  2016 /project/msca/kadochnikov/wordvec/wiki.en.vec


In [20]:
# Load the fastText Wikipedia vectors from the text-format .vec file into
# `model_fb` and report how long the load took.
# perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock intended for measuring elapsed intervals; time() can
# jump if the system clock is adjusted during this long-running load.
start_time = time.perf_counter()

# Alternative kept for reference only (not executed): load the .bin file
# through the FastText wrapper instead of reading the .vec text file.
#model = FastText.load_fasttext_format('/project/msca/kadochnikov/wordvec/wiki.en.bin')
model_fb = KeyedVectors.load_word2vec_format('/project/msca/kadochnikov/wordvec/wiki.en.vec', binary=False)

print("Load time {} seconds".format(time.perf_counter() - start_time))

Load time 510.0804805755615 seconds


In [21]:
# Fetch the fastText vector stored for 'university' and show its shape
# followed by its first ten components.
university = model_fb['university']
for view in (university.shape, university[:10]):
    print(view)

(300,)
[ 0.041453 -0.17245  -0.11003   0.13728  -0.21607  -0.18947   0.020979
 -0.079464  0.25171   0.34705 ]


In [22]:
# Vocabulary check: print the vector shape when the word is known to the
# fastText model, otherwise report it as out of dictionary.
word = 'chicago'
if word not in model_fb:
    print('{0} is an out of dictionary word'.format(word))
else:
    print(model_fb[word].shape)

(300,)


In [23]:
# Vocabulary check: print the vector shape when the word is known to the
# fastText model, otherwise report it as out of dictionary.
word = 'blockchain'
if word not in model_fb:
    print('{0} is an out of dictionary word'.format(word))
else:
    print(model_fb[word].shape)

(300,)


In [35]:
# Analogy-style query on the fastText model: 'university' and 'school' are
# positive terms, 'hospital' is the negative term.
# topn is spelled out (10 is the library default) and the columns are
# labelled so the table reads like the other most_similar tables in the
# notebook instead of showing bare 0 / 1 headers.
df = pd.DataFrame(
    model_fb.most_similar(positive=['university', 'school'], negative=['hospital'], topn=10),
    columns=['Word', 'Similarity'],
)
df

Unnamed: 0,0,1
0,"school,university",0.667792
1,university—as,0.663809
2,university–,0.6636
3,"university,harvard",0.66326
4,university/school,0.661247
5,universitygraduate,0.659672
6,universityteachers,0.658358
7,university—is,0.655011
8,university—in,0.654828
9,university†,0.654746


In [29]:
# NOTE(review): this cell issues the same query as the cell directly before
# it; consider removing one of the two.
# Analogy-style query on the fastText model: 'university' and 'school' are
# positive terms, 'hospital' is the negative term.
# topn is spelled out (10 is the library default) and the columns are
# labelled so the table does not show bare 0 / 1 headers.
df = pd.DataFrame(
    model_fb.most_similar(positive=['university', 'school'], negative=['hospital'], topn=10),
    columns=['Word', 'Similarity'],
)
df

Unnamed: 0,0,1
0,"school,university",0.667792
1,university—as,0.663809
2,university–,0.6636
3,"university,harvard",0.66326
4,university/school,0.661247
5,universitygraduate,0.659672
6,universityteachers,0.658358
7,university—is,0.655011
8,university—in,0.654828
9,university†,0.654746


In [33]:
# Analogy-style query on the fastText model: 'tea' and 'water' are positive
# terms, 'coffee' is the negative term; tabulate the ten best matches.
neighbours = model_fb.most_similar(
    positive=['tea', 'water'],
    negative=['coffee'],
    topn=10,
)
df = pd.DataFrame(neighbours, columns=['Word', 'Similarity'])
df

Unnamed: 0,Word,Similarity
0,water—,0.588629
1,seawater,0.57005
2,",water",0.533633
3,potable,0.527245
4,"water,and",0.524927
5,"water,",0.524887
6,ρwater,0.513948
7,—water,0.512262
8,pond,0.510805
9,seawater—in,0.505546


In [38]:
# Analogy-style query on the fastText model: 'mercedes' and 'bmw' are
# positive terms, 'ford' is the negative term.
# The columns are labelled explicitly so this table reads like the other
# most_similar tables in the notebook instead of showing bare 0 / 1 headers.
df = pd.DataFrame(
    model_fb.most_similar(positive=['mercedes', 'bmw'], negative=['ford'], topn=10),
    columns=['Word', 'Similarity'],
)
df

Unnamed: 0,0,1
0,benz,0.664906
1,porsche,0.650926
2,",porsche",0.647168
3,porsches,0.613271
4,mercedesz,0.59939
5,emercedesbenz,0.590513
6,mclaren–mercedes,0.587076
7,—mercedes,0.584988
8,audi,0.581014
9,cupporsche,0.579812


In [39]:
# Analogy-style query on the fastText model: 'money' and 'dollars' are
# positive terms, 'cash' is the negative term; tabulate the ten best matches.
neighbours = model_fb.most_similar(
    positive=['money', 'dollars'],
    negative=['cash'],
    topn=10,
)
df = pd.DataFrame(neighbours, columns=['Word', 'Similarity'])
df

Unnamed: 0,Word,Similarity
0,dollar,0.584335
1,$,0.574529
2,funds,0.573663
3,billions,0.561195
4,monies,0.557003
5,"dollars,",0.555354
6,million—and,0.554647
7,dollars—a,0.551543
8,us$,0.547243
9,$us,0.540166


In [25]:
# Ask the fastText model which of the listed words does not fit with the rest.
candidates = "london helsinki paris france japan".split()
print(model_fb.doesnt_match(candidates))

japan


In [26]:
# Similarity score between 'chicago' and 'university' (fastText model).
similarity_score = model_fb.similarity('chicago', 'university')
print(similarity_score)

0.32988367786441386


In [27]:
# Similarity score between 'university' and 'college' (fastText model).
similarity_score = model_fb.similarity('university', 'college')
print(similarity_score)

0.6745003706983639
