In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
import pickle

import matplotlib.pyplot as plt
import seaborn as sns 

%matplotlib inline

import csv

from tqdm import tqdm

In [2]:
# Upper bound on the number of most-frequent words kept in the vocabulary.
vocabSize = 1000000

# Use a context manager so the pickle file is closed as soon as it is read
# (the bare open(...) previously left the handle to the garbage collector).
with open('wordCount', 'rb') as wordCountFile:
    wordCount = pickle.load(wordCountFile)

# presumably a collections.Counter: most_common yields (word, count) entries,
# most frequent first -- TODO confirm the pickled type.
vocab = wordCount.most_common(vocabSize)

# Position in the frequency ranking -> word.
id_to_word = {i: entry[0] for i, entry in enumerate(vocab)}

# Inverse lookup: word -> position in the frequency ranking.
word_to_id = {word: idx for idx, word in id_to_word.items()}

In [3]:
# Load both pickled embedding tables; the context managers close each file
# right after reading instead of leaking the handles.
with open('focal_embed_30', 'rb') as focalFile:
    focal = pickle.load(focalFile)
with open('context_embed_30', 'rb') as contextFile:
    context = pickle.load(contextFile)


In [19]:
def getSimilarity(word1, word2, cat = None):
    """Compare two words under three views of the embeddings.

    Both words are looked up in ``word_to_id``; their rows are then compared
    in the ``focal`` table, in the ``context`` table, and after adding the
    context and focal rows together.

    Parameters
    ----------
    word1, word2 : keys of ``word_to_id``.
    cat : accepted but not used by this function.

    Returns
    -------
    tuple
        Three ``cosine_similarity`` results, in the order
        (focal, context, context + focal).

    Raises
    ------
    KeyError
        If either word is missing from ``word_to_id``.
    """
    row_a, row_b = word_to_id[word1], word_to_id[word2]

    # cosine_similarity works on 2-D inputs, so turn each row into shape (1, dim).
    focal_a, focal_b = (focal[row].reshape(1, -1) for row in (row_a, row_b))
    context_a, context_b = (context[row].reshape(1, -1) for row in (row_a, row_b))

    combined_a = context_a + focal_a
    combined_b = context_b + focal_b

    return (
        cosine_similarity(focal_a, focal_b),
        cosine_similarity(context_a, context_b),
        cosine_similarity(combined_a, combined_b),
    )
    
    
    
    

In [20]:
# 'happy' vs 'sad': returns (focal, context, focal + context) cosine similarities.
getSimilarity('happy','sad')

(array([[0.5138645]], dtype=float32),
 array([[0.6096065]], dtype=float32),
 array([[0.6189511]], dtype=float32))

In [21]:
# 'happy' vs 'joy': returns (focal, context, focal + context) cosine similarities.
getSimilarity('happy','joy')

(array([[0.6887676]], dtype=float32),
 array([[0.5940156]], dtype=float32),
 array([[0.6868048]], dtype=float32))

In [23]:
# 'glad' vs 'happy': returns (focal, context, focal + context) cosine similarities.
getSimilarity('glad','happy')

(array([[0.37813732]], dtype=float32),
 array([[0.409115]], dtype=float32),
 array([[0.43154302]], dtype=float32))

In [29]:
# 'trump' vs 'joy': returns (focal, context, focal + context) cosine similarities.
getSimilarity('trump','joy')

(array([[0.15965402]], dtype=float32),
 array([[0.05595398]], dtype=float32),
 array([[0.07510523]], dtype=float32))

In [30]:
# 'trump' vs 'angry': returns (focal, context, focal + context) cosine similarities.
getSimilarity('trump','angry')

(array([[0.16862899]], dtype=float32),
 array([[0.02129865]], dtype=float32),
 array([[0.07248678]], dtype=float32))

In [31]:
# 'trump' vs 'sad': returns (focal, context, focal + context) cosine similarities.
getSimilarity('trump','sad')

(array([[0.00309093]], dtype=float32),
 array([[-0.02934984]], dtype=float32),
 array([[-0.0651384]], dtype=float32))

In [34]:
# NOTE(review): `a` is not assigned in any cell above this one, so this line
# depends on leftover kernel state and will fail on Restart & Run All.
a.indices

array([     0,      1,      2, ..., 999990, 999993, 999997], dtype=int32)

In [8]:
# After 20 iterations

In [9]:
# 'happy' vs 'sad': returns (focal, context, focal + context) cosine similarities.
getSimilarity('happy','sad')

(array([[0.21809852]], dtype=float32),
 array([[0.44713795]], dtype=float32),
 array([[0.5153621]], dtype=float32))

In [10]:
# 'happy' vs 'joy': returns (focal, context, focal + context) cosine similarities.
getSimilarity('happy','joy')

(array([[0.46207908]], dtype=float32),
 array([[0.45473552]], dtype=float32),
 array([[0.58758694]], dtype=float32))

In [11]:
# 'glad' vs 'joy': returns (focal, context, focal + context) cosine similarities.
getSimilarity('glad','joy')

(array([[0.15788756]], dtype=float32),
 array([[0.05948482]], dtype=float32),
 array([[0.06841627]], dtype=float32))

In [12]:
# 'trump' vs 'joy': returns (focal, context, focal + context) cosine similarities.
getSimilarity('trump','joy')

(array([[0.2006445]], dtype=float32),
 array([[0.10079014]], dtype=float32),
 array([[0.11198172]], dtype=float32))

In [13]:
# 'trump' vs 'angry': returns (focal, context, focal + context) cosine similarities.
getSimilarity('trump','angry')

(array([[0.16391163]], dtype=float32),
 array([[0.16077389]], dtype=float32),
 array([[0.18015333]], dtype=float32))

In [14]:
# 'trump' vs 'sad': returns (focal, context, focal + context) cosine similarities.
getSimilarity('trump','sad')

(array([[0.10551284]], dtype=float32),
 array([[0.11864118]], dtype=float32),
 array([[0.08880009]], dtype=float32))

### Output TSNE vectors

In [10]:
# Read the word list: skip the first (header) line and keep the first
# tab-separated field of every remaining line.
with open('financeWordList.txt') as f:
    # Strip only the newline. The previous x[:-1] slice removed the last
    # character unconditionally, which truncates the final word when the file
    # does not end with a newline.
    wordList = [line.rstrip("\n").split("\t")[0] for line in f.readlines()[1:]]

In [11]:
# Display the words parsed from financeWordList.txt.
wordList

['money',
 'currency',
 'finance',
 'market',
 'grocery',
 'funds',
 'shares',
 'mutual',
 'deposit',
 'stock',
 'stocks',
 'investor',
 'investment',
 'trade',
 'trades',
 'bond',
 'strength',
 'join',
 'food',
 'farm',
 'capital',
 'territory',
 'interest',
 'like',
 'attention',
 'notice']

In [12]:
def getVec(word, types = 'focal'):
    """Return the embedding row stored for ``word``.

    Parameters
    ----------
    word : key of ``word_to_id``.
    types : which table to read. ``'focal'`` (default) returns the row from
        ``focal``, ``'context'`` the row from ``context``; any other value
        returns the sum of the focal and context rows.

    Raises
    ------
    KeyError
        If ``word`` is not present in ``word_to_id``.
    """
    row = word_to_id[word]

    if types == 'context':
        return context[row]
    if types != 'focal':
        # Every value other than 'focal' / 'context' selects the combined vector.
        return focal[row] + context[row]
    return focal[row]

In [13]:
# One vector per word, in wordList order; getVec's default selects the focal table.
vecList = [getVec(word) for word in wordList]



In [14]:
# Convert the collected per-word vectors into a single NumPy array.
vecList = np.array(vecList)

In [15]:
# DataFrame.to_csv returns None when given a path, so the old one-liner left
# `df` bound to None. Keep the frame in `df` and write it in a separate step.
df = pd.DataFrame(vecList)

# Tab-separated, unquoted, with neither a header row nor an index column.
df.to_csv('vecList.tsv', sep='\t', quoting=csv.QUOTE_NONE, index=False, header=False)

In [33]:
# Preview the vectors rounded to three decimals.
np.round(vecList,3)

array([[-0.362, -0.164,  0.166, ...,  0.172, -0.093, -0.059],
       [-0.026,  0.284,  0.215, ...,  0.296,  0.396, -0.011],
       [-0.007, -0.071, -0.14 , ...,  0.086,  0.273,  0.103],
       ...,
       [-0.277,  0.19 , -0.096, ..., -0.007, -0.039, -0.34 ],
       [-0.417,  0.161, -0.126, ..., -0.074,  0.073, -0.185],
       [-0.219,  0.256, -0.11 , ..., -0.058,  0.177, -0.488]],
      dtype=float32)

### Creating plots

In [2]:
# Load output_15.csv into a DataFrame.
a = pd.read_csv('output_15.csv')

In [8]:
# Rebind instead of dropping in place and tolerate the column already being
# gone, so re-running this cell no longer raises KeyError.
a = a.drop(columns='Unnamed: 0', errors='ignore')
# Label the rows with the column names (requires as many rows as columns).
a.index = list(a.columns)

plt.figure(figsize = (10,10))
sns.heatmap(a)

In [2]:
def createPlot(filename, output_dir="/home/santhosr/Documents/Courses/GloVe/Combined"):
    """Render the matrix stored in a CSV file as a heatmap and save it as a PNG.

    The CSV is read with pandas, its ``'Unnamed: 0'`` column is dropped, and
    the rows are labelled with the remaining column names (so the file must
    hold as many rows as it has remaining columns). The image is written to
    ``<output_dir>/output_<index>.png``, where ``<index>`` is the part of the
    file name between its last ``_`` and the following ``.``.

    Parameters
    ----------
    filename : path of the CSV file to plot.
    output_dir : directory that receives the PNG. Defaults to the directory
        this function originally had hard-coded.

    Raises
    ------
    KeyError
        If the CSV has no ``'Unnamed: 0'`` column.
    """
    import os  # local import keeps this cell self-contained

    # Parse the index from the file name only, so underscores or dots in a
    # parent directory name cannot leak into the output file name.
    index = os.path.basename(filename).split("_")[-1].split(".")[0]

    matrix = pd.read_csv(filename).drop(columns='Unnamed: 0')
    matrix.index = list(matrix.columns)

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(matrix, ax=ax)
    fig.savefig(os.path.join(output_dir, "output_" + str(index) + ".png"))

    # Close only the figure created here; plt.close('all') would also discard
    # unrelated figures the caller may still have open.
    plt.close(fig)
    
    

In [3]:
# Build the eleven input paths (output_0.csv ... output_10.csv) first, then plot each.
csv_paths = ['/home/santhosr/Documents/Courses/GloVe/Combined/output_'+str(i)+'.csv' for i in range(0,11)]
for csv_path in tqdm(csv_paths):
    createPlot(csv_path)

100%|██████████| 11/11 [00:03<00:00,  3.41it/s]


### Comparing Vectors

In [11]:
# Context manager closes the file as soon as the pickle has been read.
with open('/home/santhosr/Documents/Courses/GloVe/Combined/Outputs/run1/vectors_19','rb') as vectorFile:
    iter19 = pickle.load(vectorFile)

In [12]:
# Context manager closes the file as soon as the pickle has been read.
with open('/home/santhosr/Documents/Courses/GloVe/Combined//vectors_20','rb') as vectorFile:
    iter20 = pickle.load(vectorFile)

In [13]:
# Context manager closes the file as soon as the pickle has been read.
with open('/home/santhosr/Documents/Courses/GloVe/Combined//vectors_21','rb') as vectorFile:
    iter21 = pickle.load(vectorFile)

In [14]:
# Show element 0 of the object loaded from vectors_19.
iter19[0]

array([[ 0.21419736,  0.37608376,  0.38375359,  0.55822744,  0.02376448,
         0.0629039 , -0.08366993, -0.25640352,  0.21236808, -0.07452327,
         0.3042607 ,  0.3077659 , -0.04015953, -0.75448262, -0.35997035,
        -0.14508085, -0.18484113,  0.53941643,  0.13295973,  0.40860824,
        -0.69121893,  0.41388928,  0.37963962, -0.37031569, -0.03407085,
         0.59314984,  0.06913729, -0.03479349,  0.25663908,  0.50618451,
         0.10713476,  0.31670395, -0.27584387,  0.31545877,  0.2216125 ,
        -0.53790899,  0.30601954, -0.09109011, -0.32780648,  0.39573568,
        -0.01603103, -0.40824233, -0.41359364,  0.08731299,  0.32841773,
        -0.21984676, -0.54396458,  0.11803389, -0.26006599,  0.30366134,
         0.22414797,  0.48689513,  0.19134835, -0.13852205, -0.17894172,
        -0.09195593,  0.04966683,  0.2890392 ,  0.32669468,  0.65328522,
        -0.10516156, -0.62187181, -0.33977548, -0.15750885,  0.03561835,
         0.24738176, -0.00300699,  0.25272236,  0.1

In [15]:
# Show element 0 of the object loaded from vectors_20.
iter20[0]

array([[-0.03020226, -0.18530621, -0.18971471, -0.2589169 ,  0.28274411,
        -0.24403704, -0.33162827, -0.26382095,  0.12561689, -0.12654847,
        -0.04961572, -0.27798611, -0.0609039 ,  0.46745358,  0.07293451,
         0.31041258,  0.10407142,  0.20986297, -0.14915539,  0.33164221,
        -0.31419144, -0.2062289 ,  0.14006086, -0.02891152,  0.13109571,
         0.27819446, -0.1776404 , -0.39715696,  0.19983371,  0.18702947,
        -0.18598419,  0.23023425, -0.17426112,  0.29021978, -0.16879907,
        -0.34909583, -0.31028556, -0.23722834,  0.35916007, -0.22260562,
        -0.25227143,  0.2346476 , -0.10682473, -0.22974718,  0.35285042,
         0.08852486, -0.28123026,  0.32151736,  0.27008913, -0.34653887,
        -0.10317461,  0.31036792,  0.17692814,  0.27267828,  0.13474819,
         0.14502461,  0.20454682, -0.17702583,  0.06332033,  0.02229482,
         0.03104597,  0.05411408,  0.19806357, -0.29459979, -0.42457314,
        -0.10431104, -0.31915943,  0.20419395,  0.0

In [16]:
# Show element 0 of the object loaded from vectors_21.
iter21[0]

array([[-0.0685621 , -0.05850976, -0.16998207, -0.26953854,  0.30281827,
        -0.25321124, -0.34782091, -0.27567916,  0.07847436, -0.13978765,
        -0.03885137, -0.28610866, -0.00735985,  0.41507011, -0.06597665,
         0.36413546,  0.11330846,  0.19481676, -0.32095087,  0.34249495,
        -0.32704604, -0.19224487,  0.10695173, -0.01776277,  0.163946  ,
         0.1652856 , -0.19630462, -0.34827044,  0.20966117,  0.20717817,
        -0.25353543,  0.20580627, -0.12200643,  0.30008873, -0.065706  ,
        -0.46457951, -0.28842862, -0.33026315,  0.36897105, -0.22306559,
        -0.21274733,  0.25668202,  0.01086658, -0.22957616,  0.36457797,
         0.42016986, -0.17459922,  0.34223923,  0.28123349, -0.36363003,
        -0.11035915,  0.31851104,  0.08800693,  0.28750655,  0.10762462,
         0.16931823,  0.21557258, -0.18526806,  0.14716782, -0.140531  ,
         0.16222156,  0.01449658,  0.20578145, -0.31497209, -0.39278312,
        -0.24447244, -0.33252378,  0.17450174, -0.0