In [77]:
%matplotlib inline
import pandas as pd
import numpy as np
import spacy
from spacy.en import English
from nltk import sent_tokenize
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from scipy import spatial
from scipy.stats.stats import pearsonr
import seaborn as sns
import random
import time
import re

In [49]:
# Scratch check: run spaCy's tokenizer on `word` and display the lower-cased
# text of the result (the recorded output for this cell was u'multicolour').
# NOTE(review): neither `nlp` nor `word` is defined above this cell in the
# notebook, so it only runs with state already present in the kernel.
temp = nlp.tokenizer(word)
temp.text.lower()

u'multicolour'

In [50]:
def w2vsim(text, vector, spacy_mod, w2v_mod):
    """Cosine similarity between a word's embedding and a reference vector.

    Parameters
    ----------
    text : str or bytes
        The word to look up. Byte strings are decoded as UTF-8 first;
        text that is already decoded is used as-is.
    vector : array-like
        Reference vector the word's embedding is compared against.
    spacy_mod : object
        Must provide ``tokenizer(text)`` returning an object with a ``.text``
        attribute (the notebook passes the loaded spaCy pipeline).
    w2v_mod : object
        Embedding model providing ``vocab`` (supports ``in``) and
        ``word_vec(word)``.

    Returns
    -------
    float or int
        ``1 - cosine_distance(vector, embedding)`` when the lower-cased
        tokenizer text is in ``w2v_mod.vocab``; ``0`` otherwise.
    """
    # Only byte strings need decoding. Calling .decode() on already-decoded
    # text fails for non-ASCII input on Python 2 and for every str on Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf8')

    key = spacy_mod.tokenizer(text).text.lower()
    if key not in w2v_mod.vocab:
        return 0

    # Use the model that was passed in rather than the notebook-global `model`,
    # so the vocabulary check and the vector lookup always hit the same object.
    text_vec = w2v_mod.word_vec(key)
    return 1 - spatial.distance.cosine(vector, text_vec)

In [3]:
# Load spaCy's 'en' model. Later cells pass `nlp` to w2vsim as `spacy_mod`,
# where only its tokenizer is used.
nlp = spacy.load('en')

In [4]:
# Load pre-trained word2vec vectors from a gzipped binary file (binary=True).
# `model` is the vector store later cells query via word_vec() and index2word.
model = KeyedVectors.load_word2vec_format('../../w2v/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [34]:
# Concept vectors: each is the element-wise mean (axis=0) of the embeddings of
# a few seed words, giving one reference vector per concept.
time_vec = np.mean([model.word_vec('time'), model.word_vec('future'),
                    model.word_vec('past')], axis=0)
people_vec = np.mean([model.word_vec('people'), model.word_vec('friends'),
                      model.word_vec('family')], axis=0)
# No trailing comma here: with one, happy_vec became a 1-tuple `(array,)`
# instead of an array like the other concept vectors.
happy_vec = np.mean([model.word_vec('happy'), model.word_vec('joy')], axis=0)
prac_vec = np.mean([model.word_vec('practical'), model.word_vec('pragmatic'),
                    model.word_vec('logical')], axis=0)

# Table of words to score; later cells read its `words`, `low`, `med` and
# `high` columns.
df = pd.read_csv('../output/word_topic_deflections.csv')

In [51]:
# Score every entry of the `words` column against each concept vector and
# store the similarity in a column named after the concept.
df['time'] = df['words'].apply(w2vsim, args=(time_vec, nlp, model))
df['people'] = df['words'].apply(w2vsim, args=(people_vec, nlp, model))
df['happy'] = df['words'].apply(w2vsim, args=(happy_vec, nlp, model))
df['prac'] = df['words'].apply(w2vsim, args=(prac_vec, nlp, model))

In [98]:
# Keep only vocabulary entries that consist solely of the letters a-z.
prog = re.compile(r'^[a-z]+$')
# Materialise the result as a list: on Python 3 a bare filter() is a one-shot
# iterator, so len() below and the repeated iteration in later cells would fail.
single_words_lower = list(filter(prog.search, model.index2word))
#sublist = random.sample(single_words_lower, 10000)
len(single_words_lower)

155060

In [101]:
# Result table: one row per filtered vocabulary word, with the three
# correlation columns initialised to the placeholder value 0.
df_results = pd.DataFrame(dict(
    words=single_words_lower,
    low_cor=0,
    med_cor=0,
    high_cor=0,
))

# Pre-sized accumulators, filled by position in the correlation loop.
low_cor = len(single_words_lower) * [0]
med_cor = len(single_words_lower) * [0]
high_cor = len(single_words_lower) * [0]


In [102]:
# For every filtered vocabulary word: score each row of df.words against that
# word's vector, then correlate the scores with the low / med / high columns.
#
# NOTE(review): every iteration re-runs w2vsim (tokenizer + vocabulary lookup)
# over all of df.words even though only `tempvec` changes; the recorded output
# shows roughly 14 s per 100 words. Caching the per-row vectors once outside
# the loop would remove most of that cost.
start = time.time()
for i, word in enumerate(single_words_lower):
    if not i % 100:
        # print() with one pre-formatted string behaves the same on Python 2
        # and 3; the old `print i` is a syntax error on Python 3 and
        # `print("...", x)` printed a tuple repr on Python 2.
        print(i)
        print("time elapsed: %s" % (time.time() - start))
    tempvec = model.word_vec(word)
    # Similarity of every row's word to the current vocabulary word.
    df['temp'] = df.words.apply(w2vsim, args=[tempvec, nlp, model])

    # pearsonr returns (r, p-value); only the coefficient is kept.
    low_cor[i] = pearsonr(df.temp, df.low)[0]
    med_cor[i] = pearsonr(df.temp, df.med)[0]
    high_cor[i] = pearsonr(df.temp, df.high)[0]

print("total time taken this loop: %s" % (time.time() - start))
# Replace the placeholder columns with the computed coefficients.
df_results['low_cor'] = low_cor
df_results['med_cor'] = med_cor
df_results['high_cor'] = high_cor

0
('time elapsed: ', 0.00028896331787109375)
100
('time elapsed: ', 14.01452112197876)
200
('time elapsed: ', 28.03955101966858)
300
('time elapsed: ', 42.04698395729065)
400
('time elapsed: ', 56.29463505744934)
500
('time elapsed: ', 71.01426005363464)
600
('time elapsed: ', 86.37652015686035)
700
('time elapsed: ', 101.31619811058044)
800
('time elapsed: ', 116.02182698249817)
900
('time elapsed: ', 130.29178714752197)
1000
('time elapsed: ', 144.77281403541565)
1100
('time elapsed: ', 159.2457311153412)
1200
('time elapsed: ', 173.73695611953735)
1300
('time elapsed: ', 188.0438630580902)
1400
('time elapsed: ', 202.76484203338623)
1500
('time elapsed: ', 217.51856303215027)
1600
('time elapsed: ', 231.9145610332489)
1700
('time elapsed: ', 246.91693902015686)
1800
('time elapsed: ', 261.59019207954407)
1900
('time elapsed: ', 276.08975100517273)
2000
('time elapsed: ', 290.66915798187256)
2100
('time elapsed: ', 305.4955439567566)
2200
('time elapsed: ', 320.02139496803284)
2300
(

In [103]:
# Signed gap between the high and low correlations, and its magnitude.
df_results['low_high_diff'] = df_results['high_cor'] - df_results['low_cor']
df_results['abs_diff'] = df_results['low_high_diff'].abs()

In [105]:
# Display the results ordered from the largest to the smallest high-minus-low gap.
df_results.sort_values(by='low_high_diff', ascending=False)

Unnamed: 0,high_cor,low_cor,med_cor,words,low_high_diff,abs_diff
39422,-0.006476,-0.074554,-0.047791,hilltops,0.068078,0.068078
28166,0.025129,-0.041574,-0.000859,vantage,0.066703,0.066703
119987,-0.066528,-0.132473,-0.111336,radiotelescope,0.065944,0.065944
32739,-0.052409,-0.118192,-0.100154,caverns,0.065782,0.065782
82284,0.054068,-0.010805,0.021771,geopark,0.064873,0.064873
91505,-0.070230,-0.133845,-0.079271,xinhuanet,0.063615,0.063615
80828,-0.015649,-0.079127,-0.042072,dedicatory,0.063479,0.063479
143317,-0.023396,-0.086667,-0.049978,swissnex,0.063271,0.063271
84043,-0.066127,-0.129045,-0.096340,salars,0.062917,0.062917
84309,0.045974,-0.016706,0.003619,geochemists,0.062680,0.062680


In [106]:
# Persist the correlation table (including the derived difference columns
# added above) to CSV using to_csv's default options.
df_results.to_csv('../output/word2vec_correlations.csv')

The code below calculates correlations between every word in the word2vec model and the STM coefficients.

In [None]:
# Same correlation computation as the filtered run, but over the model's
# entire vocabulary (model.index2word) with no regex filtering.
df_results = pd.DataFrame({'words':model.index2word,
                          'low_cor':0,
                          'med_cor':0,
                          'high_cor':0})
# Pre-sized accumulators, filled by position in the loop below.
low_cor = [0]*len(model.index2word)
med_cor = [0]*len(model.index2word)
high_cor = [0]*len(model.index2word)
for i, word in enumerate(model.index2word):
    tempvec = model.word_vec(word)
    # Similarity of every row's word to the current vocabulary word.
    df['temp'] = df.words.apply(w2vsim, args=[tempvec, nlp, model])

    # pearsonr returns (r, p-value); only the coefficient is kept.
    low_cor[i] = pearsonr(df.temp, df.low)[0]
    med_cor[i] = pearsonr(df.temp, df.med)[0]
    high_cor[i] = pearsonr(df.temp, df.high)[0]

# Store the computed coefficients; without this df_results would keep the
# placeholder zeros it was created with.
df_results['low_cor'] = low_cor
df_results['med_cor'] = med_cor
df_results['high_cor'] = high_cor


In [149]:
# Scratch cell: rebuilds `low_cor` as a list of zeros, one per vocabulary entry.
# NOTE(review): this overwrites any correlations already stored in `low_cor`.
low_cor = [0]*len(model.index2word)

In [150]:
# Scratch check that assigning into the pre-sized list by index works.
# NOTE(review): leaves a dummy value of 2 at position 7 of `low_cor`.
low_cor[7] = 2

In [152]:
# Spot check: Pearson coefficient between the current `temp` similarity column
# and the `low` column ([0] selects r from pearsonr's (r, p-value) result).
pearsonr(df['temp'], df['low'])[0]

0.12788015531178859