In [531]:
# File for downloading GloVe and using it as the predictors in a random forest model

In [461]:
import gensim.models
from gensim.models import KeyedVectors
from gensim.models import word2vec
from gensim.models import Word2Vec
import logging
import numpy as np
import pandas as pd
import math

In [462]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [465]:
# Read in data
dat = pd.read_excel('all_sentences.xlsx')
dat.head()

Unnamed: 0,label,words,words_clean
0.0,0,Forcing middle-class workers to bear a greater...,forcing middle class workers bear greater shar...
1.0,0,Because it would not be worthwhile to bring a ...,would worthwhile bring case arbitration clause...
2.0,0,"Indeed , Lind argues that high profits and hig...",indeed lind argues high profits high wages rei...
3.0,0,"In fairness , it should be noted that he devot...",fairness noted devotes entire chapter new york...
4.0,0,Psychological tactics are social control techn...,psychological tactics social control technique...


In [466]:
# Create dictionary of words in corpus
sentence = dat['words_clean'].iloc[0]
words = sentence.split(' ')

float(words.count(words[0]))/len(words)

word_dict = {}

for sentence in dat['words_clean']:
    for word in sentence.split(' '):
        if word not in word_dict:
            word_dict[word] = sum([1 for sentence in dat['words_clean'] if word in sentence.split(' ')]) 

In [476]:
# Sanity check
word_dict['forcing']

12

In [489]:
# Implementing IDF by hand
idf = []
for sentence in dat['words_clean']:   
    word_freq = []
    for word in sentence.split(' '):
        word_freq.append(word_dict[word])
    idf.append([math.log(dat.shape[0]/float(count)) for count in word_freq])

In [490]:
idf_dict = {}
for key,val in word_dict.iteritems():
    idf_dict[key] = math.log(dat.shape[0]/float(val))

In [491]:
idf_dict['forcing']

6.104048907855128

In [492]:
len(idf), dat.shape

(5372, (5372, 4))

In [493]:
# Add IDF values to dataframe
dat['idf'] = idf

In [494]:
dat.head()

Unnamed: 0,label,words,words_clean,idf
0.0,0,Forcing middle-class workers to bear a greater...,forcing middle class workers bear greater shar...,"[6.10404890786, 3.93499520749, 3.62611092738, ..."
1.0,0,Because it would not be worthwhile to bring a ...,would worthwhile bring case arbitration clause...,"[2.38237963092, 7.20266119652, 5.45346134171, ..."
2.0,0,"Indeed , Lind argues that high profits and hig...",indeed lind argues high profits high wages rei...,"[5.12321965484, 7.89580837708, 4.71775454674, ..."
3.0,0,"In fairness , it should be noted that he devot...",fairness noted devotes entire chapter new york...,"[6.64304540859, 5.37007973277, 7.89580837708, ..."
4.0,0,Psychological tactics are social control techn...,psychological tactics social control technique...,"[6.64304540859, 6.28637046465, 3.10001783149, ..."


In [495]:
word_freq = []
for word in sentence.split(' '):
    word_freq.append(word_dict[word])
    
print [math.log(dat.shape[0]/float(count)) for count in word_freq]
print word_freq
print [float(count)/sum(word_freq) for count in word_freq]

[7.4903432689750185, 8.588955557643128, 8.588955557643128, 7.895808377083183, 8.588955557643128, 6.797196088415073, 5.092447996176648]
[3, 1, 1, 2, 1, 6, 33]
[0.06382978723404255, 0.02127659574468085, 0.02127659574468085, 0.0425531914893617, 0.02127659574468085, 0.1276595744680851, 0.7021276595744681]


In [496]:
dat['words_clean'].iloc[0]

u'forcing middle class workers bear greater share cost government weakens support needed investments stirs resentment toward depend public services'

In [263]:
# MIght not need all this??

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [261]:
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(dat['words_clean'])

In [262]:
bag_of_words

<4326x14764 sparse matrix of type '<type 'numpy.int64'>'
	with 92621 stored elements in Compressed Sparse Row format>

In [264]:
tf_transformer = TfidfTransformer(use_idf=False).fit(bag_of_words)
X_train_tf = tf_transformer.transform(bag_of_words)
X_train_tf.shape

(4326, 14764)

In [272]:
X_train_tf[0,1000]

0.0

### GloVe Import, Testing, and Wrangling

In [497]:
# Import pretrained glove vectors
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)

In [498]:
# Testing vectors
glove_model.most_similar(positive=['king', 'woman'], negative=['man'], topn=5)

[(u'queen', 0.8523603677749634),
 (u'throne', 0.7664333581924438),
 (u'prince', 0.759214460849762),
 (u'daughter', 0.7473883032798767),
 (u'elizabeth', 0.7460220456123352)]

In [429]:
# Testing vectors
glove_model.most_similar('excited', topn=5)

[(u'thrilled', 0.8795695900917053),
 (u'amazed', 0.8213223218917847),
 (u'definitely', 0.8046097755432129),
 (u"'m", 0.802273690700531),
 (u'happy', 0.7983384132385254)]

In [499]:
# Testing vectors
glove_model.wv['daniel']

array([-0.43943   ,  0.29657999,  0.44867   ,  0.23591   ,  0.94338   ,
        0.32563999, -0.88607001,  0.10126   , -0.49053001,  0.047319  ,
        0.35266   ,  0.82309002, -0.72639   ,  0.47466001,  0.27928001,
       -0.27307999,  0.55792999, -1.24950004, -0.41683999,  0.26661   ,
        0.0283    ,  0.55105001,  0.19496   ,  0.077513  ,  0.079079  ,
       -1.03699994,  0.57209998, -1.04229999, -1.09500003, -0.073586  ,
        1.10239995, -0.85009998, -0.71897   , -0.66255999, -0.022977  ,
       -0.17501999, -0.18613   ,  0.24626   ,  1.7687    , -0.016052  ,
        0.15638   ,  1.12129998, -0.12678   , -1.13510001,  0.68115002,
        0.74198997,  0.1877    , -1.06570005,  0.43312001,  0.24698   ], dtype=float32)

In [500]:
word_vectors = glove_model.wv

In [501]:
type(word_vectors)

gensim.models.keyedvectors.KeyedVectors

In [504]:
# Create columns for avg_vec, sum_vec, and both of those weighted
avg_vec = np.full(dat.shape[0], None)
sum_vec = np.full(dat.shape[0], None)
weighted_avg_vec = np.full(dat.shape[0], None)
weighted_sum_vec = np.full(dat.shape[0], None)
for i, sentence in enumerate(dat['words_clean']):
    words = sentence.split(' ')
    #print i
    real_words = []
    real_weights = []
    for word in words:
        if word in glove_model.wv:
            real_words.append(word)
            real_weights.append(idf_dict[word])
            
    vecs = np.full(len(real_words), None)
    weighted_vecs = np.full(len(real_words), None)
    #print words
    for j, word in enumerate(real_words):
        vecs[j] = glove_model.wv[word]
        weighted_vecs[j] = glove_model.wv[word] * real_weights[j]
    
    weighted_sum_vec[i] = sum(weighted_vecs)
    weighted_avg_vec[i] = sum(weighted_vecs)/float(sum(real_weights))
    avg_vec[i] = sum(vecs)/len(vecs)
    sum_vec[i] = sum(vecs)
    
dat['sum_vec'] = sum_vec
dat['avg_vec'] = avg_vec
dat['weighted_avg_vec'] = weighted_avg_vec
dat['weighted_sum_vec'] = weighted_sum_vec

In [505]:
dat.head(n=20)

Unnamed: 0,label,words,words_clean,idf,sum_vec,avg_vec,weighted_avg_vec,weighted_sum_vec
0.0,0,Forcing middle-class workers to bear a greater...,forcing middle class workers bear greater shar...,"[6.10404890786, 3.93499520749, 3.62611092738, ...","[4.79607, 1.69907, 5.25044, -6.22599, 2.55119,...","[0.252425, 0.0894248, 0.276339, -0.327684, 0.1...","[0.223467, 0.0817972, 0.246648, -0.305011, 0.1...","[20.6355, 7.55337, 22.7761, -28.1655, 11.0272,..."
1.0,0,Because it would not be worthwhile to bring a ...,would worthwhile bring case arbitration clause...,"[2.38237963092, 7.20266119652, 5.45346134171, ...","[6.79759, -2.75716, -4.73693, -0.350068, 3.601...","[0.33988, -0.137858, -0.236846, -0.0175034, 0....","[0.30305, -0.172721, -0.297291, 0.00720224, 0....","[32.3558, -18.441, -31.7409, 0.768962, 14.779,..."
2.0,0,"Indeed , Lind argues that high profits and hig...",indeed lind argues high profits high wages rei...,"[5.12321965484, 7.89580837708, 4.71775454674, ...","[0.36504, -1.32171, 4.03973, -4.02884, 1.79523...","[0.02808, -0.10167, 0.310748, -0.309911, 0.138...","[0.075521, -0.154822, 0.298123, -0.273047, 0.1...","[5.07958, -10.4134, 20.0519, -18.3653, 10.9096..."
3.0,0,"In fairness , it should be noted that he devot...",fairness noted devotes entire chapter new york...,"[6.64304540859, 5.37007973277, 7.89580837708, ...","[-7.31102, 8.90912, -4.11219, 0.903822, 3.9656...","[-0.332319, 0.40496, -0.186918, 0.0410828, 0.1...","[-0.353876, 0.425656, -0.194207, 0.0904033, 0....","[-45.856, 55.1575, -25.1658, 11.7147, 21.3939,..."
4.0,0,Psychological tactics are social control techn...,psychological tactics social control technique...,"[6.64304540859, 6.28637046465, 3.10001783149, ...","[5.29807, -4.94703, 1.31191, -5.86548, -0.8557...","[0.33113, -0.30919, 0.0819944, -0.366592, -0.0...","[0.336981, -0.384413, 0.103796, -0.405972, -0....","[28.8706, -32.9343, 8.8926, -34.7813, -6.83236..."
5.0,0,The uncontrolled profit motive is destroying h...,uncontrolled profit motive destroying health i...,"[7.89580837708, 4.82775544195, 6.97951764521, ...","[7.68936, -5.6458, 3.04431, -3.57051, -0.75572...","[0.54924, -0.403271, 0.217451, -0.255037, -0.0...","[0.547838, -0.485027, 0.114258, -0.208554, -0....","[45.8133, -40.5607, 9.55493, -17.4404, -3.5667..."
6.0,0,Organizations representing the religious right...,organizations representing religious right loy...,"[5.29311869164, 6.97951764521, 5.15496835316, ...","[-3.04053, -1.52265, -1.02142, -5.80551, 6.230...","[-0.178855, -0.0895678, -0.0600837, -0.341501,...","[-0.297347, -0.128609, -0.128135, -0.286796, 0...","[-28.1698, -12.1841, -12.1391, -27.1702, 34.22..."
7.0,0,A market based on greed and fear has tugged on...,market based greed fear tugged worst things us...,"[3.19079285613, 3.80146381486, 5.22165972766, ...","[2.09188, -1.70208, 1.48798, -2.71342, 1.75557...","[0.209188, -0.170208, 0.148798, -0.271342, 0.1...","[0.166599, -0.212585, 0.107497, -0.374009, 0.1...","[8.01947, -10.233, 5.17453, -18.0034, 6.48401,..."
8.0,0,THE CONSERVATIVE MOVEMENT IS ROOTED IN A COHER...,conservative movement rooted coherent easy sum...,"[4.06716698059, 4.31228943863, 6.28637046465, ...","[0.686753, 0.993661, 1.78845, -6.54942, 9.9502...","[0.0274701, 0.0397465, 0.0715382, -0.261977, 0...","[0.0532822, 0.016406, 0.0275177, -0.294374, 0....","[6.61953, 2.0382, 3.41867, -36.5716, 50.5963, ..."
9.0,0,"By eliminating the private insurer , you could...",eliminating private insurer could save billion...,"[5.54443311992, 3.47696776929, 7.20266119652, ...","[12.2866, -3.33005, 12.0976, -5.19781, -1.5351...","[0.438809, -0.11893, 0.432055, -0.185636, -0.0...","[0.409654, -0.13104, 0.404075, -0.127847, -0.0...","[52.5201, -16.8002, 51.805, -16.3908, -7.03142..."


In [506]:
len(dat['idf'].iloc[0])

19

In [507]:
len(dat['avg_vec'].iloc[0])

50

In [508]:
# Export
dat.to_excel('dat_large.xlsx')

### Create Random Forest Model

In [509]:
type(dat['sum_vec'].iloc[0])

numpy.ndarray

In [510]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

In [511]:
def return_acc(probs, y_test, thresh):
    y_pred = []
    for row in probs:
        if row[1] > thresh:
            y_pred.append(1)
        else:
            y_pred.append(np.argmax(row))
    #print(len(y_pred), len(y_test))
    return y_pred, np.mean(np.array(y_pred) == y_test)

In [436]:
# Tuning neutral threshold
accuracy = [0]*50
for j in range(20):
    X_train, X_test, y_train, y_test = train_test_split(dat['weighted_avg_vec'], dat['label'], test_size=0.3, random_state=j)
    X_train_mat = np.matrix(X_train.values.tolist())
    X_test_mat = np.matrix(X_test.values.tolist())
    rf_model = ensemble.RandomForestClassifier(max_depth=200, n_estimators=50)
    rf_model.fit(X_train_mat, y_train) 
    threshes = np.arange(0.0, 0.5, 0.01)
    for i, thresh in enumerate(threshes):
        probs = rf_model.predict_proba(X_test_mat)
        accuracy[i] += return_acc(probs, y_test, thresh)[1]

new_acc = [acc/20 for acc in accuracy]
print(new_acc)
np.argmax(new_acc)

[0.14849768875192607, 0.14872881355932205, 0.1725346687211094, 0.17388289676425267, 0.21024653312788905, 0.21209553158705702, 0.26163328197226504, 0.26467642526964563, 0.31336671802773497, 0.31660246533127884, 0.36255778120184906, 0.36529275808936823, 0.40300462249614794, 0.40527734976887525, 0.43859784283513098, 0.44029275808936819, 0.46440677966101684, 0.46548536209553165, 0.4833975346687211, 0.48470724191063175, 0.49637904468412941, 0.49734206471494613, 0.50496918335901397, 0.50516178736517714, 0.50974576271186434, 0.5096687211093992, 0.51132511556240368, 0.51190292758089373, 0.51228813559322028, 0.51248073959938378, 0.51251926040061635, 0.51248073959938356, 0.51136363636363624, 0.51155624036979963, 0.51059322033898313, 0.51067026194144849, 0.51028505392912182, 0.51036209553158707, 0.51070878274268106, 0.51086286594761166, 0.51097842835130969, 0.51097842835130969, 0.51086286594761166, 0.51086286594761166, 0.51090138674884433, 0.51090138674884433, 0.51113251155624018, 0.5111325115562

30

In [445]:
# Tuning max_depth
depths = [1,2,3,5,10,30,50,70,100,200,300,500,800,1000,1500]
avg_acc = [None]*len(depths)
for i, depth in enumerate(depths):
    accuracy = [0]*20
    for j in range(len(accuracy)):
        X_train, X_test, y_train, y_test = train_test_split(dat['avg_vec'], dat['label'], test_size=0.3, random_state=j)
        X_train_mat = np.matrix(X_train.values.tolist())
        X_test_mat = np.matrix(X_test.values.tolist())
        rf_model = ensemble.RandomForestClassifier(max_depth=200, n_estimators=50)
        rf_model.fit(X_train_mat, y_train) 
        probs = rf_model.predict_proba(X_test_mat)
        accuracy[j] += return_acc(probs, y_test, 0.30)[1]
    avg_acc[i] = sum(accuracy)/len(accuracy)

In [442]:
print(avg_acc)
np.argmax(avg_acc)

[0.47214946070878272, 0.48328197226502312, 0.49345146379044691, 0.50358243451463802, 0.51409861325115569, 0.50651001540832064, 0.50878274268104784, 0.5098228043143298, 0.50531587057010785, 0.51086286594761177, 0.51197996918335886, 0.513251155624037, 0.50805084745762719, 0.50354391371340523, 0.51078582434514641]


0.50531587057010785

In [514]:
X_train, X_test, y_train, y_test = train_test_split(dat['weighted_avg_vec'], dat['label'], test_size=0.3, random_state=j)
X_train_mat = np.matrix(X_train.values.tolist())
X_test_mat = np.matrix(X_test.values.tolist())
rf_model = ensemble.RandomForestClassifier(max_depth=200, n_estimators=50)
rf_model.fit(X_train_mat, y_train) 
probs = rf_model.predict_proba(X_test_mat)
return_acc(probs, y_test, 0.30)[1]

0.50372208436724564

In [520]:
# Functionalize to see which variable is best
def random_forest(var):
    accuracy = [0]*20
    for j in range(len(accuracy)):
        X_train, X_test, y_train, y_test = train_test_split(dat['avg_vec'], dat['label'], test_size=0.3, random_state=j)
        X_train_mat = np.matrix(X_train.values.tolist())
        X_test_mat = np.matrix(X_test.values.tolist())
        rf_model = ensemble.RandomForestClassifier(max_depth=200, n_estimators=50)
        rf_model.fit(X_train_mat, y_train) 
        probs = rf_model.predict_proba(X_test_mat)
        accuracy[j] += return_acc(probs, y_test, 0.30)[1]
    return sum(accuracy)/len(accuracy)

In [521]:
print random_forest('avg_vec')
print random_forest('sum_vec')
print random_forest('weighted_avg_vec')
print random_forest('weighted_sum_vec')

0.535763027295
0.537034739454
0.537189826303
0.536910669975


In [530]:
# All forms of the vector get the same score pretty much, none outperform Naive Bayes