In [54]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# pandas display options: output width of 500, up to 100 columns shown, and
# HTML rendering of DataFrames in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# seaborn look for any figures: "whitegrid" style with the "poster" context.
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from patsy import dmatrices

In [56]:
# Read the prepared tweet table; the path is relative to the notebook's
# working directory.
dftweets = pd.read_csv("respvardf.csv")


In [57]:
# Remove the two 'Unnamed: ...' columns (presumably index columns written out
# by earlier to_csv calls -- TODO confirm).
# - `axis` is passed by keyword: the positional form is rejected by pandas 2.x.
# - errors='ignore' keeps the cell re-runnable; without it a second execution
#   raises KeyError because the columns are already gone.
dftweets = dftweets.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis=1, errors='ignore')

In [58]:
# Number of rows loaded.
len(dftweets.index)

82794

In [59]:
# Martin O'Malley has very few tweets compared to the other candidates, so
# rows whose `resp` value is 'martin' are left out of the modelling frame.
finaldf = dftweets.loc[dftweets['resp'].ne('martin')]

In [60]:
# Number of rows kept after the filter.
len(finaldf.index)

76937

In [61]:
# Baseline: fit on every row of finaldf and score on those same rows, so the
# number shown is in-sample accuracy, not a hold-out estimate.
# NOTE(review): this formula omits favorite_count and retweet_count, which the
# train/test models further down include -- confirm that is intended.
# NOTE(review): the patsy design matrix already carries an intercept column and
# LogisticRegression fits its own intercept by default, so the constant term is
# represented twice -- TODO confirm this is wanted.
y, X = dmatrices('hillary ~ hashtag_count + url_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var', finaldf, return_type="dataframe")
# sklearn wants a flat 1-D target rather than a one-column frame.
y = np.asarray(y).ravel()
model = LogisticRegression().fit(X, y)
model.score(X, y)

0.56435218883227622

In [62]:
# Split the row positions of finaldf into a train set (70%) and a test set
# (the remainder).
# random_state pins the shuffle so that every re-run produces the same split
# and therefore the same downstream scores and reports.
# np.arange replaces xrange, which only exists on Python 2.
itrain, itest = train_test_split(np.arange(finaldf.shape[0]), train_size=0.7,
                                 random_state=42)

In [63]:
# Boolean mask over the rows of finaldf: True marks a training row, False a
# test row. Start from all-True and switch off the test positions.
mask = np.ones(finaldf.shape[0], dtype=bool)
mask[itest] = False
# Sanity check: mask length and number of training rows.
mask.shape, mask.sum()

((76937L,), 53855)

In [64]:
# Training rows are where the mask is True; test rows are its complement.
traindf, testdf = finaldf[mask], finaldf[~mask]

In [65]:
# Design matrices (including an intercept column) for the main-effects model.
# The formula is written once and applied to both splits so the train and
# test matrices cannot drift apart.
main_effects_formula = 'hillary ~ hashtag_count + url_count + favorite_count + retweet_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var'
ytrain, Xtrain = dmatrices(main_effects_formula, traindf, return_type="dataframe")
ytest, Xtest = dmatrices(main_effects_formula, testdf, return_type="dataframe")

In [66]:
# Flatten both target frames to 1-D arrays, the shape sklearn expects for y.
ytrain, ytest = map(np.ravel, (ytrain, ytest))

In [67]:
# Fit the main-effects logistic regression on the training split; the last
# line shows its accuracy on that same training data.
trainmodel = LogisticRegression().fit(Xtrain, ytrain)
trainmodel.score(Xtrain, ytrain)

0.56138819771976078

In [68]:
# Accuracy of the fitted model on the held-out test split
trainmodel.score(Xtest, ytest)

0.56606879819772982

In [69]:
# Hard class predictions for the test rows. Labels are 0/1, so the mean is
# the share of test rows predicted as class 1.
predicted = trainmodel.predict(Xtest)
predicted.mean()

0.80469630014730098

In [70]:
# Class probabilities for each test row: column 0 is P(class 0), column 1 is
# P(class 1).
probs = trainmodel.predict_proba(Xtest)
# print(...) with one argument behaves the same on Python 2 and also runs on
# Python 3, where the bare print statement is a SyntaxError.
print(probs)

[[ 0.36024769  0.63975231]
 [ 0.52484732  0.47515268]
 [ 0.42851093  0.57148907]
 ..., 
 [ 0.32303214  0.67696786]
 [ 0.53700383  0.46299617]
 [ 0.48841322  0.51158678]]


In [71]:
# Evaluation metrics on the test split: accuracy from the hard predictions,
# ROC AUC from the class-1 probability column.
# print(...) with one argument is valid on both Python 2 and Python 3.
print(metrics.accuracy_score(ytest, predicted))
print(metrics.roc_auc_score(ytest, probs[:, 1]))

0.566068798198
0.590481988364


In [72]:
# Confusion matrix (rows = true class, columns = predicted class) followed by
# the per-class precision / recall / F1 report for the test split.
# print(...) with one argument is valid on both Python 2 and Python 3.
print(metrics.confusion_matrix(ytest, predicted))
print(metrics.classification_report(ytest, predicted))

[[ 2662  8170]
 [ 1846 10404]]
             precision    recall  f1-score   support

        0.0       0.59      0.25      0.35     10832
        1.0       0.56      0.85      0.68     12250

avg / total       0.57      0.57      0.52     23082



In [73]:
# Coefficients from the model, one per design-matrix column in Xtrain's column
# order (10 values: the intercept column plus the 9 predictors).
trainmodel.coef_

array([[  1.08104364e-01,  -1.54324635e-01,   5.37109308e-01,
          7.23305774e-04,  -1.25845828e-04,   1.91795994e-01,
         -1.30468376e-02,  -1.65068599e-01,  -8.12245800e-03,
          1.36336716e-02]])

In [74]:
# Design matrices (including an intercept column) for a larger model: the nine
# main effects plus 23 hand-listed pairwise interactions.
# This is NOT every pairwise combination (nine predictors allow 36): for
# example hashtag_count*favorite_count and most retweet_count pairs are absent.
# In patsy `a*b` expands to `a + b + a:b`, so repeating a main effect that is
# already listed has no effect on the resulting columns.
# The formula is written once and applied to both splits so the train and
# test matrices cannot drift apart.
interaction_formula = 'hillary ~ hashtag_count + url_count + favorite_count + retweet_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var + hashtag_count*url_count + hashtag_count*retweet_count + hashtag_count*swear_count\
                 + hashtag_count*emoji_count + hashtag_count*emoticon_count + hashtag_count*sentiment + url_count*favorite_count\
                 +url_count*retweet_count + url_count*swear_count + url_count*emoji_count + url_count*emoticon_count + url_count*sentiment\
                 + favorite_count*swear_count + favorite_count*emoji_count + favorite_count*emoticon_count +favorite_count*sentiment\
                 + swear_count*emoji_count+swear_count*emoticon_count + swear_count * sentiment + emoji_count * emoticon_count\
                 + emoji_count * sentiment + emoji_count * sentiment_var + sentiment * sentiment_var'
ytrain2, Xtrain2 = dmatrices(interaction_formula, traindf, return_type="dataframe")
ytest2, Xtest2 = dmatrices(interaction_formula, testdf, return_type="dataframe")

In [75]:
# Flatten both target frames to 1-D arrays, the shape sklearn expects for y.
ytrain2, ytest2 = map(np.ravel, (ytrain2, ytest2))

In [76]:
# Fit the interaction model on the training split; the last line shows its
# accuracy on that same training data.
trainmodel2 = LogisticRegression().fit(Xtrain2, ytrain2)
trainmodel2.score(Xtrain2, ytrain2)

0.56112823559995539

In [77]:
# Accuracy of the interaction model on the held-out test split
trainmodel2.score(Xtest2, ytest2)

0.56346937007191755

In [78]:
# Hard class predictions of the interaction model for the test rows. Labels
# are 0/1, so the mean is the share of test rows predicted as class 1.
predicted2 = trainmodel2.predict(Xtest2)
predicted2.mean()

0.81336106056667534

In [79]:
# Class probabilities from the interaction model for each test row: column 0
# is P(class 0), column 1 is P(class 1).
probs2 = trainmodel2.predict_proba(Xtest2)
# print(...) with one argument behaves the same on Python 2 and also runs on
# Python 3, where the bare print statement is a SyntaxError.
print(probs2)

[[ 0.35591196  0.64408804]
 [ 0.52805457  0.47194543]
 [ 0.40484061  0.59515939]
 ..., 
 [ 0.33776046  0.66223954]
 [ 0.55990835  0.44009165]
 [ 0.47693812  0.52306188]]


In [80]:
# Evaluation metrics for the interaction model on the test split: accuracy
# from the hard predictions, ROC AUC from the class-1 probability column.
# print(...) with one argument is valid on both Python 2 and Python 3.
print(metrics.accuracy_score(ytest2, predicted2))
print(metrics.roc_auc_score(ytest2, probs2[:, 1]))

0.563469370072
0.59316345748


In [30]:
# Confusion matrix (rows = true class, columns = predicted class) followed by
# the per-class precision / recall / F1 report for the interaction model.
# NOTE(review): this cell's execution count (30) is out of sequence and its
# saved class supports differ from the first model's report on the same test
# frame, so the stored output comes from a different split -- re-run the
# notebook top to bottom before quoting these numbers.
# print(...) with one argument is valid on both Python 2 and Python 3.
print(metrics.confusion_matrix(ytest2, predicted2))
print(metrics.classification_report(ytest2, predicted2))

[[ 2635  8321]
 [ 1843 10283]]
             precision    recall  f1-score   support

        0.0       0.59      0.24      0.34     10956
        1.0       0.55      0.85      0.67     12126

avg / total       0.57      0.56      0.51     23082



In [81]:
# Coefficients from the model, one per design-matrix column in Xtrain2's
# column order (33 values: the intercept column, the 9 main effects and the
# 23 interaction terms).
trainmodel2.coef_

array([[ -2.20587615e-02,   9.09685124e-02,   5.22360012e-01,
         -2.86954734e-04,   9.60830789e-04,   1.58752244e-01,
         -5.64280898e-01,  -4.41222431e-01,   4.73723196e-02,
         -4.73902601e-02,   7.21852594e-02,  -2.19576675e-04,
          7.57450070e-03,   9.28542380e-02,   2.51162115e-01,
         -5.13196356e-02,  -6.98683574e-04,  -3.37558732e-03,
         -1.38763370e-01,   4.14730066e-01,   8.01045933e-02,
         -1.02943809e-02,   3.21321786e-04,   2.32337371e-02,
         -7.58827238e-02,   1.39529992e-04,  -1.20468897e-01,
          7.90039093e-01,   3.64892774e-03,  -4.38729096e-01,
          6.46588618e-02,   1.02877059e-02,   1.20636938e-02]])