In [85]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from patsy import dmatrices

In [86]:
dftweets=pd.read_csv("respvardf.csv")


In [87]:
dftweets = dftweets.drop('Unnamed: 0.1', 1)
dftweets = dftweets.drop('Unnamed: 0', 1)

In [88]:
len(dftweets)

82794

In [89]:
# We decided to drop Martin O'Malley because there are so few tweets on him compared to the other candidates
finaldf=dftweets[dftweets['resp'] != 'martin']

In [90]:
len(finaldf)

76937

In [91]:
# split data into a train and test set with 70% of data in train set
itrain, itest = train_test_split(xrange(finaldf.shape[0]), train_size=0.7)

In [92]:
# Apply a mask to the dataframe to add the indication of train/test
mask=np.ones(finaldf.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)
mask.shape, mask.sum()

((76937L,), 53855)

In [93]:
traindf = finaldf[mask]
testdf = finaldf[~mask]

In [94]:
# create dataframes with an intercept
ytrain, Xtrain = dmatrices('hillary ~ hashtag_count + url_count + favorite_count + retweet_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var', traindf, return_type="dataframe")
ytest, Xtest = dmatrices('hillary ~ hashtag_count + url_count + favorite_count + retweet_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var', testdf, return_type="dataframe")

In [95]:
# flatten the y-vars
ytrain = np.ravel(ytrain)
ytest = np.ravel(ytest)

In [96]:
# Create the model
trainmodel = LogisticRegression()
trainmodel = trainmodel.fit(Xtrain, ytrain)
trainmodel.score(Xtrain, ytrain)

0.56341218850967434

In [97]:
# Score of the model on the test vars
trainmodel.score(Xtest, ytest)

0.56728186465644226

In [98]:
# collect the predictions on the test set
predicted = trainmodel.predict(Xtest)
np.mean(predicted)

0.67845074083701584

In [99]:
# generate class probabilities
probs = trainmodel.predict_proba(Xtest)
print probs

[[ 0.32563078  0.67436922]
 [ 0.45797562  0.54202438]
 [ 0.45797562  0.54202438]
 ..., 
 [ 0.3605278   0.6394722 ]
 [ 0.315419    0.684581  ]
 [ 0.315273    0.684727  ]]


In [100]:
# generate evaluation metrics
print metrics.accuracy_score(ytest, predicted)
print metrics.roc_auc_score(ytest, probs[:, 1])

0.567281864656
0.584739331921


In [101]:
# Confusion matrix
print metrics.confusion_matrix(ytest, predicted)
print metrics.classification_report(ytest, predicted)

[[4156 6722]
 [3266 8938]]
             precision    recall  f1-score   support

        0.0       0.56      0.38      0.45     10878
        1.0       0.57      0.73      0.64     12204

avg / total       0.57      0.57      0.55     23082



In [102]:
# Coeficients from the model
trainmodel.coef_

array([[  1.33599037e-01,  -1.52089006e-01,   5.72083360e-01,
          4.49669299e-04,   2.72638146e-04,   2.28208725e-01,
         -2.76662684e-02,  -1.23998408e-01,  -1.82396796e-02,
          1.21347888e-02]])

In [103]:
# create dataframes with an intercept
# this is the saturated model with all first-order interaction variables
ytrain, Xtrain = dmatrices('hillary ~ hashtag_count + url_count + favorite_count + retweet_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var + hashtag_count*url_count + hashtag_count*retweet_count + hashtag_count*swear_count\
                 + hashtag_count*emoji_count + hashtag_count*emoticon_count + hashtag_count*sentiment + url_count*favorite_count\
                 +url_count*retweet_count + url_count*swear_count + url_count*emoji_count + url_count*emoticon_count + url_count*sentiment\
                 + favorite_count*swear_count + favorite_count*emoji_count + favorite_count*emoticon_count +favorite_count*sentiment\
                 + swear_count*emoji_count+swear_count*emoticon_count + swear_count * sentiment + emoji_count * emoticon_count\
                 + emoji_count * sentiment + emoji_count * sentiment_var + sentiment * sentiment_var', traindf, return_type="dataframe")
ytest, Xtest = dmatrices('hillary ~ hashtag_count + url_count + favorite_count + retweet_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var + hashtag_count*url_count + hashtag_count*retweet_count + hashtag_count*swear_count\
                 + hashtag_count*emoji_count + hashtag_count*emoticon_count + hashtag_count*sentiment + url_count*favorite_count\
                 +url_count*retweet_count + url_count*swear_count + url_count*emoji_count + url_count*emoticon_count + url_count*sentiment\
                 + favorite_count*swear_count + favorite_count*emoji_count + favorite_count*emoticon_count +favorite_count*sentiment\
                 + swear_count*emoji_count+swear_count*emoticon_count + swear_count * sentiment + emoji_count * emoticon_count\
                 + emoji_count * sentiment + emoji_count * sentiment_var + sentiment * sentiment_var', testdf, return_type="dataframe")

In [104]:
# flatten the y-vars
ytrain = np.ravel(ytrain)
ytest = np.ravel(ytest)

In [105]:
# Create the model
trainmodel = LogisticRegression()
trainmodel = trainmodel.fit(Xtrain, ytrain)
trainmodel.score(Xtrain, ytrain)

0.56196382812790135

In [106]:
# Score of the model on the test vars
trainmodel.score(Xtest, ytest)

0.56840828351096095

In [107]:
# collect the predictions on the test set
predicted = trainmodel.predict(Xtest)
np.mean(predicted)

0.7692574300320596

In [108]:
# generate class probabilities
probs = trainmodel.predict_proba(Xtest)
print probs

[[ 0.32328766  0.67671234]
 [ 0.42686564  0.57313436]
 [ 0.42686564  0.57313436]
 ..., 
 [ 0.35583026  0.64416974]
 [ 0.33041261  0.66958739]
 [ 0.3086738   0.6913262 ]]


In [109]:
# generate evaluation metrics
print metrics.accuracy_score(ytest, predicted)
print metrics.roc_auc_score(ytest, probs[:, 1])

0.568408283511
0.585285973771


In [110]:
# Confusion matrix
print metrics.confusion_matrix(ytest, predicted)
print metrics.classification_report(ytest, predicted)

[[3121 7757]
 [2205 9999]]
             precision    recall  f1-score   support

        0.0       0.59      0.29      0.39     10878
        1.0       0.56      0.82      0.67     12204

avg / total       0.57      0.57      0.53     23082



In [111]:
# Coeficients from the model
trainmodel.coef_

array([[  3.36503865e-02,   5.68191722e-02,   4.61177365e-01,
         -9.10822364e-04,   3.47243420e-03,   1.52402047e-01,
         -4.81063134e-01,  -3.51876699e-01,   2.42196831e-02,
         -5.47152161e-02,   6.00038786e-02,  -1.09935706e-03,
          5.15266472e-03,   3.26586123e-02,   8.92194692e-02,
         -4.30599643e-02,   1.23366605e-03,  -6.05230652e-03,
         -1.09831235e-01,   3.88651177e-01,   2.47712950e-01,
          9.86329128e-03,   1.89646231e-03,   5.82909000e-02,
         -2.68988235e-03,   6.54451529e-05,  -2.46273011e-01,
          4.74555147e-01,   1.18677498e-02,  -3.75106444e-01,
          5.51446693e-02,   3.26647277e-03,   1.32674218e-02]])