In [3]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# pandas display options: text width of 500 characters and up to 100 columns shown.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
# Use the HTML table representation for DataFrames in the notebook.
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# seaborn appearance: "whitegrid" style with the "poster" plotting context.
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from patsy import dmatrices

In [5]:
# Load the tweet table from respvardf.csv (path is relative to the working directory).
# NOTE(review): later cells read the columns resp, hillary, the *_count features,
# sentiment and sentiment_var from this file — confirm how the file is produced.
dftweets=pd.read_csv("respvardf.csv")


In [6]:
# Remove the two 'Unnamed: ...' columns from the loaded table.
# NOTE(review): columns with these names are presumably index columns left
# behind by earlier CSV round trips — confirm.
# Only columns that are still present are dropped, so re-running this cell is a
# no-op instead of an error, and `axis` is passed by keyword rather than by
# position (the positional form is rejected by recent pandas releases).
leftover_index_cols = [col for col in ('Unnamed: 0.1', 'Unnamed: 0') if col in dftweets.columns]
dftweets = dftweets.drop(leftover_index_cols, axis=1)

In [7]:
# Number of rows in dftweets (presumably one row per tweet — confirm).
len(dftweets)

82794

In [8]:
# Martin O'Malley has far fewer tweets than the other candidates, so every row
# whose 'resp' value is 'martin' is left out of the modelling frame.
is_not_martin = dftweets['resp'] != 'martin'
finaldf = dftweets[is_not_martin]

In [9]:
# Rows remaining once the 'martin' rows have been filtered out.
len(finaldf)

76937

In [10]:
# Baseline: fit a default logistic regression on every row of finaldf.
# The accuracy displayed below is computed on the same rows used for fitting,
# so it is an in-sample figure, not a held-out one.
all_rows_formula = 'hillary ~ hashtag_count + url_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var'
y_frame, X = dmatrices(all_rows_formula, finaldf, return_type="dataframe")
# scikit-learn expects a flat 1-D target, not the response frame patsy returns.
y = np.ravel(y_frame)
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression().fit(X, y)
model.score(X, y)

0.56435218883227622

In [11]:
# Split the row positions of finaldf into train (70%) and test (30%) index lists.
# A fixed random_state makes the split, and every score derived from it,
# repeatable across kernel restarts; without it each run shuffles differently.
# range() replaces the Python-2-only xrange() and behaves the same here on
# both major Python versions.
SPLIT_SEED = 42
itrain, itest = train_test_split(range(finaldf.shape[0]), train_size=0.7, random_state=SPLIT_SEED)

In [12]:
# Boolean row selector over finaldf: True marks a training row, False a test row.
# Rows start out as True, train positions are set True, test positions False.
n_rows = finaldf.shape[0]
mask = np.ones(n_rows, dtype=bool)
mask[itrain] = True
mask[itest] = False
# Show the selector's shape and the number of training rows it marks.
mask.shape, mask.sum()

((76937L,), 53855)

In [13]:
# Partition finaldf with the boolean selector: True rows -> train, False rows -> test.
traindf, testdf = finaldf[mask], finaldf[np.logical_not(mask)]

In [14]:
# Build the response frame and design matrix (with an intercept column) for the
# train and test frames. The formula is written once and shared by both calls
# so the two design matrices cannot drift apart through a one-sided edit.
main_effects_formula = 'hillary ~ hashtag_count + url_count + favorite_count + retweet_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var'
ytrain, Xtrain = dmatrices(main_effects_formula, traindf, return_type="dataframe")
ytest, Xtest = dmatrices(main_effects_formula, testdf, return_type="dataframe")

In [15]:
# Collapse the response frames returned by dmatrices into flat 1-D arrays,
# the target shape the estimator is given below.
ytrain, ytest = np.ravel(ytrain), np.ravel(ytest)

In [16]:
# Fit a default LogisticRegression on the training design matrix and display
# its accuracy on those same training rows.
# NOTE(review): Xtrain already carries patsy's intercept column, and
# LogisticRegression presumably fits its own intercept by default — confirm the
# duplication is intended.
trainmodel = LogisticRegression().fit(Xtrain, ytrain)
trainmodel.score(Xtrain, ytrain)

0.56413636870056083

In [17]:
# Accuracy of the fitted model on the held-out test rows (Xtest / ytest).
trainmodel.score(Xtest, ytest)

0.56000346590416772

In [18]:
# Hard class predictions for the test rows. Labels are 0.0 / 1.0, so the mean
# displayed below is the share of test rows predicted as class 1.
predicted = trainmodel.predict(Xtest)
predicted.mean()

0.80235681483406984

In [19]:
# Class probabilities for the test rows: one row per test row, one column per class.
probs = trainmodel.predict_proba(Xtest)
# Parenthesised print: same output under Python 2, and valid syntax under Python 3.
print(probs)

[[ 0.32402024  0.67597976]
 [ 0.35527055  0.64472945]
 [ 0.46798642  0.53201358]
 ..., 
 [ 0.48225114  0.51774886]
 [ 0.49041347  0.50958653]
 [ 0.31527508  0.68472492]]


In [20]:
# Test-set accuracy (from the hard predictions) and ROC AUC (from the second
# probability column, i.e. the score for class 1).
# Parenthesised print: same output under Python 2, and valid syntax under Python 3.
print(metrics.accuracy_score(ytest, predicted))
print(metrics.roc_auc_score(ytest, probs[:, 1]))

0.560003465904
0.582572493052


In [21]:
# Confusion matrix (rows: true class, columns: predicted class) followed by the
# per-class precision / recall / F1 report for the test set.
# Parenthesised print: same output under Python 2, and valid syntax under Python 3.
print(metrics.confusion_matrix(ytest, predicted))
print(metrics.classification_report(ytest, predicted))

[[ 2681  8275]
 [ 1881 10245]]
             precision    recall  f1-score   support

        0.0       0.59      0.24      0.35     10956
        1.0       0.55      0.84      0.67     12126

avg / total       0.57      0.56      0.52     23082



In [22]:
# Coefficients of the fitted model: one value per column of Xtrain, in column
# order (presumably the intercept column first — check Xtrain.columns).
trainmodel.coef_

array([[  1.32342212e-01,  -1.61918636e-01,   5.52518391e-01,
         -2.10988906e-04,   1.21943618e-03,   2.31168892e-01,
         -1.81389399e-02,  -2.15070308e-01,  -1.33316277e-02,
          1.02740520e-02]])

In [23]:
# Build the response frame and design matrix (with an intercept column) for the
# interaction model.
# The formula has the nine main effects plus 23 of the 36 possible pairwise
# interactions; pairs such as hashtag_count*favorite_count and
# favorite_count*retweet_count are not included.
# NOTE(review): the earlier note called this the saturated model with all
# interactions — confirm whether the missing pairs were left out on purpose.
# The formula is written once and shared by both calls so the train and test
# design matrices cannot drift apart through a one-sided edit.
interaction_formula = 'hillary ~ hashtag_count + url_count + favorite_count + retweet_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var + hashtag_count*url_count + hashtag_count*retweet_count + hashtag_count*swear_count\
                 + hashtag_count*emoji_count + hashtag_count*emoticon_count + hashtag_count*sentiment + url_count*favorite_count\
                 +url_count*retweet_count + url_count*swear_count + url_count*emoji_count + url_count*emoticon_count + url_count*sentiment\
                 + favorite_count*swear_count + favorite_count*emoji_count + favorite_count*emoticon_count +favorite_count*sentiment\
                 + swear_count*emoji_count+swear_count*emoticon_count + swear_count * sentiment + emoji_count * emoticon_count\
                 + emoji_count * sentiment + emoji_count * sentiment_var + sentiment * sentiment_var'
ytrain2, Xtrain2 = dmatrices(interaction_formula, traindf, return_type="dataframe")
ytest2, Xtest2 = dmatrices(interaction_formula, testdf, return_type="dataframe")

In [24]:
# Collapse the response frames returned by dmatrices into flat 1-D arrays,
# the target shape the estimator is given below.
ytrain2, ytest2 = np.ravel(ytrain2), np.ravel(ytest2)

In [25]:
# Fit a default LogisticRegression on the interaction design matrix and display
# its accuracy on those same training rows.
trainmodel2 = LogisticRegression().fit(Xtrain2, ytrain2)
trainmodel2.score(Xtrain2, ytrain2)

0.56439633082036622

In [26]:
# Accuracy of the interaction model on the held-out test rows (Xtest2 / ytest2).
trainmodel2.score(Xtest2, ytest2)

0.55965687548739274

In [27]:
# Hard class predictions of the interaction model for the test rows. Labels are
# 0.0 / 1.0, so the mean displayed below is the share predicted as class 1.
predicted2 = trainmodel2.predict(Xtest2)
predicted2.mean()

0.80599601421020706

In [28]:
# Class probabilities from the interaction model: one row per test row, one
# column per class.
probs2 = trainmodel2.predict_proba(Xtest2)
# Parenthesised print: same output under Python 2, and valid syntax under Python 3.
print(probs2)

[[ 0.32085516  0.67914484]
 [ 0.34851148  0.65148852]
 [ 0.4313344   0.5686656 ]
 ..., 
 [ 0.46490426  0.53509574]
 [ 0.48076799  0.51923201]
 [ 0.30734291  0.69265709]]


In [29]:
# Test-set accuracy (from the hard predictions) and ROC AUC (from the second
# probability column, i.e. the score for class 1) for the interaction model.
# Parenthesised print: same output under Python 2, and valid syntax under Python 3.
print(metrics.accuracy_score(ytest2, predicted2))
print(metrics.roc_auc_score(ytest2, probs2[:, 1]))

0.559656875487
0.583895833284


In [30]:
# Confusion matrix (rows: true class, columns: predicted class) followed by the
# per-class precision / recall / F1 report for the interaction model.
# Parenthesised print: same output under Python 2, and valid syntax under Python 3.
print(metrics.confusion_matrix(ytest2, predicted2))
print(metrics.classification_report(ytest2, predicted2))

[[ 2635  8321]
 [ 1843 10283]]
             precision    recall  f1-score   support

        0.0       0.59      0.24      0.34     10956
        1.0       0.55      0.85      0.67     12126

avg / total       0.57      0.56      0.51     23082



In [31]:
# Coefficients of the fitted interaction model: one value per column of Xtrain2,
# in column order (presumably the intercept column first — check Xtrain2.columns).
trainmodel2.coef_

array([[  3.59282764e-02,   4.41364134e-02,   4.01664292e-01,
         -2.48340887e-03,   4.80808477e-03,   2.56909596e-01,
         -4.44901268e-01,  -3.29456278e-01,   2.87193596e-02,
         -6.26556773e-02,   6.98933561e-02,  -8.40929147e-04,
          4.60988807e-02,   4.98265679e-02,   1.30120369e-01,
         -4.34371994e-02,   2.67311453e-03,  -8.56384026e-03,
         -2.37282523e-01,   2.67146646e-01,   1.44612268e-01,
          1.60494633e-02,   8.50210942e-04,   4.90615370e-02,
         -9.47340338e-02,   1.74993573e-04,  -1.11975938e-01,
          2.54259515e-01,  -1.66777849e-02,  -4.02440928e-01,
          5.01923778e-02,  -4.53958528e-03,   1.45782835e-02]])