In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from patsy import dmatrices

In [3]:
dftweets=pd.read_csv("respvardf.csv")


In [4]:
dftweets = dftweets.drop('Unnamed: 0.1', 1)
dftweets = dftweets.drop('Unnamed: 0', 1)

In [5]:
len(dftweets)

82794

In [6]:
# We decided to drop Martin O'Malley because there are so few tweets on him compared to the other candidates
finaldf=dftweets[dftweets['resp'] != 'martin']

In [7]:
len(finaldf)

76937

In [8]:
# Here we look at our model training on all the data.
y, X = dmatrices('hillary ~ hashtag_count + url_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var', finaldf, return_type="dataframe")
y = np.ravel(y)
model = LogisticRegression()
model = model.fit(X, y)
model.score(X, y)

0.56435218883227622

In [9]:
# split data into a train and test set with 70% of data in train set
itrain, itest = train_test_split(xrange(finaldf.shape[0]), train_size=0.7)

In [10]:
# Apply a mask to the dataframe to add the indication of train/test
mask=np.ones(finaldf.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)
mask.shape, mask.sum()

((76937L,), 53855)

In [11]:
traindf = finaldf[mask]
testdf = finaldf[~mask]

In [12]:
# create dataframes with an intercept
ytrain, Xtrain = dmatrices('hillary ~ hashtag_count + url_count + favorite_count + retweet_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var', traindf, return_type="dataframe")
ytest, Xtest = dmatrices('hillary ~ hashtag_count + url_count + favorite_count + retweet_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var', testdf, return_type="dataframe")

In [13]:
# flatten the y-vars
ytrain = np.ravel(ytrain)
ytest = np.ravel(ytest)

In [14]:
# Create the model
trainmodel = LogisticRegression()
trainmodel = trainmodel.fit(Xtrain, ytrain)
trainmodel.score(Xtrain, ytrain)

0.56538854331074184

In [15]:
# Score of the model on the test vars
trainmodel.score(Xtest, ytest)

0.55803474719466228

In [16]:
# collect the predictions on the test set
predicted = trainmodel.predict(Xtest)
np.mean(predicted)

0.80139508686798666

In [17]:
# generate class probabilities
probs = trainmodel.predict_proba(Xtest)
print probs

[[ 0.32504608  0.67495392]
 [ 0.45867518  0.54132482]
 [ 0.52346898  0.47653102]
 ..., 
 [ 0.47797128  0.52202872]
 [ 0.25187257  0.74812743]
 [ 0.4089338   0.5910662 ]]


In [18]:
# generate evaluation metrics
print metrics.accuracy_score(ytest, predicted)
print metrics.roc_auc_score(ytest, probs[:, 1])

0.558034747195
0.583294319167


In [30]:
# Confusion matrix
print metrics.confusion_matrix(ytest, predicted)
print metrics.classification_report(ytest, predicted)

[[ 2636  8385]
 [ 1820 10240]]
             precision    recall  f1-score   support

        0.0       0.59      0.24      0.34     11021
        1.0       0.55      0.85      0.67     12060

avg / total       0.57      0.56      0.51     23081



In [20]:
# Coeficients from the model
trainmodel.coef_

array([[  1.29007559e-01,  -1.53515543e-01,   5.61939124e-01,
         -7.16685032e-05,   1.18689013e-03,   2.75534993e-01,
         -1.03471209e-02,  -3.27236729e-01,  -1.43848466e-02,
          1.24947321e-02]])

In [44]:
# create dataframes with an intercept
# this is the saturated model with all first-order interaction variables
ytrain2, Xtrain2 = dmatrices('hillary ~ hashtag_count + url_count + favorite_count + retweet_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var + hashtag_count*url_count + hashtag_count*retweet_count + hashtag_count*swear_count\
                 + hashtag_count*emoji_count + hashtag_count*emoticon_count + hashtag_count*sentiment + url_count*favorite_count\
                 +url_count*retweet_count + url_count*swear_count + url_count*emoji_count + url_count*emoticon_count + url_count*sentiment\
                 + favorite_count*swear_count + favorite_count*emoji_count + favorite_count*emoticon_count +favorite_count*sentiment\
                 + swear_count*emoji_count+swear_count*emoticon_count + swear_count * sentiment + emoji_count * emoticon_count\
                 + emoji_count * sentiment + emoji_count * sentiment_var + sentiment * sentiment_var', traindf, return_type="dataframe")
ytest2, Xtest2 = dmatrices('hillary ~ hashtag_count + url_count + favorite_count + retweet_count + swear_count + emoji_count + emoticon_count\
                 + sentiment + sentiment_var + hashtag_count*url_count + hashtag_count*retweet_count + hashtag_count*swear_count\
                 + hashtag_count*emoji_count + hashtag_count*emoticon_count + hashtag_count*sentiment + url_count*favorite_count\
                 +url_count*retweet_count + url_count*swear_count + url_count*emoji_count + url_count*emoticon_count + url_count*sentiment\
                 + favorite_count*swear_count + favorite_count*emoji_count + favorite_count*emoticon_count +favorite_count*sentiment\
                 + swear_count*emoji_count+swear_count*emoticon_count + swear_count * sentiment + emoji_count * emoticon_count\
                 + emoji_count * sentiment + emoji_count * sentiment_var + sentiment * sentiment_var', testdf, return_type="dataframe")

In [45]:
# flatten the y-vars
ytrain2 = np.ravel(ytrain2)
ytest2 = np.ravel(ytest2)

In [46]:
# Create the model
trainmodel2 = LogisticRegression()
trainmodel2 = trainmodel2.fit(Xtrain2, ytrain2)
trainmodel2.score(Xtrain2, ytrain2)

0.56492433385943741

In [47]:
# Score of the model on the test vars
trainmodel2.score(Xtest2, ytest2)

0.55786144447814223

In [48]:
# collect the predictions on the test set
predicted2 = trainmodel2.predict(Xtest2)
np.mean(predicted2)

0.80694077379662921

In [49]:
# generate class probabilities
probs2 = trainmodel2.predict_proba(Xtest2)
print probs2

[[ 0.3235633   0.6764367 ]
 [ 0.42259642  0.57740358]
 [ 0.52579054  0.47420946]
 ..., 
 [ 0.48786056  0.51213944]
 [ 0.32905155  0.67094845]
 [ 0.424278    0.575722  ]]


In [41]:
# generate evaluation metrics
print metrics.accuracy_score(ytest2, predicted2)
print metrics.roc_auc_score(ytest2, probs2[:, 1])

0.557861444478
0.585078810045


In [50]:
# Confusion matrix
print metrics.confusion_matrix(ytest2, predicted2)
print metrics.classification_report(ytest2, predicted2)

[[ 2636  8385]
 [ 1820 10240]]
             precision    recall  f1-score   support

        0.0       0.59      0.24      0.34     11021
        1.0       0.55      0.85      0.67     12060

avg / total       0.57      0.56      0.51     23081



In [51]:
# Coeficients from the model
trainmodel2.coef_

array([[ 0.01883212,  0.08362351,  0.48918602, -0.00401454,  0.0039297 ,
         0.11539828, -0.50062331, -0.30979595,  0.03256227, -0.06885571,
         0.06922951, -0.00138839,  0.01574478,  0.03547779,  0.03256053,
        -0.04889607, -0.00101372, -0.00330904, -0.18512296,  0.40912606,
        -0.0160733 ,  0.00129108, -0.00310671,  0.0445914 , -0.02973928,
         0.0006499 , -0.12914617,  0.5975632 ,  0.02911268, -0.46010319,
         0.063091  ,  0.00950964,  0.0160899 ]])