In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from wordcloud import STOPWORDS
import re
import scipy.stats as stats

from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report

#### 1. Loading in the data and engineering some features

In [6]:
senate = pd.read_csv('./data/senate_processed.csv')

In [7]:
senate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219539 entries, 0 to 219538
Data columns (total 11 columns):
created_at        219539 non-null object
id_str            219539 non-null float64
reply_count       219539 non-null float64
retweet_count     219539 non-null float64
text              219539 non-null object
user              219539 non-null object
name              219539 non-null object
state             219539 non-null object
party             219539 non-null object
the_ratio         219539 non-null float64
text_processed    219536 non-null object
dtypes: float64(4), object(7)
memory usage: 18.4+ MB


In [8]:
senate.dropna(inplace=True)

In [9]:
senate.head()

Unnamed: 0,created_at,id_str,reply_count,retweet_count,text,user,name,state,party,the_ratio,text_processed
0,Sun May 21 19:26:26 +0000 2017,8.66e+17,116.0,174.0,"Franni here. Since it's Al's birthday, and sin...",alfranken,Al Franken,New York,Democratic,0.666667,franni since al birthday since working hard wa...
1,Tue May 16 01:44:44 +0000 2017,8.64e+17,718.0,1248.0,This is profoundly troubling. Why would Presid...,alfranken,Al Franken,New York,Democratic,0.575321,profoundly troubling would president trump giv...
2,Wed May 10 19:55:37 +0000 2017,8.62e+17,218.0,1334.0,It couldn't be clearer: we need an independent...,alfranken,Al Franken,New York,Democratic,0.163418,clearer need independent investigation preside...
3,Wed May 10 19:54:55 +0000 2017,8.62e+17,134.0,794.0,More troubling news: AG Sessions was involved ...,alfranken,Al Franken,New York,Democratic,0.168766,troubling news ag session involved firing jeff...
4,Wed May 10 19:54:20 +0000 2017,8.62e+17,131.0,556.0,Troubling news that you probably know by now: ...,alfranken,Al Franken,New York,Democratic,0.235612,troubling news probably know president trump f...


In [10]:
senate.the_ratio.describe()

count    219536.000000
mean          0.677341
std           1.847243
min           0.000000
25%           0.046512
50%           0.250000
75%           0.636364
max         138.000000
Name: the_ratio, dtype: float64

In [11]:
# This divides the_ratio column into quartiles and labels them by the corresponding number
senate['ratio_quartile'] = pd.qcut(senate['the_ratio'], 4, labels=False)

####  2. Establishing a baseline accuracy

In [12]:
senate['ratio_quartile'].value_counts()

1    59427
0    54906
3    54855
2    50348
Name: ratio_quartile, dtype: int64

In [13]:
# The baseline accuracy will be the percentage of the most frequent class
baseline_accuracy = senate['ratio_quartile'].value_counts()[0]/len(senate['ratio_quartile'])
baseline_accuracy

0.2501002113548575

### 3. Logistic Regression model

In [14]:
# Added more stopwords after a deep dive into EDA
stopwords = set(STOPWORDS)
stopwords.add("http")
stopwords.add("co")
stopwords.add("amp")
stopwords.add("u")
stopwords.add("w")
stopwords.add("bit")
stopwords.add("ly")

In [17]:
# setting up X and y
y = senate['ratio_quartile']
X = senate['text_processed']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3, random_state=42)

le = LabelEncoder()
le.fit(y)

y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [18]:
%%time 

pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwords, ngram_range=(1,1))), 
    ('logit', LogisticRegression())
]) 
pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
print(pipeline.score(X_test, y_test))
print(cross_val_score(pipeline, X_test, y_test, cv=5, verbose=1).mean()
)

0.8099951195705222
0.4157240248401937
0.3888034547167317
CPU times: user 1min 22s, sys: 818 ms, total: 1min 23s
Wall time: 1min 23s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   40.8s finished


In [19]:
%%time 

pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwords, 
                             ngram_range=(1,1),
                             min_df=1,
                             strip_accents='unicode')), 
    ('logit', LogisticRegression())
]) 
pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
print(pipeline.score(X_test, y_test))
print(cross_val_score(pipeline, X_test, y_test, cv=5, verbose=1).mean()
)

0.809858467545144
0.4156784743626729
0.3889552942556505
CPU times: user 1min 23s, sys: 732 ms, total: 1min 24s
Wall time: 1min 24s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   41.6s finished


In [20]:
%%time 
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwords, 
                             ngram_range=(1,1),
                             strip_accents='ascii')), 
    ('logit', LogisticRegression(penalty='l1'))
]) 
pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
print(pipeline.score(X_test, y_test))
print(cross_val_score(pipeline, X_test, y_test, cv=5, verbose=1).mean()
)

0.5646266471449487
0.41765232838857597
0.39067096046559513
CPU times: user 44.6 s, sys: 673 ms, total: 45.2 s
Wall time: 45.3 s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   23.8s finished


In [22]:
y_preds = pipeline.predict(X_test)

logit_confusion = pd.crosstab(y_test, y_preds, rownames=['Actual'], colnames=['Predicted'], margins=True)
logit_confusion

Predicted,0,1,2,3,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,8431,3196,1605,3191,16423
1,3617,8817,2550,2935,17919
2,3387,4784,2760,4177,15108
3,3591,2943,2378,7499,16411
All,19026,19740,9293,17802,65861


In [23]:
metrics = classification_report(y_test, y_preds)
print(metrics)

             precision    recall  f1-score   support

          0       0.44      0.51      0.48     16423
          1       0.45      0.49      0.47     17919
          2       0.30      0.18      0.23     15108
          3       0.42      0.46      0.44     16411

avg / total       0.41      0.42      0.41     65861

