In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import scipy.stats as stats

In [3]:
senate = pd.read_csv('./data/senate_processed.csv')

In [4]:
senate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219539 entries, 0 to 219538
Data columns (total 11 columns):
created_at        219539 non-null object
id_str            219539 non-null float64
reply_count       219539 non-null float64
retweet_count     219539 non-null float64
text              219539 non-null object
user              219539 non-null object
name              219539 non-null object
state             219539 non-null object
party             219539 non-null object
the_ratio         219539 non-null float64
text_processed    219536 non-null object
dtypes: float64(4), object(7)
memory usage: 18.4+ MB


In [5]:
senate.dropna(inplace=True)

In [6]:
senate.head()

Unnamed: 0,created_at,id_str,reply_count,retweet_count,text,user,name,state,party,the_ratio,text_processed
0,Sun May 21 19:26:26 +0000 2017,8.66e+17,116.0,174.0,"Franni here. Since it's Al's birthday, and sin...",alfranken,Al Franken,New York,Democratic,0.666667,franni since al birthday since working hard wa...
1,Tue May 16 01:44:44 +0000 2017,8.64e+17,718.0,1248.0,This is profoundly troubling. Why would Presid...,alfranken,Al Franken,New York,Democratic,0.575321,profoundly troubling would president trump giv...
2,Wed May 10 19:55:37 +0000 2017,8.62e+17,218.0,1334.0,It couldn't be clearer: we need an independent...,alfranken,Al Franken,New York,Democratic,0.163418,clearer need independent investigation preside...
3,Wed May 10 19:54:55 +0000 2017,8.62e+17,134.0,794.0,More troubling news: AG Sessions was involved ...,alfranken,Al Franken,New York,Democratic,0.168766,troubling news ag session involved firing jeff...
4,Wed May 10 19:54:20 +0000 2017,8.62e+17,131.0,556.0,Troubling news that you probably know by now: ...,alfranken,Al Franken,New York,Democratic,0.235612,troubling news probably know president trump f...


In [23]:
senate.the_ratio.describe()

count    219539.000000
mean          0.677336
std           1.847231
min           0.000000
25%           0.046512
50%           0.250000
75%           0.636364
max         138.000000
Name: the_ratio, dtype: float64

In [22]:
# This divides the_ratio column into quartiles and labels them by the corresponding number
senate['ratio_quartile'] = pd.qcut(senate['the_ratio'], 4, labels=False)

In [30]:
senate['ratio_quartile'].value_counts()

1    59427
0    54906
3    54855
2    50348
Name: ratio_quartile, dtype: int64

In [33]:
# establishing our baseline accuracy
baseline_accuracy = senate['ratio_quartile'].value_counts()[0]/len(senate['ratio_quartile'])
baseline_accuracy

0.2501002113548575

In [25]:
# Setting our X,y and encoding our target variable
y = senate['ratio_quartile']
X = senate['text']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3, random_state=42)

le = LabelEncoder()
le.fit(y)

y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [26]:
%%time

# Creating a pipeline to countvectorize and run a RandomForestClassifier
pipeline = Pipeline([
    ('vect', CountVectorizer()), 
    ('rfc', RandomForestClassifier())
]) 
pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
print(pipeline.score(X_test, y_test))

0.9849487554904831
0.3622629477232353
CPU times: user 8min 15s, sys: 2.14 s, total: 8min 17s
Wall time: 8min 18s


In [34]:
y_preds = pipeline.predict(X_test)

rfc_confusion = pd.crosstab(y_test, y_preds, rownames=['Actual'], colnames=['Predicted'], margins=True)
rfc_confusion

Predicted,0,1,2,3,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,8119,3632,1965,2707,16423
1,4661,7869,2689,2700,17919
2,4368,4928,2624,3188,15108
3,4516,4019,2629,5247,16411
All,21664,20448,9907,13842,65861


In [37]:
metrics = classification_report(y_test, y_preds)
print(metrics)

             precision    recall  f1-score   support

          0       0.37      0.49      0.43     16423
          1       0.38      0.44      0.41     17919
          2       0.26      0.17      0.21     15108
          3       0.38      0.32      0.35     16411

avg / total       0.35      0.36      0.35     65861



### Running a GridsearchCV

In [27]:
# Parameter grid for gridsearchCV
params_grid = {
    'vect__strip_accents': ['ascii', 'unicode'],
    'vect__min_df': [1,5,10,20],
    'rfc__n_estimators': [5,10,15,20],
    'rfc__max_depth': [5,10,15,20,None],
    'rfc__criterion': ['gini', 'entropy']
}

In [28]:
%%time
# THIS CODE TAKES A LONG TIME TO RUN

# Gridsearching for best parameters

gs = GridSearchCV(pipeline, params_grid, n_jobs = 1)

gs.fit(X_train, y_train)

print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))
print(gs.best_params_)

0.9902000976085895
0.38786231608994703
{'rfc__criterion': 'gini', 'rfc__max_depth': None, 'rfc__n_estimators': 20, 'vect__min_df': 10, 'vect__strip_accents': 'unicode'}
CPU times: user 6h 49min 15s, sys: 2min 22s, total: 6h 51min 38s
Wall time: 6h 51min 41s


In [38]:
y_preds = gs.predict(X_test)

rfc_confusion = pd.crosstab(y_test, y_preds, rownames=['Actual'], colnames=['Predicted'], margins=True)
rfc_confusion

Predicted,0,1,2,3,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,8408,3864,1489,2662,16423
1,4180,8964,2109,2666,17919
2,4011,5522,2225,3350,15108
3,3945,4351,2167,5948,16411
All,20544,22701,7990,14626,65861


In [39]:
metrics = classification_report(y_test, y_preds)
print(metrics)

             precision    recall  f1-score   support

          0       0.41      0.51      0.45     16423
          1       0.39      0.50      0.44     17919
          2       0.28      0.15      0.19     15108
          3       0.41      0.36      0.38     16411

avg / total       0.37      0.39      0.37     65861

