In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from wordcloud import STOPWORDS
import re
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import scipy.stats as stats

In [2]:
senate = pd.read_csv('./data/senate.csv')

In [22]:
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
        1. Tokenizes and removes punctuation
        2. Removes stopwords
        3. Lemmatizes
        4. Returns a list of the cleaned text
    '''
    if pd.isnull(text):
        return []
    # tokenizing and removing punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed=tokenizer.tokenize(text)
    
    # removing any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
    
    # steming
    lemmatizer = WordNetLemmatizer()
    
    text_processed = [lemmatizer.lemmatize(word) for word in text_processed]
    
    try:
        text_processed.remove('b')
    except: 
        pass

    return " ".join(text_processed) 

In [23]:
text_process(senate['text'][69034])

'privilege meeting national cherry queen danielle bott yesterday thank making michigan proud http co sqnyowmoqp'

In [24]:
# Becareful of running this code, it takes a long time. (leaving it commented out to be safe)
# senate['text_processed'] = senate['text'].apply(text_process)

In [25]:
senate.head()

Unnamed: 0,created_at,id_str,reply_count,retweet_count,text,user,name,state,party,the_ratio,text_processed
0,Sun May 21 19:26:26 +0000 2017,8.66e+17,116.0,174.0,"Franni here. Since it's Al's birthday, and sin...",alfranken,Al Franken,New York,Democratic,0.666667,franni since al birthday since working hard wa...
1,Tue May 16 01:44:44 +0000 2017,8.64e+17,718.0,1248.0,This is profoundly troubling. Why would Presid...,alfranken,Al Franken,New York,Democratic,0.575321,profoundly troubling would president trump giv...
2,Wed May 10 19:55:37 +0000 2017,8.62e+17,218.0,1334.0,It couldn't be clearer: we need an independent...,alfranken,Al Franken,New York,Democratic,0.163418,clearer need independent investigation preside...
3,Wed May 10 19:54:55 +0000 2017,8.62e+17,134.0,794.0,More troubling news: AG Sessions was involved ...,alfranken,Al Franken,New York,Democratic,0.168766,troubling news ag session involved firing jeff...
4,Wed May 10 19:54:20 +0000 2017,8.62e+17,131.0,556.0,Troubling news that you probably know by now: ...,alfranken,Al Franken,New York,Democratic,0.235612,troubling news probably know president trump f...


In [26]:
# exporting the dataframe to a .csv
senate.to_csv('./data/senate_processed.csv', index=False)

### Predicting partisanship

In [2]:
senate = pd.read_csv('./data/senate_processed.csv')

In [3]:
senate['party'].value_counts()

Republican     115566
Democratic      98558
Independent      5415
Name: party, dtype: int64

In [4]:
# we are only interested in predicting either Democratic or Republican partisanship
bipartisan = senate[senate['party'] != 'Independent']
bipartisan['party'].value_counts()

Republican    115566
Democratic     98558
Name: party, dtype: int64

In [5]:
# establishing our baseline accuracy
baseline_accuracy = bipartisan['party'].value_counts()[0]/len(bipartisan['party'])
baseline_accuracy

0.5397153051502868

In [9]:
bipartisan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214124 entries, 0 to 219538
Data columns (total 11 columns):
created_at        214124 non-null object
id_str            214124 non-null float64
reply_count       214124 non-null float64
retweet_count     214124 non-null float64
text              214124 non-null object
user              214124 non-null object
name              214124 non-null object
state             214124 non-null object
party             214124 non-null object
the_ratio         214124 non-null float64
text_processed    214121 non-null object
dtypes: float64(4), object(7)
memory usage: 29.6+ MB


In [10]:
bipartisan.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [30]:
# Added more stopwords after a deep dive into EDA
stopwords = set(STOPWORDS)
stopwords.add("http")
stopwords.add("co")
stopwords.add("amp")
stopwords.add("u")
stopwords.add("w")
stopwords.add("bit")
stopwords.add("ly")

In [11]:
# stetting up X and y
y = bipartisan['party']
X = bipartisan['text_processed']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3, random_state=42)

le = LabelEncoder()
le.fit(y)

y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [12]:
%%time

# Creating a pipeline to CountVectorize and run a RandomForestClassifier

pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwords)), 
    ('rfc', RandomForestClassifier())
]) 
pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
print(pipeline.score(X_test, y_test))

0.9916535454084492
0.7647150396189112
CPU times: user 4min 4s, sys: 1.07 s, total: 4min 5s
Wall time: 4min 6s


In [None]:
%%time
# comparing to see with the addition of new stop words
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwords)), 
    ('rfc', RandomForestClassifier())
]) 
pipeline.fit(X_train, y_train)
print(pipeline.score(X_train, y_train))
print(pipeline.score(X_test, y_test))


In [13]:
y_preds = pipeline.predict(X_test)

rfc_confusion = pd.crosstab(y_test, y_preds, rownames=['Actual'], colnames=['Predicted'], margins=True)
rfc_confusion

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,22348,7003,29351
1,8111,26775,34886
All,30459,33778,64237


In [14]:
# gridsearching

params_grid = {
    'vect__strip_accents': ['ascii', 'unicode'],
    'vect__min_df': [1,5,10,20],
    'rfc__n_estimators': [5,10,15,20],
    'rfc__max_depth': [5,10,15,20,None],
    'rfc__criterion': ['gini', 'entropy']
}

In [15]:
%%time
# THIS CODE TAKES A LONG TIME TO RUN

gs = GridSearchCV(pipeline, params_grid, n_jobs = 1)

gs.fit(X_train, y_train)

print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))
print(gs.best_params_)

0.9986055883216354
0.7895293989445336
{'rfc__criterion': 'entropy', 'rfc__max_depth': None, 'rfc__n_estimators': 20, 'vect__min_df': 1, 'vect__strip_accents': 'ascii'}
CPU times: user 3h 44min 50s, sys: 1min 16s, total: 3h 46min 6s
Wall time: 3h 46min 8s


In [16]:
gs.best_params_

{'rfc__criterion': 'entropy',
 'rfc__max_depth': None,
 'rfc__n_estimators': 20,
 'vect__min_df': 1,
 'vect__strip_accents': 'ascii'}

In [17]:
y_preds = gs.predict(X_test)

rfc_confusion = pd.crosstab(y_test, y_preds, rownames=['Actual'], colnames=['Predicted'], margins=True)
rfc_confusion

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,22431,6920,29351
1,6600,28286,34886
All,29031,35206,64237


In [25]:
metrics = classification_report(y_test, y_preds)
print(metrics)

             precision    recall  f1-score   support

          0       0.77      0.76      0.77     29351
          1       0.80      0.81      0.81     34886

avg / total       0.79      0.79      0.79     64237

