In [41]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search module no longer exists.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
import scipy.stats as stats

In [2]:
# Load the senators' tweets into a DataFrame; later cells read its
# 'text', 'party' and 'text_processed' columns.
senate = pd.read_csv('./data/senate.csv')

In [22]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    '''Returns NLTK's English stopwords as a frozenset, built once and then reused.'''
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def text_process(text):
    '''
    Cleans one document and returns it as a single space-separated string.

    Steps:
        1. Tokenizes on runs of word characters (which drops punctuation)
        2. Lowercases the tokens and removes English stopwords
        3. Lemmatizes each remaining token
        4. Drops the first stray 'b' token, if there is one
        5. Joins the tokens back together with single spaces

    Parameters
    ----------
    text : str or missing value
        Raw text. Anything pd.isnull treats as missing (None, NaN) is accepted.

    Returns
    -------
    str
        The cleaned text; the empty string when `text` is missing or when
        no token survives the cleaning.
    '''
    if pd.isnull(text):
        # Return the same type as every other path so a column built with
        # .apply(text_process) holds only strings.
        return ""

    # tokenizing and removing punctuation
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # removing any stopwords; the stopword set is looked up once per call
    # (and built only once overall) instead of once per token
    stop_words = _english_stopwords()
    lowered = (token.lower() for token in tokens)
    kept = [token for token in lowered if token not in stop_words]

    # lemmatizing
    lemmatizer = WordNetLemmatizer()
    cleaned = [lemmatizer.lemmatize(token) for token in kept]

    # Only the first 'b' token is removed.
    # NOTE(review): presumably the leftover prefix of a stringified bytes
    # literal (b'...') -- confirm against the raw data.
    if 'b' in cleaned:
        cleaned.remove('b')

    return " ".join(cleaned)

In [23]:
# Spot-check the cleaner on the tweet stored under row label 69034.
text_process(senate.loc[69034, 'text'])

'privilege meeting national cherry queen danielle bott yesterday thank making michigan proud http co sqnyowmoqp'

In [24]:
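# Be careful running this cell: it takes a long time, so the line below is left commented out to be safe.
# NOTE(review): later cells read the 'text_processed' column, so with this line disabled
# that column must already be present in the loaded data -- confirm.
# senate['text_processed'] = senate['text'].apply(text_process)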

In [25]:
# Preview the first five rows of the frame.
senate.head(n=5)

Unnamed: 0,created_at,id_str,reply_count,retweet_count,text,user,name,state,party,the_ratio,text_processed
0,Sun May 21 19:26:26 +0000 2017,8.66e+17,116.0,174.0,"Franni here. Since it's Al's birthday, and sin...",alfranken,Al Franken,New York,Democratic,0.666667,franni since al birthday since working hard wa...
1,Tue May 16 01:44:44 +0000 2017,8.64e+17,718.0,1248.0,This is profoundly troubling. Why would Presid...,alfranken,Al Franken,New York,Democratic,0.575321,profoundly troubling would president trump giv...
2,Wed May 10 19:55:37 +0000 2017,8.62e+17,218.0,1334.0,It couldn't be clearer: we need an independent...,alfranken,Al Franken,New York,Democratic,0.163418,clearer need independent investigation preside...
3,Wed May 10 19:54:55 +0000 2017,8.62e+17,134.0,794.0,More troubling news: AG Sessions was involved ...,alfranken,Al Franken,New York,Democratic,0.168766,troubling news ag session involved firing jeff...
4,Wed May 10 19:54:20 +0000 2017,8.62e+17,131.0,556.0,Troubling news that you probably know by now: ...,alfranken,Al Franken,New York,Democratic,0.235612,troubling news probably know president trump f...


In [26]:
# Write the DataFrame out as a .csv, leaving the row index out of the file.
senate.to_csv(path_or_buf='./data/senate_processed.csv', index=False)

### Predicting partisanship

In [28]:
# Number of tweets per party label.
senate.loc[:, 'party'].value_counts()

Republican     115566
Democratic      98558
Independent      5415
Name: party, dtype: int64

In [30]:
# Only Democratic vs. Republican partisanship is predicted, so rows labelled
# 'Independent' are dropped before modelling.
bipartisan = senate.loc[senate['party'].ne('Independent')]
bipartisan.loc[:, 'party'].value_counts()

Republican    115566
Democratic     98558
Name: party, dtype: int64

In [31]:
# Baseline accuracy: the share of rows belonging to the most frequent party,
# i.e. the score obtained by always predicting the majority class.
# value_counts() orders counts from largest to smallest, so the first entry is
# the majority count. It is taken with .iloc[0] because plain [0] on a Series
# indexed by party names relies on a deprecated positional fallback.
baseline_accuracy = bipartisan['party'].value_counts().iloc[0] / len(bipartisan['party'])
baseline_accuracy

0.5397153051502868

In [32]:
# Setting up X (the cleaned tweet text) and y (the party label)
X = bipartisan['text_processed']
y = bipartisan['party']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the encoder on the full label column, then turn both label sets into integer codes
le = LabelEncoder().fit(y)
y_train, y_test = le.transform(y_train), le.transform(y_test)

In [35]:
%%time

# Creating a pipeline to CountVectorize and run a RandomForestClassifier.
# The forest is seeded (same seed as the train/test split) so the two scores
# printed below are reproducible from run to run; they may differ slightly
# from numbers produced by earlier, unseeded runs.

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('rfc', RandomForestClassifier(random_state=42))
])
pipeline.fit(X_train, y_train)

# Accuracy on the training rows, then on the held-out rows
print(pipeline.score(X_train, y_train))
print(pipeline.score(X_test, y_test))

0.991386787291675
0.7614963105949749
CPU times: user 4min 32s, sys: 1.58 s, total: 4min 33s
Wall time: 4min 34s


In [37]:
# Predict the encoded party label for every held-out tweet
y_preds = pipeline.predict(X_test)

# Confusion matrix with totals: rows are the actual labels, columns the predicted ones
rfc_confusion = pd.crosstab(
    index=y_test,
    columns=y_preds,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
rfc_confusion

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,22387,6993,29380
1,8328,26530,34858
All,30715,33523,64238


In [None]:
%%time

# Same CountVectorizer front end as before, now feeding a KNeighborsClassifier.
# Note that this rebinds `pipeline`, replacing the previously fitted one.

pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('KNN', KNeighborsClassifier()),
]).fit(X_train, y_train)

# Accuracy on the training rows, then on the held-out rows
print(pipeline.score(X_train, y_train))
print(pipeline.score(X_test, y_test))

In [None]:
# Predict the encoded party label for every held-out tweet
y_preds = pipeline.predict(X_test)

# Confusion matrix with totals: rows are the actual labels, columns the predicted ones
knn_confusion = pd.crosstab(
    index=y_test,
    columns=y_preds,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
knn_confusion