In [59]:
#Imports

In [290]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

In [61]:
import warnings; warnings.simplefilter('ignore')

In [62]:
politics_reddit=pd.read_csv('politics_reddit.csv',index_col='Unnamed: 0')
politics_reddit.head()

Unnamed: 0,titles,marker,neg,neu,pos,compound,title_length
0,Elizabeth Warren is Officially Leading the 202...,D,0.0,1.0,0.0,0.0,53
1,"Joe Biden: Trump won’t destroy me, and he won’...",D,0.196,0.804,0.0,-0.875,261
2,"Tucker Carlson attacks the awful Shep Smith, a...",R,0.312,0.688,0.0,-0.7096,104
3,How California is using the courts to fight th...,D,0.181,0.819,0.0,-0.4939,144
4,Adam Schiff has 2 aides who worked with whistl...,R,0.0,1.0,0.0,0.0,68


In [63]:
#Train Test Split and check for balanced classes

In [64]:
X=politics_reddit['titles']
y=politics_reddit['marker'].map(lambda x: 1 if x=='R' else 0)
X_train, X_test, y_train, y_test =train_test_split(X, y, random_state=42, stratify=y)

In [65]:
y.value_counts() #1 is outbalanced by 2 to 1 leading to stratification of y

0    624
1    314
Name: marker, dtype: int64

In [66]:
y.mean() #baseline score

0.3347547974413646

In [67]:
(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

((703,), (703,), (235,), (235,))

In [68]:
#model instantiation and scoring, uses a mixture of unregularized and gridsearching

In [289]:
cv_lr_pipe=Pipeline([
    ( 'cv', CountVectorizer()),
    ('lr',LogisticRegressionCV())
])

cv_lr_pipe.fit(X_train,y_train)
print(f'Best Training Accuracy Score:{cv_lr_pipe.score(X_train,y_train)}')
print(f'Best Training Accuracy Score:{cv_lr_pipe.score(X_test,y_test)}')

Best Training Accuracy Score:0.9601706970128022
Best Training Accuracy Score:0.6978723404255319


In [70]:
params_cv_lr={
    'cv__stop_words':[None,'english'],
    'cv__ngram_range':[(1,1),(1,2),(1,3)],
    'lr__Cs':[np.logspace(1,10,1)]}

gs_cv_lr=GridSearchCV(cv_lr_pipe,
                     params_cv_lr,
                     cv=3)
gs_cv_lr.fit(X_train,y_train)
print(f'Best params:{gs_cv_lr.best_params_}')
print(f'Best Training Accuracy Score:{gs_cv_lr.best_score_}')
print(f'Best Test Accuracy Score:{gs_cv_lr.score(X_test,y_test)}')

Best params:{'cv__ngram_range': (1, 3), 'cv__stop_words': None, 'lr__Cs': array([10.])}
Best Training Accuracy Score:0.7140825035561877
Best Test Accuracy Score:0.7319148936170212


In [71]:
tf_lr_pipe=Pipeline([
    ('tf', TfidfVectorizer()),
    ('lr',LogisticRegressionCV())
])

tf_lr_pipe.fit(X_train,y_train)
print(f'Best Training Accuracy Score:{tf_lr_pipe.score(X_train,y_train)}')
print(f'Best Training Accuracy Score:{tf_lr_pipe.score(X_test,y_test)}')

Best Training Accuracy Score:1.0
Best Training Accuracy Score:0.7191489361702128


In [72]:
params_tf_lr={
    'tf__stop_words':[None,'english'],
    'tf__ngram_range':[(1,1),(1,2),(1,3)],
    'lr__Cs':[np.logspace(1,10,1)]}

gs_tf_lr=GridSearchCV(tf_lr_pipe,
                     params_tf_lr,
                     cv=3)

gs_tf_lr.fit(X_train,y_train)
print(f'Best params:{gs_tf_lr.best_params_}')
print(f'Best Training Accuracy Score:{gs_tf_lr.best_score_}')
print(f'Best Test Accuracy Score:{gs_tf_lr.score(X_test,y_test)}')

Best params:{'lr__Cs': array([10.]), 'tf__ngram_range': (1, 2), 'tf__stop_words': 'english'}
Best Training Accuracy Score:0.6913229018492176
Best Test Accuracy Score:0.7021276595744681


In [73]:
cv_mbn_pipe=Pipeline([
    ('cv', CountVectorizer()),
    ('mbn',MultinomialNB())
])
cv_mbn_pipe.fit(X_train,y_train)
print(f'Best Training Accuracy Score:{cv_mbn_pipe.score(X_train,y_train)}')
print(f'Best Training Accuracy Score:{cv_mbn_pipe.score(X_test,y_test)}')

Best Training Accuracy Score:0.9260312944523471
Best Training Accuracy Score:0.6808510638297872


In [74]:
params_cv={
    'cv__stop_words':[None,'english'],
    'cv__ngram_range':[(1,1),(1,2),(1,3)]}

gs_cv_mbn=GridSearchCV(cv_mbn_pipe,
                     params_cv,
                     cv=3)

gs_cv_mbn.fit(X_train,y_train)

print(f'Best params:{gs_cv_mbn.best_params_}')
print(f'Best Training Accuracy Score:{gs_cv_mbn.best_score_}')
print(f'Best Test Accuracy Score:{gs_cv_mbn.score(X_test,y_test)}')

Best params:{'cv__ngram_range': (1, 2), 'cv__stop_words': None}
Best Training Accuracy Score:0.6799431009957326
Best Test Accuracy Score:0.6680851063829787


In [75]:
tf_mbn_pipe=Pipeline([
    ('tf', TfidfVectorizer()),
    ('mbn',MultinomialNB())
])

tf_mbn_pipe.fit(X_train,y_train)
print(f'Best Training Accuracy Score:{tf_mbn_pipe.score(X_train,y_train)}')
print(f'Best Training Accuracy Score:{tf_mbn_pipe.score(X_test,y_test)}')

Best Training Accuracy Score:0.7795163584637269
Best Training Accuracy Score:0.6680851063829787


In [76]:
params_tf={
    'tf__stop_words':[None,'english'],
    'tf__ngram_range':[(1,1),(1,2),(1,3)]}

gs_tf_mbn=GridSearchCV(tf_mbn_pipe,
                     params_tf,
                     cv=3)
gs_tf_mbn.fit(X_train,y_train)

print(f'Best params:{gs_tf_mbn.best_params_}')
print(f'Best Training Accuracy Score:{gs_tf_mbn.best_score_}')
print(f'Best Test Accuracy Score:{gs_tf_mbn.score(X_test,y_test)}')

Best params:{'tf__ngram_range': (1, 1), 'tf__stop_words': 'english'}
Best Training Accuracy Score:0.6770981507823614
Best Test Accuracy Score:0.6723404255319149


In [77]:
cv_bnb_pipe=Pipeline([
    ('cv', CountVectorizer()),
    ('bnb',BernoulliNB())
])

cv_bnb_pipe.fit(X_train,y_train)
print(f'Best Training Accuracy Score:{cv_bnb_pipe.score(X_train,y_train)}')
print(f'Best Training Accuracy Score:{cv_bnb_pipe.score(X_test,y_test)}')

Best Training Accuracy Score:0.9359886201991465
Best Training Accuracy Score:0.6851063829787234


In [78]:
params_cv={
    'cv__stop_words':[None,'english'],
    'cv__ngram_range':[(1,1),(1,2),(1,3)]}

gs_cv_bnb=GridSearchCV(cv_bnb_pipe,
                     params_cv,
                     cv=3)

gs_cv_bnb.fit(X_train,y_train)

print(f'Best params:{gs_cv_bnb.best_params_}')
print(f'Best Training Accuracy Score:{gs_cv_bnb.best_score_}')
print(f'Best Test Accuracy Score:{gs_cv_bnb.score(X_test,y_test)}')

Best params:{'cv__ngram_range': (1, 1), 'cv__stop_words': None}
Best Training Accuracy Score:0.6742532005689901
Best Test Accuracy Score:0.6851063829787234


In [79]:
tf_bnb_pipe=Pipeline([
    ('tf', TfidfVectorizer()),
    ('bnb',BernoulliNB())])

tf_bnb_pipe.fit(X_train,y_train)

print(f'Best Training Accuracy Score:{tf_bnb_pipe.score(X_train,y_train)}')
print(f'Best Training Accuracy Score:{tf_bnb_pipe.score(X_test,y_test)}')

Best Training Accuracy Score:0.9359886201991465
Best Training Accuracy Score:0.6851063829787234


In [80]:
params_tf={
    'tf__stop_words':[None,'english'],
    'tf__ngram_range':[(1,1),(1,2),(1,3)]}


gs_tf_bnb=GridSearchCV(tf_bnb_pipe,
                     params_tf,
                     cv=3)

gs_tf_bnb.fit(X_train,y_train)

print(f'Best params:{gs_tf_bnb.best_params_}')
print(f'Best Training Accuracy Score:{gs_tf_bnb.best_score_}')
print(f'Best Test Accuracy Score:{gs_tf_bnb.score(X_test,y_test)}')

Best params:{'tf__ngram_range': (1, 1), 'tf__stop_words': None}
Best Training Accuracy Score:0.6742532005689901
Best Test Accuracy Score:0.6851063829787234


In [81]:
cv_knn_pipe=Pipeline([
    ('cv', CountVectorizer()),
    ('knn',KNeighborsClassifier())])

cv_knn_pipe.fit(X_train,y_train)
print(f'Best Training Accuracy Score:{cv_knn_pipe.score(X_train,y_train)}')
print(f'Best Training Accuracy Score:{cv_knn_pipe.score(X_test,y_test)}')

Best Training Accuracy Score:0.6813655761024182
Best Training Accuracy Score:0.4765957446808511


In [82]:
params_cv_knn={
    'cv__stop_words':[None,'english'],
    'cv__ngram_range':[(1,1),(1,2),(1,3)],
    'knn__n_neighbors':[3,5,7,9,15],
    'knn__p':[1,2]}

gs_cv_knn=GridSearchCV(cv_knn_pipe,
                     params_cv_knn,
                     cv=3)

gs_cv_knn.fit(X_train,y_train)
print(f'Best params:{gs_cv_knn.best_params_}')
print(f'Best Training Accuracy Score:{gs_cv_knn.best_score_}')
print(f'Best Test Accuracy Score:{gs_cv_knn.score(X_test,y_test)}')

Best params:{'cv__ngram_range': (1, 1), 'cv__stop_words': 'english', 'knn__n_neighbors': 3, 'knn__p': 2}
Best Training Accuracy Score:0.5419630156472262
Best Test Accuracy Score:0.5957446808510638


In [83]:
tf_knn_pipe=Pipeline([
    ('tf', TfidfVectorizer()),
    ('knn',KNeighborsClassifier())])

tf_knn_pipe.fit(X_train,y_train)
print(f'Best Training Accuracy Score:{tf_knn_pipe.score(X_train,y_train)}')
print(f'Best Training Accuracy Score:{tf_knn_pipe.score(X_test,y_test)}')

Best Training Accuracy Score:0.7695590327169275
Best Training Accuracy Score:0.676595744680851


In [84]:
params_tf_knn={
    'tf__stop_words':[None,'english'],
    'tf__ngram_range':[(1,1),(1,2),(1,3)],
    'knn__n_neighbors':[3,5,7,9,15],
    'knn__p':[1,2]}

gs_tf_knn=GridSearchCV(tf_knn_pipe,
                     params_tf_knn,
                     cv=3)

gs_tf_knn.fit(X_train,y_train)
print(f'Best params:{gs_tf_knn.best_params_}')
print(f'Best Training Accuracy Score:{gs_tf_knn.best_score_}')
print(f'Best Test Accuracy Score:{gs_tf_knn.score(X_test,y_test)}')

Best params:{'knn__n_neighbors': 9, 'knn__p': 2, 'tf__ngram_range': (1, 1), 'tf__stop_words': None}
Best Training Accuracy Score:0.6785206258890469
Best Test Accuracy Score:0.6595744680851063


In [85]:
#gather words and coefs

In [86]:
cv_lr_pipe.named_steps['cv']

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [87]:
words=cv_lr_pipe.named_steps['cv'].get_feature_names()

In [88]:
cv_lr_coef=cv_lr_pipe.named_steps['lr'].coef_[0]

In [89]:
print(tf_lr_pipe.named_steps['tf'].get_feature_names()[-1])
len(tf_lr_pipe.named_steps['tf'].get_feature_names())

zuckerberg


3382

In [90]:
tf_lr_coef=tf_lr_pipe.named_steps['lr'].coef_[0]

In [91]:
cv_mbn_coef=cv_mbn_pipe.named_steps['mbn'].coef_[0]

In [92]:
tf_mbn_coef=tf_mbn_pipe.named_steps['mbn'].coef_[0]

In [93]:
tf_mbn_coef=tf_mbn_pipe.named_steps['mbn'].coef_[0]

In [94]:
cv_bnb_coef=cv_bnb_pipe.named_steps['bnb'].coef_[0]

In [95]:
tf_bnb_coef=tf_bnb_pipe.named_steps['bnb'].coef_[0]

In [96]:
#create dataframe for further modeling

In [176]:
words_df=pd.DataFrame({'words':words,
                      'cv_lr_coef':cv_lr_coef,
                      'tf_lr_coef':tf_lr_coef,
                      'cv_mbn_coef':cv_mbn_coef,
                      'tf_mbn_coef':tf_mbn_coef,
                      'cv_bnb_coef':cv_bnb_coef,
                      'tf_bnb_coef':tf_bnb_coef})

In [98]:
words_df.head(20)

Unnamed: 0,words,cv_lr_coef,tf_lr_coef,cv_mbn_coef,tf_mbn_coef,cv_bnb_coef,tf_bnb_coef
0,000,-0.138481,-1.090025,-8.737292,-8.324987,-5.46806,-5.46806
1,09,-0.043993,-0.406573,-8.737292,-8.324987,-5.46806,-5.46806
2,10,-0.09846,-0.66832,-8.737292,-8.324987,-5.46806,-5.46806
3,100,-0.048649,-0.329723,-8.044145,-8.119376,-4.774913,-4.774913
4,10m,-0.056806,-0.280052,-8.737292,-8.324987,-5.46806,-5.46806
5,11,-0.179016,-1.202437,-8.737292,-8.324987,-5.46806,-5.46806
6,11th,-0.014776,-0.206866,-8.737292,-8.324987,-5.46806,-5.46806
7,12,0.037105,-0.105699,-8.044145,-8.122278,-4.774913,-4.774913
8,120,-0.006599,-0.178084,-8.737292,-8.324987,-5.46806,-5.46806
9,13,-0.164234,-1.111443,-8.737292,-8.324987,-5.46806,-5.46806


In [99]:
words_df.tail(20)

Unnamed: 0,words,cv_lr_coef,tf_lr_coef,cv_mbn_coef,tf_mbn_coef,cv_bnb_coef,tf_bnb_coef
3362,wrote,0.222833,2.057842,-8.044145,-7.956542,-4.774913,-4.774913
3363,wsj,-0.101946,-0.934357,-8.737292,-8.324987,-5.46806,-5.46806
3364,wtmf,-0.019635,-0.310617,-8.737292,-8.324987,-5.46806,-5.46806
3365,wwii,0.161972,1.566688,-8.044145,-7.99226,-4.774913,-4.774913
3366,yang,-0.309954,-2.591764,-8.737292,-8.324987,-5.46806,-5.46806
3367,yascha,-0.002525,-0.102047,-8.737292,-8.324987,-5.46806,-5.46806
3368,year,-0.086079,-0.52972,-8.044145,-8.148265,-4.774913,-4.774913
3369,years,-0.088446,-0.9301,-7.63868,-8.003708,-4.369448,-4.369448
3370,yes,-0.060752,-0.795286,-8.737292,-8.324987,-5.46806,-5.46806
3371,yet,-0.122053,-1.065893,-8.737292,-8.324987,-5.46806,-5.46806


In [100]:
words_df.sample(20)

Unnamed: 0,words,cv_lr_coef,tf_lr_coef,cv_mbn_coef,tf_mbn_coef,cv_bnb_coef,tf_bnb_coef
3224,vision,-0.006599,-0.178084,-8.737292,-8.324987,-5.46806,-5.46806
476,bullet,0.066419,0.55139,-8.044145,-8.075684,-4.774913,-4.774913
2877,strength,-0.02388,-0.286258,-8.737292,-8.324987,-5.46806,-5.46806
2737,shut,0.061488,0.639397,-8.044145,-7.997804,-4.774913,-4.774913
422,body,-0.052449,-0.325377,-8.737292,-8.324987,-5.46806,-5.46806
193,amid,-0.00147,-0.109937,-8.737292,-8.324987,-5.46806,-5.46806
584,circle,-0.003095,-0.185875,-8.737292,-8.324987,-5.46806,-5.46806
2317,principle,-0.001057,-0.15603,-8.737292,-8.324987,-5.46806,-5.46806
150,ago,0.096302,0.878572,-8.044145,-8.098112,-4.774913,-4.774913
1859,manufacturing,-0.072562,-0.652881,-8.737292,-8.324987,-5.46806,-5.46806


In [103]:
words_df.to_csv('words_coef.csv')

In [115]:
#Confusion Matrix Using the optimized TF LR model

In [251]:
y_preds_best=gs_cv_lr.predict(X_test)

In [252]:
tn, fp, fn, tp=confusion_matrix(y_test,y_preds_best).ravel()

In [253]:
def error_rate(tn, fp, fn, tp):
    return np.sum([fp, fn])/np.sum([tn, fp, fn, tp])

In [254]:
def accuracy(a, b, c, d):
    return np.sum([d,a])/np.sum([a, b, c, d])

In [255]:
def recall(a,b):
    return b/np.sum([a, b])

In [256]:
def specificity(a,b):
    return a/np.sum([a, b])

In [257]:
def false_positive(a,b):
    return b/np.sum([a, b])

In [258]:
def percession(a,b):
    return b/np.sum([a, b])

In [259]:
tn, fp, fn, tp

(129, 27, 36, 43)

In [260]:
error_rate(tn, fp, fn, tp)

0.2680851063829787

In [261]:
accuracy(tn, fp, fn, tp)

0.7319148936170212

In [262]:
recall(tp,fn)

0.45569620253164556

In [263]:
specificity(tn,fp)

0.8269230769230769

In [264]:
percession(tp,fp)

0.38571428571428573

In [265]:
false_positive(fp,tn)

0.8269230769230769

In [266]:
test_phrases_df=pd.DataFrame({'test_words':X_test,
                      'true_values':y_test,
                      'pred_values':y_preds_best})

In [267]:
test_phrases_df.head()

Unnamed: 0,test_words,true_values,pred_values
400,Trump gives green light to Turkey to attack.,0,1
136,"The Lights Are Out in California, And That Was the Plan All Along",1,0
292,"David Frum on Twitter: ""Congress did not vote aid to Ukraine as an act of charity. Congress beli...",0,0
822,Committees Ready Subpoena for White House After Ukraine Documents Withheld for Weeks,0,0
927,"Second whistleblower about Ukraine phone call coming forward. Uhm, dumbasses, we have the damn t...",1,0


In [269]:
test_phrases_df.reset_index(inplace=True)
test_phrases_df.drop(columns='index',inplace=True)

In [274]:
test_phrases_df['good_preds'] = (test_phrases_df['true_values'] == test_phrases_df['pred_values']).astype(int)

In [281]:
pd.set_option('max_colwidth', 400)

In [285]:
test_phrases_df[test_phrases_df['good_preds']==0]

Unnamed: 0,test_words,true_values,pred_values,good_preds
0,Trump gives green light to Turkey to attack.,0,1,0
1,"The Lights Are Out in California, And That Was the Plan All Along",1,0,0
4,"Second whistleblower about Ukraine phone call coming forward. Uhm, dumbasses, we have the damn transcript.",1,0,0
6,Andrew McCabe Does an Interview and Makes a Stunning Admission,1,0,0
8,Yang Gang Lights Up L.A.,0,1,0
...,...,...,...,...
217,"For anyone in Edinburgh, Scotland - the Jordan Peterson documentary is coming to a theatre near you!",1,0,0
219,Biden’s Most Formidable Opponent Is Not Another Democrat - Questions about his age have dogged the former vice president throughout the primary.,0,1,0
220,"Mike Ghassali is running as a Republican for Congress in NJ-05. He is a legal immigrant from Syria, Mayor of Montvale, and a patriotic American! He has lowered taxes, and refused to make his town a “sanctuary city”. He has raised over 300k since July to unseat Josh Gottheimer. Let’s put NJ first!",1,0,0
231,Just Remember: Roy Cohn Taught Him His ABCs”,0,1,0


In [283]:
test_phrases_df.to_csv('test_phrase_df.csv')