# Natural Language Processing for Campaign Speeches

In [614]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import timeit

%matplotlib inline

import wikipedia
import spacy
from textblob import TextBlob, Word

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [615]:
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

In [679]:
df_kerry = pd.read_csv('./csv_files/speech_kerry')
df_bush = pd.read_csv('./csv_files/speech_bush')
df_obama = pd.read_csv('./csv_files/speech_obama')
df_mccain = pd.read_csv('./csv_files/speech_mccain')
df_obama_2 = pd.read_csv('./csv_files/speech_obama_2')
df_romney = pd.read_csv('./csv_files/speech_romney')
df_clinton = pd.read_csv('./csv_files/speech_clinton')
df_trump = pd.read_csv('./csv_files/speech_trump')

In [680]:
df_kerry['party'] = 'dem'
df_bush['party'] = 'rep'
df_obama['party'] = 'dem'
df_mccain['party'] = 'rep'
df_obama_2['party'] = 'dem'
df_romney['party'] = 'rep'
df_clinton['party'] = 'dem'
df_trump['party'] = 'rep'

In [681]:
df = pd.concat([df_kerry, df_bush, df_obama, df_mccain, df_obama_2, df_romney, 
                df_clinton, df_trump], axis=0, ignore_index=True)

In [682]:
df.reindex()
df.shape

(1004, 4)

# Preprocessing

In [695]:
df['speech'] = df['speech'].apply(lambda x: x.replace("\\", "'"))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("kerry", ""))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("bush", ""))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("obama", ""))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("mccain", ""))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("romeny", ""))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("clinton", ""))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("trump", ""))

In [658]:
subjectivity = []
polarity = []
for i in range(0, len(df.speech)):
    sub = TextBlob(df.speech[i]).sentiment.subjectivity
    subjectivity.append(sub)
    
    pol = TextBlob(df.speech[i]).sentiment.polarity
    polarity.append(pol)

    
df['subjectivity'] = subjectivity
df['polarity'] = polarity

In [664]:
docs = df.speech

sp = []
for speech in docs:
    stem_list = []
    speech_new = []
    speech = speech.split()
    for word in speech:
        w = Word(word)
        stem = w.stem()
        stem_list.append(stem)
    speech_new = ' '.join(stem_list)
    sp.append(speech_new)

df.speech = sp

In [665]:
df.head()

Unnamed: 0,candidate,party,speech,year,subjectivity,polarity
0,kerry,1,"thank you, max cleland for your friendship, yo...",2004,0.470254,0.14997
1,kerry,1,good morn and thank you for invit me into thi ...,2004,0.506184,0.062693
2,kerry,1,we are here tonight to honor our next gener of...,2004,0.456656,0.158972
3,kerry,1,"today, from florida to mississippi, to louisia...",2004,0.454353,0.145098
4,kerry,1,"thank you john, rich, linda, and member of the...",2004,0.422159,0.132524


In [666]:
df.party = df.party.apply(lambda x: 1 if x == 'dem' else 0)
X = df[['speech', 'subjectivity', 'polarity']]
y = df.party

In [667]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

# Modeling Straight Predictions

Without Truncating

In [669]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(X_train.speech)
X_train_tf_tsvd = tfidf.transform(X_train.speech)


X_train_tf_tsvd = pd.DataFrame(X_train_tf_tsvd.todense())


train_sub_pol = X_train[['subjectivity', 'polarity']].reset_index().drop(['index'], axis=1)
X_train_tf_tsvd = pd.concat([X_train_tf_tsvd, train_sub_pol], axis=1)



rfc = RandomForestClassifier(n_estimators=100, max_depth=5)
rfc.fit(X_train_tf_tsvd, y_train)
train_predictions = rfc.predict(X_train_tf_tsvd)
print(rfc.score(X_train_tf_tsvd, y_train))
print(confusion_matrix(y_train, train_predictions))
print(classification_report(y_train, train_predictions))

X_test_tf_tsvd = tfidf.transform(X_test.speech)
X_test_tf_tsvd = pd.DataFrame(X_test_tf_tsvd.todense())

test_sub_pol = X_test[['subjectivity', 'polarity']].reset_index().drop(['index'], axis=1)
X_test_tf_tsvd = pd.concat([X_test_tf_tsvd, test_sub_pol], axis=1)

test_predictions = rfc.predict(X_test_tf_tsvd)
print(rfc.score(X_test_tf_tsvd, y_test))
print(confusion_matrix(y_test, test_predictions))
print(classification_report(y_test, test_predictions))

1.0
[[602]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       602

avg / total       1.00      1.00      1.00       602

1.0
[[402]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00       402

avg / total       1.00      1.00      1.00       402



In [663]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(X_train.speech)
X_train_tf_tsvd = tfidf.transform(X_train.speech)

tsvd = TruncatedSVD(n_components=100)
tsvd.fit(X_train_tf)
X_train_tf_tsvd = tsvd.transform(X_train_tf)

X_train_tf_tsvd = pd.DataFrame(X_train_tf_tsvd.todense())


train_sub_pol = X_train[['subjectivity', 'polarity']].reset_index().drop(['index'], axis=1)
X_train_tf_tsvd = pd.concat([X_train_tf_tsvd, train_sub_pol], axis=1)




rfc = RandomForestClassifier(n_estimators=100, max_depth=5)
rfc.fit(X_train_tf_tsvd, y_train)
train_predictions = rfc.predict(X_train_tf_tsvd)
print(rfc.score(X_train_tf_tsvd, y_train))
print(confusion_matrix(y_train, train_predictions))
print(classification_report(y_train, train_predictions))

X_test_tf_tsvd = tfidf.transform(X_test.speech)
X_test_tf_tsvd = pd.DataFrame(X_test_tf_tsvd.todense())

test_sub_pol = X_test[['subjectivity', 'polarity']].reset_index().drop(['index'], axis=1)
X_test_tf_tsvd = pd.concat([X_test_tf_tsvd, test_sub_pol], axis=1)

test_predictions = rfc.predict(X_test_tf_tsvd)
print(rfc.score(X_test_tf_tsvd, y_test))
print(confusion_matrix(y_test, test_predictions))
print(classification_report(y_test, test_predictions))

0.976744186047
[[234  13]
 [  1 354]]
             precision    recall  f1-score   support

          0       1.00      0.95      0.97       247
          1       0.96      1.00      0.98       355

avg / total       0.98      0.98      0.98       602

0.900497512438
[[106  35]
 [  5 256]]
             precision    recall  f1-score   support

          0       0.95      0.75      0.84       141
          1       0.88      0.98      0.93       261

avg / total       0.91      0.90      0.90       402



With Truncating

In [99]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(X_train)
X_train_tf = tfidf.transform(X_train)

tsvd = TruncatedSVD(n_components=100)
tsvd.fit(X_train_tf)
X_train_tf_tsvd = tsvd.transform(X_train_tf)

rfc = RandomForestClassifier(n_estimators=100, max_depth=5)
rfc.fit(X_train_tf_tsvd, y_train)
train_predictions = rfc.predict(X_train_tf_tsvd)
print(rfc.score(X_train_tf_tsvd, y_train))
print(confusion_matrix(y_train, train_predictions))
print(classification_report(y_train, train_predictions))

X_test_tf = tfidf.transform(X_test)
X_test_tf_tsvd = tsvd.transform(X_test_tf)
test_predictions = rfc.predict(X_test_tf_tsvd)
print(rfc.score(X_test_tf_tsvd, y_test))
print(confusion_matrix(y_test, test_predictions))
print(classification_report(y_test, test_predictions))

0.961794019934
[[211  23]
 [  0 368]]
             precision    recall  f1-score   support

          0       1.00      0.90      0.95       234
          1       0.94      1.00      0.97       368

avg / total       0.96      0.96      0.96       602

0.830845771144
[[ 88  66]
 [  2 246]]
             precision    recall  f1-score   support

          0       0.98      0.57      0.72       154
          1       0.79      0.99      0.88       248

avg / total       0.86      0.83      0.82       402



In [209]:
nlp = spacy.load('en_core_web_sm')

In [146]:
doc = nlp(documents)
print(doc)
for token in doc:
    print(token.text, token.pos_)

today, from florida to mississippi, to louisiana and texas, the message once again rings loud and clear: change is coming to america.this nation is demanding for leadership that takes us in a new direction. and george  has made it clear that he's not going to provide it.george w.  is running on the slogan of ‘steady leadership.' but after four years of the same old failed policies, what we've seen is ‘stubborn leadership.'george  stubbornly proposes tax cut after tax cut for the wealthiest americans while we steadily lose millions of jobs. so tonight we say: change is coming to america.george  stubbornly refuses to help families afford health care while premiums steadily rise through the roof. and tonight we say: change is coming to america.this president stubbornly continues to let polluters rewrite our environmental laws while children steadily breathe dirtier and dirtier air. tonight we say: change is coming to america.george  stubbornly insists on a foreign policy where we go it al

In [147]:
documents

"today, from florida to mississippi, to louisiana and texas, the message once again rings loud and clear: change is coming to america.this nation is demanding for leadership that takes us in a new direction. and george  has made it clear that he's not going to provide it.george w.  is running on the slogan of ‘steady leadership.' but after four years of the same old failed policies, what we've seen is ‘stubborn leadership.'george  stubbornly proposes tax cut after tax cut for the wealthiest americans while we steadily lose millions of jobs. so tonight we say: change is coming to america.george  stubbornly refuses to help families afford health care while premiums steadily rise through the roof. and tonight we say: change is coming to america.this president stubbornly continues to let polluters rewrite our environmental laws while children steadily breathe dirtier and dirtier air. tonight we say: change is coming to america.george  stubbornly insists on a foreign policy where we go it a

In [611]:
cv = CountVectorizer()
cv.fit(X_train.speech)
X_train_v = cv.transform(X_train.speech)

feature_names = cv.get_feature_names()

lda = LatentDirichletAllocation(n_components=5)
lda.fit(X_train_v)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=5, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [612]:
results = pd.DataFrame(lda.components_, columns=feature_names)


results.loc['mean'] = results.mean()
# results.loc[['mean']]
results = results.transpose()


In [613]:
results[['mean']].loc[results['mean'] > 100]

Unnamed: 0,mean
000,216.904627
able,198.869541
about,1205.669817
across,196.411202
act,107.355483
actually,146.698238
administration,151.396315
afford,135.423096
afghanistan,116.4683
after,281.110257
