In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [84]:
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

In [85]:
df_kerry = pd.read_csv('./csv_files/speech_kerry')
df_bush = pd.read_csv('./csv_files/speech_bush')
df_obama = pd.read_csv('./csv_files/speech_obama')
df_mccain = pd.read_csv('./csv_files/speech_mccain')
df_obama_2 = pd.read_csv('./csv_files/speech_obama_2')
df_romney = pd.read_csv('./csv_files/speech_romney')
df_clinton = pd.read_csv('./csv_files/speech_clinton')
df_trump = pd.read_csv('./csv_files/speech_trump')

In [86]:
df_kerry['party'] = 'dem'
df_bush['party'] = 'rep'
df_obama['party'] = 'dem'
df_mccain['party'] = 'rep'
df_obama_2['party'] = 'dem'
df_romney['party'] = 'rep'
df_clinton['party'] = 'dem'
df_trump['party'] = 'rep'

In [87]:
df = pd.concat([df_kerry, df_bush, df_obama, df_mccain, df_obama_2, df_romney, 
                df_clinton, df_trump], axis=0, ignore_index=True)

In [88]:
df.reindex()
df.shape

(1004, 4)

# Preprocessing

In [89]:
df['speech'] = df['speech'].apply(lambda x: x.replace("\'", "'"))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("kerry", ""))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("bush", ""))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("obama", ""))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("mccain", ""))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("romeny", ""))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("clinton", ""))
df['speech'] = df['speech'].apply(lambda x: x.lower().replace("trump", ""))

In [90]:
df.party = df.party.apply(lambda x: 1 if x == 'dem' else 0)
X = df.speech 
y = df.party

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

In [100]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(X_train)
X_train_tf_tsvd = tfidf.transform(X_train)

# tsvd = TruncatedSVD(n_components=100)
# tsvd.fit(X_train_tf)
# X_train_tf_tsvd = tsvd.transform(X_train_tf)

rfc = RandomForestClassifier(n_estimators=100, max_depth=5)
rfc.fit(X_train_tf_tsvd, y_train)
train_predictions = rfc.predict(X_train_tf_tsvd)
print(rfc.score(X_train_tf_tsvd, y_train))
print(confusion_matrix(y_train, train_predictions))
print(classification_report(y_train, train_predictions))

X_test_tf_tsvd = tfidf.transform(X_test)
# X_test_tf_tsvd = tsvd.transform(X_test_tf)
test_predictions = rfc.predict(X_test_tf_tsvd)
print(rfc.score(X_test_tf_tsvd, y_test))
print(confusion_matrix(y_test, test_predictions))
print(classification_report(y_test, test_predictions))

<bound method CountVectorizer.get_feature_names of TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)>
0.950166112957
[[204  30]
 [  0 368]]
             precision    recall  f1-score   support

          0       1.00      0.87      0.93       234
          1       0.92      1.00      0.96       368

avg / total       0.95      0.95      0.95       602

0.833333333333
[[ 89  65]
 [  2 246]]
             precision    recall  f1-score   support

          0       0.98      0.58      0.73       154
          1       0.79      0.99      0.88       248

avg / total       0.86      0.83      0.82    

In [99]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(X_train)
X_train_tf = tfidf.transform(X_train)

tsvd = TruncatedSVD(n_components=100)
tsvd.fit(X_train_tf)
X_train_tf_tsvd = tsvd.transform(X_train_tf)

rfc = RandomForestClassifier(n_estimators=100, max_depth=5)
rfc.fit(X_train_tf_tsvd, y_train)
train_predictions = rfc.predict(X_train_tf_tsvd)
print(rfc.score(X_train_tf_tsvd, y_train))
print(confusion_matrix(y_train, train_predictions))
print(classification_report(y_train, train_predictions))

X_test_tf = tfidf.transform(X_test)
X_test_tf_tsvd = tsvd.transform(X_test_tf)
test_predictions = rfc.predict(X_test_tf_tsvd)
print(rfc.score(X_test_tf_tsvd, y_test))
print(confusion_matrix(y_test, test_predictions))
print(classification_report(y_test, test_predictions))

0.961794019934
[[211  23]
 [  0 368]]
             precision    recall  f1-score   support

          0       1.00      0.90      0.95       234
          1       0.94      1.00      0.97       368

avg / total       0.96      0.96      0.96       602

0.830845771144
[[ 88  66]
 [  2 246]]
             precision    recall  f1-score   support

          0       0.98      0.57      0.72       154
          1       0.79      0.99      0.88       248

avg / total       0.86      0.83      0.82       402



In [101]:
cv = CountVectorizer()
cv.fit(X_train)
cv.transform(X_train)

cv.get_feature_names()


['00',
 '000',
 '0000',
 '01',
 '02',
 '025',
 '026',
 '03',
 '04',
 '05',
 '050',
 '055',
 '06',
 '07',
 '08',
 '09',
 '10',
 '100',
 '1000',
 '1009',
 '100th',
 '101',
 '101st',
 '102',
 '103',
 '104',
 '105',
 '105th',
 '106',
 '107',
 '1079',
 '109',
 '10th',
 '11',
 '110',
 '110th',
 '111434',
 '112',
 '114',
 '115',
 '116',
 '116th',
 '117',
 '118',
 '1189',
 '11th',
 '12',
 '120',
 '1200',
 '1222',
 '123',
 '123rd',
 '124th',
 '125',
 '126th',
 '129th',
 '12th',
 '13',
 '130',
 '1300',
 '130th',
 '134',
 '139',
 '13th',
 '14',
 '140',
 '1400',
 '143',
 '144',
 '145',
 '147',
 '148',
 '14d',
 '14th',
 '15',
 '150',
 '1500',
 '152',
 '155',
 '158',
 '159',
 '15th',
 '16',
 '160',
 '1600',
 '161',
 '1630s',
 '164',
 '165',
 '168',
 '16th',
 '17',
 '170',
 '1700',
 '173',
 '174',
 '175',
 '1776',
 '1777',
 '1778',
 '1787',
 '1789',
 '1791',
 '17th',
 '18',
 '180',
 '1800',
 '1800s',
 '1808',
 '1827',
 '183',
 '1842',
 '1848',
 '185',
 '1850',
 '1858',
 '186',
 '1860',
 '1869',
 '187

In [102]:
lda = LatentDirichletAllocation(n_components=5)
lda.fit(X_train)

ValueError: could not convert string to float: 'todd: we\'ll have my interview this morning with donald  in just a few minutes. but we\'re going to begin with the democrats and former secretary of state, hillary . you liked that "graveyard of pollsters" line, didn\'t you? : well, i\'ve lived through this twice — todd: you\'ve lived through this. : — in \'92 and 2008. and look, i know i\'m behind. and i\'m going to keep fighting until the very last vote is counted because i care about this primary. todd: why do you think you started with such a big deficit this time, considering — look, this has been a big state to both your husband and yourself, very important. : yes. todd: they\'ve always helped you. : well, look, i can\'t sit here and analyze all the reasons. obviously, one is that the senator has been in public life next door for 25 years. so there\'s a familiarity with him. i totally respect that. and i just think that what\'s great about this primary is new hampshire voters take a first, second, third, fourth, fifth look. i mean, yesterday, as i was crisscrossing from one side of the state to the other, talking to voters, i literally had people come up, say you changed my mind, you know, who knows. i mean, this is what\'s so exciting about the primary, chuck. i came in in 2008, as i recall, like 16 points behind. i remember the night before the primary in \'92, bill\'s pollster saying, you know, you\'re in single digits. it\'s over. who knows. i love this excitement. and you know, i\'m going to fight as hard as i can. but i\'m having a great time whatever happens. todd: all right. let\'s — i want to go to the debate a little bit because i would say the foreign policy section is one that you guys wanted to highlight because you felt as if he didn\'t do so well. you brought up secretary of state madeleine albright to talk about it. but some of the things that you said about senator \'s readiness on foreign policy, they are very similar to what you\'ve said about bernie sanders. in \'08, you said, "i have a lifetime of experience that i will bring to the white house. i know senator  has a lifetime of experience that he will bring to the white house." senator  has a speech he gave in 2002; it sounds very similar to what you said when you\'re like, ok, you had one vote on the iraq war — : well, you know what — todd: what\'s the difference between what you said about then- senator  and what you\'re saying about senator sanders? : there\'s a very big difference. in 2008, senator  had really done his homework in the senate. he\'d been there by that time a few years. he had developed a network of advisors on national security and foreign policy issues. they were very diligent and focused on making sure he was ready, that he had as broad a set of views as possible. and they really went toe-to-toe with all the people supporting me. that\'s not happening in this campaign. there really isn\'t any kind of foreign policy network that is supporting and advising senator sanders. i\'ll let him speak for himself. i think that what\'s important is this job requires you to be ready on all aspects of it on the first day. and we know we got a particularly complex world right now. and the president\'s not going to have the time. maybe previous presidents in past years could have a little more leeway because of, you know, the way the world functioned. but now it\'s north korea with its missile tests, it\'s russian aggression, it\'s enforcing the iran agreement. you have to do it all at once. todd: do you think the iraq vote should still matter to voters? : look, i think that voters are perfectly free to take into account anything they want to take. but i also hope they\'ll take the rest of the record. you know, i was involved in the biggest counterterrorism decision in the  administration to determine whether or not to go after bin laden. i did put the sanctions on iran to get them to the negotiating table. i think that this is a debate that the voters really have to pay attention to because it is choosing both a president and a commander in chief. todd: i\'m curious, do you believe if it wasn\'t for the iraq war we wouldn\'t have isis today? : well, i think that\'s a hard conclusion to draw because, remember, we had al qaeda before we had isis. al qaeda attacked us in new york. al qaeda attacked our embassies in africa — [crosstalk] todd: the argument is that the instability in iraq is what has created this and that if we — that if saddam hussein were still there, we wouldn\'t have isis. : well, i think that\'s a lot of jumps in logic that to me doesn\'t really add up. the iraq war, there\'s no doubt, contributed to instability. i\'m not going to in any way deny that. but you cannot draw a direct line. what you can do is to say that jihadist terrorism starting with al qaeda — todd: right. : — and moving onto its latest incarnations, most particularly isis, is in response to a number of forces and factors that are roiling up the middle east. and certainly fighting for what islam means and how it\'s going to be presented and what people are going to mean when they talk about it — so, yes, we\'ve got a much bigger set of problems. todd: all right. another thing i wanted to follow up on the debate. senator sanders called the entire business model of wall street a fraud. we didn\'t get a chance to ask you to respond directly to that critique. i\'d like to ask you to respond to it now. : well, i think it\'s the kind of extreme statement that, once you really take a hard look at it, it\'s hard to understand. you know, when you talk about wall street, are we talking about every bank? or are we talking about a particular part of new york? that\'s never really clarified. what i believe is that there are good actors and bad actors in every part of our economy. the job of the president is to weed out and prevent the bad actors from disrupting economic activity, from amassing too much power and influence. but we live in a complex global economy where we\'ve got to have a good banking system that is able to service the american economy. and it needs to be more than just looking at the five banks that are the big banks. we have to have a much more robust community banking system, regional banking system, other forms of credit access. and that\'s what i am advocating for. and i still do not understand why i\'m having this problem getting senator sanders to join me in going after what are the potential problems that are out there, the shadow banking sector and the investment and hedge fund sector. todd: do you — can you have a treasury secretary who isn\'t familiar with how wall street works? and i say this because i think there\'s so much distrust right now. : yes. there is. there is. todd: on wall street. six of the last treasury secretaries either came from wall street or went to wall street after. i think there certainly right now isn\'t an appetite for somebody from wall street to be the next treasury secretary. and yet can you have a treasury secretary if they don\'t understand wall street? : well, you have to have a treasury secretary who understands the economy, the american economy and the global economy. i think there are a lot more places where one can and should look for such a treasury secretary — [crosstalk] todd: do you think you can pick one without having them have a wall street background? : you know, i want somebody who can make a good commitment to work with me to get the economy moving, to get more good jobs created, to get incomes rising, to look out over the horizon at some of the economic problems that are out there. we\'ve got to figure out what we\'re going to do with china. china is finally having to come to grips with the fact that a lot of its growth may not have been as on a firm foundation as we would hope. so we need people in government who have that kind of commitment and understanding. but we\'ve got to put the needs of the american economy first. and that\'s going to be my commitment. todd: before i let you go, i want to ask you about a comment former secretary of state madeleine albright said — a comment that i\'ve heard her say before. but it sort of rang differently to a lot of people. she said, there\'s a special place in hell for women who don\'t help women. but the implication is that somehow, if you\'re a democratic woman and you\'re not supporting you, what\'s wrong with you? do you want the vote to be decided on gender lines like that? : oh, look, you know, as you remember, madeleine has been saying this for many, many years. todd: starbucks cups, i think, centers on that. i get that. : she believes it firmly and., in part, because she knows what a struggle it has been. and she understands the struggle is not over. so i don\'t want people to be offended by what she is expressing as her very — todd: you can understand why some might have been offended by it? : well, good grief, we\'re getting offended about everything these days. honest to goodness, i mean, people can\'t say anything without offending somebody. she has a life experience that i respect. i admire her greatly. and i think what she was trying to do, which she\'s done in every setting i\'ve ever seen her, in going back 20-plus years, was to remind young women particularly that this struggle, which many of us have been part of is, not over. and don\'t be in any way lulled by the progress we\'ve made. and i think it was a light-hearted but very pointed remark, which people can take however they choose. todd: all right. what do you got in the super bowl? : i don\'t have anybody right now. todd: you don\'t have anybody? : i\'m going to flint, michigan, i\'m worrying about the kids in flint, michigan, trying to figure out what we\'re going to do to make sure they\'re not damaged irreparably by this — todd: so you\'re going today, you could have gone wednesday. that says something about how you feel about new hampshire? : i love new hampshire. and you know, the mayor asked me to come. this was as early as we could get it done. i want to lend my support. i\'m very hopeful the congress, which is trying to work in a bipartisan way, will come up with some funding to deal with these problems that have afflicted the community. and i\'m going to keep doing everything i can to help them. todd: secretary , i know you got a plane to catch. : thanks. great to talk to you.'

In [69]:
results = pd.DataFrame(lda.components_,
                      columns=feature_names)

AttributeError: 'LatentDirichletAllocation' object has no attribute 'components_'