In [404]:
import collections
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
import re
import matplotlib.pyplot as plt
import mpld3
import pandas as pd
import difflib
from gensim.models import doc2vec
import multiprocessing
import random
from datetime import datetime
import sys

### Load president details

In [225]:
prez_dets = pd.read_csv('prez_list.csv', index_col='president_no')
prez_dets.drop_duplicates('president_name', inplace=True)
prez_dets

Unnamed: 0_level_0,president,party,term,vp,term_start,term_end,president_name,president_name_norm,president_birth_dt,president_death_dt
president_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1. George Washington (1732-1799),"None, Federalist",1789-1797,John Adams,1789,1797.0,George Washington,george washington,1732,1799.0
2,2. John Adams (1735-1826),Federalist,1797-1801,Thomas Jefferson,1797,1801.0,John Adams,john adams,1735,1826.0
3,3. Thomas Jefferson (1743-1826),Democratic-Republican,1801-1809,"Aaron Burr, George Clinton",1801,1809.0,Thomas Jefferson,thomas jefferson,1743,1826.0
4,4. James Madison (1751-1836),Democratic-Republican,1809-1817,"George Clinton, Elbridge Gerry",1809,1817.0,James Madison,james madison,1751,1836.0
5,5. James Monroe (1758-1831),Democratic-Republican,1817-1825,Daniel Tompkins,1817,1825.0,James Monroe,james monroe,1758,1831.0
6,6. John Quincy Adams (1767-1848),Democratic-Republican,1825-1829,John Calhoun,1825,1829.0,John Quincy Adams,john quincy adams,1767,1848.0
7,7. Andrew Jackson (1767-1845),Democrat,1829-1837,"John Calhoun, Martin van Buren",1829,1837.0,Andrew Jackson,andrew jackson,1767,1845.0
8,8. Martin van Buren (1782-1862),Democrat,1837-1841,Richard Johnson,1837,1841.0,Martin van Buren,martin van buren,1782,1862.0
9,9. William H. Harrison (1773-1841),Whig,1841,John Tyler,1841,,William H. Harrison,william h harrison,1773,1841.0
10,10. John Tyler (1790-1862),Whig,1841-1845,.,1841,1845.0,John Tyler,john tyler,1790,1862.0


In [3]:
speech_dt = datetime.strptime('January 25, 1979', '%B %d, %Y')
speech_yr = speech_dt.year
speech_yr

1979

## Parse speeches

In [4]:
with open('pg5050.txt') as f:
    raw = f.read()

In [5]:
speeches = raw.split('***')

In [6]:
def rm_empty(x):
    return x is not ''

In [7]:
sou_list = filter(rm_empty, speeches[5].split('\r\n'))[3:]
print '{} speeches found'.format(len(sou_list))

214 speeches found


In [8]:
Speech = collections.namedtuple('Speech', 'speech_type, speaker, party, date, body')

In [9]:
def parse_speech(s):
    try:
        s_paragraphs = filter(rm_empty, s.split('\r\n\r\n'))
        s_header = s_paragraphs[0].split('\r\n')
        s_body = '\n\n'.join(
            filter(lambda x: len(x.split()) > 5, [' '.join(p.split('\r\n')) for p in s_paragraphs[1:]]))
        if s_header[0] != 'State of the Union Address':
            return None
        
        speaker = s_header[1]
        date_str = s_header[2]
        
        speech_dt = datetime.strptime(date_str, '%B %d, %Y')
        speech_yr = speech_dt.year
        
        president = prez_dets[(prez_dets['term_start'] < speech_yr) & (prez_dets['term_end'] >= speech_yr)].iloc[0]
        president_party = president['party']
        president_name = president['president_name']
        
        return Speech(s_header[0], president_name, president_party, date_str, s_body)
    
    except:
        return None

In [10]:
speeches_clean = filter(lambda x: x is not None, [parse_speech(s) for s in speeches[6:]])
assert(len(speeches_clean) == len(sou_list))

## Vectorize speeches

In [11]:
# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
    # Replace special characters with spaces
    norm_text = text.lower()
    norm_text = re.sub(r'\d', '0', norm_text)
    norm_text = norm_text.replace('0.0', '00')
    norm_text = norm_text.replace('0,0', '00')
    norm_text = '0'.join(filter(None, norm_text.split('0')))
    norm_text = norm_text.replace('<br />', ' ')
    norm_text = norm_text.replace('\n', ' ')
    norm_text = norm_text.replace('\t', ' ')
    norm_text = norm_text.replace('\t', ' ')
    # Pad punctuation with spaces on both sides
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    # Consolidate consecutive spaces
    norm_text = ' '.join(norm_text.split())
    return norm_text

In [12]:
docs = []
i = 0
for s in speeches_clean:
    words = normalize_text(s.body.decode('utf-8')).split()
    tags = ['{}; {}'.format(s.speaker, s.date)]
    docs.append(doc2vec.TaggedDocument(words, tags))
    i += 1

### Train model

In [13]:
cores = multiprocessing.cpu_count()
print "{} cores found".format(cores)
assert doc2vec.FAST_VERSION > -1,\
            "this will be painfully slow otherwise"

4 cores found


In [61]:
model = doc2vec.Doc2Vec(dm=1, dm_mean=1, size=300, window=8, negative=2,
                hs=0, min_count=3, workers=cores, iter=5)
model.build_vocab(docs)

In [62]:
model.intersect_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, lockf=1.0)

In [59]:
random.seed(400)
save_ind = True

In [63]:
model.train(docs)

7064235

In [40]:
for epoch in range(3):
    sys.stdout.write('.')
    sys.stdout.flush()
    
    random.shuffle(docs)
    model.train(docs)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

assert model.docvecs.count == len(docs)
if save_ind:
    model.save('doc2vec_dm1')

..........

In [64]:
dlist = []
for d in docs:
    tag = d.tags[0]
    try:
        dv = model.docvecs[tag]
        dlist.append(len(dv))
    except:
        print tag
len(dlist)

214

## Reduce doc vec dims

In [77]:
# Reduce dimensions with truncated SVD then with t-SNE
def reduce_dims(model):
    vectors = [model.docvecs[v.tags][0] for v in docs]
    X_embedded = TSNE(
        n_components=2, perplexity=5).fit_transform(vectors)
    return X_embedded

In [78]:
X_embedded = reduce_dims(model)

### Plot

In [79]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [80]:
prez_list = [s.speaker for s in speeches_clean]
date_list = [int(s.date[-4:]) for s in speeches_clean]
party_list = [s.party for s in speeches_clean]
assert len(prez_list) == len(date_list)
labels = zip(prez_list, party_list, date_list)
labels = ['; '.join([str(i) for i in l]) for l in labels]

In [81]:
i = 1
party_no = {}
for p in list(set(party_list)):
    party_no[p] = i
    i += 1

In [82]:
fig = plt.figure(figsize=(10, 10))
ax = plt.axes(frameon=False)
plt.setp(ax, xticks=(), yticks=())
plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                wspace=0.0, hspace=0.0)
scatter = plt.scatter(X_embedded[:, 0], X_embedded[:, 1], 
                      c=[party_no[p] for p in party_list], marker="x")
tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display()

## Pull speech

In [24]:
speeches_dt = pd.DataFrame(speeches_clean, columns=Speech._fields)

In [25]:
def get_speech(speeches_dt, speaker, year):
    return speeches_dt[(speeches_dt['speaker']==speaker) & (speeches_dt['date'].str[-4:]==str(year))]['body'].iloc[0]

In [27]:
print get_speech(speeches_dt, 'Herbert C. Hoover', 1930)

To the Senate and House of Representatives:

I have the honor to comply with the requirement of the Constitution that I should lay before the Congress information as to the state of the Union, and recommend consideration of such measures as are necessary and expedient.

Substantial progress has been made during the year in national peace and security; the fundamental strength of the Nation's economic life is unimpaired; education and scientific discovery have made advances; our country is more alive to its problems of moral and spiritual welfare.

During the past 12 months we have suffered with other Nations from economic depression.

The origins of this depression lie to some extent within our own borders through a speculative period which diverted capital and energy into speculation rather than constructive enterprise. Had overspeculation in securities been the only force operating, we should have seen recovery many months ago, as these particular dislocations have generally readjust

## Logistic regression

In [85]:
import sklearn

In [163]:
dv = list(model.docvecs)
dt = list(model.docvecs.doctags)

In [389]:
X = pd.DataFrame(dv, index=dt)
X['year'] = df.index.str[-4:]
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,year
"Rutherford Hayes; December 2, 1878",-0.100219,0.108763,-0.004643,0.539669,-0.923998,0.156823,0.383309,-0.331213,0.82465,0.263316,...,1.048688,-0.880303,-0.631754,-0.525346,-0.24164,-0.15724,-0.553723,-0.10449,-0.130653,1878
"William (Bill) Jefferson Clinton; January 25, 1994",-0.293079,0.237407,0.005597,0.203324,-0.534915,-0.612229,-0.153879,-0.384935,1.11809,1.084679,...,-0.709117,0.890949,-0.024738,0.199759,1.238307,0.362992,-0.015817,0.599756,-0.780108,1994
"Dwight David Eisenhower ; January 9, 1958",0.245126,0.871685,0.21342,1.536611,-0.223496,-0.051194,0.356322,-1.4066,0.410522,0.399105,...,-0.263352,-0.752809,-0.006706,-0.611983,-0.727378,-0.02436,0.304457,0.462068,-0.198712,1958
"Dwight David Eisenhower ; January 9, 1959",0.313249,1.007731,0.164486,0.969049,-0.146088,0.213856,-0.110601,-1.48803,0.724898,0.514224,...,-0.249511,-1.040264,0.064321,-0.289343,-1.022538,-0.291704,-0.093118,0.531542,0.195982,1959
"Ronald Wilson Reagan ; February 4, 1986",1.273648,0.229501,0.023835,2.006427,-0.246686,-0.669023,0.285818,-1.246851,1.042454,0.633126,...,0.525879,-1.936056,0.645615,-0.582951,-0.497033,-0.062951,0.26145,0.158087,-0.798062,1986


In [185]:
y = pd.DataFrame([d] + [d.split(';')[0]] + [d.split(';')[1]] for d in dt])
y.head()

Unnamed: 0,0,1,2
0,"Rutherford Hayes; December 2, 1878",Rutherford Hayes,"December 2, 1878"
1,"William (Bill) Jefferson Clinton; January 25, ...",William (Bill) Jefferson Clinton,"January 25, 1994"
2,"Dwight David Eisenhower ; January 9, 1958",Dwight David Eisenhower,"January 9, 1958"
3,"Dwight David Eisenhower ; January 9, 1959",Dwight David Eisenhower,"January 9, 1959"
4,"Ronald Wilson Reagan ; February 4, 1986",Ronald Wilson Reagan,"February 4, 1986"


In [241]:
events = pd.DataFrame([{'key': d, 'name': d.split(';')[0], 'date': d.split(';')[1]} for d in dt])
events.set_index('key', inplace=True)
events.head()

Unnamed: 0_level_0,date,name
key,Unnamed: 1_level_1,Unnamed: 2_level_1
"Rutherford Hayes; December 2, 1878","December 2, 1878",Rutherford Hayes
"William (Bill) Jefferson Clinton; January 25, 1994","January 25, 1994",William (Bill) Jefferson Clinton
"Dwight David Eisenhower ; January 9, 1958","January 9, 1958",Dwight David Eisenhower
"Dwight David Eisenhower ; January 9, 1959","January 9, 1959",Dwight David Eisenhower
"Ronald Wilson Reagan ; February 4, 1986","February 4, 1986",Ronald Wilson Reagan


In [251]:
Y = pd.merge(events, prez_dets, how='left', left_on='name', right_on='president_name')
Y.head()

Unnamed: 0,date,name,president,party,term,vp,term_start,term_end,president_name,president_name_norm,president_birth_dt,president_death_dt
0,"December 2, 1878",Rutherford Hayes,19. Rutherford Hayes (1822-1893),Republican,1877-1881,William Wheeler,1877,1881.0,Rutherford Hayes,rutherford hayes,1822,1893.0
1,"January 25, 1994",William (Bill) Jefferson Clinton,42. William (Bill) Jefferson Clinton (1946- ),Democrat,1993-2001,Al Gore,1993,2001.0,William (Bill) Jefferson Clinton,william bill jefferson clinton,1946,
2,"January 9, 1958",Dwight David Eisenhower,34. Dwight David Eisenhower (1890-1969),Republican,1953-1961,Richard Milhous Nixon,1953,1961.0,Dwight David Eisenhower,dwight david eisenhower,1890,1969.0
3,"January 9, 1959",Dwight David Eisenhower,34. Dwight David Eisenhower (1890-1969),Republican,1953-1961,Richard Milhous Nixon,1953,1961.0,Dwight David Eisenhower,dwight david eisenhower,1890,1969.0
4,"February 4, 1986",Ronald Wilson Reagan,40. Ronald Wilson Reagan (1911- 2004),Republican,1981-1989,George H. W. Bush,1981,1989.0,Ronald Wilson Reagan,ronald wilson reagan,1911,2004.0


In [252]:
Y.groupby('name').size().sort_values(ascending=False)[:5]

name
Franklin Delano Roosevelt           12
Harry S Truman                       9
William (Bill) Jefferson Clinton     9
George Washington                    9
Dwight David Eisenhower              9
dtype: int64

In [257]:
y = (Y['party'] == 'Republican')*1
y[:5]

0    1
1    0
2    1
3    1
4    1
Name: party, dtype: int64

In [None]:
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))

In [402]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [422]:
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_reduced = sel.fit_transform(X)
X_normalized = preprocessing.normalize(X_reduced, norm='l2')

In [423]:
model2 = LogisticRegression()
model2.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [424]:
np.sort(model2.coef_)

array([[ -9.16436624e-01,  -8.68493931e-01,  -8.08409401e-01,
         -7.91621179e-01,  -7.54115622e-01,  -7.16182612e-01,
         -6.79301406e-01,  -6.78091081e-01,  -6.40707284e-01,
         -6.24520933e-01,  -5.90329232e-01,  -5.60855862e-01,
         -5.23609565e-01,  -5.20274058e-01,  -5.13308155e-01,
         -4.98702447e-01,  -4.85737753e-01,  -4.82976532e-01,
         -4.82705889e-01,  -4.74400857e-01,  -4.67189202e-01,
         -4.66656399e-01,  -4.61742533e-01,  -4.56583367e-01,
         -4.55594879e-01,  -4.48437039e-01,  -4.45110114e-01,
         -4.27618043e-01,  -4.26364851e-01,  -4.24961028e-01,
         -4.23053454e-01,  -4.23050657e-01,  -4.17573672e-01,
         -4.08636906e-01,  -4.07387912e-01,  -3.93203765e-01,
         -3.90735937e-01,  -3.82849746e-01,  -3.80348419e-01,
         -3.68181195e-01,  -3.66929786e-01,  -3.56563310e-01,
         -3.50236318e-01,  -3.48013003e-01,  -3.46331150e-01,
         -3.45873762e-01,  -3.44954913e-01,  -3.42404778e-01,
        

#### Logistic regression

In [419]:
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_reduced = sel.fit_transform(X)
# X_normalized = preprocessing.normalize(X.values, norm='l2')
X_normalized = preprocessing.normalize(X_reduced, norm='l2')

In [421]:
clf = LogisticRegression()
scores = cross_val_score(clf, X_normalized, y, scoring='accuracy', cv=10)
print scores.mean()

0.616977225673


#### Random forest

In [401]:
clf = RandomForestClassifier(max_depth=3, min_samples_split=3, random_state=0)
scores = cross_val_score(clf, X, y)
scores.mean()

0.57981220657276999

#### GBM

In [394]:
clf = GradientBoostingClassifier(n_estimators=30, learning_rate=1.0, max_depth=3, random_state=0)
scores = cross_val_score(clf, X, y)
scores.mean()

0.70109546165884196