In [1]:
import collections
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
import re
import matplotlib.pyplot as plt
import mpld3
import pandas as pd
import difflib
from gensim.models import doc2vec
import multiprocessing
import random
from datetime import datetime
import sys

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.


### Load president details

In [2]:
# Load the presidents table from prez_list.csv, using the 'president_no'
# column as the index. parse_speech() later reads its 'term_start',
# 'term_end', 'party' and 'president_name' columns.
prez_dets = pd.read_csv('prez_list.csv', index_col='president_no')
# Show the first rows as a quick visual check of the load.
prez_dets.head()

Unnamed: 0_level_0,president,party,term,vp,term_start,term_end,president_name,president_name_norm,president_birth_dt,president_death_dt
president_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1. George Washington (1732-1799),"None, Federalist",1789-1797,John Adams,1789,1797.0,George Washington,george washington,1732,1799
2,2. John Adams (1735-1826),Federalist,1797-1801,Thomas Jefferson,1797,1801.0,John Adams,john adams,1735,1826
3,3. Thomas Jefferson (1743-1826),Democratic-Republican,1801-1809,"Aaron Burr, George Clinton",1801,1809.0,Thomas Jefferson,thomas jefferson,1743,1826
4,4. James Madison (1751-1836),Democratic-Republican,1809-1817,"George Clinton, Elbridge Gerry",1809,1817.0,James Madison,james madison,1751,1836
5,5. James Monroe (1758-1831),Democratic-Republican,1817-1825,Daniel Tompkins,1817,1825.0,James Monroe,james monroe,1758,1831


In [3]:
# Sanity check of the date format used when parsing speech headers:
# parse one sample date string and display the extracted year.
sample_date = 'January 25, 1979'
speech_dt = datetime.strptime(sample_date, '%B %d, %Y')
speech_yr = speech_dt.year
speech_yr

1979

## Parse speeches

In [4]:
# Read the whole source text into memory.
# Binary mode is deliberate: the parsing cells split on literal '\r\n'
# sequences, so the line endings must reach them untouched. On Python 2
# (which this notebook targets) 'rb' still yields a plain `str`, but it stops
# platforms that translate line endings in text mode from rewriting CRLF.
with open('pg5050.txt', 'rb') as f:
    raw = f.read()

In [5]:
# Cut the raw text into sections on the '***' delimiter. Later cells read
# speeches[5] to build the list of expected addresses and parse speeches[6:]
# as the individual addresses.
speeches = raw.split('***')

In [6]:
def rm_empty(x):
    """Predicate for ``filter``: return True when ``x`` is not the empty string.

    Despite its name the function removes nothing itself; it tells ``filter``
    which items to keep.
    """
    # Compare by value. The previous identity test (``x is not ''``) only
    # worked when the interpreter happened to reuse a single object for every
    # empty string, and on Python 2 it treats an empty unicode string as
    # non-empty.
    return x != ''

In [7]:
# Section 5 of the split text is used as the list of addresses: keep its
# non-empty lines and skip the first three of them
# (presumably heading lines - TODO confirm against the source text).
toc_lines = [line for line in speeches[5].split('\r\n') if rm_empty(line)]
sou_list = toc_lines[3:]
print('{} speeches found'.format(len(sou_list)))

214 speeches found


In [8]:
# Record type for one parsed address; parse_speech() fills the fields in
# this order.
Speech = collections.namedtuple(
    'Speech', ['speech_type', 'speaker', 'party', 'date', 'body'])

In [9]:
def parse_speech(s):
    """Parse one raw section of the source text into a ``Speech`` record.

    The section is split into paragraphs on blank lines ('\\r\\n\\r\\n'). The
    first paragraph is the header: line 0 must be exactly
    'State of the Union Address' and line 2 must be a date such as
    'January 25, 1979'. Header line 1 (the speaker as printed in the text) is
    not used; the president's name and party are looked up in ``prez_dets``
    from the speech year instead.

    Parameters
    ----------
    s : str
        One section of the raw text.

    Returns
    -------
    Speech or None
        The parsed record, or None when the section is not a State of the
        Union address or any parsing / lookup step fails.
    """
    try:
        s_paragraphs = list(filter(rm_empty, s.split('\r\n\r\n')))
        s_header = s_paragraphs[0].split('\r\n')
        # Reject other sections before doing any further work.
        if s_header[0] != 'State of the Union Address':
            return None

        date_str = s_header[2]
        speech_yr = datetime.strptime(date_str, '%B %d, %Y').year

        # Re-join hard-wrapped lines inside each paragraph, then drop
        # paragraphs of five words or fewer.
        body_paragraphs = [' '.join(p.split('\r\n')) for p in s_paragraphs[1:]]
        s_body = '\n\n'.join(p for p in body_paragraphs if len(p.split()) > 5)

        # First president whose term satisfies term_start < year <= term_end.
        in_term = ((prez_dets['term_start'] < speech_yr) &
                   (prez_dets['term_end'] >= speech_yr))
        president = prez_dets[in_term].iloc[0]

        return Speech(s_header[0], president['president_name'],
                      president['party'], date_str, s_body)

    except Exception:
        # Best-effort parsing: a malformed section (missing header lines,
        # unparseable date, no matching president) is reported as None.
        # Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        return None

In [10]:
# Parse every section after the first six and keep only those that
# parse_speech() accepted (it returns None for everything else).
speeches_clean = []
for s in speeches[6:]:
    parsed = parse_speech(s)
    if parsed is not None:
        speeches_clean.append(parsed)
# Every address listed in sou_list must have been parsed.
assert len(speeches_clean) == len(sou_list)

## Vectorize speeches

In [11]:
def normalize_text(text):
    """Lower-case ``text``, mask its numbers and space-pad punctuation.

    Steps, in order:

    1. Lower-case the text.
    2. Replace every digit with ``0``, remove a '.' or ',' that sits between
       two digits, and collapse each run of ``0`` into a single ``0``, so a
       number such as ``1,250.75`` becomes the token ``0``.
    3. Replace ``<br />``, newlines and tabs with spaces.
    4. Surround each of ``. " , ( ) ! ? ; :`` with spaces so punctuation
       becomes a separate token when the result is split on whitespace.
    5. Collapse all whitespace runs to single spaces and trim the ends.

    Punctuation is kept (as separate tokens), not stripped.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text.
    """
    norm_text = text.lower()
    norm_text = re.sub(r'\d', '0', norm_text)
    norm_text = norm_text.replace('0.0', '00')
    norm_text = norm_text.replace('0,0', '00')
    # Collapse runs of zeros. The previous split/filter/join version also
    # deleted a number located at the very start or end of the text.
    norm_text = re.sub(r'0+', '0', norm_text)
    for token in ('<br />', '\n', '\t'):
        norm_text = norm_text.replace(token, ' ')
    # Pad punctuation with spaces on both sides
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    # Consolidate consecutive spaces
    return ' '.join(norm_text.split())

In [12]:
# Build one TaggedDocument per parsed speech, in speeches_clean order.
# Words come from the normalised, utf-8 decoded body; the single tag is
# '<speaker>; <date>'. (The counter the loop used to maintain was never read
# and has been dropped.)
docs = [
    doc2vec.TaggedDocument(
        normalize_text(s.body.decode('utf-8')).split(),
        ['{}; {}'.format(s.speaker, s.date)])
    for s in speeches_clean
]

### Train model

In [13]:
# Number of CPU cores, passed to the model as its worker count.
cores = multiprocessing.cpu_count()
print("{} cores found".format(cores))
# Refuse to continue when doc2vec reports FAST_VERSION of -1 or lower.
assert doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

2 cores found


In [14]:
# Model settings collected in one place, then handed to Doc2Vec unchanged.
d2v_params = dict(
    dm=1,
    dm_mean=1,
    size=100,
    window=8,
    negative=2,
    hs=0,
    min_count=3,
    workers=cores,
)
model = doc2vec.Doc2Vec(**d2v_params)
# Build the vocabulary from the tagged speeches before training.
model.build_vocab(docs)

In [15]:
# Seed Python's `random` module; the training loop uses random.shuffle.
random.seed(400)
# When True, the training cell saves the trained model to disk.
save_ind = True

In [16]:
# Train for 10 passes, printing one dot per pass. After each pass the learning
# rate is lowered by 0.002 and min_alpha is pinned to the new value.
#
# The shuffling is done on a private copy. `docs` itself must stay in
# speeches_clean order: reduce_dims() walks `docs` to build the embedding
# rows, while the plot's labels and colours are built from speeches_clean, so
# shuffling `docs` in place would attach the wrong label to every point.
# The copy is made once and reshuffled each pass, so the sequence of training
# orders is the same as shuffling `docs` in place would have produced.
train_docs = list(docs)
for epoch in range(10):
    sys.stdout.write('.')
    sys.stdout.flush()

    random.shuffle(train_docs)
    model.train(train_docs)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

# The model must hold exactly one document vector per speech.
assert model.docvecs.count == len(docs)
if save_ind:
    model.save('doc2vec_dm1')

..........

In [17]:
# Sanity check: fetch the trained vector for every document tag and record its
# length. Tags whose vector cannot be fetched are printed instead; the final
# count should equal the number of documents.
dlist = []
for d in docs:
    tag = d.tags[0]
    try:
        dv = model.docvecs[tag]
        dlist.append(len(dv))
    except Exception:
        # Report the failing tag and keep going. Catching Exception rather
        # than using a bare ``except`` lets KeyboardInterrupt propagate.
        print(tag)
len(dlist)

214

## Reduce document-vector dimensions

In [18]:
def reduce_dims(model, random_state=None):
    """Project every document vector down to two dimensions with t-SNE.

    Only t-SNE is applied; there is no truncated-SVD step.

    The vectors are looked up by the first tag of each entry of the
    module-level ``docs`` list, so the rows of the result follow the order of
    ``docs`` at the time of the call.

    Parameters
    ----------
    model
        Trained model whose ``docvecs`` can be indexed by document tag.
    random_state : optional
        Forwarded to ``TSNE``. The default ``None`` leaves the embedding
        unseeded, which is how this function behaved before the parameter
        existed; pass an integer for a repeatable layout.

    Returns
    -------
    The result of ``TSNE.fit_transform``: one row per entry of ``docs``,
    ``n_components=2`` columns.
    """
    vectors = [model.docvecs[v.tags[0]] for v in docs]
    tsne = TSNE(n_components=2, perplexity=10, random_state=random_state)
    return tsne.fit_transform(vectors)

In [19]:
# 2-D t-SNE embedding of the document vectors; used as the x/y coordinates of
# the scatter plot below.
X_embedded = reduce_dims(model)

### Plot

In [20]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [21]:
# Per-speech metadata, one entry per speech in speeches_clean order.
prez_list, party_list, date_list = [], [], []
for s in speeches_clean:
    prez_list.append(s.speaker)
    party_list.append(s.party)
    # The year is the last four characters of the date string.
    date_list.append(int(s.date[-4:]))
assert len(prez_list) == len(date_list)
# Tooltip text for each speech: 'speaker; party; year'.
labels = ['; '.join(map(str, row))
          for row in zip(prez_list, party_list, date_list)]

In [22]:
# Give every distinct party a number starting at 1 (in set iteration order);
# the plot uses these numbers as colour values.
party_no = {}
for i, p in enumerate(set(party_list), 1):
    party_no[p] = i

In [23]:
# Scatter plot of the 2-D embedding with one point per speech, coloured by
# party number, with an mpld3 tooltip label attached to each point.
fig = plt.figure(figsize=(10, 10))
# Frameless axes without tick marks.
ax = plt.axes(frameon=False)
plt.setp(ax, xticks=(), yticks=())
plt.subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                wspace=0.0, hspace=0.0)
# NOTE(review): colours (party_list) and tooltips (labels) are supplied
# positionally, so they need to be in the same order as the rows of
# X_embedded - confirm.
scatter = plt.scatter(X_embedded[:, 0], X_embedded[:, 1], 
                      c=[party_no[p] for p in party_list], marker="x")
tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display()

In [24]:
# Tabular view of the parsed speeches: one row per Speech record, with the
# namedtuple's field names as column names.
speeches_dt = pd.DataFrame(speeches_clean, columns=Speech._fields)

In [25]:
def get_speech(speeches_dt, speaker, year):
    """Return the body of the first speech by ``speaker`` dated in ``year``.

    A row matches when its 'speaker' column equals ``speaker`` and the last
    four characters of its 'date' column equal ``str(year)``. Raises
    ``IndexError`` (from ``.iloc[0]``) when no row matches.
    """
    by_speaker = speeches_dt['speaker'] == speaker
    in_year = speeches_dt['date'].str[-4:] == str(year)
    matches = speeches_dt[by_speaker & in_year]
    return matches['body'].iloc[0]

In [27]:
# Show the full text of one address as a spot check of the parsing.
print(get_speech(speeches_dt, 'Herbert C. Hoover', 1930))

To the Senate and House of Representatives:

I have the honor to comply with the requirement of the Constitution that I should lay before the Congress information as to the state of the Union, and recommend consideration of such measures as are necessary and expedient.

Substantial progress has been made during the year in national peace and security; the fundamental strength of the Nation's economic life is unimpaired; education and scientific discovery have made advances; our country is more alive to its problems of moral and spiritual welfare.

During the past 12 months we have suffered with other Nations from economic depression.

The origins of this depression lie to some extent within our own borders through a speculative period which diverted capital and energy into speculation rather than constructive enterprise. Had overspeculation in securities been the only force operating, we should have seen recovery many months ago, as these particular dislocations have generally readjust