In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas
import inspect
from datetime import datetime
import seaborn as sns
import glob
from os.path import basename
sns.set_context('poster')
import re

## Tokenize text on punctuation and format the DataFrame

In [2]:
# Word tokenizer: keeps runs of word characters, so punctuation is dropped.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(pattern=r'\w+')

In [4]:
# Accumulator for [filename, tokens] pairs, filled by the loading cell.
biglist = list()

In [5]:
# Read every file under ../data/SOTU, lower-case and tokenize it, and collect
# [filename, tokens] pairs in `biglist`.
# - `biglist` is rebuilt here so re-running the cell does not append
#   duplicate rows.
# - Paths are sorted because glob.glob returns them in arbitrary order.
# - print(...) with a single argument behaves the same on Python 2 and 3,
#   unlike the Python-2-only print statement.
biglist = []
speeches = sorted(glob.glob('../data/SOTU/*'))
for s in speeches:
    name = basename(s)
    print(name)
    with open(s) as speech:
        text = speech.read()
        tokens = tokenizer.tokenize(text.lower())
    biglist.append([name, tokens])

clinton-1993.txt
gwbush-2001.txt
hwbush-1989.txt
obama-2009.txt
reagan-1981.txt


In [12]:
# Turn the [filename, tokens] pairs into a two-column frame.
column_names = ['president', 'tokens']
df = pandas.DataFrame(biglist)
df.columns = column_names
#df.to_csv("data/tokenized-speeches.csv")

In [14]:
# Preview the first rows: 'president' still holds the file name at this point.
df.head()

Unnamed: 0,president,tokens
0,clinton-1993.txt,"[mr, president, mr, speaker, members, of, the,..."
1,gwbush-2001.txt,"[mr, speaker, mr, vice, president, members, of..."
2,hwbush-1989.txt,"[mr, speaker, mr, president, and, distinguishe..."
3,obama-2009.txt,"[madam, speaker, mr, vice, president, members,..."
4,reagan-1981.txt,"[mr, speaker, mr, president, distinguished, me..."


In [17]:
# Split the file name (e.g. 'clinton-1993.txt') into a year and a name:
# the first run of digits becomes 'year' (kept as a string) and the first run
# of word characters replaces 'president'.
# Raw-string patterns avoid the invalid-escape-sequence warning that '\d' and
# '\w' trigger in plain string literals on modern Python.
# NOTE: 'president' is overwritten in place, so run this cell once per freshly
# built `df`; a second run finds no digits and raises IndexError.
filenames = df['president']
df['year'] = filenames.apply(lambda x: re.findall(r'\d+', x)[0])
df['president'] = filenames.apply(lambda x: re.findall(r'\w+', x)[0])

## Wordcount

In [23]:
# Number of tokens in each speech.
df['word_count'] = df['tokens'].apply(len)

In [21]:
# Preview the frame after the file name has been split and counts added.
df.head()

Unnamed: 0,president,tokens,year,word_count
0,clinton,"[mr, president, mr, speaker, members, of, the,...",1993,7127
1,gwbush,"[mr, speaker, mr, vice, president, members, of...",2001,4449
2,hwbush,"[mr, speaker, mr, president, and, distinguishe...",1989,4917
3,obama,"[madam, speaker, mr, vice, president, members,...",2009,6199
4,reagan,"[mr, speaker, mr, president, distinguished, me...",1981,4572


In [20]:
# Summary statistics of speech length in tokens.
df['word_count'].describe()

count       5.000000
mean     5452.800000
std      1165.146429
min      4449.000000
25%      4572.000000
50%      4917.000000
75%      6199.000000
max      7127.000000
Name: word_count, dtype: float64

In [None]:
# Speeches from shortest to longest (ascending is the default order).
df.sort_values(by='word_count')

In [None]:
# Rows whose president label is exactly 'trump'.
df.loc[df['president'] == 'trump']

In [None]:
# Write only the president and word_count columns (plus the index) to CSV.
df.to_csv('data/president_word_count.csv', columns=['president', 'word_count'])

### Read in the CSV with dates (this file was joined manually)

In [None]:
# Load the hand-joined name/date/word-count table; index_col=False stops
# pandas from using the first column as the index.
df_with_date = pandas.read_csv(
    'data/name-date-wc.csv',
    index_col=False,
)

In [None]:
# The ten longest addresses.
df_with_date.sort_values('word_count', ascending=False).head(10)

In [None]:
# Horizontal bar chart of address length, ordered by word count, saved to disk.
by_length = df_with_date.sort_values('word_count')
ax = by_length.plot(x='president', y='word_count', kind='barh', legend=None)
ax.set_title('Length of Address')
ax.set_xlabel('Words')
ax.set_ylabel("")
ax.figure.savefig('figs/length-vs-president-sorted.png')

In [None]:
# Same length chart, but rows ordered by the 'first-term' column (descending).
by_term = df_with_date.sort_values('first-term', ascending=False)
ax = by_term.plot(x='president', y='word_count', kind='barh', legend=None)
ax.set_title('Length of Address')
ax.set_xlabel('Words')
ax.set_ylabel("")
#plt.savefig('figs/length-vs-president.png')

## Texty stuff

In [None]:
## DONT TOKENIZE HERE....
# Load each transcript as one untokenized string, build a
# (president, text) frame from them, and write it to CSV.
# - Paths are sorted because glob.glob returns them in arbitrary order, which
#   would otherwise make the row order (and the CSV index) vary between runs.
# - print(...) with a single argument behaves the same on Python 2 and 3,
#   unlike the Python-2-only print statement.
untokenized = []
speeches = sorted(glob.glob('data/transcripts/*'))
for s in speeches:
    print(basename(s))
    with open(s) as speech:
        text = speech.read()
    untokenized.append([basename(s), text])

raw = pandas.DataFrame(untokenized)
raw.columns = ['president', 'text']
raw.to_csv("data/raw-speeches.csv")

In [None]:
# Preview the raw (untokenized) transcripts.
raw.head()

In [None]:
from textstat.textstat import textstat

In [None]:
# Score every transcript with textstat: Flesch-Kincaid grade and Flesch
# reading ease.
speech_text = raw['text']
raw['fk_grade'] = speech_text.apply(textstat.flesch_kincaid_grade)
raw['readability'] = speech_text.apply(textstat.flesch_reading_ease)

In [None]:
# Write only the president label and the two readability scores to CSV.
raw.to_csv('data/fk-grade-readability.csv',
           columns=['president', 'fk_grade', 'readability'])

In [None]:
# Summary statistics of the Flesch-Kincaid grade across transcripts.
raw['fk_grade'].describe()

## Now with all fields

In [None]:
# Load the combined table; later cells read its 'president', 'first-term',
# 'word_count', 'fk_grade' and 'readability' columns.
# NOTE(review): this file is not written by any cell in this notebook —
# presumably assembled by hand like name-date-wc.csv; confirm.
joined = pandas.read_csv('data/name-date-wc-grade-readability.csv')

In [None]:
# Preview the combined table.
joined.head()

In [None]:
# Horizontal bar chart of Flesch-Kincaid grade per president, saved to disk.
by_grade = joined.sort_values('fk_grade', ascending=False)
ax = by_grade.plot(x='president', y='fk_grade', kind='barh', legend=None)
ax.set_title('Grade Level of Address')
ax.set_xlabel('Grade')
ax.set_ylabel("")
ax.figure.savefig('figs/grade-level.png')

In [None]:
# Horizontal bar chart of the 'readability' column per president.
# The x-axis shows a reading-ease score, not a grade level, so it is labelled
# as a score; the original label 'Grade' was carried over from the
# grade-level chart.
joined.sort_values('readability', ascending=False).plot(x='president',
                                            y='readability',
                                            kind='barh',
                                            legend=None)
plt.title('Readability')
plt.xlabel('Reading ease score')
plt.ylabel("")
# NOTE: if re-enabled, give this its own file name — 'figs/grade-level.png'
# is already written by the grade-level chart.
#plt.savefig('figs/grade-level.png')

## Just modern presidents

In [None]:
# Keep the ten rows with the largest 'first-term' values and save them.
ranked_by_term = joined.sort_values('first-term', ascending=False)
subset = ranked_by_term.head(10)
subset.to_csv('data/last-10-presidents.csv')

In [None]:
# Grade-level chart restricted to the ten-row subset, saved to disk.
subset_by_grade = subset.sort_values('fk_grade', ascending=False)
ax = subset_by_grade.plot(x='president', y='fk_grade', kind='barh', legend=None)
ax.set_title('Grade Level of Inaugural Address')
ax.set_xlabel('Grade')
ax.set_ylabel("")
ax.figure.savefig('figs/grade-level-subset.png')

In [None]:
# Word-count chart restricted to the ten-row subset, saved to disk.
subset_by_length = subset.sort_values('word_count', ascending=False)
ax = subset_by_length.plot(x='president', y='word_count', kind='barh', legend=None)
ax.set_title('Length of Inaugural Address')
ax.set_xlabel('Words')
ax.set_ylabel("")
ax.figure.savefig('figs/word-count-subset.png')

In [None]:
# Grade levels of the subset, lowest first.
subset.sort_values(by='fk_grade')['fk_grade']

## TF-IDF

In [None]:
import math
from textblob import TextBlob as tb

def tf(word, blob):
    """Return the term frequency of `word` in `blob`.

    `blob` must expose a `words` sequence supporting `count()` and `len()`.
    The result is the fraction of the blob's words equal to `word`, always a
    float. The division is forced to floating point because on Python 2 `/`
    between two ints truncates, which made every frequency 0. A blob with no
    words yields 0.0 instead of dividing by zero.
    """
    words = blob.words
    total = len(words)
    if total == 0:
        return 0.0
    return float(words.count(word)) / total

def n_containing(word, bloblist):
    """Return how many blobs in `bloblist` contain `word` at least once."""
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    """Return the inverse document frequency of `word` across `bloblist`.

    Computed as log(N / (1 + df)), where N is the number of blobs and df the
    number of blobs containing the word. The ratio is computed in floating
    point: with Python 2 integer division it truncated to 0 whenever the word
    appeared in every blob, and `math.log(0)` raised ValueError.

    Raises ValueError (from `math.log`) when `bloblist` is empty.
    """
    return math.log(float(len(bloblist)) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of `word` for `blob` within `bloblist`."""
    return tf(word, blob) * idf(word, bloblist)

In [None]:
# Attach the transcript text in `raw` to the ten-row `subset` by matching on
# the 'fk_grade' value.
# NOTE(review): joining on a floating-point score is fragile — equal scores
# across different speeches produce extra rows, and a value that changed in
# the CSV round trip produces none. A name/year key would be safer; confirm
# whether one is available in both frames.
joined_10 = pandas.merge(raw, subset, left_on='fk_grade', right_on='fk_grade')

In [None]:
# Keep only the columns used downstream; 'president_x' is the suffixed
# president column coming from the left-hand frame of the merge.
kept_columns = ['president_x', 'text','fk_grade','first-term', 'word_count']
joined_10 = joined_10.loc[:, kept_columns]

In [None]:
# Rename columns positionally: 'president_x' -> 'president' and
# 'first-term' -> 'first_term'; the other three names are unchanged.
# Relies on the column order produced by the preceding selection.
joined_10.columns = ['president', 'text', 'fk_grade', 'first_term', 'word_count']

In [None]:
# Keep the first row for each president label.
joined_10 = joined_10.drop_duplicates(subset='president', keep='first')

In [None]:
# Display the de-duplicated frame.
joined_10

In [None]:
# Wrap every speech in a TextBlob, score each distinct word with TF-IDF
# against the whole collection, and print the three highest-scoring words
# per document.
bloblist = [tb(t) for t in joined_10.text]
for doc_number, blob in enumerate(bloblist, 1):
    print("Top words in document {}".format(doc_number))
    scores = {}
    for word in blob.words:
        scores[word] = tfidf(word, blob, bloblist)
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    for word, score in ranked[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

In [None]:
 # Save the frame; this file is read back in by the next section.
 joined_10.to_csv("data/last-10-all-cols.csv")

 ## AMERICA INDEX

In [None]:
# Reload the saved frame from disk.
# NOTE: this rebinds `df`, which earlier in the notebook held the tokenized
# SOTU frame; cells above will see this frame if re-run after this point.
df = pandas.read_csv('data/last-10-all-cols.csv')
df.head()

In [None]:
# Lower-case and tokenize every speech with the shared word tokenizer.
df['tokens'] = df['text'].map(lambda speech: tokenizer.tokenize(speech.lower()))

In [None]:
from collections import Counter

In [None]:
# Occurrences of the exact token 'america' in each speech.
df['america_ct'] = df['tokens'].apply(lambda toks: Counter(toks)['america'])

In [None]:
# Speeches ranked by how often 'america' appears, most first.
df.sort_values(by='america_ct', ascending=False)

In [None]:
# NOTE(review): `df_all` is not assigned in any visible cell of this notebook;
# on a fresh kernel this raises NameError. Presumably it is the all-presidents
# frame with a 'tokens' column — confirm and add the cell that builds it.
df_all

In [None]:
# Per-speech counts of 'america', of 'american' + 'americans', and of all
# three together, then display them ranked by 'america'.
# The Counter is built once per speech instead of six times, and the combined
# column is the sum of the two partial columns, so the three stay consistent.
# NOTE(review): `df_all` is not assigned in any visible cell — confirm where
# it is built.
token_counts = df_all.tokens.apply(lambda toks: Counter(toks))
df_all['america_ct'] = token_counts.apply(lambda c: c['america'])
df_all['american_s_ct'] = token_counts.apply(lambda c: c['american'] + c['americans'])
df_all['america_cans_ct'] = df_all['america_ct'] + df_all['american_s_ct']
df_all[['president', 'america_ct', 'america_cans_ct', 'american_s_ct']].sort_values('america_ct', ascending=False)

In [None]:
# Write the president label and the two 'america' counts to CSV.
df_all.to_csv('data/america-count-ALL-PRES.csv',
              columns=['president', 'america_ct', 'america_cans_ct'])

In [None]:
# Horizontal bar chart of 'america' mentions per president, saved to disk.
america_counts = df_all[['president', 'america_ct', 'america_cans_ct']]
ax = america_counts.sort_values('america_ct', ascending=True).plot(
    x='president', y='america_ct', kind='barh', legend=None)
ax.set_title('Number of times America was mentioned')
ax.set_xlabel('Words')
ax.set_ylabel("")
ax.figure.savefig('figs/america-count.png')

In [None]:
df_a