In [8]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import spacy
from nltk.corpus import state_union, stopwords
from collections import Counter
nltk.download('state_union')

[nltk_data] Downloading package state_union to
[nltk_data]     /Users/rodrickleary/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.


True

# Data Processing

In [9]:
# Import Presidential State of the Unions file ids
state_union.fileids()

['1945-Truman.txt',
 '1946-Truman.txt',
 '1947-Truman.txt',
 '1948-Truman.txt',
 '1949-Truman.txt',
 '1950-Truman.txt',
 '1951-Truman.txt',
 '1953-Eisenhower.txt',
 '1954-Eisenhower.txt',
 '1955-Eisenhower.txt',
 '1956-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1958-Eisenhower.txt',
 '1959-Eisenhower.txt',
 '1960-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1962-Kennedy.txt',
 '1963-Johnson.txt',
 '1963-Kennedy.txt',
 '1964-Johnson.txt',
 '1965-Johnson-1.txt',
 '1965-Johnson-2.txt',
 '1966-Johnson.txt',
 '1967-Johnson.txt',
 '1968-Johnson.txt',
 '1969-Johnson.txt',
 '1970-Nixon.txt',
 '1971-Nixon.txt',
 '1972-Nixon.txt',
 '1973-Nixon.txt',
 '1974-Nixon.txt',
 '1975-Ford.txt',
 '1976-Ford.txt',
 '1977-Ford.txt',
 '1978-Carter.txt',
 '1979-Carter.txt',
 '1980-Carter.txt',
 '1981-Reagan.txt',
 '1982-Reagan.txt',
 '1983-Reagan.txt',
 '1984-Reagan.txt',
 '1985-Reagan.txt',
 '1986-Reagan.txt',
 '1987-Reagan.txt',
 '1988-Reagan.txt',
 '1989-Bush.txt',
 '1990-Bush.txt',
 '1991-Bush-1.txt',
 '1991-B

In [10]:
# Grab first speech given by Bush and Clinton
clinton = state_union.raw('1993-Clinton.txt')
bush = state_union.raw('1989-Bush.txt')

In [11]:
# Parse using SpaCy
nlp = spacy.load('en')
clinton_doc = nlp(clinton)
bush_doc = nlp(bush)

In [12]:
# Group into sentences
bush_sents = [[sent, 'Bush'] for sent in bush_doc.sents]
clinton_sents = [[sent, 'Clinton'] for sent in clinton_doc.sents]

# Combine
sentences = pd.DataFrame(bush_sents + clinton_sents)
sentences.head()

Unnamed: 0,0,1
0,"(PRESIDENT, GEORGE)",Bush
1,"(H.W., BUSH, 'S)",Bush
2,(ADDRESS),Bush
3,"(ON, ADMINISTRATION, GOALS, BEFORE, A, JOINT, ...",Bush
4,"(Mr., Speaker, ,, Mr., President, ,, and, dist...",Bush


In [13]:
# Look at excerpts from each 
print(bush_doc[:100])
print('\nBush speech length:', len(bush_doc))

print('\n', clinton_doc[:100])
print('\nClinton speech length:', len(clinton_doc))

PRESIDENT GEORGE H.W. BUSH'S ADDRESS ON ADMINISTRATION GOALS BEFORE A JOINT SESSION OF CONGRESS
 
February 9, 1989 

Mr. Speaker, Mr. President, and distinguished Members of the House and Senate, honored guests, and fellow citizens: Less than 3 weeks ago, I joined you on the West Front of this very building and, looking over the monuments to our proud past, offered you my hand in filling the next page of American history with a story of extended prosperity and continued peace. And tonight I'm back to offer

Bush speech length: 5685

 PRESIDENT BILL CLINTON'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
February 17, 1993 

Mr. President, Mr. Speaker, Members of the House and the Senate, distinguished Americans here as visitors in this Chamber, as am I. It is nice to have a fresh excuse for giving a long speech. [Laughter]
When Presidents speak to Congress and the Nation from this podium, typically they comment on the full range and challenges and opportunit

# Bag of Words Features

In [14]:
# Create bag of words function for each text
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(500)]

# Get bags 
bush_words = bag_of_words(bush_doc)
clinton_words = bag_of_words(clinton_doc)

# Combine bags to create common set of unique words
common_words = set(bush_words + clinton_words)

In [15]:
# Create bag of words data frame using combined common words and sentences
def bow_features(sentences, common_words):
    
    # Build data frame
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentences in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentences
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
    
    return df

In [16]:
# Create bow features 
bow = bow_features(sentences, common_words)
bow.head()

Unnamed: 0,provide,dare,mile,proud,charge,loss,treatment,condemn,dream,road,...,rate,company,forth,one,world,tough,safe,excuse,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(PRESIDENT, GEORGE)",Bush
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(H.W., BUSH, 'S)",Bush
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(ADDRESS),Bush
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(ON, ADMINISTRATION, GOALS, BEFORE, A, JOINT, ...",Bush
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Mr., Speaker, ,, Mr., President, ,, and, dist...",Bush


# TF-IDF Features

In [17]:
# Grab sentence level documents in NLTK
clinton = state_union.sents('1993-Clinton.txt')
bush = state_union.sents('1989-Bush.txt')

In [18]:
# Create list of text 
clinton_list = [" ".join(sent) for sent in clinton]
bush_list = [" ".join(sent) for sent in bush]
joined = clinton_list + bush_list

In [19]:
# Vectorize
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, 
                             min_df=2, 
                             stop_words='english',   
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True 
                            )

tfidf = vectorizer.fit_transform(joined).tocsr()

# Supervised Learning Models

In [20]:
from sklearn.model_selection import cross_val_score

# Specify model inputs for each feature set

# BoW
X_bow = bow.drop(['text_sentence', 'text_source'], 1)
Y_bow = bow['text_source']

# Tfidf
X_tfidf = tfidf
Y_tfidf = ['Clinton']*len(clinton_list) + ['Bush']*len(bush_list)

# Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

# BoW
lr = LogisticRegression()
lr_bow = lr.fit(X_bow, Y_bow)
print('BoW Logistic Regression Scores: ', cross_val_score(lr_bow, X_bow, Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(lr_bow, X_bow, Y_bow, cv=5)))

# Tfidf
lr = LogisticRegression()
lr_tfidf = lr.fit(X_tfidf, Y_tfidf)
print('\nTfidf Logistic Regression Scores:', cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5)))



BoW Logistic Regression Scores:  [0.64227642 0.70731707 0.64227642 0.75409836 0.75206612]
Avg Score: 0.6996068790114808

Tfidf Logistic Regression Scores: [0.58474576 0.69491525 0.66666667 0.77586207 0.76724138]
Avg Score: 0.6978862263783363




# Random Forest

In [22]:
from sklearn import ensemble

# BoW
rfc = ensemble.RandomForestClassifier()
rfc_bow = rfc.fit(X_bow, Y_bow)
print('BoW Random Forest Scores: ', cross_val_score(rfc_bow, X_bow, Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(rfc_bow, X_bow, Y_bow, cv=5)))

# Tfidf
rfc = ensemble.RandomForestClassifier()
rfc_tfidf = rfc.fit(X_tfidf, Y_tfidf)
print('\nTfidf Random Forest Scores:', cross_val_score(rfc_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(rfc_tfidf, X_tfidf, Y_tfidf, cv=5)))



BoW Random Forest Scores:  [0.60162602 0.65853659 0.55284553 0.59016393 0.66942149]
Avg Score: 0.6356285034195689





Tfidf Random Forest Scores: [0.55084746 0.63559322 0.55555556 0.74137931 0.69827586]
Avg Score: 0.6053692297701648


# Gradient Boosting

In [23]:
# BoW
clf = ensemble.GradientBoostingClassifier()
clf_bow = clf.fit(X_bow, Y_bow)
print('Bow Gradient Boosting Scores:', cross_val_score(clf_bow, X_bow,Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(clf_bow, X_bow, Y_bow, cv=5)))

# Tfidf
clf = ensemble.GradientBoostingClassifier()
clf_tfidf = clf.fit(X_tfidf, Y_tfidf)
print('\nTfidf Random Forest Scores:', cross_val_score(clf_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(clf_tfidf, X_tfidf, Y_tfidf, cv=5)))

Bow Gradient Boosting Scores: [0.69105691 0.67479675 0.65853659 0.7295082  0.71900826]
Avg Score: 0.6831323668879555

Tfidf Random Forest Scores: [0.6440678  0.68644068 0.60683761 0.73275862 0.6637931 ]
Avg Score: 0.6514376557918347


# Pick A Model and Try to Increase Accuracy by 5%
Model: Logistic Regression Using BoW Feature Set

In [24]:
# Increase BoW size

# Update function to include 1000 most common words
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(1000)]

# Get bags 
bush_words = bag_of_words(bush_doc)
clinton_words = bag_of_words(clinton_doc)

# Combine bags to create common set of unique words
common_words = set(bush_words + clinton_words)

In [25]:
# Create bow features 
big_bow = bow_features(sentences, common_words)

In [26]:
big_bow.head()

Unnamed: 0,insured,charge,inadequate,dream,Office,period,lifelong,wasteful,cancel,Vice,...,entire,labor,Federal,flourish,forth,few,6,safe,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(PRESIDENT, GEORGE)",Bush
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(H.W., BUSH, 'S)",Bush
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(ADDRESS),Bush
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(ON, ADMINISTRATION, GOALS, BEFORE, A, JOINT, ...",Bush
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Mr., Speaker, ,, Mr., President, ,, and, dist...",Bush


In [27]:
# Make new X and Y inputs
X_big_bow = big_bow.drop(['text_sentence', 'text_source'], 1)
Y_big_bow = big_bow['text_source']

# Rerun BoW
lr = LogisticRegression()
lr_big_bow = lr.fit(X_big_bow, Y_big_bow)
print('BoW (big) Logistic Regression Scores: ', cross_val_score(lr_big_bow, X_big_bow, Y_big_bow, cv=5))
print('Avg. Score ', np.mean(cross_val_score(lr_big_bow, X_big_bow, Y_big_bow, cv=5)))



BoW (big) Logistic Regression Scores:  [0.64227642 0.69105691 0.63414634 0.73770492 0.74380165]
Avg. Score  0.6897972491444195




Using a bigger bag of words actually made the average score get worse by about 1%. Try out another method - include punctuation in BoW.

In [28]:
# Update function, go back to 500 most common words and add in punctuation
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_stop]
                   
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(500)]

# Get bags 
bush_words = bag_of_words(bush_doc)
clinton_words = bag_of_words(clinton_doc)

# Combine bags to create common set of unique words
common_words = set(bush_words + clinton_words)

In [29]:
# Create bow features 
bow = bow_features(sentences, common_words)

In [30]:
# Regenerate model features
X_bow = bow.drop(['text_sentence', 'text_source'], 1)
Y_bow = bow['text_source']

In [31]:
# Rerun model
lr = LogisticRegression(
    )
lr_bow = lr.fit(X_bow, Y_bow)
print('BoW #3 - Logistic Regression Scores: ', cross_val_score(lr_bow, X_bow, Y_bow, cv=5))
print('Avg. Score ', np.mean(cross_val_score(lr_bow, X_bow, Y_bow, cv=5)))



BoW #3 - Logistic Regression Scores:  [0.64227642 0.70731707 0.64227642 0.75409836 0.75206612]
Avg. Score  0.6996068790114808


