In [1]:
import warnings
# Suppress every warning category for the rest of the session. This also hides
# deprecation notices raised by the libraries used further down.
warnings.simplefilter('ignore')
# Explicit DeprecationWarning filter; the blanket 'ignore' above already covers it.
warnings.filterwarnings('ignore', category=DeprecationWarning)

import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

from random import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
# Load the pickled frame and index it by the 'president' column in one step.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle('data/clean_speech_approvals.pkl').set_index('president')

In [3]:
# Collect the 'speech' column into a plain Python list for the vectorizers.
sotu_corpus = df['speech'].tolist()

In [4]:
# Upper bound on the vocabulary size kept by the vectorizers.
no_features = 1000

# NMF is able to use tf-idf.
# max_df / min_df drop terms found in more than 95% of the documents or in
# fewer than 2 documents; English stop words are removed.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(sotu_corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older
# releases; the result is kept as a list so indexing behaves the same either way.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [5]:
# LDA needs raw term counts because it is a probabilistic graphical model.
# Same vocabulary filters as the tf-idf vectorizer: drop terms found in more
# than 95% of the documents or in fewer than 2, and remove English stop words.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(sotu_corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older
# releases; the result is kept as a list so indexing behaves the same either way.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

In [6]:
# Number of topics extracted by each model.
no_topics = 20

# Fit NMF on the tf-idf matrix.
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `alpha_W` / `alpha_H`, which are scaled differently -- confirm the
# target version before migrating, since results will change.
# NOTE(review): the recorded output lists identical terms for topics 7, 10 and
# 11, which may indicate empty components under this regularisation -- confirm.
nmf = NMF(
    n_components=no_topics,
    init='nndsvd',
    alpha=.1,
    l1_ratio=.5,
    random_state=1,
).fit(tfidf)

In [7]:
# Fit LDA on the raw term-count matrix, using the online update method with a
# fixed seed so repeated runs give the same topics.
lda = LatentDirichletAllocation(
    n_components=no_topics,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
).fit(tf)

In [8]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` is treated as one topic's per-term weights.
    feature_names : sequence of str
        Vocabulary; position ``i`` names column ``i`` of ``components_``.
    no_top_words : int
        How many terms to print per topic.

    Returns
    -------
    None
        Output goes to stdout: a ``Topic <n>:`` header line followed by one
        line of space-separated terms, strongest first.
    """
    for topic_no, weights in enumerate(model.components_):
        print('Topic {}:'.format(topic_no))
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = (feature_names[term_idx] for term_idx in strongest_first)
        print(' '.join(top_terms))

In [9]:
# Number of terms printed per topic.
no_top_words = 10

# Print the strongest terms of each NMF topic, using the tf-idf vocabulary.
display_topics(
    model=nmf,
    feature_names=tfidf_feature_names,
    no_top_words=no_top_words,
)

Topic 0:
federal shall defense programs military strength public billion problems economy
Topic 1:
ve tonight americans let jobs know just children care ll
Topic 2:
production fighting shall victory men enemy 000 forces pacific know
Topic 3:
iraq terrorists iraqi al qaida terror terrorist afghanistan weapons enemy
Topic 4:
soviet communist military aggression defense atomic union strength europe korea
Topic 5:
vietnam tonight south recommend billion try crime conflict think propose
Topic 6:
tax programs federal billion spending inflation percent ve rates regulations
Topic 7:
youth european export experience expenditures expected expect expansion expanding expanded
Topic 8:
administration 1978 1977 energy 1979 regulatory reform 1980 legislation programs
Topic 9:
think commitments believe tonight want reduced treaty poverty going administration
Topic 10:
youth european export experience expenditures expected expect expansion expanding expanded
Topic 11:
youth european export experience e

In [10]:
# Print the strongest terms of each LDA topic, using the term-count vocabulary.
display_topics(
    model=lda,
    feature_names=tf_feature_names,
    no_top_words=no_top_words,
)

Topic 0:
federal increase health legislation tax economy act administration education better
Topic 1:
americans federal tax health tonight economy just ve jobs programs
Topic 2:
americans federal men programs defense let union soviet shall know
Topic 3:
federal tonight americans let budget jobs tax programs increase energy
Topic 4:
let federal tax budget americans tonight energy jobs programs percent
Topic 5:
federal budget act administration health tax know jobs economy programs
Topic 6:
let soviet just americans act health jobs right federal energy
Topic 7:
dollars federal billion legislation economy programs tax act fiscal administration
Topic 8:
americans federal administration legislation programs tax reform energy ve support
Topic 9:
federal administration shall defense men economy international health military rights
Topic 10:
dollars fiscal expenditures federal billion 1947 legislation estimated 1945 administration
Topic 11:
federal budget let programs tax care percent health a