In [51]:
import pandas as pd
import pickle
import torch
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
from gensim import matutils, models
import scipy.sparse
from collections import Counter

## Topic modelling using LDA
https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

In [52]:
#We select speeches from 5 historically significant years from corpus
data = pd.read_pickle('pickled_data/data_first_clean.pkl')
data.drop(['President', 'Party','speech', 'first_clean_tokenized'], axis=1, inplace = True)
#rename column for clarity
data = data.rename({'first_clean' : 'speech'}, axis=1)

years = [1946, 1976, 1990, 2002, 2009]

data = data.loc[data['year'].isin(years)]
data = data.reset_index(drop=True)

data['year'] = data['year'].apply(str)
data.head()

Unnamed: 0,year,speech
0,1946,a quarter century ago the congress decided tha...
1,1976,"mr. speaker, mr. vice president, members of th..."
2,1990,"tonight, i come not to speak about the ""state ..."
3,2002,we last met in an hour of shock and suffering....
4,2009,"madame speaker, mr. vice president, members of..."


In [53]:
#Check a sample to see if more cleaning is needed
data.speech[1]

'mr. speaker, mr. vice president, members of the 94th congress, and distinguished guests:as we begin our bicentennial, america is still one of the youngest nations in recorded history. long before our forefathers came to these shores, men and women had been struggling on this planet to forge a better life for themselves and their families.in man\'s long, upward march from savagery and slavery--throughout the nearly 2,000 years of the christian calendar, the nearly 6,000 years of jewish reckoning--there have been many deep, terrifying valleys, but also many bright and towering peaks.one peak stands highest in the ranges of human history. one example shines forth of a people uniting to produce abundance and to share the good life fairly and with freedom. one union holds out the promise of justice and opportunity for every citizen: that union is the united states of america.we have not remade paradise on earth. we know perfection will not be found here. but think for a minute how far we h

In [54]:
def clean_for_tdm(text):
    '''Remove forward slash, punctuation and numbers'''
    text = text.replace("\\", "")
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text   

In [55]:
#clean column speech
data['speech'] = data.speech.map(lambda x : clean_for_tdm(x))

In [56]:
data.head()

Unnamed: 0,year,speech
0,1946,a quarter century ago the congress decided tha...
1,1976,mr speaker mr vice president members of the c...
2,1990,tonight i come not to speak about the state of...
3,2002,we last met in an hour of shock and suffering ...
4,2009,madame speaker mr vice president members of co...


In [57]:
data.speech[1]

'mr speaker mr vice president members of the  congress and distinguished guestsas we begin our bicentennial america is still one of the youngest nations in recorded history long before our forefathers came to these shores men and women had been struggling on this planet to forge a better life for themselves and their familiesin mans long upward march from savagery and slaverythroughout the nearly  years of the christian calendar the nearly  years of jewish reckoningthere have been many deep terrifying valleys but also many bright and towering peaksone peak stands highest in the ranges of human history one example shines forth of a people uniting to produce abundance and to share the good life fairly and with freedom one union holds out the promise of justice and opportunity for every citizen that union is the united states of americawe have not remade paradise on earth we know perfection will not be found here but think for a minute how far we have come in  yearswe came from many roots

## Create DTM (document term matrix)

In [58]:
#set index to year for DTM
data.set_index('year', inplace = True)
data.head()

Unnamed: 0_level_0,speech
year,Unnamed: 1_level_1
1946,a quarter century ago the congress decided tha...
1976,mr speaker mr vice president members of the c...
1990,tonight i come not to speak about the state of...
2002,we last met in an hour of shock and suffering ...
2009,madame speaker mr vice president members of co...


In [59]:
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data.speech)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data.index
data_dtm

Unnamed: 0_level_0,aaa,abess,abide,abilities,ability,able,ably,aboutand,abovementioned,abreast,...,young,younger,youngest,youngi,youngmy,youre,youve,zero,zone,zones
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1946,1,0,1,1,3,4,1,0,1,1,...,1,0,0,0,0,0,0,0,2,1
1976,0,0,0,1,2,0,0,0,0,0,...,0,1,1,1,1,0,0,0,0,0
1990,0,0,0,0,0,0,0,1,0,0,...,2,0,0,0,0,2,0,0,0,1
2002,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2009,0,1,0,0,3,4,0,0,0,0,...,2,0,0,0,0,0,1,0,0,0


In [60]:
#data_dtm.to_pickle('pickled_data/dtm.pkl')
#pickle.dump(cv, open("cv.pkl", "wb"))

 ## Inspecting the data

In [61]:
data_dtm = data_dtm.transpose()
data_dtm.head()

year,1946,1976,1990,2002,2009
aaa,1,0,0,0,0
abess,0,0,0,0,1
abide,1,0,0,0,0
abilities,1,1,0,0,0
ability,3,2,0,0,3


In [62]:
#Find the 20 most used words in each speech
top_words = {}

for c in data_dtm.columns:
    top = data_dtm[c].sort_values(ascending=False).head(20)
    top_words[c]= list(zip(top.index, top.values))

top_words

{'1946': [('dollars', 197),
  ('war', 177),
  ('year', 166),
  ('million', 131),
  ('fiscal', 117),
  ('expenditures', 112),
  ('government', 112),
  ('program', 93),
  ('united', 89),
  ('billion', 79),
  ('federal', 68),
  ('congress', 67),
  ('states', 66),
  ('legislation', 60),
  ('national', 58),
  ('nations', 52),
  ('world', 51),
  ('estimated', 51),
  ('economic', 51),
  ('business', 50)],
 '1976': [('federal', 34),
  ('year', 25),
  ('america', 17),
  ('government', 17),
  ('people', 16),
  ('new', 16),
  ('programs', 15),
  ('congress', 14),
  ('budget', 14),
  ('economy', 13),
  ('americans', 13),
  ('american', 13),
  ('world', 12),
  ('states', 12),
  ('local', 12),
  ('help', 12),
  ('tax', 12),
  ('future', 11),
  ('state', 11),
  ('know', 10)],
 '1990': [('american', 20),
  ('america', 19),
  ('world', 19),
  ('new', 17),
  ('year', 14),
  ('time', 14),
  ('tonight', 12),
  ('future', 11),
  ('need', 10),
  ('idea', 10),
  ('ago', 10),
  ('let', 9),
  ('people', 9),
  

In [63]:
'''Print the 15 most used words in each speech, check if some should be added to stopword list,
if they are irrelevant for the topic analysis'''
for year, t_words in top_words.items():
    print(year)
    print(', '.join([word for word, count in t_words[0:14]]))
    print('')

1946
dollars, war, year, million, fiscal, expenditures, government, program, united, billion, federal, congress, states, legislation

1976
federal, year, america, government, people, new, programs, congress, budget, economy, americans, american, world, states

1990
american, america, world, new, year, time, tonight, future, need, idea, ago, let, people, today

2002
america, security, world, american, terror, new, good, weapons, people, war, jobs, terrorists, country, freedom

2009
american, economy, know, plan, health, people, new, care, america, years, time, energy, education, budget



In [64]:
# make list of top 20 words in each of the 5 speeches, from top_words dict
words = []
for year in data_dtm.columns:
    top = [word for (word, count) in top_words[year]]
    for t in top:
        words.append(t)
        
words

['dollars',
 'war',
 'year',
 'million',
 'fiscal',
 'expenditures',
 'government',
 'program',
 'united',
 'billion',
 'federal',
 'congress',
 'states',
 'legislation',
 'national',
 'nations',
 'world',
 'estimated',
 'economic',
 'business',
 'federal',
 'year',
 'america',
 'government',
 'people',
 'new',
 'programs',
 'congress',
 'budget',
 'economy',
 'americans',
 'american',
 'world',
 'states',
 'local',
 'help',
 'tax',
 'future',
 'state',
 'know',
 'american',
 'america',
 'world',
 'new',
 'year',
 'time',
 'tonight',
 'future',
 'need',
 'idea',
 'ago',
 'let',
 'people',
 'today',
 'kids',
 'hope',
 'state',
 'capital',
 'change',
 'budget',
 'america',
 'security',
 'world',
 'american',
 'terror',
 'new',
 'good',
 'weapons',
 'people',
 'war',
 'jobs',
 'terrorists',
 'country',
 'freedom',
 'nation',
 'afghanistan',
 'states',
 'terrorist',
 'time',
 'camps',
 'american',
 'economy',
 'know',
 'plan',
 'health',
 'people',
 'new',
 'care',
 'america',
 'years',
 '

In [65]:
# word and number of speeches it appears in
Counter(words).most_common()

[('world', 4),
 ('america', 4),
 ('people', 4),
 ('new', 4),
 ('american', 4),
 ('year', 3),
 ('states', 3),
 ('budget', 3),
 ('time', 3),
 ('war', 2),
 ('government', 2),
 ('federal', 2),
 ('congress', 2),
 ('economy', 2),
 ('americans', 2),
 ('future', 2),
 ('state', 2),
 ('know', 2),
 ('jobs', 2),
 ('country', 2),
 ('nation', 2),
 ('dollars', 1),
 ('million', 1),
 ('fiscal', 1),
 ('expenditures', 1),
 ('program', 1),
 ('united', 1),
 ('billion', 1),
 ('legislation', 1),
 ('national', 1),
 ('nations', 1),
 ('estimated', 1),
 ('economic', 1),
 ('business', 1),
 ('programs', 1),
 ('local', 1),
 ('help', 1),
 ('tax', 1),
 ('tonight', 1),
 ('need', 1),
 ('idea', 1),
 ('ago', 1),
 ('let', 1),
 ('today', 1),
 ('kids', 1),
 ('hope', 1),
 ('capital', 1),
 ('change', 1),
 ('security', 1),
 ('terror', 1),
 ('good', 1),
 ('weapons', 1),
 ('terrorists', 1),
 ('freedom', 1),
 ('afghanistan', 1),
 ('terrorist', 1),
 ('camps', 1),
 ('plan', 1),
 ('health', 1),
 ('care', 1),
 ('years', 1),
 ('energy

In [66]:
'''looking at the data, we decide that the most common words are irrelevant if they appear in more than 2 speeches'''
add_stop_words = [word for word, count in Counter(words).most_common() if count > 2]
add_stop_words

['world',
 'america',
 'people',
 'new',
 'american',
 'year',
 'states',
 'budget',
 'time']

In [67]:
#update stop word list with the words found above, union is used to avoid duplicates
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [68]:
#DTM is upated with the new stopwords

cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(data.speech)
data_stop = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_stop.index = data.index

In [69]:
data_stop.head()

Unnamed: 0_level_0,aaa,abess,abide,abilities,ability,able,ably,aboutand,abovementioned,abreast,...,young,younger,youngest,youngi,youngmy,youre,youve,zero,zone,zones
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1946,1,0,1,1,3,4,1,0,1,1,...,1,0,0,0,0,0,0,0,2,1
1976,0,0,0,1,2,0,0,0,0,0,...,0,1,1,1,1,0,0,0,0,0
1990,0,0,0,0,0,0,0,1,0,0,...,2,0,0,0,0,2,0,0,0,1
2002,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2009,0,1,0,0,3,4,0,0,0,0,...,2,0,0,0,0,0,1,0,0,0


In [70]:
tdm = data_stop.T
tdm.head()

year,1946,1976,1990,2002,2009
aaa,1,0,0,0,0
abess,0,0,0,0,1
abide,1,0,0,0,0
abilities,1,1,0,0,0
ability,3,2,0,0,3


In [71]:
#change dtm df, first to sparse matrix and then to gensim corpus

sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [73]:
# gensim requires dictionary of the all terms and their respective location in the term-document matrix
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [74]:
# corpus = TDM and id2word = dict {location : term}
'''LDA for 2 topics and 10 passes'''
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.004*"know" + 0.004*"federal" + 0.004*"economy" + 0.004*"health" + 0.003*"americans" + 0.003*"jobs" + 0.003*"congress" + 0.003*"help" + 0.003*"make" + 0.003*"future"'),
 (1,
  '0.014*"dollars" + 0.012*"war" + 0.009*"million" + 0.008*"fiscal" + 0.008*"government" + 0.008*"expenditures" + 0.006*"program" + 0.006*"united" + 0.005*"billion" + 0.005*"federal"')]

In [None]:
# function that selects nouns only and return tho

def nouns(text):
    '''tokenize a string and return only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)