In [67]:
import pandas as pd
import pickle
import torch
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
from gensim import matutils, models
import scipy.sparse
from collections import Counter
from nltk import word_tokenize, pos_tag
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## Topic modelling using LDA
https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

In [68]:
#We select speeches from 5 historically significant years from corpus
data = pd.read_pickle('pickled_data/data_first_clean.pkl')
data.drop(['President', 'Party','speech', 'first_clean_tokenized'], axis=1, inplace = True)
#rename column for clarity
data = data.rename({'first_clean' : 'speech'}, axis=1)

years = [1946, 1976, 1990, 2002, 2009]

data = data.loc[data['year'].isin(years)]
data = data.reset_index(drop=True)

data['year'] = data['year'].apply(str)
data.head()

Unnamed: 0,year,speech
0,1946,a quarter century ago the congress decided tha...
1,1976,"mr. speaker, mr. vice president, members of th..."
2,1990,"tonight, i come not to speak about the ""state ..."
3,2002,we last met in an hour of shock and suffering....
4,2009,"madame speaker, mr. vice president, members of..."


In [69]:
#Check a sample to see if more cleaning is needed

#data.loc[2, 'speech']

In [70]:
def clean_for_tdm(text):
    '''Remove forward slash, punctuation and numbers'''
    text = text.replace("\\", "")
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text   

In [71]:
#clean column speech
data['speech'] = data.speech.map(lambda x : clean_for_tdm(x))

In [72]:
data.head()

Unnamed: 0,year,speech
0,1946,a quarter century ago the congress decided tha...
1,1976,mr speaker mr vice president members of the c...
2,1990,tonight i come not to speak about the state of...
3,2002,we last met in an hour of shock and suffering ...
4,2009,madame speaker mr vice president members of co...


In [73]:
#data.iloc[2,1]

## Create DTM (document term matrix)

In [74]:
#set index to year for DTM
#data.set_index('year', inplace = True)
data.head()

Unnamed: 0,year,speech
0,1946,a quarter century ago the congress decided tha...
1,1976,mr speaker mr vice president members of the c...
2,1990,tonight i come not to speak about the state of...
3,2002,we last met in an hour of shock and suffering ...
4,2009,madame speaker mr vice president members of co...


In [75]:
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data.speech)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data.index
data_dtm

Unnamed: 0,aaa,abess,abide,abilities,ability,able,ably,aboutand,abovementioned,abreast,...,young,younger,youngest,youngi,youngmy,youre,youve,zero,zone,zones
0,1,0,1,1,3,4,1,0,1,1,...,1,0,0,0,0,0,0,0,2,1
1,0,0,0,1,2,0,0,0,0,0,...,0,1,1,1,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,2,0,0,0,0,2,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0,1,0,0,3,4,0,0,0,0,...,2,0,0,0,0,0,1,0,0,0


In [76]:
#data_dtm.to_pickle('pickled_data/dtm.pkl')
#pickle.dump(cv, open("cv.pkl", "wb"))

 ## Inspecting the data

In [77]:
data_dtm = data_dtm.transpose()
data_dtm.head()

Unnamed: 0,0,1,2,3,4
aaa,1,0,0,0,0
abess,0,0,0,0,1
abide,1,0,0,0,0
abilities,1,1,0,0,0
ability,3,2,0,0,3


In [78]:
#Find the 20 most used words in each speech
top_words = {}

for c in data_dtm.columns:
    top = data_dtm[c].sort_values(ascending=False).head(20)
    top_words[c]= list(zip(top.index, top.values))

top_words

{0: [('dollars', 197),
  ('war', 177),
  ('year', 166),
  ('million', 131),
  ('fiscal', 117),
  ('expenditures', 112),
  ('government', 112),
  ('program', 93),
  ('united', 89),
  ('billion', 79),
  ('federal', 68),
  ('congress', 67),
  ('states', 66),
  ('legislation', 60),
  ('national', 58),
  ('nations', 52),
  ('world', 51),
  ('estimated', 51),
  ('economic', 51),
  ('business', 50)],
 1: [('federal', 34),
  ('year', 25),
  ('america', 17),
  ('government', 17),
  ('people', 16),
  ('new', 16),
  ('programs', 15),
  ('congress', 14),
  ('budget', 14),
  ('economy', 13),
  ('americans', 13),
  ('american', 13),
  ('world', 12),
  ('states', 12),
  ('local', 12),
  ('help', 12),
  ('tax', 12),
  ('future', 11),
  ('state', 11),
  ('know', 10)],
 2: [('american', 20),
  ('america', 19),
  ('world', 19),
  ('new', 17),
  ('year', 14),
  ('time', 14),
  ('tonight', 12),
  ('future', 11),
  ('need', 10),
  ('idea', 10),
  ('ago', 10),
  ('let', 9),
  ('people', 9),
  ('today', 9),
 

In [79]:
'''Print the 15 most used words in each speech, check if some should be added to stopword list,
if they are irrelevant for the topic analysis'''
for year, t_words in top_words.items():
    print(year)
    print(', '.join([word for word, count in t_words[0:14]]))
    print('')

0
dollars, war, year, million, fiscal, expenditures, government, program, united, billion, federal, congress, states, legislation

1
federal, year, america, government, people, new, programs, congress, budget, economy, americans, american, world, states

2
american, america, world, new, year, time, tonight, future, need, idea, ago, let, people, today

3
america, security, world, american, terror, new, good, weapons, people, war, jobs, terrorists, country, freedom

4
american, economy, know, plan, health, people, new, care, america, years, time, energy, education, budget



In [80]:
# make list of top 20 words in each of the 5 speeches, from top_words dict
words = []
for year in data_dtm.columns:
    top = [word for (word, count) in top_words[year]]
    for t in top:
        words.append(t)
        
words

['dollars',
 'war',
 'year',
 'million',
 'fiscal',
 'expenditures',
 'government',
 'program',
 'united',
 'billion',
 'federal',
 'congress',
 'states',
 'legislation',
 'national',
 'nations',
 'world',
 'estimated',
 'economic',
 'business',
 'federal',
 'year',
 'america',
 'government',
 'people',
 'new',
 'programs',
 'congress',
 'budget',
 'economy',
 'americans',
 'american',
 'world',
 'states',
 'local',
 'help',
 'tax',
 'future',
 'state',
 'know',
 'american',
 'america',
 'world',
 'new',
 'year',
 'time',
 'tonight',
 'future',
 'need',
 'idea',
 'ago',
 'let',
 'people',
 'today',
 'kids',
 'hope',
 'state',
 'capital',
 'change',
 'budget',
 'america',
 'security',
 'world',
 'american',
 'terror',
 'new',
 'good',
 'weapons',
 'people',
 'war',
 'jobs',
 'terrorists',
 'country',
 'freedom',
 'nation',
 'afghanistan',
 'states',
 'terrorist',
 'time',
 'camps',
 'american',
 'economy',
 'know',
 'plan',
 'health',
 'people',
 'new',
 'care',
 'america',
 'years',
 '

In [81]:
# word and number of speeches it appears in
Counter(words).most_common()

[('world', 4),
 ('america', 4),
 ('people', 4),
 ('new', 4),
 ('american', 4),
 ('year', 3),
 ('states', 3),
 ('budget', 3),
 ('time', 3),
 ('war', 2),
 ('government', 2),
 ('federal', 2),
 ('congress', 2),
 ('economy', 2),
 ('americans', 2),
 ('future', 2),
 ('state', 2),
 ('know', 2),
 ('jobs', 2),
 ('country', 2),
 ('nation', 2),
 ('dollars', 1),
 ('million', 1),
 ('fiscal', 1),
 ('expenditures', 1),
 ('program', 1),
 ('united', 1),
 ('billion', 1),
 ('legislation', 1),
 ('national', 1),
 ('nations', 1),
 ('estimated', 1),
 ('economic', 1),
 ('business', 1),
 ('programs', 1),
 ('local', 1),
 ('help', 1),
 ('tax', 1),
 ('tonight', 1),
 ('need', 1),
 ('idea', 1),
 ('ago', 1),
 ('let', 1),
 ('today', 1),
 ('kids', 1),
 ('hope', 1),
 ('capital', 1),
 ('change', 1),
 ('security', 1),
 ('terror', 1),
 ('good', 1),
 ('weapons', 1),
 ('terrorists', 1),
 ('freedom', 1),
 ('afghanistan', 1),
 ('terrorist', 1),
 ('camps', 1),
 ('plan', 1),
 ('health', 1),
 ('care', 1),
 ('years', 1),
 ('energy

In [82]:
'''looking at the data, we decide that the most common words are irrelevant if they appear in more than 2 speeches'''
add_stop_words = [word for word, count in Counter(words).most_common() if count > 2]
add_stop_words

['world',
 'america',
 'people',
 'new',
 'american',
 'year',
 'states',
 'budget',
 'time']

In [83]:
#update stop word list with the words found above, union is used to avoid duplicates
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

## Prepare for analysis and first run

In [84]:
#DTM is upated with the new stopwords

cv_stop = CountVectorizer(stop_words=stop_words)
data_cv = cv_stop.fit_transform(data.speech)
data_stop = pd.DataFrame(data_cv.toarray(), columns=cv_stop.get_feature_names())
data_stop.index = data.index

#data_stop.to_pickle('pickled_data/dtm_stop.pkl')

In [85]:
data_stop.head()

Unnamed: 0,aaa,abess,abide,abilities,ability,able,ably,aboutand,abovementioned,abreast,...,young,younger,youngest,youngi,youngmy,youre,youve,zero,zone,zones
0,1,0,1,1,3,4,1,0,1,1,...,1,0,0,0,0,0,0,0,2,1
1,0,0,0,1,2,0,0,0,0,0,...,0,1,1,1,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,2,0,0,0,0,2,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0,1,0,0,3,4,0,0,0,0,...,2,0,0,0,0,0,1,0,0,0


In [86]:
tdm = data_stop.T
tdm.head()

Unnamed: 0,0,1,2,3,4
aaa,1,0,0,0,0
abess,0,0,0,0,1
abide,1,0,0,0,0
abilities,1,1,0,0,0
ability,3,2,0,0,3


In [87]:
#change dtm df, first to sparse matrix and then to gensim corpus

sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [88]:
# gensim requires dictionary of the all terms and their respective location in the term-document matrix
id2word = dict((v, k) for k, v in cv_stop.vocabulary_.items())

In [89]:
# corpus = TDM and id2word = dict {location : term}
'''LDA for 2 topics and 10 passes'''
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.014*"dollars" + 0.012*"war" + 0.009*"million" + 0.008*"fiscal" + 0.008*"government" + 0.008*"expenditures" + 0.006*"program" + 0.006*"united" + 0.005*"billion" + 0.005*"federal"'),
 (1,
  '0.004*"know" + 0.004*"federal" + 0.004*"economy" + 0.004*"health" + 0.003*"americans" + 0.003*"congress" + 0.003*"jobs" + 0.003*"help" + 0.003*"future" + 0.003*"make"')]

In [90]:
'''LDA for 3 topics and 10 passes'''
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.005*"know" + 0.004*"health" + 0.004*"nation" + 0.004*"country" + 0.004*"tonight" + 0.004*"plan" + 0.004*"jobs" + 0.003*"economy" + 0.003*"need" + 0.003*"make"'),
 (1,
  '0.014*"dollars" + 0.013*"war" + 0.010*"million" + 0.009*"fiscal" + 0.008*"government" + 0.008*"expenditures" + 0.007*"program" + 0.006*"united" + 0.006*"billion" + 0.005*"federal"'),
 (2,
  '0.009*"federal" + 0.005*"government" + 0.004*"programs" + 0.004*"congress" + 0.003*"economy" + 0.003*"americans" + 0.003*"tax" + 0.003*"help" + 0.003*"local" + 0.003*"future"')]

In [91]:
'''LDA for 4 topics and 10 passes'''
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.006*"security" + 0.005*"terror" + 0.004*"good" + 0.004*"weapons" + 0.004*"war" + 0.004*"jobs" + 0.003*"country" + 0.003*"nation" + 0.003*"freedom" + 0.003*"terrorists"'),
 (1,
  '0.013*"dollars" + 0.012*"war" + 0.009*"million" + 0.008*"government" + 0.008*"fiscal" + 0.007*"expenditures" + 0.007*"federal" + 0.006*"program" + 0.006*"united" + 0.005*"billion"'),
 (2,
  '0.006*"know" + 0.005*"health" + 0.005*"care" + 0.004*"tonight" + 0.004*"plan" + 0.004*"economy" + 0.004*"years" + 0.004*"future" + 0.003*"need" + 0.003*"nation"'),
 (3,
  '0.000*"dollars" + 0.000*"war" + 0.000*"government" + 0.000*"million" + 0.000*"united" + 0.000*"billion" + 0.000*"congress" + 0.000*"fiscal" + 0.000*"program" + 0.000*"federal"')]

### As the results include several words that are irrelevant to possible topics, we try and narrow the search by only including nouns

In [92]:
# function that selects nouns only and return those as a string, details see: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

def nouns(text):
    '''tokenize a string and return only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [93]:
data_clean = pd.read_pickle('pickled_data/dtm.pkl')
data_clean.T

Unnamed: 0_level_0,aaa,abess,abide,abilities,ability,able,ably,aboutand,abovementioned,abreast,...,young,younger,youngest,youngi,youngmy,youre,youve,zero,zone,zones
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1946,1,0,1,1,3,4,1,0,1,1,...,1,0,0,0,0,0,0,0,2,1
1976,0,0,0,1,2,0,0,0,0,0,...,0,1,1,1,1,0,0,0,0,0
1990,0,0,0,0,0,0,0,1,0,0,...,2,0,0,0,0,2,0,0,0,1
2002,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2009,0,1,0,0,3,4,0,0,0,0,...,2,0,0,0,0,0,1,0,0,0


In [94]:
data_nouns = pd.DataFrame(data.speech.apply(nouns))
data_nouns

Unnamed: 0,speech
0,quarter century congress programs departments ...
1,mr speaker vice president members congress gue...
2,tonight i state government initiative year lin...
3,hour shock suffering months nation victims yor...
4,madame speaker vice president members congress...


In [95]:
# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.speech)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,aaa,abilities,ability,aboutand,abroad,absence,absorption,abundance,abuse,acceptance,...,years,yearswe,yesterday,yield,york,youngmy,youre,youve,zone,zones
0,1,1,3,0,1,0,1,0,0,0,...,41,0,0,1,1,0,0,0,1,1
1,0,1,2,0,0,0,0,1,1,0,...,8,1,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,6,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,2,0,0,0,0,0
4,0,0,3,0,0,1,0,0,1,1,...,15,0,1,1,0,0,0,1,0,0


In [96]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [97]:
'''LDA for 2 topics, 10 passes and nouns only'''
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.008*"economy" + 0.007*"health" + 0.007*"jobs" + 0.007*"americans" + 0.006*"security" + 0.006*"congress" + 0.006*"government" + 0.006*"tax" + 0.006*"care" + 0.006*"country"'),
 (1,
  '0.023*"dollars" + 0.020*"war" + 0.013*"government" + 0.011*"program" + 0.011*"expenditures" + 0.008*"congress" + 0.007*"legislation" + 0.006*"nations" + 0.006*"production" + 0.006*"business"')]

In [98]:
#3 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.009*"economy" + 0.008*"jobs" + 0.008*"health" + 0.007*"americans" + 0.007*"security" + 0.007*"congress" + 0.007*"tax" + 0.007*"government" + 0.006*"country" + 0.006*"care"'),
 (1,
  '0.026*"dollars" + 0.023*"war" + 0.015*"government" + 0.012*"expenditures" + 0.012*"program" + 0.009*"congress" + 0.008*"legislation" + 0.007*"nations" + 0.007*"business" + 0.007*"production"'),
 (2,
  '0.006*"idea" + 0.005*"future" + 0.005*"today" + 0.005*"kids" + 0.005*"state" + 0.005*"change" + 0.004*"nation" + 0.004*"capital" + 0.004*"home" + 0.004*"europe"')]

In [99]:
#4 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.008*"health" + 0.007*"nation" + 0.007*"country" + 0.007*"jobs" + 0.007*"economy" + 0.006*"care" + 0.006*"plan" + 0.006*"years" + 0.006*"security" + 0.006*"americans"'),
 (1,
  '0.027*"dollars" + 0.024*"war" + 0.015*"government" + 0.013*"expenditures" + 0.013*"program" + 0.009*"congress" + 0.008*"legislation" + 0.007*"nations" + 0.007*"production" + 0.007*"business"'),
 (2,
  '0.001*"dollars" + 0.001*"war" + 0.001*"government" + 0.001*"expenditures" + 0.001*"program" + 0.001*"congress" + 0.001*"programs" + 0.001*"years" + 0.001*"nations" + 0.001*"business"'),
 (3,
  '0.010*"government" + 0.009*"programs" + 0.007*"congress" + 0.007*"economy" + 0.007*"americans" + 0.007*"tax" + 0.006*"state" + 0.006*"growth" + 0.005*"energy" + 0.005*"jobs"')]

### Using nouns AND adjectives

In [100]:
# function that returns nouns and adjectives from a text string
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [101]:

data_nouns_adj = pd.DataFrame(data.speech.apply(nouns_adj))
data_nouns_adj.speech[1]


'mr speaker mr vice president members congress guestsas bicentennial america youngest nations recorded history forefathers shores men women planet better life familiesin mans long upward march savagery years christian calendar years jewish reckoningthere many deep valleys many bright peaksone peak highest ranges human history example shines forth people abundance share good life freedom union promise justice opportunity citizen union united states americawe paradise earth perfection minute yearswe many roots many branches americans generations deeds other homeland refuge shores unison i proud america i proud americanlife little better children i i life father mother i better children hands brains voice vote america exists conditions people ideas practical reality best times much translation best intentions recent past sound most history great things ageold problems overconfident abilities policeman indulgent parent homewe thought country massive national programs programs things rush g

In [102]:
# Recreate a document-term matrix with nouns AND adjectives

cvna = CountVectorizer(stop_words=stop_words)
data_cvna = cvna.fit_transform(data_nouns_adj.speech)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,aaa,abilities,ability,able,aboutand,abovementioned,abroad,absence,absorption,abundance,...,york,young,younger,youngest,youngi,youngmy,youre,youve,zone,zones
0,1,1,3,4,0,1,1,0,1,1,...,1,1,0,0,0,0,0,0,1,1
1,0,1,2,0,0,0,0,0,0,1,...,0,0,1,1,1,1,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,2,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,2,0,1,0,0,0,0,0,0,0
4,0,0,3,4,0,0,0,1,0,0,...,0,2,0,0,0,0,0,1,0,0


In [103]:
# Create the gensim corpus
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# Create the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [104]:
#as above, 2 topics and 10 passes
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.006*"federal" + 0.005*"economy" + 0.005*"health" + 0.005*"jobs" + 0.005*"americans" + 0.005*"future" + 0.004*"congress" + 0.004*"nation" + 0.004*"country" + 0.004*"government"'),
 (1,
  '0.018*"dollars" + 0.016*"war" + 0.011*"fiscal" + 0.010*"government" + 0.009*"expenditures" + 0.009*"program" + 0.008*"united" + 0.006*"federal" + 0.006*"congress" + 0.006*"legislation"')]

In [105]:
# 3 topics, 10 passes
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.006*"health" + 0.006*"jobs" + 0.006*"nation" + 0.005*"country" + 0.005*"economy" + 0.005*"care" + 0.005*"americans" + 0.005*"plan" + 0.005*"security" + 0.005*"years"'),
 (1,
  '0.001*"war" + 0.001*"dollars" + 0.001*"jobs" + 0.001*"health" + 0.001*"federal" + 0.001*"government" + 0.001*"congress" + 0.001*"united" + 0.001*"fiscal" + 0.001*"economy"'),
 (2,
  '0.017*"dollars" + 0.016*"war" + 0.011*"government" + 0.010*"fiscal" + 0.009*"federal" + 0.008*"program" + 0.008*"united" + 0.008*"expenditures" + 0.007*"congress" + 0.006*"legislation"')]

In [106]:
#4 topics, 10 passes
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.001*"war" + 0.001*"dollars" + 0.000*"government" + 0.000*"fiscal" + 0.000*"program" + 0.000*"federal" + 0.000*"united" + 0.000*"security" + 0.000*"nation" + 0.000*"great"'),
 (1,
  '0.017*"dollars" + 0.016*"war" + 0.011*"government" + 0.011*"fiscal" + 0.009*"federal" + 0.008*"program" + 0.008*"united" + 0.008*"expenditures" + 0.007*"congress" + 0.006*"legislation"'),
 (2,
  '0.007*"jobs" + 0.007*"economy" + 0.007*"health" + 0.006*"plan" + 0.006*"security" + 0.006*"country" + 0.006*"nation" + 0.006*"americans" + 0.005*"care" + 0.005*"energy"'),
 (3,
  '0.006*"future" + 0.005*"idea" + 0.005*"tonight" + 0.005*"today" + 0.005*"kids" + 0.004*"state" + 0.004*"free" + 0.004*"capital" + 0.004*"change" + 0.004*"nation"')]

In [110]:
data_nouns_adj.columns

Index(['speech'], dtype='object')

In [118]:
# which topics of the 4 lists found, are in which speech (year)
corpus_transformed = ldana[corpusna]
print(len(corpus_transformed))
print(data_dtmna.index)
for x in data_dtmna.index:
    print(x)
    print(type(x))
list(zip([a for [(a,b)] in corpus_transformed], data.index))

5
RangeIndex(start=0, stop=5, step=1)
0
<class 'int'>
1
<class 'int'>
2
<class 'int'>
3
<class 'int'>
4
<class 'int'>


ValueError: too many values to unpack (expected 1)

## Creating Word Clouds for speeches from 5 significant years
### 1946: End of WW2, 1976: End of Vietnam war, 1990: End of the cold war, 2002: Following 9/11, 2009: Global fin.crisis

In [None]:
data_st = pd.read_pickle('pickled_data/data_first_clean.pkl')
data_st.drop(['President', 'Party','speech', 'first_clean_tokenized'], axis=1, inplace = True)
data_st = data_st.rename({'first_clean' : 'speech'}, axis=1)
#Significant years: 1946, end of ww2, 1976 end of Vietnam war, 1990 end of cold war, 2002 9/11, 2009 glob fin crisis
years = [1946, 1976, 1990, 2002, 2009]

data_st = data_st.loc[data_st['year'].isin(years)]
data_st = data_st.reset_index(drop=True)

data_st['year'] = data_st.year.astype('str')

data_st.head()

In [None]:
#make dict for plotting and alterative analysis below
speech_dict = dict(zip(data_st.year, data_st.speech))

In [None]:
#make wordcloud for each of the 5 speeches
stop_words = text.ENGLISH_STOP_WORDS

wc = WordCloud(stopwords=stop_words, background_color="black", colormap="Dark2",
               max_font_size=150, random_state=42)

plt.rcParams['figure.figsize'] = [10, 6]


for key, value in speech_dict.items():
    wc.generate(value)
    
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(key)    
    plt.show()

## As an experiment, we try the prelearned model from: https://huggingface.co/MoritzLaurer/policy-distilbert-7d on the same 5 years

In [None]:
def ml_policy(text):
    model_name = "MoritzLaurer/policy-distilbert-7d"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    input = tokenizer(text, truncation=True, return_tensors="pt")

    output = model(input["input_ids"])
    prediction = torch.softmax(output["logits"][0], -1).tolist()

    label_names = ["external relations", "freedom and democracy",
               "political system", "economy", "welfare and quality of life",
               "fabric of society", "social groups"]
    prediction = {name: round(float(pred) * 100, 1) for pred, name in
              zip(prediction, label_names)}
    return prediction

In [None]:
for x, y in speech_dict.items():
    print('Year: ', x)
    print(ml_policy(y))