In [62]:
import pandas as pd
import pickle
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from modules.cleaners import simple_clean
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
from gensim import matutils, models
import scipy.sparse

In [63]:
df_first = pd.read_pickle('pickled_data/data_first_clean.pkl')
df_second = pd.read_pickle('pickled_data/data_second_clean.pkl')
df_first.head()

Unnamed: 0,year,President,Party,speech,first_clean,first_clean_tokenized
0,1900,William McKinley,Republican,To the Senate and House of Representatives: \n...,to the senate and house of representatives: at...,"[senate, house, representatives, outgoing, old..."
1,1901,Theodore Roosevelt,Republican,To the Senate and House of Representatives: \n...,to the senate and house of representatives: th...,"[senate, house, representatives, congress, ass..."
2,1902,Theodore Roosevelt,Republican,To the Senate and House of Representatives: \n...,to the senate and house of representatives: we...,"[senate, house, representatives, still, contin..."
3,1903,Theodore Roosevelt,Republican,To the Senate and House of Representatives: \n...,to the senate and house of representatives: th...,"[senate, house, representatives, country, cong..."
4,1904,Theodore Roosevelt,Republican,To the Senate and House of Representatives: \n...,to the senate and house of representatives: th...,"[senate, house, representatives, nation, conti..."


In [64]:
def clean_for_tdm(text):
    '''Remove forward slash, punctuation and numbers'''
    text = text.replace("\\", "")
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text   

In [65]:
crisis = [1930, 1931, 1932, 1986, 1987, 1988, 2008, 2009, 2010]
growth = [1926, 1927, 1928, 1992, 1993, 1994, 2005, 2006, 2007]
selection = crisis+growth

In [72]:
clean_txt = lambda x: clean_for_tdm(x)

In [74]:
data = df_first[['year', 'first_clean']].copy()
data = data[data.year.isin(selection)]
data = data.reset_index(drop=True)
data['year'] = data['year'].apply(str)
data.to_pickle("pickled_data/for_word_cloud.pkl")
data

Unnamed: 0,year,first_clean
0,1926,members of the congress: in reporting to the c...
1,1927,members of the congress: it is gratifying to r...
2,1928,to the congress of the united states:no congre...
3,1930,to the senate and house of representatives: i ...
4,1931,to the senate and house of representatives: it...
5,1932,to the senate and house of representatives: in...
6,1986,"mr. speaker, mr. president, distinguished memb..."
7,1987,"thank you very much. mr. speaker, mr. presiden..."
8,1988,"""thank you. mr. speaker, mr. president, and di..."
9,1992,"mr. speaker and mr. president, distinguished m..."


In [75]:
data_clean = pd.DataFrame(data.first_clean.apply(clean_txt))
data_clean

Unnamed: 0,first_clean
0,members of the congress in reporting to the co...
1,members of the congress it is gratifying to re...
2,to the congress of the united statesno congres...
3,to the senate and house of representatives i h...
4,to the senate and house of representatives it ...
5,to the senate and house of representatives in ...
6,mr speaker mr president distinguished members ...
7,thank you very much mr speaker mr president di...
8,thank you mr speaker mr president and distingu...
9,mr speaker and mr president distinguished memb...


In [76]:
stop_words = text.ENGLISH_STOP_WORDS

cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(data_clean['first_clean'])
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data.year

data_dtm

Unnamed: 0_level_0,abandon,abandoned,abandoning,abandonment,abbas,abess,abide,abiding,abidinglogic,abilities,...,youre,youth,youthful,youve,zarqawi,zeroemission,zimbabwe,zinc,zone,zones
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1926,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1927,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1928,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1930,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1931,1,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1932,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1986,0,2,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1987,1,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
1988,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1992,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [77]:
data_dtm = data_dtm.transpose()
data_dtm.head()

year,1926,1927,1928,1930,1931,1932,1986,1987,1988,1992,1993,1994,2005,2006,2007,2008,2009,2010
abandon,0,0,0,0,1,0,0,1,0,0,0,0,2,1,1,1,0,0
abandoned,0,1,1,0,1,0,2,0,0,0,0,1,0,0,1,0,0,0
abandoning,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
abandonment,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
abbas,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [78]:
# Find the top 25 words in each speech
top_dict = {}
for c in data_dtm.columns:
    top = data_dtm[c].sort_values(ascending=False).head(25)
    top_dict[c]= list(zip(top.index, top.values))

top_dict

{'1926': [('government', 49),
  ('country', 36),
  ('national', 31),
  ('federal', 26),
  ('people', 25),
  ('present', 25),
  ('congress', 23),
  ('ought', 23),
  ('years', 22),
  ('reduction', 21),
  ('legislation', 21),
  ('general', 19),
  ('public', 19),
  ('time', 17),
  ('great', 17),
  ('agriculture', 17),
  ('war', 16),
  ('law', 15),
  ('work', 15),
  ('large', 14),
  ('nation', 14),
  ('year', 14),
  ('agricultural', 13),
  ('tariff', 13),
  ('development', 13)],
 '1927': [('government', 46),
  ('country', 25),
  ('congress', 25),
  ('public', 24),
  ('national', 23),
  ('people', 22),
  ('agriculture', 21),
  ('federal', 19),
  ('states', 19),
  ('department', 16),
  ('law', 15),
  ('land', 15),
  ('great', 15),
  ('farmer', 15),
  ('state', 15),
  ('property', 15),
  ('necessary', 14),
  ('legislation', 14),
  ('large', 14),
  ('order', 13),
  ('war', 13),
  ('provide', 13),
  ('years', 13),
  ('power', 13),
  ('reduction', 12)],
 '1928': [('government', 45),
  ('congress'

In [79]:
# Print the top 12 words used in each year
for year, top_words in top_dict.items():
    print(year)
    print(', '.join([word for word, count in top_words[0:11]]))
    print('---')

1926
government, country, national, federal, people, present, congress, ought, years, reduction, legislation
---
1927
government, country, congress, public, national, people, agriculture, federal, states, department, law
---
1928
government, congress, states, public, year, years, country, federal, new, present, private
---
1930
year, congress, government, construction, public, depression, federal, work, economic, employment, country
---
1931
banks, federal, government, action, year, meet, public, credit, congress, economic, financial
---
1932
government, economic, congress, federal, people, great, action, recovery, world, national, country
---
1986
america, future, people, american, work, applause, family, tonight, world, budget, government
---
1987
congress, people, america, applause, world, years, new, american, freedom, let, government
---
1988
america, applause, government, years, world, freedom, budget, people, lets, federal, family
---
1992
people, know, plan, world, right, help,

AttributeError: 'DataFrame' object has no attribute 'transcript'

In [80]:
tdm = data.transpose()
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
year,1926,1927,1928,1930,1931,1932,1986,1987,1988,1992,1993,1994,2005,2006,2007,2008,2009,2010
first_clean,members of the congress: in reporting to the c...,members of the congress: it is gratifying to r...,to the congress of the united states:no congre...,to the senate and house of representatives: i ...,to the senate and house of representatives: it...,to the senate and house of representatives: in...,"mr. speaker, mr. president, distinguished memb...","thank you very much. mr. speaker, mr. presiden...","""thank you. mr. speaker, mr. president, and di...","mr. speaker and mr. president, distinguished m...","mr. president, mr. speaker, members of the hou...","thank you very much. mr. speaker, mr. presiden...","as a new congress gathers, all of us in the el...","mr. speaker, vice president cheney, members of...","thank you very much. and tonight, i have a hig...","the president: madam speaker, vice president c...","madame speaker, mr. vice president, members of...","madam speaker, vice president biden, members o..."


In [81]:
top_dict = {}
for c in data_tdm.columns:
    top = data_tdm[c].sort_values(ascending=False).head(30)
    top_dict[c]= list(zip(top.index, top.values))

top_dict

{'abandon': [(12, 2),
  (15, 1),
  (14, 1),
  (13, 1),
  (4, 1),
  (7, 1),
  (17, 0),
  (5, 0),
  (1, 0),
  (2, 0),
  (3, 0),
  (8, 0),
  (6, 0),
  (16, 0),
  (9, 0),
  (10, 0),
  (11, 0),
  (0, 0)],
 'abandoned': [(6, 2),
  (1, 1),
  (14, 1),
  (2, 1),
  (11, 1),
  (4, 1),
  (17, 0),
  (7, 0),
  (3, 0),
  (5, 0),
  (8, 0),
  (16, 0),
  (9, 0),
  (10, 0),
  (12, 0),
  (13, 0),
  (15, 0),
  (0, 0)],
 'abandoning': [(13, 1),
  (17, 0),
  (7, 0),
  (1, 0),
  (2, 0),
  (3, 0),
  (4, 0),
  (5, 0),
  (6, 0),
  (8, 0),
  (16, 0),
  (9, 0),
  (10, 0),
  (11, 0),
  (12, 0),
  (14, 0),
  (15, 0),
  (0, 0)],
 'abandonment': [(2, 1),
  (4, 1),
  (6, 1),
  (17, 0),
  (16, 0),
  (1, 0),
  (3, 0),
  (5, 0),
  (7, 0),
  (8, 0),
  (9, 0),
  (10, 0),
  (11, 0),
  (12, 0),
  (13, 0),
  (14, 0),
  (15, 0),
  (0, 0)],
 'abbas': [(12, 1),
  (17, 0),
  (7, 0),
  (1, 0),
  (2, 0),
  (3, 0),
  (4, 0),
  (5, 0),
  (6, 0),
  (8, 0),
  (16, 0),
  (9, 0),
  (10, 0),
  (11, 0),
  (13, 0),
  (14, 0),
  (15, 0),
  (0

In [82]:
sparse_counts = scipy.sparse.csr_matrix(data_tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [83]:
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [84]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.178*"abandon" + 0.172*"abandoning" + 0.166*"abandoned" + 0.130*"abbas" + 0.107*"abandonment" + 0.087*"abess" + 0.002*"abidinglogic" + 0.002*"ability" + 0.001*"able" + 0.001*"abolishing"'),
 (1,
  '0.114*"able" + 0.099*"abortion" + 0.091*"ability" + 0.081*"aboard" + 0.079*"abidinglogic" + 0.078*"abolishing" + 0.074*"ablebodied" + 0.069*"abolition" + 0.066*"abilities" + 0.058*"abiding"')]

In [85]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.179*"abandon" + 0.176*"abandoning" + 0.167*"abandoned" + 0.130*"abbas" + 0.107*"abandonment" + 0.088*"abess" + 0.012*"ability" + 0.008*"abidinglogic" + 0.006*"abortion" + 0.006*"able"'),
 (1,
  '0.109*"abiding" + 0.000*"abandoning" + 0.000*"abide" + 0.000*"aboard" + 0.000*"abilities" + 0.000*"abolishing" + 0.000*"abidinglogic" + 0.000*"ablebodied" + 0.000*"abolition" + 0.000*"abortion"'),
 (2,
  '0.118*"able" + 0.102*"abortion" + 0.091*"ability" + 0.085*"aboard" + 0.082*"abolishing" + 0.081*"abidinglogic" + 0.078*"ablebodied" + 0.071*"abolition" + 0.068*"abilities" + 0.059*"abolished"')]

In [86]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10)
lda.print_topics()

[(0,
  '0.206*"able" + 0.179*"abortion" + 0.166*"ability" + 0.119*"abolition" + 0.081*"abiding" + 0.049*"abilities" + 0.033*"abolishing" + 0.033*"abidinglogic" + 0.028*"ablebodied" + 0.014*"abide"'),
 (1,
  '0.171*"abolishing" + 0.000*"abilities" + 0.000*"abortion" + 0.000*"abiding" + 0.000*"abandoning" + 0.000*"abolition" + 0.000*"able" + 0.000*"abandon" + 0.000*"abandoned" + 0.000*"abide"'),
 (2,
  '0.194*"abandon" + 0.186*"abandoning" + 0.180*"abandoned" + 0.140*"abbas" + 0.116*"abandonment" + 0.096*"abess" + 0.002*"abidinglogic" + 0.002*"abolishing" + 0.001*"abide" + 0.000*"ability"'),
 (3,
  '0.177*"aboard" + 0.138*"abidinglogic" + 0.132*"ablebodied" + 0.120*"abolished" + 0.117*"abolishing" + 0.099*"abide" + 0.089*"abilities" + 0.031*"abiding" + 0.007*"abolition" + 0.000*"ability"')]

In [87]:
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [91]:
data_nouns = pd.DataFrame(data_clean['first_clean'].apply(nouns))
data_nouns

Unnamed: 0,first_clean
0,members congress congress state union i peace ...
1,members congress year state union country whol...
2,congress statesno congress states state union ...
3,senate house representatives i honor requireme...
4,senate house representatives duty constitution...
5,senate house representatives accord duty trans...
6,mr speaker president members congress guests c...
7,thank mr speaker president members congress gu...
8,thank mr president members house senate years ...
9,mr speaker mr president members congress guest...


In [94]:
# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns['first_clean'])
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,abandonment,abbas,abilities,ability,abolition,abortion,abortions,absence,absent,absolute,...,york,youll,youth,youve,zarqawi,zeroemission,zimbabwe,zinc,zone,zones
0,0,0,0,3,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,4,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,1,0,1,3,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,1,0


In [95]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [96]:
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.009*"health" + 0.009*"year" + 0.009*"america" + 0.009*"country" + 0.009*"world" + 0.009*"years" + 0.008*"americans" + 0.008*"government" + 0.007*"care" + 0.007*"congress"'),
 (1,
  '0.016*"government" + 0.012*"congress" + 0.008*"country" + 0.008*"year" + 0.007*"years" + 0.006*"world" + 0.006*"states" + 0.005*"legislation" + 0.005*"agriculture" + 0.004*"action"')]

In [97]:
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.010*"america" + 0.009*"year" + 0.008*"years" + 0.008*"government" + 0.008*"world" + 0.008*"americans" + 0.008*"congress" + 0.008*"security" + 0.007*"nation" + 0.007*"country"'),
 (1,
  '0.018*"government" + 0.011*"congress" + 0.009*"country" + 0.008*"year" + 0.006*"years" + 0.006*"states" + 0.005*"agriculture" + 0.005*"legislation" + 0.005*"world" + 0.005*"action"'),
 (2,
  '0.011*"health" + 0.010*"world" + 0.009*"care" + 0.009*"year" + 0.009*"plan" + 0.009*"country" + 0.009*"years" + 0.008*"congress" + 0.008*"jobs" + 0.008*"government"')]

In [98]:
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.014*"government" + 0.011*"congress" + 0.010*"year" + 0.009*"banks" + 0.008*"action" + 0.007*"construction" + 0.006*"world" + 0.006*"country" + 0.005*"employment" + 0.005*"depression"'),
 (1,
  '0.015*"government" + 0.011*"country" + 0.011*"congress" + 0.009*"years" + 0.008*"year" + 0.007*"states" + 0.006*"world" + 0.006*"care" + 0.006*"health" + 0.005*"plan"'),
 (2,
  '0.010*"america" + 0.009*"americans" + 0.009*"country" + 0.009*"world" + 0.009*"year" + 0.008*"security" + 0.008*"congress" + 0.008*"health" + 0.008*"nation" + 0.008*"years"'),
 (3,
  '0.009*"america" + 0.009*"world" + 0.009*"freedom" + 0.009*"government" + 0.009*"budget" + 0.008*"years" + 0.008*"family" + 0.007*"year" + 0.005*"future" + 0.005*"applause"')]

In [99]:
def nouns_adj(text):
    '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [100]:
data_nouns_adj = pd.DataFrame(data_clean['first_clean'].apply(nouns_adj))
data_nouns_adj

Unnamed: 0,first_clean
0,members congress congress state union i imposs...
1,members congress fourth consecutive year state...
2,congress united statesno congress united state...
3,senate house representatives i honor requireme...
4,senate house representatives duty constitution...
5,senate house representatives accord constituti...
6,mr speaker president members congress guests f...
7,thank much mr speaker president members congre...
8,thank speaker mr president members house senat...
9,mr speaker mr president members congress guest...


In [102]:
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj['first_clean'])
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna

Unnamed: 0,abandonment,abbas,abiding,abidinglogic,abilities,ability,able,ablebodied,abolition,abortion,...,younger,youth,youthful,youve,zarqawi,zeroemission,zimbabwe,zinc,zone,zones
0,0,0,0,0,0,3,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,4,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,2,4,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,1,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,0,0,1,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [103]:
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [104]:
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.008*"americans" + 0.006*"jobs" + 0.005*"plan" + 0.005*"applause" + 0.004*"tonight" + 0.004*"reform" + 0.003*"iraq" + 0.003*"energy" + 0.003*"spending" + 0.003*"welfare"'),
 (1,
  '0.005*"agriculture" + 0.005*"present" + 0.004*"construction" + 0.004*"banks" + 0.004*"necessary" + 0.004*"department" + 0.004*"general" + 0.004*"reduction" + 0.003*"cent" + 0.003*"cooperation"')]

In [105]:
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.007*"banks" + 0.006*"construction" + 0.005*"depression" + 0.004*"employment" + 0.004*"credit" + 0.004*"financial" + 0.004*"recovery" + 0.004*"measures" + 0.004*"unemployment" + 0.004*"reduction"'),
 (1,
  '0.008*"americans" + 0.007*"jobs" + 0.006*"plan" + 0.005*"applause" + 0.005*"tonight" + 0.004*"reform" + 0.004*"iraq" + 0.003*"energy" + 0.003*"spending" + 0.003*"terrorists"'),
 (2,
  '0.005*"agriculture" + 0.005*"present" + 0.005*"department" + 0.004*"general" + 0.004*"land" + 0.004*"necessary" + 0.004*"reduction" + 0.003*"policy" + 0.003*"order" + 0.003*"property"')]

In [106]:
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.010*"jobs" + 0.010*"plan" + 0.008*"americans" + 0.005*"reform" + 0.005*"welfare" + 0.004*"investment" + 0.004*"energy" + 0.004*"spending" + 0.004*"percent" + 0.004*"tonight"'),
 (1,
  '0.000*"americans" + 0.000*"jobs" + 0.000*"applause" + 0.000*"iraq" + 0.000*"plan" + 0.000*"military" + 0.000*"agriculture" + 0.000*"trade" + 0.000*"women" + 0.000*"terrorists"'),
 (2,
  '0.005*"agriculture" + 0.005*"present" + 0.005*"construction" + 0.004*"necessary" + 0.004*"banks" + 0.004*"department" + 0.004*"general" + 0.004*"reduction" + 0.004*"cent" + 0.003*"cooperation"'),
 (3,
  '0.007*"americans" + 0.007*"applause" + 0.005*"iraq" + 0.004*"tonight" + 0.004*"jobs" + 0.004*"terrorists" + 0.003*"plan" + 0.003*"reform" + 0.003*"hope" + 0.003*"women"')]

In [107]:
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=80)
ldana.print_topics()

[(0,
  '0.006*"agriculture" + 0.005*"present" + 0.005*"construction" + 0.005*"banks" + 0.005*"necessary" + 0.005*"general" + 0.005*"department" + 0.004*"reduction" + 0.004*"cent" + 0.003*"cooperation"'),
 (1,
  '0.003*"true" + 0.003*"dream" + 0.003*"tonight" + 0.002*"human" + 0.002*"real" + 0.002*"reform" + 0.002*"music" + 0.002*"americas" + 0.002*"race" + 0.002*"moral"'),
 (2,
  '0.009*"americans" + 0.007*"jobs" + 0.006*"plan" + 0.005*"applause" + 0.005*"tonight" + 0.004*"reform" + 0.004*"iraq" + 0.004*"energy" + 0.003*"terrorists" + 0.003*"workers"'),
 (3,
  '0.005*"agreement" + 0.004*"revolution" + 0.003*"democratic" + 0.003*"rule" + 0.003*"trade" + 0.002*"days" + 0.002*"arms" + 0.002*"lets" + 0.002*"poverty" + 0.002*"nicaragua"')]

In [114]:
# Let's take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
print(len(corpus_transformed))
print(data_dtmna[:2])
print(len(data_dtmna.index))
#list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))

18
   abandonment  abbas  abiding  abidinglogic  abilities  ability  able  \
0            0      0        0             0          0        3     1   
1            0      0        0             0          0        4     1   

   ablebodied  abolition  abortion  ...  younger  youth  youthful  youve  \
0           0          1         0  ...        0      0         0      0   
1           0          0         0  ...        0      0         0      0   

   zarqawi  zeroemission  zimbabwe  zinc  zone  zones  
0        0             0         0     0     0      0  
1        0             0         0     0     0      0  

[2 rows x 5615 columns]
18
