In [None]:
import pandas as pd
import pickle
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from modules.cleaners import simple_clean
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
from gensim import matutils, models
import scipy.sparse

In [None]:
# Load the two pickled frames used by this notebook and preview the first one.
# NOTE(review): unpickling can execute arbitrary code, so these files should
# only ever come from a trusted source.
df_first = pd.read_pickle('pickled_data/data_first_clean.pkl')
# NOTE(review): df_second is not referenced again in the cells visible here.
df_second = pd.read_pickle('pickled_data/data_second_clean.pkl')
df_first.head()

In [None]:
def clean_for_tdm(text):
    '''Strip backslashes, punctuation and digit-bearing tokens from a string.

    Args:
        text: The string to clean.

    Returns:
        The input with every backslash removed, every character listed in
        ``string.punctuation`` removed, and every run of word characters
        containing at least one digit removed. Whitespace is left untouched,
        so a removed token can leave two consecutive spaces behind.
    '''
    text = text.replace("\\", "")
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Raw string: in a normal literal '\w' and '\d' are invalid escape
    # sequences, which Python reports with a warning at compile time.
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Two groups of years (three consecutive-year runs each), plus their
# concatenation, which is used further down to filter the data by year.
crisis = [*range(1930, 1933), *range(1986, 1989), *range(2008, 2011)]
growth = [*range(1926, 1929), *range(1992, 1995), *range(2005, 2008)]
selection = [*crisis, *growth]

In [None]:
def clean_txt(x):
    '''Run a single text value through clean_for_tdm and return the result.'''
    return clean_for_tdm(x)

In [None]:
# Build the working frame: keep the year and cleaned-text columns, restrict
# the rows to the selected years, renumber them from zero and store the year
# as a string. The result is also pickled for later reuse.
data = (
    df_first[['year', 'first_clean']]
    .copy()
    .loc[lambda frame: frame.year.isin(selection)]
    .reset_index(drop=True)
    .assign(year=lambda frame: frame['year'].apply(str))
)
data.to_pickle("pickled_data/for_word_cloud.pkl")
data

In [None]:
# Clean every text with clean_txt and keep the result as a one-column frame
# (the column keeps the name 'first_clean').
data_clean = data.first_clean.apply(clean_txt).to_frame()
data_clean

In [None]:
# Count word occurrences per document, ignoring scikit-learn's built-in
# English stop words.
stop_words = text.ENGLISH_STOP_WORDS

# CountVectorizer documents `stop_words` as 'english', a list, or None, and
# recent scikit-learn releases reject other types, so the frozenset is
# converted to a list at the call site (the `stop_words` name itself is left
# unchanged for later cells).
cv = CountVectorizer(stop_words=list(stop_words))
data_cv = cv.fit_transform(data_clean['first_clean'])
# get_feature_names() has been removed from scikit-learn;
# get_feature_names_out() is the supported replacement.
data_tdm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names_out())
# Label each row (one per document) with its year.
data_tdm.index = data.year

data_tdm

In [None]:
# Flip the matrix so that terms are rows and years are columns.
# NOTE(review): this rebinds data_tdm to its own transpose, so running the
# cell a second time flips it back; run it exactly once per kernel session.
data_tdm = data_tdm.T
data_tdm.head()

In [None]:
# Find the top 25 words in each speech
def _top_terms(counts, n):
    '''Return the n largest entries of `counts` as (label, value) pairs.'''
    ranked = counts.sort_values(ascending=False).head(n)
    return list(zip(ranked.index, ranked.values))


# One entry per column of the term-document matrix.
top_dict = {col: _top_terms(data_tdm[col], 25) for col in data_tdm.columns}

top_dict

In [None]:
# Print the top 12 words used in each year
for year, top_words in top_dict.items():
    print(year)
    # A slice end is exclusive: the previous [0:11] printed only 11 words,
    # one fewer than the 12 announced above.
    print(', '.join(word for word, _ in top_words[:12]))
    print('---')

In [None]:
# Transposed view of the working frame (rows become columns).
# NOTE(review): this transposes `data` (year + text), not the term counts,
# and `tdm` is not used again in the cells visible here - confirm it is needed.
tdm = data.T
tdm.head()

In [None]:
# Rebuild top_dict, this time keeping the 30 most frequent terms per column.
top_dict = {}
for column in data_tdm.columns:
    ranked = data_tdm[column].sort_values(ascending=False).iloc[:30]
    top_dict[column] = [(term, count) for term, count in zip(ranked.index, ranked.values)]

top_dict

In [None]:
# Convert the term-document counts to a SciPy CSR matrix and wrap it as a
# gensim corpus; documents_columns=True (the default) means each column of
# the matrix is treated as one document.
sparse_counts = scipy.sparse.csr_matrix(data_tdm)
corpus = matutils.Sparse2Corpus(sparse_counts, documents_columns=True)

In [None]:
# Invert the vectorizer's term -> column-index mapping so that gensim can
# translate ids back into words.
id2word = {index: term for term, index in cv.vocabulary_.items()}

In [None]:
# Fit a 2-topic LDA model on the full vocabulary. LDA starts from a random
# initialisation; random_state fixes the seed so re-running the cell yields
# comparable topics instead of a different set each time.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10,
                      random_state=42)
lda.print_topics()

In [None]:
# Fit a 3-topic LDA model on the full vocabulary. LDA starts from a random
# initialisation; random_state fixes the seed so re-running the cell yields
# comparable topics instead of a different set each time.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10,
                      random_state=42)
lda.print_topics()

In [None]:
# Fit a 4-topic LDA model on the full vocabulary. LDA starts from a random
# initialisation; random_state fixes the seed so re-running the cell yields
# comparable topics instead of a different set each time.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10,
                      random_state=42)
lda.print_topics()

In [None]:
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Return the nouns found in `text`, joined by single spaces.

    The text is split with `word_tokenize` and tagged with `pos_tag`; tokens
    whose tag starts with 'NN' are kept in their original order.
    '''
    kept = []
    for token, tag in pos_tag(word_tokenize(text)):
        if tag.startswith('NN'):
            kept.append(token)
    return ' '.join(kept)

In [None]:
# Reduce every cleaned text to its nouns, kept as a one-column frame.
data_nouns = data_clean['first_clean'].apply(nouns).to_frame()
data_nouns

In [None]:
# Extend scikit-learn's built-in English stop words with extra filler words
# before rebuilding the document-term matrix.
# NOTE(review): these extra words are not defined anywhere earlier in the
# visible notebook - confirm the list suits this corpus.
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns.
# CountVectorizer documents `stop_words` as 'english', a list, or None, and
# recent scikit-learn releases reject other types, hence the list() call.
cvn = CountVectorizer(stop_words=list(stop_words))
data_cvn = cvn.fit_transform(data_nouns['first_clean'])
# get_feature_names() has been removed from scikit-learn;
# get_feature_names_out() is the supported replacement.
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names_out())
data_dtmn.index = data_nouns.index
data_dtmn

In [None]:
# Build the gensim corpus from the noun-only counts. The matrix is transposed
# first because Sparse2Corpus (documents_columns=True) reads one document per
# column.
noun_term_doc = scipy.sparse.csr_matrix(data_dtmn.T)
corpusn = matutils.Sparse2Corpus(noun_term_doc, documents_columns=True)

# Map each column index back to its term for gensim.
id2wordn = {index: term for term, index in cvn.vocabulary_.items()}

In [None]:
# Fit a 2-topic LDA model on the noun-only corpus. random_state fixes the
# seed of the random initialisation so re-running the cell yields comparable
# topics instead of a different set each time.
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10,
                       random_state=42)
ldan.print_topics()

In [None]:
# Fit a 3-topic LDA model on the noun-only corpus. random_state fixes the
# seed of the random initialisation so re-running the cell yields comparable
# topics instead of a different set each time.
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10,
                       random_state=42)
ldan.print_topics()

In [None]:
# Fit a 4-topic LDA model on the noun-only corpus. random_state fixes the
# seed of the random initialisation so re-running the cell yields comparable
# topics instead of a different set each time.
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10,
                       random_state=42)
ldan.print_topics()

In [None]:
def nouns_adj(text):
    '''Return the nouns and adjectives found in `text`, joined by single spaces.

    The text is split with `word_tokenize` and tagged with `pos_tag`; tokens
    whose tag starts with 'NN' or 'JJ' are kept in their original order.
    '''
    tagged_tokens = pos_tag(word_tokenize(text))
    return ' '.join(
        token for token, tag in tagged_tokens if tag.startswith(('NN', 'JJ'))
    )

In [None]:
# Reduce every cleaned text to its nouns and adjectives, kept as a
# one-column frame.
data_nouns_adj = data_clean['first_clean'].apply(nouns_adj).to_frame()
data_nouns_adj

In [None]:
# Document-term matrix over nouns and adjectives. max_df=.8 additionally
# drops terms that appear in more than 80% of the documents.
# CountVectorizer documents `stop_words` as 'english', a list, or None, and
# recent scikit-learn releases reject other types, hence the list() call.
cvna = CountVectorizer(stop_words=list(stop_words), max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj['first_clean'])
# get_feature_names() has been removed from scikit-learn;
# get_feature_names_out() is the supported replacement.
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names_out())
data_dtmna.index = data_nouns_adj.index
data_dtmna

In [None]:
# Build the gensim corpus from the noun/adjective counts. The matrix is
# transposed first because Sparse2Corpus (documents_columns=True) reads one
# document per column.
noun_adj_term_doc = scipy.sparse.csr_matrix(data_dtmna.T)
corpusna = matutils.Sparse2Corpus(noun_adj_term_doc, documents_columns=True)

# Map each column index back to its term for gensim.
id2wordna = {index: term for term, index in cvna.vocabulary_.items()}

In [None]:
# Fit a 2-topic LDA model on the noun/adjective corpus. random_state fixes
# the seed of the random initialisation so re-running the cell yields
# comparable topics instead of a different set each time.
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10,
                        random_state=42)
ldana.print_topics()

In [None]:
# Fit a 3-topic LDA model on the noun/adjective corpus. random_state fixes
# the seed of the random initialisation so re-running the cell yields
# comparable topics instead of a different set each time.
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10,
                        random_state=42)
ldana.print_topics()

In [None]:
# Fit a 4-topic LDA model on the noun/adjective corpus. random_state fixes
# the seed of the random initialisation so re-running the cell yields
# comparable topics instead of a different set each time.
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10,
                        random_state=42)
ldana.print_topics()

In [None]:
# Refit the 4-topic noun/adjective model with 80 passes over the corpus
# instead of 10. This is the model the following cell applies to the corpus.
# random_state fixes the seed of the random initialisation so re-running the
# cell yields comparable topics instead of a different set each time.
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=80,
                        random_state=42)
ldana.print_topics()

In [None]:
# Apply the fitted noun/adjective model to its corpus, then print the number
# of transformed documents, the first 18 rows of the document-term matrix and
# that matrix's row count.
corpus_transformed = ldana[corpusna]
print(len(corpus_transformed))
print(data_dtmna.iloc[:18])
print(data_dtmna.shape[0])
# Disabled pairing of each document's topic with its row label, kept as-is:
#list(zip([a for [(a,b)] in corpus_transformed], data_dtmna.index))