In [None]:
import pandas as pd
import pickle
import torch
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
from gensim import matutils, models
import scipy.sparse
from collections import Counter
from nltk import word_tokenize, pos_tag
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification

## Topic modelling using LDA
https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

In [None]:
# We select speeches from 5 historically significant years from the corpus
years = [1946, 1976, 1990, 2002, 2009]

# Build the frame in one chain so the cell can be re-run from scratch and the
# result is a fresh frame rather than a filtered slice (assigning a column on a
# .loc slice is what triggers pandas' SettingWithCopyWarning).
data = (
    pd.read_pickle('pickled_data/data_first_clean.pkl')
    .drop(columns=['President', 'Party', 'speech', 'first_clean_tokenized'])
    # rename column for clarity
    .rename(columns={'first_clean': 'speech'})
    .loc[lambda frame: frame['year'].isin(years)]
    # years become strings so they can serve as document labels
    .assign(year=lambda frame: frame['year'].apply(str))
    .set_index('year')
)
data.head()

In [None]:
#Check a sample to see if more cleaning is needed

#data.loc[2, 'speech']

In [None]:
def clean_for_tdm(text):
    '''Remove backslashes, punctuation and number-bearing tokens from a string.

    Parameters
    ----------
    text : str
        Raw speech text.

    Returns
    -------
    str
        The text with every backslash and every character in
        ``string.punctuation`` deleted, and with any token containing at
        least one digit (e.g. ``1946`` or ``c1d``) deleted. Surrounding
        whitespace is left as-is, so removed tokens can leave double spaces.
    '''
    text = text.replace("\\", "")
    # Raw strings keep the regex escapes (\w, \d) out of Python's own string
    # escape processing, which otherwise emits invalid-escape warnings.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Run the TDM cleaning step over every speech in the column
data['speech'] = data['speech'].map(clean_for_tdm)

In [None]:
data.head()

In [None]:
#data.iloc[2,1]

## Create DTM (document term matrix)

In [None]:
#set index to year for DTM
#data.set_index('year', inplace = True)
data.head()

In [None]:
# Document-term matrix: one row per speech (indexed like `data`), one column per term
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data.speech)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back only on releases that predate get_feature_names_out().
data_dtm = pd.DataFrame(
    data_cv.toarray(),
    columns=(cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
             else cv.get_feature_names()),
)
data_dtm.index = data.index
data_dtm

In [None]:
#data_dtm.to_pickle('pickled_data/dtm.pkl')
#pickle.dump(cv, open("cv.pkl", "wb"))

## Inspecting the data

In [None]:
# Flip to terms-as-rows, speeches-as-columns.
# NOTE: the name is rebound to its own transpose, so running this cell a
# second time flips the frame back.
data_dtm = data_dtm.T
data_dtm.head()

In [None]:
# For every speech (column), keep its 20 highest-count terms as (term, count) pairs
top_words = {}

for speech_label in data_dtm.columns:
    ranked_counts = data_dtm[speech_label].sort_values(ascending=False)
    leading = ranked_counts.head(20)
    top_words[speech_label] = list(zip(leading.index, leading.values))

top_words

In [None]:
# Print the 15 most used words in each speech, to check whether some should be
# added to the stop word list because they are irrelevant for the topic analysis.
for year, t_words in top_words.items():
    print(year)
    # [:15] keeps exactly 15 entries (a [0:14] slice would stop one short)
    print(', '.join(word for word, count in t_words[:15]))
    print('')

In [None]:
# Flatten the per-speech top-20 lists from the top_words dict into one list of
# words (counts are dropped; a word appears once per speech that features it)
words = [word for year in data_dtm.columns for word, _count in top_words[year]]

words

In [None]:
# Looking at the data, we decide that the most common words are irrelevant if
# they appear in the top-20 list of more than 2 speeches (such words are likely
# generic to SOTU speeches and not relevant as topics in individual speeches).
word_frequencies = Counter(words).most_common()
add_stop_words = [word for word, count in word_frequencies if count > 2]
add_stop_words

In [None]:
# Extend scikit-learn's English stop words with the words found above; a set
# union keeps the result free of duplicates
stop_words = text.ENGLISH_STOP_WORDS | frozenset(add_stop_words)

## Prepare for analysis and first run

In [None]:
# DTM is updated with the new stop words

# Recent scikit-learn releases validate `stop_words` and accept only
# 'english', a list or None, so the frozenset is converted to a list here.
cv_stop = CountVectorizer(stop_words=list(stop_words))
data_cv = cv_stop.fit_transform(data.speech)
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# releases that predate get_feature_names_out().
data_stop = pd.DataFrame(
    data_cv.toarray(),
    columns=(cv_stop.get_feature_names_out() if hasattr(cv_stop, 'get_feature_names_out')
             else cv_stop.get_feature_names()),
)
data_stop.index = data.index

#data_stop.to_pickle('pickled_data/dtm_stop.pkl')

In [None]:
data_stop.head()

In [None]:
# Term-document matrix: terms as rows, speeches as columns
tdm = data_stop.transpose()
tdm.head()

In [None]:
# Convert the term-document frame to a sparse matrix, then wrap it as a gensim
# corpus. documents_columns=True (the default) tells gensim that every column
# of the matrix is one document.
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts, documents_columns=True)

In [None]:
# gensim needs a lookup from column position in the matrix to the term itself;
# the vectorizer's vocabulary_ maps the other way (term -> position), so invert it
id2word = {position: term for term, position in cv_stop.vocabulary_.items()}

In [None]:
# corpus = TDM and id2word = dict {location : term}
# LDA for 2 topics and 10 passes.
# random_state fixes the model's random initialisation so the printed topics
# are the same on every run.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10,
                      random_state=42)
lda.print_topics()

In [None]:
# LDA for 3 topics and 10 passes.
# random_state fixes the model's random initialisation so the printed topics
# are the same on every run.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10,
                      random_state=42)
lda.print_topics()

In [None]:
# LDA for 4 topics and 10 passes.
# random_state fixes the model's random initialisation so the printed topics
# are the same on every run.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=4, passes=10,
                      random_state=42)
lda.print_topics()

### As the results include several words that are irrelevant as possible topics, we try to narrow the search by including only nouns

In [None]:
# function that selects nouns only and return those as a string, details see: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

def nouns(text):
    '''Tokenize ``text`` and return its nouns joined by single spaces.

    A token counts as a noun when the first two characters of its
    part-of-speech tag are 'NN'.
    '''
    tagged_tokens = pos_tag(word_tokenize(text))
    noun_words = []
    for word, pos in tagged_tokens:
        if pos.startswith('NN'):
            noun_words.append(word)
    return ' '.join(noun_words)

In [None]:
# Load the document-term matrix from disk and show it transposed.
# NOTE(review): the only line in this notebook that writes
# 'pickled_data/dtm.pkl' is commented out, so this read presumably depends on
# a file left over from an earlier session — confirm it exists on a fresh run.
# NOTE(review): data_clean is not referenced by any later cell visible here.
data_clean = pd.read_pickle('pickled_data/dtm.pkl')
data_clean.T

In [None]:
# Reduce every speech to its nouns; keep the result as a one-column frame
data_nouns = data['speech'].apply(nouns).to_frame()
data_nouns

In [None]:
# Recreate a document-term matrix with only nouns
# Recent scikit-learn releases validate `stop_words` and accept only
# 'english', a list or None, so the stop word collection is passed as a list.
cvn = CountVectorizer(stop_words=list(stop_words))
data_cvn = cvn.fit_transform(data_nouns.speech)
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# releases that predate get_feature_names_out().
data_dtmn = pd.DataFrame(
    data_cvn.toarray(),
    columns=(cvn.get_feature_names_out() if hasattr(cvn, 'get_feature_names_out')
             else cvn.get_feature_names()),
)
data_dtmn.index = data_nouns.index
data_dtmn

In [None]:
# Gensim corpus: transpose so terms are rows and speeches are columns, then
# convert to a sparse matrix and wrap it
term_doc_nouns = scipy.sparse.csr_matrix(data_dtmn.T)
corpusn = matutils.Sparse2Corpus(term_doc_nouns)

# Vocabulary dictionary: column position -> term (inverse of cvn.vocabulary_)
id2wordn = {position: term for term, position in cvn.vocabulary_.items()}

In [None]:
# LDA for 2 topics, 10 passes and nouns only.
# random_state fixes the model's random initialisation so the printed topics
# are the same on every run.
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10,
                       random_state=42)
ldan.print_topics()

In [None]:
# 3 topics, 10 passes, nouns only.
# random_state fixes the model's random initialisation so the printed topics
# are the same on every run.
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10,
                       random_state=42)
ldan.print_topics()

In [None]:
# 4 topics, 10 passes, nouns only.
# random_state fixes the model's random initialisation so the printed topics
# are the same on every run.
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10,
                       random_state=42)
ldan.print_topics()

### Using nouns AND adjectives

In [None]:
# function that returns nouns and adjectives from a text string
def nouns_adj(text):
    '''Tokenize ``text`` and return its nouns and adjectives joined by spaces.

    A token is kept when the first two characters of its part-of-speech tag
    are 'NN' or 'JJ'.
    '''
    wanted_prefixes = ('NN', 'JJ')
    tagged_tokens = pos_tag(word_tokenize(text))
    kept_words = [word for word, pos in tagged_tokens if pos[:2] in wanted_prefixes]
    return ' '.join(kept_words)

In [None]:

data_nouns_adj = pd.DataFrame(data.speech.apply(nouns_adj))
# The index holds year strings, so pick the second speech explicitly by
# position; integer keys on a non-integer index rely on a deprecated pandas
# fallback.
data_nouns_adj.speech.iloc[1]


In [None]:
# Recreate a document-term matrix with nouns AND adjectives

# Recent scikit-learn releases validate `stop_words` and accept only
# 'english', a list or None, so the stop word collection is passed as a list.
cvna = CountVectorizer(stop_words=list(stop_words))
data_cvna = cvna.fit_transform(data_nouns_adj.speech)
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# releases that predate get_feature_names_out().
data_dtmna = pd.DataFrame(
    data_cvna.toarray(),
    columns=(cvna.get_feature_names_out() if hasattr(cvna, 'get_feature_names_out')
             else cvna.get_feature_names()),
)
data_dtmna.index = data_nouns_adj.index
data_dtmna

In [None]:
# Gensim corpus: transpose so terms are rows and speeches are columns, then
# convert to a sparse matrix and wrap it
term_doc_nouns_adj = scipy.sparse.csr_matrix(data_dtmna.T)
corpusna = matutils.Sparse2Corpus(term_doc_nouns_adj)

# Vocabulary dictionary: column position -> term (inverse of cvna.vocabulary_)
id2wordna = {position: term for term, position in cvna.vocabulary_.items()}

In [None]:
# As above: 2 topics and 10 passes, nouns and adjectives.
# random_state fixes the model's random initialisation so the printed topics
# are the same on every run.
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10,
                        random_state=42)
ldana.print_topics()

In [None]:
# 3 topics, 10 passes, nouns and adjectives.
# random_state fixes the model's random initialisation so the printed topics
# are the same on every run.
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10,
                        random_state=42)
ldana.print_topics()

In [None]:
# 4 topics, 10 passes, nouns and adjectives.
# random_state fixes the model's random initialisation so the printed topics
# are the same on every run.
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10,
                        random_state=42)
ldana.print_topics()

In [None]:
# 4 topics, 80 passes, nouns and adjectives. This is the model used for the
# topic-per-speech lookup, so its random initialisation is pinned with
# random_state to keep that assignment stable between runs.
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=80,
                        random_state=42)
ldana.print_topics()

In [None]:
# Which of the 4 topics found dominates which speech (year)
corpus_transformed = ldana[corpusna]
# Each document yields a list of (topic_id, probability) pairs. That list can
# hold more than one pair, so take the most probable topic instead of
# unpacking a single tuple (which raises ValueError on longer lists).
list(zip(
    [max(doc_topics, key=lambda pair: pair[1])[0] for doc_topics in corpus_transformed],
    data_dtmna.index,
))

## Creating Word Clouds for speeches from 5 significant years
### 1946: end of World War II; 1976: end of the Vietnam War; 1990: end of the Cold War; 2002: aftermath of 9/11; 2009: global financial crisis

In [None]:
# Significant years: 1946 end of WW2, 1976 end of Vietnam war, 1990 end of cold war, 2002 9/11, 2009 global financial crisis
years = [1946, 1976, 1990, 2002, 2009]

# Load the cleaned corpus, keep only the cleaned text column (renamed to
# 'speech') and the rows for the selected years, renumbered from 0
data_st = (
    pd.read_pickle('pickled_data/data_first_clean.pkl')
    .drop(['President', 'Party', 'speech', 'first_clean_tokenized'], axis=1)
    .rename({'first_clean': 'speech'}, axis=1)
    .loc[lambda frame: frame['year'].isin(years)]
    .reset_index(drop=True)
)

data_st['year'] = data_st['year'].astype('str')

data_st.head()

In [None]:
# Map year -> speech text, for plotting and the alternative analysis below
speech_dict = {year: speech for year, speech in zip(data_st['year'], data_st['speech'])}

In [None]:
# Make a word cloud for each of the 5 speeches.
# The built-in English stop words are passed directly instead of rebinding
# `stop_words`: earlier cells use that name for the extended stop word list,
# and overwriting it here would silently change their result when re-run.
wc = WordCloud(stopwords=text.ENGLISH_STOP_WORDS, background_color="black", colormap="Dark2",
               max_font_size=150, random_state=42)

plt.rcParams['figure.figsize'] = [10, 6]


for year, speech in speech_dict.items():
    # generate() rebuilds the same WordCloud object from this speech's text
    wc.generate(speech)

    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(year)
    plt.show()

## As an experiment, we try the pretrained model from https://huggingface.co/MoritzLaurer/policy-distilbert-7d on the same 5 years

In [None]:
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so repeated calls
# to ml_policy reuse them instead of loading the checkpoint again every time.
_POLICY_MODEL_CACHE = {}


def ml_policy(text):
    '''Classify ``text`` into seven policy domains with a pretrained model.

    Parameters
    ----------
    text : str
        The text to classify. It is tokenized with ``truncation=True``, so
        anything beyond the model's maximum input length is dropped.

    Returns
    -------
    dict
        Maps each of the seven label names to its softmax score expressed as
        a percentage rounded to one decimal.
    '''
    model_name = "MoritzLaurer/policy-distilbert-7d"
    if model_name not in _POLICY_MODEL_CACHE:
        _POLICY_MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSequenceClassification.from_pretrained(model_name),
        )
    tokenizer, model = _POLICY_MODEL_CACHE[model_name]

    encoded = tokenizer(text, truncation=True, return_tensors="pt")

    # Inference only: skip gradient tracking
    with torch.no_grad():
        output = model(encoded["input_ids"])
    prediction = torch.softmax(output["logits"][0], -1).tolist()

    label_names = ["external relations", "freedom and democracy",
               "political system", "economy", "welfare and quality of life",
               "fabric of society", "social groups"]
    prediction = {name: round(float(pred) * 100, 1) for pred, name in
              zip(prediction, label_names)}
    return prediction

In [None]:
# Print the policy-domain scores for each of the selected speeches
for year, speech in speech_dict.items():
    print('Year: ', year)
    scores = ml_policy(speech)
    print(scores)