In [None]:
from collections import Counter
from functools import lru_cache

import matplotlib.pyplot as plt
import pandas as pd
import torch
from sklearn.feature_extraction import text
from textblob import TextBlob
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from wordcloud import WordCloud

## Creating Word Clouds for speeches from 5 significant years
### 1946: End of WW2, 1976: End of the Vietnam War, 1990: End of the Cold War, 2002: Following 9/11, 2009: Global financial crisis

In [None]:
# Significant years: 1946 (end of WW2), 1976 (end of the Vietnam war),
# 1990 (end of the cold war), 2002 (after 9/11), 2009 (global financial crisis).
years = [1946, 1976, 1990, 2002, 2009]

# Load the cleaned speeches, keep only the cleaned text (exposed as 'speech'),
# and restrict the frame to the years listed above.
data = (
    pd.read_pickle('pickled_data/data_first_clean.pkl')
    .drop(columns=['President', 'Party', 'speech', 'first_clean_tokenized'])
    .rename(columns={'first_clean': 'speech'})
    .loc[lambda frame: frame['year'].isin(years)]
    .reset_index(drop=True)
)

# The year is stored as a string from here on.
data['year'] = data['year'].astype('str')
data.head()

In [None]:
# Map each year to its speech text; a later row with the same year overwrites an earlier one.
speech_dict = {year: speech for year, speech in zip(data.year, data.speech)}

In [None]:
# scikit-learn's English stop-word list is used to filter the clouds.
stop_words = text.ENGLISH_STOP_WORDS

plt.rcParams['figure.figsize'] = [10, 6]

# A single generator is reused for every year; random_state fixes the layout.
wc = WordCloud(
    stopwords=stop_words,
    background_color="black",
    colormap="Dark2",
    max_font_size=150,
    random_state=42,
)

# Draw one word cloud per year, titled with that year.
for year, speech in speech_dict.items():
    wc.generate(speech)

    fig, ax = plt.subplots()
    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(year)
    plt.show()

### Sentiment analysis of the 5 years, using TextBlob

In [None]:
def sentiment(text):
    """Return the TextBlob sentiment of ``text``, or ``None`` on failure.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    The ``TextBlob(text).sentiment`` value (indexable: element 0 is used as
    polarity and element 1 as subjectivity by the cells below), or ``None``
    when ``text`` is not a string or TextBlob raises while analysing it.
    """
    # Anything that is not a string (None, NaN, numbers) has no sentiment.
    if not isinstance(text, str):
        return None
    try:
        return TextBlob(text).sentiment
    except Exception:
        # Best-effort: analysis errors give None, but KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare `except:`.
        return None

In [None]:
# Preview the filtered frame before the sentiment columns are added.
data.head()

In [None]:
# Run TextBlob once per speech and reuse the result for both columns
# (previously every speech was analysed twice).
speech_sentiment = data['speech'].apply(sentiment)

# `sentiment` returns None when analysis fails; store NaN for those rows
# instead of raising TypeError on `None[0]`.
data['polarity'] = speech_sentiment.apply(
    lambda s: s[0] if s is not None else float('nan'))
data['subjectivity'] = speech_sentiment.apply(
    lambda s: s[1] if s is not None else float('nan'))

In [None]:
# Show the frame, now including the polarity and subjectivity columns.
data

## As an experiment, we try the pretrained model https://huggingface.co/MoritzLaurer/policy-distilbert-7d on the same 5 years

In [None]:
@lru_cache(maxsize=None)
def _load_policy_model(model_name):
    """Load the tokenizer and classifier for ``model_name`` once and cache them."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return tokenizer, model


def ml_policy(text):
    """Classify ``text`` into seven policy domains with a pretrained model.

    Parameters
    ----------
    text : str
        The text to classify. ``truncation=True`` cuts the input at the
        tokenizer's maximum length, so a long speech is scored on its
        opening portion only.

    Returns
    -------
    dict
        Maps each policy-domain label to its softmax probability expressed
        as a percentage rounded to one decimal place.
    """
    model_name = "MoritzLaurer/policy-distilbert-7d"
    # Cached: repeated calls reuse the same tokenizer/model instead of
    # reloading them for every speech.
    tokenizer, model = _load_policy_model(model_name)
    encoded = tokenizer(text, truncation=True, return_tensors="pt")

    # Inference only, so no autograd graph is needed. Passing the whole
    # encoding forwards the attention mask along with the input ids.
    with torch.no_grad():
        output = model(**encoded)
    probabilities = torch.softmax(output["logits"][0], -1).tolist()

    label_names = ["external relations", "freedom and democracy",
                   "political system", "economy", "welfare and quality of life",
                   "fabric of society", "social groups"]
    return {name: round(float(prob) * 100, 1)
            for prob, name in zip(probabilities, label_names)}

In [None]:
# Print the policy-domain percentages for each selected year's speech.
for year, speech in speech_dict.items():
    print('Year: ', year)
    policy_scores = ml_policy(speech)
    print(policy_scores)

In [None]:
# Every fourth year from 1901 through 2021.
first_speech = [year for year in range(1901, 2022, 4)]


In [None]:
# Years that are the first year in office: every fourth year, 1901 through 2021.
first_speech = [*range(1901, 2022, 4)]

# Reload the cleaned speeches and keep only the rows from those years.
data2 = (
    pd.read_pickle('pickled_data/data_first_clean.pkl')
    .loc[lambda frame: frame['year'].isin(first_speech)]
    .reset_index(drop=True)
)
data2.head()

In [None]:
# NOTE(review): `sentiment` is already defined earlier in this notebook;
# consider deleting one of the two definitions.
def sentiment(text):
    """Return the TextBlob sentiment of ``text``, or ``None`` on failure.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    The ``TextBlob(text).sentiment`` value (indexable: element 0 is used as
    polarity and element 1 as subjectivity by the next cell), or ``None``
    when ``text`` is not a string or TextBlob raises while analysing it.
    """
    # Anything that is not a string (None, NaN, numbers) has no sentiment.
    if not isinstance(text, str):
        return None
    try:
        return TextBlob(text).sentiment
    except Exception:
        # Best-effort: analysis errors give None, but KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare `except:`.
        return None

In [None]:
# Run TextBlob once per speech and reuse the result for both columns
# (previously every speech was analysed twice).
first_year_sentiment = data2['first_clean'].apply(sentiment)

# `sentiment` returns None when analysis fails; store NaN for those rows
# instead of raising TypeError on `None[0]`.
data2['polarity'] = first_year_sentiment.apply(
    lambda s: s[0] if s is not None else float('nan'))
data2['subjectivity'] = first_year_sentiment.apply(
    lambda s: s[1] if s is not None else float('nan'))

In [None]:
plt.rcParams['figure.figsize'] = [10, 8]

# The years are drawn as text labels, so store them as strings.
data2['year'] = data2['year'].apply(str)

fig, ax = plt.subplots()

# Plot all points with a single scatter call instead of one call per row.
ax.scatter(data2['polarity'], data2['subjectivity'], color='blue')

# Walk the three columns together so each label is paired with its own row.
# The previous loop looked the year up with the enumerate counter while the
# coordinates used the index label, which only agrees on a 0..n-1 index.
for x, y, year in zip(data2['polarity'], data2['subjectivity'], data2['year']):
    # Small offset so the label does not sit on top of its marker.
    ax.text(x + .001, y + .001, year, fontsize=10)

ax.set_title('Sentiment Analysis', fontsize=20)
ax.set_xlabel('<-- Negative -------- Positive -->', fontsize=15)
ax.set_ylabel('<-- Facts -------- Opinions -->', fontsize=15)

plt.show()

In [None]:
# Year-on-year % change in US GDP.
# Read the CSV and give the two columns of interest shorter names.
df_csv = pd.read_csv('data/gdp.csv').rename(
    columns={'date': 'year', 'change-chained': 'GDP'}
)

# Keep the columns at positions 0 and 4, ordered by year.
gdp = df_csv.iloc[:, [0, 4]].sort_values(by='year')
print(gdp.head(10))