In [None]:
import pandas as pd
import seaborn as sns
from pymannkendall import pymannkendall as pmk
from helpers import dataloader as dl
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import entropy, zscore
from helpers.afa import *

import visualization as vz

# Apply the project's plotting defaults with the Roboto font.
vz.visuals(font='Roboto')

# Ten-step 'BrBG' palette with the two middle entries (positions 4 and 5) dropped.
main_colors = [
    shade
    for position, shade in enumerate(sns.color_palette('BrBG', 10))
    if position not in (4, 5)
]

In [None]:
# Load the LDA outputs. Only `dist`, `data` and `keys` are used in the cells below;
# `keys` is iterated as a mapping from topic id to a manually added label.
words, dist, data, keys = dl.load_lda()

# Keep only topics whose label contains neither 'nonsem' nor 'rhet'.
# Topic ids are cast to str because they are used as column labels of `dist`.
sem_topics = [
    str(topic_id)
    for topic_id, label in keys.items()
    if not any(marker in label for marker in ('nonsem', 'rhet'))
]
dist = dist[sem_topics]

In [None]:
# Dominant topic per speech: the column label holding each row's largest value.
ixm = dist.idxmax(axis=1)
ixm = pd.DataFrame(ixm)
# Positional assignment: assumes the rows of `data` are in the same order as `dist` — TODO confirm
ixm['date'] = data.date.tolist()

# Entropy of every speech's topic distribution, computed once instead of per group.
speech_entropy = dist.apply(entropy, axis=1)

# Mean entropy per (dominant topic, '6M' period) — grouped by period, not by calendar year.
# NOTE(review): to_period('6M') appears to keep each timestamp's own month as the
# period start rather than snapping to aligned half-years — confirm the resulting bins.
edf = (
    speech_entropy
    .groupby([ixm[0], ixm.date.dt.to_period('6M')])
    .mean()
    .reset_index(name='entropy')
    .rename(columns={0: 'topic'})
)

In [None]:
# Wide layout: one row per period, one column per topic; combinations that
# never occur in `edf` are filled with 0.0.
edfp = (
    edf
    .pivot(values='entropy', columns='topic', index='date')
    .fillna(0.0)
)

In [None]:
# Mann-Kendall trend test, run once per topic column of `edfp`;
# keep the p-value and slope of each result as one table row.
pmk_results = pd.DataFrame([
    {"topic": topic_id, "p": outcome.p, "slope": outcome.slope}
    for topic_id, outcome in (
        (column, pmk.original_test(edfp[column])) for column in edfp.columns
    )
])

# Attach the manual label; `keys` is looked up by the integer form of the topic id.
pmk_results['label'] = pmk_results.topic.apply(
    lambda topic_id: keys.get(int(topic_id))
)

In [None]:
# Order topics from largest to smallest Mann-Kendall slope and renumber the rows.
pmk_results_sorted = (
    pmk_results
    .sort_values(by='slope', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Topic prominence: mean topic weight per '6M' period.
# Re-index the topic distribution by speech date, then average within each period.
dist_by_date = dist.set_index(data.date)
period_of_speech = dist_by_date.index.to_period('6M')
topic_prominence = dist_by_date.groupby(period_of_speech).mean()

In [None]:
# 2x5 grid: one panel for each of the ten topics with the largest Mann-Kendall slope.
fig, ax = plt.subplots(2,5,figsize=(16,4),sharex=True,sharey=True)
panels = ax.flatten()

# Loop-invariant inputs, computed once: the two time axes and the per-period
# mean entropy across topics used to normalise each topic's entropy.
Xe = edfp.index.to_timestamp()
Xp = topic_prominence.index.to_timestamp()
mean_entropy = edfp.mean(axis=1)

for c, topic in enumerate(pmk_results_sorted.topic[:10]):
    a = panels[c]

    # Entropy relative to the cross-topic mean, passed through adaptive_filter
    # (presumably a smoother — see helpers.afa) and then z-scored.
    Ye = adaptive_filter(edfp[topic] / mean_entropy,span=36)
    Ye = zscore(np.squeeze(np.asarray(Ye)))
    a.plot(Xe,Ye,color=main_colors[0])

    # Topic prominence, filtered and z-scored the same way.
    Yp = adaptive_filter(topic_prominence[topic],span=36)
    Yp = zscore(np.squeeze(np.asarray(Yp)))
    # Plot against the prominence series' own time axis (previously Xe was used
    # here, leaving Xp unused and tying this line to edfp's index).
    a.plot(Xp,Yp,color=main_colors[-1],linestyle='-.',zorder=-1,linewidth=.75)

    a.set_title(keys[int(topic)].upper(),fontsize=10)
    a.xaxis.set_tick_params(rotation=90)

# Shared axis captions for the whole figure.
fig.text(0.5, -.05, '6-Month Periods (1945 - 1991)', ha='center',fontsize=15)
fig.text(.075, 0.5, 'Normalized Entropy / Prominence', va='center', rotation='vertical',fontsize=15)
plt.show()