In [None]:
import pandas as pd
import seaborn as sns
from pymannkendall import pymannkendall as pmk
from helpers import dataloader as dl
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import entropy, zscore
from helpers.afa import *

import visualization as vz

# Apply the project's plotting defaults
vz.visuals(font='Roboto')

# 10-step BrBG palette with the two middle entries (positions 4 and 5) removed
_brbg = sns.color_palette('BrBG', 10)
main_colors = list(_brbg[:4]) + list(_brbg[6:])

In [None]:
# Load the LDA outputs and index the topic distribution by date
# NOTE(review): the meaning of each returned object is defined in helpers.dataloader — confirm there
words, dist, data, keys = dl.load_lda()
dist = dist.set_index(data['date'])

In [None]:
# Keep only "semantic" topics: drop every topic whose label contains 'proc' or 'nonsem',
# or whose label is exactly 'rhet' / 'proc'
selected_topics = [
    str(topic_id)
    for topic_id in range(250)
    if not any(marker in keys[topic_id] for marker in ['proc', 'nonsem'])
    and keys[topic_id] != 'rhet'
    and keys[topic_id] != 'proc'
]

# Restrict to the kept topics, then rescale each row so that it sums to 1
dist = dist[selected_topics]
row_totals = dist.sum(axis=1)
dist = dist.div(row_totals, axis=0)

In [None]:
# Rank the topics within each row; the largest value receives rank 1
topic_ranks = dist.rank(axis=1, ascending=False)
distr = topic_ranks.set_index(data.date)

In [None]:
# For every period and every topic, record the share of rows in that period
# in which the topic holds a given rank.

# Exclusive upper bound of the recorded ranks: range(1, L_RANK) yields 1 .. L_RANK - 1
# NOTE(review): later cells call plot_ranks with ranks = 5 — confirm that only ranks 1 and 2 are wanted here
L_RANK = 3

r = []

# NOTE(review): to_period('6M') may not bin dates into fixed half-year windows
# (a multiplied period frequency can start at each date's own month) — confirm the grouping
period_groups = distr.groupby(distr.index.to_period('6M'))

for date, data_ in tqdm(period_groups):
    # Skip everything dated 1945
    if date.year == 1945:
        continue

    n_rows = data_.shape[0]
    for topic in distr.columns:
        topic_ranks = data_[topic]
        for rank in range(1, L_RANK):
            rank_count = int((topic_ranks == rank).sum())
            r.append({
                "time": date.to_timestamp(),
                "topic": topic,
                "rank": rank,
                "rankprop": rank_count / n_rows,
            })

# One row per (period, topic, rank)
rd = pd.DataFrame(r)

# Get Rank 1 - Rank 2

In [None]:
# Mean over time of (rank-2 proportion - rank-1 proportion), per topic

# Wide table: one row per period, columns form a (topic, rank) MultiIndex
dp = rd.pivot_table(index='time', columns=['topic', 'rank'], values='rankprop', aggfunc='mean')

# Slice by the 'rank' level explicitly: plain dp[2] / dp[1] would be looked up in the
# first column level (topic), not in the rank level.
rank_diff = dp.xs(2, axis=1, level='rank') - dp.xs(1, axis=1, level='rank')

# Column-wise mean gives one value per topic; shape it as a (topic, submean) frame
smdf = rank_diff.mean(axis=0)\
                .rename('submean')\
                .rename_axis('topic')\
                .reset_index()

In [None]:
def plot_ranks(ax, rd, topic, ranks):
    """
    Draw the smoothed rank-proportion curves of a single topic on ``ax``.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    rd : long-format frame with columns 'time', 'topic', 'rank' and 'rankprop'.
    topic : topic label to select from ``rd.topic``.
    ranks : highest rank to include; ranks 1..``ranks`` that are present in ``rd`` are drawn.

    Rank 1 is drawn as a thick solid line, every other rank as a thin dashed line.
    Colours are taken from the reversed module-level ``main_colors`` in column order.
    """
    wanted_ranks = list(range(1, ranks + 1))
    subset = rd[(rd.topic == topic) & (rd['rank'].isin(wanted_ranks))]
    wide = subset.pivot(index='time', columns='rank', values='rankprop')

    palette = list(reversed(main_colors))
    times = wide.index
    for position, rank in enumerate(wide.columns):
        is_top_rank = rank == 1
        # Smooth the series with the project's adaptive filter before plotting
        smoothed = adaptive_filter(wide[rank], span=36)
        ax.plot(
            times,
            smoothed,
            linewidth=2.5 if is_top_rank else 1,
            color=palette[position],
            zorder=10 - position,  # earlier columns are drawn on top
            linestyle=None if is_top_rank else '--',
        )

In [None]:
# Order topics by 'submean' (ascending), leaving out any topic whose label matches 'rhet' or 'proc'
topic_labels = smdf.topic.map(lambda topic_id: keys.get(int(topic_id)))
is_excluded = topic_labels.str.contains('rhet|proc')
sorted_subtracted = (
    smdf[~is_excluded]
    .sort_values('submean', ascending=True)
    .topic
    .tolist()
)

In [None]:
# Plot the first eight topics of the sorted list, one panel per topic
fig, ax = plt.subplots(2, 4, sharex=True, sharey=True, figsize=(16, 4))

for panel, topic_id in zip(ax.flatten(), sorted_subtracted[:8]):
    plot_ranks(ax=panel, rd=rd, topic=str(topic_id), ranks=5)
    panel.set_title(keys[int(topic_id)].upper())
    panel.xaxis.set_tick_params(rotation=90)

# Extra vertical room so the panel titles do not collide with the row above
fig.subplots_adjust(hspace=.5)

# Shared axis captions for the whole figure
fig.text(0.5, -.1, '6-Month Periods (1945 - 1991)', ha='center', fontsize=15)
fig.text(.075, 0.5, 'Norm. Rank Prominence', va='center', rotation='vertical', fontsize=15)
plt.show()

In [None]:
# Plot the last eight topics of the sorted list, one panel per topic
fig, ax = plt.subplots(2, 4, sharex=True, sharey=True, figsize=(16, 4))

for panel, topic_id in zip(ax.flatten(), sorted_subtracted[-8:]):
    plot_ranks(ax=panel, rd=rd, topic=str(topic_id), ranks=5)
    panel.set_title(keys[int(topic_id)].upper())
    panel.xaxis.set_tick_params(rotation=90)

# Extra vertical room so the panel titles do not collide with the row above
fig.subplots_adjust(hspace=.5)

# Shared axis captions for the whole figure
fig.text(0.5, -.1, '6-Month Periods (1945 - 1991)', ha='center', fontsize=15)
fig.text(.075, 0.5, 'Norm. Rank Prominence', va='center', rotation='vertical', fontsize=15)
plt.show()

In [None]:
# Per topic: Mann-Kendall slope of the rank-1 and of the rank-2 proportion series
r = []
for topic, topic_rows in tqdm(rd.groupby('topic')):
    # One column per rank, one row per period
    by_rank = topic_rows.pivot(index='time', columns='rank', values='rankprop')
    slope_rank1 = pmk.original_test(by_rank[1]).slope
    slope_rank2 = pmk.original_test(by_rank[2]).slope
    r.append({"topic": topic, "s1": slope_rank1, "s2": slope_rank2})

In [None]:
# Slope table: 'subslope' is the rank-1 slope minus the rank-2 slope, 'l' the topic label
df = pd.DataFrame(r).assign(
    subslope=lambda frame: frame['s1'] - frame['s2'],
    l=lambda frame: frame.topic.apply(lambda topic_id: keys.get(int(topic_id))),
)

# Topics ordered by ascending 'subslope'; keep positions 1..8
# NOTE(review): the slice starts at 1, so the topic with the smallest subslope is skipped — confirm this is intended
ls = df.sort_values('subslope').topic.tolist()[1:9]

In [None]:
# Plot the smoothed (rank 2 - rank 1) proportion curve for each topic in `ls`
fig, a = plt.subplots(2, 4, figsize=(16, 4), sharex=True, sharey=True)

sns.set_palette('BrBG')

line_color = main_colors[-1]
for ax, i in zip(a.flatten(), ls):
    # One column per rank, one row per period, for this topic only
    dfp = rd[rd.topic == i].pivot(index='time', columns='rank', values='rankprop')
    dfp['subtracted'] = dfp[2] - dfp[1]

    smoothed = adaptive_filter(dfp.subtracted)
    times = dfp.index

    ax.plot(times, smoothed, c=line_color)
    ax.xaxis.set_tick_params(rotation=90)
    ax.set_title(keys[int(i)].upper())

# Extra vertical room so the panel titles do not collide with the row above
fig.subplots_adjust(hspace=.5)

# Shared axis captions for the whole figure
fig.text(0.5, -.1, '6-Month Periods (1945 - 1991)', ha='center', fontsize=15)
fig.text(.075, 0.5, 'T(Rank 2 - Rank 1)', va='center', rotation='vertical', fontsize=15)
plt.show()