In [None]:
import copy
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import re
import seaborn as sns
import time

from collections import Counter, defaultdict
from gensim.matutils import hellinger
from matplotlib import rcParams

%matplotlib inline

In [None]:
def get_upper_bound(num_tests = 1, alpha = 0.05, one_tailed = False):
    """Return the upper quantile level used by the bootstrap percentile method.

    alpha is first divided by num_tests (a Bonferroni-style correction when
    num_tests > 1). For a two-tailed test only half of that corrected alpha is
    placed in the upper tail; for a one-tailed test all of it is.

    Returns 1 minus the upper-tail mass, e.g. 0.975 for the defaults.
    """
    corrected_alpha = alpha / num_tests
    tail_mass = corrected_alpha if one_tailed else corrected_alpha / 2
    return 1 - tail_mass


def get_upper_index(upper_bound, distribution):
    """Find the index of the upper_bound quantile in a sorted distribution.

    The raw index is floor(upper_bound * len(distribution)). It is clamped to
    the valid index range because upper_bound can be exactly 1.0 (alpha = 0, or
    a corrected alpha so small that 1 - alpha rounds to 1.0), in which case the
    raw index would equal len(distribution) and indexing would fail.

    Returns a plain int; 0 for an empty distribution.
    """
    n_values = len(distribution)
    upper_idx = int(np.floor(upper_bound * n_values))
    # Keep the index inside [0, n_values - 1] (and 0 when there are no values)
    return max(0, min(upper_idx, n_values - 1))


def hypothesis_testing_hellinger(real_ests, sim_ests_dict, alpha = 0.05, bonferroni = True, one_tailed = False):
    """Compare proximity (1 - Hellinger distance) to distribution of simulated proximities
    using the bootstrap percentile method.

    Parameters:
        real_ests: 2-D numpy array of observed values; cell [i][j] is tested.
        sim_ests_dict: nested mapping where sim_ests_dict[i][j] is the collection
            of simulated values for the same cell.
        alpha: significance level before any correction.
        bonferroni: if truthy, alpha is divided by the number of cells in real_ests.
        one_tailed: passed to get_upper_bound; if False only half of the
            (corrected) alpha is placed in the upper tail.

    Returns:
        Array of the same shape as real_ests with 1 where the observed value is
        strictly greater than the upper-quantile simulated value, 0 elsewhere.

    Raises:
        ValueError: if real_ests is not 2-dimensional.

    Side effect: prints the fraction of cells flagged as significant.
    """
    if real_ests.ndim != 2:
        raise ValueError(f"real_ests must be 2-dimensional, got shape {real_ests.shape}")
    test_mat = np.zeros(real_ests.shape)
    num_tests = real_ests.size if bonferroni else 1
    upper_bound = get_upper_bound(num_tests, alpha, one_tailed)
    for i, row in enumerate(real_ests):
        for j, observed in enumerate(row):
            # in contrast to Tjur R2 tests, comparing only to cell_ij
            sims = sorted(sim_ests_dict[i][j])
            threshold = sims[get_upper_index(upper_bound, sims)]
            if observed > threshold:
                test_mat[i, j] = 1
    print(np.count_nonzero(test_mat)/test_mat.size)
    return test_mat


def results_to_strings(results, sig_tests, filter_cols=False):
    """Format results as strings with three decimals, marking significant cells.

    A cell gets a trailing "*" when the matching entry of sig_tests is truthy.
    Column names are "Topic 1" ... "Topic n", with n taken from
    sig_tests.shape[1].

    When filter_cols is true, the column indices returned by
    filter_columns(sig_tests) are removed from both the matrix and the names.
    NOTE(review): filter_columns is not defined in the visible part of this
    notebook - confirm it exists before calling with filter_cols=True.

    Returns (string matrix as a numpy array, list of column names).
    """
    labelled_rows = [
        [
            f"{value:.3f}" + ("*" if sig_tests[r][c] else "")
            for c, value in enumerate(row_values)
        ]
        for r, row_values in enumerate(results)
    ]
    mat = np.array(labelled_rows)
    col_names = [f"Topic {number}" for number in range(1, sig_tests.shape[1] + 1)]

    if not filter_cols:
        return mat, col_names

    drop_idx = filter_columns(sig_tests)
    return np.delete(mat, drop_idx, 1), list(np.delete(col_names, drop_idx))

In [None]:
# Preferred model (k = 15): read the beta table, drop pandas' auto-named
# "Unnamed" columns (presumably a saved row index - confirm) and exponentiate
beta15_table = pd.read_csv("lda15beta.txt")
beta15_value_cols = [name for name in beta15_table.columns if "Unnamed" not in name]
lda15beta = beta15_table[beta15_value_cols].to_numpy()
beta_exp_15 = np.exp(lda15beta)

# Alternative model (k = 35): same treatment
beta35_table = pd.read_csv("lda35beta.txt")
beta35_value_cols = [name for name in beta35_table.columns if "Unnamed" not in name]
lda35beta = beta35_table[beta35_value_cols].to_numpy()
beta_exp_35 = np.exp(lda35beta)

# EXPONENTIATED BETA MATRIX
# Proximity (1 - Hellinger distance) between every row of the k = 15 matrix and
# every row of the k = 35 matrix. The k = 15 rows are taken in reversed order,
# so row 0 of the result belongs to the last k = 15 topic.
real_hellinger_dists = np.array([
    [1-hellinger(pref_topic, alt_topic) for alt_topic in beta_exp_35]
    for pref_topic in np.flip(beta_exp_15, axis=0)
])

In [None]:
# Dimensions (rows, columns) of the two exponentiated beta matrices
for exp_beta in (beta_exp_15, beta_exp_35):
    print(pd.DataFrame(exp_beta).shape)

In [None]:
# Grand total of each exponentiated matrix, accumulated row by row.
# Presumably a sanity check: if every row is a probability distribution the
# total should be close to the number of rows - confirm.
for exp_beta in (beta_exp_15, beta_exp_35):
    row_totals = pd.DataFrame(exp_beta).sum(axis=1)
    print(sum(row_totals))

In [None]:
path_to_sims = "simulated_corpora/"

# Beta files of the simulated corpora, one list per model size
k15_sims_beta = [name for name in os.listdir(path_to_sims) if "_lda15beta" in name]
k35_sims_beta = [name for name in os.listdir(path_to_sims) if "_lda35beta" in name]
fs = [k15_sims_beta, k35_sims_beta]

# Number of files found for each model size
for file_list in fs:
    print(len(file_list))

In [None]:
reg = "sim[0-9]{1,4}"
max_expected_sim = 1084  # upper end of the simulation numbering checked below (1..1084)


def _sim_number(file_name):
    """Return the integer that follows "sim" in the first match of reg in file_name."""
    found = re.findall(reg, file_name)
    if not found:
        raise ValueError(f"No simulation number found in file name: {file_name!r}")
    return int(found[0].replace("sim", ""))


# One list of simulation numbers per file list in fs
sim_nums_by_list = [[_sim_number(name) for name in file_list] for file_list in fs]

# Each simulation number must occur exactly once in each file list. Counting
# over the combined list is not enough: a number appearing twice in one list
# and never in the other would also add up to 2.
for nums in sim_nums_by_list:
    duplicated = sorted(num for num, occurrences in Counter(nums).items() if occurrences > 1)
    assert not duplicated, f"simulation numbers listed more than once: {duplicated}"

sim_nums = set().union(*sim_nums_by_list)
for nums in sim_nums_by_list:
    unmatched = sorted(sim_nums - set(nums))
    assert not unmatched, f"simulation numbers missing from one file list: {unmatched}"

print(len(sim_nums))
print(min(sim_nums))
print(max(sim_nums))

# Simulation numbers in the expected range for which no files were found
missing = [i for i in range(1, max_expected_sim + 1) if i not in sim_nums]
print(len(missing))

In [None]:
vocab_sizes = []
count = 0

start_time = time.time()

# hellinger_dist_sims[i][j] collects, over all simulations, the proximity
# (1 - Hellinger distance) between row i of the flipped k = 15 matrix and
# row j of the k = 35 matrix
hellinger_dist_sims = defaultdict(lambda: defaultdict(list))

for count, num in enumerate(sorted(sim_nums), start=1):

    # Load both simulated beta matrices under their own names so that the
    # real-corpus beta_exp_15 / beta_exp_35 are not overwritten by simulation data
    sim_beta_exp = {}
    for k in (15, 35):
        sim_path = os.path.join(path_to_sims, f"sim{num}_lda{k}beta.txt")
        sim_table = pd.read_csv(sim_path)
        value_cols = [col for col in sim_table.columns if "Unnamed" not in col]
        sim_beta_exp[k] = np.exp(sim_table[value_cols].to_numpy())
    sim_beta_exp_15 = sim_beta_exp[15]
    sim_beta_exp_35 = sim_beta_exp[35]

    # Both models of one simulation must have the same number of columns
    assert sim_beta_exp_15.shape[1] == sim_beta_exp_35.shape[1], (
        f"sim{num}: column counts differ "
        f"({sim_beta_exp_15.shape[1]} vs {sim_beta_exp_35.shape[1]})"
    )

    vocab_sizes.append(sim_beta_exp_15.shape[1])

    # Same row order as for the real corpus: k = 15 rows are flipped
    for i, topic_i in enumerate(np.flip(sim_beta_exp_15, axis=0)):
        for j, topic_j in enumerate(sim_beta_exp_35):
            hellinger_dist_sims[i][j].append(1-hellinger(topic_i, topic_j))

    if count % 100 == 0:
        print(count, f"{time.time() - start_time:.2f}")

# Convert to plain nested dicts before pickling (the outer defaultdict factory
# is a lambda, which pickle cannot serialize)
hellinger_sims_dict = {key_i: dict(per_alt) for key_i, per_alt in hellinger_dist_sims.items()}

outf = "hellinger_sims_dict.d"
# Context manager so the file is flushed and closed before it is read back
with open(outf, "wb") as out_file:
    pickle.dump(hellinger_sims_dict, out_file)

In [None]:
# Reload the cached simulation results so the expensive cell above can be skipped.
# NOTE: pickle.load can execute arbitrary code; only load a file written by this notebook.
with open("hellinger_sims_dict.d", "rb") as in_file:
    hellinger_sims_dict = pickle.load(in_file)

In [None]:
# Number of outer keys, and number of inner keys under outer key 0
print(len(hellinger_sims_dict))
print(len(hellinger_sims_dict[0]))

# Distinct list lengths over all cells; a single value means every cell holds
# the same number of simulated proximities
lengths = {
    len(cell_sims)
    for per_alt in hellinger_sims_dict.values()
    for cell_sims in per_alt.values()
}

lengths

In [None]:
# Shape of the real-corpus proximity matrix (rows: flipped k = 15 topics, columns: k = 35 topics)
real_hellinger_dists.shape

## Figure 4. Proximity (1 − Hellinger Distance) of topics as distributions over words in the preferred (k = 15) and alternative (k = 35) models.

In [None]:
topic_labels = ["Transitions", "Family Leave", "Promotion Inequality", "Power", "Burnout", "Unequal Compensation", 
                "Psychosocial Support", "Respect", "Training", "Staff Interactions", "Career Advancement", "Favoritism", 
                "Hierarchy", "Agency", "Pumping"]

# Rows of real_hellinger_dists follow the flipped k = 15 matrix, so the row
# labels are used in reversed order
topic_labels_hel = topic_labels[::-1]

alt_labels = [f"Topic {i}" for i in range(1,36)]

assert real_hellinger_dists.shape == (len(topic_labels_hel), len(alt_labels)), (
    f"proximity matrix shape {real_hellinger_dists.shape} does not match the labels"
)

# Use the plain-dict copy of the simulation results: it is available both after
# running the simulation cell and after only loading the pickle, whereas
# hellinger_dist_sims exists only if the simulation cell ran in this kernel.
sig_tests_hellinger = hypothesis_testing_hellinger(real_hellinger_dists, hellinger_sims_dict,
                                                   alpha = 0.05, bonferroni = True, one_tailed = False)
mat, _ = results_to_strings(real_hellinger_dists, sig_tests_hellinger)
# Rows of the table are in the same (reversed) topic order as topic_labels_hel
df = pd.DataFrame(mat, columns=alt_labels)

outf = "model_comparison_table.csv"
df.to_csv(outf)

# True where a cell is NOT significant; passed as the heatmap mask below
mask = ~sig_tests_hellinger.astype(bool)

sns.set_style("white")

# Set rcParams before the figure is created (and after set_style, which also
# touches rcParams) so they apply to everything drawn in this figure
rcParams['figure.figsize'] = 16,9
rcParams["font.family"] = "Times New Roman"

fig, ax = plt.subplots(figsize=(20, 9))

cmap = sns.color_palette("Blues", n_colors=1000)

plot = sns.heatmap(real_hellinger_dists, vmin=0.0, vmax=1.0, center=0.4,
            square=False, linewidths=0.05,
            linecolor="lightgray",
            yticklabels=topic_labels_hel,
            xticklabels=alt_labels,
            annot=True,
            fmt=".3f",
            cmap = cmap,
            mask = mask,
            ax = ax)

ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment="right")
fig.tight_layout()

# Draw the full grid explicitly, one line per cell boundary
n_rows, n_cols = real_hellinger_dists.shape
for x_pos in range(n_cols + 1):
    ax.axvline(x=x_pos, color="lightgray")
for y_pos in range(n_rows + 1):
    ax.axhline(y=y_pos, color="lightgray")

fig.savefig("comparing_topic_models_qual_coding_figure4.png", format="png", transparent=False, dpi=600)
plt.show()

print("Figure 4. Proximity (1 − Hellinger Distance) of topics as distributions over words in the preferred (k = 15) and alternative (k = 35) models.")