In [1]:
import sys

import numpy as np
import pandas as pd

sys.path.append("../") # go to parent dir
from codebase.topic_utilities import jaccard_similarity

# Step 1, Load original topics

In [2]:
# Open both exported workbooks once; the weekly sheets are parsed from these handles.
rolling_xls, sliding_xls = map(
    pd.ExcelFile,
    ('../export/Mar-Jun-Rolling.xlsx', '../export/Mar-Jun-Sliding.xlsx'),
)

In [3]:
# Weekly sheets cover weeks 10 through 26 (inclusive).
first_week, last_week = 10, 26

# read rolling_xls: one sheet per week, named "Week10" ... "Week26".
# The sheets are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0, and growing a frame inside a loop re-copies it
# on every iteration.
rolling_sheet_names = [f"Week{week_index}" for week_index in range(first_week, last_week + 1)]
rolling_df = pd.concat(
    [pd.read_excel(rolling_xls, sheet_name=sheet_name) for sheet_name in rolling_sheet_names],
    ignore_index=True,
)

# read sliding_xls: the first sheet is the single week "W10"; every later
# sheet is named after a pair of consecutive weeks, "W10W11" ... "W25W26".
sliding_sheet_names = [f"W{first_week}"] + [
    f"W{week_index - 1}W{week_index}" for week_index in range(first_week + 1, last_week + 1)
]
sliding_df = pd.concat(
    [pd.read_excel(sliding_xls, sheet_name=sheet_name) for sheet_name in sliding_sheet_names],
    ignore_index=True,
)

In [None]:
def _format_topics(topics_df):
    """Return a new frame holding only the topic id and its metric columns.

    ``reset_index()`` moves the current index into a column named ``index``,
    which is renamed to ``topic_id_perm_prev``; only that column plus
    ``ratio`` and ``coherence`` are kept. The input frame is not modified.
    """
    return (
        topics_df
        .reset_index()
        .rename(columns={"index": "topic_id_perm_prev"})
        .loc[:, ["topic_id_perm_prev", "ratio", "coherence"]]
    )


# NOTE: run this once per fresh load. Re-running it on frames that were already
# formatted adds a second `topic_id_perm_prev` column (the new index column is
# renamed onto the existing one).
# format rolling_df
rolling_df = _format_topics(rolling_df)

# format sliding_df
sliding_df = _format_topics(sliding_df)
sliding_df

# Step 2: Join with the topics selected in label_list_rolling.csv and label_list_sliding.csv

In [6]:
# Label lists for the two models; they are joined onto the topic metrics by
# `topic_id_perm_prev` in the next cell.
rolling_label_list, sliding_label_list = (
    pd.read_csv(csv_path)
    for csv_path in ("../export/label_list_rolling.csv", "../export/label_list_sliding.csv")
)

In [7]:
def _join_with_labels(topics_df, label_df):
    """Right-join topic metrics onto a label list, keyed by ``topic_id_perm_prev``.

    Both frames are indexed by ``topic_id_perm_prev`` before joining.
    ``how="right"`` keeps every row of ``label_df``; label rows with no
    matching topic get NaN in the metric columns.
    """
    keyed_topics = topics_df.set_index('topic_id_perm_prev')
    keyed_labels = label_df.set_index('topic_id_perm_prev')
    return keyed_topics.join(keyed_labels, how="right")


rolling_join_df = _join_with_labels(rolling_df, rolling_label_list)
sliding_join_df = _join_with_labels(sliding_df, sliding_label_list)

# Step 3, Report Topic Performance

In [8]:
# select coherent topics
rd = rolling_join_df[(rolling_join_df.Coherent == 1)]
sd = sliding_join_df[(sliding_join_df.Coherent == 1)]

rolling_coherence_mean = rd[["coherence"]].mean(axis = 0, skipna = True).values
rolling_coherence_std = rd[["coherence"]].std(axis = 0, skipna = True).values
sliding_coherence_mean = sd[["coherence"]].mean(axis = 0, skipna = True).values
sliding_coherence_std = sd[["coherence"]].std(axis = 0, skipna = True).values
print(f"Rolling Model: mean={rolling_coherence_mean} with std={rolling_coherence_std}")
print(f"Sliding Model: mean={sliding_coherence_mean} with std={sliding_coherence_std}")

Rolling Model: mean=[-6.12399562] with std=[1.65071176]
Sliding Model: mean=[-5.60958161] with std=[1.45640777]


In [24]:
def calculate_diversity_max(topic_words, top_n=10):
    """Score each topic by how different it is from its most similar peer.

    Every entry of ``topic_words`` is split on single spaces and cut down to
    its first ``top_n`` tokens. For each topic the similarity to every other
    topic is obtained from ``jaccard_similarity`` and the score is
    ``1 - max(similarity)``.

    Args:
        topic_words: sequence of space-separated word strings, one per topic.
        top_n: number of leading words of each topic that take part in the
            comparison. Defaults to 10.

    Returns:
        A list with one score per topic, in input order. An empty input gives
        an empty list; a single topic has no peers and is scored ``1.0``.
    """
    # Tokenise once up front instead of re-splitting inside the pairwise loop.
    top_tokens = [words.split(" ")[:top_n] for words in topic_words]

    diversity = []
    for i, tokens in enumerate(top_tokens):
        similarities = [
            jaccard_similarity(tokens, other_tokens)
            for j, other_tokens in enumerate(top_tokens)
            if j != i
        ]
        # With a single topic there is nothing to compare against; treat the
        # maximum similarity as 0 instead of letting max() raise on an empty list.
        diversity.append(1 - max(similarities, default=0.0))
    return diversity

def calculate_diversity_avg(topic_words):
    """Return ``1 - jaccard_similarity`` for every ordered pair of distinct topics.

    The result is a flat list ordered by first index, then second index:
    (0, 1), (0, 2), ..., (1, 0), (1, 2), ... Each unordered pair therefore
    appears twice, once in each direction. Fewer than two topics give an
    empty list.

    NOTE(review): entries are handed to ``jaccard_similarity`` as-is, whereas
    ``calculate_diversity_max`` splits each entry into words and keeps the
    first 10 - confirm which input form the helper expects.
    """
    n_topics = len(topic_words)
    return [
        1 - jaccard_similarity(topic_words[first], topic_words[second])
        for first in range(n_topics)
        for second in range(n_topics)
        if first != second
    ]

In [25]:
# Per-topic diversity scores for the coherent topics of each model.
# NOTE(review): the rolling frame is read from `total_words` and the sliding
# frame from `relevant words` - presumably the two label CSVs use different
# headers; confirm the columns are comparable.
rd_div_list = calculate_diversity_max(rd["total_words"].tolist())
sd_div_list = calculate_diversity_max(sd["relevant words"].tolist())

In [26]:
# Mean and standard deviation of the per-topic diversity scores for each model.
# numpy is imported in the notebook's import cell.
# NOTE(review): np.std defaults to ddof=0 (population), while the coherence
# std is computed with pandas' .std(), which defaults to ddof=1 (sample) -
# confirm which estimator the report should use for both metrics.
print(f"Rolling Model Diversity: mean={np.mean(rd_div_list)}, std={np.std(rd_div_list)}")
print(f"Sliding Model Diversity: mean={np.mean(sd_div_list)}, std={np.std(sd_div_list)}")

Rolling Model Diveristy: mean=0.8419708187472288, std=0.06830396572447622
Sliding Model Diveristy: mean=0.8196366733828033, std=0.09045677156022981
