# Topic Visualization
This notebook contains the code to visually explore our labeled topics.

## Imports
Necessary imports.

In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import umap
import seaborn as sns
import ast

## Loading Files

In [None]:
# csv containing our chunks, assigned topic numbers, and relevant metadata;
# its "topic" column is the key used below to attach the IPTC topic labels
chunk_df = pd.read_csv("data/bertopic_results.csv")

In [None]:
# Lookup table from topic number to IPTC topic label. The "Topic" column is
# renamed to lower-case "topic" at load time so it can serve as the merge key.
label_names_df = (
    pd.read_csv("data/cleaned_topic_labels.csv")
    .rename(columns={"Topic": "topic"})
)

## Merging Data
We merge our two DataFrames, `chunk_df` and `label_names_df`, into `df_merged`, so that we get a new DataFrame containing every chunk together with its topic labels.

In [None]:
# Attach the label columns to every chunk. A left join keeps all chunks, even
# those whose topic number has no label (their label columns become NaN).
# validate="many_to_one" makes pandas raise a MergeError if a topic number
# appears more than once in the label table: such duplicates would silently
# multiply chunk rows and break the row-for-row correspondence with anything
# that is looked up by row position later on.
df_merged = chunk_df.merge(
    label_names_df[["topic", "iptc_news_topic", "all_topics"]],
    on="topic",
    how="left",
    validate="many_to_one",
)

In [None]:
# show the first merged row as a quick check that the label columns were attached
df_merged.head(1)

## Extracting Broad Topics
Here, for each topic, we extract the broadest category it falls into, e.g. national election → politics and government.

In [None]:
def extract_broadest_topic(topics):
    """Return the last element of a topic path, or None when there is none.

    Parameters
    ----------
    topics : object
        A list of topic names in which the final entry is the broadest
        category. Any value that is not a list (a string, NaN, None, a
        tuple, ...) is treated as "no topic path".

    Returns
    -------
    The final list element, or None for non-list or empty input.
    """
    # Guard clause: only a non-empty list carries a usable topic path.
    if not isinstance(topics, list) or not topics:
        return None
    return topics[-1]

# read_csv delivers the topic paths as strings such as "['a', 'b']", so they
# have to be turned back into real lists. The decision is made per value
# rather than by inspecting only the first row: chunks without a label have
# NaN here after the left merge, and ast.literal_eval raises on NaN. Values
# that are already lists are left untouched, so re-running this cell is safe.
df_merged["all_topics"] = df_merged["all_topics"].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# broadest category = last element of the path; None where no path exists
df_merged["broadest_topic"] = df_merged["all_topics"].apply(extract_broadest_topic)

## Filtering Out Non-News
We then filter to only get those topics relevant to news. Specifically, this means cutting human interest, sport, lifestyle and leisure, and arts, culture, entertainment and media.

In [None]:
# broadest categories that we do not count as news
non_news = [
    "human interest",
    "lifestyle and leisure",
    "arts, culture, entertainment and media",
    "sport",
]

# Keep every chunk whose broadest topic is not in the list above. Chunks with
# a missing broadest_topic are kept as well, because isin() is False for them.
# .copy() makes news_only an independent DataFrame, so columns can be added to
# it later without pandas warning about writing to a slice of df_merged.
news_only = df_merged[~df_merged["broadest_topic"].isin(non_news)].copy()

## UMAP Calculation

In [None]:
# load the embeddings
# NOTE(review): rows are selected below via news_only.index, so this array
# presumably has one row per chunk in the same order as chunk_df — confirm
embeddings = np.load("models/embeddings/docs_embeddings.npy")

In [None]:
# Project the embeddings of the news chunks down to two dimensions.
# random_state is fixed so that repeated runs give the same layout.
umap_reducer = umap.UMAP(n_components=2, random_state=42)
umap_results = umap_reducer.fit_transform(
    # only get embeddings for our news chunks; the index labels of news_only
    # are used as row positions into the embedding matrix
    # NOTE(review): this relies on the index still being the original
    # 0..n-1 row numbering of the chunk table — confirm
    embeddings[news_only.index.to_numpy()]
)

# Add the UMAP coordinates to the filtered DataFrame. assign() builds a new
# DataFrame instead of writing into the filtered slice, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
news_only = news_only.assign(
    umap_1=umap_results[:, 0],
    umap_2=umap_results[:, 1],
)

In [None]:
# exporting news chunks with UMAP coordinates; the DataFrame index is written
# as the first column (pandas default), and list-valued cells such as
# all_topics are stored as their string representation
news_only.to_csv("data/news_chunks_w_umap.csv")

## UMAP Plotting

In [None]:
# Scatter of all news chunks in UMAP space, coloured by source type.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    ax=ax,
    data=news_only,
    x="umap_1", y="umap_2",
    hue="source_type",
    s=5, alpha=0.5,  # small, translucent markers so dense regions stay readable
    legend=True,
)
# the hue is source_type, so the title has to name the source type, not the topic
ax.set_title("UMAP by Source Type")
plt.show()

In [None]:
# Identify unique source_types
source_types = news_only["source_type"].unique()
num_types = len(source_types)

# Fix the list of topics once for all panels. Without a shared hue_order
# seaborn assigns colours from the topics present in each subset, so the same
# topic could be drawn in different colours in different panels.
topic_order = sorted(news_only["broadest_topic"].dropna().unique())

# Set up side-by-side plots with shared axes so the panels are comparable.
# squeeze=False always returns a 2-D array of axes, even for a single panel.
fig, axes = plt.subplots(
    1, num_types,
    figsize=(8 * num_types, 8),
    sharex=True, sharey=True,
    squeeze=False,
)
axes = axes[0]  # the single row of panels

for ax, src in zip(axes, source_types):
    subset = news_only[news_only["source_type"] == src]

    sns.scatterplot(
        ax=ax,
        data=subset,
        x="umap_1", y="umap_2",
        hue="broadest_topic",
        hue_order=topic_order,
        palette="tab20",
        s=3, alpha=0.5,
        legend=True,
    )
    ax.set_title(f"UMAP by Broadest Topic: {src}")
    ax.set_xlabel("UMAP 1")
    ax.set_ylabel("UMAP 2")

plt.tight_layout()
plt.show()

In [None]:
# Number of news chunks for every (source_type, broadest_topic) combination
topic_counts = (
    news_only
    .groupby(["source_type", "broadest_topic"])
    .size()
    .rename("count")
    .reset_index()
)

# Turn counts into shares: divide each count by the total of its source type,
# so the bars of one source type add up to 1
topic_totals = topic_counts.groupby("source_type")["count"].transform("sum")
topic_counts["proportion"] = topic_counts["count"].div(topic_totals)

# Grouped bar chart: one group per broadest topic, one bar per source type
plt.figure(figsize=(12, 6))
sns.barplot(data=topic_counts, x="broadest_topic", y="proportion", hue="source_type")
plt.title("Proportion of Each Topic in Each Source Type")
plt.ylabel("Proportion")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

In [None]:
def _source_proportions(df, source_col, topic_col):
    """Count rows per (source, topic) pair and add each pair's share of its source."""
    counts = (
        df.groupby([source_col, topic_col])
        .size()
        .reset_index(name="count")
    )
    counts["source_total"] = counts.groupby(source_col)["count"].transform("sum")
    counts["proportion"] = counts["count"] / counts["source_total"]
    return counts


def _plot_proportion_bars(counts, topic_col, source_col, title, xlabel):
    """Draw a grouped bar chart of per-source proportions, most frequent topic first."""
    order = (
        counts.groupby(topic_col)["count"].sum().sort_values(ascending=False).index
    )
    plt.figure(figsize=(12, 6))
    sns.barplot(
        data=counts,
        x=topic_col,
        y="proportion",
        hue=source_col,
        order=order,
    )
    plt.xticks(rotation=45, ha="right")
    plt.title(title)
    plt.ylabel("Proportion within Source")
    plt.xlabel(xlabel)
    plt.legend(title=source_col)
    plt.tight_layout()
    plt.show()


def plot_subtopic_distribution(
    df,
    drill_path=None,
    all_topics_col="all_topics",
    source_col="source_type"
):
    """
    Plots the distribution of the next-more-granular subtopic beneath the specified drill_path, split by source_type.
    If drill_path is None or empty, shows distribution of broadest topics.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per chunk. Must contain ``all_topics_col`` and ``source_col``.
    drill_path : list or tuple of str, optional
        Topic path to drill into, ordered from broadest to most specific,
        e.g. ``["society", "mankind"]``.
    all_topics_col : str
        Column holding each row's topic path as a list ordered from most
        specific to broadest (broadest last). Rows whose value is not a list
        are ignored.
    source_col : str
        Column used to split the bars and to normalise the counts.

    Returns
    -------
    None
        A figure is shown as a side effect. If the arguments are invalid or
        no rows qualify, a message is printed and nothing is plotted.
    """
    # if drill_path is not specified, plot distribution of broadest topics
    if not drill_path:
        has_path = df[all_topics_col].apply(
            lambda x: isinstance(x, list) and len(x) > 0
        ).astype(bool)
        with_path = df[has_path]
        if with_path.empty:
            print(f"No rows with a topic list found in column {all_topics_col}.")
            return
        # broadest topic is the last element of all_topics
        with_path = with_path.assign(
            broadest_topic=with_path[all_topics_col].apply(lambda x: x[-1])
        )
        _plot_proportion_bars(
            _source_proportions(with_path, source_col, "broadest_topic"),
            topic_col="broadest_topic",
            source_col=source_col,
            title=f"Distribution of broadest topics by {source_col}",
            xlabel="Broadest Topic",
        )
        return

    if not isinstance(drill_path, (list, tuple)):
        print("drill_path must be a list or None.")
        return

    # Keep the caller's ordering (broadest first) for titles and messages and
    # use a separate reversed copy for matching, because all_topics stores the
    # broadest topic last.
    requested_path = list(drill_path)
    path_suffix = requested_path[::-1]

    # find rows whose all_topics ends with the requested path
    def matches_drill_path(all_topics):
        if not isinstance(all_topics, list):
            return False
        if len(all_topics) < len(path_suffix):
            return False
        return all_topics[-len(path_suffix):] == path_suffix

    subset = df[df[all_topics_col].apply(matches_drill_path).astype(bool)].copy()
    if subset.empty:
        print(f"No entries found for drill_path {requested_path}.")
        return

    # get next-more-granular subtopic (one level deeper): the element sitting
    # directly in front of the matched suffix, if there is one
    def get_next_subtopic(all_topics):
        idx = len(all_topics) - len(path_suffix)
        if idx > 0:
            return all_topics[idx - 1]
        return None

    subset["next_subtopic"] = subset[all_topics_col].apply(get_next_subtopic)
    # remove missing/empty
    subset = subset[subset["next_subtopic"].notnull() & (subset["next_subtopic"] != "")]
    if subset.empty:
        print(f"No subtopics found one level deeper than {requested_path}.")
        return

    path_txt = " > ".join(requested_path)
    _plot_proportion_bars(
        _source_proportions(subset, source_col, "next_subtopic"),
        topic_col="next_subtopic",
        source_col=source_col,
        title=f"Distribution of subtopics within \"{path_txt}\" by {source_col}",
        xlabel="Subtopic",
    )

In [None]:
# subtopic plotting: drill into "society" > "mankind" and show the next level down
plot_subtopic_distribution(news_only, ["society", "mankind"])