# Sentiment Word Cloud Visualization

## Imports
Add necessary imports.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

## Loading DataFrame

In [None]:
# Load the input table from CSV into `df`.
df = pd.read_csv("data/chunks_w_all_topics.csv")

## Generate Sentiment DataFrame
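Generate a DataFrame `avg_sentiment_all` in which each row represents one combination of a source and a topic, plus `all_topics` roll-up rows for each source and each source type. The key outcome column, `avg_sentiment_score`, is the average sentiment score of that source on that topic.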

In [None]:
# NOTE(review): `exploded` is not created in any visible cell; presumably it is
# `df` expanded to one row per (chunk, topic) -- confirm before running.

# Mean sentiment for every (source type, source, topic) combination.
avg_sentiment = (
    exploded
    .groupby(["source_type", "source_name", "topic"])["sentiment_score"]
    .mean()
    .rename("avg_sentiment_score")
    .reset_index()
)

# Mean sentiment per source over all of its rows, filed under the
# pseudo-topic "all_topics".
all_topics_sources = (
    exploded
    .groupby(["source_type", "source_name"])["sentiment_score"]
    .mean()
    .rename("avg_sentiment_score")
    .reset_index()
    .assign(topic="all_topics")
)

# Same roll-up per source type; the type is reused as `source_name` so these
# rows share the column layout of the per-source rows.
all_topics_types = (
    exploded
    .groupby(["source_type"])["sentiment_score"]
    .mean()
    .rename("avg_sentiment_score")
    .reset_index()
    .assign(topic="all_topics", source_name=lambda frame: frame["source_type"])
)

# Stack the three levels of aggregation into a single table.
avg_sentiment_all = pd.concat(
    (avg_sentiment, all_topics_sources, all_topics_types),
    ignore_index=True,
)

# normalize by sentiment distribution quantile
def add_split_quantile_normalized_sentiment(avg_sentiment_df):
    """Return a copy of the frame with a signed, quantile-scaled sentiment column.

    Negative and positive `avg_sentiment_score` values are ranked separately and
    converted to percentile ranks. The new `quantile_sentiment_scaled` column
    lies in [-1, 0) for negative scores and (0, 1] for positive scores; the
    strongest score on each side maps to -1.0 / +1.0. Scores that are exactly
    zero (or NaN, which matches neither mask) are left at 0.0.

    Parameters
    ----------
    avg_sentiment_df : pandas.DataFrame
        Frame containing a numeric `avg_sentiment_score` column. It is not
        modified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the additional `quantile_sentiment_scaled` column.
    """
    scaled_df = avg_sentiment_df.copy()
    scores = scaled_df["avg_sentiment_score"]
    negatives = scores < 0
    positives = scores > 0

    # Default for rows that are neither strictly negative nor strictly positive.
    scaled_df["quantile_sentiment_scaled"] = 0.0

    if negatives.any():
        # Rank in descending order so the MOST negative score gets percentile
        # 1.0 and hence maps to -1.0. An ascending rank would invert the scale
        # (mildest negative -> -1.0, strongest negative -> near 0).
        scaled_df.loc[negatives, "quantile_sentiment_scaled"] = -scores[negatives].rank(
            method="average", pct=True, ascending=False
        )
    if positives.any():
        scaled_df.loc[positives, "quantile_sentiment_scaled"] = scores[positives].rank(
            method="average", pct=True
        )
    return scaled_df

# Add the `quantile_sentiment_scaled` column to the combined table.
avg_sentiment_all = avg_sentiment_all.pipe(add_split_quantile_normalized_sentiment)

def get_sentiment_label(s):
    """Map a score to "negative", "neutral" or "positive".

    Scores below -1/3 are "negative", scores from -1/3 up to (but excluding)
    1/3 are "neutral", and everything else is "positive".
    """
    one_third = 1 / 3
    if s < -one_third:
        return "negative"
    return "neutral" if s < one_third else "positive"

# Bucket every scaled score into a coarse sentiment label.
avg_sentiment_all["sentiment_label"] = (
    avg_sentiment_all["quantile_sentiment_scaled"].map(get_sentiment_label)
)

# Show ten random rows as a quick sanity check.
display(avg_sentiment_all.sample(n=10))

## Generate Top Unique Words DataFrame
Generate two DataFrames, `topwords_df` and `topwords_type_df`. In `topwords_df`, each row represents a unique combination of a specific podcast / news source and a topic; in `topwords_type_df`, each row represents a combination of a source type (podcast/news) and a topic. The key outcome variable is the top 20 unique words for each source-topic combination, found using TF-IDF computed against all other sources on that topic.

In [None]:
def _top_tfidf_words(corpus, group_cols, topic_label, stopwords, top_n):
    """Rank each document's terms by TF-IDF and keep its `top_n` best terms.

    `corpus` is a Series holding one concatenated text document per group,
    indexed by the group key(s) named in `group_cols`. Returns one result dict
    per group, or an empty list when there are fewer than two documents
    (TF-IDF needs something to contrast against).
    """
    if len(corpus) < 2:
        return []

    vectorizer = TfidfVectorizer(stop_words=stopwords, max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()

    rows = []
    for idx, group in enumerate(corpus.index):
        scores = tfidf_matrix[idx].toarray().flatten()
        top_indices = scores.argsort()[::-1][:top_n]
        # Only keep terms that actually occur in this document (score > 0).
        words_scores = [
            (feature_names[i], float(scores[i]))
            for i in top_indices
            if scores[i] > 0
        ]
        # A single grouping column yields scalar keys; normalise to a tuple.
        group_key = group if isinstance(group, tuple) else (group,)
        row = dict(zip(group_cols, group_key))
        row["topic"] = topic_label
        row["top_words"] = [{"text": w, "value": s} for w, s in words_scores]
        row["top_words_plain"] = [w for w, _ in words_scores]
        rows.append(row)
    return rows


def precompute_tfidf_topwords_alltopics(exploded_df, group_cols, top_n=20, extra_stopwords=None, compute_all_topics=True):
    """Compute the top TF-IDF words per group, for each topic and across all topics.

    For every distinct value of `exploded_df["topic"]`, the `text` of each
    group (defined by `group_cols`) is concatenated into one document and the
    groups are contrasted with TF-IDF. Topics with fewer than two groups are
    skipped. Optionally the same is done once over all rows, labelled with the
    pseudo-topic "all_topics".

    Parameters
    ----------
    exploded_df : pandas.DataFrame
        Must contain `topic` and `text` columns plus every column in `group_cols`.
    group_cols : list of str
        Columns that identify a group (one document per group).
    top_n : int, default 20
        Maximum number of words kept per group.
    extra_stopwords : iterable of str, optional
        Words removed in addition to scikit-learn's English stop-word list.
    compute_all_topics : bool, default True
        Whether to add the "all_topics" rows.

    Returns
    -------
    pandas.DataFrame
        One row per (group, topic) with the group columns, `topic`,
        `top_words` (list of {"text", "value"} dicts, best first) and
        `top_words_plain` (the same words without scores). When nothing could
        be computed the frame is empty but still carries these columns.
    """
    # define stopwords (scikit-learn expects a list, not a frozenset)
    stopwords = list(ENGLISH_STOP_WORDS.union(extra_stopwords or ()))

    results = []

    # calculation by topic
    for topic in exploded_df["topic"].unique():
        df_topic = exploded_df[exploded_df["topic"] == topic]
        corpus = df_topic.groupby(group_cols)["text"].apply(lambda texts: " ".join(texts))
        results.extend(_top_tfidf_words(corpus, group_cols, topic, stopwords, top_n))

    # calculation now for ALL topics
    if compute_all_topics:
        group_cols_no_topic = [col for col in group_cols if col != "topic"]
        if group_cols_no_topic:
            # NOTE(review): if `exploded_df` repeats a chunk's text once per
            # topic, that text is concatenated once per topic here -- confirm
            # this weighting is intended.
            corpus = exploded_df.groupby(group_cols_no_topic)["text"].apply(
                lambda texts: " ".join(texts)
            )
            results.extend(
                _top_tfidf_words(corpus, group_cols_no_topic, "all_topics", stopwords, top_n)
            )

    if not results:
        # Keep the expected schema so callers can still select columns.
        columns = list(dict.fromkeys([*group_cols, "topic", "top_words", "top_words_plain"]))
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(results)

In [None]:
# Top words per individual source (within each source type).
topwords_df = precompute_tfidf_topwords_alltopics(
    exploded,
    ["source_type", "source_name"],
    top_n=20,
    extra_stopwords=extra_stopwords,
)

# Top words per source type; the type is reused as `source_name` so these rows
# line up with the per-source rows.
topwords_type_df = precompute_tfidf_topwords_alltopics(
    exploded,
    ["source_type"],
    top_n=20,
    extra_stopwords=extra_stopwords,
).assign(source_name=lambda frame: frame["source_type"])

# Single table holding both granularities.
topwords_all_df = pd.concat((topwords_df, topwords_type_df), ignore_index=True)

In [None]:
# Print the column indexes of both output tables, one per line.
print(topwords_all_df.columns, avg_sentiment_all.columns, sep="\n")

## Export

In [None]:
# Write the top-words table to Parquet and the sentiment table to CSV, both
# without the DataFrame index.
# NOTE(review): presumably Parquet is used because `top_words` holds nested
# list-of-dict values that CSV would flatten to strings -- confirm.
topwords_all_df.to_parquet("data/topwords_by_topic.parquet", index=False)
avg_sentiment_all.to_csv("data/avg_sentiment_by_source_topic.csv", index=False)