# Generate Chunk CSV with all Topics

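## Imports
Import the libraries this notebook depends on: `pandas` for tabular data handling and `re` for escaping topic names inside regular expressions.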

In [None]:
import pandas as pd
import re

## Loading and Preprocessing

In [None]:
# Read the sentiment chunk table from disk; it is the base frame that the
# topic columns are merged onto further down.
sentiment_df = pd.read_csv(filepath_or_buffer="data/sentiment_analysis_textblob.csv")

In [None]:
# Read the IPTC topic table and rename its "Topic" column to "topic" in the
# same step, so it matches the key used when merging onto the chunk data.
iptc_topics = pd.read_csv("data/cleaned_topic_labels.csv").rename(
    columns={"Topic": "topic"}
)

In [None]:
# Read the special-topic table.
special_topics = pd.read_csv("data/chunks_for_stance_detection.csv")
# Columns whose name begins with "topic_" are the ones treated as boolean flags.
sparse_cols = list(
    filter(lambda name: name.startswith("topic_"), special_topics.columns)
)
# Store those flags as sparse booleans (False is the implicit fill value) to
# keep the memory footprint small.
special_topics[sparse_cols] = special_topics[sparse_cols].astype(
    pd.SparseDtype(dtype="bool", fill_value=False)
)

In [None]:
# getting only news-related iptc topics
# A label counts as "news" unless it ever appears on a row whose all_topics
# text mentions one of the non-news labels below.
topics = iptc_topics["iptc_news_topic"].unique()
non_news = ["human interest", "lifestyle and leisure", "arts, culture, entertainment and media", "sport"]

# Whole-word alternation over the non-news labels. This mirrors the \b...\b
# matching used for the one-hot encoding and avoids a raw substring test
# (where e.g. "sport" would also fire inside "transport").
non_news_pattern = "|".join(rf"\b{re.escape(item)}\b" for item in non_news)

# Vectorised match instead of a row-wise apply; astype(str) plus na=False keeps
# rows with a missing all_topics value from raising a TypeError.
has_non_news = iptc_topics["all_topics"].astype(str).str.contains(
    non_news_pattern, regex=True, na=False
)

# iptc_news_topic values that co-occur with at least one non-news label
non_news_full = iptc_topics.loc[has_non_news, "iptc_news_topic"].unique()

# Missing labels are dropped (they cannot be turned into column names), and the
# result is sorted so the generated topic columns come out in the same order on
# every run rather than in hash-dependent set order.
news_topics = sorted(set(pd.Series(topics).dropna()) - set(non_news_full))

In [None]:
# one hot encoding iptc news topics
# Builds one boolean column per entry of news_topics. A row is True when the
# topic occurs as a whole phrase (regex word boundaries on both sides) in that
# row's all_topics text; missing values are treated as "no match".
topic_dict = {
    # Column name is "topic_" + the label with spaces replaced by underscores.
    # Single quotes are used inside the f-string because reusing the outer
    # quote character is only valid syntax from Python 3.12 onwards.
    f"topic_{topic.replace(' ', '_')}": iptc_topics["all_topics"].astype(str).str.contains(
        rf"\b{re.escape(topic)}\b", regex=True, na=False
    )
    for topic in news_topics
}

# Same row index as iptc_topics so the concat below lines up row by row;
# sparse booleans (fill value False) keep the indicator columns cheap.
topic_df = pd.DataFrame(topic_dict, index=iptc_topics.index).astype(pd.SparseDtype("bool", fill_value=False))

# Append the indicator columns next to the original topic table columns.
iptc_topics = pd.concat([iptc_topics, topic_df], axis=1)

## Merging into Big DataFrame

In [None]:
# merging iptc topics onto chunk data
# Left merge on "topic": every chunk row is kept and receives the "topic_*"
# indicator columns of its topic.
# validate="many_to_one" makes pandas raise a MergeError if a "topic" key is
# duplicated on the right-hand side. Without it such duplicates would silently
# multiply chunk rows, and the index-aligned merge of the special topics that
# follows depends on the row count and order staying unchanged.
sentiment_df = sentiment_df.merge(
    iptc_topics[["topic"] + [x for x in iptc_topics.columns if "topic_" in x]],
    on="topic",
    how="left",
    validate="many_to_one",
)

# Attach the special-topic columns to the chunk data, aligned on the row index.
# Only columns containing "topic_" that the IPTC table does not already provide
# are taken over; all chunk rows are kept (left merge).
sentiment_df = sentiment_df.merge(
    special_topics.loc[
        :,
        [
            name
            for name in special_topics.columns
            if "topic_" in name and name not in iptc_topics.columns
        ],
    ],
    how="left",
    left_index=True,
    right_index=True,
)

In [None]:
# Preview the first five rows of the merged frame as a quick visual check.
sentiment_df.head(n=5)

## Export

In [None]:
# Write the merged chunk table to disk; the row index is not written out.
sentiment_df.to_csv(path_or_buf="data/chunks_w_all_topics.csv", index=False)