# Generate Dataset for Stance Detection
Here we generate the dataset required to sort our chunks for stance detection.

## Imports
Add necessary imports

In [None]:
import re
import string

import pandas as pd

## Data Loading

In [None]:
# Topic-label tables. Any "Topic" column is renamed to lowercase "topic"
# (the rename is a no-op when the column is absent); "topic" is the key used
# by the merge later in this notebook.
special_topics = pd.read_csv("data/special_topic_labels.csv").rename(
    columns={"Topic": "topic"}
)
iptc_topics = pd.read_csv("data/cleaned_topic_labels.csv").rename(
    columns={"Topic": "topic"}
)

# Chunk table that the stance-topic flags get attached to
# (presumably the BERTopic output, going by the file name).
all_chunk_df = pd.read_csv("data/bertopic_results.csv")

## Filtering IPTC Topics
Here we filter IPTC topics for those relevant to stance detection.

In [None]:
# IPTC topic labels treated as relevant for stance detection. Each entry is
# looked up in the `all_topics` column of `iptc_topics` and becomes one
# `topic_<label>` flag column. Entries must be unique: a repeated label would
# only recompute and overwrite the same column.
iptc_stance_topics = [
    'LGBTQ',
    'euthanasia',
    'tobacco and nicotine',
    'pornography',
    'unions',
    'Christian Orthodoxy',
    'global warming',
    'abortion',
    'environmental policy',
    'capital punishment',
    'prostitution',
    'war',
    'immigration',
    'personal weapon control policy',
    'immigration policy',
    'privacy',
    'climate change',
    'racism',
    'discrimination',
    'welfare',
    'civil rights',
    'communism',
    'dictatorship',
    'democratic socialism',
    'religion',
    'atheism and agnosticism',
    'government aid',
    'military service',
    'terrorism',
    'genocide',
    'diversity, equity and inclusion',
    'family planning',
    'foreign aid',
    'police',
    'vaccine',
    'nuclear power',
    'nuclear policy',
]

In [None]:
# One-hot encode the IPTC stance topics: one boolean `topic_<label>` column per
# entry of `iptc_stance_topics` (spaces in the label become underscores).
def _mentions_topic(all_topics, pattern, topic):
    """Return True when `topic` occurs in `all_topics` as a whole label.

    A plain substring test (`topic in text`) would flag 'war' for a row whose
    only label is 'global warming', so string values are matched with
    `pattern`, which requires a non-word character (or the string edge) on
    both sides of the label. Containers are tested by exact membership, and
    anything else (e.g. NaN from an empty CSV cell) counts as "not present"
    instead of raising.
    """
    if isinstance(all_topics, str):
        return pattern.search(all_topics) is not None
    if isinstance(all_topics, (list, tuple, set, frozenset)):
        return topic in all_topics
    return False


for topic in iptc_stance_topics:
    col_name = f"topic_{topic}".replace(" ", "_")
    # NOTE(review): assumes `all_topics` stores the labels verbatim
    # (e.g. 'vaccine', not 'vaccines') - confirm against the CSV.
    pattern = re.compile(rf"(?<!\w){re.escape(topic)}(?!\w)")
    # Bind pattern/topic as defaults so each column uses its own values.
    iptc_topics[col_name] = iptc_topics["all_topics"].apply(
        lambda x, pattern=pattern, topic=topic: _mentions_topic(x, pattern, topic)
    )

## Exporting Dataset with Stance Topic Labels

In [None]:
# Attach the IPTC flag columns to the chunk table through the shared "topic"
# column. A left join keeps every row of `all_chunk_df`; only the join key and
# the columns whose name contains "topic_" are taken from `iptc_topics`.
iptc_flag_columns = [name for name in iptc_topics.columns if "topic_" in name]
stance_chunk_df = pd.merge(
    all_chunk_df,
    iptc_topics[["topic", *iptc_flag_columns]],
    how="left",
    on="topic",
)

# Disabled: the equivalent join against `special_topics`.
# stance_chunk_df = stance_chunk_df.merge(
#     special_topics[["topic"] + [x for x in special_topics.columns if "topic_" in x]],
#     on="topic",
#     how="left"
# )

In [None]:
# Special topics to flag on each chunk. Every name must be a key of
# `topic_inputs`; order determines the order of the generated columns.
# NOTE(review): `topic_inputs` also defines "DOGE", which is not selected
# here - confirm that the omission is intentional.
selected_topics = [
    # People
    "Donald Trump",
    "Kamala Harris",
    "Tim Walz",
    "JD Vance",
    "Joe Biden",
    "Bernie Sanders",
    "Mike Johnson",
    "Nancy Pelosi",
    "Alexandria Ocasio-Cortez",
    "Mitch McConnell",
    "Pete Buttigieg",
    "Xi Jinping",
    "Vladimir Putin",
    "Volodymyr Zelensky",
    "Benjamin Netanyahu",
    "Claudia Sheinbaum",
    "Ali Khamenei",
    "Diddy",
    "Luigi Mangione",
    "Pope",
    "Kanye West",
    "Jeff Bezos",
    "Mark Zuckerberg",
    "Tim Cook",
    "Elon Musk",
    "Sam Altman",
    "Karoline Leavitt",
    "Jeffrey Epstein",
    "Mark Carney",
    "Justin Trudeau",
    "Robert F. Kennedy Jr",
    "Pete Hegseth",
    "Bob Menendez",
    "Ron DeSantis",
    "Kevin McCarthy",
    "Chuck Schumer",
    "Taylor Swift",
    "Caitlin Clark",
    # Countries
    "China",
    "Russia",
    "Canada",
    "United States",
    "Mexico",
    "El Salvador",
    "Israel",
    "Iran",
    "United Kingdom",
    "Taiwan",
    "Saudi Arabia",
    "India",
    "Pakistan",
    # Organizations, events and issues
    "ICE",
    "Democratic Party",
    "Republican Party",
    "USAID",
    "Texas floods",
    "Israel Gaza",
    "India Pakistan",
    "Russia Ukraine",
    "FEMA",
    "TikTok",
    "Crypto",
    "Tariffs",
    "COVID-19",
    "Taylor Swift Travis Kelce",
    "Opioids",
]

In [None]:
# Keyword rules for the special topics, consumed by `check_topic`.
#   "required_words":   at least one of these must appear in the chunk.
#   "cooccurrence_set": optional; when present, at least one of these must
#                       appear as well.
# Matching is case-insensitive because `check_topic` lowercases everything.
# NOTE(review): as a consequence, short entries such as "US", "ICE", "San" and
# "Ye" also match the ordinary words "us", "ice", "san" and "ye"; fused tokens
# such as "MexicoCity" and "TelAviv" only match text written without a space.
# Consider tightening those rules.
topic_inputs = {
    "Donald Trump": {
        "required_words": ["Trump"],
        "cooccurrence_set": {"President", "Nominee", "Candidate", "Donald", "Republican"}
    },
    "Kamala Harris": {
        "required_words": ["Harris"],
        "cooccurrence_set": {"Kamala", "Vice", "President", "VP", "Candidate", "Nominee", "Democrat"}
    },
    "Tim Walz": {
        "required_words": ["Walz"],
        "cooccurrence_set": {"Tim", "Governor", "Minnesota", "Vice", "President", "Candidate", "Nominee", "Democrat"}
    },
    "JD Vance": {
        "required_words": ["Vance"],
        "cooccurrence_set": {"JD", "Senator", "Ohio", "Vice", "President", "Nominee", "Republican"}
    },
    "Joe Biden": {
        "required_words": ["Biden"],
        "cooccurrence_set": {"Joe", "President", "Vice", "Joseph", "Nominee", "Democrat", "Candidate"}
    },
    "Bernie Sanders": {
        "required_words": ["Sanders"],
        "cooccurrence_set": {"Bernie", "Senator", "Vermont", "Independent"}
    },
    "Mike Johnson": {
        "required_words": ["Johnson"],
        "cooccurrence_set": {"Mike", "Speaker", "Republican", "Louisiana", "House", "Representative"}
    },
    "Nancy Pelosi": {
        "required_words": ["Pelosi"],
        "cooccurrence_set": {"Nancy", "Representative", "House", "Speaker", "California", "Democrat"}
    },
    "Alexandria Ocasio-Cortez": {
        "required_words": ["Ocasio-Cortez", "AOC", "Cortez"],
        # The stray empty-string entry was removed: it can never be a real word.
        "cooccurrence_set": {"Alexandria", "Representative", "New", "York", "House", "Democrat"}
    },
    "Mitch McConnell": {
        "required_words": ["McConnell"],
        "cooccurrence_set": {"Mitch", "Senator", "Kentucky", "Republican"}
    },
    "Pete Buttigieg": {
        "required_words": ["Buttigieg"],
        "cooccurrence_set": {"Pete", "Secretary", "Transportation", "Democrat"}
    },
    "Xi Jinping": {
        "required_words": ["Xi"],
        "cooccurrence_set": {"Jinping", "President", "Chairman", "China"}
    },
    "Vladimir Putin": {
        "required_words": ["Putin"],
        "cooccurrence_set": {"Vladimir", "President", "Russia"}
    },
    "Volodymyr Zelensky": {
        "required_words": ["Zelensky", "Zelenskyy"],
        "cooccurrence_set": {"Volodymyr", "President", "Ukraine"}
    },
    "Benjamin Netanyahu": {
        "required_words": ["Netanyahu"],
        "cooccurrence_set": {"Benjamin", "Prime", "Minister", "Israel"}
    },
    "Claudia Sheinbaum": {
        "required_words": ["Sheinbaum"],
        "cooccurrence_set": {"Claudia", "President", "Mexico"}
    },
    "Ali Khamenei": {
        "required_words": ["Khamenei"],
        "cooccurrence_set": {"Ali", "Supreme", "Leader", "Iran"}
    },
    "Diddy": {
        "required_words": ["Diddy", "Combs"]
    },
    "Luigi Mangione": {
        "required_words": ["Mangione"],
        "cooccurrence_set": {"Luigi"}
    },
    "Pope": {
        "required_words": ["Pope", "Pontiff"],
        "cooccurrence_set": {"Francis", "Leo", "Vatican", "Catholic"}
    },
    "Kanye West": {
        "required_words": ["Kanye", "Ye"],
        "cooccurrence_set": {"West", "Rapper", "Artist"}
    },
    "Jeff Bezos": {
        "required_words": ["Bezos"],
        "cooccurrence_set": {"Jeff", "Amazon", "Founder"}
    },
    "Mark Zuckerberg": {
        "required_words": ["Zuckerberg"],
        "cooccurrence_set": {"Mark", "Meta", "Facebook", "CEO"}
    },
    "Tim Cook": {
        "required_words": ["Cook"],
        "cooccurrence_set": {"Tim", "Apple", "CEO"}
    },
    "Elon Musk": {
        "required_words": ["Musk"],
        "cooccurrence_set": {"Elon", "Tesla", "SpaceX", "CEO"}
    },
    "Sam Altman": {
        "required_words": ["Altman"],
        "cooccurrence_set": {"Sam", "OpenAI", "CEO", "ChatGPT"}
    },
    "Karoline Leavitt": {
        "required_words": ["Leavitt"],
        "cooccurrence_set": {"Karoline", "Spokesperson", "Press", "Secretary", "Republican"}
    },
    "Jeffrey Epstein": {
        "required_words": ["Epstein"],
        "cooccurrence_set": {"Jeffrey", "Financier", "Sex", "Trafficking"}
    },
    "Mark Carney": {
        "required_words": ["Carney"],
        "cooccurrence_set": {"Mark", "Governor", "Bank", "Prime", "Minister", "Canada"}
    },
    "Justin Trudeau": {
        "required_words": ["Trudeau"],
        "cooccurrence_set": {"Justin", "Prime", "Minister", "Canada"}
    },
    "Robert F. Kennedy Jr": {
        "required_words": ["Kennedy", "RFK"],
        "cooccurrence_set": {"Robert", "Jr", "Health", "Secretary", "Candidate", "Independent"}
    },
    "Pete Hegseth": {
        "required_words": ["Hegseth"],
        "cooccurrence_set": {"Pete", "Defense", "Secretary", "Republican", "Fox"}
    },
    "Bob Menendez": {
        "required_words": ["Menendez"],
        "cooccurrence_set": {"Bob", "Senator", "Democrat", "New", "Jersey"}
    },
    "Ron DeSantis": {
        "required_words": ["DeSantis"],
        "cooccurrence_set": {"Ron", "Governor", "Florida", "Republican", "Candidate"}
    },
    "Kevin McCarthy": {
        "required_words": ["McCarthy"],
        "cooccurrence_set": {"Kevin", "Speaker", "California", "Republican", "House"}
    },
    "Chuck Schumer": {
        "required_words": ["Schumer"],
        "cooccurrence_set": {"Chuck", "Senator", "New", "York", "Senate"}
    },
    "Taylor Swift": {
        "required_words": ["Swift"],
        "cooccurrence_set": {"Taylor", "Singer", "Artist"}
    },
    "Caitlin Clark": {
        "required_words": ["Clark"],
        "cooccurrence_set": {"Caitlin", "Basketball", "Iowa", "WNBA", "Fever"}
    },
    "China": {
        "required_words": ["China", "Chinese", "Sino", "Shanghai", "Beijing"]
    },
    "Russia": {
        "required_words": ["Russia", "Russian", "Moscow", "Kremlin"]
    },
    "Canada": {
        "required_words": ["Canada", "Canadian", "Ottawa", "Toronto", "Vancouver"]
    },
    "United States": {
        "required_words": ["United", "States", "USA", "US", "America", "American", "Washington"]
    },
    "Mexico": {
        "required_words": ["Mexico", "Mexican", "Guadalajara", "Monterrey", "MexicoCity"]
    },
    "El Salvador": {
        "required_words": ["Salvador", "Salvadoran", "San"]
    },
    "Israel": {
        "required_words": ["Israel", "Israeli", "Jerusalem", "TelAviv", "Tel", "Aviv"]
    },
    "Iran": {
        "required_words": ["Iran", "Iranian", "Tehran"]
    },
    "United Kingdom": {
        "required_words": ["United", "Kingdom", "Britain", "British", "England", "UK", "London"]
    },
    "Taiwan": {
        "required_words": ["Taiwan", "Taipei", "Taiwanese"]
    },
    "Saudi Arabia": {
        "required_words": ["Saudi", "Arabia", "Riyadh", "Arabian"]
    },
    "India": {
        "required_words": ["India", "Indian", "Delhi", "Mumbai", "Bangalore"]
    },
    "Pakistan": {
        "required_words": ["Pakistan", "Pakistani", "Islamabad", "Karachi", "Lahore"]
    },
    "ICE": {
        "required_words": ["ICE"]
    },
    "Democratic Party": {
        "required_words": ["Democratic", "Democrat", "DNC"]
    },
    "Republican Party": {
        "required_words": ["Republican", "GOP", "RNC"]
    },
    "DOGE": {
        "required_words": ["DOGE"]
    },
    "USAID": {
        "required_words": ["USAID"]
    },
    "Texas floods": {
        "required_words": ["Texas", "Houston", "Dallas"],
        "cooccurrence_set": {"Flood", "Flooding"}
    },
    "Israel Gaza": {
        "required_words": ["Israel", "Gaza", "Palestine"],
        "cooccurrence_set": {"War", "Conflict", "Hamas", "Hostage", "Attack"}
    },
    "India Pakistan": {
        "required_words": ["India", "Pakistan"],
        "cooccurrence_set": {"Border", "Kashmir", "Conflict", "Fighting", "War", "Jet", "Missile"}
    },
    "Russia Ukraine": {
        "required_words": ["Russia", "Ukraine"],
        "cooccurrence_set": {"War", "Conflict", "Invasion", "Attack", "Bombing"}
    },
    "FEMA": {
        "required_words": ["FEMA"]
    },
    "TikTok": {
        "required_words": ["TikTok"]
    },
    "Crypto": {
        "required_words": ["Crypto", "Cryptocurrency", "Bitcoin", "Ethereum", "Memecoin"]
    },
    "Tariffs": {
        "required_words": ["Tariff", "Tariffs", "Duties"]
    },
    "COVID-19": {
        "required_words": ["COVID", "COVID-19", "Coronavirus", "Pandemic"]
    },
    "Taylor Swift Travis Kelce": {
        "required_words": ["Swift", "Taylor"],
        "cooccurrence_set": {"Kelce", "Travis"}
    },
    "Opioids": {
        "required_words": ["Opioid", "Opioids", "Fentanyl", "Oxycontin", "Heroin"]
    }
}

In [None]:
# Characters stripped from both ends of a token before matching: ASCII
# punctuation plus common typographic quotes, ellipsis and dashes.
_EDGE_PUNCTUATION = string.punctuation + "“”‘’…—–"


def _normalize_token(token):
    """Lowercase `token`, strip surrounding punctuation and a possessive 's."""
    token = str(token).lower().strip(_EDGE_PUNCTUATION)
    for suffix in ("'s", "’s"):
        if token.endswith(suffix):
            return token[: -len(suffix)].strip(_EDGE_PUNCTUATION)
    return token


def check_topic(row, required_words, cooccurrence_set=None, keywords_only=False):
    """Decide whether a chunk mentions a topic, based on keyword rules.

    Args:
        row: Mapping-like record (e.g. a DataFrame row). Reads ``row["text"]``
            by default, or ``row["Representation"]`` when `keywords_only`.
        required_words: Words of which at least one must be present.
        cooccurrence_set: Optional words of which at least one must also be
            present. Ignored when empty or None.
        keywords_only: When True, match against the representative keywords in
            ``row["Representation"]`` instead of the chunk text.

    Returns:
        bool: True when a required word is found and, if a co-occurrence set
        was given, one of its words is found too.

    Matching is case-insensitive and works on whitespace-separated tokens with
    surrounding punctuation and a trailing possessive removed, so "Trump,",
    "(Trump)" and "Trump's" all count as "trump". Inner characters are kept,
    so "COVID-19" and "Ocasio-Cortez" still match as single tokens.
    """
    required = {_normalize_token(word) for word in (required_words or ())}
    cooccurring = {_normalize_token(word) for word in (cooccurrence_set or ())}
    # Empty entries can never be real words; never let them produce a match.
    required.discard("")
    cooccurring.discard("")

    if keywords_only:
        keywords = row["Representation"]
        # A list column read back from CSV arrives as its string form
        # (e.g. "['trump', 'rally']"); iterating that would yield single
        # characters, so strings are tokenized like ordinary text instead.
        if isinstance(keywords, str):
            source = keywords
        else:
            source = " ".join(str(keyword) for keyword in keywords)
    else:
        # Only the chunk text is inspected here.
        source = str(row["text"])

    words = {_normalize_token(token) for token in source.split()}
    words.discard("")

    if not (required & words):
        return False
    # Without a co-occurrence constraint the required-word hit is enough.
    return not cooccurring or bool(cooccurring & words)

In [None]:
# Add one boolean `topic_<name>` column per selected topic (name lowercased,
# spaces replaced by underscores), evaluated row by row with `check_topic`.
for topic in selected_topics:
    rule = topic_inputs[topic]
    required = rule.get("required_words")
    cooccurring = rule.get("cooccurrence_set")  # None when the rule has no such set
    col_name = ("topic_" + topic.lower()).replace(" ", "_")

    def _row_matches(row, required=required, cooccurring=cooccurring):
        # Defaults pin this iteration's rule to the predicate.
        return check_topic(row, required, cooccurring)

    stance_chunk_df[col_name] = stance_chunk_df.apply(_row_matches, axis=1)

In [None]:
# Notebook inspection cell.
# NOTE(review): DataFrame.value_counts() counts distinct full rows (all columns
# taken together), not per-column totals - confirm this is the intended summary.
stance_chunk_df.value_counts()

In [None]:
# Write the labelled chunks to disk, without the DataFrame index column.
output_path = "data/chunks_for_stance_detection.csv"
stance_chunk_df.to_csv(output_path, index=False)