# Topic Label Cleaning
Cleans the hand-assigned IPTC topic labels: corrects labeling mistakes, then attaches the broader topics each label falls under.

## Imports
Add necessary imports.

In [None]:
import pandas as pd
import json
import ast

## Load Topics

In [None]:
# Hand-labeled topics; the cells below read and rewrite the "iptc_news_topic" column.
df = pd.read_csv(filepath_or_buffer="data/iptc_labeled_topics.csv")

## Correct Hand-Labeling Mistakes

In [None]:
# Corrections for hand-assigned labels, keyed by the LOWER-CASED label.
# Each value is the label that replaces the key. Entries fall into three groups:
#   * misspellings ("cinrma", "civin rights", "robbbery and theft"),
#   * labels folded into another topic ("murder" -> "homicide"),
#   * capitalisation restored after lower-casing ("bible" -> "Bible").
# NOTE(review): values are presumably prefLabels from the IPTC schema that is
# loaded further down — confirm every value exists there.
topic_renames = {
    "activism": "political movement and association",
    "american football": "American football",
    "animals": "animal",
    "assassination": "homicide",
    "astrology": "arts, culture, entertainment and media",
    "athletes": "sport",
    "audio": "radio",
    "automotive industry": "automotive",
    "baptist": "Baptist",
    "barbershop": "personal service",
    "bible": "Bible",
    "blackout": "disaster",
    "body": "health",
    "buddhism": "Buddhism",
    "candy": "grocery",
    "cannabis": "drug related crimes",
    "child custody": "family",
    "cholera": "communicable disease",
    "christian orthodoxy": "Christian Orthodoxy",
    "christianity": "Christianity",
    "christmas": "Christmas",
    "cinema": "movies",
    "cinrma": "movies",
    "civin rights": "civil rights",
    "comic": "cartoon",
    "compliance": "regulations",
    "conspiracy": "crime",
    "conspiracy theory": "disinformation and misinformation",
    "country": "country music",
    "dairy": "farms",
    "dating and relationships": "Dating and Relationships",
    "defamation": "civil law",
    "delivery service": "shipping and postal service",
    "dementia": "mental health",
    "deodorant": "health and beauty product",
    "detention": "prison",
    "detention facility": "prison",
    "disco music": "dance band music",
    "documentary": "movies",
    "electric vehicle": "automotive",
    "emissions": "environmental pollution",
    "emotions": "mental health",
    "extinct species": "endangered species",
    "eyewear": "health and beauty product",
    "fantasy sports": "sport",
    "farming": "farms",
    "fire department": "emergency response",
    "football": "American football",
    "footwear": "clothing",
    "forensic science": "science and technology",
    "formula one": "Formula One",
    "friendship": "family",
    "generation": "demographics",
    "governor": "public officials",
    "halloween": "Halloween",
    "hanukkah": "Hanukkah",
    "harassment": "sex crime",
    "hasidism": "Hasidism",
    "health research": "medical research",
    "heritage": "monument and heritage site",
    "hiv and aids": "HIV and AIDS",
    "holiday": "public holiday",
    "horror movies": "movies",
    "hospitality": "hotel and accommodation",
    "identity theft": "cyber crime",
    "immunity": "vaccine",
    "individualism": "society",
    "indy racing": "Indy Racing",
    "internet": "online media outlet",
    "journalist": "news media",
    "judaism": "Judaism",
    "jury": "court",
    "labour strike": "labor strike",
    "lawsuit": "litigation",
    "lgbtq": "LGBTQ",
    "libel": "civil law",
    "medicaid": "healthcare policy",
    "medicare": "healthcare policy",
    "mental health and disorder": "mental health",
    "methodist": "Methodist",
    "midsummer": "Midsummer",
    "military": "armed forces",
    "mormonism": "Mormonism",
    "murder": "homicide",
    "naming": "name ceremony",
    "non-governmental organisation (ngo)": "non-governmental organisation (NGO)",
    "nutrition": "diet",
    "oligarchy": "dictatorship",
    "olympic games": "Olympic Games",
    "organ transplant": "health treatment and procedure",
    "paralympic games": "Paralympic Games",
    "paranormal phenomena": "arts, culture, entertainment and media",
    "planet": "nature",
    "plant": "flowers and plants",
    "pregnancy": "pregnancy and childbirth",
    "promotion": "advertising",
    "racial discrimination": "racism",
    "ramadan": "Ramadan",
    "rape": "sex crime",
    "recipes": "cooking and baking",
    "river": "rivers",
    "robbbery and theft": "robbery and theft",
    "sanitation": "health",
    "senator": "public officials",
    "sentiment": "mental health",
    "sexual behaviour": "sexual behavior",
    "sleep and health": "health",
    "snack food": "grocery",
    "species": "animal",
    "sports betting": "gaming and lottery",
    "sports broadcasting": "television industry",
    "sweepstakes": "gaming and lottery",
    "taekwon-do": "Taekwon-Do",
    "tariffs": "tariff",
    "technology": "science and technology",
    "territory": "border dispute",
    "theatre": "theater",
    "ticketing": "advertising",
    "twins": "family",
    "village": "society",
    "warfare": "conflict, war and peace",
    "wealth": "economy",
    "whale": "animal",
    "writing": "literature"
}

In [None]:
def _clean_topic_label(label):
    """Lower-case a hand-assigned label and apply its correction, if one exists.

    Args:
        label: A value from the "iptc_news_topic" column.

    Returns:
        The replacement from ``topic_renames`` when the lower-cased label has
        an entry there, otherwise the lower-cased label. Non-string values
        (e.g. the NaN pandas produces for an empty CSV cell) are returned
        unchanged instead of raising ``AttributeError`` on ``.lower()``.
    """
    if not isinstance(label, str):
        return label
    lowered = label.lower()
    # dict.get does the membership test and the lookup in one step
    return topic_renames.get(lowered, lowered)


df["iptc_news_topic"] = df["iptc_news_topic"].apply(_clean_topic_label)

## Including Broader Topics
For each specific topic, build a list containing the topic itself followed by every broader topic it falls under (if any).

In [None]:
# load our topic data schema
# JSON text is UTF-8; passing the encoding explicitly keeps the platform's
# default encoding from garbling non-ASCII characters in labels or definitions.
with open("data/cptall-en-US.json", encoding="utf-8") as f:
    d = json.load(f)["conceptSet"]

df_schema = pd.DataFrame(d)

# keep only the en-US text of each multilingual field
df_schema["prefLabel"] = df_schema["prefLabel"].apply(lambda x: x["en-US"])
# a definition is optional: concepts without one (missing key, or no dict at all) get None
df_schema["definition"] = df_schema["definition"].apply(
    lambda x: x.get("en-US") if isinstance(x, dict) else None
)

In [None]:
# build label to q-code and q-code to label dictionaries
qcode_to_label = dict(zip(df_schema['qcode'], df_schema['prefLabel']))
label_to_qcode = dict(zip(df_schema['prefLabel'], df_schema['qcode']))

# build a child to parent taxonomy dictionary
child_to_parent = {}

for parent_qcode, narrower in zip(df_schema["qcode"], df_schema["narrower"]):
    # A schema built straight from JSON holds real lists here, while one that
    # went through CSV holds their string form; accept both. Calling
    # pd.notnull / ast.literal_eval on a real list fails, so branch on type.
    if isinstance(narrower, str):
        narrower = ast.literal_eval(narrower)
    if not isinstance(narrower, (list, tuple, set)):
        # NaN / None: this concept has no narrower topics
        continue
    for child_qcode in narrower:
        # if a child is listed under several parents, the last one seen wins
        child_to_parent[child_qcode] = parent_qcode

In [None]:
def get_all_broader_topics(qcode, child_to_parent):
    """Return a topic followed by all of its ancestors, nearest parent first.

    Args:
        qcode: Identifier of the starting topic. Must be hashable.
        child_to_parent: Mapping from a topic's identifier to the identifier
            of its parent topic. Topics without a parent are absent as keys.

    Returns:
        A list starting with ``qcode`` itself, then its parent, grandparent,
        and so on. A topic that is not a key of ``child_to_parent`` yields a
        one-element list.

    If the mapping contains a cycle, the walk stops as soon as it would
    revisit a topic, so every topic appears at most once and the function
    always terminates.
    """
    hierarchy = [qcode]  # start with just itself
    visited = {qcode}
    while qcode in child_to_parent:  # while the topic still has a parent topic
        qcode = child_to_parent[qcode]  # move to the parent topic
        if qcode in visited:
            # cyclic taxonomy data: stop instead of looping forever
            break
        visited.add(qcode)
        hierarchy.append(qcode)
    return hierarchy

In [None]:
# getting q-codes for all topic names (NaN when the label is not in the schema)
df["iptc_news_qcode"] = df["iptc_news_topic"].map(label_to_qcode)

# getting all broader topics as q-codes
df["all_topics_qcode"] = df["iptc_news_qcode"].apply(lambda q: get_all_broader_topics(q, child_to_parent))

# function to convert these q-codes to their actual labels
def qcodes_to_labels(qcodes):
    """Translate q-codes to their labels, keeping any unknown q-code as is."""
    return [qcode_to_label.get(q, q) for q in qcodes]

# A label that is missing from the schema has no q-code, and translating its
# [nan] hierarchy would drop the topic entirely. Keep the label itself in that
# case so every row's list still contains its own topic.
df["all_topics"] = pd.Series(
    [
        qcodes_to_labels(qcodes) if pd.notnull(qcode) else [label]
        for label, qcode, qcodes in zip(
            df["iptc_news_topic"], df["iptc_news_qcode"], df["all_topics_qcode"]
        )
    ],
    index=df.index,
    dtype=object,
)

# dropping q-code columns now that we have our labels
df_all_labels = df.drop(columns=["iptc_news_qcode", "all_topics_qcode"])

## Export

In [None]:
# Write the cleaned labels to disk; index=False keeps the DataFrame index out of the file.
df_all_labels.to_csv(path_or_buf="data/cleaned_topic_labels.csv", index=False)