In [None]:
import pandas as pd
import numpy as np

In [None]:
# Directory (relative to the notebook) that holds every raw corpus file.
base_path = "../data/raw/"


# MLMA: plain CSV with a header row.
mlma = pd.read_csv(f"{base_path}MLMA.csv")


# Both OSACT 2022 splits are header-less, tab-separated text files with the
# same six columns, so they share a single set of reader options.
osact_reader_options = {
    "sep": "\t",
    "header": None,
    "names": ["id", "tweet", "offensiveness", "hate_speech", "vulgarity", "violence"],
    "engine": "python",
    "quoting": 3,  # 3 is csv.QUOTE_NONE: quote characters are treated as plain text
    "on_bad_lines": "skip",  # malformed lines are dropped instead of raising
}

osact_train = pd.read_csv(
    f"{base_path}OSACT2022-sharedTask-train.txt", **osact_reader_options
)

osact_dev = pd.read_csv(
    f"{base_path}OSACT2022-sharedTask-dev.txt", **osact_reader_options
)


# SoHateful: Excel workbook, first sheet.
sohateful = pd.read_excel(f"{base_path}SoHateful.xlsx", sheet_name="Sheet1")

# ArMI 2021: tab-separated file with a header row.
armi = pd.read_csv(f"{base_path}ArMI2021_training.tsv", sep="\t")

# Multi-dataset CSV; only its "brothers" subset is used further down.
ar_hf = pd.read_csv(f"{base_path}ar_hf_112024.csv")


def process_mlma(row):
    """Map one MLMA record onto the shared ``[text, is_hate, OH, GH, RH]`` layout.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"tweet"``, ``"sentiment"`` and ``"target"``.

    Returns
    -------
    pandas.Series
        ``[tweet, is_hate, OH, GH, RH]``. ``is_hate`` is 1 when the lower-cased
        sentiment contains ``"hateful"``. The subtype flags are only set for
        hateful rows and are derived from the lower-cased target.
    """
    sentiment = str(row["sentiment"]).lower()
    target = str(row["target"]).lower()

    is_hate = 1 if "hateful" in sentiment else 0

    oh = gh = rh = 0
    if is_hate:
        # Every subtype uses the same substring test. Gender used to require an
        # exact match, so a padded or compound target value lost its GH flag
        # while origin/religion would still have been detected.
        if "origin" in target:
            oh = 1
        if "religion" in target:
            rh = 1
        if "gender" in target:
            gh = 1

    return pd.Series([row["tweet"], is_hate, oh, gh, rh])


# Harmonise MLMA row by row, name the five output columns and tag the origin.
mlma_processed = (
    mlma.apply(process_mlma, axis=1)
    .set_axis(["text", "is_hate", "OH", "GH", "RH"], axis=1)
    .assign(source="MLMA")
)


# --- 3. Process OSACT (Train + Dev) ---

# Stack both splits into one frame and (re)apply the six column names.
osact_combined = pd.concat([osact_train, osact_dev], ignore_index=True).set_axis(
    ["id", "tweet", "offensiveness", "hate_speech", "vulgarity", "violence"],
    axis=1,
)

# Sanity checks: every tweet is present and longer than one character.
assert not osact_combined["tweet"].isna().any()

assert (osact_combined["tweet"].str.len() > 1).all()

# Spot-check of five random tweets (the value is not displayed mid-cell).
osact_combined.sample(5)["tweet"].values


def process_osact(row):
    """Map one OSACT record onto the shared ``[text, is_hate, OH, GH, RH]`` layout.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"tweet"`` and ``"hate_speech"``.

    Returns
    -------
    pandas.Series
        ``[tweet, is_hate, OH, GH, RH]``. ``is_hate`` is 1 only for labels that
        start with ``"HS"``. ``HS1`` sets OH, ``HS6`` sets GH, and ``HS2`` /
        ``HS3`` set RH; any other ``HS*`` label is hate without a subtype.
    """
    # Strip stray whitespace so a padded "NOT_HS" is still recognised.
    hs_label = str(row["hate_speech"]).strip()

    # Require an explicit HS* label. The previous `!= "NOT_HS"` test counted a
    # missing label (NaN, stringified to "nan") as hate.
    is_hate = 1 if hs_label.startswith("HS") else 0

    oh = gh = rh = 0
    if is_hate:
        if hs_label == "HS1":
            oh = 1
        elif hs_label == "HS6":
            gh = 1
        elif hs_label in ("HS2", "HS3"):
            rh = 1

    return pd.Series([row["tweet"], is_hate, oh, gh, rh])


# Harmonise the combined OSACT rows, name the columns and tag the origin.
osact_processed = (
    osact_combined.apply(process_osact, axis=1)
    .set_axis(["text", "is_hate", "OH", "GH", "RH"], axis=1)
    .assign(source="OSACT")
)


def process_sohateful(row):
    """Map one SoHateful record onto the shared ``[text, is_hate, OH, GH, RH]`` layout.

    The lower-cased answer to question Q4.4 decides everything: ``"nan"``,
    ``"no"`` and ``"none"`` mean not hate; any other answer is hate, and the
    subtype flags are set by substring matches on that answer.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the Q4.4 column and ``"info_text"``.

    Returns
    -------
    pandas.Series
        ``[info_text, is_hate, OH, GH, RH]``.
    """
    answer = str(
        row["Q4.4 :Hate speech type خطاب كراهية موجه ضد أشخاص لهم سمات مشتركة"]
    ).lower()
    content = row["info_text"]

    # Start from "no subtype"; only hateful answers can switch flags on.
    oh = gh = rh = 0
    is_hate = 0 if answer in ("nan", "no", "none") else 1

    if is_hate:
        oh = int("origin-race-nationality" in answer)
        gh = int("gender" in answer)
        rh = int("religion-sect" in answer or "ideology" in answer)

    return pd.Series([content, is_hate, oh, gh, rh])


# Harmonise SoHateful row by row, name the columns and tag the origin.
sohateful_processed = (
    sohateful.apply(process_sohateful, axis=1)
    .set_axis(["text", "is_hate", "OH", "GH", "RH"], axis=1)
    .assign(source="SoHateful")
)


# ArMI rows whose category is one of these are removed before labelling.
exclusions = ["discredit", "damning", "derailing", "threat of violence"]

armi_filtered = armi.loc[~armi["category"].isin(exclusions)].copy()


def process_armi(row):
    """Map one ArMI record onto the shared ``[text, is_hate, OH, GH, RH]`` layout.

    A row whose ``"misogyny"`` value equals ``"misogyny"`` is flagged as both
    hate and gender hate; OH and RH are always 0.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"misogyny"`` and ``"text"``.

    Returns
    -------
    pandas.Series
        ``[text, is_hate, 0, GH, 0]`` with ``is_hate == GH``.
    """
    flagged = int(row["misogyny"] == "misogyny")
    return pd.Series([row["text"], flagged, 0, flagged, 0])


# Harmonise the filtered ArMI rows, name the columns and tag the origin.
armi_processed = (
    armi_filtered.apply(process_armi, axis=1)
    .set_axis(["text", "is_hate", "OH", "GH", "RH"], axis=1)
    .assign(source="ArMI")
)


# Only the rows of the multi-dataset file that belong to "brothers" are used.
brothers_df = ar_hf.loc[ar_hf["dataset"].eq("brothers")].copy()


def process_brothers(row):
    """Map one "brothers" record onto the shared ``[text, is_hate, OH, GH, RH]`` layout.

    ``labels`` is read as an integer (1 = Hate, 0 = Not Hate). Hateful rows are
    flagged as religious hate; OH and GH are always 0.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"labels"`` and ``"text"``.

    Returns
    -------
    pandas.Series
        ``[text, is_hate, 0, 0, RH]``.
    """
    is_hate = int(row["labels"])
    rh = 1 if is_hate else 0
    return pd.Series([row["text"], is_hate, 0, 0, rh])


# Harmonise the "brothers" rows, name the columns and tag the origin.
brothers_processed = (
    brothers_df.apply(process_brothers, axis=1)
    .set_axis(["text", "is_hate", "OH", "GH", "RH"], axis=1)
    .assign(source="Brothers")
)


# Egyptian 5-way corpus: read both parquet splits.
egy_train = pd.read_parquet(f"{base_path}train-egyptian-5-way.parquet")

egy_test = pd.read_parquet(f"{base_path}test-egyptian-5-way.parquet")


egy = pd.concat([egy_train, egy_test], ignore_index=True)

# Normalise the label text (string, lower-case, no surrounding whitespace) ...
egy = egy.assign(label=egy["label"].astype(str).str.lower().str.strip())

# ... and keep only the three classes that map onto OH / GH / RH.
egy = egy.loc[
    egy["label"].isin(["racism", "sexism", "religious discrimination"])
].copy()


def process_egyptian(row):
    """Map one Egyptian 5-way record onto the shared ``[text, is_hate, OH, GH, RH]`` layout.

    Every row is treated as hate. The label selects at most one subtype:
    ``"racism"`` sets OH, ``"sexism"`` sets GH and ``"religious discrimination"``
    sets RH.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"label"`` and ``"text"``.

    Returns
    -------
    pandas.Series
        ``[text, 1, OH, GH, RH]``.
    """
    label = row["label"]

    # One flag per subtype, in OH / GH / RH order.
    oh, gh, rh = (
        int(label == name)
        for name in ("racism", "sexism", "religious discrimination")
    )

    return pd.Series([row["text"], 1, oh, gh, rh])


# Harmonise the Egyptian rows, name the columns and tag the origin.
egy_processed = (
    egy.apply(process_egyptian, axis=1)
    .set_axis(["text", "is_hate", "OH", "GH", "RH"], axis=1)
    .assign(source="Egyptian5Way")
)


# Stack all six harmonised corpora into a single frame with a fresh index.
harmonized_df = pd.concat(
    [
        mlma_processed,
        osact_processed,
        sohateful_processed,
        armi_processed,
        brothers_processed,
        egy_processed,
    ],
    ignore_index=True,
)


# Force the four label columns to integers.
cols = ["is_hate", "OH", "GH", "RH"]

harmonized_df = harmonized_df.astype(dict.fromkeys(cols, int))


print("Total rows:", len(harmonized_df))

print("Distribution:\n", harmonized_df[cols].sum())


harmonized_df.to_csv("../data/interim/Unified_Datasets.csv", index=False)

In [None]:
# Hate-subtype columns, in the order they are unpacked below.
SUBS = ["OH", "GH", "RH"]

# Sources from most to least trusted when the same text appears in several.
PRIORITY = ["OSACT", "Egyptian5Way", "ArMI", "Brothers", "MLMA", "SoHateful"]


def resolve_group(group):
    """Collapse all rows that share one text into a single labelled record.

    ``is_hate`` is 1 if any row in the group has ``is_hate == 1``. The subtype
    flags come only from the highest-priority source (see ``PRIORITY``) present
    in the group:

    * OSACT / Egyptian5Way: flags of that source's first row.
    * SoHateful / MLMA: a flag is 1 if any row of that source sets it.
    * ArMI: GH is 1 if any ArMI row is hate; OH and RH stay 0.
    * Brothers: RH is 1 if any Brothers row is hate; OH and GH stay 0.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with the columns ``text``, ``is_hate``, ``OH``, ``GH``, ``RH`` and
        ``source``.

    Returns
    -------
    pandas.Series
        Keys ``text``, ``is_hate``, ``OH``, ``GH``, ``RH`` and ``sources`` (the
        sorted, comma-joined source names).
    """
    origin = group["source"]
    sources = list(origin.unique())

    # Any source calling the text hate is enough.
    is_hate = int(group["is_hate"].max() == 1)

    # First entry of PRIORITY that contributed a row (None if there is none).
    winner = next((name for name in PRIORITY if name in sources), None)

    oh = gh = rh = 0

    if winner in ("OSACT", "Egyptian5Way"):
        first_row = group[origin == winner].iloc[0]
        oh, gh, rh = first_row[SUBS]

    elif winner in ("SoHateful", "MLMA"):
        totals = group[origin == winner][SUBS].sum()
        oh, gh, rh = (totals > 0).astype(int)

    elif winner == "ArMI":
        # ArMI only carries gender hate.
        gh = int(group[origin == "ArMI"]["is_hate"].max() == 1)

    elif winner == "Brothers":
        # Brothers only carries religious hate.
        rh = int(group[origin == "Brothers"]["is_hate"].max() == 1)

    resolved = {
        "text": group["text"].iloc[0],
        "is_hate": is_hate,
        "OH": int(oh),
        "GH": int(gh),
        "RH": int(rh),
        "sources": ",".join(sorted(sources)),
    }
    return pd.Series(resolved)


print("Resolving duplicates... (This may take a moment)")

# Walk the groups explicitly instead of using groupby(...).apply():
# resolve_group reads group["text"], and apply() handing the grouping column
# to the function is deprecated since pandas 2.2. Iterating a groupby yields
# the complete sub-frame (grouping column included), and groups are still
# visited in sorted text order.
resolved_records = [
    resolve_group(group).to_dict() for _, group in harmonized_df.groupby("text")
]

# Naming the columns keeps the frame well-formed even when there are no groups.
clean_df = pd.DataFrame(
    resolved_records, columns=["text", "is_hate", "OH", "GH", "RH", "sources"]
)

print("-" * 30)

print("Final rows:", len(clean_df))

print("Final distribution:")

print(clean_df[SUBS + ["is_hate"]].sum())

clean_df.to_csv('../data/processed/Harmonized_Dataset.csv', index=False)

print("Saved: Harmonized_Dataset.csv")

In [None]:
print(f"Original shape: {clean_df.shape}")


# Rows labelled as hate that carry none of the three subtype flags.
undefined_mask = clean_df["is_hate"].eq(1) & clean_df[["OH", "GH", "RH"]].eq(0).all(
    axis=1
)


# Drop those rows and renumber the index from zero.
clean_df = clean_df.loc[~undefined_mask].copy().reset_index(drop=True)

print(f"New shape: {clean_df.shape}")

print(f"Dropped {undefined_mask.sum()} rows.")

# Re-evaluate the same condition on the filtered frame as a check.
remaining = (
    clean_df["is_hate"].eq(1) & clean_df[["OH", "GH", "RH"]].eq(0).all(axis=1)
).sum()

print(f"Remaining undefined samples: {remaining}")
