In [None]:
import re
import unicodedata
import pandas as pd

In [None]:
# Read the harmonized dataset CSV into a DataFrame; later cells add the
# stratification label to it and split it.
clean_df = pd.read_csv(
    filepath_or_buffer='../data/processed/Harmonized_Dataset.csv',
)

In [None]:
# Matches real links (http(s)://..., www....) as well as the placeholder
# tokens URL, LINK and @url. IGNORECASE applies to every alternative, so the
# bare words "url" / "link" are matched in any letter case too.
URL_PATTERN = re.compile(
    r"(https?://\S+|www\.\S+|\bURL\b|\bLINK\b|@url)", flags=re.IGNORECASE
)

# Matches "@" followed by one or more word characters. In a str pattern
# ``\w`` is Unicode-aware, so letters of any script (Arabic included) count.
MENTION_PATTERN = re.compile(r"@\w+")

# Matches a retweet marker: "RT" (any letter case) at the start of the text
# or after whitespace, followed by an @-handle.
# The "@" is mandatory. When it was optional, any ordinary word following a
# standalone "rt" was treated as a handle and rewritten to "@USER", which both
# invented a mention and deleted a content word
# (e.g. "RT if you agree" -> "RT @USER you agree").
# The two capture groups are kept so group numbering is unchanged.
RT_PATTERN = re.compile(r"(^|\s)(RT\s+@\w+)", flags=re.IGNORECASE)

# Matches any run of whitespace; used to collapse it into a single space.
MULTI_SPACE_PATTERN = re.compile(r"\s+")


# Arabic orthographic normalization table: each key is a spelling variant and
# each value is the form it is folded into (an empty value deletes the key).
ARABIC_NORMALIZATION_MAP = {
    "أ": "ا",  # alef with hamza above -> bare alef
    "إ": "ا",  # alef with hamza below -> bare alef
    "آ": "ا",  # alef with madda       -> bare alef
    "ى": "ي",  # alef maksura          -> yeh
    "ؤ": "و",  # waw with hamza        -> waw
    "ئ": "ي",  # yeh with hamza        -> yeh
    "ة": "ه",  # teh marbuta           -> heh
    "ـ": "",  # tatweel (kashida)     -> removed
}


def normalize_arabic(text: str) -> str:
    """Fold Arabic spelling variants into their canonical letters.

    Every key of ``ARABIC_NORMALIZATION_MAP`` found in ``text`` is replaced by
    its mapped value, one key after another in the table's order.

    Args:
        text: The string to normalize.

    Returns:
        A copy of ``text`` with all mapped variants substituted.
    """
    normalized = text
    for variant in ARABIC_NORMALIZATION_MAP:
        canonical = ARABIC_NORMALIZATION_MAP[variant]
        normalized = normalized.replace(variant, canonical)
    return normalized


def remove_invalid_unicode(text: str) -> str:
    """Strip non-printable code points while leaving whitespace in place.

    A character is dropped when its Unicode general category starts with
    ``C`` (control, format, surrogate, private-use, unassigned). Letters,
    marks, digits, punctuation and symbols are in other categories and are
    kept, which covers Arabic/Latin text and individual emoji.

    Whitespace characters (anything for which ``str.isspace()`` is true, such
    as newline, tab and carriage return) are kept even though they are in
    category ``Cc``. Removing them would fuse the words on either side of a
    line break into one token; callers can collapse them to spaces instead.

    Note: U+200D ZERO WIDTH JOINER is a format character (``Cf``) and is still
    removed, so a ZWJ emoji sequence is split into its component emoji.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without the dropped code points.
    """
    return "".join(
        ch for ch in text if ch.isspace() or unicodedata.category(ch)[0] != "C"
    )


# Precompiled once, in line with the other module-level patterns.
# Arabic diacritics: U+064B..U+065F plus U+0670.
_DIACRITICS_PATTERN = re.compile(r"[\u064B-\u065F\u0670]")
# Three or more consecutive "!", "?" or Arabic question marks.
_REPEATED_PUNCT_PATTERN = re.compile(r"([!؟?]){3,}")


def preprocess_tweet(text: str) -> str:
    """Normalize a raw tweet into a cleaned single-line string.

    The original author's note describes this as preprocessing for Arabic
    Twitter hate-speech data, intended for MARBERTv2 and OSACT-style datasets.

    Steps, in order:
        1. Collapse every whitespace run (newlines and tabs included) into
           one space.
        2. Drop control/format/unassigned code points.
        3. Replace URLs and URL placeholders with ``URL``.
        4. Replace ``@handle`` mentions with ``@USER``.
        5. Normalize retweet markers to ``RT @USER``.
        6. Fold Arabic letter variants (``normalize_arabic``).
        7. Remove Arabic diacritics.
        8. Shorten runs of 3+ ``!``/``?``/``؟`` to two characters.
        9. Collapse whitespace again and trim both ends.

    Args:
        text: The raw tweet. Any non-``str`` value (e.g. a NaN from pandas)
            is treated as missing.

    Returns:
        The cleaned tweet, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # 1. Turn newlines/tabs/other whitespace into plain spaces FIRST.
    #    These are Unicode control characters, so running the Unicode cleanup
    #    before this step would delete them and glue the neighbouring words
    #    together ("a\nb" -> "ab") instead of separating them with a space.
    text = MULTI_SPACE_PATTERN.sub(" ", text)

    # 2. Unicode cleanup (plain spaces are not in a "C" category and survive)
    text = remove_invalid_unicode(text)

    # 3. Normalize URLs
    text = URL_PATTERN.sub(" URL ", text)

    # 4. Normalize mentions
    text = MENTION_PATTERN.sub("@USER", text)

    # 5. Normalize RT marker (runs after step 4 so handles are already @USER)
    text = RT_PATTERN.sub(" RT @USER ", text)

    # 6. Arabic orthographic normalization
    text = normalize_arabic(text)

    # 7. Remove diacritics
    text = _DIACRITICS_PATTERN.sub("", text)

    # 8. Remove excessive punctuation repetition; the group holds the last
    #    character of the run, which is emitted twice.
    text = _REPEATED_PUNCT_PATTERN.sub(r"\1\1", text)

    # 9. Normalize whitespace introduced by the substitutions above
    text = MULTI_SPACE_PATTERN.sub(" ", text).strip()

    return text


In [None]:
def dominant_subtype(row):
    """Pick a single label for ``row`` to stratify on.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) with an ``is_hate``
            field and, when ``is_hate`` is not 0, the flag fields ``OH``,
            ``GH`` and ``RH``.

    Returns:
        ``"NH"`` when ``is_hate`` equals 0; otherwise the first of ``"OH"``,
        ``"GH"``, ``"RH"`` (checked in that priority order) whose flag equals
        1; ``"UNK"`` when none of the three flags is set.
    """
    if row["is_hate"] == 0:
        return "NH"

    # Flags are inspected lazily, highest priority first; the first hit wins.
    priority = ("OH", "GH", "RH")
    return next((label for label in priority if row[label] == 1), "UNK")


# Label every row with its dominant subtype (row-wise apply); the column is
# used below as the stratification key for the splits.
clean_df["stratify_label"] = clean_df.apply(func=dominant_subtype, axis="columns")

# Show how many rows fall into each stratum.
print(clean_df["stratify_label"].value_counts())


In [None]:
from sklearn.model_selection import train_test_split


# First cut: hold out 20% of the rows, stratified on the subtype label.
train_df, temp_df = train_test_split(
    clean_df,
    stratify=clean_df["stratify_label"],
    test_size=0.2,
    random_state=42,
)

# Second cut: split the held-out rows in half (dev / test), again stratified
# on the subtype label and with the same seed.
dev_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df["stratify_label"],
    test_size=0.5,
    random_state=42,
)

In [None]:
# Write the three splits to parquet (train, then dev, then test), without
# the DataFrame index.
for _split_df, _split_path in (
    (train_df, '../data/processed/train.parquet'),
    (dev_df, '../data/processed/dev.parquet'),
    (test_df, '../data/processed/test.parquet'),
):
    _split_df.to_parquet(_split_path, index=False)

print("✅ Data splits saved to data/processed/")