In [None]:
import pandas as pd


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the pre-processed train / dev / test splits (parquet) in one pass.
train_df, dev_df, test_df = map(
    pd.read_parquet,
    (
        "../data/processed/train.parquet",
        "../data/processed/dev.parquet",
        "../data/processed/test.parquet",
    ),
)

# Quick sanity check of the loaded frames (rows, columns).
print(f"Loaded Train: {train_df.shape}, Dev: {dev_df.shape}")

In [None]:
# Name of the label column the classifiers are trained to predict.
TARGET_COL = "stratify_label"

# Features are the "text_clean" column coerced to str; labels are TARGET_COL.
# The same selection is applied to the train and test frames.
X_train, y_train = train_df["text_clean"].astype(str), train_df[TARGET_COL]
X_test, y_test = test_df["text_clean"].astype(str), test_df[TARGET_COL]


# Candidate classifiers, keyed by the short name shown in logs and on the plot.
# Insertion order is the order in which they are trained.
# LR, RF, SVM and DT are built with class_weight="balanced"; GB and NB are
# constructed without a class_weight argument. All seeded estimators use 42.
models = dict(
    [
        ("LR", LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)),
        ("RF", RandomForestClassifier(n_jobs=-1, class_weight="balanced", random_state=42)),
        ("GB", GradientBoostingClassifier(random_state=42)),
        ("SVM", LinearSVC(class_weight="balanced", random_state=42)),
        ("DT", DecisionTreeClassifier(class_weight="balanced", random_state=42)),
        ("NB", MultinomialNB()),
    ]
)


# Long-format results: one row per (model, class, metric). Reset on every run
# of this cell so re-executing it does not duplicate rows.
metrics_data = []

# (display name stored in the "Metric" column, key in classification_report's
# per-class dict), in the order the rows are emitted.
REPORT_METRICS = (
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-Score", "f1-score"),
)

print(f"🚀 Training models on Multi-Class Target: '{TARGET_COL}'...")

# NOTE(review): every candidate is scored on the test split. If these scores
# are used to pick a model, consider scoring on the dev split instead and
# keeping test for the final estimate — confirm intent.
for name, clf in models.items():
    print(f"   -> Training {name}...")

    # Each model gets its own TF-IDF (1- to 3-grams, top 10000 features)
    # fitted on the training text only, followed by the classifier.
    pipe = Pipeline(
        [
            ("tfidf", TfidfVectorizer(ngram_range=(1, 3), max_features=10000)),
            ("clf", clf),
        ]
    )
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # zero_division=0 yields the same scores as the default ("warn" also
    # reports 0) but does not emit an UndefinedMetricWarning for every class
    # a model never predicts.
    report = classification_report(
        y_test, y_pred, output_dict=True, zero_division=0
    )

    for label, scores in report.items():
        # Keep per-class rows only; skip the aggregate entries.
        if label in ("accuracy", "macro avg", "weighted avg"):
            continue

        metrics_data.extend(
            {
                "Model": name,
                "Class": label,
                "Metric": metric_name,
                "Score": scores[report_key],
            }
            for metric_name, report_key in REPORT_METRICS
        )


# Collect the per-class scores into a frame and keep only the F1 rows for plotting.
df_metrics = pd.DataFrame(metrics_data)
f1_data = df_metrics.loc[df_metrics["Metric"].eq("F1-Score")]

# Grouped bar chart: one group per model, one bar per class.
sns.set_theme(style="whitegrid")
fig, chart = plt.subplots(figsize=(14, 7))
sns.barplot(
    data=f1_data,
    x="Model",
    y="Score",
    hue="Class",
    palette="viridis",
    ax=chart,
)

chart.set_title(f"Performance per Hate Class ({TARGET_COL})", fontsize=16)
chart.set_ylabel("F1 Score")
chart.set_ylim(0, 1.1)  # headroom above 1.0 so the bar labels are not clipped
chart.legend(title="Hate Type", bbox_to_anchor=(1.01, 1), loc="upper left")

# Annotate every bar with its score, rounded to two decimals.
for bars in chart.containers:
    chart.bar_label(bars, fmt="%.2f", padding=3, fontsize=9)

fig.tight_layout()
plt.show()
