# Imports and Setup

In [5]:
import os, pathlib

# Move from …\Analysis  →  …\Israel-Palestine-Political-Affiliation-Text-Classification
os.chdir(pathlib.Path.cwd().parent)

print("Now working in:", pathlib.Path.cwd())

Now working in: c:\Users\yonat\CodeProjects


In [6]:
import optuna

from sklearn.metrics import f1_score
from sklearn.utils import resample
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Local Code
from classifiers import *
from dataset import EmbeddingDataset, TextDataset
from embedder import Embedder
from Config.dataset_config import *


# Define optimizers

In [7]:
# Helper dataset
class HelperDataset(Dataset):
    def __init__(self, X, y):
        super(HelperDataset).__init__()
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, item):
        return self.X[item], self.y[item]
    
# Custom tqdm callback
class TqdmCallback:
    def __init__(self, n_trials):
        self.pbar = tqdm(total=n_trials)

    def __call__(self, study, trial):
        self.pbar.update(1)

    def close(self):
        self.pbar.close()

In [8]:
# ------------------------------------------------------------------
# 0.  Utilities and Hyperparameters Space
# ------------------------------------------------------------------

model_hyperparameters = {
    'logistic_regression': {
        'num_epochs': (5, 20, 'int'),
        'learning_rate': (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform'),
        "batch_norm": ([False], 'categorical'),    # Do not modify in optimization
        "drop_out": (0, 0, 'int'),    # Do not modify in optimization
        "layers": ([[768, 3]], 'categorical') # Do not modify in optimization
    },
    'svm': {
        'C': (1e-4, 1e2, 'loguniform'),
        'kernel': (['linear', 'rbf', 'sigmoid'], 'categorical'),
        'degree': (2, 4, 'int'),
        'gamma': (['scale', 'auto'], 'categorical')
    },
    'xgboost': {
        'n_estimators': (50, 200, 'int'),
        'learning_rate': (1e-3, 0.2, 'loguniform'),
        'booster': (['gbtree', 'gblinear', 'dart'], 'categorical'),
        'max_depth': (3, 10, 'int'),
        'min_child_weight': (2, 10, 'int'),
        'colsample_bytree': (0.5, 1.0, 'uniform'),
        'subsample': (0.5, 1.0, 'uniform'),
        'reg_alpha': (1e-8, 10.0, 'loguniform'),
        'reg_lambda': (1e-8, 10.0, 'loguniform'),
        'gamma': (1e-8, 1.0, 'loguniform')
    },
    'dnn': {
        "num_epochs": (5, 20, 'int'),  # Adjust after trial and error
        "learning_rate": (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform'),
        "batch_norm": ([True, False], 'categorical'),
        "drop_out": (0.0, 0.5, 'uniform'),
        "layers": ([[768, 64, 3],
                    [768, 128, 3],
                    [768, 64, 64, 3],
                    [768, 128, 64, 3],
                    [768, 512, 32, 3],
                    [768, 512, 128, 3],
                    [768, 512, 128, 64, 3]], 'categorical')  # Layer dimensions, including an input and output layer.
    }
}

def _build_estimator(model_name: str, params: dict) -> Classifier:
    """
    Always create a **fresh** classifier for a trial.
    """
    if model_name not in {"logistic_regression", "svm", "xgboost", "dnn"}:
        raise ValueError(f"Unsupported model: {model_name}")
    return Classifier(params, model_type=model_name, log=False)

def macro_f1_01(y_true, y_pred_or_proba, threshold_0=0.5, threshold_1=0.5):
    """
    Macro‑F1 for classes 0 & 1 (class 2 ignored).
    Used in order to optimize the classifier towards the important classes: 0 & 1.
    """
    y_true = np.asarray(y_true)
    yp = np.asarray(y_pred_or_proba)

    # probs → hard labels
    if yp.ndim == 2:
        if threshold_0 != 0.5 or threshold_1 != 0.5:    # custom cut‑offs
            y_pred = np.full(len(yp), 2, dtype=int)
            y_pred[yp[:, 0] >= threshold_0] = 0
            y_pred[yp[:, 1] >= threshold_1] = 1
        else:                                           # argmax
            y_pred = yp.argmax(axis=1)
    else:
        y_pred = yp.astype(int)

    return f1_score(y_true, y_pred, labels=[0, 1],
                    average="macro", zero_division=0)

def _bootstrap_ci(y_true, y_pred, n_iter: int = 1000, alpha: float = .05):
    """basic percentile bootstrap CI around macro‑F1(0,1)"""
    y_true = np.asarray(y_true)  # <-- safe cast
    y_pred = np.asarray(y_pred)
    n       = len(y_true)
    stats   = []
    for _ in range(n_iter):
        idx  = resample(np.arange(n), replace=True, n_samples=n)
        stats.append(macro_f1_01(y_true[idx], y_pred[idx]))
    lower, upper = np.percentile(stats, [100*alpha/2, 100*(1-alpha/2)])
    return float(lower), float(upper)

def _score_on_validation(estimator: Classifier,
                         X_tr, y_tr, X_val, y_val, return_ci: bool = False):
    """
    Fit on TRAIN → score on VAL.
    Handles both scikit‑learn and PyTorch heads.
    return_ci flag will return both score and confidence interval.
    """
    # ‑‑‑ prepare loaders   (HelperDataset just wraps (X,y) tensors/ndarrays)
    train_loader = DataLoader(HelperDataset(X_tr,  y_tr),
                              batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(HelperDataset(X_val, y_val),
                              batch_size=BATCH_SIZE, shuffle=False)

    # -- fit + predict ------------------------------------------------------
    estimator.fit((train_loader, (X_tr, y_tr)))
    preds = estimator.predict((val_loader, (X_val, y_val)))

    # -- score --------------------------------------------------------------
    score = macro_f1_01(y_val, preds)

    if not return_ci:
        return score

    ci_low, ci_high = _bootstrap_ci(y_val, preds)
    return score, ci_low, ci_high

# ------------------------------------------------------------------
# 1.  Optuna — hyper‑param suggestion helper
# ------------------------------------------------------------------
def suggest_hyperparameters(trial, hp_space):
    """
    hp_space is the dict that lives in model_hyperparameters[…]
    """
    params = {}
    for key, spec in hp_space.items():
        if len(spec) == 2 and spec[1] == "categorical":
            params[key] = trial.suggest_categorical(key, spec[0])

        elif len(spec) == 3:
            low, high, kind = spec
            if kind == "loguniform":
                params[key] = trial.suggest_float(key, low, high, log=True)
            elif kind == "uniform":
                params[key] = trial.suggest_float(key, low, high)
            elif kind == "int":
                params[key] = trial.suggest_int(key, low, high)
            elif kind == "categorical":
                params[key] = trial.suggest_categorical(key, low)
            else:
                raise ValueError(f"Unknown param type: {kind}")
        else:
            raise ValueError(f"Malformed spec for {key}: {spec}")
    return params


# ------------------------------------------------------------------
# 2.  Optuna objective  (no CV — uses VAL split)
# ------------------------------------------------------------------
def objective(trial, model_name: str, X_train, y_train, X_val,   y_val) -> float:

    # -------- suggest hyper‑parameters
    params = suggest_hyperparameters(trial,
                                     model_hyperparameters[model_name])

    # logistic_regression  ≡  one‑layer DNN, override a few things
    if model_name == "logistic_regression":
        params.update(batch_norm=False, drop_out=0.0, layers=[768, 3])

    # booster‑specific extras for XGBoost
    if model_name == "xgboost" and params["booster"] in {"gbtree", "dart"}:
        params.update(
            grow_policy=trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            gamma=trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            max_depth=trial.suggest_int("max_depth", 3, 10),
            min_child_weight=trial.suggest_int("min_child_weight", 2, 10),
            subsample=trial.suggest_float("subsample", 0.5, 1.0)
        )
        if params["booster"] == "dart":
            params.update(
                sample_type   = trial.suggest_categorical("sample_type", ["uniform", "weighted"]),
                normalize_type= trial.suggest_categorical("normalize_type", ["tree", "forest"]),
                rate_drop     = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True),
                skip_drop     = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
            )

    # -------- build, fit, score
    estimator = _build_estimator(model_name, params)
    eval_score = _score_on_validation(estimator, X_train, y_train, X_val, y_val, return_ci=False)
    return eval_score


# ------------------------------------------------------------------
# 3.  Optimiser wrapper  (no CV, keeps history)
# ------------------------------------------------------------------
def optimize_model(model_name: str, X_train, y_train, X_val,   y_val, n_trials: int = 50, timeout: int  = 36000):
    """
    Tune hyper‑parameters by maximising macro‑F1 (0+1 classes) on VAL set.
    Returns: best_params, best_score, all_trial_scores
    """
    all_scores = []

    def _optuna_obj(trial):
        score = objective(trial, model_name, X_train, y_train, X_val,   y_val)
        all_scores.append(score)
        return score

    study = optuna.create_study(direction="maximize")
    study.optimize(_optuna_obj, n_trials=n_trials, timeout=timeout, callbacks=[TqdmCallback(n_trials)], n_jobs=3) # Uses all available cores (-1)

    best_params = study.best_params
    best_score  = study.best_value

    # ------------------------------------------------------------------
    # refit *best* on full TRAIN and compute bootstrap CI --------------
    # ------------------------------------------------------------------
    best_est   = _build_estimator(model_name, best_params)

    best_score, ci_low, ci_high = _score_on_validation(
            best_est, X_train, y_train, X_val, y_val, return_ci=True)
    
    print(f"[{model_name}]  best F1={best_score:.4f}   "
            f"Best hyperparameters: {best_params}",
            f"bootstrap 95% CI=({ci_low:.4f}, {ci_high:.4f})")

    return best_params, best_score, (ci_low, ci_high)

# Optimize models

## Create Datasets

In [10]:
'''
Create 4 different EmbeddingDataset datasets out of the 2 TextDataset datasets:
augmented dataset - with distilbert embedding or tfidf, 
undersampled balanced dataset - with distilbert embedding or tfidf,
regular dataset - with distilbert embedding or tfidf
'''

# ---------------------------------------------
# 1.  Build the three text‑level datasets
# ---------------------------------------------
txt_regular = TextDataset(
    csv_path          = DATA_PATH,
    id_column_idx     = ID_COLUMN_IDX,
    comment_column_idx= COMMENT_COLUMN_IDX,
    label_column_idx  = LABEL_COLUMN_IDX,
    split_column_idx  = SUBSET_COLUMN_IDX,  # TRAIN / VAL / TEST column
    augmented_classes = [],                 # ‑‑ no aug
    augmentation_ratio= 0,
    undersampling_targets = {},             # ‑‑ no undersampling
)

txt_augmented = TextDataset(
    csv_path          = DATA_PATH,
    id_column_idx     = ID_COLUMN_IDX,
    comment_column_idx= COMMENT_COLUMN_IDX,
    label_column_idx  = LABEL_COLUMN_IDX,
    split_column_idx  = SUBSET_COLUMN_IDX,
    augmented_classes = AUGMENTED_CLASSES,           # e.g. ["Pro‑Israel"]
    augmentation_ratio= 3,                            # add 3 extra copies
    undersampling_targets = {},                       # keep original class sizes
)


# ---------------------------------------------
# 2.  Wrap each text set in 1‑EmbeddingDataset
# ---------------------------------------------

EMB_METHODS = ["distilbert", "tf-idf"]    
embedder    = Embedder()                  # your wrapper around HF / TF‑IDF

embedding_sets: dict[str, dict[str, EmbeddingDataset]] = {}   # {method : {variant : EmbeddingDataset}}

for method in EMB_METHODS:
    embedding_sets[method] = {
        "regular"      : EmbeddingDataset(txt_regular,      embedder, method),
        "augmented"    : EmbeddingDataset(txt_augmented,    embedder, method),
    }

# ---------------------------------------------
# 3.  Fetch TRAIN and VAL tensors for each dataset
# ---------------------------------------------
(X_bert_reg_tr ,  y_bert_reg_tr ,
 X_bert_reg_val,  y_bert_reg_val) = (
     *embedding_sets["distilbert"]["regular"].get_subset("TRAIN")[:],
     *embedding_sets["distilbert"]["regular"].get_subset("VAL")[:] )

(X_bert_aug_tr ,  y_bert_aug_tr ,
 X_bert_aug_val,  y_bert_aug_val) = (
     *embedding_sets["distilbert"]["augmented"].get_subset("TRAIN")[:],
     *embedding_sets["distilbert"]["augmented"].get_subset("VAL")[:] )

(X_tfidf_reg_tr , y_tfidf_reg_tr ,
 X_tfidf_reg_val, y_tfidf_reg_val) = (
     *embedding_sets["tf-idf"]["regular"].get_subset("TRAIN")[:],
     *embedding_sets["tf-idf"]["regular"].get_subset("VAL")[:] )

(X_tfidf_aug_tr , y_tfidf_aug_tr ,
 X_tfidf_aug_val, y_tfidf_aug_val) = (
     *embedding_sets["tf-idf"]["augmented"].get_subset("TRAIN")[:],
     *embedding_sets["tf-idf"]["augmented"].get_subset("VAL")[:] )

Cleaning Comments: 100%|██████████| 43214/43214 [00:01<00:00, 27100.97it/s]


[TextDataset] rows: train=29690, val=4213, test=8656


Cleaning Comments: 100%|██████████| 43214/43214 [00:01<00:00, 30924.06it/s]
Augment: 100%|██████████| 29690/29690 [00:59<00:00, 502.14row/s]


[TextDataset] rows: train=55997, val=4213, test=8656


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'Data\\cache'

## Logistic Regression

In [7]:
lr_results = {}
lr_results['bert_without_augmentation'] = optimize_model(
    model_name = 'logistic_regression',
    X_train    = X_bert_reg_tr,
    y_train    = y_bert_reg_tr,
    X_val      = X_bert_reg_val,
    y_val      = y_bert_reg_val
)

[I 2025-05-15 01:34:53,210] A new study created in memory with name: no-name-3261fe19-a354-41d4-8ead-7223de803f7f
[I 2025-05-15 01:36:15,220] Trial 6 finished with value: 0.5630624755445415 and parameters: {'num_epochs': 11, 'learning_rate': 5.8861649521441797e-05, 'weight_decay': 0.0001283251298666932, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 6 with value: 0.5630624755445415.
[I 2025-05-15 01:36:29,210] Trial 7 finished with value: 0.5897455966419685 and parameters: {'num_epochs': 13, 'learning_rate': 0.00021308052631900011, 'weight_decay': 0.00042842478035934984, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 7 with value: 0.5897455966419685.
[I 2025-05-15 01:36:30,591] Trial 5 finished with value: 0.5463948677420855 and parameters: {'num_epochs': 13, 'learning_rate': 3.320329497884912e-05, 'weight_decay': 1.2078949298058857e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 7 with value: 0.589745596641968

[logistic_regression]  best F1=0.6002   Best hyperparameters: {'num_epochs': 20, 'learning_rate': 0.0006763931659185851, 'weight_decay': 9.239168299828083e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]} bootstrap 95% CI=(0.5742, 0.6256)


In [8]:
lr_results['bert_with_augmentation'] = optimize_model(
    model_name = 'logistic_regression',
    X_train    = X_bert_aug_tr,
    y_train    = y_bert_aug_tr,
    X_val      = X_bert_aug_val,
    y_val      = y_bert_aug_val
)

[I 2025-05-15 01:46:33,982] A new study created in memory with name: no-name-cb60c1a9-c45d-4756-9363-82660773b95e
[I 2025-05-15 01:48:28,639] Trial 3 finished with value: 0.6038500506585613 and parameters: {'num_epochs': 8, 'learning_rate': 0.0004801712813066417, 'weight_decay': 5.031909189369294e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 3 with value: 0.6038500506585613.
[I 2025-05-15 01:48:59,324] Trial 1 finished with value: 0.5986384710224565 and parameters: {'num_epochs': 10, 'learning_rate': 6.03519916469526e-05, 'weight_decay': 1.0510129853039426e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 3 with value: 0.6038500506585613.
[I 2025-05-15 01:49:26,864] Trial 0 finished with value: 0.5915897692184126 and parameters: {'num_epochs': 12, 'learning_rate': 3.1941967629191086e-05, 'weight_decay': 0.00031891355885052905, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 3 with value: 0.6038500506585613.


[logistic_regression]  best F1=0.6104   Best hyperparameters: {'num_epochs': 16, 'learning_rate': 0.0008997254467020284, 'weight_decay': 3.8238891604423175e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]} bootstrap 95% CI=(0.5875, 0.6339)


In [9]:
lr_results['tfidf_without_augmentation'] = optimize_model(
    model_name = 'logistic_regression',
    X_train    = X_tfidf_reg_tr,
    y_train    = y_tfidf_reg_tr,
    X_val      = X_tfidf_reg_val,
    y_val      = y_tfidf_reg_val
)

[I 2025-05-15 02:10:09,139] A new study created in memory with name: no-name-77ac5b94-3c57-4161-b40e-403949f3a284
[I 2025-05-15 02:11:09,764] Trial 5 finished with value: 0.010422683651030107 and parameters: {'num_epochs': 8, 'learning_rate': 1.3517061238927537e-05, 'weight_decay': 0.00012884055461963748, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 5 with value: 0.010422683651030107.
[I 2025-05-15 02:11:24,085] Trial 3 finished with value: 0.0013386880856760374 and parameters: {'num_epochs': 10, 'learning_rate': 0.0005938447480906891, 'weight_decay': 4.2404745689931965e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 5 with value: 0.010422683651030107.
[I 2025-05-15 02:11:39,459] Trial 4 finished with value: 0.0 and parameters: {'num_epochs': 12, 'learning_rate': 0.00044634329231872845, 'weight_decay': 0.000689231690958817, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 5 with value: 0.010422683651030107.
[I 

[logistic_regression]  best F1=0.1393   Best hyperparameters: {'num_epochs': 18, 'learning_rate': 0.0009787152414867852, 'weight_decay': 2.323570658313695e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]} bootstrap 95% CI=(0.1195, 0.1593)


In [10]:
lr_results['tfidf_with_augmentation'] = optimize_model(
    model_name = 'logistic_regression',
    X_train    = X_tfidf_aug_tr,
    y_train    = y_tfidf_aug_tr,
    X_val      = X_tfidf_aug_val,
    y_val      = y_tfidf_aug_val
)

[I 2025-05-15 02:20:04,621] A new study created in memory with name: no-name-a4f6c63a-1748-4d52-9ccc-0bffdb9ed181
[I 2025-05-15 02:21:16,005] Trial 4 finished with value: 0.26947749404920934 and parameters: {'num_epochs': 5, 'learning_rate': 2.8588957111980665e-05, 'weight_decay': 0.00040882135337907104, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 4 with value: 0.26947749404920934.
[I 2025-05-15 02:21:16,654] Trial 6 finished with value: 0.30210397686468543 and parameters: {'num_epochs': 5, 'learning_rate': 0.0002004764546118579, 'weight_decay': 1.2858218701355141e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 6 with value: 0.30210397686468543.
[I 2025-05-15 02:21:45,749] Trial 5 finished with value: 0.40815612490271425 and parameters: {'num_epochs': 7, 'learning_rate': 0.00047919651784441913, 'weight_decay': 0.00011483113734142058, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 5 with value: 0.408156124902

[logistic_regression]  best F1=0.5443   Best hyperparameters: {'num_epochs': 17, 'learning_rate': 0.0007273309244100031, 'weight_decay': 1.1436305935567236e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]} bootstrap 95% CI=(0.5189, 0.5687)


In [11]:
print("Logistic Regression results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Eval Score': value[1],
        'CI (95%)': value[2]
    }
    for key, value in lr_results.items()
])
display(df)


Logistic Regression results:




Unnamed: 0,Experiment,Best Parameters,Best Eval Score,CI (95%)
0,bert_without_augmentation,"{'num_epochs': 20, 'learning_rate': 0.00067639...",0.600196,"(0.5741540967583567, 0.6255775550147233)"
1,bert_with_augmentation,"{'num_epochs': 16, 'learning_rate': 0.00089972...",0.610361,"(0.5874523346355454, 0.6339389397280898)"
2,tfidf_without_augmentation,"{'num_epochs': 18, 'learning_rate': 0.00097871...",0.139265,"(0.11945675943110402, 0.15929143635095555)"
3,tfidf_with_augmentation,"{'num_epochs': 17, 'learning_rate': 0.00072733...",0.54434,"(0.5188994560097875, 0.5687425539703366)"


## SVM

In [17]:
svm_results = {}
svm_results['bert_without_augmentation'] = optimize_model(
    model_name = 'svm',
    X_train    = X_bert_reg_tr,
    y_train    = y_bert_reg_tr,
    X_val      = X_bert_reg_val,
    y_val      = y_bert_reg_val,
    n_trials = 100
)

[I 2025-05-15 20:52:23,769] A new study created in memory with name: no-name-ac6006f2-d2b7-445d-93bc-cc87f3a0ad34
  0%|          | 0/100 [00:00<?, ?it/s][I 2025-05-15 20:55:00,458] Trial 1 finished with value: 0.5156607742640308 and parameters: {'C': 0.20638795885710978, 'kernel': 'sigmoid', 'degree': 3, 'gamma': 'scale'}. Best is trial 1 with value: 0.5156607742640308.
  1%|          | 1/100 [02:36<4:18:32, 156.69s/it][I 2025-05-15 21:02:52,366] Trial 3 finished with value: 0.5913438938525919 and parameters: {'C': 0.0008370874233199209, 'kernel': 'linear', 'degree': 2, 'gamma': 'auto'}. Best is trial 3 with value: 0.5913438938525919.
  2%|▏         | 2/100 [10:28<9:18:46, 342.11s/it][I 2025-05-15 21:04:37,205] Trial 2 finished with value: 0.18373812038014783 and parameters: {'C': 0.0007346646677901582, 'kernel': 'rbf', 'degree': 2, 'gamma': 'scale'}. Best is trial 3 with value: 0.5913438938525919.
  3%|▎         | 3/100 [12:13<6:17:55, 233.77s/it][I 2025-05-15 21:05:40,036] Trial 4 fi

[svm]  best F1=0.6382   Best hyperparameters: {'C': 14.113323810506161, 'kernel': 'rbf', 'degree': 4, 'gamma': 'scale'} bootstrap 95% CI=(0.6143, 0.6624)


In [None]:
svm_results['bert_with_augmentation'] = optimize_model(
    model_name = 'svm',
    X_train    = X_bert_aug_tr,
    y_train    = y_bert_aug_tr,
    X_val      = X_bert_aug_val,
    y_val      = y_bert_aug_val
)

[I 2025-05-15 04:02:06,168] A new study created in memory with name: no-name-adcad1c0-2e61-40d5-a989-98101c03b836
  0%|          | 0/50 [00:00<?, ?it/s][I 2025-05-15 04:15:08,474] Trial 0 finished with value: 0.5988766053831862 and parameters: {'C': 14.294579237140699, 'kernel': 'sigmoid', 'degree': 4, 'gamma': 'auto'}. Best is trial 0 with value: 0.5988766053831862.
  2%|▏         | 1/50 [13:02<10:38:52, 782.30s/it][I 2025-05-15 04:24:05,265] Trial 4 finished with value: 0.6339189024611019 and parameters: {'C': 0.26744797760450434, 'kernel': 'rbf', 'degree': 2, 'gamma': 'scale'}. Best is trial 4 with value: 0.6339189024611019.
  4%|▍         | 2/50 [21:59<8:30:18, 637.88s/it] [I 2025-05-15 04:28:09,161] Trial 6 finished with value: 0.6220872796775561 and parameters: {'C': 0.05849434719062382, 'kernel': 'rbf', 'degree': 4, 'gamma': 'scale'}. Best is trial 4 with value: 0.6339189024611019.
  6%|▌         | 3/50 [26:02<5:58:45, 457.99s/it][I 2025-05-15 04:31:24,912] Trial 7 finished with

In [None]:
svm_results['tfidf_without_augmentation'] = optimize_model(
    model_name = 'svm',
    X_train    = X_tfidf_reg_tr,
    y_train    = y_tfidf_reg_tr,
    X_val      = X_tfidf_reg_val,
    y_val      = y_tfidf_reg_val
)

In [None]:
svm_results['tfidf_with_augmentation'] = optimize_model(
    model_name = 'svm',
    X_train    = X_tfidf_aug_tr,
    y_train    = y_tfidf_aug_tr,
    X_val      = X_tfidf_aug_val,
    y_val      = y_tfidf_aug_val
)

In [18]:
print("SVM results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Eval Score': value[1],
        'CI (95%)': value[2]
    }
    for key, value in svm_results.items()
])
display(df)

SVM results:




Unnamed: 0,Experiment,Best Parameters,Best Eval Score,CI (95%)
0,bert_without_augmentation,"{'C': 14.113323810506161, 'kernel': 'rbf', 'de...",0.638179,"(0.6142822431331255, 0.6624198682227005)"


## XGBoost

In [13]:
xgb_results = {}
xgb_results['bert_without_augmentation'] = optimize_model(
    model_name = 'xgboost',
    X_train    = X_bert_reg_tr,
    y_train    = y_bert_reg_tr,
    X_val      = X_bert_reg_val,
    y_val      = y_bert_reg_val,
    n_trials = 100
)

[I 2025-05-15 16:40:23,531] A new study created in memory with name: no-name-d54c6898-92de-4c8c-a120-ed40d3d5c6a4
  0%|          | 0/100 [00:00<?, ?it/s][I 2025-05-15 16:40:43,521] Trial 1 finished with value: 0.5872804061025436 and parameters: {'n_estimators': 129, 'learning_rate': 0.008665749332990566, 'booster': 'gblinear', 'max_depth': 3, 'min_child_weight': 10, 'colsample_bytree': 0.9518770512514789, 'subsample': 0.5930836128270734, 'reg_alpha': 0.00023049735300880039, 'reg_lambda': 0.002832714474683314, 'gamma': 0.33945450614017925}. Best is trial 1 with value: 0.5872804061025436.
  1%|          | 1/100 [00:19<32:59, 19.99s/it][I 2025-05-15 16:41:12,617] Trial 2 finished with value: 0.5971101781226416 and parameters: {'n_estimators': 112, 'learning_rate': 0.02697426543063567, 'booster': 'gbtree', 'max_depth': 3, 'min_child_weight': 2, 'colsample_bytree': 0.8869256703299871, 'subsample': 0.6379367146114844, 'reg_alpha': 7.709908337574803, 'reg_lambda': 1.3138231424348024e-08, 'gam

[xgboost]  best F1=0.6400   Best hyperparameters: {'n_estimators': 97, 'learning_rate': 0.19893565401854227, 'booster': 'gbtree', 'max_depth': 7, 'min_child_weight': 3, 'colsample_bytree': 0.698884472870104, 'subsample': 0.5802363751849828, 'reg_alpha': 1.1264120790978405e-07, 'reg_lambda': 0.7242980309966568, 'gamma': 1.0153525598898357e-05, 'grow_policy': 'depthwise'} bootstrap 95% CI=(0.6136, 0.6633)


In [None]:
xgb_results['bert_with_augmentation'] = optimize_model(
    model_name = 'xgboost',
    X_train    = X_bert_aug_tr,
    y_train    = y_bert_aug_tr,
    X_val      = X_bert_aug_val,
    y_val      = y_bert_aug_val
)

In [9]:
xgb_results['tfidf_without_augmentation'] = optimize_model(
    model_name = 'xgboost',
    X_train    = X_tfidf_reg_tr,
    y_train    = y_tfidf_reg_tr,
    X_val      = X_tfidf_reg_val,
    y_val      = y_tfidf_reg_val
)

[I 2025-05-15 13:45:37,701] A new study created in memory with name: no-name-c2b88e3f-1372-4634-ac26-54b8b04482c5
  0%|          | 0/50 [00:00<?, ?it/s][I 2025-05-15 13:45:54,275] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 146, 'learning_rate': 0.15020861438022654, 'booster': 'gblinear', 'max_depth': 10, 'min_child_weight': 4, 'colsample_bytree': 0.8467262422988333, 'subsample': 0.8104585153371822, 'reg_alpha': 0.009353432542695172, 'reg_lambda': 0.0006814361471557467, 'gamma': 0.00016276417702419104}. Best is trial 0 with value: 0.0.
  2%|▏         | 1/50 [00:16<13:32, 16.57s/it][I 2025-05-15 13:46:00,494] Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 194, 'learning_rate': 0.06725107414171043, 'booster': 'gblinear', 'max_depth': 6, 'min_child_weight': 8, 'colsample_bytree': 0.6337009009790975, 'subsample': 0.5620620904155708, 'reg_alpha': 0.1067107676093894, 'reg_lambda': 0.001083519849260911, 'gamma': 7.114473067716964e-06}. Best is trial 0 

[xgboost]  best F1=0.5769   Best hyperparameters: {'n_estimators': 96, 'learning_rate': 0.13671810205729562, 'booster': 'gbtree', 'max_depth': 10, 'min_child_weight': 9, 'colsample_bytree': 0.9544259373139633, 'subsample': 0.7821265873367009, 'reg_alpha': 1.8675227772962615e-06, 'reg_lambda': 0.0003869335467411198, 'gamma': 0.23432819427336474, 'grow_policy': 'lossguide'} bootstrap 95% CI=(0.5520, 0.6039)


In [None]:
xgb_results['tfidf_with_augmentation'] = optimize_model(
    model_name = 'xgboost',
    X_train    = X_tfidf_aug_tr,
    y_train    = y_tfidf_aug_tr,
    X_val      = X_tfidf_aug_val,
    y_val      = y_tfidf_aug_val
)

In [14]:
print("XGBoost results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Eval Score': value[1],
        'CI (95%)': value[2]
    }
    for key, value in xgb_results.items()
])
display(df)

XGBoost results:




Unnamed: 0,Experiment,Best Parameters,Best Eval Score,CI (95%)
0,bert_without_augmentation,"{'n_estimators': 97, 'learning_rate': 0.198935...",0.63998,"(0.6136302135287747, 0.6633417826405086)"


## DNN

Given the potential a network classification head shown, we'll attempt to optimise it to train-val loss, with adjustments to learning process.

This is a 1-block flow to train / optimize the process for a DNN on the embedding dataset.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

from dataset import TextDataset, EmbeddingDataset
from embedder import Embedder
from classifiers import assess_model, get_dataloader, DNN
from Config.dataset_config import *
from Config.classifiers_config import *

from sklearn.metrics import f1_score

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("[Dataset Status]: Building datasets and dataloaders...")
text_dataset = TextDataset(
    csv_path          = DATA_PATH,
    id_column_idx     = ID_COLUMN_IDX,
    comment_column_idx= COMMENT_COLUMN_IDX,
    label_column_idx  = LABEL_COLUMN_IDX,
    split_column_idx  = SUBSET_COLUMN_IDX,  # TRAIN / VAL / TEST column
    augmented_classes = [],                 # ‑‑ no aug
    augmentation_ratio= 0,
    undersampling_targets = {},             # ‑‑ no undersampling
)

embedding_dataset = EmbeddingDataset(
    text_dataset=text_dataset,
    embedder=Embedder(),
    embedding_method=EMBEDDING_METHOD
)

train_ds = embedding_dataset.get_subset('TRAIN')
val_ds   = embedding_dataset.get_subset('VAL')
test_ds  = embedding_dataset.get_subset('TEST')

train_package = get_dataloader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_package   = get_dataloader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_package  = get_dataloader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# Unpack for sklearn metrics
_, (X_val, y_val)   = val_package
_, (X_test, y_test) = test_package


print("[Model Fit Status]: Training direct DNN...")
model_config = {
        'num_epochs': 30, 
        'learning_rate': 3e-4, 
        'weight_decay': 1e-4, 
        'batch_norm': True, 
        'drop_out': 0.5, 
        'layers': [768, 512, 256, 128, 64, 3]
    }
dnn = DNN(model_config).to(DEVICE)
train_loader, (X_train, y_train) = train_package
val_loader, _                     = val_package

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)
criterion = nn.CrossEntropyLoss(weight=weights_tensor)
optimizer = optim.Adam(dnn.parameters(), lr=model_config["learning_rate"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=2)
best_f1 = -1
best_model_path = os.path.join(CHECKPOINTS, "best_dnn.pt")

train_losses, val_losses, val_f1_scores = [], [], []

for epoch in range(model_config['num_epochs']):
    dnn.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        preds = dnn(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)

    train_loss = total_loss / len(train_loader.dataset)
    train_losses.append(train_loss)

    # Validation
    dnn.eval()
    val_preds, val_labels = [], []
    total_val_loss = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            preds = dnn(xb)
            loss = criterion(preds, yb)
            total_val_loss += loss.item() * xb.size(0)
            val_preds.extend(preds.argmax(dim=1).cpu().numpy())
            val_labels.extend(yb.cpu().numpy())

    val_loss = total_val_loss / len(val_loader.dataset)
    val_losses.append(val_loss)

    # Macro-F1
    val_f1 = f1_score(np.array(val_labels), np.array(val_preds), average="macro", zero_division=0)
    val_f1_scores.append(val_f1)
    scheduler.step(val_f1)

    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(dnn.state_dict(), best_model_path)
        print(f"Epoch {epoch+1}/{model_config['num_epochs']}: Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | F1: {val_f1:.4f} | ↪️ Saved new best model.")
    else:
        print(f"Epoch {epoch+1}/{model_config['num_epochs']}: Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | F1: {val_f1:.4f}")


print(f"\n[Evaluation on Test Set]")
dnn.eval()
all_preds = []
with torch.no_grad():
    for xb, _ in test_package[0]:
        xb = xb.to(DEVICE)
        preds = dnn(xb)
        all_preds.extend(preds.argmax(dim=1).cpu().numpy())

# Classification report
_, _ = assess_model(all_preds, test_package, valid_labels=[0, 1, 2])

[Dataset Status]: Building datasets and dataloaders...


Cleaning Comments: 100%|██████████| 43214/43214 [00:01<00:00, 28953.44it/s]


[TextDataset] rows: train=29690, val=4213, test=8656
[EmbeddingDataset]: Loading precomputed embeddings from Data\cache\distilbert_embeddings_regular.pkl...
[EmbeddingDataset Status]: Embedding generation complete.
[Dataloader Status]: Preparing the dataloader...
[DL] EmbeddingDataset ready. X shape (29690, 768), y len 29690
[Dataloader Status]: Preparing the dataloader...
[DL] EmbeddingDataset ready. X shape (4213, 768), y len 4213
[Dataloader Status]: Preparing the dataloader...
[DL] EmbeddingDataset ready. X shape (8656, 768), y len 8656
[Model Fit Status]: Training direct DNN...
Epoch 1/30: Train Loss: 0.4243 | Val Loss: 0.6811 | F1: 0.7041 | ↪️ Saved new best model.
Epoch 2/30: Train Loss: 0.3061 | Val Loss: 0.7275 | F1: 0.7196 | ↪️ Saved new best model.
Epoch 3/30: Train Loss: 0.2775 | Val Loss: 0.7467 | F1: 0.7122
Epoch 4/30: Train Loss: 0.2633 | Val Loss: 0.7217 | F1: 0.6999
Epoch 5/30: Train Loss: 0.2524 | Val Loss: 0.8336 | F1: 0.7253 | ↪️ Saved new best model.
Epoch 6/30: Tr