# Imports and Setup

In [1]:
from pathlib import Path
import os

# Start from current path and walk up until we find a known file
curr_path = Path().resolve()
marker_file = "requirements.txt"  # or any other file you know exists in your project root

while not (curr_path / marker_file).exists() and curr_path != curr_path.parent:
    curr_path = curr_path.parent

# Set working directory if found
if (curr_path / marker_file).exists():
    os.chdir(curr_path)
    print("Now working in:", Path.cwd())
else:
    raise RuntimeError(f"Project root with {marker_file} not found.")

Now working in: C:\Users\yonat\CodeProjects\Israel-Palestine-Political-Affiliation-Text-Classification


In [2]:
import optuna

from sklearn.metrics import f1_score
from sklearn.utils import resample
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Local Code
from classifiers import *
from dataset import EmbeddingDataset, TextDataset
from embedder import Embedder
from Config.dataset_config import *


  from .autonotebook import tqdm as notebook_tqdm


# Define optimizers

In [3]:
# Helper dataset
class HelperDataset(Dataset):
    def __init__(self, X, y):
        super(HelperDataset).__init__()
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, item):
        return self.X[item], self.y[item]
    
# Custom tqdm callback
class TqdmCallback:
    def __init__(self, n_trials):
        self.pbar = tqdm(total=n_trials)

    def __call__(self, study, trial):
        self.pbar.update(1)

    def close(self):
        self.pbar.close()

In [4]:
# ------------------------------------------------------------------
# 0.  Utilities and Hyperparameters Space
# ------------------------------------------------------------------

model_hyperparameters = {
    'logistic_regression': {
        'num_epochs': (5, 20, 'int'),
        'learning_rate': (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform'),
        "batch_norm": ([False], 'categorical'),    # Do not modify in optimization
        "drop_out": (0, 0, 'int'),    # Do not modify in optimization
        "layers": (["(768, 3)"], 'categorical') # Do not modify in optimization
    },
    'svm': {
        'C': (1e-4, 1e2, 'loguniform'),
        'kernel': (['linear', 'rbf', 'sigmoid'], 'categorical'),
        'degree': (2, 4, 'int'),
        'gamma': (['scale', 'auto'], 'categorical')
    },
    'xgboost': {
        'n_estimators': (50, 200, 'int'),
        'learning_rate': (1e-3, 0.2, 'loguniform'),
        'booster': (['gbtree', 'gblinear', 'dart'], 'categorical'),
        'max_depth': (3, 10, 'int'),
        'min_child_weight': (2, 10, 'int'),
        'colsample_bytree': (0.5, 1.0, 'uniform'),
        'subsample': (0.5, 1.0, 'uniform'),
        'reg_alpha': (1e-8, 10.0, 'loguniform'),
        'reg_lambda': (1e-8, 10.0, 'loguniform'),
        'gamma': (1e-8, 1.0, 'loguniform')
    },
    'dnn': {
        "num_epochs": (5, 20, 'int'),  # Adjust after trial and error
        "learning_rate": (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform'),
        "batch_norm": ([True, False], 'categorical'),
        "drop_out": (0.0, 0.5, 'uniform'),
        "layers": (["(768, 64, 3)",
                    "(768, 128, 3)",
                    "(768, 64, 64, 3)",
                    "(768, 128, 64, 3)",
                    "(768, 512, 32, 3)",
                    "(768, 512, 128, 3)",
                    "(768, 512, 128, 64, 3)"], 'categorical')  # Layer dimensions, including an input and output layer.
    }
}

def _build_estimator(model_name: str, params: dict) -> Classifier:
    """
    Always create a **fresh** classifier for a trial.
    """
    if model_name not in {"logistic_regression", "svm", "xgboost", "dnn"}:
        raise ValueError(f"Unsupported model: {model_name}")
    return Classifier(params, model_type=model_name, log=False)

def macro_f1_01(y_true, y_pred_or_proba, threshold_0=0.5, threshold_1=0.5):
    """
    Macro-F1 for classes 0 & 1 (class 2 ignored).
    Used in order to optimize the classifier towards the important classes: 0 & 1.
    """
    y_true = np.asarray(y_true)
    yp = np.asarray(y_pred_or_proba)

    # probs → hard labels
    if yp.ndim == 2:
        if threshold_0 != 0.5 or threshold_1 != 0.5:    # custom cut‑offs
            y_pred = np.full(len(yp), 2, dtype=int)
            y_pred[yp[:, 0] >= threshold_0] = 0
            y_pred[yp[:, 1] >= threshold_1] = 1
        else:                                           # argmax
            y_pred = yp.argmax(axis=1)
    else:
        y_pred = yp.astype(int)

    return f1_score(y_true, y_pred, labels=[0, 1],
                    average="macro", zero_division=0)

def _bootstrap_ci(y_true, y_pred, n_iter: int = 1000, alpha: float = .05):
    """basic percentile bootstrap CI around macro-F1(0,1)"""
    y_true = np.asarray(y_true)  # <-- safe cast
    y_pred = np.asarray(y_pred)
    n       = len(y_true)
    stats   = []
    for _ in range(n_iter):
        idx  = resample(np.arange(n), replace=True, n_samples=n)
        stats.append(macro_f1_01(y_true[idx], y_pred[idx]))
    lower, upper = np.percentile(stats, [100*alpha/2, 100*(1-alpha/2)])
    return float(lower), float(upper)

def _score_on_validation(estimator: Classifier,
                         X_tr, y_tr, X_val, y_val, return_ci: bool = False):
    """
    Fit on TRAIN → score on VAL.
    Handles both scikit-learn and PyTorch heads.
    return_ci flag will return both score and confidence interval.
    """
    # ‑‑‑ prepare loaders   (HelperDataset just wraps (X,y) tensors/ndarrays)
    train_loader = DataLoader(HelperDataset(X_tr,  y_tr),
                              batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(HelperDataset(X_val, y_val),
                              batch_size=BATCH_SIZE, shuffle=False)

    # -- fit + predict ------------------------------------------------------
    estimator.fit((train_loader, (X_tr, y_tr)))
    preds = estimator.predict((val_loader, (X_val, y_val)))

    # -- score --------------------------------------------------------------
    score = macro_f1_01(y_val, preds)

    if not return_ci:
        return score

    ci_low, ci_high = _bootstrap_ci(y_val, preds)
    return score, ci_low, ci_high

# ------------------------------------------------------------------
# 1.  Optuna — hyper‑param suggestion helper
# ------------------------------------------------------------------
def suggest_hyperparameters(trial, hp_space):
    """
    hp_space is the dict that lives in model_hyperparameters[…]
    """
    params = {}
    for key, spec in hp_space.items():
        if len(spec) == 2 and spec[1] == "categorical":
            params[key] = trial.suggest_categorical(key, spec[0])

        elif len(spec) == 3:
            low, high, kind = spec
            if kind == "loguniform":
                params[key] = trial.suggest_float(key, low, high, log=True)
            elif kind == "uniform":
                params[key] = trial.suggest_float(key, low, high)
            elif kind == "int":
                params[key] = trial.suggest_int(key, low, high)
            elif kind == "categorical":
                params[key] = trial.suggest_categorical(key, low)
            else:
                raise ValueError(f"Unknown param type: {kind}")
        else:
            raise ValueError(f"Malformed spec for {key}: {spec}")
    return params


# ------------------------------------------------------------------
# 2.  Optuna objective  (no CV — uses VAL split)
# ------------------------------------------------------------------
def objective(trial, model_name: str, X_train, y_train, X_val,   y_val) -> float:

    # -------- suggest hyper‑parameters
    params = suggest_hyperparameters(trial,
                                     model_hyperparameters[model_name])

    # logistic_regression  ≡  one‑layer DNN, override a few things
    if model_name == "logistic_regression":
        params.update(batch_norm=False, drop_out=0.0, layers=[768, 3])

    # booster‑specific extras for XGBoost
    if model_name == "xgboost" and params["booster"] in {"gbtree", "dart"}:
        params.update(
            grow_policy=trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            gamma=trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            max_depth=trial.suggest_int("max_depth", 3, 10),
            min_child_weight=trial.suggest_int("min_child_weight", 2, 10),
            subsample=trial.suggest_float("subsample", 0.5, 1.0)
        )
        if params["booster"] == "dart":
            params.update(
                sample_type   = trial.suggest_categorical("sample_type", ["uniform", "weighted"]),
                normalize_type= trial.suggest_categorical("normalize_type", ["tree", "forest"]),
                rate_drop     = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True),
                skip_drop     = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
            )

    # -------- build, fit, score
    estimator = _build_estimator(model_name, params)
    eval_score = _score_on_validation(estimator, X_train, y_train, X_val, y_val, return_ci=False)
    return eval_score


# ------------------------------------------------------------------
# 3.  Optimiser wrapper  (no CV, keeps history)
# ------------------------------------------------------------------
def optimize_model(model_name: str, X_train, y_train, X_val,   y_val, n_trials: int = 50, timeout: int  = 36000):
    """
    Tune hyper-parameters by maximising macro-F1 (0+1 classes) on VAL set.
    Returns: best_params, best_score, all_trial_scores
    """
    all_scores = []

    def _optuna_obj(trial):
        score = objective(trial, model_name, X_train, y_train, X_val,   y_val)
        all_scores.append(score)
        return score

    study = optuna.create_study(direction="maximize")
    study.optimize(_optuna_obj, n_trials=n_trials, timeout=timeout, callbacks=[TqdmCallback(n_trials)], n_jobs=4) # Uses all available cores (-1)

    best_params = study.best_params
    best_score  = study.best_value

    # ------------------------------------------------------------------
    # refit *best* on full TRAIN and compute bootstrap CI --------------
    # ------------------------------------------------------------------
    best_est   = _build_estimator(model_name, best_params)

    best_score, ci_low, ci_high = _score_on_validation(
            best_est, X_train, y_train, X_val, y_val, return_ci=True)
    
    print(f"[{model_name}]  best F1={best_score:.4f}   "
            f"Best hyperparameters: {best_params}",
            f"bootstrap 95% CI=({ci_low:.4f}, {ci_high:.4f})")

    return best_params, best_score, (ci_low, ci_high)

# Optimize models

## Create Datasets

In [5]:
'''
Create 4 different EmbeddingDataset datasets out of the 2 TextDataset datasets:
augmented dataset - with distilbert embedding or tfidf, 
undersampled balanced dataset - with distilbert embedding or tfidf,
regular dataset - with distilbert embedding or tfidf
'''

# ---------------------------------------------
# 1.  Build the three text‑level datasets
# ---------------------------------------------
txt_regular = TextDataset(
    csv_path          = DATA_PATH,
    id_column_idx     = ID_COLUMN_IDX,
    comment_column_idx= COMMENT_COLUMN_IDX,
    label_column_idx  = LABEL_COLUMN_IDX,
    split_column_idx  = SUBSET_COLUMN_IDX,  # TRAIN / VAL / TEST column
    augmented_classes = [],                 # ‑‑ no aug
    augmentation_ratio= 0,
    undersampling_targets = {},             # ‑‑ no undersampling
)

txt_augmented = TextDataset(
    csv_path          = DATA_PATH,
    id_column_idx     = ID_COLUMN_IDX,
    comment_column_idx= COMMENT_COLUMN_IDX,
    label_column_idx  = LABEL_COLUMN_IDX,
    split_column_idx  = SUBSET_COLUMN_IDX,
    augmented_classes = AUGMENTED_CLASSES,           # e.g. ["Pro‑Israel"]
    augmentation_ratio= 3,                            # add 3 extra copies
    undersampling_targets = {},                       # keep original class sizes
)


# ---------------------------------------------
# 2.  Wrap each text set in 1‑EmbeddingDataset
# ---------------------------------------------

EMB_METHODS = ["distilbert", "tf-idf"]    
embedder    = Embedder()                  # your wrapper around HF / TF‑IDF

embedding_sets: dict[str, dict[str, EmbeddingDataset]] = {}   # {method : {variant : EmbeddingDataset}}

for method in EMB_METHODS:
    embedding_sets[method] = {
        "regular"      : EmbeddingDataset(txt_regular,      embedder, method),
        "augmented"    : EmbeddingDataset(txt_augmented,    embedder, method),
    }

# ---------------------------------------------
# 3.  Fetch TRAIN and VAL tensors for each dataset
# ---------------------------------------------
(X_bert_reg_tr ,  y_bert_reg_tr ,
 X_bert_reg_val,  y_bert_reg_val) = (
     *embedding_sets["distilbert"]["regular"].get_subset("TRAIN")[:],
     *embedding_sets["distilbert"]["regular"].get_subset("VAL")[:] )

(X_bert_aug_tr ,  y_bert_aug_tr ,
 X_bert_aug_val,  y_bert_aug_val) = (
     *embedding_sets["distilbert"]["augmented"].get_subset("TRAIN")[:],
     *embedding_sets["distilbert"]["augmented"].get_subset("VAL")[:] )

(X_tfidf_reg_tr , y_tfidf_reg_tr ,
 X_tfidf_reg_val, y_tfidf_reg_val) = (
     *embedding_sets["tf-idf"]["regular"].get_subset("TRAIN")[:],
     *embedding_sets["tf-idf"]["regular"].get_subset("VAL")[:] )

(X_tfidf_aug_tr , y_tfidf_aug_tr ,
 X_tfidf_aug_val, y_tfidf_aug_val) = (
     *embedding_sets["tf-idf"]["augmented"].get_subset("TRAIN")[:],
     *embedding_sets["tf-idf"]["augmented"].get_subset("VAL")[:] )

Cleaning Comments: 100%|██████████| 43214/43214 [00:01<00:00, 30691.68it/s]


[TextDataset] rows: train=33162, val=8390, test=1007


Cleaning Comments: 100%|██████████| 43214/43214 [00:01<00:00, 34670.07it/s]
Augment: 100%|██████████| 33162/33162 [01:12<00:00, 456.65row/s]


[TextDataset] rows: train=61512, val=8390, test=1007
[EmbeddingDataset]: Loading precomputed embeddings from Data\cache\distilbert_embeddings_regular.pkl...
[EmbeddingDataset Status]: Embedding generation complete.
[EmbeddingDataset]: Loading precomputed embeddings from Data\cache\distilbert_embeddings_augmented.pkl...
[EmbeddingDataset Status]: Embedding generation complete.
[EmbeddingDataset]: Loading precomputed embeddings from Data\cache\tf-idf_embeddings_regular.pkl...
[EmbeddingDataset Status]: Embedding generation complete.
[EmbeddingDataset]: Loading precomputed embeddings from Data\cache\tf-idf_embeddings_augmented.pkl...
[EmbeddingDataset Status]: Embedding generation complete.


## Logistic Regression

In [10]:
lr_results = {}
lr_results['bert_without_augmentation'] = optimize_model(
    model_name = 'logistic_regression',
    X_train    = X_bert_reg_tr,
    y_train    = y_bert_reg_tr,
    X_val      = X_bert_reg_val,
    y_val      = y_bert_reg_val
)

[I 2025-06-07 23:34:03,473] A new study created in memory with name: no-name-02654ef2-2142-486b-851e-9cf5659d757b

[A
[A
[A
[A
[A
Predicting: 100%|██████████████████████████████████████████████████| 66/66 [00:00<00:00, 112.40it/s]
[I 2025-06-07 23:34:20,848] Trial 0 finished with value: 0.5997764780022845 and parameters: {'num_epochs': 6, 'learning_rate': 5.090969641367862e-05, 'weight_decay': 7.089226155590825e-05, 'batch_norm': False, 'drop_out': 0, 'layers': '(768, 3)'}. Best is trial 0 with value: 0.5997764780022845.

[A
[A
[A
[A
[A
Predicting: 100%|██████████████████████████████████████████████████| 66/66 [00:00<00:00, 115.28it/s]
[I 2025-06-07 23:34:29,447] Trial 2 finished with value: 0.611437148050954 and parameters: {'num_epochs': 9, 'learning_rate': 7.743742002890808e-05, 'weight_decay': 4.455959897396848e-05, 'batch_norm': False, 'drop_out': 0, 'layers': '(768, 3)'}. Best is trial 2 with value: 0.611437148050954.

[A
[A
[A
[A
[A
Predicting: 100%|██████████████

[logistic_regression]  best F1=0.6265   Best hyperparameters: {'num_epochs': 18, 'learning_rate': 0.0009969490839651179, 'weight_decay': 3.924609836734019e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]} bootstrap 95% CI=(0.6099, 0.6434)


In [11]:
lr_results['bert_with_augmentation'] = optimize_model(
    model_name = 'logistic_regression',
    X_train    = X_bert_aug_tr,
    y_train    = y_bert_aug_tr,
    X_val      = X_bert_aug_val,
    y_val      = y_bert_aug_val
)

[I 2025-06-07 23:46:40,100] A new study created in memory with name: no-name-74380694-e240-4744-b556-fe8a9096b206

[A
[A
[A
[A
[A
[A
Predicting: 100%|███████████████████████████████████████████████████| 66/66 [00:00<00:00, 97.69it/s]
[I 2025-06-07 23:48:08,950] Trial 1 finished with value: 0.617610790502921 and parameters: {'num_epochs': 12, 'learning_rate': 6.226884558708537e-05, 'weight_decay': 0.0003525197137930763, 'batch_norm': False, 'drop_out': 0, 'layers': '(768, 3)'}. Best is trial 1 with value: 0.617610790502921.

[A
[A
[A
[A
[A
[A
Predicting: 100%|███████████████████████████████████████████████████| 66/66 [00:00<00:00, 98.92it/s]
[I 2025-06-07 23:48:13,727] Trial 3 finished with value: 0.6086316033671373 and parameters: {'num_epochs': 13, 'learning_rate': 1.8945851210110385e-05, 'weight_decay': 3.061511297386663e-05, 'batch_norm': False, 'drop_out': 0, 'layers': '(768, 3)'}. Best is trial 1 with value: 0.617610790502921.

[A
[A
[A
[A
[A
[A
Predicting: 100%|

[logistic_regression]  best F1=0.6259   Best hyperparameters: {'num_epochs': 11, 'learning_rate': 0.0004934505406247907, 'weight_decay': 0.000722505939193999, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]} bootstrap 95% CI=(0.6097, 0.6423)


In [12]:
lr_results['tfidf_without_augmentation'] = optimize_model(
    model_name = 'logistic_regression',
    X_train    = X_tfidf_reg_tr,
    y_train    = y_tfidf_reg_tr,
    X_val      = X_tfidf_reg_val,
    y_val      = y_tfidf_reg_val
)

[I 2025-06-08 00:02:52,251] A new study created in memory with name: no-name-7e892c70-0c0e-4b50-8af3-c9cc67f3241e

[A
[A
[A
[A
[A
Predicting: 100%|██████████████████████████████████████████████████| 66/66 [00:00<00:00, 109.04it/s]
[I 2025-06-08 00:03:32,689] Trial 1 finished with value: 0.5111391500931575 and parameters: {'num_epochs': 11, 'learning_rate': 9.60732464311274e-05, 'weight_decay': 1.1848029346922025e-05, 'batch_norm': False, 'drop_out': 0, 'layers': '(768, 3)'}. Best is trial 1 with value: 0.5111391500931575.

[A
[A
[A
[A
[A
Predicting: 100%|██████████████████████████████████████████████████| 66/66 [00:00<00:00, 113.11it/s]
[I 2025-06-08 00:03:43,128] Trial 2 finished with value: 0.5203968819697179 and parameters: {'num_epochs': 15, 'learning_rate': 8.1450326966437e-05, 'weight_decay': 0.0009538403184825659, 'batch_norm': False, 'drop_out': 0, 'layers': '(768, 3)'}. Best is trial 2 with value: 0.5203968819697179.

[A
[A
[A
[A
[A
Predicting: 100%|████████████

[logistic_regression]  best F1=0.5574   Best hyperparameters: {'num_epochs': 20, 'learning_rate': 0.0009515211373175522, 'weight_decay': 0.0004098010092465123, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]} bootstrap 95% CI=(0.5394, 0.5748)


In [13]:
lr_results['tfidf_with_augmentation'] = optimize_model(
    model_name = 'logistic_regression',
    X_train    = X_tfidf_aug_tr,
    y_train    = y_tfidf_aug_tr,
    X_val      = X_tfidf_aug_val,
    y_val      = y_tfidf_aug_val
)

[I 2025-06-08 00:16:45,568] A new study created in memory with name: no-name-32943d0d-f5d6-453f-983e-4b4212221f74

[A
[A
[A
[A
[A
Predicting: 100%|██████████████████████████████████████████████████| 66/66 [00:00<00:00, 100.64it/s]
[I 2025-06-08 00:17:29,588] Trial 3 finished with value: 0.5078984477398967 and parameters: {'num_epochs': 6, 'learning_rate': 7.192572859418172e-05, 'weight_decay': 0.0007040738573001243, 'batch_norm': False, 'drop_out': 0, 'layers': '(768, 3)'}. Best is trial 3 with value: 0.5078984477398967.

[A
[A
[A
[A
[A
[A
Predicting: 100%|███████████████████████████████████████████████████| 66/66 [00:00<00:00, 94.84it/s]
[I 2025-06-08 00:18:29,225] Trial 2 finished with value: 0.5457804892099839 and parameters: {'num_epochs': 14, 'learning_rate': 0.0003019887255890387, 'weight_decay': 3.1885942213661405e-05, 'batch_norm': False, 'drop_out': 0, 'layers': '(768, 3)'}. Best is trial 2 with value: 0.5457804892099839.

[A
[A
[A
[A
[A
[A
Predicting: 100%|██

[logistic_regression]  best F1=0.5592   Best hyperparameters: {'num_epochs': 18, 'learning_rate': 0.0009701142553201713, 'weight_decay': 0.0002278112936921285, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]} bootstrap 95% CI=(0.5424, 0.5778)


In [14]:
print("Logistic Regression results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Eval Score': value[1],
        'CI (95%)': value[2]
    }
    for key, value in lr_results.items()
])
display(df)


Logistic Regression results:




Unnamed: 0,Experiment,Best Parameters,Best Eval Score,CI (95%)
0,bert_without_augmentation,"{'num_epochs': 18, 'learning_rate': 0.00099694...",0.626529,"(0.6099079483474952, 0.6433992656279307)"
1,bert_with_augmentation,"{'num_epochs': 11, 'learning_rate': 0.00049345...",0.62588,"(0.6096956792246664, 0.6422602300882342)"
2,tfidf_without_augmentation,"{'num_epochs': 20, 'learning_rate': 0.00095152...",0.557432,"(0.5393799247483949, 0.5747544614311749)"
3,tfidf_with_augmentation,"{'num_epochs': 18, 'learning_rate': 0.00097011...",0.55921,"(0.5423917324073446, 0.5777922807068604)"


## SVM

In [None]:
svm_results = {}
svm_results['bert_without_augmentation'] = ({'C': 0.010119346664428618, 'kernel': 'rbf', 'degree': 2, 'gamma': 'scale'}, 0.6542, (0.6371, 0.6704))
svm_results['bert_with_augmentation'] = ({'C': 2.845045562184184, 'kernel': 'rbf', 'degree': 4, 'gamma': 'scale'}, 0.6544, (0.6371, 0.6704))

In [15]:
svm_results = {}
svm_results['bert_without_augmentation'] = optimize_model(
    model_name = 'svm',
    X_train    = X_bert_reg_tr,
    y_train    = y_bert_reg_tr,
    X_val      = X_bert_reg_val,
    y_val      = y_bert_reg_val,
    n_trials = 100
)

[I 2025-06-08 00:41:40,899] A new study created in memory with name: no-name-b11f4ac4-8282-4d96-adfe-1e2b0dd4bd92

[A

[A[A
[A

[A[A
[A

[A[A
[A
[A

[A[A
[A

[A[A
[A

[A[A
[A

[A[A
[A
Predicting: 100%|█████████████████████████████████████████████████████| 9/9 [01:18<00:00,  8.70s/it]
[I 2025-06-08 00:50:34,557] Trial 3 finished with value: 0.6191269947850221 and parameters: {'C': 0.000347709599322713, 'kernel': 'linear', 'degree': 2, 'gamma': 'scale'}. Best is trial 3 with value: 0.6191269947850221.


[A[A

[A[A

Predicting: 100%|█████████████████████████████████████████████████████| 9/9 [01:29<00:00,  9.94s/it]
[I 2025-06-08 00:50:51,782] Trial 0 finished with value: 0.5857381659860487 and parameters: {'C': 0.07128826637447801, 'kernel': 'sigmoid', 'degree': 4, 'gamma': 'scale'}. Best is trial 3 with value: 0.6191269947850221.

[A
[A
[A
[A
[A
[A
[A
[A
[A
Predicting: 100%|█████████████████████████████████████████████████████| 9/9 [01:58<00:00, 13.15s

[svm]  best F1=0.6542   Best hyperparameters: {'C': 20.6226413694565, 'kernel': 'rbf', 'degree': 2, 'gamma': 'scale', 'class_weight': 'balanced'} bootstrap 95% CI=(0.6371, 0.6704)


In [None]:
svm_results['bert_with_augmentation'] = optimize_model(
    model_name = 'svm',
    X_train    = X_bert_aug_tr,
    y_train    = y_bert_aug_tr,
    X_val      = X_bert_aug_val,
    y_val      = y_bert_aug_val
)

[I 2025-06-08 04:56:37,580] A new study created in memory with name: no-name-b8ccf98e-3c67-47fc-ba04-882a08903aea

[A
[A
[A
[A
[A
[A
[A
[A
[A
Predicting: 100%|█████████████████████████████████████████████████████| 9/9 [02:09<00:00, 14.39s/it]
[I 2025-06-08 05:13:18,072] Trial 3 finished with value: 0.4860881809817471 and parameters: {'C': 0.6270146208823831, 'kernel': 'sigmoid', 'degree': 3, 'gamma': 'scale'}. Best is trial 3 with value: 0.4860881809817471.

[A
[A
[A
[A
[A
[A
[A
[A
[A
Predicting: 100%|█████████████████████████████████████████████████████| 9/9 [02:41<00:00, 17.97s/it]
[I 2025-06-08 05:29:44,034] Trial 0 finished with value: 0.6494779515743686 and parameters: {'C': 12.711265454667013, 'kernel': 'rbf', 'degree': 3, 'gamma': 'scale'}. Best is trial 0 with value: 0.6494779515743686.

[A
[A
[A
[A
[A
[A
[A
[A
[A
Predicting: 100%|█████████████████████████████████████████████████████| 9/9 [06:00<00:00, 40.03s/it]
[I 2025-06-08 05:38:52,940] Trial 1 fi

In [None]:
svm_results['tfidf_without_augmentation'] = optimize_model(
    model_name = 'svm',
    X_train    = X_tfidf_reg_tr,
    y_train    = y_tfidf_reg_tr,
    X_val      = X_tfidf_reg_val,
    y_val      = y_tfidf_reg_val
)

In [None]:
svm_results['tfidf_with_augmentation'] = optimize_model(
    model_name = 'svm',
    X_train    = X_tfidf_aug_tr,
    y_train    = y_tfidf_aug_tr,
    X_val      = X_tfidf_aug_val,
    y_val      = y_tfidf_aug_val
)

In [None]:
print("SVM results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Eval Score': value[1],
        'CI (95%)': value[2]
    }
    for key, value in svm_results.items()
])
display(df)

SVM results:




Unnamed: 0,Experiment,Best Parameters,Best Eval Score,CI (95%)
0,bert_without_augmentation,"{'C': 14.113323810506161, 'kernel': 'rbf', 'de...",0.638179,"(0.6142822431331255, 0.6624198682227005)"


## XGBoost

In [None]:
xgb_results = {}
xgb_results['bert_without_augmentation'] = optimize_model(
    model_name = 'xgboost',
    X_train    = X_bert_reg_tr,
    y_train    = y_bert_reg_tr,
    X_val      = X_bert_reg_val,
    y_val      = y_bert_reg_val,
    n_trials = 100
)

[I 2025-05-15 16:40:23,531] A new study created in memory with name: no-name-d54c6898-92de-4c8c-a120-ed40d3d5c6a4
  0%|          | 0/100 [00:00<?, ?it/s][I 2025-05-15 16:40:43,521] Trial 1 finished with value: 0.5872804061025436 and parameters: {'n_estimators': 129, 'learning_rate': 0.008665749332990566, 'booster': 'gblinear', 'max_depth': 3, 'min_child_weight': 10, 'colsample_bytree': 0.9518770512514789, 'subsample': 0.5930836128270734, 'reg_alpha': 0.00023049735300880039, 'reg_lambda': 0.002832714474683314, 'gamma': 0.33945450614017925}. Best is trial 1 with value: 0.5872804061025436.
  1%|          | 1/100 [00:19<32:59, 19.99s/it][I 2025-05-15 16:41:12,617] Trial 2 finished with value: 0.5971101781226416 and parameters: {'n_estimators': 112, 'learning_rate': 0.02697426543063567, 'booster': 'gbtree', 'max_depth': 3, 'min_child_weight': 2, 'colsample_bytree': 0.8869256703299871, 'subsample': 0.6379367146114844, 'reg_alpha': 7.709908337574803, 'reg_lambda': 1.3138231424348024e-08, 'gam

[xgboost]  best F1=0.6400   Best hyperparameters: {'n_estimators': 97, 'learning_rate': 0.19893565401854227, 'booster': 'gbtree', 'max_depth': 7, 'min_child_weight': 3, 'colsample_bytree': 0.698884472870104, 'subsample': 0.5802363751849828, 'reg_alpha': 1.1264120790978405e-07, 'reg_lambda': 0.7242980309966568, 'gamma': 1.0153525598898357e-05, 'grow_policy': 'depthwise'} bootstrap 95% CI=(0.6136, 0.6633)


In [None]:
xgb_results['bert_with_augmentation'] = optimize_model(
    model_name = 'xgboost',
    X_train    = X_bert_aug_tr,
    y_train    = y_bert_aug_tr,
    X_val      = X_bert_aug_val,
    y_val      = y_bert_aug_val
)

In [None]:
xgb_results['tfidf_without_augmentation'] = optimize_model(
    model_name = 'xgboost',
    X_train    = X_tfidf_reg_tr,
    y_train    = y_tfidf_reg_tr,
    X_val      = X_tfidf_reg_val,
    y_val      = y_tfidf_reg_val
)

[I 2025-05-15 13:45:37,701] A new study created in memory with name: no-name-c2b88e3f-1372-4634-ac26-54b8b04482c5
  0%|          | 0/50 [00:00<?, ?it/s][I 2025-05-15 13:45:54,275] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 146, 'learning_rate': 0.15020861438022654, 'booster': 'gblinear', 'max_depth': 10, 'min_child_weight': 4, 'colsample_bytree': 0.8467262422988333, 'subsample': 0.8104585153371822, 'reg_alpha': 0.009353432542695172, 'reg_lambda': 0.0006814361471557467, 'gamma': 0.00016276417702419104}. Best is trial 0 with value: 0.0.
  2%|▏         | 1/50 [00:16<13:32, 16.57s/it][I 2025-05-15 13:46:00,494] Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 194, 'learning_rate': 0.06725107414171043, 'booster': 'gblinear', 'max_depth': 6, 'min_child_weight': 8, 'colsample_bytree': 0.6337009009790975, 'subsample': 0.5620620904155708, 'reg_alpha': 0.1067107676093894, 'reg_lambda': 0.001083519849260911, 'gamma': 7.114473067716964e-06}. Best is trial 0 

[xgboost]  best F1=0.5769   Best hyperparameters: {'n_estimators': 96, 'learning_rate': 0.13671810205729562, 'booster': 'gbtree', 'max_depth': 10, 'min_child_weight': 9, 'colsample_bytree': 0.9544259373139633, 'subsample': 0.7821265873367009, 'reg_alpha': 1.8675227772962615e-06, 'reg_lambda': 0.0003869335467411198, 'gamma': 0.23432819427336474, 'grow_policy': 'lossguide'} bootstrap 95% CI=(0.5520, 0.6039)


In [None]:
xgb_results['tfidf_with_augmentation'] = optimize_model(
    model_name = 'xgboost',
    X_train    = X_tfidf_aug_tr,
    y_train    = y_tfidf_aug_tr,
    X_val      = X_tfidf_aug_val,
    y_val      = y_tfidf_aug_val
)

In [None]:
print("XGBoost results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Eval Score': value[1],
        'CI (95%)': value[2]
    }
    for key, value in xgb_results.items()
])
display(df)

XGBoost results:




Unnamed: 0,Experiment,Best Parameters,Best Eval Score,CI (95%)
0,bert_without_augmentation,"{'n_estimators': 97, 'learning_rate': 0.198935...",0.63998,"(0.6136302135287747, 0.6633417826405086)"


## DNN

Given the potential a network classification head shown, we'll attempt to optimise it to train-val loss, with adjustments to learning process.

This is a 1-block flow to train / optimize the process for a DNN on the embedding dataset.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

from dataset import TextDataset, EmbeddingDataset
from embedder import Embedder
from classifiers import assess_model, get_dataloader, DNN
from Config.dataset_config import *
from Config.classifiers_config import *

from sklearn.metrics import f1_score

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("[Dataset Status]: Building datasets and dataloaders...")
text_dataset = TextDataset(
    csv_path          = DATA_PATH,
    id_column_idx     = ID_COLUMN_IDX,
    comment_column_idx= COMMENT_COLUMN_IDX,
    label_column_idx  = LABEL_COLUMN_IDX,
    split_column_idx  = SUBSET_COLUMN_IDX,  # TRAIN / VAL / TEST column
    augmented_classes = [],                 # ‑‑ no aug
    augmentation_ratio= 0,
    undersampling_targets = {},             # ‑‑ no undersampling
)

embedding_dataset = EmbeddingDataset(
    text_dataset=text_dataset,
    embedder=Embedder(),
    embedding_method=EMBEDDING_METHOD
)

train_ds = embedding_dataset.get_subset('TRAIN')
val_ds   = embedding_dataset.get_subset('VAL')
test_ds  = embedding_dataset.get_subset('TEST')

train_package = get_dataloader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_package   = get_dataloader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_package  = get_dataloader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# Unpack for sklearn metrics
_, (X_val, y_val)   = val_package
_, (X_test, y_test) = test_package


print("[Model Fit Status]: Training direct DNN...")
model_config = {
        'num_epochs': 30, 
        'learning_rate': 3e-4, 
        'weight_decay': 1e-4, 
        'batch_norm': True, 
        'drop_out': 0.2, 
        'layers': [768, 256, 3]
    }
dnn = DNN(model_config).to(DEVICE)
train_loader, (X_train, y_train) = train_package
val_loader, _                     = val_package

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)
criterion = nn.CrossEntropyLoss(weight=weights_tensor)
optimizer = optim.Adam(dnn.parameters(), lr=model_config["learning_rate"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=2)
best_f1 = -1
best_model_path = os.path.join(CHECKPOINTS, "best_dnn.pt")

train_losses, val_losses, val_f1_scores = [], [], []

for epoch in range(model_config['num_epochs']):
    dnn.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad()
        preds = dnn(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)

    train_loss = total_loss / len(train_loader.dataset)
    train_losses.append(train_loss)

    # Validation
    dnn.eval()
    val_preds, val_labels = [], []
    total_val_loss = 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            preds = dnn(xb)
            loss = criterion(preds, yb)
            total_val_loss += loss.item() * xb.size(0)
            val_preds.extend(preds.argmax(dim=1).cpu().numpy())
            val_labels.extend(yb.cpu().numpy())

    val_loss = total_val_loss / len(val_loader.dataset)
    val_losses.append(val_loss)

    # Macro-F1
    val_f1 = f1_score(np.array(val_labels), np.array(val_preds), average="macro", zero_division=0)
    val_f1_scores.append(val_f1)
    scheduler.step(val_f1)

    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(dnn.state_dict(), best_model_path)
        print(f"Epoch {epoch+1}/{model_config['num_epochs']}: Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | F1: {val_f1:.4f} | ↪️ Saved new best model.")
    else:
        print(f"Epoch {epoch+1}/{model_config['num_epochs']}: Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | F1: {val_f1:.4f}")


print(f"\n[Evaluation on Test Set]")
dnn.eval()
all_preds = []
with torch.no_grad():
    for xb, _ in test_package[0]:
        xb = xb.to(DEVICE)
        preds = dnn(xb)
        all_preds.extend(preds.argmax(dim=1).cpu().numpy())

# Classification report
_, _ = assess_model(all_preds, test_package, valid_labels=[0, 1, 2])

[Dataset Status]: Building datasets and dataloaders...


Cleaning Comments:   0%|          | 0/43214 [00:00<?, ?it/s]

Cleaning Comments: 100%|██████████| 43214/43214 [00:01<00:00, 32242.76it/s]


[TextDataset] rows: train=29690, val=4213, test=8656
[EmbeddingDataset]: Loading precomputed embeddings from Data\cache\distilbert_embeddings_regular.pkl...
[EmbeddingDataset Status]: Embedding generation complete.
[Dataloader Status]: Preparing the dataloader...
[DL] peek batch 0: y[:5] = [2 2 2 2 2]
[DL] EmbeddingDataset ready. X shape (29690, 768), y len 29690
[Dataloader Status]: Preparing the dataloader...
[DL] peek batch 0: y[:5] = [0 1 1 2 1]
[DL] EmbeddingDataset ready. X shape (4213, 768), y len 4213
[Dataloader Status]: Preparing the dataloader...
[DL] peek batch 0: y[:5] = [0 2 2 0 0]
[DL] EmbeddingDataset ready. X shape (8656, 768), y len 8656
[Model Fit Status]: Training direct DNN...
Epoch 1/30: Train Loss: 0.3492 | Val Loss: 0.7068 | F1: 0.7020 | ↪️ Saved new best model.
Epoch 2/30: Train Loss: 0.2872 | Val Loss: 0.8089 | F1: 0.7160 | ↪️ Saved new best model.
Epoch 3/30: Train Loss: 0.2661 | Val Loss: 0.7520 | F1: 0.6909
Epoch 4/30: Train Loss: 0.2513 | Val Loss: 0.7445 