# Imports and Setup

In [3]:
import os, pathlib

# Move from …\Analysis  →  …\Israel-Palestine-Political-Affiliation-Text-Classification
#
# os.chdir() mutates process-wide state, so a plain "chdir to cwd.parent" climbs
# one more level every time this cell is re-executed. Remember the directory the
# kernel was in the first time this cell ran and always move relative to that,
# which makes the cell idempotent within a kernel session.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = pathlib.Path.cwd()
os.chdir(_NOTEBOOK_START_DIR.parent)

print("Now working in:", pathlib.Path.cwd())

Now working in: c:\Users\yonat\CodeProjects\Israel-Palestine-Political-Affiliation-Text-Classification


In [5]:
import optuna

from sklearn.metrics import f1_score
from sklearn.utils import resample
import pandas as pd
from copy import deepcopy
from pathlib import Path

from torch.utils.data import Dataset, DataLoader

import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

from tqdm import tqdm

# Local Code
from classifiers import *
from dataset import EmbeddingDataset, TextDataset
from embedder import Embedder
from Config.dataset_config import *

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\yonat\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\yonat\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yonat\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yonat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Define optimizers

In [6]:
# Helper dataset
class HelperDataset(Dataset):
    """
    Minimal map-style dataset that pairs a feature container with a label container.

    Parameters
    ----------
    X : indexable, sized
        Features; ``X[i]`` is returned as the first element of item ``i``.
    y : indexable
        Labels; ``y[i]`` is returned as the second element of item ``i``.
    """

    def __init__(self, X, y):
        # Zero-argument super() initialises the Dataset base class on *this*
        # instance. The previous one-argument form, super(HelperDataset), only
        # built an unbound super object and never reached Dataset.__init__.
        super().__init__()
        self.X = X
        self.y = y

    def __len__(self):
        # The dataset length is defined by the feature container alone.
        return len(self.X)

    def __getitem__(self, item):
        # Returns a (features, label) pair for the requested index.
        return self.X[item], self.y[item]
    
# Custom tqdm callback
class TqdmCallback:
    """
    Callable that advances a tqdm progress bar by one step each time it is invoked.

    Intended to be passed in the ``callbacks`` list of ``study.optimize`` (see
    ``optimize_model``), which calls it with ``(study, trial)``.
    """

    def __init__(self, n_trials):
        # The bar's total is the number of trials requested for the study.
        self.pbar = tqdm(total=n_trials)

    def __call__(self, study, trial):
        # Both arguments are ignored; only the tick matters.
        self.pbar.update(1)

    def close(self):
        """Close the underlying progress bar."""
        self.pbar.close()

In [16]:
# ------------------------------------------------------------------
# 0.  Utilities and Hyperparameters Space
# ------------------------------------------------------------------

# Search space per model, consumed by suggest_hyperparameters().
# Each entry is one of:
#   (choices, 'categorical')                  -> trial.suggest_categorical
#   (low, high, 'loguniform' | 'uniform')     -> trial.suggest_float (log-scaled for 'loguniform')
#   (low, high, 'int')                        -> trial.suggest_int
model_hyperparameters = {
    'logistic_regression': {
        'num_epochs': (5, 20, 'int'),
        'learning_rate': (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform'),
        # The three entries below have a single possible value each; objective()
        # additionally overrides them for this model after sampling.
        "batch_norm": ([False], 'categorical'),    # Do not modify in optimization
        "drop_out": (0, 0, 'int'),    # Do not modify in optimization
        "layers": ([[768, 3]], 'categorical') # Do not modify in optimization
    },
    'svm': {
        'C': (1e-4, 1e2, 'loguniform'),
        'kernel': (['linear', 'rbf', 'sigmoid'], 'categorical'),
        # NOTE(review): 'degree' is conventionally a polynomial-kernel setting and
        # no polynomial kernel is among the choices above — confirm it has any effect.
        'degree': (2, 4, 'int'),
        'gamma': (['scale', 'auto'], 'categorical')
    },
    'xgboost': {
        'n_estimators': (50, 200, 'int'),
        'learning_rate': (1e-3, 0.2, 'loguniform'),
        'booster': (['gbtree', 'gblinear', 'dart'], 'categorical'),
        # max_depth, min_child_weight, colsample_bytree, subsample and gamma are
        # suggested again, with the same names and ranges, inside objective() for
        # the 'gbtree' and 'dart' boosters.
        # NOTE(review): they are also sampled here when booster == 'gblinear' —
        # confirm whether the classifier ignores them in that case.
        'max_depth': (3, 10, 'int'),
        'min_child_weight': (2, 10, 'int'),
        'colsample_bytree': (0.5, 1.0, 'uniform'),
        'subsample': (0.5, 1.0, 'uniform'),
        'reg_alpha': (1e-8, 10.0, 'loguniform'),
        'reg_lambda': (1e-8, 10.0, 'loguniform'),
        'gamma': (1e-8, 1.0, 'loguniform')
    },
    'dnn': {
        "num_epochs": (5, 20, 'int'),  # Adjust after trial and error
        "learning_rate": (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform'),
        "batch_norm": ([True, False], 'categorical'),
        "drop_out": (0.0, 0.5, 'uniform'),
        # Every candidate starts at 768 and ends at 3 (input and output widths).
        "layers": ([[768, 64, 3],
                    [768, 128, 3],
                    [768, 64, 64, 3],
                    [768, 128, 64, 3],
                    [768, 512, 32, 3],
                    [768, 512, 128, 3],
                    [768, 512, 128, 64, 3]], 'categorical')  # Layer dimensions, including an input and output layer.
    }
}

def _build_estimator(model_name: str, params: dict) -> Classifier:
    """
    Return a newly constructed ``Classifier`` for the given model type.

    A new object is built on every call, so no fitted state is shared
    between trials.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the four supported model types.
    """
    supported_models = {"logistic_regression", "svm", "xgboost", "dnn"}
    if model_name in supported_models:
        return Classifier(params, model_type=model_name, log=False)
    raise ValueError(f"Unsupported model: {model_name}")

def macro_f1_01(y_true, y_pred_or_proba, threshold_0=0.5, threshold_1=0.5):
    """
    Macro-averaged F1 restricted to labels 0 and 1 (label 2 is left out of the average).

    ``y_pred_or_proba`` may be either:
      * a 1-D array of hard labels (cast to int), or
      * a 2-D array of per-class scores, converted to hard labels by
        - argmax over axis 1 when both thresholds are exactly 0.5, or
        - thresholding otherwise: start from label 2, assign 0 where column 0
          reaches ``threshold_0``, then assign 1 where column 1 reaches
          ``threshold_1`` (so label 1 wins when both conditions hold).

    NOTE(review): ``np`` is not imported explicitly in this notebook; it is
    presumably supplied by one of the star imports — confirm.
    """
    targets = np.asarray(y_true)
    scores = np.asarray(y_pred_or_proba)

    if scores.ndim != 2:
        # Already hard labels.
        hard_labels = scores.astype(int)
    elif threshold_0 == 0.5 and threshold_1 == 0.5:
        # Default cut-offs: take the highest-scoring class.
        hard_labels = scores.argmax(axis=1)
    else:
        # Custom cut-offs; the assignment order below decides ties (1 overrides 0).
        hard_labels = np.full(len(scores), 2, dtype=int)
        hard_labels[scores[:, 0] >= threshold_0] = 0
        hard_labels[scores[:, 1] >= threshold_1] = 1

    return f1_score(
        targets,
        hard_labels,
        labels=[0, 1],
        average="macro",
        zero_division=0,
    )

def _bootstrap_ci(y_true, y_pred, n_iter: int = 1000, alpha: float = .05,
                  random_state=None):
    """
    Percentile bootstrap confidence interval around ``macro_f1_01``.

    Rows are resampled with replacement ``n_iter`` times; the score is
    recomputed on each resample and the ``alpha/2`` and ``1 - alpha/2``
    percentiles of those scores are returned.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground truth and predictions; must have the same length.
    n_iter : int
        Number of bootstrap resamples.
    alpha : float
        Two-sided significance level (0.05 gives a 95% interval).
    random_state : int, numpy.random.RandomState or None
        Seed (or generator) for the resampling. ``None`` keeps the previous
        behaviour of drawing from the global NumPy random state, so existing
        callers are unaffected; pass an int to make the interval reproducible.

    Returns
    -------
    (float, float)
        Lower and upper bound of the interval.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    """
    y_true = np.asarray(y_true)  # <-- safe cast
    y_pred = np.asarray(y_pred)
    n = len(y_true)
    if len(y_pred) != n:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {n} and {len(y_pred)}")

    # One generator shared by all iterations: seeding each resample() call with
    # the same int would produce n_iter identical draws.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    row_ids = np.arange(n)  # loop-invariant, built once
    stats = []
    for _ in range(n_iter):
        idx = resample(row_ids, replace=True, n_samples=n, random_state=rng)
        stats.append(macro_f1_01(y_true[idx], y_pred[idx]))

    lower, upper = np.percentile(stats, [100*alpha/2, 100*(1-alpha/2)])
    return float(lower), float(upper)

def _score_on_validation(estimator: Classifier,
                         X_tr, y_tr, X_val, y_val, return_ci: bool = False):
    """
    Train ``estimator`` on the TRAIN arrays and score it on the VAL arrays.

    ``fit`` and ``predict`` each receive a ``(loader, (X, y))`` pair, i.e. both
    a batched DataLoader view and the raw arrays.
    (presumably so that either a PyTorch or a scikit-learn style head can pick
    the form it needs — verify against Classifier.)

    Returns the ``macro_f1_01`` score alone, or ``(score, ci_low, ci_high)``
    when ``return_ci`` is true.
    """
    def _make_loader(features, labels, shuffle):
        # HelperDataset simply pairs features[i] with labels[i].
        return DataLoader(HelperDataset(features, labels),
                          batch_size=BATCH_SIZE, shuffle=shuffle)

    # Only the training loader is shuffled.
    train_loader = _make_loader(X_tr, y_tr, True)
    val_loader = _make_loader(X_val, y_val, False)

    estimator.fit((train_loader, (X_tr, y_tr)))
    preds = estimator.predict((val_loader, (X_val, y_val)))

    score = macro_f1_01(y_val, preds)
    if return_ci:
        ci_low, ci_high = _bootstrap_ci(y_val, preds)
        return score, ci_low, ci_high
    return score

# ------------------------------------------------------------------
# 1.  Optuna — hyper‑param suggestion helper
# ------------------------------------------------------------------
def suggest_hyperparameters(trial, hp_space):
    """
    Sample one value per hyper-parameter from ``hp_space`` using ``trial``.

    ``hp_space`` is one of the inner dicts of ``model_hyperparameters``. Each
    value is either ``(choices, "categorical")`` or ``(low, high, kind)`` with
    ``kind`` in ``{"loguniform", "uniform", "int", "categorical"}``; for a
    three-element categorical spec the first element holds the choices.

    Returns a dict mapping each key to its sampled value.

    Raises
    ------
    ValueError
        For a spec of the wrong shape or an unrecognised ``kind``.
    """
    # kind -> sampler for the three-element form
    samplers = {
        "loguniform": lambda name, lo, hi: trial.suggest_float(name, lo, hi, log=True),
        "uniform": lambda name, lo, hi: trial.suggest_float(name, lo, hi),
        "int": lambda name, lo, hi: trial.suggest_int(name, lo, hi),
        "categorical": lambda name, lo, hi: trial.suggest_categorical(name, lo),
    }

    sampled = {}
    for name, spec in hp_space.items():
        arity = len(spec)

        # Two-element form: (choices, "categorical")
        if arity == 2 and spec[1] == "categorical":
            sampled[name] = trial.suggest_categorical(name, spec[0])
            continue

        if arity != 3:
            raise ValueError(f"Malformed spec for {name}: {spec}")

        lo, hi, kind = spec
        sampler = samplers.get(kind) if isinstance(kind, str) else None
        if sampler is None:
            raise ValueError(f"Unknown param type: {kind}")
        sampled[name] = sampler(name, lo, hi)

    return sampled


# ------------------------------------------------------------------
# 2.  Optuna objective  (no CV — uses VAL split)
# ------------------------------------------------------------------
def objective(trial, model_name: str, X_train, y_train, X_val,   y_val) -> float:
    """
    Score a single trial: sample hyper-parameters, build a new estimator,
    fit it on TRAIN and return its ``macro_f1_01`` on VAL.
    """
    # Base search space for this model
    params = suggest_hyperparameters(trial, model_hyperparameters[model_name])

    # logistic_regression is run as a single-layer network: pin the
    # architecture-related settings regardless of what was sampled.
    if model_name == "logistic_regression":
        params["batch_norm"] = False
        params["drop_out"] = 0.0
        params["layers"] = [768, 3]

    # Extra settings for the tree-based XGBoost boosters.
    tree_booster = model_name == "xgboost" and params["booster"] in {"gbtree", "dart"}
    if tree_booster:
        params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
        # These five names are also part of the base xgboost space, with the same ranges.
        params["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.5, 1.0)
        params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        params["max_depth"] = trial.suggest_int("max_depth", 3, 10)
        params["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        params["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)

        # dart-only settings
        if params["booster"] == "dart":
            params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
            params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
            params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
            params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Build, fit, score
    estimator = _build_estimator(model_name, params)
    return _score_on_validation(estimator, X_train, y_train, X_val, y_val, return_ci=False)


# ------------------------------------------------------------------
# 3.  Optimiser wrapper  (no CV, keeps history)
# ------------------------------------------------------------------
def optimize_model(model_name: str, X_train, y_train, X_val,   y_val, n_trials: int = 50, timeout: int  = 36000):
    """
    Tune hyper‑parameters by maximising macro‑F1 (0+1 classes) on the VAL set,
    then refit the best configuration once and report a bootstrap CI.

    Parameters
    ----------
    model_name : str
        Key of ``model_hyperparameters`` / model type passed to the classifier.
    X_train, y_train, X_val, y_val
        TRAIN and VAL features/labels, forwarded unchanged to ``objective``
        and ``_score_on_validation``.
    n_trials : int
        Maximum number of Optuna trials.
    timeout : int
        Time budget passed to ``study.optimize``.

    Returns
    -------
    (best_params, best_score, (ci_low, ci_high))
        ``best_params`` is ``study.best_params``. ``best_score`` is the VAL
        score of a *fresh refit* with those parameters, not ``study.best_value``;
        the two can differ between runs (the recorded output of this notebook
        shows a refit score below an individual trial's score).
        ``(ci_low, ci_high)`` is the bootstrap interval of that refit.
    """
    def _optuna_obj(trial):
        return objective(trial, model_name, X_train, y_train, X_val, y_val)

    study = optuna.create_study(direction="maximize")

    # Always release the progress bar, including when the study stops early
    # (timeout, exception, manual interrupt); otherwise the bar stays open.
    progress = TqdmCallback(n_trials)
    try:
        study.optimize(_optuna_obj, n_trials=n_trials, timeout=timeout,
                       callbacks=[progress], n_jobs=-1)  # Uses all available cores
    finally:
        progress.close()

    best_params = study.best_params

    # ------------------------------------------------------------------
    # refit *best* on full TRAIN and compute bootstrap CI --------------
    # ------------------------------------------------------------------
    best_est = _build_estimator(model_name, best_params)

    best_score, ci_low, ci_high = _score_on_validation(
            best_est, X_train, y_train, X_val, y_val, return_ci=True)

    print(f"[{model_name}]  best F1={best_score:.4f}   "
            f"Best hyperparameters: {best_params}",
            f"bootstrap 95% CI=({ci_low:.4f}, {ci_high:.4f})")

    return best_params, best_score, (ci_low, ci_high)

# Optimize models

## Create Datasets

In [12]:
'''
Create 4 EmbeddingDataset objects from the 2 TextDataset variants built in this cell:
regular dataset - with DistilBERT embeddings or TF-IDF,
augmented dataset - with DistilBERT embeddings or TF-IDF.
No undersampled variant is built here (undersampling_targets is empty for both text datasets).
'''

# ---------------------------------------------
# 1.  Build the text‑level datasets (regular and augmented)
# ---------------------------------------------
# Both variants read the same CSV and column indices; they differ only in the
# augmentation arguments.
txt_regular = TextDataset(
    csv_path          = DATA_PATH,
    id_column_idx     = ID_COLUMN_IDX,
    comment_column_idx= COMMENT_COLUMN_IDX,
    label_column_idx  = LABEL_COLUMN_IDX,
    split_column_idx  = SUBSET_COLUMN_IDX,  # TRAIN / VAL / TEST column
    augmented_classes = [],                 # ‑‑ no aug
    augmentation_ratio= 0,
    undersampling_targets = {},             # ‑‑ no undersampling
)

txt_augmented = TextDataset(
    csv_path          = DATA_PATH,
    id_column_idx     = ID_COLUMN_IDX,
    comment_column_idx= COMMENT_COLUMN_IDX,
    label_column_idx  = LABEL_COLUMN_IDX,
    split_column_idx  = SUBSET_COLUMN_IDX,
    augmented_classes = AUGMENTED_CLASSES,           # e.g. ["Pro‑Israel"]
    augmentation_ratio= 3,                            # add 3 extra copies (TODO confirm exact semantics in TextDataset)
    undersampling_targets = {},                       # keep original class sizes
)


# ---------------------------------------------
# 2.  Wrap each text set in 1‑EmbeddingDataset
# ---------------------------------------------

EMB_METHODS = ["distilbert", "tf-idf"]    
embedder    = Embedder()                  # your wrapper around HF / TF‑IDF

embedding_sets: dict[str, dict[str, EmbeddingDataset]] = {}   # {method : {variant : EmbeddingDataset}}

# One EmbeddingDataset per (method, variant) pair: 2 methods x 2 variants = 4 in total.
# The same Embedder instance is shared by all of them.
for method in EMB_METHODS:
    embedding_sets[method] = {
        "regular"      : EmbeddingDataset(txt_regular,      embedder, method),
        "augmented"    : EmbeddingDataset(txt_augmented,    embedder, method),
    }

# ---------------------------------------------
# 3.  Fetch TRAIN and VAL tensors for each dataset
# ---------------------------------------------
# Each get_subset(split)[:] is star-unpacked into two names, so it must yield
# exactly two items — presumably (features, labels); verify against EmbeddingDataset.
(X_bert_reg_tr ,  y_bert_reg_tr ,
 X_bert_reg_val,  y_bert_reg_val) = (
     *embedding_sets["distilbert"]["regular"].get_subset("TRAIN")[:],
     *embedding_sets["distilbert"]["regular"].get_subset("VAL")[:] )

(X_bert_aug_tr ,  y_bert_aug_tr ,
 X_bert_aug_val,  y_bert_aug_val) = (
     *embedding_sets["distilbert"]["augmented"].get_subset("TRAIN")[:],
     *embedding_sets["distilbert"]["augmented"].get_subset("VAL")[:] )

(X_tfidf_reg_tr , y_tfidf_reg_tr ,
 X_tfidf_reg_val, y_tfidf_reg_val) = (
     *embedding_sets["tf-idf"]["regular"].get_subset("TRAIN")[:],
     *embedding_sets["tf-idf"]["regular"].get_subset("VAL")[:] )

(X_tfidf_aug_tr , y_tfidf_aug_tr ,
 X_tfidf_aug_val, y_tfidf_aug_val) = (
     *embedding_sets["tf-idf"]["augmented"].get_subset("TRAIN")[:],
     *embedding_sets["tf-idf"]["augmented"].get_subset("VAL")[:] )

Cleaning Comments: 100%|██████████| 43214/43214 [00:01<00:00, 32488.04it/s]


[TextDataset] rows: train=29690, val=4213, test=8656


Cleaning Comments: 100%|██████████| 43214/43214 [00:01<00:00, 29870.38it/s]
Augment: 100%|██████████| 29690/29690 [01:22<00:00, 361.60row/s]


[TextDataset] rows: train=55997, val=4213, test=8656
[EmbeddingDataset]: Loading precomputed embeddings from Data\cache\distilbert_embeddings_regular.pkl...
[EmbeddingDataset Status]: Embedding generation complete.
[EmbeddingDataset]: Loading precomputed embeddings from Data\cache\distilbert_embeddings_augmented.pkl...
[EmbeddingDataset Status]: Embedding generation complete.
[EmbeddingDataset]: Loading precomputed embeddings from Data\cache\tf-idf_embeddings_regular.pkl...
[EmbeddingDataset Status]: Embedding generation complete.
[EmbeddingDataset]: Loading precomputed embeddings from Data\cache\tf-idf_embeddings_augmented.pkl...
[EmbeddingDataset Status]: Embedding generation complete.


## Logistic Regression

In [17]:
# Results per experiment: name -> (best_params, best_score, (ci_low, ci_high)),
# i.e. the tuple returned by optimize_model.
lr_results = {}
# Logistic regression on DistilBERT embeddings of the regular (non-augmented) dataset.
lr_results['bert_without_augmentation'] = optimize_model(
    model_name = 'logistic_regression',
    X_train    = X_bert_reg_tr,
    y_train    = y_bert_reg_tr,
    X_val      = X_bert_reg_val,
    y_val      = y_bert_reg_val
)

[I 2025-05-12 22:25:51,645] A new study created in memory with name: no-name-2b2e39e7-29a3-4c9b-ae51-5c87b94bd426
[I 2025-05-12 22:26:24,319] Trial 6 finished with value: 0.6636744290539494 and parameters: {'num_epochs': 5, 'learning_rate': 2.3244834262724793e-05, 'weight_decay': 0.00011284680298463986, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 6 with value: 0.6636744290539494.
[I 2025-05-12 22:26:24,562] Trial 1 finished with value: 0.6632723395547706 and parameters: {'num_epochs': 5, 'learning_rate': 0.00021143238989878698, 'weight_decay': 0.00039874348755810333, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 6 with value: 0.6636744290539494.
[I 2025-05-12 22:26:48,311] Trial 2 finished with value: 0.6645542562555011 and parameters: {'num_epochs': 8, 'learning_rate': 0.0002118323298295744, 'weight_decay': 0.00043952391222109807, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 2 with value: 0.6645542562555011.

[logistic_regression]  best F1=0.6623   Best hyperparameters: {'num_epochs': 20, 'learning_rate': 7.072422239087199e-05, 'weight_decay': 1.370266901333882e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]} bootstrap 95% CI=(0.6386, 0.6857)


In [18]:
# Logistic regression on DistilBERT embeddings of the augmented dataset.
lr_results['bert_with_augmentation'] = optimize_model(
    model_name = 'logistic_regression',
    X_train    = X_bert_aug_tr,
    y_train    = y_bert_aug_tr,
    X_val      = X_bert_aug_val,
    y_val      = y_bert_aug_val
)

[I 2025-05-12 22:45:19,746] A new study created in memory with name: no-name-3e6e0a50-b0f8-48ec-962a-a537a037bf70
[I 2025-05-12 22:46:31,760] Trial 0 finished with value: 0.6492790780112611 and parameters: {'num_epochs': 6, 'learning_rate': 8.895593738237241e-05, 'weight_decay': 0.00042546913203011963, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 0 with value: 0.6492790780112611.
[I 2025-05-12 22:46:43,316] Trial 3 finished with value: 0.6438853714082664 and parameters: {'num_epochs': 7, 'learning_rate': 2.4227205640297424e-05, 'weight_decay': 1.908356179568374e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 0 with value: 0.6492790780112611.
[I 2025-05-12 22:46:58,045] Trial 5 finished with value: 0.6506080164157404 and parameters: {'num_epochs': 8, 'learning_rate': 0.0007818778796804584, 'weight_decay': 4.928672146015131e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 5 with value: 0.6506080164157404.
[I

[logistic_regression]  best F1=0.6507   Best hyperparameters: {'num_epochs': 14, 'learning_rate': 0.0004691507832634235, 'weight_decay': 0.00010067288184001171, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]} bootstrap 95% CI=(0.6274, 0.6732)


In [19]:
# Logistic regression on TF-IDF embeddings of the regular (non-augmented) dataset.
lr_results['tfidf_without_augmentation'] = optimize_model(
    model_name = 'logistic_regression',
    X_train    = X_tfidf_reg_tr,
    y_train    = y_tfidf_reg_tr,
    X_val      = X_tfidf_reg_val,
    y_val      = y_tfidf_reg_val
)

[I 2025-05-12 23:08:58,496] A new study created in memory with name: no-name-fb1576ed-70c3-4ce7-bb32-4041b77993c1
[I 2025-05-12 23:09:37,823] Trial 3 finished with value: 0.0 and parameters: {'num_epochs': 5, 'learning_rate': 0.00023836275095275584, 'weight_decay': 0.00017763655904371595, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 3 with value: 0.0.
[I 2025-05-12 23:09:45,382] Trial 4 finished with value: 0.00267379679144385 and parameters: {'num_epochs': 6, 'learning_rate': 0.0005565300159070525, 'weight_decay': 0.00021775065562093223, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 4 with value: 0.00267379679144385.
[I 2025-05-12 23:09:54,295] Trial 6 finished with value: 0.07151867337538091 and parameters: {'num_epochs': 7, 'learning_rate': 0.0009030822130961161, 'weight_decay': 1.43193176499559e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 6 with value: 0.07151867337538091.
[I 2025-05-12 23:10:02,457] 

[logistic_regression]  best F1=0.4285   Best hyperparameters: {'num_epochs': 20, 'learning_rate': 0.0009949133699727482, 'weight_decay': 2.354038834496562e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]} bootstrap 95% CI=(0.3981, 0.4579)


In [20]:
# Logistic regression on TF-IDF embeddings of the augmented dataset.
lr_results['tfidf_with_augmentation'] = optimize_model(
    model_name = 'logistic_regression',
    X_train    = X_tfidf_aug_tr,
    y_train    = y_tfidf_aug_tr,
    X_val      = X_tfidf_aug_val,
    y_val      = y_tfidf_aug_val
)

[I 2025-05-12 23:21:51,569] A new study created in memory with name: no-name-f012e611-c634-45fd-8cdc-c345013c6d84
[I 2025-05-12 23:23:20,977] Trial 4 finished with value: 0.36232218409800815 and parameters: {'num_epochs': 6, 'learning_rate': 2.9020331236847436e-05, 'weight_decay': 3.3968562723958635e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 4 with value: 0.36232218409800815.
[I 2025-05-12 23:23:54,855] Trial 7 finished with value: 0.3303105471612992 and parameters: {'num_epochs': 8, 'learning_rate': 6.6243977850527e-05, 'weight_decay': 0.00025688714479494683, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 4 with value: 0.36232218409800815.
[I 2025-05-12 23:24:38,337] Trial 6 finished with value: 0.5125862426866853 and parameters: {'num_epochs': 10, 'learning_rate': 0.0002474855103033042, 'weight_decay': 0.0005950289576990787, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]}. Best is trial 6 with value: 0.5125862426866853.

[logistic_regression]  best F1=0.5872   Best hyperparameters: {'num_epochs': 20, 'learning_rate': 0.0009561003431562942, 'weight_decay': 5.296415252083821e-05, 'batch_norm': False, 'drop_out': 0, 'layers': [768, 3]} bootstrap 95% CI=(0.5613, 0.6123)


In [21]:
# Summarise the logistic-regression experiments: one row per experiment.
print("Logistic Regression results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': name,
        'Best Parameters': best_params,
        'Best Eval Score': best_score,
        'CI (95%)': ci,
    }
    # each stored value is the (best_params, best_score, ci) tuple from optimize_model
    for name, (best_params, best_score, ci) in lr_results.items()
])
display(df)


Logistic Regression results:




Unnamed: 0,Experiment,Best Parameters,Best Eval Score,CI (95%)
0,bert_without_augmentation,"{'num_epochs': 20, 'learning_rate': 7.07242223...",0.662295,"(0.6386457000158067, 0.6856731794208792)"
1,bert_with_augmentation,"{'num_epochs': 14, 'learning_rate': 0.00046915...",0.650653,"(0.6274250285939126, 0.6731757610483733)"
2,tfidf_without_augmentation,"{'num_epochs': 20, 'learning_rate': 0.00099491...",0.428512,"(0.3980757046692924, 0.4578570930456449)"
3,tfidf_with_augmentation,"{'num_epochs': 20, 'learning_rate': 0.00095610...",0.587153,"(0.5613056836817851, 0.6122785387556879)"


## SVM

In [22]:
# Results per experiment: name -> (best_params, best_score, (ci_low, ci_high)),
# i.e. the tuple returned by optimize_model.
svm_results = {}
# SVM on DistilBERT embeddings of the regular (non-augmented) dataset.
svm_results['bert_without_augmentation'] = optimize_model(
    model_name = 'svm',
    X_train    = X_bert_reg_tr,
    y_train    = y_bert_reg_tr,
    X_val      = X_bert_reg_val,
    y_val      = y_bert_reg_val
)

[I 2025-05-12 23:48:52,839] A new study created in memory with name: no-name-ea057419-1289-4275-bdf1-4093614a8ff4
  0%|          | 0/50 [00:00<?, ?it/s][I 2025-05-12 23:53:56,958] Trial 7 finished with value: 0.6406549987134573 and parameters: {'C': 0.24306597401122937, 'kernel': 'sigmoid', 'degree': 4, 'gamma': 'scale'}. Best is trial 7 with value: 0.6406549987134573.
  2%|▏         | 1/50 [05:04<4:08:21, 304.12s/it][I 2025-05-12 23:54:20,568] Trial 0 finished with value: 0.6637793233199722 and parameters: {'C': 0.8763290466081766, 'kernel': 'rbf', 'degree': 4, 'gamma': 'scale'}. Best is trial 0 with value: 0.6637793233199722.
  4%|▍         | 2/50 [05:27<1:51:17, 139.11s/it][I 2025-05-12 23:54:23,628] Trial 3 finished with value: 0.6636277147504424 and parameters: {'C': 0.3540085156197167, 'kernel': 'rbf', 'degree': 3, 'gamma': 'scale'}. Best is trial 0 with value: 0.6637793233199722.
  6%|▌         | 3/50 [05:30<1:00:18, 76.99s/it] [I 2025-05-12 23:54:42,671] Trial 2 finished with v

[svm]  best F1=0.6717   Best hyperparameters: {'C': 0.009484124244989617, 'kernel': 'rbf', 'degree': 3, 'gamma': 'scale'} bootstrap 95% CI=(0.6488, 0.6926)


In [None]:
# SVM on DistilBERT embeddings of the augmented dataset.
svm_results['bert_with_augmentation'] = optimize_model(
    model_name = 'svm',
    X_train    = X_bert_aug_tr,
    y_train    = y_bert_aug_tr,
    X_val      = X_bert_aug_val,
    y_val      = y_bert_aug_val
)

[I 2025-05-13 00:43:52,857] A new study created in memory with name: no-name-a0dc51ef-c2f4-4dde-b6b6-fa621f99d0bf
  0%|          | 0/50 [00:00<?, ?it/s][I 2025-05-13 01:14:15,757] Trial 4 finished with value: 0.6118128547134633 and parameters: {'C': 0.1678928619689755, 'kernel': 'sigmoid', 'degree': 3, 'gamma': 'scale'}. Best is trial 4 with value: 0.6118128547134633.
  2%|▏         | 1/50 [30:22<24:48:42, 1822.91s/it][I 2025-05-13 01:15:40,915] Trial 3 finished with value: 0.6474979491386382 and parameters: {'C': 0.18563177296710018, 'kernel': 'rbf', 'degree': 3, 'gamma': 'auto'}. Best is trial 3 with value: 0.6474979491386382.
  4%|▍         | 2/50 [31:48<10:40:34, 800.71s/it] [I 2025-05-13 01:17:37,964] Trial 0 finished with value: 0.6403950106510041 and parameters: {'C': 0.051296761334011266, 'kernel': 'sigmoid', 'degree': 3, 'gamma': 'scale'}. Best is trial 3 with value: 0.6474979491386382.
  6%|▌         | 3/50 [33:45<6:22:40, 488.53s/it] [I 2025-05-13 01:18:20,678] Trial 1 finis

In [None]:
# SVM on TF-IDF embeddings of the regular (non-augmented) dataset.
svm_results['tfidf_without_augmentation'] = optimize_model(
    model_name = 'svm',
    X_train    = X_tfidf_reg_tr,
    y_train    = y_tfidf_reg_tr,
    X_val      = X_tfidf_reg_val,
    y_val      = y_tfidf_reg_val
)

In [None]:
# SVM on TF-IDF embeddings of the augmented dataset.
svm_results['tfidf_with_augmentation'] = optimize_model(
    model_name = 'svm',
    X_train    = X_tfidf_aug_tr,
    y_train    = y_tfidf_aug_tr,
    X_val      = X_tfidf_aug_val,
    y_val      = y_tfidf_aug_val
)

In [None]:
# Summarise the SVM experiments: one row per experiment.
print("SVM results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': name,
        'Best Parameters': best_params,
        'Best Eval Score': best_score,
        'CI (95%)': ci,
    }
    # each stored value is the (best_params, best_score, ci) tuple from optimize_model
    for name, (best_params, best_score, ci) in svm_results.items()
])
display(df)

## XGBoost

In [None]:
# Results per experiment: name -> (best_params, best_score, (ci_low, ci_high)),
# i.e. the tuple returned by optimize_model.
xgb_results = {}
# XGBoost on DistilBERT embeddings of the regular (non-augmented) dataset.
xgb_results['bert_without_augmentation'] = optimize_model(
    model_name = 'xgboost',
    X_train    = X_bert_reg_tr,
    y_train    = y_bert_reg_tr,
    X_val      = X_bert_reg_val,
    y_val      = y_bert_reg_val
)

In [None]:
# XGBoost on DistilBERT embeddings of the augmented dataset.
xgb_results['bert_with_augmentation'] = optimize_model(
    model_name = 'xgboost',
    X_train    = X_bert_aug_tr,
    y_train    = y_bert_aug_tr,
    X_val      = X_bert_aug_val,
    y_val      = y_bert_aug_val
)

In [None]:
# XGBoost on TF-IDF embeddings of the regular (non-augmented) dataset.
xgb_results['tfidf_without_augmentation'] = optimize_model(
    model_name = 'xgboost',
    X_train    = X_tfidf_reg_tr,
    y_train    = y_tfidf_reg_tr,
    X_val      = X_tfidf_reg_val,
    y_val      = y_tfidf_reg_val
)

In [None]:
# XGBoost on TF-IDF embeddings of the augmented dataset.
xgb_results['tfidf_with_augmentation'] = optimize_model(
    model_name = 'xgboost',
    X_train    = X_tfidf_aug_tr,
    y_train    = y_tfidf_aug_tr,
    X_val      = X_tfidf_aug_val,
    y_val      = y_tfidf_aug_val
)

In [None]:
# Summarise the XGBoost experiments: one row per experiment.
print("XGBoost results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': name,
        'Best Parameters': best_params,
        'Best Eval Score': best_score,
        'CI (95%)': ci,
    }
    # each stored value is the (best_params, best_score, ci) tuple from optimize_model
    for name, (best_params, best_score, ci) in xgb_results.items()
])
display(df)

## DNN

In [None]:
# Results per experiment: name -> (best_params, best_score, (ci_low, ci_high)),
# i.e. the tuple returned by optimize_model.
dnn_results = {}
# DNN on DistilBERT embeddings of the regular (non-augmented) dataset.
dnn_results['bert_without_augmentation'] = optimize_model(
    model_name = 'dnn',
    X_train    = X_bert_reg_tr,
    y_train    = y_bert_reg_tr,
    X_val      = X_bert_reg_val,
    y_val      = y_bert_reg_val
)

In [None]:
# DNN on DistilBERT embeddings of the augmented dataset.
dnn_results['bert_with_augmentation'] = optimize_model(
    model_name = 'dnn',
    X_train    = X_bert_aug_tr,
    y_train    = y_bert_aug_tr,
    X_val      = X_bert_aug_val,
    y_val      = y_bert_aug_val
)

In [None]:
# DNN on TF-IDF embeddings of the regular (non-augmented) dataset.
# NOTE(review): every 'dnn' layer spec starts at width 768 — confirm the TF-IDF
# features have that dimensionality.
dnn_results['tfidf_without_augmentation'] = optimize_model(
    model_name = 'dnn',
    X_train    = X_tfidf_reg_tr,
    y_train    = y_tfidf_reg_tr,
    X_val      = X_tfidf_reg_val,
    y_val      = y_tfidf_reg_val
)

In [None]:
# DNN on TF-IDF embeddings of the augmented dataset.
dnn_results['tfidf_with_augmentation'] = optimize_model(
    model_name = 'dnn',
    X_train    = X_tfidf_aug_tr,
    y_train    = y_tfidf_aug_tr,
    X_val      = X_tfidf_aug_val,
    y_val      = y_tfidf_aug_val
)

In [None]:
# Summarise the DNN experiments: one row per experiment.
print("DNN results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': name,
        'Best Parameters': best_params,
        'Best Eval Score': best_score,
        'CI (95%)': ci,
    }
    # each stored value is the (best_params, best_score, ci) tuple from optimize_model
    for name, (best_params, best_score, ci) in dnn_results.items()
])
display(df)