# Imports and Setup

In [None]:
import os
# Print the starting working directory, switch to the project folder, then
# print again to confirm the change took effect.
# NOTE(review): the path is absolute and machine-specific; presumably the
# project-local imports and relative data paths below rely on it — adjust
# when running on another machine.
print(os.getcwd())
os.chdir(r'C:\Users\shaha\Projects\Python Projects\Israel-Palestine-Political-Affiliation-Text-Classification')
print(os.getcwd())

In [None]:
import optuna

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.utils import resample
import pandas as pd
from copy import deepcopy
from pathlib import Path

from torch.utils.data import Dataset, DataLoader

import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

from tqdm import tqdm

# Local Code
from classifiers import *
from dataset import EmbeddingDataset, TextDataset
from embedder import Embedder
from Config.dataset_config import *

# Define optimizers

In [None]:
# Helper dataset
class HelperDataset(Dataset):
    """Minimal map-style dataset pairing features ``X`` with labels ``y``.

    Both containers are stored as given. ``X`` must support ``len()`` and
    both must support indexing with whatever key the data loader passes.
    """

    def __init__(self, X, y):
        # Zero-argument super() dispatches to the parent initialiser.
        # ``super(HelperDataset)`` on its own only yields an unbound proxy,
        # so calling ``__init__`` on it never reaches ``Dataset``.
        super().__init__()
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples, taken from ``X``."""
        return len(self.X)

    def __getitem__(self, item):
        """Return the ``(features, label)`` pair stored at ``item``."""
        return self.X[item], self.y[item]
    
# Custom tqdm callback
class TqdmCallback:
    """Callable that ticks a tqdm progress bar; meant for ``study.optimize(callbacks=[...])``."""

    def __init__(self, n_trials):
        # The bar's total is the trial budget, one tick per callback call.
        self.pbar = tqdm(total=n_trials)

    def __call__(self, study, trial):
        """Advance the bar by one step; ``study`` and ``trial`` are not inspected."""
        self.pbar.update()

    def close(self):
        """Close the underlying progress bar."""
        self.pbar.close()

In [None]:
# ------------------------------------------------------------------
# 0.  Utilities and Hyperparameters Space
# ------------------------------------------------------------------

# Search space per model type, consumed by suggest_hyperparameters().
# Every entry is one of:
#   (low, high, kind)          with kind in {'int', 'uniform', 'loguniform'}
#   (choices, 'categorical')   with choices given as a list
model_hyperparameters = {
    # objective() adds fixed batch_norm / drop_out / layers values for this model.
    'logistic_regression': {
        'num_epochs': (5, 20, 'int'),
        'learning_rate': (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform')
    },
    'svm': {
        'C': (1e-4, 1e2, 'loguniform'),
        'kernel': (['linear', 'rbf', 'sigmoid'], 'categorical'),
        # NOTE(review): 'poly' is not among the kernel choices above; if the
        # SVM backend is scikit-learn's SVC, `degree` would have no effect — confirm.
        'degree': (2, 4, 'int'),
        'gamma': (['scale', 'auto'], 'categorical')
    },
    'xgboost': {
        'n_estimators': (50, 200, 'int'),
        'learning_rate': (1e-3, 0.2, 'loguniform'),
        'booster': (['gbtree', 'gblinear', 'dart'], 'categorical'),
        # The six entries max_depth, min_child_weight, colsample_bytree,
        # subsample and gamma are requested again, with the same ranges, inside
        # objective() when the booster is 'gbtree' or 'dart'.
        'max_depth': (3, 10, 'int'),
        'min_child_weight': (2, 10, 'int'),
        'colsample_bytree': (0.5, 1.0, 'uniform'),
        'subsample': (0.5, 1.0, 'uniform'),
        'reg_alpha': (1e-8, 10.0, 'loguniform'),
        'reg_lambda': (1e-8, 10.0, 'loguniform'),
        'gamma': (1e-8, 1.0, 'loguniform')
    },
    'dnn': {
        "num_epochs": (5, 20, 'int'),  # Adjust after trial and error
        "learning_rate": (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform'),
        "batch_norm": ([True, False], 'categorical'),
        "drop_out": (0.0, 0.5, 'uniform'),
        # Every candidate starts at 768 and ends at 3.
        # NOTE(review): presumably embedding width and number of classes — confirm
        # that 768 also matches the TF-IDF feature width.
        "layers": ([[768, 64, 3],
                    [768, 128, 3],
                    [768, 64, 64, 3],
                    [768, 128, 64, 3],
                    [768, 512, 32, 3],
                    [768, 512, 128, 3],
                    [768, 512, 128, 64, 3]], 'categorical')  # Layer dimensions, including an input and output layer.
    }
}

def _build_estimator(model_name: str, params: dict) -> Classifier:
    """
    Return a newly constructed ``Classifier`` for ``model_name``.

    A new instance is built on every call so no trial reuses another
    trial's model object. Logging is switched off (``log=False``).

    Raises:
        ValueError: if ``model_name`` is not one of the four supported models.
    """
    supported = {"logistic_regression", "svm", "xgboost", "dnn"}
    if model_name in supported:
        return Classifier(params, model_type=model_name, log=False)
    raise ValueError(f"Unsupported model: {model_name}")

def macro_f1_01(y_true, y_pred_or_proba, threshold_0=0.5, threshold_1=0.5):
    """
    Macro-averaged F1 restricted to classes 0 and 1 (class 2 is left out of
    the average), so tuning is driven by the two classes of interest.

    ``y_pred_or_proba`` may be:
      * 1-D  -> treated as hard labels and cast to int;
      * 2-D  -> treated as per-class scores. With both thresholds at their
        default of 0.5 the label is the arg-max column; otherwise a row
        starts as class 2, becomes 0 when column 0 reaches ``threshold_0``
        and becomes 1 when column 1 reaches ``threshold_1`` (class 1 wins
        when both conditions hold, because it is applied last).
    """
    truth = np.asarray(y_true)
    scores = np.asarray(y_pred_or_proba)

    if scores.ndim != 2:
        # Already hard labels.
        labels = scores.astype(int)
    elif threshold_0 == 0.5 and threshold_1 == 0.5:
        # Default cut-offs: plain arg-max over the class columns.
        labels = scores.argmax(axis=1)
    else:
        # Custom cut-offs: default to class 2, then apply the two masks in order.
        labels = np.full(len(scores), 2, dtype=int)
        labels[scores[:, 0] >= threshold_0] = 0
        labels[scores[:, 1] >= threshold_1] = 1

    return f1_score(truth, labels, labels=[0, 1],
                    average="macro", zero_division=0)

def _bootstrap_ci(y_true, y_pred, n_iter: int = 1000, alpha: float = .05,
                  random_state=None):
    """
    Percentile bootstrap confidence interval around macro-F1 over classes 0 and 1.

    Args:
        y_true: ground-truth labels (any 1-D array-like).
        y_pred: predictions aligned with ``y_true``; each resample is handed
            to ``macro_f1_01``, so anything that function accepts works here.
        n_iter: number of bootstrap resamples.
        alpha: total tail mass; the interval spans the ``alpha/2`` and
            ``1 - alpha/2`` percentiles of the resampled scores.
        random_state: ``None`` (leave seeding to ``resample``'s default),
            an int seed, or a ``numpy.random.RandomState`` instance for
            reproducible intervals.

    Returns:
        ``(lower, upper)`` as plain floats.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    # Convert once so fancy indexing below behaves the same for lists,
    # pandas objects and tensors (positional, never label-based).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    n = len(y_true)
    if n == 0:
        raise ValueError("Cannot bootstrap a confidence interval from empty input")
    if len(y_pred) != n:
        raise ValueError(
            f"y_true and y_pred differ in length: {n} vs {len(y_pred)}")

    # A single generator is shared across iterations; passing the same int
    # seed to every resample() call would draw identical samples each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    positions = np.arange(n)
    stats = []
    for _ in range(n_iter):
        idx = resample(positions, replace=True, n_samples=n, random_state=rng)
        stats.append(macro_f1_01(y_true[idx], y_pred[idx]))

    lower, upper = np.percentile(stats, [100 * alpha / 2, 100 * (1 - alpha / 2)])
    return float(lower), float(upper)

def _score_on_validation(estimator: Classifier,
                         X_tr, y_tr, X_val, y_val, return_ci: bool = False):
    """
    Fit ``estimator`` on the TRAIN data and score its predictions on VAL.

    The estimator receives both a ``DataLoader`` and the raw ``(X, y)`` pair
    for each split, so it can use whichever form it needs (presumably loaders
    for the PyTorch models and raw arrays for the scikit-learn-style ones).

    Returns:
        The macro-F1 (classes 0 and 1) score, or, when ``return_ci`` is true,
        the tuple ``(score, ci_low, ci_high)`` with a bootstrap interval.
    """
    def _as_loader(features, labels, shuffle):
        # HelperDataset simply pairs features with labels.
        return DataLoader(HelperDataset(features, labels),
                          batch_size=BATCH_SIZE, shuffle=shuffle)

    # Only the training loader is shuffled.
    fit_loader = _as_loader(X_tr, y_tr, True)
    eval_loader = _as_loader(X_val, y_val, False)

    estimator.fit((fit_loader, (X_tr, y_tr)))
    predictions = estimator.predict((eval_loader, (X_val, y_val)))

    f1 = macro_f1_01(y_val, predictions)
    if return_ci:
        low, high = _bootstrap_ci(y_val, predictions)
        return f1, low, high
    return f1

# ------------------------------------------------------------------
# 1.  Optuna — hyper‑param suggestion helper
# ------------------------------------------------------------------
def suggest_hyperparameters(trial, hp_space):
    """
    Draw one value per hyper-parameter from ``trial``.

    ``hp_space`` is one of the per-model dicts in ``model_hyperparameters``:
    each value is ``(choices, "categorical")`` or ``(low, high, kind)`` with
    ``kind`` in ``"loguniform"``, ``"uniform"``, ``"int"`` (a three-element
    ``"categorical"`` spec is also accepted; its first element is the choices).

    Raises:
        ValueError: for an unknown ``kind`` or a spec of the wrong length.
    """
    def _draw(name, spec):
        # Two-element form: (choices, "categorical").
        if len(spec) == 2 and spec[1] == "categorical":
            return trial.suggest_categorical(name, spec[0])

        if len(spec) != 3:
            raise ValueError(f"Malformed spec for {name}: {spec}")

        first, second, kind = spec
        if kind == "loguniform":
            return trial.suggest_float(name, first, second, log=True)
        if kind == "uniform":
            return trial.suggest_float(name, first, second)
        if kind == "int":
            return trial.suggest_int(name, first, second)
        if kind == "categorical":
            return trial.suggest_categorical(name, first)
        raise ValueError(f"Unknown param type: {kind}")

    # Dict order is preserved, so parameters are requested in hp_space order.
    return {name: _draw(name, spec) for name, spec in hp_space.items()}


# ------------------------------------------------------------------
# 2.  Optuna objective  (no CV — uses VAL split)
# ------------------------------------------------------------------
def objective(trial, model_name: str, X_train, y_train, X_val,   y_val) -> float:
    """
    Single Optuna trial: sample hyper-parameters, train on TRAIN, return the
    macro-F1 (classes 0 and 1) measured on VAL. No cross-validation is used.
    """
    params = suggest_hyperparameters(trial, model_hyperparameters[model_name])

    # Logistic regression is run as a DNN with no hidden layer, so the
    # architecture-related settings are pinned rather than searched.
    if model_name == "logistic_regression":
        params["batch_norm"] = False
        params["drop_out"] = 0.0
        params["layers"] = [768, 3]

    # Extra settings for the tree-based XGBoost boosters. Several of these
    # names already exist in the base xgboost space and are requested again
    # here with the same ranges.
    if model_name == "xgboost" and params["booster"] in {"gbtree", "dart"}:
        params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
        params["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.5, 1.0)
        params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        params["max_depth"] = trial.suggest_int("max_depth", 3, 10)
        params["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        params["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)

        # DART-only settings.
        if params["booster"] == "dart":
            params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
            params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
            params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
            params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Build a fresh model for this trial, then fit and score it.
    model = _build_estimator(model_name, params)
    return _score_on_validation(model, X_train, y_train, X_val, y_val, return_ci=False)


# ------------------------------------------------------------------
# 3.  Optimiser wrapper  (no CV, keeps history)
# ------------------------------------------------------------------
def optimize_model(model_name: str, X_train, y_train, X_val,   y_val, n_trials: int = 50, timeout: int  = 36000):
    """
    Tune hyper-parameters by maximising macro-F1 (classes 0 and 1) on the VAL set.

    Args:
        model_name: key into ``model_hyperparameters``.
        X_train, y_train: data every trial is fitted on.
        X_val, y_val: data every trial is scored on.
        n_trials: maximum number of Optuna trials.
        timeout: passed to ``study.optimize`` as its time limit.

    Returns:
        ``(best_params, best_score, (ci_low, ci_high))`` where ``best_params``
        are the values Optuna sampled for the best trial, and ``best_score``
        and the bootstrap interval come from re-fitting a fresh estimator
        with those parameters (so ``best_score`` may differ slightly from the
        score recorded during the search).
    """
    def _optuna_obj(trial):
        return objective(trial, model_name, X_train, y_train, X_val, y_val)

    study = optuna.create_study(direction="maximize")
    progress = TqdmCallback(n_trials)
    try:
        study.optimize(_optuna_obj, n_trials=n_trials, timeout=timeout,
                       callbacks=[progress], n_jobs=-1)  # Uses all available cores
    finally:
        # Release the progress bar even if the search is interrupted or fails.
        progress.close()

    best_params = study.best_params

    # study.best_params holds only values that were *suggested*. The fixed
    # settings objective() injects for logistic regression are not part of it,
    # so they are re-applied here to rebuild the same model that was scored.
    refit_params = dict(best_params)
    if model_name == "logistic_regression":
        refit_params.update(batch_norm=False, drop_out=0.0, layers=[768, 3])

    # ------------------------------------------------------------------
    # refit *best* on full TRAIN and compute bootstrap CI --------------
    # ------------------------------------------------------------------
    best_est = _build_estimator(model_name, refit_params)

    best_score, ci_low, ci_high = _score_on_validation(
            best_est, X_train, y_train, X_val, y_val, return_ci=True)

    print(f"[{model_name}]  best F1={best_score:.4f}   "
            f"Best hyperparameters: {best_params}",
            f"bootstrap 95% CI=({ci_low:.4f}, {ci_high:.4f})")

    return best_params, best_score, (ci_low, ci_high)

# Optimize models

## Create Datasets

In [None]:
'''
Create 6 different EmbeddingDataset datasets out of the 3 TextDataset datasets:
augmented dataset - with distilbert embedding or tfidf, 
undersampled balanced dataset - with distilbert embedding or tfidf,
regular dataset - with distilbert embedding or tfidf
'''

# ---------------------------------------------
# 1.  Text-level datasets: regular / undersampled / augmented
# ---------------------------------------------
# All three read the same CSV with the same column layout; they differ only
# in the augmentation and undersampling arguments.

# Regular: neither augmentation nor undersampling.
txt_regular = TextDataset(
    csv_path=DATA_PATH,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    split_column_idx=SUBSET_COLUMN_IDX,  # column holding TRAIN / VAL / TEST
    augmented_classes=[],
    augmentation_ratio=0,
    undersampling_targets={},
)

# Undersampled: per-class targets come from the config module.
txt_undersampled = TextDataset(
    csv_path=DATA_PATH,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    split_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=[],
    augmentation_ratio=0,
    undersampling_targets=UNDERSAMPLING_TARGETS,  # e.g. {"Undefined": 8000, ...}
)

# Augmented: classes listed in the config are augmented with ratio 3,
# class sizes are otherwise left untouched.
txt_augmented = TextDataset(
    csv_path=DATA_PATH,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    split_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,  # e.g. ["Pro-Israel"]
    augmentation_ratio=3,
    undersampling_targets={},
)


# ---------------------------------------------
# 2.  One EmbeddingDataset per (embedding method, text variant)
# ---------------------------------------------

EMB_METHODS = ["distilbert", "tf‑idf"]    # extend with further methods as needed
embedder    = Embedder()                  # one instance shared by every dataset below

embedding_sets: dict[str, dict[str, EmbeddingDataset]] = {}   # {method : {variant : EmbeddingDataset}}

for method in EMB_METHODS:
    # Variants are built in the order regular, undersampled, augmented.
    embedding_sets[method] = {
        variant: EmbeddingDataset(text_set, embedder, method)
        for variant, text_set in (
            ("regular", txt_regular),
            ("undersampled", txt_undersampled),
            ("augmented", txt_augmented),
        )
    }

# ---------------------------------------------
# 3.  TRAIN and VAL features / labels for every (method, variant) pair
# ---------------------------------------------
# Slicing a subset with [:] is unpacked as an (X, y) pair.
# Naming scheme: X_/y_ + {bert, tfidf} + {reg, us, aug} + {tr, val}.

# DistilBERT embeddings
X_bert_reg_tr, y_bert_reg_tr = embedding_sets["distilbert"]["regular"].get_subset("TRAIN")[:]
X_bert_reg_val, y_bert_reg_val = embedding_sets["distilbert"]["regular"].get_subset("VAL")[:]

X_bert_us_tr, y_bert_us_tr = embedding_sets["distilbert"]["undersampled"].get_subset("TRAIN")[:]
X_bert_us_val, y_bert_us_val = embedding_sets["distilbert"]["undersampled"].get_subset("VAL")[:]

X_bert_aug_tr, y_bert_aug_tr = embedding_sets["distilbert"]["augmented"].get_subset("TRAIN")[:]
X_bert_aug_val, y_bert_aug_val = embedding_sets["distilbert"]["augmented"].get_subset("VAL")[:]

# TF-IDF features
X_tfidf_reg_tr, y_tfidf_reg_tr = embedding_sets["tf‑idf"]["regular"].get_subset("TRAIN")[:]
X_tfidf_reg_val, y_tfidf_reg_val = embedding_sets["tf‑idf"]["regular"].get_subset("VAL")[:]

X_tfidf_us_tr, y_tfidf_us_tr = embedding_sets["tf‑idf"]["undersampled"].get_subset("TRAIN")[:]
X_tfidf_us_val, y_tfidf_us_val = embedding_sets["tf‑idf"]["undersampled"].get_subset("VAL")[:]

X_tfidf_aug_tr, y_tfidf_aug_tr = embedding_sets["tf‑idf"]["augmented"].get_subset("TRAIN")[:]
X_tfidf_aug_val, y_tfidf_aug_val = embedding_sets["tf‑idf"]["augmented"].get_subset("VAL")[:]

## Logistic Regression

In [None]:
# Logistic regression on DistilBERT embeddings of the regular dataset.
# Results are keyed by experiment name.
lr_results = {}
lr_results['bert_without_augmentation'] = optimize_model(
    'logistic_regression',
    X_bert_reg_tr, y_bert_reg_tr,
    X_bert_reg_val, y_bert_reg_val,
)

In [None]:
# Logistic regression on DistilBERT embeddings of the undersampled dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
lr_results['bert_with_undersampling'] = optimize_model(
    model_name='logistic_regression',
    X_train=X_bert_us_tr,
    y_train=y_bert_us_tr,
    X_val=X_bert_us_val,
    y_val=y_bert_us_val,
)

In [None]:
# Logistic regression on DistilBERT embeddings of the augmented dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
lr_results['bert_with_augmentation'] = optimize_model(
    model_name='logistic_regression',
    X_train=X_bert_aug_tr,
    y_train=y_bert_aug_tr,
    X_val=X_bert_aug_val,
    y_val=y_bert_aug_val,
)

In [None]:
# Logistic regression on TF-IDF features of the regular dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
lr_results['tfidf_without_augmentation'] = optimize_model(
    model_name='logistic_regression',
    X_train=X_tfidf_reg_tr,
    y_train=y_tfidf_reg_tr,
    X_val=X_tfidf_reg_val,
    y_val=y_tfidf_reg_val,
)

In [None]:
# Logistic regression on TF-IDF features of the undersampled dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
lr_results['tfidf_with_undersampling'] = optimize_model(
    model_name='logistic_regression',
    X_train=X_tfidf_us_tr,
    y_train=y_tfidf_us_tr,
    X_val=X_tfidf_us_val,
    y_val=y_tfidf_us_val,
)

In [None]:
# Logistic regression on TF-IDF features of the augmented dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
lr_results['tfidf_with_augmentation'] = optimize_model(
    model_name='logistic_regression',
    X_train=X_tfidf_aug_tr,
    y_train=y_tfidf_aug_tr,
    X_val=X_tfidf_aug_val,
    y_val=y_tfidf_aug_val,
)

In [None]:
# Summarise every logistic-regression experiment in one table.
# Each stored result is the 3-tuple returned by optimize_model():
# (best_params, best_score, (ci_low, ci_high)).
print("Logistic Regression results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': experiment,
        'Best Parameters': best_params,
        'Best F1': best_score,
        'CI Low': ci_low,
        'CI High': ci_high,
    }
    for experiment, (best_params, best_score, (ci_low, ci_high)) in lr_results.items()
])
display(df)


## SVM

In [None]:
# SVM on DistilBERT embeddings of the regular dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
# Results are keyed by experiment name.
svm_results = {}
svm_results['bert_without_augmentation'] = optimize_model(
    model_name='svm',
    X_train=X_bert_reg_tr,
    y_train=y_bert_reg_tr,
    X_val=X_bert_reg_val,
    y_val=y_bert_reg_val,
)

In [None]:
# SVM on DistilBERT embeddings of the undersampled dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
svm_results['bert_with_undersampling'] = optimize_model(
    model_name='svm',
    X_train=X_bert_us_tr,
    y_train=y_bert_us_tr,
    X_val=X_bert_us_val,
    y_val=y_bert_us_val,
)

In [None]:
# SVM on DistilBERT embeddings of the augmented dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
svm_results['bert_with_augmentation'] = optimize_model(
    model_name='svm',
    X_train=X_bert_aug_tr,
    y_train=y_bert_aug_tr,
    X_val=X_bert_aug_val,
    y_val=y_bert_aug_val,
)

In [None]:
# SVM on TF-IDF features of the regular dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
svm_results['tfidf_without_augmentation'] = optimize_model(
    model_name='svm',
    X_train=X_tfidf_reg_tr,
    y_train=y_tfidf_reg_tr,
    X_val=X_tfidf_reg_val,
    y_val=y_tfidf_reg_val,
)

In [None]:
# SVM on TF-IDF features of the undersampled dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
svm_results['tfidf_with_undersampling'] = optimize_model(
    model_name='svm',
    X_train=X_tfidf_us_tr,
    y_train=y_tfidf_us_tr,
    X_val=X_tfidf_us_val,
    y_val=y_tfidf_us_val,
)

In [None]:
# SVM on TF-IDF features of the augmented dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
svm_results['tfidf_with_augmentation'] = optimize_model(
    model_name='svm',
    X_train=X_tfidf_aug_tr,
    y_train=y_tfidf_aug_tr,
    X_val=X_tfidf_aug_val,
    y_val=y_tfidf_aug_val,
)

In [None]:
# Summarise every SVM experiment in one table.
# Each stored result is the 3-tuple returned by optimize_model():
# (best_params, best_score, (ci_low, ci_high)).
print("SVM results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': experiment,
        'Best Parameters': best_params,
        'Best F1': best_score,
        'CI Low': ci_low,
        'CI High': ci_high,
    }
    for experiment, (best_params, best_score, (ci_low, ci_high)) in svm_results.items()
])
display(df)

## XGBoost

In [None]:
# XGBoost on DistilBERT embeddings of the regular dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
# Results are keyed by experiment name.
xgb_results = {}
xgb_results['bert_without_augmentation'] = optimize_model(
    model_name='xgboost',
    X_train=X_bert_reg_tr,
    y_train=y_bert_reg_tr,
    X_val=X_bert_reg_val,
    y_val=y_bert_reg_val,
)

In [None]:
# XGBoost on DistilBERT embeddings of the undersampled dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
xgb_results['bert_with_undersampling'] = optimize_model(
    model_name='xgboost',
    X_train=X_bert_us_tr,
    y_train=y_bert_us_tr,
    X_val=X_bert_us_val,
    y_val=y_bert_us_val,
)

In [None]:
# XGBoost on DistilBERT embeddings of the augmented dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
xgb_results['bert_with_augmentation'] = optimize_model(
    model_name='xgboost',
    X_train=X_bert_aug_tr,
    y_train=y_bert_aug_tr,
    X_val=X_bert_aug_val,
    y_val=y_bert_aug_val,
)

In [None]:
# XGBoost on TF-IDF features of the regular dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
xgb_results['tfidf_without_augmentation'] = optimize_model(
    model_name='xgboost',
    X_train=X_tfidf_reg_tr,
    y_train=y_tfidf_reg_tr,
    X_val=X_tfidf_reg_val,
    y_val=y_tfidf_reg_val,
)

In [None]:
# XGBoost on TF-IDF features of the undersampled dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
xgb_results['tfidf_with_undersampling'] = optimize_model(
    model_name='xgboost',
    X_train=X_tfidf_us_tr,
    y_train=y_tfidf_us_tr,
    X_val=X_tfidf_us_val,
    y_val=y_tfidf_us_val,
)

In [None]:
# XGBoost on TF-IDF features of the augmented dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
xgb_results['tfidf_with_augmentation'] = optimize_model(
    model_name='xgboost',
    X_train=X_tfidf_aug_tr,
    y_train=y_tfidf_aug_tr,
    X_val=X_tfidf_aug_val,
    y_val=y_tfidf_aug_val,
)

In [None]:
# Summarise every XGBoost experiment in one table.
# Each stored result is the 3-tuple returned by optimize_model():
# (best_params, best_score, (ci_low, ci_high)).
print("XGBoost results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': experiment,
        'Best Parameters': best_params,
        'Best F1': best_score,
        'CI Low': ci_low,
        'CI High': ci_high,
    }
    for experiment, (best_params, best_score, (ci_low, ci_high)) in xgb_results.items()
])
display(df)

## DNN

In [None]:
# DNN on DistilBERT embeddings of the regular dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
# Results are keyed by experiment name.
dnn_results = {}
dnn_results['bert_without_augmentation'] = optimize_model(
    model_name='dnn',
    X_train=X_bert_reg_tr,
    y_train=y_bert_reg_tr,
    X_val=X_bert_reg_val,
    y_val=y_bert_reg_val,
)

In [None]:
# DNN on DistilBERT embeddings of the undersampled dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
dnn_results['bert_with_undersampling'] = optimize_model(
    model_name='dnn',
    X_train=X_bert_us_tr,
    y_train=y_bert_us_tr,
    X_val=X_bert_us_val,
    y_val=y_bert_us_val,
)

In [None]:
# DNN on DistilBERT embeddings of the augmented dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
dnn_results['bert_with_augmentation'] = optimize_model(
    model_name='dnn',
    X_train=X_bert_aug_tr,
    y_train=y_bert_aug_tr,
    X_val=X_bert_aug_val,
    y_val=y_bert_aug_val,
)

In [None]:
# DNN on TF-IDF features of the regular dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
dnn_results['tfidf_without_augmentation'] = optimize_model(
    model_name='dnn',
    X_train=X_tfidf_reg_tr,
    y_train=y_tfidf_reg_tr,
    X_val=X_tfidf_reg_val,
    y_val=y_tfidf_reg_val,
)

In [None]:
# DNN on TF-IDF features of the undersampled dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
dnn_results['tfidf_with_undersampling'] = optimize_model(
    model_name='dnn',
    X_train=X_tfidf_us_tr,
    y_train=y_tfidf_us_tr,
    X_val=X_tfidf_us_val,
    y_val=y_tfidf_us_val,
)

In [None]:
# DNN on TF-IDF features of the augmented dataset
# (TRAIN for fitting, that variant's VAL split for scoring).
dnn_results['tfidf_with_augmentation'] = optimize_model(
    model_name='dnn',
    X_train=X_tfidf_aug_tr,
    y_train=y_tfidf_aug_tr,
    X_val=X_tfidf_aug_val,
    y_val=y_tfidf_aug_val,
)

In [None]:
# Summarise every DNN experiment in one table.
# Each stored result is the 3-tuple returned by optimize_model():
# (best_params, best_score, (ci_low, ci_high)).
print("DNN results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': experiment,
        'Best Parameters': best_params,
        'Best F1': best_score,
        'CI Low': ci_low,
        'CI High': ci_high,
    }
    for experiment, (best_params, best_score, (ci_low, ci_high)) in dnn_results.items()
])
display(df)