# Imports and Setup

In [None]:
import os
# Print the working directory before and after the switch so the change is visible in the cell output.
print(os.getcwd())
# NOTE(review): hard-coded, machine-specific absolute path; this cell fails on any other machine.
# Presumably the switch makes the project-local modules (classifiers, dataset, embedder, Config)
# importable and resolves relative data paths -- TODO confirm and replace with a portable path.
os.chdir(r'C:\Users\shaha\Projects\Python Projects\Israel-Palestine-Political-Affiliation-Text-Classification')
print(os.getcwd())

In [None]:
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold


from classifiers import *
from dataset import EmbeddingDataset
from embedder import Embedder
from Config.dataset_config import *

from torch.utils.data import Dataset, DataLoader

import nltk
# Fetch the NLTK resources used later in the notebook.
# NOTE(review): nothing in this notebook calls NLTK directly; presumably the dataset/augmentation
# code in the project modules needs these resources -- TODO confirm.
nltk.download('punkt_tab')  # tokenizer data
nltk.download('averaged_perceptron_tagger_eng')  # English part-of-speech tagger
nltk.download('wordnet')  # WordNet lexical database

from tqdm import tqdm

# Define optimization helpers

In [3]:
# Helper dataset
class HelperDataset(Dataset):
    """Minimal map-style dataset pairing each sample in ``X`` with its label in ``y``.

    Used to wrap the per-fold train/validation splits so they can be fed to a
    ``DataLoader``.
    """

    def __init__(self, X, y):
        """Store the samples and labels.

        Args:
            X: Indexable collection of samples; must support ``len()``.
            y: Indexable collection of labels, aligned with ``X`` by position.
        """
        # The previous ``super(HelperDataset).__init__()`` created an *unbound*
        # super object and initialised that object instead of the base class.
        super().__init__()
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples (taken from ``X``)."""
        return len(self.X)

    def __getitem__(self, item):
        """Return the ``(sample, label)`` pair at position ``item``."""
        return self.X[item], self.y[item]

In [4]:
# Custom tqdm callback
class TqdmCallback:
    """Optuna callback that advances a tqdm progress bar once per finished trial."""

    def __init__(self, n_trials):
        """Open a progress bar whose total is the planned number of trials."""
        self.pbar = tqdm(total=n_trials)

    def __call__(self, study, trial):
        """Advance the bar by one step; ``study`` and ``trial`` are not inspected."""
        self.pbar.update()

    def close(self):
        """Close the underlying progress bar."""
        self.pbar.close()

In [5]:
# Search space per model type, consumed by suggest_hyperparameters().
# Each entry is either
#   (low, high, kind)           with kind in {'int', 'uniform', 'loguniform'}, or
#   (choices, 'categorical')    where choices is a list of candidate values.
model_hyperparameters = {
    'logistic_regression': {
        'num_epochs': (5, 20, 'int'),
        'learning_rate': (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform')
    },
    'svm': {
        'C': (1e-4, 1e2, 'loguniform'),
        'kernel': (['linear', 'rbf', 'sigmoid'], 'categorical'),
        # NOTE(review): 'poly' is not among the kernel choices above; presumably 'degree'
        # has no effect on the listed kernels -- TODO confirm against the Classifier wrapper.
        'degree': (2, 4, 'int'),
        'gamma': (['scale', 'auto'], 'categorical')
    },
    'xgboost': {
        'n_estimators': (50, 200, 'int'),
        'learning_rate': (1e-3, 0.2, 'loguniform'),
        'booster': (['gbtree', 'gblinear', 'dart'], 'categorical'),
        # The remaining entries are sampled for every booster, including 'gblinear'.
        'max_depth': (3, 10, 'int'),
        'min_child_weight': (2, 10, 'int'),
        'colsample_bytree': (0.5, 1.0, 'uniform'),
        'subsample': (0.5, 1.0, 'uniform'),
        'reg_alpha': (1e-8, 10.0, 'loguniform'),
        'reg_lambda': (1e-8, 10.0, 'loguniform'),
        'gamma': (1e-8, 1.0, 'loguniform')
    },
    'dnn': {
        "num_epochs": (5, 20, 'int'),  # Adjust after trial and error
        "learning_rate": (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform'),
        "batch_norm": ([True, False], 'categorical'),
        "drop_out": (0.0, 0.5, 'uniform'),
        # NOTE(review): the choices below are lists; Optuna may warn about non-primitive
        # categorical choices -- TODO confirm this is acceptable for the storage in use.
        # Every option starts at 768 and ends at 3; presumably these are the embedding size
        # and the number of classes -- TODO confirm the TF-IDF embeddings also have 768 features.
        "layers": ([[768, 64, 3],
                    [768, 128, 3],
                    [768, 64, 64, 3],
                    [768, 128, 64, 3],
                    [768, 512, 32, 3],
                    [768, 512, 128, 3],
                    [768, 512, 128, 64, 3]], 'categorical')  # Layer dimensions, including an input and output layer.
    }
}

def suggest_hyperparameters(trial, hyperparams):
    """Sample one value per hyperparameter from an Optuna trial.

    Args:
        trial: Object exposing ``suggest_categorical``, ``suggest_float`` and
            ``suggest_int`` (an Optuna trial).
        hyperparams: Mapping of parameter name to a search-space spec, either
            ``(choices, 'categorical')`` or ``(low, high, kind)`` with ``kind``
            in ``{'loguniform', 'uniform', 'int'}``. The three-element form
            ``(choices, _, 'categorical')`` is also accepted.

    Returns:
        dict mapping every parameter name to its sampled value.

    Raises:
        ValueError: If a spec has the wrong length or an unknown kind tag.
    """
    params = {}
    for key, value in hyperparams.items():
        if len(value) == 2 and value[1] == 'categorical':
            params[key] = trial.suggest_categorical(key, value[0])
            continue

        if len(value) != 3:
            raise ValueError(f"Hyperparameter tuple for {key} is not in the expected format: {value}")

        first, second, kind = value
        if kind == 'loguniform':
            params[key] = trial.suggest_float(key, first, second, log=True)
        elif kind == 'uniform':
            params[key] = trial.suggest_float(key, first, second)
        elif kind == 'int':
            params[key] = trial.suggest_int(key, first, second)
        elif kind == 'categorical':
            params[key] = trial.suggest_categorical(key, first)
        else:
            # Previously an unknown kind was skipped silently, so the parameter
            # was simply missing from the result.
            raise ValueError(f"Hyperparameter tuple for {key} is not in the expected format: {value}")
    return params

def cross_validation(estimator, X, y, n_splits=5):
    """Evaluate ``estimator`` with stratified k-fold cross-validation.

    Each fold is handed to the estimator both as a ``DataLoader`` and as the raw
    ``(X, y)`` arrays, packed as ``(dataloader, (X, y))``, so the estimator can
    pick whichever representation it needs.

    Args:
        estimator: Object exposing ``fit(train)`` and ``predict(val)`` that
            accept the ``(dataloader, (X, y))`` tuple described above.
        X: Samples; must support indexing with an array of integer indices.
        y: Labels aligned with ``X``; indexed the same way.
        n_splits: Number of folds.

    Returns:
        list with one micro-averaged F1 score per fold.
    """
    # Fixed seed: every call evaluates on the same fold assignment.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    for train_index, val_index in cv.split(X, y):
        # Split into train and validation sets
        x_train, x_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        train_dataloader = DataLoader(HelperDataset(x_train, y_train), batch_size=BATCH_SIZE, shuffle=True)
        # Validation batches must keep their order: predictions are compared
        # positionally with ``y_val`` below, so shuffling here could misalign them.
        val_dataloader = DataLoader(HelperDataset(x_val, y_val), batch_size=BATCH_SIZE, shuffle=False)

        # Pack both representations in the form the estimator's fit/predict receive
        train = (train_dataloader, (x_train, y_train))
        val = (val_dataloader, (x_val, y_val))

        # NOTE(review): the same estimator instance is refit on every fold; this is
        # only leak-free if fit() re-initialises the model -- TODO confirm.
        estimator.fit(train)
        pred = estimator.predict(val)
        scores.append(f1_score(y_val, pred, average='micro'))
    return scores

def objective(trial, model_name, X, y, folds_scores):
    """Optuna objective shared by all models; the model is selected by name.

    Samples hyperparameters for ``model_name``, builds a ``Classifier`` with
    them, cross-validates it and returns the mean fold score.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        model_name: Key into ``model_hyperparameters``; also passed to
            ``Classifier`` as ``model_type``.
        X: Samples forwarded to ``cross_validation``.
        y: Labels forwarded to ``cross_validation``.
        folds_scores: List mutated in place; the per-fold scores of this trial
            are appended so the caller can compute statistics afterwards.

    Returns:
        Mean of the per-fold scores for this trial.
    """
    # Use suggest_hyperparameters to handle the standard parameters
    params = suggest_hyperparameters(trial, model_hyperparameters[model_name])

    # Logistic regression is given fixed values for these three keys instead of searching them
    if model_name == 'logistic_regression':
        params['batch_norm'] = False
        params['drop_out'] = 0.0
        params['layers'] = [768, 3]

    # Add booster-specific XGBoost parameters
    if model_name == 'xgboost':
        booster = params["booster"]
        if booster in ("gbtree", "dart"):
            # colsample_bytree, gamma, max_depth, min_child_weight and subsample were
            # previously re-suggested here with duplicated ranges. They are already
            # sampled from the table above, and suggesting an existing name again
            # within a trial returns the same value, so the duplicates were dropped.
            params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
            # NOTE(review): those table parameters are also sampled for 'gblinear'
            # -- TODO confirm that booster should receive them.

        if booster == "dart":
            params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
            params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
            params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
            params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    model = Classifier(params, model_type=model_name, log=False)

    # Perform cross validation
    scores = cross_validation(model, X, y)

    folds_scores.append(scores)     # Save scores for statistical tests
    return np.mean(scores)

def optimize_model(model_name, X, y, n_trials=10, timout=36000):
    """Run an Optuna study for one model and report the best trial.

    Args:
        model_name: Key into ``model_hyperparameters`` (forwarded to ``objective``).
        X: Samples used for cross-validation.
        y: Labels used for cross-validation.
        n_trials: Maximum number of trials.
        timout: Study timeout passed to Optuna as ``timeout`` (the parameter name
            keeps its original spelling so existing keyword callers still work).

    Returns:
        Tuple ``(best_params, best_value, margin_of_error, std, fold_scores)``
        where the last three describe the folds of the best trial.
    """
    folds_scores = []   # per-trial list of fold scores, appended by objective()
    study = optuna.create_study(direction='maximize')
    progress_bar = TqdmCallback(n_trials)
    try:
        study.optimize(lambda trial: objective(trial, model_name, X, y, folds_scores), n_trials=n_trials, timeout=timout, callbacks=[progress_bar])
    finally:
        # Close the progress bar even when a trial raises
        progress_bar.close()

    best_params = study.best_params
    best_value = study.best_value

    # Take the fold scores of the trial that produced best_params. The study is
    # new and objective() appends exactly once per trial, so trial numbers index
    # folds_scores directly. Sorting the statistics independently could select a
    # different trial than Optuna when two trials tie on the mean.
    best_scores = folds_scores[study.best_trial.number]
    best_mean = np.mean(best_scores)
    best_std = np.std(best_scores, ddof=1)

    n = len(best_scores)
    # NOTE(review): normal-approximation z value; with only a handful of folds a
    # t-quantile would give a wider interval -- TODO decide which is intended.
    z = 1.96  # For 95% confidence interval

    # Calculate margin of error
    margin_of_error = z * (best_std / np.sqrt(n))
    print(f"Best hyperparameters for {model_name}: {best_params}")
    print(f"Best F1 score for {model_name}: {best_value}")
    print(f"Attempts Stats: Avg: {best_mean}, Margin: +-{margin_of_error}, STD: {best_std}, Scores: {best_scores}")

    return best_params, best_value, margin_of_error, best_std, best_scores

# Optimize models

## Create Datasets

In [None]:
# Build six dataset variants: {DistilBERT, TF-IDF} embeddings combined with
# {regular, undersampled, augmented} sampling.
def _build_embedding_dataset(embedding_method, augmentation_ratio, undersampling_targets):
    """Create an EmbeddingDataset; only these three arguments differ between variants."""
    return EmbeddingDataset(
        data_path=DATA_PATH,
        subset=SUBSET,
        id_column_idx=ID_COLUMN_IDX,
        comment_column_idx=COMMENT_COLUMN_IDX,
        label_column_idx=LABEL_COLUMN_IDX,
        subset_column_idx=SUBSET_COLUMN_IDX,
        augmented_classes=AUGMENTED_CLASSES,
        augmentation_ratio=augmentation_ratio,
        augmentation_methods=AUGMENTATION_METHODS,
        adversation_ratio=ADVERSATION_RATIO,
        undersampling_targets=undersampling_targets,
        embedder=Embedder(),
        embedding_method=embedding_method,
    )


# DistilBERT embeddings
bert_embedding_no_augmentation_data = _build_embedding_dataset('distilbert', 0, {})
bert_embedding_undersampled_data = _build_embedding_dataset('distilbert', 0, UNDERSAMPLING_TARGETS)
bert_embedding_with_augmentation_data = _build_embedding_dataset('distilbert', 3, {})

# TF-IDF embeddings
tfidf_embedding_no_augmentation_data = _build_embedding_dataset('tf-idf', 0, {})
tfidf_embedding_undersampled_data = _build_embedding_dataset('tf-idf', 0, UNDERSAMPLING_TARGETS)
tfidf_embedding_with_augmentation_data = _build_embedding_dataset('tf-idf', 3, {})

# Pull the feature matrix and label vector out of every dataset
X_bert_no_augmentation = bert_embedding_no_augmentation_data.embeddings
y_bert_no_augmentation = bert_embedding_no_augmentation_data.labels

X_bert_undersampled = bert_embedding_undersampled_data.embeddings
y_bert_undersampled = bert_embedding_undersampled_data.labels

X_bert_with_augmentation = bert_embedding_with_augmentation_data.embeddings
y_bert_with_augmentation = bert_embedding_with_augmentation_data.labels

X_tfidf_no_augmentation = tfidf_embedding_no_augmentation_data.embeddings
y_tfidf_no_augmentation = tfidf_embedding_no_augmentation_data.labels

X_tfidf_undersampled = tfidf_embedding_undersampled_data.embeddings
y_tfidf_undersampled = tfidf_embedding_undersampled_data.labels

X_tfidf_with_augmentation = tfidf_embedding_with_augmentation_data.embeddings
y_tfidf_with_augmentation = tfidf_embedding_with_augmentation_data.labels

## Logistic Regression

In [None]:
# Logistic-regression tuning results, keyed by dataset variant.
lr_results = {}
# DistilBERT embeddings, no augmentation and no undersampling.
lr_results['bert_without_augmentation'] = optimize_model('logistic_regression', X_bert_no_augmentation, y_bert_no_augmentation)

In [None]:
# DistilBERT embeddings, undersampled with UNDERSAMPLING_TARGETS.
lr_results['bert_with_undersampling'] = optimize_model('logistic_regression', X_bert_undersampled, y_bert_undersampled)

In [None]:
# DistilBERT embeddings, dataset built with augmentation_ratio=3.
lr_results['bert_with_augmentation'] = optimize_model('logistic_regression', X_bert_with_augmentation, y_bert_with_augmentation)

In [None]:
# TF-IDF embeddings, no augmentation and no undersampling.
lr_results['tfidf_without_augmentation'] = optimize_model('logistic_regression', X_tfidf_no_augmentation, y_tfidf_no_augmentation)

In [None]:
# TF-IDF embeddings, undersampled with UNDERSAMPLING_TARGETS.
lr_results['tfidf_with_undersampling'] = optimize_model('logistic_regression', X_tfidf_undersampled, y_tfidf_undersampled)

In [None]:
# TF-IDF embeddings, dataset built with augmentation_ratio=3.
lr_results['tfidf_with_augmentation'] = optimize_model('logistic_regression', X_tfidf_with_augmentation, y_tfidf_with_augmentation)

In [None]:
# Summarise the logistic-regression runs as one table row per dataset variant.
print("Logistic Regression results:\n\n")
# Column order mirrors the tuple returned by optimize_model, prefixed by the experiment name.
lr_columns = ['Experiment', 'Best Parameters', 'Best Avg', 'Margin', 'STD', 'Scores']
lr_rows = []
for experiment, outcome in lr_results.items():
    lr_rows.append(dict(zip(lr_columns, (experiment, *outcome))))
df = pd.DataFrame(lr_rows)
display(df)


## SVM

In [None]:
# SVM tuning results, keyed by dataset variant.
svm_results = {}
# DistilBERT embeddings, no augmentation and no undersampling.
svm_results['bert_without_augmentation'] = optimize_model('svm', X_bert_no_augmentation, y_bert_no_augmentation)

In [None]:
# DistilBERT embeddings, undersampled with UNDERSAMPLING_TARGETS.
svm_results['bert_with_undersampling'] = optimize_model('svm', X_bert_undersampled, y_bert_undersampled)

In [None]:
# DistilBERT embeddings, dataset built with augmentation_ratio=3.
svm_results['bert_with_augmentation'] = optimize_model('svm', X_bert_with_augmentation, y_bert_with_augmentation)

In [None]:
# TF-IDF embeddings, no augmentation and no undersampling.
svm_results['tfidf_without_augmentation'] = optimize_model('svm', X_tfidf_no_augmentation, y_tfidf_no_augmentation)

In [None]:
# TF-IDF embeddings, undersampled with UNDERSAMPLING_TARGETS.
svm_results['tfidf_with_undersampling'] = optimize_model('svm', X_tfidf_undersampled, y_tfidf_undersampled)

In [None]:
# TF-IDF embeddings, dataset built with augmentation_ratio=3.
svm_results['tfidf_with_augmentation'] = optimize_model('svm', X_tfidf_with_augmentation, y_tfidf_with_augmentation)

In [None]:
# Summarise the SVM runs as one table row per dataset variant.
print("SVM results:\n\n")
# optimize_model returns (best_params, best_value, margin_of_error, std, fold_scores).
# The table previously read index 2 as 'STD' and index 3 as 'Scores', so it showed the
# margin under 'STD', the std under 'Scores', and never showed the fold scores.
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Avg': value[1],
        'Margin': value[2],
        'STD': value[3],
        'Scores': value[4]
    }
    for key, value in svm_results.items()
])
display(df)

## XGBoost

In [None]:
# XGBoost tuning results, keyed by dataset variant.
xgb_results = {}
# DistilBERT embeddings, no augmentation and no undersampling.
xgb_results['bert_without_augmentation'] = optimize_model('xgboost', X_bert_no_augmentation, y_bert_no_augmentation)

In [None]:
# DistilBERT embeddings, undersampled with UNDERSAMPLING_TARGETS.
xgb_results['bert_with_undersampling'] = optimize_model('xgboost', X_bert_undersampled, y_bert_undersampled)

In [None]:
# DistilBERT embeddings, dataset built with augmentation_ratio=3.
xgb_results['bert_with_augmentation'] = optimize_model('xgboost', X_bert_with_augmentation, y_bert_with_augmentation)

In [None]:
# TF-IDF embeddings, no augmentation and no undersampling.
xgb_results['tfidf_without_augmentation'] = optimize_model('xgboost', X_tfidf_no_augmentation, y_tfidf_no_augmentation)

In [None]:
# TF-IDF embeddings, undersampled with UNDERSAMPLING_TARGETS.
xgb_results['tfidf_with_undersampling'] = optimize_model('xgboost', X_tfidf_undersampled, y_tfidf_undersampled)

In [None]:
# TF-IDF embeddings, dataset built with augmentation_ratio=3.
xgb_results['tfidf_with_augmentation'] = optimize_model('xgboost', X_tfidf_with_augmentation, y_tfidf_with_augmentation)

In [None]:
# Summarise the XGBoost runs as one table row per dataset variant.
print("XGBoost results:\n\n")
# optimize_model returns (best_params, best_value, margin_of_error, std, fold_scores).
# The table previously read index 2 as 'STD' and index 3 as 'Scores', so it showed the
# margin under 'STD', the std under 'Scores', and never showed the fold scores.
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Avg': value[1],
        'Margin': value[2],
        'STD': value[3],
        'Scores': value[4]
    }
    for key, value in xgb_results.items()
])
display(df)

## DNN

In [None]:
# DNN tuning results, keyed by dataset variant.
dnn_results = {}
# DistilBERT embeddings, no augmentation and no undersampling.
dnn_results['bert_without_augmentation'] = optimize_model('dnn', X_bert_no_augmentation, y_bert_no_augmentation)

In [None]:
# DistilBERT embeddings, undersampled with UNDERSAMPLING_TARGETS.
dnn_results['bert_with_undersampling'] = optimize_model('dnn', X_bert_undersampled, y_bert_undersampled)

In [None]:
# DistilBERT embeddings, dataset built with augmentation_ratio=3.
dnn_results['bert_with_augmentation'] = optimize_model('dnn', X_bert_with_augmentation, y_bert_with_augmentation)

In [None]:
# TF-IDF embeddings, no augmentation and no undersampling.
dnn_results['tfidf_without_augmentation'] = optimize_model('dnn', X_tfidf_no_augmentation, y_tfidf_no_augmentation)

In [None]:
# TF-IDF embeddings, undersampled with UNDERSAMPLING_TARGETS.
dnn_results['tfidf_with_undersampling'] = optimize_model('dnn', X_tfidf_undersampled, y_tfidf_undersampled)

In [None]:
# TF-IDF embeddings, dataset built with augmentation_ratio=3.
dnn_results['tfidf_with_augmentation'] = optimize_model('dnn', X_tfidf_with_augmentation, y_tfidf_with_augmentation)

In [None]:
# Summarise the DNN runs as one table row per dataset variant.
print("DNN results:\n\n")
# optimize_model returns (best_params, best_value, margin_of_error, std, fold_scores).
# The table previously read index 2 as 'STD' and index 3 as 'Scores', so it showed the
# margin under 'STD', the std under 'Scores', and never showed the fold scores.
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Avg': value[1],
        'Margin': value[2],
        'STD': value[3],
        'Scores': value[4]
    }
    for key, value in dnn_results.items()
])
display(df)