# Imports and Setup

In [50]:
import optuna
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

from classifiers import Classifier, DNN
from dataset import TextDataset, get_dataloader
from embedder import Embedder
from Config.dataset_config import *

# Load dataset and preprocess

In [None]:
# TODO: load and preprocess the raw dataset here. The original assignment had
# no right-hand side (a syntax error); None keeps the cell runnable until the
# loading code is filled in.
dataframe = None

# Define optimizers

In [None]:
# Search space per model, keyed by the model name used throughout this notebook.
# Every entry is one of:
#   (low, high, kind)         numeric range, kind in {'loguniform', 'uniform', 'int'}
#   (choices, 'categorical')  one value picked from `choices`
#   any non-tuple value       a fixed setting that is passed through unchanged
# NOTE: integer-valued settings (iterations, estimators, epochs) use 'int' so
# that they are not sampled as floats.
model_hyperparameters = {
    'logistic_regression': {
        'C': (1e-4, 1e2, 'loguniform'),
        'max_iter': (50, 200, 'int'),
    },
    'svm': {
        'C': (1e-4, 1e2, 'loguniform'),
        'kernel': (['linear', 'poly', 'rbf', 'sigmoid'], 'categorical'),
        'degree': (2, 5, 'int'),
        'gamma': (['scale', 'auto'], 'categorical')
    },
    'xgboost': {
        'n_estimators': (5, 100, 'int'),
        # XGBoost treats `eta` as an alias of `learning_rate`; avoid tuning both.
        'learning_rate': (1e-3, 1.0, 'loguniform'),
        'booster': (['gbtree', 'gblinear', 'dart'], 'categorical'),
    },
    'dnn': {
        "num_epochs": (10, 50, 'int'),  # Adjust after trial and error
        "learning_rate": (1e-3, 0.1, 'loguniform'),
        "batch_norm": ([True, False], 'categorical'),
        "drop_out": (0.0, 1.0, 'uniform'),
        "layers": [768, 128, 64, 3]  # Fixed layer dimensions, including an input and an output layer.
    }
}


def suggest_hyperparameters(trial, hyperparams):
    """Sample one value per hyperparameter from an Optuna trial.

    Args:
        trial: Optuna trial (anything exposing ``suggest_float``,
            ``suggest_int`` and ``suggest_categorical``).
        hyperparams: Mapping of parameter name to a search specification:

            * ``(low, high, kind)`` with ``kind`` in ``'loguniform'``,
              ``'uniform'`` or ``'int'``;
            * ``(choices, 'categorical')`` or ``(choices, <ignored>, 'categorical')``;
            * ``([low, high], kind)`` - older spelling of a numeric range,
              accepted so that such entries are no longer silently dropped;
            * ``(choices_by_layer_count, 'custom', <ignored>)`` - picks from
              ``choices_by_layer_count[len(params['hidden_dims'])]``; requires
              ``hidden_dims`` to appear earlier in ``hyperparams``;
            * any non-tuple value - a fixed setting, copied to the result as is.

    Returns:
        dict mapping every key of ``hyperparams`` to its sampled (or fixed) value.

    Raises:
        ValueError: If a tuple specification matches none of the forms above.
        KeyError: If a ``'custom'`` entry is reached before ``hidden_dims``.
    """
    params = {}
    for key, value in hyperparams.items():
        # Non-tuple entries (e.g. a fixed list of layer sizes) are constants,
        # not search spaces.
        if not isinstance(value, tuple):
            params[key] = value
            continue

        spec = value
        # Normalise the ([low, high], kind) spelling to (low, high, kind).
        if len(spec) == 2 and spec[1] != 'categorical':
            bounds, kind = spec
            if isinstance(bounds, (list, tuple)) and len(bounds) == 2:
                spec = (bounds[0], bounds[1], kind)

        if len(spec) == 2 and spec[1] == 'categorical':
            params[key] = trial.suggest_categorical(key, spec[0])
            continue

        if len(spec) != 3:
            raise ValueError(f"Hyperparameter tuple for {key} is not in the expected format: {value}")

        first, second, kind = spec
        if kind == 'loguniform':
            params[key] = trial.suggest_float(key, first, second, log=True)
        elif kind == 'uniform':
            params[key] = trial.suggest_float(key, first, second)
        elif kind == 'int':
            params[key] = trial.suggest_int(key, first, second)
        elif kind == 'categorical':
            params[key] = trial.suggest_categorical(key, first)
        elif second == 'custom':
            # Choices depend on how many hidden layers were sampled earlier.
            layer_count = len(params['hidden_dims'])
            params[key] = trial.suggest_categorical(key, first[layer_count])
        else:
            raise ValueError(f"Hyperparameter tuple for {key} is not in the expected format: {value}")
    return params

# Optuna objective shared by all models; call it with the model name.
def objective(trial, model_name, data, folds_scores):
    """Sample hyperparameters for ``model_name`` and return its mean CV accuracy.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        model_name: Key into ``model_hyperparameters``; also forwarded to
            ``Classifier`` as ``model_type``.
        data: Training data handed to ``Classifier.fit`` and ``cross_val_score``.
        folds_scores: List that receives this trial's per-fold score array
            (appended in place).

    Returns:
        Mean accuracy over a 10-fold shuffled ``StratifiedKFold`` (seed 42).
    """
    params = suggest_hyperparameters(trial, model_hyperparameters[model_name])

    # Extra parameters for logistic regression. The key must match the
    # 'logistic_regression' entry in `model_hyperparameters`; the previous
    # comparison against 'Logistic Regression' could never be true.
    # NOTE(review): in scikit-learn, 'l1' and 'elasticnet' need a compatible
    # solver (e.g. 'saga') - confirm that Classifier configures one.
    if model_name == 'logistic_regression':
        penalty = trial.suggest_categorical('penalty', ['l1', 'l2', None, 'elasticnet'])
        if penalty == 'elasticnet':
            l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0, step=0.25)
        else:
            l1_ratio = None
        params['penalty'] = penalty
        params['l1_ratio'] = l1_ratio

    # Extra parameters for XGBoost, depending on the sampled booster.
    elif model_name == 'xgboost':
        if params["booster"] in ["gbtree", "dart"]:
            # maximum depth of the tree, signifies complexity of the tree.
            params["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
            # minimum child weight, larger the term more conservative the tree.
            params["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
            # `eta` is an alias of `learning_rate` in XGBoost; only sample it
            # when the base search space did not already provide one, so the
            # two never compete.
            if "learning_rate" not in params:
                params["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
            # defines how selective algorithm is.
            params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
            params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

        if params["booster"] == "dart":
            params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
            params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
            params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
            params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    model = Classifier(**params, model_type=model_name)
    # NOTE(review): cross_val_score clones and refits the estimator per fold,
    # so this full-data fit looks redundant - confirm before removing.
    model.fit(data)

    # Create a pipeline with just the classifier since feature prep is external
    pipeline = Pipeline([
        ('classifier', model)
    ])

    # Define the cross-validation strategy
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # Perform cross-validation and return the mean accuracy.
    # NOTE(review): no labels are passed here although StratifiedKFold
    # stratifies on `y` - verify that `data` works as the sole input.
    scores = cross_val_score(pipeline, data, cv=cv, scoring='accuracy')
    folds_scores.append(scores)
    return scores.mean()

def optimize_model(model_name, data, n_trials=50, timout=1200):
    """Run an Optuna study that maximises the cross-validated accuracy of a model.

    Args:
        model_name: Model identifier forwarded to ``objective``.
        data: Training data forwarded to ``objective``.
        n_trials: Maximum number of Optuna trials.
        timout: Time budget passed to ``study.optimize`` as ``timeout``
            (the misspelled name is kept so existing keyword callers still work).

    Returns:
        Tuple ``(best_params, best_value, folds_scores)`` where ``folds_scores``
        holds one per-fold score array for each trial that completed scoring.
    """
    folds_scores = []   # filled by `objective`, one entry per trial
    study = optuna.create_study(direction='maximize')
    study.optimize(
        lambda trial: objective(trial, model_name, data, folds_scores),
        n_trials=n_trials,
        timeout=timout,
    )

    best_params = study.best_params
    best_value = study.best_value

    print(f"Best hyperparameters for {model_name}: {best_params}")
    # The objective scores with accuracy, not F1, so label the value accordingly.
    print(f"Best accuracy for {model_name}: {best_value}")

    return best_params, best_value, folds_scores


# Optimize models

## Create Datasets

In [38]:
# Build four data loaders: BERT-style embeddings and tf-idf, each with and
# without augmentation. Both datasets share every setting except the
# augmentation ratio, and all loaders share everything except dataset/method.
embedder = Embedder()

_dataset_common = dict(
    data_path=DATA_PATH,
    subset=SUBSET,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,
    augmentation_methods=AUGMENTATION_METHODS,
    adversation_ratio=ADVERSATION_RATIO,
)

# A ratio of 0 disables augmentation for the baseline dataset.
data_without_augmentation = TextDataset(augmentation_ratio=0, **_dataset_common)
data_with_augmentation = TextDataset(augmentation_ratio=AUGMENTATION_RATIO, **_dataset_common)

_loader_common = dict(
    embedder=embedder,
    datashape='embedding',
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)

bert_embedding_no_augmentation_loader = get_dataloader(
    dataset=data_without_augmentation,
    embedding_method=EMBEDDING_METHOD,
    **_loader_common,
)

bert_embedding_with_augmentation_loader = get_dataloader(
    dataset=data_with_augmentation,
    embedding_method=EMBEDDING_METHOD,
    **_loader_common,
)

tfidf_embedding_no_augmentation_loader = get_dataloader(
    dataset=data_without_augmentation,
    embedding_method='tf-idf',
    **_loader_common,
)

tfidf_embedding_with_augmentation_loader = get_dataloader(
    dataset=data_with_augmentation,
    embedding_method='tf-idf',
    **_loader_common,
)

## Logistic Regression

In [None]:
# Tune logistic regression on each of the four loaders; results are stored
# one by one so finished runs survive a later failure.
lr_results = {}
for _run_name, _loader in (
    ('bert_without_augmentation', bert_embedding_no_augmentation_loader),
    ('bert_with_augmentation', bert_embedding_with_augmentation_loader),
    ('tfidf_without_augmentation', tfidf_embedding_no_augmentation_loader),
    ('tfidf_with_augmentation', tfidf_embedding_with_augmentation_loader),
):
    lr_results[_run_name] = optimize_model('logistic_regression', _loader)

In [None]:
# Show the collected logistic-regression results under a heading.
print("Logistic Regression results:\n", lr_results, sep="\n")

## SVM

In [47]:
# Tune the SVM on each of the four loaders; results are stored one by one so
# finished runs survive a later failure.
svm_results = {}
for _run_name, _loader in (
    ('bert_without_augmentation', bert_embedding_no_augmentation_loader),
    ('bert_with_augmentation', bert_embedding_with_augmentation_loader),
    ('tfidf_without_augmentation', tfidf_embedding_no_augmentation_loader),
    ('tfidf_with_augmentation', tfidf_embedding_with_augmentation_loader),
):
    svm_results[_run_name] = optimize_model('svm', _loader)

## XGBoost

In [53]:
# Tune XGBoost on each of the four loaders; results are stored one by one so
# finished runs survive a later failure.
xgb_results = {}
for _run_name, _loader in (
    ('bert_without_augmentation', bert_embedding_no_augmentation_loader),
    ('bert_with_augmentation', bert_embedding_with_augmentation_loader),
    ('tfidf_without_augmentation', tfidf_embedding_no_augmentation_loader),
    ('tfidf_with_augmentation', tfidf_embedding_with_augmentation_loader),
):
    xgb_results[_run_name] = optimize_model('xgboost', _loader)