# Imports and Setup

In [1]:
import optuna

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score


from classifiers import *
from dataset import EmbeddingDataset, get_dataloader
from embedder import Embedder
from Config.dataset_config import *

from torch.utils.data import Dataset, DataLoader

import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

from tqdm import tqdm

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\amita\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\amita\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amita\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Define optimizers

In [2]:
# Helper dataset
class HelperDataset(Dataset):
    def __init__(self, X, y):
        super(HelperDataset).__init__()
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, item):
        return self.X[item], self.y[item]

In [3]:
# Custom tqdm callback
class TqdmCallback:
    def __init__(self, n_trials):
        self.pbar = tqdm(total=n_trials)

    def __call__(self, study, trial):
        self.pbar.update(1)

    def close(self):
        self.pbar.close()

In [9]:
model_hyperparameters = {   # TODO: Add neural network hyperparameters
    'logistic_regression': {
        'learning_rate': (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform')
    },
    'svm': {
        'C': (1e-4, 1e2, 'loguniform'),
        'kernel': (['linear', 'poly', 'rbf', 'sigmoid'], 'categorical'),
        'degree': (2, 5, 'int'),
        'gamma': (['scale', 'auto'], 'categorical')
    },
    'xgboost': {
        'n_estimators': ([5, 100], 'int'),
        'learning_rate': ([1e-3, 1.0], 'loguniform'),
        'booster': (['gbtree', 'gblinear', 'dart'], 'categorical')
    },
    'dnn': {
        "num_epochs": ([2, 15], 'int'),  # Adjust after trial and error
        "learning_rate": ([1e-5, 1e-3], 'loguniform'),
        "batch_norm": ([True, False], 'categorical'),
        "drop_out": ([0.0, 0.5], 'uniform'),
        "layers": ([[768, 64, 3],
                    [768, 128, 3],
                    [768, 64, 64, 3],
                    [768, 128, 64, 3],
                    [768, 512, 32, 3],
                    [768, 512, 128, 3],
                    [768, 512, 128, 64, 3]], 'custom')  # Layer dimensions, including an input and output layer.
    }
}

def suggest_hyperparameters(trial, hyperparams):
    params = {}
    for key, value in hyperparams.items():
        if len(value) == 2 and value[1] == 'categorical':
            params[key] = trial.suggest_categorical(key, value[0])
        elif len(value) == 3:
            if value[2] == 'loguniform':
                params[key] = trial.suggest_float(key, value[0], value[1], log=True)
            elif value[2] == 'uniform':
                params[key] = trial.suggest_float(key, value[0], value[1])
            elif value[2] == 'int':
                params[key] = trial.suggest_int(key, value[0], value[1])
            elif value[2] == 'categorical':
                params[key] = trial.suggest_categorical(key, value[0])
            elif value[1] == 'custom':
                hidden_dims = params['hidden_dims']
                layer_count = len(hidden_dims)
                params[key] = trial.suggest_categorical(key, value[0][layer_count])
            else:
                raise ValueError(f"Hyperparameter tuple for {key} is not in the expected format: {value}")
    return params

def cross_validation(estimator, X, y, n_splits=10):
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    for i, (train_index, val_index) in enumerate(cv.split(X, y)):
        # Split to train and validation sets
        x_train, x_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        help_train_dataset = HelperDataset(x_train, y_train)
        help_val_dataset = HelperDataset(x_val, y_val)

        train_dataloader = DataLoader(help_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        val_dataloader = DataLoader(help_val_dataset, batch_size=BATCH_SIZE, shuffle=True)

        # Fit to the Classifier train and predict data type
        train = (train_dataloader, (x_train, y_train))
        val = (val_dataloader, (x_val, y_val))

        estimator.fit(train)
        pred = estimator.predict(val)
        score = f1_score(y_val, pred, average='micro')
        scores.append(score)
    return scores

# Define objective function for optuna. The function include all models, and should be called with the model name. The function optimize the Classifier class hyperparameters.
def objective(trial, model_name, X, y, folds_scores):
    params = suggest_hyperparameters(trial, model_hyperparameters[model_name])

    if model_name == 'logistic_regression':
        params['num_epochs'] = 1
        params['batch_norm'] = False
        params['drop_out'] = 0.0
        params['layers'] = [768, 3]

    # Add some more parameters for XGBoost
    if model_name == 'xgboost':
        if params["booster"] in ["gbtree", "dart"]:
            # maximum depth of the tree, signifies complexity of the tree.
            params["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
            # minimum child weight, larger the term more conservative the tree.
            params["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
            params["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
            # defines how selective algorithm is.
            params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
            params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

        if params["booster"] == "dart":
            params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
            params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
            params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
            params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)


    model = Classifier(params, model_type=model_name, log=False)

    # Create a pipeline with just the classifier since feature prep is external
    pipeline = Pipeline([
        ('classifier', model)
    ])

    # Perform cross validation
    scores = cross_validation(model, X, y, n_splits=10)

    folds_scores.append(scores)     # Save scores for statistic tests
    return np.mean(scores)

def optimize_model(model_name, X, y, n_trials=50, timout=1200):
    """
    The actual optimization.
    """
    folds_scores = []   # create a list to store the scores from each trial folds
    study = optuna.create_study(direction='maximize')
    progress_bar = TqdmCallback(n_trials)
    study.optimize(lambda trial: objective(trial, model_name, X, y, folds_scores), n_trials=n_trials, timeout=timout, callbacks=[progress_bar])
    # Close progress bar
    progress_bar.close()

    best_params = study.best_params
    best_value = study.best_value

    print(f"Best hyperparameters for {model_name}: {best_params}")
    print(f"Best F1 score for {model_name}: {best_value}")

    return best_params, best_value, folds_scores

# Optimize models

## Create Datasets

In [5]:
# Create 4 different datasets: embedding with and without augmentation, and tf-idf with and without augmentation.
embedder = Embedder()
bert_embedding_no_augmentation_data = EmbeddingDataset(
    data_path=DATA_PATH,
    subset=SUBSET,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,
    augmentation_ratio=0,
    augmentation_methods=AUGMENTATION_METHODS,
    adversation_ratio = ADVERSATION_RATIO,
    embedder=embedder,
    embedding_method='distilbert'
)

X_bert_no_augmentation, y_bert_no_augmentation = bert_embedding_no_augmentation_data.embeddings, bert_embedding_no_augmentation_data.labels

bert_embedding_with_augmentation_data = EmbeddingDataset(
    data_path=DATA_PATH,
    subset=SUBSET,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,
    augmentation_ratio=5,
    augmentation_methods=AUGMENTATION_METHODS,
    adversation_ratio = ADVERSATION_RATIO,
    embedder=embedder,
    embedding_method='distilbert'
)

X_bert_with_augmentation, y_bert_with_augmentation = bert_embedding_with_augmentation_data.embeddings, bert_embedding_with_augmentation_data.labels

tfidf_embedding_no_augmentation_data = EmbeddingDataset(
    data_path=DATA_PATH,
    subset=SUBSET,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,
    augmentation_ratio=0,
    augmentation_methods=AUGMENTATION_METHODS,
    adversation_ratio = ADVERSATION_RATIO,
    embedder=embedder,
    embedding_method='tf-idf'
)

X_tfidf_no_augmentation, y_tfidf_no_augmentation = tfidf_embedding_no_augmentation_data.embeddings, tfidf_embedding_no_augmentation_data.labels

tfidf_embedding_with_augmentation_data = EmbeddingDataset(
    data_path=DATA_PATH,
    subset=SUBSET,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,
    augmentation_ratio=5,
    augmentation_methods=AUGMENTATION_METHODS,
    adversation_ratio = ADVERSATION_RATIO,
    embedder=embedder,
    embedding_method='tf-idf'
)

X_tfidf_with_augmentation, y_tfidf_with_augmentation = tfidf_embedding_with_augmentation_data.embeddings, tfidf_embedding_with_augmentation_data.labels

[Dataset Status]: Loading the dataset...


Preprocessing comments: 100%|██████████| 6637/6637 [00:00<00:00, 27607.71it/s]


[Dataset Status]: No Augmentation was chosen (augmentation/ adversation ratio == 0 or no augmented_classes). Moving on...
[EmbeddingDataset]: Loading precomputed embeddings from C:\Users\amita\PycharmProjects\Israel-Palestine-Political-Affiliation-Text-Classification\Data\subset B_augmentation=0_embeddings_distilbert.pkl...
[Dataset Status]: Loading the dataset...


Preprocessing comments: 100%|██████████| 6637/6637 [00:00<00:00, 29965.96it/s]
Augmenting data: 100%|██████████| 6537/6537 [00:16<00:00, 398.76row/s]


[EmbeddingDataset]: Loading precomputed embeddings from C:\Users\amita\PycharmProjects\Israel-Palestine-Political-Affiliation-Text-Classification\Data\subset B_augmentation=5_embeddings_distilbert.pkl...
[Dataset Status]: Loading the dataset...


Preprocessing comments: 100%|██████████| 6637/6637 [00:00<00:00, 37508.84it/s]


[Dataset Status]: No Augmentation was chosen (augmentation/ adversation ratio == 0 or no augmented_classes). Moving on...
[EmbeddingDataset]: Loading precomputed embeddings from C:\Users\amita\PycharmProjects\Israel-Palestine-Political-Affiliation-Text-Classification\Data\subset B_augmentation=0_embeddings_tf-idf.pkl...
[Dataset Status]: Loading the dataset...


Preprocessing comments: 100%|██████████| 6637/6637 [00:00<00:00, 37395.36it/s]
Augmenting data: 100%|██████████| 6537/6537 [00:12<00:00, 505.05row/s]


[EmbeddingDataset]: Loading precomputed embeddings from C:\Users\amita\PycharmProjects\Israel-Palestine-Political-Affiliation-Text-Classification\Data\subset B_augmentation=5_embeddings_tf-idf.pkl...


## Logistic Regression

In [10]:
lr_results = {}
lr_results['bert_without_augmentation'] = optimize_model('logistic_regression', X_bert_no_augmentation, y_bert_no_augmentation)

[I 2025-01-21 21:49:16,949] A new study created in memory with name: no-name-3bdc8213-9c68-424d-84e9-53a4649ba7c5

  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-21 21:49:18,579] Trial 0 finished with value: 0.5276163648369556 and parameters: {'learning_rate': 0.0003002489113552147, 'weight_decay': 0.0002848277827833069}. Best is trial 0 with value: 0.5276163648369556.

  2%|▏         | 1/50 [00:01<01:19,  1.63s/it][A[I 2025-01-21 21:49:19,656] Trial 1 finished with value: 0.5340346366569726 and parameters: {'learning_rate': 0.0004935960792804718, 'weight_decay': 1.4086721855697989e-05}. Best is trial 1 with value: 0.5340346366569726.

  4%|▍         | 2/50 [00:02<01:02,  1.30s/it][A[I 2025-01-21 21:49:20,514] Trial 2 finished with value: 0.5046833949168973 and parameters: {'learning_rate': 1.536356067988419e-05, 'weight_decay': 0.0001960894451980013}. Best is trial 1 with value: 0.5340346366569726.

  6%|▌         | 3/50 [00:03<00:51,  1.10s/it][A[I 2025-01-21 21:49:21,369] Tr

Best hyperparameters for logistic_regression: {'learning_rate': 1.639303640706943e-05, 'weight_decay': 8.202639343676631e-05}
Best F1 score for logistic_regression: 0.5893959190937148





In [12]:
lr_results['bert_with_augmentation'] = optimize_model('logistic_regression', X_bert_with_augmentation, y_bert_with_augmentation)
lr_results['tfidf_without_augmentation'] = optimize_model('logistic_regression', X_tfidf_no_augmentation, y_tfidf_no_augmentation)
lr_results['tfidf_with_augmentation'] = optimize_model('logistic_regression', X_tfidf_with_augmentation, y_tfidf_with_augmentation)

[I 2025-01-21 21:58:20,985] A new study created in memory with name: no-name-7d3004a2-0ed9-447e-95d1-30247402c1e2

  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-21 21:58:24,483] Trial 0 finished with value: 0.3528695426375376 and parameters: {'learning_rate': 0.00015693529527601566, 'weight_decay': 5.552633583208091e-05}. Best is trial 0 with value: 0.3528695426375376.

  2%|▏         | 1/50 [00:03<02:51,  3.50s/it][A[I 2025-01-21 21:58:26,720] Trial 1 finished with value: 0.3463980195963023 and parameters: {'learning_rate': 0.0007350279741325184, 'weight_decay': 0.000456344870848001}. Best is trial 0 with value: 0.3528695426375376.

  4%|▍         | 2/50 [00:05<02:12,  2.76s/it][A[I 2025-01-21 21:58:28,877] Trial 2 finished with value: 0.35646065159436346 and parameters: {'learning_rate': 0.0009300364309336016, 'weight_decay': 0.0004383435616569376}. Best is trial 2 with value: 0.35646065159436346.

  6%|▌         | 3/50 [00:07<01:56,  2.48s/it][A[I 2025-01-21 21:58:31,248] T

Best hyperparameters for logistic_regression: {'learning_rate': 0.0002738246363978017, 'weight_decay': 1.3118316853159452e-05}
Best F1 score for logistic_regression: 0.35763694907038746



  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-21 22:01:01,519] Trial 0 finished with value: 0.5004160988334246 and parameters: {'learning_rate': 1.4760397445003843e-05, 'weight_decay': 2.072476988515809e-05}. Best is trial 0 with value: 0.5004160988334246.

  2%|▏         | 1/50 [00:00<00:42,  1.16it/s][A[I 2025-01-21 22:01:02,389] Trial 1 finished with value: 0.6796696966716776 and parameters: {'learning_rate': 0.0006196821549443317, 'weight_decay': 1.2644084276175046e-05}. Best is trial 1 with value: 0.6796696966716776.

  4%|▍         | 2/50 [00:01<00:41,  1.15it/s][A[I 2025-01-21 22:01:03,276] Trial 2 finished with value: 0.5689660985992667 and parameters: {'learning_rate': 0.00013239681433701453, 'weight_decay': 0.0003621976558031507}. Best is trial 1 with value: 0.6796696966716776.

  6%|▌         | 3/50 [00:02<00:41,  1.14it/s][A[I 2025-01-21 22:01:04,158] Trial 3 finished with value: 0.41955547438076907 and parameters: {'learning_rate': 7.693301785032692e-05, 'weight_

Best hyperparameters for logistic_regression: {'learning_rate': 0.00035128291989658535, 'weight_decay': 0.0004388332527791302}
Best F1 score for logistic_regression: 0.6805873620223762



  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-21 22:01:47,808] Trial 0 finished with value: 0.43273330332721666 and parameters: {'learning_rate': 2.765886029653541e-05, 'weight_decay': 0.0003162455167593957}. Best is trial 0 with value: 0.43273330332721666.

  2%|▏         | 1/50 [00:02<01:48,  2.22s/it][A[I 2025-01-21 22:01:50,043] Trial 1 finished with value: 0.4297309836235848 and parameters: {'learning_rate': 0.0006489292919329492, 'weight_decay': 0.00012014646053284168}. Best is trial 0 with value: 0.43273330332721666.

  4%|▍         | 2/50 [00:04<01:46,  2.23s/it][A[I 2025-01-21 22:01:52,285] Trial 2 finished with value: 0.37206266662050347 and parameters: {'learning_rate': 1.9496940669960404e-05, 'weight_decay': 0.00023508800862769862}. Best is trial 0 with value: 0.43273330332721666.

  6%|▌         | 3/50 [00:06<01:44,  2.23s/it][A[I 2025-01-21 22:01:54,525] Trial 3 finished with value: 0.4316733026347679 and parameters: {'learning_rate': 0.0004668006214031272, 'wei

Best hyperparameters for logistic_regression: {'learning_rate': 8.907800526083353e-05, 'weight_decay': 0.0005285481392211958}
Best F1 score for logistic_regression: 0.4349104663642973





In [13]:
print("Logistic Regression results:\n\n")
print(f"Using BERT embeddings without augmentation scores: {lr_results['bert_without_augmentation'][2]}")
print(f"Using BERT embeddings without augmentation best score: {lr_results['bert_without_augmentation'][1]}")
print(f"Using BERT embeddings without augmentation best parameters: {lr_results['bert_without_augmentation'][0]}\n\n")
print(f"Using BERT embeddings with augmentation scores: {lr_results['bert_with_augmentation'][2]}")
print(f"Using BERT embeddings with augmentation best score: {lr_results['bert_with_augmentation'][1]}")
print(f"Using BERT embeddings with augmentation best parameters: {lr_results['bert_with_augmentation'][0]}\n\n")
print(f"Using TF-IDF embeddings without augmentation scores: {lr_results['tfidf_without_augmentation'][2]}")
print(f"Using TF-IDF embeddings without augmentation best score: {lr_results['tfidf_without_augmentation'][1]}")
print(f"Using TF-IDF embeddings without augmentation best parameters: {lr_results['tfidf_without_augmentation'][0]}\n\n")
print(f"Using TF-IDF embeddings with augmentation scores: {lr_results['tfidf_with_augmentation'][2]}")
print(f"Using TF-IDF embeddings with augmentation best score: {lr_results['tfidf_with_augmentation'][1]}")
print(f"Using TF-IDF embeddings with augmentation best parameters: {lr_results['tfidf_with_augmentation'][0]}\n\n")

Logistic Regression results:


Using BERT embeddings without augmentation scores: [[0.5412844036697247, 0.5244648318042814, 0.5198776758409785, 0.5229357798165137, 0.5275229357798165, 0.5152905198776758, 0.5137614678899083, 0.5084226646248086, 0.5451761102603369, 0.557427258805513], [0.5214067278287462, 0.5351681957186545, 0.5305810397553516, 0.5458715596330275, 0.5764525993883792, 0.5305810397553516, 0.5137614678899083, 0.5176110260336907, 0.5084226646248086, 0.5604900459418071], [0.30428134556574926, 0.45718654434250766, 0.4877675840978593, 0.5275229357798165, 0.5504587155963303, 0.5535168195718655, 0.536697247706422, 0.5191424196018377, 0.5313935681470138, 0.5788667687595712], [0.45718654434250766, 0.5275229357798165, 0.5336391437308868, 0.5458715596330275, 0.5642201834862385, 0.5565749235474006, 0.5214067278287462, 0.5329249617151608, 0.5436447166921899, 0.5482388973966309], [0.5259938837920489, 0.5688073394495413, 0.5489296636085627, 0.5290519877675841, 0.5489296636085627, 0.54740

## SVM

In [14]:
svm_results = {}
svm_results['bert_without_augmentation'] = optimize_model('svm', X_bert_no_augmentation, y_bert_no_augmentation)

[I 2025-01-21 22:03:42,125] A new study created in memory with name: no-name-e7d8686d-d737-4da4-8661-077eac4273bb

  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-21 22:05:03,006] Trial 0 finished with value: 0.6801286464260459 and parameters: {'C': 0.0001056793472547752, 'kernel': 'sigmoid', 'degree': 3, 'gamma': 'scale'}. Best is trial 0 with value: 0.6801286464260459.

  2%|▏         | 1/50 [01:20<1:06:03, 80.88s/it][A[I 2025-01-21 22:06:19,737] Trial 1 finished with value: 0.6801286464260459 and parameters: {'C': 0.00022333376636132435, 'kernel': 'poly', 'degree': 2, 'gamma': 'scale'}. Best is trial 0 with value: 0.6801286464260459.

  4%|▍         | 2/50 [02:37<1:02:45, 78.44s/it][A[I 2025-01-21 22:07:05,551] Trial 2 finished with value: 0.8438137787955847 and parameters: {'C': 0.0003560135369092698, 'kernel': 'linear', 'degree': 4, 'gamma': 'scale'}. Best is trial 2 with value: 0.8438137787955847.

  6%|▌         | 3/50 [03:23<49:46, 63.54s/it]  [A[I 2025-01-21 22:07:48,95

Best hyperparameters for svm: {'C': 0.0003560135369092698, 'kernel': 'linear', 'degree': 4, 'gamma': 'scale'}
Best F1 score for svm: 0.8438137787955847





In [15]:
svm_results['bert_with_augmentation'] = optimize_model('svm', X_bert_with_augmentation, y_bert_with_augmentation)
svm_results['tfidf_without_augmentation'] = optimize_model('svm', X_tfidf_no_augmentation, y_tfidf_no_augmentation)
svm_results['tfidf_with_augmentation'] = optimize_model('svm', X_tfidf_with_augmentation, y_tfidf_with_augmentation)

[I 2025-01-21 22:25:19,509] A new study created in memory with name: no-name-94bea806-3901-4049-b252-f124a4afb4e1

  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-21 22:31:13,275] Trial 0 finished with value: 0.8001407056053734 and parameters: {'C': 0.0005557124134415947, 'kernel': 'linear', 'degree': 2, 'gamma': 'scale'}. Best is trial 0 with value: 0.8001407056053734.

  2%|▏         | 1/50 [05:53<4:48:54, 353.76s/it][A[I 2025-01-21 22:45:32,500] Trial 1 finished with value: 0.5062383408925666 and parameters: {'C': 0.000589949334947208, 'kernel': 'poly', 'degree': 5, 'gamma': 'scale'}. Best is trial 0 with value: 0.8001407056053734.

  4%|▍         | 2/50 [20:12<8:05:11, 606.50s/it][A
[I 2025-01-21 22:45:32,505] A new study created in memory with name: no-name-259a2ba7-9296-4c4b-b7e7-508a73dabc9a


Best hyperparameters for svm: {'C': 0.0005557124134415947, 'kernel': 'linear', 'degree': 2, 'gamma': 'scale'}
Best F1 score for svm: 0.8001407056053734



  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-21 22:46:33,015] Trial 0 finished with value: 0.6801286464260459 and parameters: {'C': 79.7971925806194, 'kernel': 'poly', 'degree': 3, 'gamma': 'auto'}. Best is trial 0 with value: 0.6801286464260459.

  2%|▏         | 1/50 [01:00<49:24, 60.51s/it][A[I 2025-01-21 22:47:40,813] Trial 1 finished with value: 0.6801286464260459 and parameters: {'C': 0.0003724160232412946, 'kernel': 'rbf', 'degree': 5, 'gamma': 'auto'}. Best is trial 0 with value: 0.6801286464260459.

  4%|▍         | 2/50 [02:08<51:50, 64.80s/it][A[I 2025-01-21 22:49:03,000] Trial 2 finished with value: 0.6801286464260459 and parameters: {'C': 0.20500688205834142, 'kernel': 'rbf', 'degree': 4, 'gamma': 'auto'}. Best is trial 0 with value: 0.6801286464260459.

  6%|▌         | 3/50 [03:30<56:58, 72.74s/it][A[I 2025-01-21 22:50:16,155] Trial 3 finished with value: 0.730456233521128 and parameters: {'C': 0.06090560165176679, 'kernel': 'linear', 'degree': 4, 'gamma': 'sc

Best hyperparameters for svm: {'C': 0.49503644907234334, 'kernel': 'sigmoid', 'degree': 4, 'gamma': 'scale'}
Best F1 score for svm: 0.7821658213561498



  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-21 23:14:57,168] Trial 0 finished with value: 0.7840745074957588 and parameters: {'C': 0.9560707529752526, 'kernel': 'linear', 'degree': 4, 'gamma': 'auto'}. Best is trial 0 with value: 0.7840745074957588.

  2%|▏         | 1/50 [08:21<6:49:38, 501.60s/it][A[I 2025-01-21 23:23:08,982] Trial 1 finished with value: 0.7827793857978741 and parameters: {'C': 0.866907409787206, 'kernel': 'linear', 'degree': 4, 'gamma': 'auto'}. Best is trial 0 with value: 0.7840745074957588.

  4%|▍         | 2/50 [16:33<6:36:40, 495.84s/it][A[I 2025-01-21 23:38:57,640] Trial 2 finished with value: 0.43502821729044766 and parameters: {'C': 0.7878594571168033, 'kernel': 'rbf', 'degree': 3, 'gamma': 'auto'}. Best is trial 0 with value: 0.7840745074957588.

  6%|▌         | 3/50 [32:22<8:27:05, 647.36s/it][A

Best hyperparameters for svm: {'C': 0.9560707529752526, 'kernel': 'linear', 'degree': 4, 'gamma': 'auto'}
Best F1 score for svm: 0.7840745074957588





In [16]:
print("SVM results:\n\n")
print(f"Using BERT embeddings without augmentation scores: {svm_results['bert_without_augmentation'][2]}")
print(f"Using BERT embeddings without augmentation best score: {svm_results['bert_without_augmentation'][1]}")
print(f"Using BERT embeddings without augmentation best parameters: {svm_results['bert_without_augmentation'][0]}\n\n")
print(f"Using BERT embeddings with augmentation scores: {svm_results['bert_with_augmentation'][2]}")
print(f"Using BERT embeddings with augmentation best score: {svm_results['bert_with_augmentation'][1]}")
print(f"Using BERT embeddings with augmentation best parameters: {svm_results['bert_with_augmentation'][0]}\n\n")
print(f"Using TF-IDF embeddings without augmentation scores: {svm_results['tfidf_without_augmentation'][2]}")
print(f"Using TF-IDF embeddings without augmentation best score: {svm_results['tfidf_without_augmentation'][1]}")
print(f"Using TF-IDF embeddings without augmentation best parameters: {svm_results['tfidf_without_augmentation'][0]}\n\n")
print(f"Using TF-IDF embeddings with augmentation scores: {svm_results['tfidf_with_augmentation'][2]}")
print(f"Using TF-IDF embeddings with augmentation best score: {svm_results['tfidf_with_augmentation'][1]}")
print(f"Using TF-IDF embeddings with augmentation best parameters: {svm_results['tfidf_with_augmentation'][0]}\n\n")


SVM results:


Using BERT embeddings without augmentation scores: [[0.6788990825688074, 0.6788990825688074, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6814701378254211, 0.6799387442572741, 0.6799387442572741], [0.6788990825688074, 0.6788990825688074, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6814701378254211, 0.6799387442572741, 0.6799387442572741], [0.845565749235474, 0.8532110091743119, 0.8394495412844036, 0.8532110091743119, 0.8425076452599388, 0.8379204892966361, 0.8241590214067278, 0.8453292496171516, 0.8529862174578867, 0.8437978560490046], [0.8470948012232415, 0.8532110091743119, 0.8394495412844036, 0.8501529051987767, 0.8394495412844036, 0.8379204892966361, 0.8241590214067278, 0.8453292496171516, 0.8545176110260337, 0.8453292496171516], [0.6788990825688074, 0.6788990825688074, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.680

## XGBoost

In [17]:
xgb_results = {}
xgb_results['bert_without_augmentation'] = optimize_model('xgboost', X_bert_no_augmentation, y_bert_no_augmentation)

[I 2025-01-21 23:38:57,661] A new study created in memory with name: no-name-95977fa0-2db4-4638-8c80-1e059a72647f

  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-21 23:49:38,235] Trial 0 finished with value: 0.8214800192946223 and parameters: {'booster': 'dart', 'max_depth': 7, 'min_child_weight': 7, 'eta': 0.0003039309394725608, 'gamma': 0.00011561856099347192, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 0.23597894465475702, 'skip_drop': 2.6927273011447618e-05}. Best is trial 0 with value: 0.8214800192946223.

  2%|▏         | 1/50 [10:40<8:43:08, 640.57s/it][A[I 2025-01-22 00:02:35,735] Trial 1 finished with value: 0.8413651881928151 and parameters: {'booster': 'dart', 'max_depth': 7, 'min_child_weight': 7, 'eta': 0.021095344741750326, 'gamma': 3.9120857204649044e-08, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 0.040842423024035804, 'skip_drop': 0.0015853552518449807}. Best is 

Best hyperparameters for xgboost: {'booster': 'dart', 'max_depth': 7, 'min_child_weight': 7, 'eta': 0.021095344741750326, 'gamma': 3.9120857204649044e-08, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 0.040842423024035804, 'skip_drop': 0.0015853552518449807}
Best F1 score for xgboost: 0.8413651881928151





In [18]:
xgb_results['bert_with_augmentation'] = optimize_model('xgboost', X_bert_with_augmentation, y_bert_with_augmentation)
xgb_results['tfidf_without_augmentation'] = optimize_model('xgboost', X_tfidf_no_augmentation, y_tfidf_no_augmentation)
xgb_results['tfidf_with_augmentation'] = optimize_model('xgboost', X_tfidf_with_augmentation, y_tfidf_with_augmentation)

[I 2025-01-22 00:02:35,748] A new study created in memory with name: no-name-52b9a333-5004-47ab-aa9b-bf23208103ac

  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-22 00:05:19,206] Trial 0 finished with value: 0.7781306650971159 and parameters: {'booster': 'gbtree', 'max_depth': 3, 'min_child_weight': 2, 'eta': 1.530201987912871e-07, 'gamma': 0.6105421880756111, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.7781306650971159.

  2%|▏         | 1/50 [02:43<2:13:29, 163.46s/it][A[I 2025-01-22 00:06:35,431] Trial 1 finished with value: 0.8234460409237269 and parameters: {'booster': 'gblinear'}. Best is trial 1 with value: 0.8234460409237269.

  4%|▍         | 2/50 [03:59<1:29:42, 112.14s/it][A[I 2025-01-22 00:31:18,263] Trial 2 finished with value: 0.43502821729044766 and parameters: {'booster': 'dart', 'max_depth': 7, 'min_child_weight': 10, 'eta': 1.777911040105423e-08, 'gamma': 5.325087413727096e-08, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type

Best hyperparameters for xgboost: {'booster': 'gblinear'}
Best F1 score for xgboost: 0.8234460409237269



  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-22 00:32:24,144] Trial 0 finished with value: 0.7674845806932015 and parameters: {'booster': 'gbtree', 'max_depth': 9, 'min_child_weight': 9, 'eta': 0.0002802047486933304, 'gamma': 6.513704429429651e-05, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.7674845806932015.

  2%|▏         | 1/50 [01:05<53:47, 65.87s/it][A[I 2025-01-22 00:33:19,099] Trial 1 finished with value: 0.7601388088848926 and parameters: {'booster': 'gbtree', 'max_depth': 7, 'min_child_weight': 2, 'eta': 0.0009668725597862803, 'gamma': 0.016763959241005454, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.7674845806932015.

  4%|▍         | 2/50 [02:00<47:33, 59.45s/it][A[I 2025-01-22 00:40:43,693] Trial 2 finished with value: 0.766410731931195 and parameters: {'booster': 'dart', 'max_depth': 9, 'min_child_weight': 7, 'eta': 4.939113527583739e-05, 'gamma': 0.00583518111631256, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_

Best hyperparameters for xgboost: {'booster': 'gbtree', 'max_depth': 9, 'min_child_weight': 9, 'eta': 0.0002802047486933304, 'gamma': 6.513704429429651e-05, 'grow_policy': 'lossguide'}
Best F1 score for xgboost: 0.7674845806932015



  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-22 00:53:43,223] Trial 0 finished with value: 0.6993282207526919 and parameters: {'booster': 'gbtree', 'max_depth': 9, 'min_child_weight': 5, 'eta': 0.0005093095769966941, 'gamma': 0.2755313681357525, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.6993282207526919.

  2%|▏         | 1/50 [02:13<1:49:06, 133.60s/it][A[I 2025-01-22 00:54:57,485] Trial 1 finished with value: 0.7797195582176367 and parameters: {'booster': 'gblinear'}. Best is trial 1 with value: 0.7797195582176367.

  4%|▍         | 2/50 [03:27<1:18:57, 98.70s/it] [A[I 2025-01-22 01:08:17,167] Trial 2 finished with value: 0.6038717930962849 and parameters: {'booster': 'dart', 'max_depth': 3, 'min_child_weight': 6, 'eta': 0.002200044207834882, 'gamma': 3.130789620824195e-05, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.18103649974028194, 'skip_drop': 8.493998683190917e-07}. Best is trial 1 with value: 0.77971

Best hyperparameters for xgboost: {'booster': 'gblinear'}
Best F1 score for xgboost: 0.7797784163694907





In [19]:
print("XGBoost results:\n\n")
print(f"Using BERT embeddings without augmentation scores: {xgb_results['bert_without_augmentation'][2]}")
print(f"Using BERT embeddings without augmentation best score: {xgb_results['bert_without_augmentation'][1]}")
print(f"Using BERT embeddings without augmentation best parameters: {xgb_results['bert_without_augmentation'][0]}\n\n")
print(f"Using BERT embeddings with augmentation scores: {xgb_results['bert_with_augmentation'][2]}")
print(f"Using BERT embeddings with augmentation best score: {xgb_results['bert_with_augmentation'][1]}")
print(f"Using BERT embeddings with augmentation best parameters: {xgb_results['bert_with_augmentation'][0]}\n\n")
print(f"Using TF-IDF embeddings without augmentation scores: {xgb_results['tfidf_without_augmentation'][2]}")
print(f"Using TF-IDF embeddings without augmentation best score: {xgb_results['tfidf_without_augmentation'][1]}")
print(f"Using TF-IDF embeddings without augmentation best parameters: {xgb_results['tfidf_without_augmentation'][0]}\n\n")
print(f"Using TF-IDF embeddings with augmentation scores: {xgb_results['tfidf_with_augmentation'][2]}")
print(f"Using TF-IDF embeddings with augmentation best score: {xgb_results['tfidf_with_augmentation'][1]}")
print(f"Using TF-IDF embeddings with augmentation best parameters: {xgb_results['tfidf_with_augmentation'][0]}\n\n")

XGBoost results:


Using BERT embeddings without augmentation scores: [[0.8195718654434251, 0.8211009174311926, 0.8058103975535168, 0.8333333333333334, 0.8042813455657493, 0.8149847094801224, 0.8363914373088684, 0.8147013782542113, 0.8238897396630934, 0.8407350689127105], [0.8440366972477065, 0.8409785932721713, 0.8348623853211009, 0.8608562691131498, 0.8348623853211009, 0.8394495412844036, 0.8302752293577982, 0.8376722817764165, 0.8499234303215927, 0.8407350689127105]]
Using BERT embeddings without augmentation best score: 0.8413651881928151
Using BERT embeddings without augmentation best parameters: {'booster': 'dart', 'max_depth': 7, 'min_child_weight': 7, 'eta': 0.021095344741750326, 'gamma': 3.9120857204649044e-08, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 0.040842423024035804, 'skip_drop': 0.0015853552518449807}


Using BERT embeddings with augmentation scores: [[0.7776470588235294, 0.7823529411764706, 0.7692760447321955, 0.77

## DNN

In [20]:
dnn_results = {}
dnn_results['bert_without_augmentation'] = optimize_model('dnn', X_bert_no_augmentation, y_bert_no_augmentation)
dnn_results['bert_with_augmentation'] = optimize_model('dnn', X_bert_with_augmentation, y_bert_with_augmentation)
dnn_results['tfidf_without_augmentation'] = optimize_model('dnn', X_tfidf_no_augmentation, y_tfidf_no_augmentation)
dnn_results['tfidf_with_augmentation'] = optimize_model('dnn', X_tfidf_with_augmentation, y_tfidf_with_augmentation)

[I 2025-01-22 01:11:58,946] A new study created in memory with name: no-name-4b0ac8b7-ed74-4a89-a46b-e182a7c894ae

  0%|          | 0/50 [00:00<?, ?it/s][A[W 2025-01-22 01:11:58,952] Trial 0 failed with parameters: {'batch_norm': True} because of the following error: KeyError('layers').
Traceback (most recent call last):
  File "C:\Users\amita\PycharmProjects\Israel-Palestine-Political-Affiliation-Text-Classification\venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\amita\AppData\Local\Temp\ipykernel_35216\3444861739.py", line 127, in <lambda>
    study.optimize(lambda trial: objective(trial, model_name, X, y, folds_scores), n_trials=n_trials, timeout=timout, callbacks=[progress_bar])
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\amita\AppData\Local\Temp\ipykernel_35216\3444861739.py", line 107, in objective
    model = Clas

KeyError: 'layers'

In [None]:
print("DNN results:\n\n")
print(f"Using BERT embeddings without augmentation scores: {dnn_results['bert_without_augmentation'][2]}")
print(f"Using BERT embeddings without augmentation best score: {dnn_results['bert_without_augmentation'][1]}")
print(f"Using BERT embeddings without augmentation best parameters: {dnn_results['bert_without_augmentation'][0]}\n\n")
print(f"Using BERT embeddings with augmentation scores: {dnn_results['bert_with_augmentation'][2]}")
print(f"Using BERT embeddings with augmentation best score: {dnn_results['bert_with_augmentation'][1]}")
print(f"Using BERT embeddings with augmentation best parameters: {dnn_results['bert_with_augmentation'][0]}\n\n")
print(f"Using TF-IDF embeddings without augmentation scores: {dnn_results['tfidf_without_augmentation'][2]}")
print(f"Using TF-IDF embeddings without augmentation best score: {dnn_results['tfidf_without_augmentation'][1]}")
print(f"Using TF-IDF embeddings without augmentation best parameters: {dnn_results['tfidf_without_augmentation'][0]}\n\n")
print(f"Using TF-IDF embeddings with augmentation scores: {dnn_results['tfidf_with_augmentation'][2]}")
print(f"Using TF-IDF embeddings with augmentation best score: {dnn_results['tfidf_with_augmentation'][1]}")
print(f"Using TF-IDF embeddings with augmentation best parameters: {dnn_results['tfidf_with_augmentation'][0]}\n\n")