# Imports and Setup

In [1]:
import os
print(os.getcwd())
os.chdir(r'C:\Users\shaha\Projects\Python Projects\Israel-Palestine-Political-Affiliation-Text-Classification')
print(os.getcwd())

c:\Users\shaha\Projects\Python Projects\Israel-Palestine-Political-Affiliation-Text-Classification\Analysis
C:\Users\shaha\Projects\Python Projects\Israel-Palestine-Political-Affiliation-Text-Classification


In [2]:
import optuna

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, f1_score
import pandas as pd


from classifiers import *
from dataset import EmbeddingDataset
from embedder import Embedder
from Config.dataset_config import *

from torch.utils.data import Dataset, DataLoader

import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\shaha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\shaha\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shaha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Define optimizers

In [3]:
# Helper dataset
class HelperDataset(Dataset):
    def __init__(self, X, y):
        super(HelperDataset).__init__()
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, item):
        return self.X[item], self.y[item]

In [4]:
# Custom tqdm callback
class TqdmCallback:
    def __init__(self, n_trials):
        self.pbar = tqdm(total=n_trials)

    def __call__(self, study, trial):
        self.pbar.update(1)

    def close(self):
        self.pbar.close()

In [5]:
model_hyperparameters = {
    'logistic_regression': {
        'num_epochs': (5, 20, 'int'),
        'learning_rate': (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform')
    },
    'svm': {
        'C': (1e-4, 1e2, 'loguniform'),
        'kernel': (['linear', 'rbf', 'sigmoid'], 'categorical'),
        'degree': (2, 4, 'int'),
        'gamma': (['scale', 'auto'], 'categorical')
    },
    'xgboost': {
        'n_estimators': (50, 200, 'int'),
        'learning_rate': (1e-3, 0.2, 'loguniform'),
        'booster': (['gbtree', 'gblinear', 'dart'], 'categorical'),
        'max_depth': (3, 10, 'int'),
        'min_child_weight': (2, 10, 'int'),
        'colsample_bytree': (0.5, 1.0, 'uniform'),
        'subsample': (0.5, 1.0, 'uniform'),
        'reg_alpha': (1e-8, 10.0, 'loguniform'),
        'reg_lambda': (1e-8, 10.0, 'loguniform'),
        'gamma': (1e-8, 1.0, 'loguniform')
    },
    'dnn': {
        "num_epochs": (5, 20, 'int'),  # Adjust after trial and error
        "learning_rate": (1e-5, 1e-3, 'loguniform'),
        'weight_decay': (1e-5, 1e-3, 'loguniform'),
        "batch_norm": ([True, False], 'categorical'),
        "drop_out": (0.0, 0.5, 'uniform'),
        "layers": ([[768, 64, 3],
                    [768, 128, 3],
                    [768, 64, 64, 3],
                    [768, 128, 64, 3],
                    [768, 512, 32, 3],
                    [768, 512, 128, 3],
                    [768, 512, 128, 64, 3]], 'categorical')  # Layer dimensions, including an input and output layer.
    }
}

def suggest_hyperparameters(trial, hyperparams):
    params = {}
    for key, value in hyperparams.items():
        if len(value) == 2 and value[1] == 'categorical':
            params[key] = trial.suggest_categorical(key, value[0])
        elif len(value) == 3:
            if value[2] == 'loguniform':
                params[key] = trial.suggest_float(key, value[0], value[1], log=True)
            elif value[2] == 'uniform':
                params[key] = trial.suggest_float(key, value[0], value[1])
            elif value[2] == 'int':
                params[key] = trial.suggest_int(key, value[0], value[1])
            elif value[2] == 'categorical':
                params[key] = trial.suggest_categorical(key, value[0])
        else:
            raise ValueError(f"Hyperparameter tuple for {key} is not in the expected format: {value}")
    return params

def cross_validation(estimator, X, y, n_splits=5):
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    for i, (train_index, val_index) in enumerate(cv.split(X, y)):
        # Split to train and validation sets
        x_train, x_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        help_train_dataset = HelperDataset(x_train, y_train)
        help_val_dataset = HelperDataset(x_val, y_val)

        train_dataloader = DataLoader(help_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        val_dataloader = DataLoader(help_val_dataset, batch_size=BATCH_SIZE, shuffle=True)

        # Fit to the Classifier train and predict data type
        train = (train_dataloader, (x_train, y_train))
        val = (val_dataloader, (x_val, y_val))

        estimator.fit(train)
        pred = estimator.predict(val)
        score = f1_score(y_val, pred, average='micro')
        scores.append(score)
    return scores

# Define objective function for optuna. The function include all models, and should be called with the model name. The function optimize the Classifier class hyperparameters.
def objective(trial, model_name, X, y, folds_scores):
        # Use suggest_hyperparameters to handle standard parameters
    params = suggest_hyperparameters(trial, model_hyperparameters[model_name])

    # Add unique overrides for specific models
    if model_name == 'logistic_regression':
        params['batch_norm'] = False
        params['drop_out'] = 0.0
        params['layers'] = [768, 3]

    # Add unique parameters for XGBoost (booster-specific) based on booster
    if model_name == 'xgboost':
        if params["booster"] in ["gbtree", "dart"]:
            params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
            params["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.5, 1.0)
            params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
            params["max_depth"] = trial.suggest_int("max_depth", 3, 10)
            params["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
            params["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)

        if params["booster"] == "dart":
            params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
            params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
            params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
            params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)


    model = Classifier(params, model_type=model_name, log=False)

    # Perform cross validation
    scores = cross_validation(model, X, y)

    folds_scores.append(scores)     # Save scores for statistic tests
    return np.mean(scores)

def optimize_model(model_name, X, y, n_trials=10, timout=36000):
    """
    The actual optimization.
    """
    folds_scores = []   # create a list to store the scores from each trial folds
    study = optuna.create_study(direction='maximize')
    progress_bar = TqdmCallback(n_trials)
    study.optimize(lambda trial: objective(trial, model_name, X, y, folds_scores), n_trials=n_trials, timeout=timout, callbacks=[progress_bar])
    # Close progress bar
    progress_bar.close()

    best_params = study.best_params
    best_value = study.best_value
    attempt_stats = [(np.mean(scores), np.std(scores, ddof=1), scores) for scores in folds_scores]
    best_attempt = sorted(attempt_stats, key=lambda x: (-x[0], x[1]))[0]

    n = len(best_attempt[2])
    z = 1.96  # For 95% confidence interval

    # Calculate margin of error
    margin_of_error = z * (best_attempt[1] / np.sqrt(n))
    print(f"Best hyperparameters for {model_name}: {best_params}")
    print(f"Best F1 score for {model_name}: {best_value}")
    print(f"Attempts Stats: Avg: {best_attempt[0]}, Margin: +-{margin_of_error}, STD: {best_attempt[1]}, Scores: {best_attempt[2]}")
    

    return best_params, best_value, margin_of_error, best_attempt[1], best_attempt[2]

# Optimize models

## Create Datasets

In [None]:
'''
Create 6 different datasets:
augmented dataset - with distilbert embedding or tfidf, 
undersampled balanced dataset - with distilbert embedding or tfidf,
regular dataset - with distilbert embedding or tfidf
'''
bert_embedding_no_augmentation_data = EmbeddingDataset(
    data_path=DATA_PATH,
    subset=SUBSET,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,
    augmentation_ratio=0,
    augmentation_methods=AUGMENTATION_METHODS,
    adversation_ratio = ADVERSATION_RATIO,
    undersampling_targets={},
    embedder=Embedder(),
    embedding_method='distilbert'
)

bert_embedding_undersampled_data = EmbeddingDataset(
    data_path=DATA_PATH,
    subset=SUBSET,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,
    augmentation_ratio=0,
    augmentation_methods=AUGMENTATION_METHODS,
    adversation_ratio = ADVERSATION_RATIO,
    undersampling_targets=UNDERSAMPLING_TARGETS,
    embedder=Embedder(),
    embedding_method='distilbert'
)

bert_embedding_with_augmentation_data = EmbeddingDataset(
    data_path=DATA_PATH,
    subset=SUBSET,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,
    augmentation_ratio=3,
    augmentation_methods=AUGMENTATION_METHODS,
    adversation_ratio = ADVERSATION_RATIO,
    undersampling_targets={},
    embedder=Embedder(),
    embedding_method='distilbert'
)

tfidf_embedding_no_augmentation_data = EmbeddingDataset(
    data_path=DATA_PATH,
    subset=SUBSET,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,
    augmentation_ratio=0,
    augmentation_methods=AUGMENTATION_METHODS,
    adversation_ratio = ADVERSATION_RATIO,
    undersampling_targets={},
    embedder=Embedder(),
    embedding_method='tf-idf'
)

tfidf_embedding_undersampled_data = EmbeddingDataset(
    data_path=DATA_PATH,
    subset=SUBSET,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,
    augmentation_ratio=0,
    augmentation_methods=AUGMENTATION_METHODS,
    adversation_ratio = ADVERSATION_RATIO,
    undersampling_targets=UNDERSAMPLING_TARGETS,
    embedder=Embedder(),
    embedding_method='tf-idf'
)

tfidf_embedding_with_augmentation_data = EmbeddingDataset(
    data_path=DATA_PATH,
    subset=SUBSET,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,
    augmentation_ratio=3,
    augmentation_methods=AUGMENTATION_METHODS,
    adversation_ratio = ADVERSATION_RATIO,
    undersampling_targets={},
    embedder=Embedder(),
    embedding_method='tf-idf'
)

# Get X,y
X_bert_no_augmentation, y_bert_no_augmentation = bert_embedding_no_augmentation_data.embeddings, bert_embedding_no_augmentation_data.labels
X_bert_undersampled, y_bert_undersampled = bert_embedding_undersampled_data.embeddings, bert_embedding_undersampled_data.labels
X_bert_with_augmentation, y_bert_with_augmentation = bert_embedding_with_augmentation_data.embeddings, bert_embedding_with_augmentation_data.labels
X_tfidf_no_augmentation, y_tfidf_no_augmentation = tfidf_embedding_no_augmentation_data.embeddings, tfidf_embedding_no_augmentation_data.labels
X_tfidf_undersampled, y_tfidf_undersampled = tfidf_embedding_undersampled_data.embeddings, tfidf_embedding_undersampled_data.labels
X_tfidf_with_augmentation, y_tfidf_with_augmentation = tfidf_embedding_with_augmentation_data.embeddings, tfidf_embedding_with_augmentation_data.labels

[Dataset Status]: Loading the dataset...


Preprocessing comments: 100%|██████████| 30137/30137 [00:01<00:00, 29560.78it/s]


dataset size:  29705
[Dataset Status]: No Augmentation was chosen (augmentation/ adversation ratio == 0 or no augmented_classes). Moving on...
[EmbeddingDataset]: Loading precomputed embeddings from C:\Users\shaha\Projects\Python Projects\Israel-Palestine-Political-Affiliation-Text-Classification\Data\subset TRAIN_augmentation=0_embeddings_distilbert.pkl...
[Dataset Status]: Loading the dataset...
[Dataset Status]: Undersampeling the dataset...


Preprocessing comments: 100%|██████████| 14800/14800 [00:00<00:00, 21999.97it/s]


dataset size:  14681
[Dataset Status]: No Augmentation was chosen (augmentation/ adversation ratio == 0 or no augmented_classes). Moving on...
[EmbeddingDataset]: Loading precomputed embeddings from C:\Users\shaha\Projects\Python Projects\Israel-Palestine-Political-Affiliation-Text-Classification\Data\subset TRAIN_undersampled_embeddings_distilbert.pkl...
[Dataset Status]: Loading the dataset...


Preprocessing comments: 100%|██████████| 30137/30137 [00:01<00:00, 28115.81it/s]


dataset size:  29705


Augmenting data:   0%|          | 0/29705 [00:00<?, ?row/s]

## Logistic Regression

In [30]:
lr_results = {}
lr_results['bert_without_augmentation'] = optimize_model('logistic_regression', X_bert_no_augmentation, y_bert_no_augmentation)

[I 2025-01-24 14:30:18,611] A new study created in memory with name: no-name-5a989953-dd4b-4726-8b5a-d79b67de0725






[A[A[A[A[A[A[I 2025-01-24 14:30:49,500] Trial 0 finished with value: 0.33437908292914587 and parameters: {'num_epochs': 18, 'learning_rate': 0.0001253244019953191, 'weight_decay': 0.00019306404323714608}. Best is trial 0 with value: 0.33437908292914587.






[A[A[A[A[A[A[W 2025-01-24 14:31:08,289] Trial 1 failed with parameters: {'num_epochs': 12, 'learning_rate': 0.0002740604354304378, 'weight_decay': 0.0006010989363749924} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\shaha\Projects\Python Projects\Israel-Palestine-Political-Affiliation-Text-Classification\venv\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\shaha\AppData\Local\Temp\ipykernel_11516\1215731394.py", line 121, in <lambda>
    study.optimize(lambda trial: objec

KeyboardInterrupt: 

In [21]:
lr_results['bert_with_undersampling'] = optimize_model('logistic_regression', X_bert_undersampled, y_bert_undersampled)

[I 2025-01-24 13:52:04,417] A new study created in memory with name: no-name-eaba14b0-629f-4861-ab7f-f1dcb321dce1





[A[A[A[A[A[I 2025-01-24 13:52:15,434] Trial 0 finished with value: 0.3323978621546794 and parameters: {'num_epochs': 9, 'learning_rate': 0.00032110600290645734, 'weight_decay': 0.00019581136615241164}. Best is trial 0 with value: 0.3323978621546794.





[A[A[A[A[A[I 2025-01-24 13:52:34,354] Trial 1 finished with value: 0.33367457009390594 and parameters: {'num_epochs': 16, 'learning_rate': 0.0009298005070937324, 'weight_decay': 4.3015582155523136e-05}. Best is trial 1 with value: 0.33367457009390594.





[A[A[A[A[A[I 2025-01-24 13:52:54,885] Trial 2 finished with value: 0.33523652991258024 and parameters: {'num_epochs': 19, 'learning_rate': 0.00039844935048623454, 'weight_decay': 3.2260911396490585e-05}. Best is trial 2 with value: 0.33523652991258024.





[A[A[A[A[A[I 2025-01-24 13:53:07,001] Trial 3 finished with value: 0.33126628010689796 and 

Best hyperparameters for logistic_regression: {'num_epochs': 19, 'learning_rate': 3.896232888555642e-05, 'weight_decay': 1.037622336324651e-05}
Best F1 score for logistic_regression: 0.33700093949826976
Attempts Stats: Avg: 0.33700093949826976, Margin: +-0.007789072358673653, STD: 0.00888617105900965, Scores: [0.3292349832958081, 0.3501651667563103, 0.3282524424356439, 0.33845135123776643, 0.33890075376582024]





In [22]:
lr_results['bert_with_augmentation'] = optimize_model('logistic_regression', X_bert_with_augmentation, y_bert_with_augmentation)

[I 2025-01-24 13:55:21,033] A new study created in memory with name: no-name-65bba1c6-3157-4978-88ed-25a4e0b5e8ee





[A[A[A[A[A[I 2025-01-24 13:56:34,157] Trial 0 finished with value: 0.33099646602184657 and parameters: {'num_epochs': 14, 'learning_rate': 3.5303152403258176e-05, 'weight_decay': 0.0007984387593372948}. Best is trial 0 with value: 0.33099646602184657.





[A[A[A[A[A[I 2025-01-24 13:57:39,430] Trial 1 finished with value: 0.33709822922674665 and parameters: {'num_epochs': 13, 'learning_rate': 0.00032405256036654615, 'weight_decay': 0.00015860752822922895}. Best is trial 1 with value: 0.33709822922674665.





[A[A[A[A[A[I 2025-01-24 13:58:25,712] Trial 2 finished with value: 0.3326499984888184 and parameters: {'num_epochs': 9, 'learning_rate': 4.5787693431730284e-05, 'weight_decay': 0.0007112206304118288}. Best is trial 1 with value: 0.33709822922674665.





[A[A[A[A[A[I 2025-01-24 13:58:57,217] Trial 3 finished with value: 0.3331687919273835 and p

Best hyperparameters for logistic_regression: {'num_epochs': 13, 'learning_rate': 0.00032405256036654615, 'weight_decay': 0.00015860752822922895}
Best F1 score for logistic_regression: 0.33709822922674665
Attempts Stats: Avg: 0.33709822922674665, Margin: +-0.004013152806588234, STD: 0.004578409428380284, Scores: [0.3415232088554781, 0.3308273588875908, 0.3343300086455434, 0.3412266875406123, 0.3375838822045088]





In [23]:
lr_results['tfidf_without_augmentation'] = optimize_model('logistic_regression', X_tfidf_no_augmentation, y_tfidf_no_augmentation)

[I 2025-01-24 14:06:46,093] A new study created in memory with name: no-name-f33442a1-3c8c-467e-8245-85f86b0e0819





[A[A[A[A[A[I 2025-01-24 14:07:01,565] Trial 0 finished with value: 0.28766560061658886 and parameters: {'num_epochs': 6, 'learning_rate': 1.4443316826286925e-05, 'weight_decay': 0.0005229163977712571}. Best is trial 0 with value: 0.28766560061658886.





[A[A[A[A[A[I 2025-01-24 14:07:48,623] Trial 1 finished with value: 0.2722726626377209 and parameters: {'num_epochs': 18, 'learning_rate': 1.6197321680481613e-05, 'weight_decay': 3.8828499781522735e-05}. Best is trial 0 with value: 0.28766560061658886.





[A[A[A[A[A[I 2025-01-24 14:08:01,649] Trial 2 finished with value: 0.27248628784075735 and parameters: {'num_epochs': 5, 'learning_rate': 1.706890317933938e-05, 'weight_decay': 0.0005507275335652247}. Best is trial 0 with value: 0.28766560061658886.





[A[A[A[A[A[I 2025-01-24 14:08:42,024] Trial 3 finished with value: 0.27317240298740947 and pa

Best hyperparameters for logistic_regression: {'num_epochs': 6, 'learning_rate': 1.4443316826286925e-05, 'weight_decay': 0.0005229163977712571}
Best F1 score for logistic_regression: 0.28766560061658886
Attempts Stats: Avg: 0.28766560061658886, Margin: +-0.02270259293349784, STD: 0.025900276053473233, Scores: [0.33206216242683645, 0.28938605984770654, 0.2727665139016193, 0.27195658020845564, 0.2721566866983264]





In [24]:
lr_results['tfidf_with_undersampling'] = optimize_model('logistic_regression', X_tfidf_undersampled, y_tfidf_undersampled)

[I 2025-01-24 14:11:45,072] A new study created in memory with name: no-name-e66da7a4-a431-47a4-8e7b-d0aab8d3f164





[A[A[A[A[A[I 2025-01-24 14:11:51,448] Trial 0 finished with value: 0.3019900301620567 and parameters: {'num_epochs': 5, 'learning_rate': 0.0005289905367360097, 'weight_decay': 4.3887736593297426e-05}. Best is trial 0 with value: 0.3019900301620567.





[A[A[A[A[A[I 2025-01-24 14:12:03,197] Trial 1 finished with value: 0.31787164889941016 and parameters: {'num_epochs': 9, 'learning_rate': 0.0007220239846153542, 'weight_decay': 5.561756641519532e-05}. Best is trial 1 with value: 0.31787164889941016.





[A[A[A[A[A[I 2025-01-24 14:12:19,144] Trial 2 finished with value: 0.30317338051739995 and parameters: {'num_epochs': 12, 'learning_rate': 0.00040668719844268034, 'weight_decay': 0.0008325706652152355}. Best is trial 1 with value: 0.31787164889941016.





[A[A[A[A[A[I 2025-01-24 14:12:32,292] Trial 3 finished with value: 0.2924969799151785 and param

Best hyperparameters for logistic_regression: {'num_epochs': 9, 'learning_rate': 0.0007220239846153542, 'weight_decay': 5.561756641519532e-05}
Best F1 score for logistic_regression: 0.31787164889941016
Attempts Stats: Avg: 0.31787164889941016, Margin: +-0.017527123170772405, STD: 0.01999583615298916, Scores: [0.2833295459991065, 0.3201032087554124, 0.3299962984800231, 0.33298728648110343, 0.32294190478140555]





In [25]:
lr_results['tfidf_with_augmentation'] = optimize_model('logistic_regression', X_tfidf_with_augmentation, y_tfidf_with_augmentation)

[I 2025-01-24 14:13:50,931] A new study created in memory with name: no-name-14a71f0d-bbd1-4edc-be19-83b3bf7d0897





[A[A[A[A[A[I 2025-01-24 14:15:07,077] Trial 0 finished with value: 0.32281445995555674 and parameters: {'num_epochs': 17, 'learning_rate': 0.0003085659201290591, 'weight_decay': 0.0006344870613880225}. Best is trial 0 with value: 0.32281445995555674.





[A[A[A[A[A[I 2025-01-24 14:15:49,108] Trial 1 finished with value: 0.231657257038211 and parameters: {'num_epochs': 9, 'learning_rate': 1.2990685304362665e-05, 'weight_decay': 0.0009960478464434517}. Best is trial 0 with value: 0.32281445995555674.





[A[A[A[A[A[I 2025-01-24 14:17:20,575] Trial 2 finished with value: 0.31817720226835877 and parameters: {'num_epochs': 19, 'learning_rate': 8.834364155774411e-05, 'weight_decay': 5.832453571365449e-05}. Best is trial 0 with value: 0.32281445995555674.





[A[A[A[A[A[I 2025-01-24 14:17:46,059] Trial 3 finished with value: 0.32645279120821546 and para

Best hyperparameters for logistic_regression: {'num_epochs': 5, 'learning_rate': 0.0008379698067225026, 'weight_decay': 0.0001762009409032115}
Best F1 score for logistic_regression: 0.32645279120821546
Attempts Stats: Avg: 0.32645279120821546, Margin: +-0.004713258585313571, STD: 0.005377125812395733, Scores: [0.32218714991307257, 0.31980961375264094, 0.3316365359075722, 0.3270717841526088, 0.33155887231518294]





In [None]:
print("Logistic Regression results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Avg': value[1],
        'Margin': value[2],
        'STD': value[3],
        'Scores': value[4]
    }
    for key, value in lr_results.items()
])
display(df)


Logistic Regression results:




Unnamed: 0,Experiment,Best Parameters,Best Avg,Margin,STD,Scores
0,bert_without_augmentation,"{'num_epochs': 19, 'learning_rate': 0.00031630...",0.339557,0.004255,0.004854,"[0.33497629314953326, 0.33842494294603287, 0.3..."
1,bert_with_undersampling,"{'num_epochs': 19, 'learning_rate': 3.89623288...",0.337001,0.007789,0.008886,"[0.3292349832958081, 0.3501651667563103, 0.328..."
2,bert_with_augmentation,"{'num_epochs': 13, 'learning_rate': 0.00032405...",0.337098,0.004013,0.004578,"[0.3415232088554781, 0.3308273588875908, 0.334..."
3,tfidf_without_augmentation,"{'num_epochs': 6, 'learning_rate': 1.444331682...",0.287666,0.022703,0.0259,"[0.33206216242683645, 0.28938605984770654, 0.2..."
4,tfidf_with_undersampling,"{'num_epochs': 9, 'learning_rate': 0.000722023...",0.317872,0.017527,0.019996,"[0.2833295459991065, 0.3201032087554124, 0.329..."
5,tfidf_with_augmentation,"{'num_epochs': 5, 'learning_rate': 0.000837969...",0.326453,0.004713,0.005377,"[0.32218714991307257, 0.31980961375264094, 0.3..."


## SVM

In [14]:
svm_results = {}
svm_results['bert_without_augmentation'] = optimize_model('svm', X_bert_no_augmentation, y_bert_no_augmentation)

[I 2025-01-22 17:34:11,376] A new study created in memory with name: no-name-d8f1fc7f-7d31-418f-880a-294454c62a9c
  0%|          | 0/100 [00:00<?, ?it/s][I 2025-01-22 17:35:33,977] Trial 0 finished with value: 0.805571790512853 and parameters: {'C': 15.81137337626578, 'kernel': 'sigmoid', 'degree': 2, 'gamma': 'auto'}. Best is trial 0 with value: 0.805571790512853.
  1%|          | 1/100 [01:22<2:16:17, 82.60s/it][I 2025-01-22 17:37:11,083] Trial 1 finished with value: 0.8364712851998071 and parameters: {'C': 6.318287384728119, 'kernel': 'rbf', 'degree': 3, 'gamma': 'auto'}. Best is trial 1 with value: 0.8364712851998071.
  2%|▏         | 2/100 [02:59<2:28:50, 91.13s/it][I 2025-01-22 17:38:58,817] Trial 2 finished with value: 0.801437262036894 and parameters: {'C': 0.20890059254271676, 'kernel': 'linear', 'degree': 2, 'gamma': 'auto'}. Best is trial 1 with value: 0.8364712851998071.
  3%|▎         | 3/100 [04:47<2:39:35, 98.71s/it][I 2025-01-22 17:40:08,544] Trial 3 finished with value

In [None]:
svm_results['bert_with_undersampling'] = optimize_model('svm', X_bert_undersampled, y_bert_undersampled)

In [7]:
svm_results['bert_with_augmentation'] = optimize_model('svm', X_bert_with_augmentation, y_bert_with_augmentation)

[I 2025-01-22 20:11:10,140] A new study created in memory with name: no-name-713591f1-f6c1-42c3-a133-8fe7fba753a0
  0%|          | 0/100 [00:00<?, ?it/s][I 2025-01-22 20:24:02,959] Trial 0 finished with value: 0.7960212928019943 and parameters: {'C': 0.0003197872629113399, 'kernel': 'linear', 'degree': 5, 'gamma': 'auto'}. Best is trial 0 with value: 0.7960212928019943.
  1%|          | 1/100 [12:52<21:15:09, 772.82s/it][I 2025-01-22 20:36:33,379] Trial 1 finished with value: 0.926730360419624 and parameters: {'C': 69.0221340142107, 'kernel': 'rbf', 'degree': 2, 'gamma': 'auto'}. Best is trial 1 with value: 0.926730360419624.
  2%|▏         | 2/100 [25:23<20:40:44, 759.64s/it][I 2025-01-22 20:47:45,839] Trial 2 finished with value: 0.7039768722085655 and parameters: {'C': 42.13385194789783, 'kernel': 'sigmoid', 'degree': 2, 'gamma': 'scale'}. Best is trial 1 with value: 0.926730360419624.
  3%|▎         | 3/100 [36:35<19:23:43, 719.83s/it][I 2025-01-22 21:02:22,391] Trial 3 finished wi

KeyboardInterrupt: 

In [None]:
svm_results['tfidf_without_augmentation'] = optimize_model('svm', X_tfidf_no_augmentation, y_tfidf_no_augmentation)

In [None]:
svm_results['tfidf_with_undersampling'] = optimize_model('svm', X_tfidf_undersampled, y_tfidf_undersampled)

In [15]:
svm_results['tfidf_with_augmentation'] = optimize_model('svm', X_tfidf_with_augmentation, y_tfidf_with_augmentation)

[I 2025-01-21 22:25:19,509] A new study created in memory with name: no-name-94bea806-3901-4049-b252-f124a4afb4e1

  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-21 22:31:13,275] Trial 0 finished with value: 0.8001407056053734 and parameters: {'C': 0.0005557124134415947, 'kernel': 'linear', 'degree': 2, 'gamma': 'scale'}. Best is trial 0 with value: 0.8001407056053734.

  2%|▏         | 1/50 [05:53<4:48:54, 353.76s/it][A[I 2025-01-21 22:45:32,500] Trial 1 finished with value: 0.5062383408925666 and parameters: {'C': 0.000589949334947208, 'kernel': 'poly', 'degree': 5, 'gamma': 'scale'}. Best is trial 0 with value: 0.8001407056053734.

  4%|▍         | 2/50 [20:12<8:05:11, 606.50s/it][A
[I 2025-01-21 22:45:32,505] A new study created in memory with name: no-name-259a2ba7-9296-4c4b-b7e7-508a73dabc9a


Best hyperparameters for svm: {'C': 0.0005557124134415947, 'kernel': 'linear', 'degree': 2, 'gamma': 'scale'}
Best F1 score for svm: 0.8001407056053734



  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-21 22:46:33,015] Trial 0 finished with value: 0.6801286464260459 and parameters: {'C': 79.7971925806194, 'kernel': 'poly', 'degree': 3, 'gamma': 'auto'}. Best is trial 0 with value: 0.6801286464260459.

  2%|▏         | 1/50 [01:00<49:24, 60.51s/it][A[I 2025-01-21 22:47:40,813] Trial 1 finished with value: 0.6801286464260459 and parameters: {'C': 0.0003724160232412946, 'kernel': 'rbf', 'degree': 5, 'gamma': 'auto'}. Best is trial 0 with value: 0.6801286464260459.

  4%|▍         | 2/50 [02:08<51:50, 64.80s/it][A[I 2025-01-21 22:49:03,000] Trial 2 finished with value: 0.6801286464260459 and parameters: {'C': 0.20500688205834142, 'kernel': 'rbf', 'degree': 4, 'gamma': 'auto'}. Best is trial 0 with value: 0.6801286464260459.

  6%|▌         | 3/50 [03:30<56:58, 72.74s/it][A[I 2025-01-21 22:50:16,155] Trial 3 finished with value: 0.730456233521128 and parameters: {'C': 0.06090560165176679, 'kernel': 'linear', 'degree': 4, 'gamma': 'sc

Best hyperparameters for svm: {'C': 0.49503644907234334, 'kernel': 'sigmoid', 'degree': 4, 'gamma': 'scale'}
Best F1 score for svm: 0.7821658213561498



  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-21 23:14:57,168] Trial 0 finished with value: 0.7840745074957588 and parameters: {'C': 0.9560707529752526, 'kernel': 'linear', 'degree': 4, 'gamma': 'auto'}. Best is trial 0 with value: 0.7840745074957588.

  2%|▏         | 1/50 [08:21<6:49:38, 501.60s/it][A[I 2025-01-21 23:23:08,982] Trial 1 finished with value: 0.7827793857978741 and parameters: {'C': 0.866907409787206, 'kernel': 'linear', 'degree': 4, 'gamma': 'auto'}. Best is trial 0 with value: 0.7840745074957588.

  4%|▍         | 2/50 [16:33<6:36:40, 495.84s/it][A[I 2025-01-21 23:38:57,640] Trial 2 finished with value: 0.43502821729044766 and parameters: {'C': 0.7878594571168033, 'kernel': 'rbf', 'degree': 3, 'gamma': 'auto'}. Best is trial 0 with value: 0.7840745074957588.

  6%|▌         | 3/50 [32:22<8:27:05, 647.36s/it][A

Best hyperparameters for svm: {'C': 0.9560707529752526, 'kernel': 'linear', 'degree': 4, 'gamma': 'auto'}
Best F1 score for svm: 0.7840745074957588





In [16]:
print("SVM results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Avg': value[1],
        'STD': value[2],
        'Scores': value[3]
    }
    for key, value in svm_results.items()
])
display(df)

SVM results:


Using BERT embeddings without augmentation scores: [[0.6788990825688074, 0.6788990825688074, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6814701378254211, 0.6799387442572741, 0.6799387442572741], [0.6788990825688074, 0.6788990825688074, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6814701378254211, 0.6799387442572741, 0.6799387442572741], [0.845565749235474, 0.8532110091743119, 0.8394495412844036, 0.8532110091743119, 0.8425076452599388, 0.8379204892966361, 0.8241590214067278, 0.8453292496171516, 0.8529862174578867, 0.8437978560490046], [0.8470948012232415, 0.8532110091743119, 0.8394495412844036, 0.8501529051987767, 0.8394495412844036, 0.8379204892966361, 0.8241590214067278, 0.8453292496171516, 0.8545176110260337, 0.8453292496171516], [0.6788990825688074, 0.6788990825688074, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.6804281345565749, 0.680

## XGBoost

In [7]:
xgb_results = {}
xgb_results['bert_without_augmentation'] = optimize_model('xgboost', X_bert_no_augmentation, y_bert_no_augmentation)

[I 2025-01-24 14:31:49,384] A new study created in memory with name: no-name-05eba04d-bfdf-4720-b5d5-8bc0b2a8ccdd







Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample" } are not used.

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample" } are not used.

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample" } are not used.

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample" } are not used.

Parameters: { "colsample_bytree", "gamma", "max_depth", "min_child_weight", "subsample" } are not used.

[I 2025-01-24 14:32:53,336] Trial 0 finished with value: 0.7465663697593963 and parameters: {'n_estimators': 120, 'learning_rate': 0.17939026509382186, 'booster': 'gblinear', 'max_depth': 4, 'min_child_weight': 10, 'colsample_bytree': 0.9373755220160798, 'subsample': 0.6889316913998524, 'reg_alpha': 0.158167892934842, 'reg_lambda': 0.2983076654164862, 

In [None]:
xgb_results['bert_with_undersampling'] = optimize_model('xgboost', X_bert_undersampled, y_bert_undersampled)

In [None]:
xgb_results['bert_with_augmentation'] = optimize_model('xgboost', X_bert_with_augmentation, y_bert_with_augmentation)

In [None]:
xgb_results['tfidf_without_augmentation'] = optimize_model('xgboost', X_tfidf_no_augmentation, y_tfidf_no_augmentation)

In [None]:
xgb_results['tfidf_with_undersampling'] = optimize_model('xgboost', X_tfidf_undersampled, y_tfidf_undersampled)

In [18]:
xgb_results['tfidf_with_augmentation'] = optimize_model('xgboost', X_tfidf_with_augmentation, y_tfidf_with_augmentation)

[I 2025-01-22 00:02:35,748] A new study created in memory with name: no-name-52b9a333-5004-47ab-aa9b-bf23208103ac

  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-22 00:05:19,206] Trial 0 finished with value: 0.7781306650971159 and parameters: {'booster': 'gbtree', 'max_depth': 3, 'min_child_weight': 2, 'eta': 1.530201987912871e-07, 'gamma': 0.6105421880756111, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.7781306650971159.

  2%|▏         | 1/50 [02:43<2:13:29, 163.46s/it][A[I 2025-01-22 00:06:35,431] Trial 1 finished with value: 0.8234460409237269 and parameters: {'booster': 'gblinear'}. Best is trial 1 with value: 0.8234460409237269.

  4%|▍         | 2/50 [03:59<1:29:42, 112.14s/it][A[I 2025-01-22 00:31:18,263] Trial 2 finished with value: 0.43502821729044766 and parameters: {'booster': 'dart', 'max_depth': 7, 'min_child_weight': 10, 'eta': 1.777911040105423e-08, 'gamma': 5.325087413727096e-08, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type

Best hyperparameters for xgboost: {'booster': 'gblinear'}
Best F1 score for xgboost: 0.8234460409237269



  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-22 00:32:24,144] Trial 0 finished with value: 0.7674845806932015 and parameters: {'booster': 'gbtree', 'max_depth': 9, 'min_child_weight': 9, 'eta': 0.0002802047486933304, 'gamma': 6.513704429429651e-05, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.7674845806932015.

  2%|▏         | 1/50 [01:05<53:47, 65.87s/it][A[I 2025-01-22 00:33:19,099] Trial 1 finished with value: 0.7601388088848926 and parameters: {'booster': 'gbtree', 'max_depth': 7, 'min_child_weight': 2, 'eta': 0.0009668725597862803, 'gamma': 0.016763959241005454, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.7674845806932015.

  4%|▍         | 2/50 [02:00<47:33, 59.45s/it][A[I 2025-01-22 00:40:43,693] Trial 2 finished with value: 0.766410731931195 and parameters: {'booster': 'dart', 'max_depth': 9, 'min_child_weight': 7, 'eta': 4.939113527583739e-05, 'gamma': 0.00583518111631256, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_

Best hyperparameters for xgboost: {'booster': 'gbtree', 'max_depth': 9, 'min_child_weight': 9, 'eta': 0.0002802047486933304, 'gamma': 6.513704429429651e-05, 'grow_policy': 'lossguide'}
Best F1 score for xgboost: 0.7674845806932015



  0%|          | 0/50 [00:00<?, ?it/s][A[I 2025-01-22 00:53:43,223] Trial 0 finished with value: 0.6993282207526919 and parameters: {'booster': 'gbtree', 'max_depth': 9, 'min_child_weight': 5, 'eta': 0.0005093095769966941, 'gamma': 0.2755313681357525, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.6993282207526919.

  2%|▏         | 1/50 [02:13<1:49:06, 133.60s/it][A[I 2025-01-22 00:54:57,485] Trial 1 finished with value: 0.7797195582176367 and parameters: {'booster': 'gblinear'}. Best is trial 1 with value: 0.7797195582176367.

  4%|▍         | 2/50 [03:27<1:18:57, 98.70s/it] [A[I 2025-01-22 01:08:17,167] Trial 2 finished with value: 0.6038717930962849 and parameters: {'booster': 'dart', 'max_depth': 3, 'min_child_weight': 6, 'eta': 0.002200044207834882, 'gamma': 3.130789620824195e-05, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.18103649974028194, 'skip_drop': 8.493998683190917e-07}. Best is trial 1 with value: 0.77971

Best hyperparameters for xgboost: {'booster': 'gblinear'}
Best F1 score for xgboost: 0.7797784163694907





In [19]:
print("XGBoost results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Avg': value[1],
        'STD': value[2],
        'Scores': value[3]
    }
    for key, value in xgb_results.items()
])
display(df)

XGBoost results:


Using BERT embeddings without augmentation scores: [[0.8195718654434251, 0.8211009174311926, 0.8058103975535168, 0.8333333333333334, 0.8042813455657493, 0.8149847094801224, 0.8363914373088684, 0.8147013782542113, 0.8238897396630934, 0.8407350689127105], [0.8440366972477065, 0.8409785932721713, 0.8348623853211009, 0.8608562691131498, 0.8348623853211009, 0.8394495412844036, 0.8302752293577982, 0.8376722817764165, 0.8499234303215927, 0.8407350689127105]]
Using BERT embeddings without augmentation best score: 0.8413651881928151
Using BERT embeddings without augmentation best parameters: {'booster': 'dart', 'max_depth': 7, 'min_child_weight': 7, 'eta': 0.021095344741750326, 'gamma': 3.9120857204649044e-08, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 0.040842423024035804, 'skip_drop': 0.0015853552518449807}


Using BERT embeddings with augmentation scores: [[0.7776470588235294, 0.7823529411764706, 0.7692760447321955, 0.77

## DNN

In [19]:
dnn_results = {}
dnn_results['bert_without_augmentation'] = optimize_model('dnn', X_bert_no_augmentation, y_bert_no_augmentation)

[I 2025-01-24 13:38:22,075] A new study created in memory with name: no-name-42c5ee1e-9e8e-4cc4-a047-a98d7be67b83




[A[A[A[A[I 2025-01-24 13:38:45,103] Trial 0 finished with value: 0.3319768884173942 and parameters: {'num_epochs': 8, 'learning_rate': 1.0038165104772525e-05, 'weight_decay': 1.6986560279504574e-05, 'batch_norm': False, 'drop_out': 0.3748758721881615, 'layers': [768, 64, 64, 3]}. Best is trial 0 with value: 0.3319768884173942.




[I 2025-01-24 13:39:23,903] Trial 1 finished with value: 0.3324707183704495 and parameters: {'num_epochs': 10, 'learning_rate': 0.0009853333199590212, 'weight_decay': 0.00017060459135858619, 'batch_norm': True, 'drop_out': 0.00882633257625165, 'layers': [768, 128, 3]}. Best is trial 1 with value: 0.3324707183704495.




[I 2025-01-24 13:40:26,929] Trial 2 finished with value: 0.3374707073914421 and parameters: {'num_epochs': 7, 'learning_rate': 0.0001618661209833094, 'weight_decay': 2.7749784607589302e-05, 'batch_norm': False, 'drop_out': 

KeyboardInterrupt: 

In [None]:
dnn_results['bert_with_undersampling'] = optimize_model('dnn', X_bert_undersampled, y_bert_undersampled)

In [None]:
dnn_results['bert_with_augmentation'] = optimize_model('dnn', X_bert_with_augmentation, y_bert_with_augmentation)

In [None]:
dnn_results['tfidf_without_augmentation'] = optimize_model('dnn', X_tfidf_no_augmentation, y_tfidf_no_augmentation)

In [None]:
dnn_results['tfidf_with_undersampling'] = optimize_model('dnn', X_tfidf_undersampled, y_tfidf_undersampled)

In [20]:
dnn_results['tfidf_with_augmentation'] = optimize_model('dnn', X_tfidf_with_augmentation, y_tfidf_with_augmentation)

[I 2025-01-22 01:11:58,946] A new study created in memory with name: no-name-4b0ac8b7-ed74-4a89-a46b-e182a7c894ae

  0%|          | 0/50 [00:00<?, ?it/s][A[W 2025-01-22 01:11:58,952] Trial 0 failed with parameters: {'batch_norm': True} because of the following error: KeyError('layers').
Traceback (most recent call last):
  File "C:\Users\amita\PycharmProjects\Israel-Palestine-Political-Affiliation-Text-Classification\venv\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\amita\AppData\Local\Temp\ipykernel_35216\3444861739.py", line 127, in <lambda>
    study.optimize(lambda trial: objective(trial, model_name, X, y, folds_scores), n_trials=n_trials, timeout=timout, callbacks=[progress_bar])
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\amita\AppData\Local\Temp\ipykernel_35216\3444861739.py", line 107, in objective
    model = Clas

KeyError: 'layers'

In [None]:
print("DNN results:\n\n")
df = pd.DataFrame([
    {
        'Experiment': key,
        'Best Parameters': value[0],
        'Best Avg': value[1],
        'STD': value[2],
        'Scores': value[3]
    }
    for key, value in dnn_results.items()
])
display(df)