# Imports

In [None]:
import os
# Show the current working directory, move to the project folder, then show
# the new one. NOTE(review): the path is absolute and specific to one machine;
# os.chdir raises if the folder does not exist.
_project_dir = r'C:\Users\shaha\Projects\Python Projects\Israel-Palestine-Political-Affiliation-Text-Classification'
print(os.getcwd())
os.chdir(_project_dir)
print(os.getcwd())

In [None]:

from sklearn.metrics import balanced_accuracy_score, f1_score
import pandas as pd

from classifiers import *
from dataset import EmbeddingDataset
from embedder import Embedder
from Config.dataset_config import *

from torch.utils.data import Dataset, DataLoader

import nltk
# Download the three NLTK resources, in the same order as before.
for _nltk_resource in ('punkt_tab', 'averaged_perceptron_tagger_eng', 'wordnet'):
    nltk.download(_nltk_resource)

from tqdm import tqdm

In [3]:
from sklearn.metrics import f1_score, classification_report

def calculate_f1_score(predictions, test_data_package, valid_labels=(0, 1, 2)):
    '''
    Compute the weighted F1 score and a classification report for the test labels.

    Samples whose true label is not in ``valid_labels`` are dropped, together
    with the prediction at the same position, before scoring. When every true
    label is valid the inputs are passed to scikit-learn untouched.

    Args:
        predictions: Predicted labels, one per test sample, in the same order as y_test.
        test_data_package (tuple): A tuple containing (DataLoader, (X_test, y_test)).
        valid_labels (sequence): Labels to keep when scoring. Defaults to (0, 1, 2).

    Return:
        tuple: (weighted F1 score (float), classification_report output)
    '''
    _, (_, true_labels) = test_data_package

    # Membership is checked against a list (not a set) so that labels that are
    # only comparable with ==, and not usefully hashable, still match.
    allowed = list(valid_labels)
    keep = [label in allowed for label in true_labels]

    if not all(keep):
        # Apply the same mask to both sequences so they stay aligned.
        true_labels = [label for label, ok in zip(true_labels, keep) if ok]
        predictions = [pred for pred, ok in zip(predictions, keep) if ok]

    # 'weighted' averages the per-class F1 by class support (multi-class F1).
    # zero_division=0 matches the report below and yields 0 for classes that
    # were never predicted.
    f1 = f1_score(true_labels, predictions, average='weighted', zero_division=0)

    return f1, classification_report(true_labels, predictions, zero_division=0)

# Datasets

In [None]:
# Keyword arguments that are identical for all four training datasets.
_shared_dataset_args = dict(
    data_path=DATA_PATH,
    subset=SUBSET,
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=AUGMENTED_CLASSES,
    augmentation_methods=AUGMENTATION_METHODS,
    adversation_ratio=ADVERSATION_RATIO,
)

# The four variants differ only in the embedding method and in whether the
# data is undersampled (augmentation_ratio=0) or augmented (ratio 3, no
# undersampling targets). Each dataset gets its own Embedder instance.

# DistilBERT embeddings, undersampled.
bert_embedding_undersampled_data = EmbeddingDataset(
    **_shared_dataset_args,
    augmentation_ratio=0,
    undersampling_targets=UNDERSAMPLING_TARGETS,
    embedder=Embedder(),
    embedding_method='distilbert',
)

# DistilBERT embeddings, augmented.
bert_embedding_with_augmentation_data = EmbeddingDataset(
    **_shared_dataset_args,
    augmentation_ratio=3,
    undersampling_targets={},
    embedder=Embedder(),
    embedding_method='distilbert',
)

# TF-IDF vectors, undersampled.
tfidf_embedding_undersampled_data = EmbeddingDataset(
    **_shared_dataset_args,
    augmentation_ratio=0,
    undersampling_targets=UNDERSAMPLING_TARGETS,
    embedder=Embedder(),
    embedding_method='tf-idf',
)

# TF-IDF vectors, augmented.
tfidf_embedding_with_augmentation_data = EmbeddingDataset(
    **_shared_dataset_args,
    augmentation_ratio=3,
    undersampling_targets={},
    embedder=Embedder(),
    embedding_method='tf-idf',
)

# Evaluations

## BERT Embedding without Augmentation

In [None]:
# Loader settings shared by the train and test packages.
_loader_args = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# Training package built from the undersampled DistilBERT dataset.
train_data_package = get_dataloader(bert_embedding_undersampled_data, **_loader_args)

# TEST subset with every resampling option switched off (no augmentation,
# no adversarial ratio, no undersampling).
test_dataset = EmbeddingDataset(
    data_path=DATA_PATH,
    subset='TEST',
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=[],
    augmentation_ratio=0,
    augmentation_methods=[],
    adversation_ratio=0,
    undersampling_targets={},
    embedder=Embedder(),
    embedding_method='distilbert',
)

# Unpacked elsewhere in this file as (DataLoader, (X_test, y_test)).
test_data_package = get_dataloader(test_dataset, **_loader_args)

In [9]:
# Classifier types to evaluate. Each name is used as the MODEL_CONFIG key and
# passed to Classifier as model_type in the evaluation loops below.
models = ['logistic_regression', 'xgboost']


In [None]:
# Fit and evaluate every classifier in `models` using the currently defined
# train_data_package / test_data_package (the preceding cell builds them from
# the undersampled DistilBERT dataset).
# models_evals_bert: model name -> {'accuracy', 'f1_score'}
# models_evals_bert_preds: model name -> predictions on the test package
models_evals_bert = {}
models_evals_bert_preds = {}
for MODEL_TYPE in models:
    print(f'[Testing Status]: Fitting a {MODEL_TYPE} classifier...')
    model_config = MODEL_CONFIG.get(MODEL_TYPE)
    
    # Initialize and train the model
    classifier = Classifier(model_config, 
                            model_type=MODEL_TYPE,
                            log=False)
    classifier.fit(train_data_package)
    
    # Predict on the test package and keep the predictions for the
    # significance tests at the end of the notebook
    print(f'[Testing Status]: Testing on test subset...')
    predictions = classifier.predict(test_data_package)
    models_evals_bert_preds[MODEL_TYPE] = predictions
    # calculate_accuracy is not defined in this file (presumably it comes from
    # one of the star imports). Its report is overwritten by the one returned
    # from calculate_f1_score, which is the report printed below.
    accuracy, report = calculate_accuracy(predictions, test_data_package)
    f1, report = calculate_f1_score(predictions, test_data_package)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 score: {f1 * 100:.2f}%")
    print("Classification Report:")
    models_evals_bert[MODEL_TYPE] = {'accuracy': accuracy, 'f1_score': f1}
    print(report)

## BERT Embedding with Augmentation

In [None]:
# Rebind train_data_package to the augmented DistilBERT dataset.
# test_data_package is not rebuilt in this cell, so the evaluation loop that
# follows uses whichever test package is currently defined.
train_data_package = get_dataloader(bert_embedding_with_augmentation_data,  
                            batch_size=BATCH_SIZE,
                            shuffle=False, 
                            num_workers=2)

In [None]:
# Fit and evaluate every classifier in `models` using the currently defined
# train_data_package / test_data_package (the preceding cell rebinds the
# training package to the augmented DistilBERT dataset).
# models_evals_bert_aug: model name -> {'accuracy', 'f1_score'}
# models_evals_bert_aug_preds: model name -> predictions on the test package
models_evals_bert_aug = {}
models_evals_bert_aug_preds = {}
for MODEL_TYPE in models:
    print(f'[Testing Status]: Fitting a {MODEL_TYPE} classifier...')
    model_config = MODEL_CONFIG.get(MODEL_TYPE)
    
    # Initialize and train the model
    classifier = Classifier(model_config, 
                            model_type=MODEL_TYPE,
                            log=False)
    classifier.fit(train_data_package)
    
    # Predict on the test package and keep the predictions
    print(f'[Testing Status]: Testing on test subset...')
    predictions = classifier.predict(test_data_package)
    models_evals_bert_aug_preds[MODEL_TYPE] = predictions
    # calculate_accuracy is not defined in this file (presumably it comes from
    # one of the star imports). Its report is overwritten by the one returned
    # from calculate_f1_score, which is the report printed below.
    accuracy, report = calculate_accuracy(predictions, test_data_package)
    f1, report = calculate_f1_score(predictions, test_data_package)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 score: {f1 * 100:.2f}%")
    print("Classification Report:")
    models_evals_bert_aug[MODEL_TYPE] = {'accuracy': accuracy, 'f1_score': f1}
    print(report)

## TF-IDF Vectors without Augmentation

In [None]:
# Loader settings shared by the train and test packages.
_loader_args = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# Training package built from the undersampled TF-IDF dataset.
train_data_package = get_dataloader(tfidf_embedding_undersampled_data, **_loader_args)

# TEST subset with every resampling option switched off, embedded with TF-IDF.
# NOTE(review): the test split gets its own Embedder(); confirm that the
# TF-IDF vocabulary is shared with the training data inside Embedder.
test_dataset = EmbeddingDataset(
    data_path=DATA_PATH,
    subset='TEST',
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=[],
    augmentation_ratio=0,
    augmentation_methods=[],
    adversation_ratio=0,
    undersampling_targets={},
    embedder=Embedder(),
    embedding_method='tf-idf',
)

# Unpacked elsewhere in this file as (DataLoader, (X_test, y_test)).
test_data_package = get_dataloader(test_dataset, **_loader_args)

In [None]:
# Fit and evaluate every classifier in `models` on the undersampled TF-IDF
# dataset.
# models_evals_tfidf: model name -> {'accuracy', 'f1_score'}
# models_evals_tfidf_preds: model name -> predictions on the test package
models_evals_tfidf = {}
models_evals_tfidf_preds = {}
train_data_package = get_dataloader(tfidf_embedding_undersampled_data,
                            batch_size=BATCH_SIZE,
                            shuffle=False,
                            num_workers=2)
for MODEL_TYPE in models:
    print(f'[Testing Status]: Fitting a {MODEL_TYPE} classifier...')
    model_config = MODEL_CONFIG.get(MODEL_TYPE)

    # Initialize and train the model
    classifier = Classifier(model_config,
                            model_type=MODEL_TYPE,
                            log=False)
    classifier.fit(train_data_package)

    # Test the model
    print(f'[Testing Status]: Testing on test subset...')
    predictions = classifier.predict(test_data_package)
    # Store the predictions only after they have been computed for THIS model.
    # Storing them first would record the previous model's (or the previous
    # cell's) predictions under this model's name.
    models_evals_tfidf_preds[MODEL_TYPE] = predictions

    # Same accuracy helper as the other evaluation cells. The report printed
    # below is the one returned by calculate_f1_score.
    accuracy, report = calculate_accuracy(predictions, test_data_package)
    f1, report = calculate_f1_score(predictions, test_data_package)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 score: {f1 * 100:.2f}%")
    print("Classification Report:")
    models_evals_tfidf[MODEL_TYPE] = {'accuracy': accuracy, 'f1_score': f1}
    print(report)

## TF-IDF with Augmentation

In [None]:
# Loader settings shared by the train and test packages.
_loader_args = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# Training package built from the augmented TF-IDF dataset.
train_data_package = get_dataloader(tfidf_embedding_with_augmentation_data, **_loader_args)

# TEST subset with every resampling option switched off, embedded with TF-IDF.
# NOTE(review): the test split gets its own Embedder(); confirm that the
# TF-IDF vocabulary is shared with the training data inside Embedder.
test_dataset = EmbeddingDataset(
    data_path=DATA_PATH,
    subset='TEST',
    id_column_idx=ID_COLUMN_IDX,
    comment_column_idx=COMMENT_COLUMN_IDX,
    label_column_idx=LABEL_COLUMN_IDX,
    subset_column_idx=SUBSET_COLUMN_IDX,
    augmented_classes=[],
    augmentation_ratio=0,
    augmentation_methods=[],
    adversation_ratio=0,
    undersampling_targets={},
    embedder=Embedder(),
    embedding_method='tf-idf',
)

# Unpacked elsewhere in this file as (DataLoader, (X_test, y_test)).
test_data_package = get_dataloader(test_dataset, **_loader_args)

In [None]:
# Fit and evaluate every classifier in `models` on the augmented TF-IDF
# dataset.
# models_evals_tfidf_aug: model name -> {'accuracy', 'f1_score'}
# models_evals_tfidf_aug_preds: model name -> predictions on the test package
models_evals_tfidf_aug = {}
models_evals_tfidf_aug_preds = {}
# Train on the AUGMENTED TF-IDF data. Building the loader from the
# undersampled dataset here would make this run a repeat of the
# non-augmented TF-IDF evaluation.
train_data_package = get_dataloader(tfidf_embedding_with_augmentation_data,
                            batch_size=BATCH_SIZE,
                            shuffle=False,
                            num_workers=2)
for MODEL_TYPE in models:
    print(f'[Testing Status]: Fitting a {MODEL_TYPE} classifier...')
    model_config = MODEL_CONFIG.get(MODEL_TYPE)

    # Initialize and train the model
    classifier = Classifier(model_config,
                            model_type=MODEL_TYPE,
                            log=False)
    classifier.fit(train_data_package)

    # Test the model and keep the predictions, as the other evaluation cells do
    print(f'[Testing Status]: Testing on test subset...')
    predictions = classifier.predict(test_data_package)
    models_evals_tfidf_aug_preds[MODEL_TYPE] = predictions

    # The report printed below is the one returned by calculate_f1_score.
    accuracy, report = calculate_accuracy(predictions, test_data_package)
    f1, report = calculate_f1_score(predictions, test_data_package)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 score: {f1 * 100:.2f}%")
    print("Classification Report:")
    models_evals_tfidf_aug[MODEL_TYPE] = {'accuracy': accuracy, 'f1_score': f1}
    print(report)

## Plots

In [None]:
import matplotlib.pyplot as plt

def plot_models(models, metric):
    """
    Draw a bar chart comparing a single metric across models.

    Parameters:
    - models: Dict mapping model name -> dict of metric values,
      e.g. {'xgboost': {'accuracy': 0.8, 'f1_score': 0.7}}.
    - metric: Key to read from each model's dict; also used for the legend,
      the y-axis label and the title.

    Raises:
    - KeyError: If a model's dict has no entry for `metric`.
    """
    model_names = list(models)
    metric_values = [models[name][metric] for name in model_names]

    # Plain integer positions, one per model. This avoids depending on numpy,
    # which this file only imports further down.
    positions = list(range(len(model_names)))
    bar_width = 0.35

    fig, ax = plt.subplots(figsize=(10, 6))

    # One series only, so each bar is centred on its tick; a half-width
    # offset is only needed when two series share a tick.
    ax.bar(positions, metric_values, bar_width, label=metric, color='b')

    # Labels, title, and ticks
    ax.set_xlabel('Models')
    ax.set_ylabel(f'{metric} Scores')
    ax.set_title(f'Models Comparison: {metric}')
    ax.set_xticks(positions)
    ax.set_xticklabels(model_names)
    ax.legend()

    # Display the plot
    plt.tight_layout()
    plt.show()

In [None]:
# Accuracy per model for the run stored in models_evals_bert.
plot_models(models_evals_bert, 'accuracy')

In [None]:
# Weighted F1 per model for the run stored in models_evals_bert.
plot_models(models_evals_bert, 'f1_score')

In [None]:
# Accuracy per model for the run stored in models_evals_tfidf.
plot_models(models_evals_tfidf, 'accuracy')

In [None]:
# Weighted F1 per model for the run stored in models_evals_tfidf.
plot_models(models_evals_tfidf, 'f1_score')


## Ablation Tests

In [31]:
def prep_ablation_test(test_data_package, predictions, valid_labels=(0, 1, 2)):
    """
    Drops test samples whose true label is not a valid class.

    Parameters:
    - test_data_package: Tuple of (DataLoader, (X_test, y_test)); only y_test is read.
    - predictions: Predictions aligned with y_test. May be empty (or shorter
      than y_test), in which case only the available positions are filtered.
    - valid_labels: Labels to keep (default is (0, 1, 2)).

    Returns:
    - true_labels: List of the true labels that are in valid_labels.
    - predictions: List of the predictions at the kept positions.
    - valid_mask: List of booleans, one per original true label, True where kept.
    """
    _, (_, true_labels) = test_data_package

    # Membership is checked against a list (not a set) so that labels that are
    # only comparable with ==, and not usefully hashable, still match.
    allowed = list(valid_labels)
    valid_mask = [label in allowed for label in true_labels]

    kept_labels = [label for label, keep in zip(true_labels, valid_mask) if keep]
    kept_predictions = [pred for pred, keep in zip(predictions, valid_mask) if keep]
    return kept_labels, kept_predictions, valid_mask


In [24]:
def f1_score_instance(prediction, true_label):
    """
    Binary F1 score of one prediction, treating label 1 as the positive class.

    Only a true positive (prediction and true label both 1) gives a non-zero
    score; every other combination returns 0.
    """
    # Confusion counts for a single sample: at most one of them is 1, and all
    # are 0 for a true negative or for labels other than 0/1.
    tp = int(prediction == true_label == 1)
    fp = int(prediction == 1 and true_label == 0)
    fn = int(prediction == 0 and true_label == 1)

    predicted_positive = tp + fp
    actual_positive = tp + fn

    # Guard both ratios against an empty denominator.
    precision = tp / predicted_positive if predicted_positive > 0 else 0
    recall = tp / actual_positive if actual_positive > 0 else 0

    # Harmonic mean of precision and recall, or 0 when both are 0.
    total = precision + recall
    return 2 * (precision * recall) / total if total > 0 else 0

In [35]:
import numpy as np
import scipy.stats as stats

def perform_paired_ttest(predictions_1, predictions_2, ground_truth, alpha=0.05):
    """
    Compares two models with a paired t-test on their per-sample correctness.

    Each prediction is scored 1 if it equals the true label and 0 otherwise;
    the two 0/1 sequences are passed to scipy.stats.ttest_rel. The statistic,
    p-value and verdict are printed and returned.

    Parameters:
    - predictions_1: List of predictions from model 1.
    - predictions_2: List of predictions from model 2.
    - ground_truth: List of true labels, aligned with both prediction lists.
    - alpha: Significance level (default is 0.05).

    Returns:
    - t_stat: The t-statistic value.
    - p_value: The p-value from the t-test.
    - result: 'Significant' if p-value < alpha, otherwise 'Not Significant'
      (this includes a p-value that is NaN).
    """
    def _correctness(predictions):
        # 1 where the prediction matches the true label, else 0.
        return [1 if guess == truth else 0 for guess, truth in zip(predictions, ground_truth)]

    scores_1 = _correctness(predictions_1)
    scores_2 = _correctness(predictions_2)

    # Paired test: both score lists refer to the same samples.
    t_stat, p_value = stats.ttest_rel(scores_1, scores_2)

    if p_value < alpha:
        result = 'Significant'
    else:
        result = 'Not Significant'

    print(f"T-Statistic: {t_stat}")
    print(f"P-Value: {p_value}")
    print(f"Result: {result}")

    return t_stat, p_value, result

In [None]:
# Filter the true labels AND every prediction list with the same valid-label
# mask. Filtering only the labels would leave the predictions longer than the
# labels and shift them out of alignment as soon as one invalid label is dropped.
true_labels, bert_lr_preds, _ = prep_ablation_test(
    test_data_package, models_evals_bert_preds['logistic_regression'])
_, bert_xgb_preds, _ = prep_ablation_test(
    test_data_package, models_evals_bert_preds['xgboost'])
_, tfidf_xgb_preds, _ = prep_ablation_test(
    test_data_package, models_evals_tfidf_preds['xgboost'])

# Logistic regression vs. XGBoost, both on the DistilBERT embeddings.
print('Significance Test for Model Selection:')
perform_paired_ttest(bert_lr_preds, bert_xgb_preds, true_labels)

# TF-IDF vs. DistilBERT features, both with XGBoost.
print('Significance Test for Vector Selection:')
perform_paired_ttest(tfidf_xgb_preds, bert_xgb_preds, true_labels)