In [1]:
import pandas as pd
import numpy as np
from glob import glob
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [2]:
def bootstrap_performance(y_true, y_pred, sample_size=500, n_samples=100):
    assert len(y_true) == len(y_pred), "Both lists must be of same length"
    n = len(y_true)
    accuracies = []
    f1s = []

    for seed_val in range(n_samples):
        # Sample indices with replacement
        np.random.seed(seed_val)
        indices = np.random.choice(len(y_true), size=sample_size, replace=False)
        sample_true = [y_true[i] for i in indices]
        sample_pred = [y_pred[i] for i in indices]
        acc = accuracy_score(sample_true, sample_pred)
        accuracies.append(acc)
        f1 = f1_score(sample_true, sample_pred, average='weighted')
        f1s.append(f1)

    return np.mean(accuracies), np.std(accuracies), np.mean(f1s), np.std(f1s)

def analyze_results(tru, pred):
    accuracy = accuracy_score(tru, pred)
    f1 = f1_score(tru, pred, average='weighted')
    report = classification_report(tru, pred, digits=6)
    
    print(f"Overall Accuracy: {accuracy:.6f}")
    print(f"Overall w-F1 Score: {f1:.6f}")
    print("Overall Classification Report:")
    print(report)

    # Mean and standard deviation of the predictions performance scores
    mean_acc, std_acc, mean_f1, std_f1 = bootstrap_performance(tru, pred)
    print(f"Bootstrapped Accuracy: {mean_acc:.6f} ± {std_acc:.6f}")
    print(f"Bootstrapped w-F1 Score: {mean_f1:.6f} ± {std_f1:.6f}")


In [3]:
files = glob("Outputs/*.csv")
files.sort()

for f in files:
    print(f"Analyzing file: {f}")
    df = pd.read_csv(f)
    tru = df['true_label'].tolist()
    pred = df['predicted_label'].tolist()

    # Analyze the results
    analyze_results(tru, pred)
    print("\n" + "="*50 + "\n")

Analyzing file: Outputs/albert-base-v2_isExt_0.csv
Overall Accuracy: 0.585667
Overall w-F1 Score: 0.550835
Overall Classification Report:
              precision    recall  f1-score   support

           0   0.610047  0.814205  0.697493      1760
           1   0.497696  0.261290  0.342676      1240

    accuracy                       0.585667      3000
   macro avg   0.553871  0.537747  0.520085      3000
weighted avg   0.563608  0.585667  0.550835      3000

Bootstrapped Accuracy: 0.584840 ± 0.018692
Bootstrapped w-F1 Score: 0.549638 ± 0.021041


Analyzing file: Outputs/albert-base-v2_isExt_1.csv
Overall Accuracy: 0.583667
Overall w-F1 Score: 0.585280
Overall Classification Report:
              precision    recall  f1-score   support

           0   0.651812  0.623295  0.637235      1760
           1   0.496583  0.527419  0.511537      1240

    accuracy                       0.583667      3000
   macro avg   0.574198  0.575357  0.574386      3000
weighted avg   0.587651  0.583667  

Bootstrapped Accuracy: 0.583880 ± 0.019041
Bootstrapped w-F1 Score: 0.585389 ± 0.019137


Analyzing file: Outputs/beit-base-patch16-224-pt22k-ft22k_isExt_0.csv
Overall Accuracy: 0.578333
Overall w-F1 Score: 0.533886
Overall Classification Report:
              precision    recall  f1-score   support

           0   0.601810  0.831250  0.698163      1760
           1   0.478032  0.219355  0.300719      1240

    accuracy                       0.578333      3000
   macro avg   0.539921  0.525302  0.499441      3000
weighted avg   0.550648  0.578333  0.533886      3000

Bootstrapped Accuracy: 0.578680 ± 0.020800
Bootstrapped w-F1 Score: 0.533802 ± 0.023876


Analyzing file: Outputs/beit-base-patch16-224-pt22k-ft22k_isExt_1.csv
Overall Accuracy: 0.586333
Overall w-F1 Score: 0.555622
Overall Classification Report:
              precision    recall  f1-score   support

           0   0.612484  0.802841  0.694861      1760
           1   0.499278  0.279032  0.357993      1240

    accuracy   