In [None]:
## -*- coding: utf-8 -*-
# library imports
import pandas as pd
from io import StringIO
import numpy as np
from scipy import stats

# --- Data Extraction from Table 13 ---
# This data is taken directly from Table 13 in the praxis document.
# N/A values from the table are excluded from the lists.

In [None]:
# Visualization of the data table.
# The rows below are transcribed from Table 13 in the praxis document.
# Metrics a study did not report are written as "NA"; pandas loads them as
# NaN here, and they are excluded from the per-metric lists used for testing.

csv_data = """
Study,Year,Feature Selection,ML Method,Accuracy,F1,Precision,Recall,FPR
Catillo et al.,2020,Manual,Deep Autoencoder,0.992,NA,0.950,0.989,0.00500000
Saeful Fitni and Ramli,2020,Manual,Ensemble (Voting),0.988,0.979,0.988,0.971,0.0259627
Karatas et al.,2020,Manual,Adaboost,0.9969,0.997,0.997,0.997,NA
Kim et al.,2020,Manual,CNN,0.9999,NA,0.818,0.823,NA
Peng et al.,2019,Manual,LSTM & Attention Mechanism,0.962,0.930,0.960,0.960,0.00670
Darko,2023,Automatic,BERT,0.9982,0.9922,0.9863,0.9983,0.00100000
Zhao et al.,2020,Automatic,Deep Autoencoder,0.979,0.980,0.980,0.980,0.02100000
Proposed 1,2025,Automatic,AutoGluon subset 1,0.999993,0.9999928,0.9999929,0.9999928,0.00000033
Proposed 2,2025,Automatic,AutoGluon subset 2,0.9999966,0.9999967,0.9999967,0.9999967,0.00000002
Proposed 3,2025,Automatic,Azure AutoML subset 1,0.999998,0.999998,0.999998,0.999998,0.00000028
Proposed 4,2025,Automatic,Azure AutoML subset 2,0.999997,0.999997,0.999997,0.999997,0.00000028
Proposed 5,2025,Automatic,AWS AutoPilot subset 1,0.9999998,0.9999998,0.9999998,0.9999998,0.00000046
Proposed 6,2025,Automatic,AWS AutoPilot subset 2,0.9999954,0.9999957,0.9999963,0.9999954,0.00000018
"""

# Wrap the literal in an in-memory text buffer so read_csv can treat it like a file.
table_buffer = StringIO(csv_data)
df = pd.read_csv(table_buffer)

# Bare expression as the last line: the notebook renders the whole table.
df

Unnamed: 0,Study,Year,Feature Selection,ML Method,Accuracy,F1,Precision,Recall,FPR
0,Catillo et al.,2020,Manual,Deep Autoencoder,0.992,,0.95,0.989,0.005
1,Saeful Fitni and Ramli,2020,Manual,Ensemble (Voting),0.988,0.979,0.988,0.971,0.0259627
2,Karatas et al.,2020,Manual,Adaboost,0.9969,0.997,0.997,0.997,
3,Kim et al.,2020,Manual,CNN,0.9999,,0.818,0.823,
4,Peng et al.,2019,Manual,LSTM & Attention Mechanism,0.962,0.93,0.96,0.96,0.0067
5,Darko,2023,Automatic,BERT,0.9982,0.9922,0.9863,0.9983,0.001
6,Zhao et al.,2020,Automatic,Deep Autoencoder,0.979,0.98,0.98,0.98,0.021
7,Proposed 1,2025,Automatic,AutoGluon subset 1,0.999993,0.999993,0.999993,0.999993,3.3e-07
8,Proposed 2,2025,Automatic,AutoGluon subset 2,0.999997,0.999997,0.999997,0.999997,2e-08
9,Proposed 3,2025,Automatic,Azure AutoML subset 1,0.999998,0.999998,0.999998,0.999998,2.8e-07


# Group 1: The 6 Proposed AutoML Models

In [None]:
# Map the table columns to per-metric lists for the six proposed AutoML runs.
# One row per model, in table order; columns are
# (accuracy, f1, precision, recall, fpr).
_proposed_rows = [
    (0.9999930, 0.9999928, 0.9999929, 0.9999928, 0.00000033),  # Proposed 1: AutoGluon subset 1
    (0.9999966, 0.9999967, 0.9999967, 0.9999967, 0.00000002),  # Proposed 2: AutoGluon subset 2
    (0.9999980, 0.9999980, 0.9999980, 0.9999980, 0.00000028),  # Proposed 3: Azure AutoML subset 1
    (0.9999970, 0.9999970, 0.9999970, 0.9999970, 0.00000028),  # Proposed 4: Azure AutoML subset 2
    (0.9999998, 0.9999998, 0.9999998, 0.9999998, 0.00000046),  # Proposed 5: AWS AutoPilot subset 1
    (0.9999954, 0.9999957, 0.9999963, 0.9999954, 0.00000018),  # Proposed 6: AWS AutoPilot subset 2
]

# Transpose the rows so each metric becomes its own list.
(
    proposed_accuracy,
    proposed_f1,
    proposed_precision,
    proposed_recall,
    proposed_fpr,
) = (list(metric_column) for metric_column in zip(*_proposed_rows))

# Group 2: The 7 Benchmark Studies from Literature

In [None]:
# Group 2: benchmark studies transcribed from the praxis document table.
# Each value is tagged with the study it comes from. Studies that did not
# report a metric ("NA" in the table) are simply absent from that list.
benchmark_accuracy = [
    0.9920,  # Catillo et al.
    0.9880,  # Saeful Fitni and Ramli
    0.9969,  # Karatas et al.
    0.9999,  # Kim et al.
    0.9620,  # Peng et al.
    0.9982,  # Darko
    0.9790,  # Zhao et al.
]
benchmark_f1 = [  # n=5: Catillo et al. and Kim et al. report no F1
    0.9790,  # Saeful Fitni and Ramli
    0.9970,  # Karatas et al.
    0.9300,  # Peng et al.
    0.9922,  # Darko
    0.9800,  # Zhao et al.
]
benchmark_precision = [
    0.9500,  # Catillo et al.
    0.9880,  # Saeful Fitni and Ramli
    0.9970,  # Karatas et al.
    0.8180,  # Kim et al.
    0.9600,  # Peng et al.
    0.9863,  # Darko
    0.9800,  # Zhao et al.
]
benchmark_recall = [
    0.9890,  # Catillo et al.
    0.9710,  # Saeful Fitni and Ramli
    0.9970,  # Karatas et al.
    0.8230,  # Kim et al.
    0.9600,  # Peng et al.
    0.9983,  # Darko
    0.9800,  # Zhao et al.
]
benchmark_fpr = [  # n=5: Karatas et al. and Kim et al. report no FPR
    0.0050,  # Catillo et al.
    0.0010,  # Darko
    0.0210,  # Zhao et al.
    0.00670,  # Peng et al.
    0.0259627,  # Saeful Fitni and Ramli
]

# --- Performing the T-Tests ---
```
> Based on the outcome of the normality test (p-value below 0.05), I used Welch's t-test (equal_var=False), as it is more reliable for groups with unequal sizes and variances.
> I used a one-tailed test (alternative='greater' or 'less') to directly test the directional hypotheses.
> I tested hypotheses H1 and H2 only.
> H3 is out of scope because I did not track the benchmark training and processing times.
```


# H1 Test: Is the proposed AutoML group's performance SIGNIFICANTLY GREATER than Benchmark group?

In [5]:
# H1 tests, one per "higher is better" metric.
#   H0: there is no significant difference between the two groups.
#   H1: the proposed group's metric is significantly greater than the
#       benchmark group's.
# (H2, the FPR comparison where lower is better, is tested in its own cell.)
#
# Group 1 is the six proposed AutoML models; group 2 is the benchmark
# studies taken from the literature table.
#
# equal_var=False selects Welch's t-test, which is more reliable for groups
# with unequal sizes and variances. alternative='greater' makes the test
# one-tailed in the direction stated by H1.
_h1_options = {"equal_var": False, "alternative": "greater"}

t_stat_acc, p_val_acc = stats.ttest_ind(
    proposed_accuracy, benchmark_accuracy, **_h1_options
)
t_stat_f1, p_val_f1 = stats.ttest_ind(
    proposed_f1, benchmark_f1, **_h1_options
)
t_stat_prec, p_val_prec = stats.ttest_ind(
    proposed_precision, benchmark_precision, **_h1_options
)
t_stat_recall, p_val_recall = stats.ttest_ind(
    proposed_recall, benchmark_recall, **_h1_options
)

# H2 Test: Is the proposed group's FPR SIGNIFICANTLY LESS?

In [6]:
# H2 test: is the proposed group's false positive rate significantly lower
# than the benchmark group's? Welch's t-test again (equal_var=False), made
# one-tailed towards smaller values with alternative='less'.
fpr_test = stats.ttest_ind(
    proposed_fpr,
    benchmark_fpr,
    equal_var=False,
    alternative='less',
)
t_stat_fpr, p_val_fpr = fpr_test

# --- Displaying the Results ---

In [7]:
# Report the H1 outcomes: one line per metric with its t-statistic and
# one-tailed p-value.
print("--- Statistical Validation of Hypotheses ---")
print("\n--- H1: Performance Improvement (Higher is Better) ---")

h1_outcomes = [
    ("Accuracy:", t_stat_acc, p_val_acc),
    ("F1-Score:", t_stat_f1, p_val_f1),
    ("Precision:", t_stat_prec, p_val_prec),
    ("Recall:", t_stat_recall, p_val_recall),
]
for label, t_stat, p_val in h1_outcomes:
    # Labels are left-justified to 16 characters so the numbers line up.
    print(f"{label:<16}t-statistic = {t_stat:.3f}, p-value = {p_val:.4f}")

--- Statistical Validation of Hypotheses ---

--- H1: Performance Improvement (Higher is Better) ---
Accuracy:       t-statistic = 2.346, p-value = 0.0287
F1-Score:       t-statistic = 2.043, p-value = 0.0553
Precision:      t-statistic = 1.946, p-value = 0.0498
Recall:         t-statistic = 1.721, p-value = 0.0680


In [8]:
# Report the H2 outcome (false positive rate, lower is better): a section
# header followed by the t-statistic and one-tailed p-value.
print(
    "\n--- H2: False Positive Rate Reduction (Lower is Better) ---",
    f"FPR:            t-statistic = {t_stat_fpr:.3f}, p-value = {p_val_fpr:.4f}",
    sep="\n",
)


--- H2: False Positive Rate Reduction (Lower is Better) ---
FPR:            t-statistic = -2.451, p-value = 0.0352
