In [1]:
import numpy as np
from scipy.stats import wilcoxon, ttest_rel

In [5]:
import numpy as np
from scipy.stats import wilcoxon

class ModelComparisonPipeline:
    def __init__(self, alpha=0.1):
        self.alpha = alpha

    def format_model_scores(self, classification_reports):
        f1s, precisions, recalls = [], [], [],
        for report  in classification_reports:
            f1s.extend([report['Label 0']['f1-score'], report['Label 1']['f1-score']])
            precisions.extend([report['Label 0']['precision'], report['Label 1']['precision']])
            recalls.extend([report['Label 0']['recall'], report['Label 1']['recall']])


        return {
            'f1_score': f1s,
            'precision': precisions,
            'recall': recalls,
        }

    def average_scores_across_models(self, model_scores_list):
        avg_scores = {}
        for key in model_scores_list[0]:
            all_values = np.array([model[key] for model in model_scores_list])
            avg_scores[key] = list(np.mean(all_values, axis=0))
        return avg_scores

    def compare_models_on_all_metrics(self, scores_model_a, scores_model_b, model_a_name, model_b_name):
        print(f"\nComparing '{model_a_name}' vs '{model_b_name}' using Wilcoxon Signed-Rank Test across metrics:\n")

        for metric in scores_model_a:
            try:
                print(f"{'='*150}")
                a_scores = np.array(scores_model_a[metric], dtype=float)
                b_scores = np.array(scores_model_b[metric], dtype=float)
                if len(a_scores) != len(b_scores):
                    print(f"Skipping metric '{metric}': Unequal lengths.")
                    continue
                stat, p_value = wilcoxon(a_scores, b_scores, alternative='greater', method='auto')
                print(f"Metric: {metric}")
                print(f" - {model_a_name} scores: {a_scores}")
                print(f" - {model_b_name} scores: {b_scores}")
                print(f" - Test Statistic: {stat:.4f}")
                print(f" - p-value: {p_value:.4f}")
                if p_value < self.alpha:
                    print(f"Result: '{model_a_name}' performs significantly better than '{model_b_name}' (p < {self.alpha}) → Reject H₀")
                else:
                    print(f"Result: No significant difference between '{model_a_name}' and '{model_b_name}' (p ≥ {self.alpha}) → Fail to reject H₀")
            except Exception as e:
                print(f"Error with metric '{metric}': {e}\n")

    def compare_classical_vs_quantum(self, classical_models_results, quantum_models_results, model_a_name, model_b_name):
        classical_formatted, quantum_formatted = [], []

        for reports in classical_models_results:
            classical_formatted.append(self.format_model_scores(reports))
        for reports in quantum_models_results:
            quantum_formatted.append(self.format_model_scores(reports))

        avg_classical = self.average_scores_across_models(classical_formatted)
        avg_quantum = self.average_scores_across_models(quantum_formatted)


        # Compare metrics statistically
        self.compare_models_on_all_metrics(avg_quantum, avg_classical, "Quantum Models (avg)", "Classical Models (avg)")

In [7]:
IEEE_CIS_AE = {
  '0': {'precision': 0.80, 'recall': 0.90, 'f1-score': 0.85, 'support': 569877},
  '1': {'precision': 0.58, 'recall': 0.38, 'f1-score': 0.46, 'support': 206630},
  'accuracy': 0.76,
  'macro avg': {'precision': 0.69, 'recall': 0.64, 'f1-score': 0.65, 'support': 776507},
  'weighted avg': {'precision': 0.74, 'recall': 0.76, 'f1-score': 0.74, 'support': 776507}
}

IEEE_CIS_IF = {
  '0': {'precision': 0.80, 'recall': 0.90, 'f1-score': 0.85, 'support': 569877},
  '1': {'precision': 0.59, 'recall': 0.40, 'f1-score': 0.48, 'support': 206630},
  'accuracy': 0.77,
  'macro avg': {'precision': 0.70, 'recall': 0.65, 'f1-score': 0.66, 'support': 776507},
  'weighted avg': {'precision': 0.75, 'recall': 0.77, 'f1-score': 0.75, 'support': 776507}
}

IEEE_CIS_OSVM = {
  '0': {'precision': 0.98, 'recall': 0.90, 'f1-score': 0.93, 'support': 569877},
  '1': {'precision': 0.13, 'recall': 0.41, 'f1-score': 0.20, 'support': 206630},
  'accuracy': 0.77,
  'macro avg': {'precision': 0.70, 'recall': 0.65, 'f1-score': 0.66, 'support': 776507},
  'weighted avg': {'precision': 0.75, 'recall': 0.77, 'f1-score': 0.75, 'support': 776507}
}

ISCOM_AE = {
  '0': {'precision': 0.65, 'recall': 0.64, 'f1-score': 0.64, 'support': 1463},
  '1': {'precision': 0.50, 'recall': 0.51, 'f1-score': 0.50, 'support': 1040},
  'accuracy': 0.58,
  'macro avg': {'precision': 0.57, 'recall': 0.57, 'f1-score': 0.57, 'support': 2503},
  'weighted avg': {'precision': 0.58, 'recall': 0.58, 'f1-score': 0.58, 'support': 2503}
}

ISCOM_IF = {
  '0': {'precision': 0.62, 'recall': 0.80, 'f1-score': 0.70, 'support': 1463},
  '1': {'precision': 0.54, 'recall': 0.32, 'f1-score': 0.40, 'support': 1040},
  'accuracy': 0.60,
  'macro avg': {'precision': 0.58, 'recall': 0.56, 'f1-score': 0.55, 'support': 2503},
  'weighted avg': {'precision': 0.59, 'recall': 0.60, 'f1-score': 0.58, 'support': 2503}
}

ISCOM_OCSVM = {
  '0': {'precision': 0.63, 'recall': 0.81, 'f1-score': 0.71, 'support': 1463},
  '1': {'precision': 0.56, 'recall': 0.34, 'f1-score': 0.42, 'support': 1040},
  'accuracy': 0.61,
  'macro avg': {'precision': 0.59, 'recall': 0.57, 'f1-score': 0.57, 'support': 2503},
  'weighted avg': {'precision': 0.60, 'recall': 0.61, 'f1-score': 0.59, 'support': 2503}
}

NSL_KDD_IF = {
  '0': {'precision': 0.76, 'recall': 0.91, 'f1-score': 0.82, 'support': 9711},
  '1': {'precision': 0.92, 'recall': 0.78, 'f1-score': 0.84, 'support': 12833},
  'accuracy': 0.83,
  'macro avg': {'precision': 0.84, 'recall': 0.84, 'f1-score': 0.83, 'support': 22544},
  'weighted avg': {'precision': 0.85, 'recall': 0.83, 'f1-score': 0.84, 'support': 22544}
}

NSL_KDD_AE = {
  '0': {'precision': 0.65, 'recall': 0.88, 'f1-score': 0.75, 'support': 9711},
  '1': {'precision': 0.88, 'recall': 0.64, 'f1-score': 0.74, 'support': 12833},
  'accuracy': 0.75,
  'macro avg': {'precision': 0.77, 'recall': 0.76, 'f1-score': 0.75, 'support': 22544},
  'weighted avg': {'precision': 0.78, 'recall': 0.75, 'f1-score': 0.75, 'support': 22544}
}

NSL_KDD_OCSVM = {
  '0': {'precision': 0.78, 'recall': 0.88, 'f1-score': 0.83, 'support': 9711},
  '1': {'precision': 0.90, 'recall': 0.81, 'f1-score': 0.85, 'support': 12833},
  'accuracy': 0.84,
  'macro avg': {'precision': 0.84, 'recall': 0.85, 'f1-score': 0.84, 'support': 22544},
  'weighted avg': {'precision': 0.85, 'recall': 0.84, 'f1-score': 0.84, 'support': 22544}
}


classical_models_results = [
    # IEEE_CIS classical models
    [
        {'Label 0': IEEE_CIS_AE['0'], 'Label 1': IEEE_CIS_AE['1']},
        {'Label 0': IEEE_CIS_IF['0'], 'Label 1': IEEE_CIS_IF['1']},
        {'Label 0': IEEE_CIS_OSVM['0'], 'Label 1': IEEE_CIS_OSVM['1']},
    ],
    # ISCOM classical models
    [
        {'Label 0': ISCOM_AE['0'], 'Label 1': ISCOM_AE['1']},
        {'Label 0': ISCOM_IF['0'], 'Label 1': ISCOM_IF['1']},
        {'Label 0': ISCOM_OCSVM['0'], 'Label 1': ISCOM_OCSVM['1']},
    ],
    # NSL-KDD classical models
    [
        {'Label 0': NSL_KDD_AE['0'], 'Label 1': NSL_KDD_AE['1']},
        {'Label 0': NSL_KDD_IF['0'], 'Label 1': NSL_KDD_IF['1']},
        {'Label 0': NSL_KDD_OCSVM['0'], 'Label 1': NSL_KDD_OCSVM['1']},
    ]
]

ocsvm_scores = {
    'f1_score': [0.71,0.42, 0.93,0.20, 0.75,0.74],
    'precision': [0.63,0.56, 0.98,0.13, 0.65,0.88],
    'recall' : [0.81,0.34, 0.90,0.41, 0.88,0.64]
}

isolation_forest_scores = {

    'f1_score':     [0.70,0.40, 0.85,0.48, 0.83,0.85],
    'precision':   [0.62,0.54, 0.80,0.59, 0.78,0.90],
    'recall':      [0.80,0.32, 0.90,0.40, 0.88,0.81]
}

autoencoder_scores = {
    'f1_score':     [0.64,0.50, 0.85,0.46, 0.75,0.74],
    'precision':   [0.65,0.50, 0.80,0.58, 0.,65,0.88],
    'recall':      [0.64,0.51, 0.90,0.38, 0.88,0.64]
}


In [8]:
IEEE_CIS_QSVM = {
  'Label 0': {'precision': 0.97, 'recall': 0.96, 'f1-score': 0.965, 'support': 900.0},
  'Label 1': {'precision': 0.85, 'recall': 0.88, 'f1-score': 0.865, 'support': 100.0},
  'accuracy': 0.95,
  'macro avg': {'precision': 0.91, 'recall': 0.92, 'f1-score': 0.915, 'support': 1000.0},
  'weighted avg': {'precision': 0.95, 'recall': 0.95, 'f1-score': 0.95, 'support': 1000.0}
}

IEEE_CIS_QNN = {
  'Label 0': {'precision': 0.98, 'recall': 0.96, 'f1-score': 0.97, 'support': 900.0},
  'Label 1': {'precision': 0.86, 'recall': 0.92, 'f1-score': 0.89, 'support': 100.0},
  'accuracy': 0.9649,
  'macro avg': {'precision': 0.92, 'recall': 0.94, 'f1-score': 0.93, 'support': 1000.0},
  'weighted avg': {'precision': 0.96, 'recall': 0.96, 'f1-score': 0.96, 'support': 1000.0}
}

IEEE_CIS_QAE = {
  'Label 0': {'precision': 0.87, 'recall': 0.83, 'f1-score': 0.85, 'support': 900.0},
  'Label 1': {'precision': 0.45, 'recall': 0.58, 'f1-score': 0.51, 'support': 100.0},
  'accuracy': 0.835,
  'macro avg': {'precision': 0.66, 'recall': 0.71, 'f1-score': 0.68, 'support': 1000.0},
  'weighted avg': {'precision': 0.81, 'recall': 0.835, 'f1-score': 0.82, 'support': 1000.0}
}

SECOM_QSVM = {
  'Label 0': {'precision': 0.95, 'recall': 0.94, 'f1-score': 0.945, 'support': 1557.0},
  'Label 1': {'precision': 0.75, 'recall': 0.8, 'f1-score': 0.77, 'support': 98.0},
  'accuracy': 0.84,
  'macro avg': {'precision': 0.85, 'recall': 0.87, 'f1-score': 0.86, 'support': 1655.0},
  'weighted avg': {'precision': 0.93, 'recall': 0.94, 'f1-score': 0.93, 'support': 1655.0}
}

SECOM_QNN = {
  'Label 0': {'precision': 0.96, 'recall': 0.94, 'f1-score': 0.95, 'support': 1557.0},
  'Label 1': {'precision': 0.82, 'recall': 0.87, 'f1-score': 0.84, 'support': 98.0},
  'accuracy': 0.9351,
  'macro avg': {'precision': 0.89, 'recall': 0.905, 'f1-score': 0.895, 'support': 1655.0},
  'weighted avg': {'precision': 0.93, 'recall': 0.935, 'f1-score': 0.934, 'support': 1655.0}
}

SECOM_QAE = {
  'Label 0': {'precision': 0.85, 'recall': 0.81, 'f1-score': 0.83, 'support': 1557.0},
  'Label 1': {'precision': 0.55, 'recall': 0.7, 'f1-score': 0.62, 'support': 98.0},
  'accuracy': 0.82,
  'macro avg': {'precision': 0.7, 'recall': 0.755, 'f1-score': 0.725, 'support': 1655.0},
  'weighted avg': {'precision': 0.81, 'recall': 0.82, 'f1-score': 0.815, 'support': 1655.0}
}

NSL_KDD_QSVM = {
  'Label 0': {'precision': 0.87, 'recall': 0.85, 'f1-score': 0.86, 'support': 743.0},
  'Label 1': {'precision': 0.84, 'recall': 0.86, 'f1-score': 0.85, 'support': 1250.0},
  'accuracy': 0.85,
  'macro avg': {'precision': 0.855, 'recall': 0.855, 'f1-score': 0.855, 'support': 1993.0},
  'weighted avg': {'precision': 0.85, 'recall': 0.85, 'f1-score': 0.85, 'support': 1993.0}
}

NSL_KDD_QNN = {
  'Label 0': {'precision': 0.78, 'recall': 0.73, 'f1-score': 0.755, 'support': 743.0},
  'Label 1': {'precision': 0.7, 'recall': 0.75, 'f1-score': 0.725, 'support': 1250.0},
  'accuracy': 0.7427,
  'macro avg': {'precision': 0.74, 'recall': 0.74, 'f1-score': 0.74, 'support': 1993.0},
  'weighted avg': {'precision': 0.745, 'recall': 0.743, 'f1-score': 0.744, 'support': 1993.0}
}

NSL_KDD_QAE = {
  'Label 0': {'precision': 0.82, 'recall': 0.78, 'f1-score': 0.8, 'support': 743.0},
  'Label 1': {'precision': 0.76, 'recall': 0.8, 'f1-score': 0.78, 'support': 1250.0},
  'accuracy': 0.8,
  'macro avg': {'precision': 0.79, 'recall': 0.79, 'f1-score': 0.79, 'support': 1993.0},
  'weighted avg': {'precision': 0.8, 'recall': 0.8, 'f1-score': 0.8, 'support': 1993.0}
}


quantum_models_results = [
    # IEEE_CIS classical models
    [
        {'Label 0': IEEE_CIS_QAE['Label 0'], 'Label 1': IEEE_CIS_QAE['Label 1']},
        {'Label 0': IEEE_CIS_QNN['Label 0'], 'Label 1': IEEE_CIS_QNN['Label 1']},
        {'Label 0': IEEE_CIS_QSVM['Label 0'], 'Label 1': IEEE_CIS_QSVM['Label 1']},
    ],
    # ISCOM classical models
    [
        {'Label 0': SECOM_QAE['Label 0'], 'Label 1': SECOM_QAE['Label 1']},
        {'Label 0': SECOM_QNN['Label 0'], 'Label 1': SECOM_QNN['Label 1']},
        {'Label 0': SECOM_QSVM['Label 0'], 'Label 1': SECOM_QSVM['Label 1']},
    ],
    # NSL-KDD classical models
    [
        {'Label 0': NSL_KDD_QAE['Label 0'], 'Label 1': NSL_KDD_QAE['Label 1']},
        {'Label 0': NSL_KDD_QNN['Label 0'], 'Label 1': NSL_KDD_QNN['Label 1']},
        {'Label 0': NSL_KDD_QSVM['Label 0'], 'Label 1': NSL_KDD_QSVM['Label 1']},
    ]
]


QNN_scores = {
    'f1_score': [0.95,0.84, 0.97,0.89, 0.755,0.725],
    'precision': [0.96,0.82, 0.98,0.86, 0.78,0.7],
    'recall': [0.94,0.87, 0.96,0.92, 0.73,0.75]
}

QAE_scores = {
    'f1_score': [0.755,0.725, 0.83,0.51, 0.85,0.51],
    'precision': [0.78,0.7, 0.87,0.45, 0.87,0.45],
    'recall': [0.73,0.75, 0.83,0.58, 0.83,0.58]
}

QSVM_scores = {
    'f1_score': [0.945,0.77, 0.965,0.865, 0.86,0.85],
    'precision': [0.95,0.75, 0.97,0.85, 0.87,0.84],
    'recall': [0.94,0.8, 0.96,0.88, 0.85,0.86]
}

In [9]:
pipeline = ModelComparisonPipeline(alpha=0.1)

In [10]:
pipeline.compare_models_on_all_metrics(isolation_forest_scores,ocsvm_scores,"Isolation Forest", "OCSVM")


Comparing 'Isolation Forest' vs 'OCSVM' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - Isolation Forest scores: [0.7  0.4  0.85 0.48 0.83 0.85]
 - OCSVM scores: [0.71 0.42 0.93 0.2  0.75 0.74]
 - Test Statistic: 14.0000
 - p-value: 0.2812
Result: No significant difference between 'Isolation Forest' and 'OCSVM' (p ≥ 0.1) → Fail to reject H₀
Metric: precision
 - Isolation Forest scores: [0.62 0.54 0.8  0.59 0.78 0.9 ]
 - OCSVM scores: [0.63 0.56 0.98 0.13 0.65 0.88]
 - Test Statistic: 12.5000
 - p-value: 0.4219
Result: No significant difference between 'Isolation Forest' and 'OCSVM' (p ≥ 0.1) → Fail to reject H₀
Metric: recall
 - Isolation Forest scores: [0.8  0.32 0.9  0.4  0.88 0.81]
 - OCSVM scores: [0.81 0.34 0.9  0.41 0.88 0.64]
 - Test Statistic: 4.0000
 - p-value: 0.6425
Result: No significant difference between 'Isolation Forest' and 'OCSVM' (p ≥ 0.1) → Fail to reject H₀


  temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)


In [11]:
pipeline.compare_models_on_all_metrics(QNN_scores,ocsvm_scores,"QNN", "OCSVM")



Comparing 'QNN' vs 'OCSVM' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - QNN scores: [0.95  0.84  0.97  0.89  0.755 0.725]
 - OCSVM scores: [0.71 0.42 0.93 0.2  0.75 0.74]
 - Test Statistic: 19.0000
 - p-value: 0.0469
Result: 'QNN' performs significantly better than 'OCSVM' (p < 0.1) → Reject H₀
Metric: precision
 - QNN scores: [0.96 0.82 0.98 0.86 0.78 0.7 ]
 - OCSVM scores: [0.63 0.56 0.98 0.13 0.65 0.88]
 - Test Statistic: 13.0000
 - p-value: 0.0690
Result: 'QNN' performs significantly better than 'OCSVM' (p < 0.1) → Reject H₀
Metric: recall
 - QNN scores: [0.94 0.87 0.96 0.92 0.73 0.75]
 - OCSVM scores: [0.81 0.34 0.9  0.41 0.88 0.64]
 - Test Statistic: 17.0000
 - p-value: 0.1094
Result: No significant difference between 'QNN' and 'OCSVM' (p ≥ 0.1) → Fail to reject H₀


In [12]:
pipeline.compare_models_on_all_metrics(QNN_scores,QAE_scores,"QNN", "QAE")



Comparing 'QNN' vs 'QAE' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - QNN scores: [0.95  0.84  0.97  0.89  0.755 0.725]
 - QAE scores: [0.755 0.725 0.83  0.51  0.85  0.51 ]
 - Test Statistic: 20.0000
 - p-value: 0.0312
Result: 'QNN' performs significantly better than 'QAE' (p < 0.1) → Reject H₀
Metric: precision
 - QNN scores: [0.96 0.82 0.98 0.86 0.78 0.7 ]
 - QAE scores: [0.78 0.7  0.87 0.45 0.87 0.45]
 - Test Statistic: 20.0000
 - p-value: 0.0312
Result: 'QNN' performs significantly better than 'QAE' (p < 0.1) → Reject H₀
Metric: recall
 - QNN scores: [0.94 0.87 0.96 0.92 0.73 0.75]
 - QAE scores: [0.73 0.75 0.83 0.58 0.83 0.58]
 - Test Statistic: 20.0000
 - p-value: 0.0312
Result: 'QNN' performs significantly better than 'QAE' (p < 0.1) → Reject H₀


In [13]:
pipeline.compare_models_on_all_metrics(QNN_scores,isolation_forest_scores,"QNN", "Isolation Forest")


Comparing 'QNN' vs 'Isolation Forest' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - QNN scores: [0.95  0.84  0.97  0.89  0.755 0.725]
 - Isolation Forest scores: [0.7  0.4  0.85 0.48 0.83 0.85]
 - Test Statistic: 17.0000
 - p-value: 0.1094
Result: No significant difference between 'QNN' and 'Isolation Forest' (p ≥ 0.1) → Fail to reject H₀
Metric: precision
 - QNN scores: [0.96 0.82 0.98 0.86 0.78 0.7 ]
 - Isolation Forest scores: [0.62 0.54 0.8  0.59 0.78 0.9 ]
 - Test Statistic: 13.0000
 - p-value: 0.0690
Result: 'QNN' performs significantly better than 'Isolation Forest' (p < 0.1) → Reject H₀
Metric: recall
 - QNN scores: [0.94 0.87 0.96 0.92 0.73 0.75]
 - Isolation Forest scores: [0.8  0.32 0.9  0.4  0.88 0.81]
 - Test Statistic: 15.0000
 - p-value: 0.2188
Result: No significant difference between 'QNN' and 'Isolation Forest' (p ≥ 0.1) → Fail to reject H₀


In [14]:
pipeline.compare_models_on_all_metrics(QNN_scores,QSVM_scores,"QNN", "QSVM")


Comparing 'QNN' vs 'QSVM' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - QNN scores: [0.95  0.84  0.97  0.89  0.755 0.725]
 - QSVM scores: [0.945 0.77  0.965 0.865 0.86  0.85 ]
 - Test Statistic: 10.0000
 - p-value: 0.5781
Result: No significant difference between 'QNN' and 'QSVM' (p ≥ 0.1) → Fail to reject H₀
Metric: precision
 - QNN scores: [0.96 0.82 0.98 0.86 0.78 0.7 ]
 - QSVM scores: [0.95 0.75 0.97 0.85 0.87 0.84]
 - Test Statistic: 10.0000
 - p-value: 0.5781
Result: No significant difference between 'QNN' and 'QSVM' (p ≥ 0.1) → Fail to reject H₀
Metric: recall
 - QNN scores: [0.94 0.87 0.96 0.92 0.73 0.75]
 - QSVM scores: [0.94 0.8  0.96 0.88 0.85 0.86]
 - Test Statistic: 3.0000
 - p-value: 0.7674
Result: No significant difference between 'QNN' and 'QSVM' (p ≥ 0.1) → Fail to reject H₀


In [15]:
pipeline.compare_models_on_all_metrics(QNN_scores,autoencoder_scores,"QNN", "AE")


Comparing 'QNN' vs 'AE' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - QNN scores: [0.95  0.84  0.97  0.89  0.755 0.725]
 - AE scores: [0.64 0.5  0.85 0.46 0.75 0.74]
 - Test Statistic: 19.0000
 - p-value: 0.0469
Result: 'QNN' performs significantly better than 'AE' (p < 0.1) → Reject H₀
Skipping metric 'precision': Unequal lengths.
Metric: recall
 - QNN scores: [0.94 0.87 0.96 0.92 0.73 0.75]
 - AE scores: [0.64 0.51 0.9  0.38 0.88 0.64]
 - Test Statistic: 18.0000
 - p-value: 0.0781
Result: 'QNN' performs significantly better than 'AE' (p < 0.1) → Reject H₀


In [16]:
pipeline.compare_models_on_all_metrics(QSVM_scores,isolation_forest_scores,"QSVM", "Isolation Forest")


Comparing 'QSVM' vs 'Isolation Forest' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - QSVM scores: [0.945 0.77  0.965 0.865 0.86  0.85 ]
 - Isolation Forest scores: [0.7  0.4  0.85 0.48 0.83 0.85]
 - Test Statistic: 15.0000
 - p-value: 0.0216
Result: 'QSVM' performs significantly better than 'Isolation Forest' (p < 0.1) → Reject H₀
Metric: precision
 - QSVM scores: [0.95 0.75 0.97 0.85 0.87 0.84]
 - Isolation Forest scores: [0.62 0.54 0.8  0.59 0.78 0.9 ]
 - Test Statistic: 20.0000
 - p-value: 0.0312
Result: 'QSVM' performs significantly better than 'Isolation Forest' (p < 0.1) → Reject H₀
Metric: recall
 - QSVM scores: [0.94 0.8  0.96 0.88 0.85 0.86]
 - Isolation Forest scores: [0.8  0.32 0.9  0.4  0.88 0.81]
 - Test Statistic: 20.0000
 - p-value: 0.0312
Result: 'QSVM' performs significantly better than 'Isolation Forest' (p < 0.1) → Reject H₀


In [17]:
pipeline.compare_models_on_all_metrics(QSVM_scores,autoencoder_scores,"QSVM", "AE")


Comparing 'QSVM' vs 'AE' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - QSVM scores: [0.945 0.77  0.965 0.865 0.86  0.85 ]
 - AE scores: [0.64 0.5  0.85 0.46 0.75 0.74]
 - Test Statistic: 21.0000
 - p-value: 0.0156
Result: 'QSVM' performs significantly better than 'AE' (p < 0.1) → Reject H₀
Skipping metric 'precision': Unequal lengths.
Metric: recall
 - QSVM scores: [0.94 0.8  0.96 0.88 0.85 0.86]
 - AE scores: [0.64 0.51 0.9  0.38 0.88 0.64]
 - Test Statistic: 20.0000
 - p-value: 0.0312
Result: 'QSVM' performs significantly better than 'AE' (p < 0.1) → Reject H₀


In [18]:
pipeline.compare_models_on_all_metrics(QSVM_scores,QAE_scores,"QSVM", "QAE")


Comparing 'QSVM' vs 'QAE' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - QSVM scores: [0.945 0.77  0.965 0.865 0.86  0.85 ]
 - QAE scores: [0.755 0.725 0.83  0.51  0.85  0.51 ]
 - Test Statistic: 21.0000
 - p-value: 0.0156
Result: 'QSVM' performs significantly better than 'QAE' (p < 0.1) → Reject H₀
Metric: precision
 - QSVM scores: [0.95 0.75 0.97 0.85 0.87 0.84]
 - QAE scores: [0.78 0.7  0.87 0.45 0.87 0.45]
 - Test Statistic: 15.0000
 - p-value: 0.0216
Result: 'QSVM' performs significantly better than 'QAE' (p < 0.1) → Reject H₀
Metric: recall
 - QSVM scores: [0.94 0.8  0.96 0.88 0.85 0.86]
 - QAE scores: [0.73 0.75 0.83 0.58 0.83 0.58]
 - Test Statistic: 21.0000
 - p-value: 0.0156
Result: 'QSVM' performs significantly better than 'QAE' (p < 0.1) → Reject H₀


In [19]:
pipeline.compare_models_on_all_metrics(QSVM_scores,ocsvm_scores,"QSVM", "OCSVM")


Comparing 'QSVM' vs 'OCSVM' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - QSVM scores: [0.945 0.77  0.965 0.865 0.86  0.85 ]
 - OCSVM scores: [0.71 0.42 0.93 0.2  0.75 0.74]
 - Test Statistic: 21.0000
 - p-value: 0.0156
Result: 'QSVM' performs significantly better than 'OCSVM' (p < 0.1) → Reject H₀
Metric: precision
 - QSVM scores: [0.95 0.75 0.97 0.85 0.87 0.84]
 - OCSVM scores: [0.63 0.56 0.98 0.13 0.65 0.88]
 - Test Statistic: 18.0000
 - p-value: 0.0781
Result: 'QSVM' performs significantly better than 'OCSVM' (p < 0.1) → Reject H₀
Metric: recall
 - QSVM scores: [0.94 0.8  0.96 0.88 0.85 0.86]
 - OCSVM scores: [0.81 0.34 0.9  0.41 0.88 0.64]
 - Test Statistic: 20.0000
 - p-value: 0.0312
Result: 'QSVM' performs significantly better than 'OCSVM' (p < 0.1) → Reject H₀


In [20]:
pipeline.compare_models_on_all_metrics(QSVM_scores,QNN_scores,"QSVM", "QNN")



Comparing 'QSVM' vs 'QNN' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - QSVM scores: [0.945 0.77  0.965 0.865 0.86  0.85 ]
 - QNN scores: [0.95  0.84  0.97  0.89  0.755 0.725]
 - Test Statistic: 11.0000
 - p-value: 0.5000
Result: No significant difference between 'QSVM' and 'QNN' (p ≥ 0.1) → Fail to reject H₀
Metric: precision
 - QSVM scores: [0.95 0.75 0.97 0.85 0.87 0.84]
 - QNN scores: [0.96 0.82 0.98 0.86 0.78 0.7 ]
 - Test Statistic: 11.0000
 - p-value: 0.5000
Result: No significant difference between 'QSVM' and 'QNN' (p ≥ 0.1) → Fail to reject H₀
Metric: recall
 - QSVM scores: [0.94 0.8  0.96 0.88 0.85 0.86]
 - QNN scores: [0.94 0.87 0.96 0.92 0.73 0.75]
 - Test Statistic: 7.0000
 - p-value: 0.2326
Result: No significant difference between 'QSVM' and 'QNN' (p ≥ 0.1) → Fail to reject H₀


In [21]:
pipeline.compare_models_on_all_metrics(QAE_scores,autoencoder_scores,"QAE", "AE")


Comparing 'QAE' vs 'AE' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - QAE scores: [0.755 0.725 0.83  0.51  0.85  0.51 ]
 - AE scores: [0.64 0.5  0.85 0.46 0.75 0.74]
 - Test Statistic: 14.0000
 - p-value: 0.2812
Result: No significant difference between 'QAE' and 'AE' (p ≥ 0.1) → Fail to reject H₀
Skipping metric 'precision': Unequal lengths.
Metric: recall
 - QAE scores: [0.73 0.75 0.83 0.58 0.83 0.58]
 - AE scores: [0.64 0.51 0.9  0.38 0.88 0.64]
 - Test Statistic: 15.0000
 - p-value: 0.2188
Result: No significant difference between 'QAE' and 'AE' (p ≥ 0.1) → Fail to reject H₀


In [22]:
pipeline.compare_models_on_all_metrics(QAE_scores,isolation_forest_scores,"QAE", "Isolation Forest")


Comparing 'QAE' vs 'Isolation Forest' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - QAE scores: [0.755 0.725 0.83  0.51  0.85  0.51 ]
 - Isolation Forest scores: [0.7  0.4  0.85 0.48 0.83 0.85]
 - Test Statistic: 13.5000
 - p-value: 0.3438
Result: No significant difference between 'QAE' and 'Isolation Forest' (p ≥ 0.1) → Fail to reject H₀
Metric: precision
 - QAE scores: [0.78 0.7  0.87 0.45 0.87 0.45]
 - Isolation Forest scores: [0.62 0.54 0.8  0.59 0.78 0.9 ]
 - Test Statistic: 12.0000
 - p-value: 0.4219
Result: No significant difference between 'QAE' and 'Isolation Forest' (p ≥ 0.1) → Fail to reject H₀
Metric: recall
 - QAE scores: [0.73 0.75 0.83 0.58 0.83 0.58]
 - Isolation Forest scores: [0.8  0.32 0.9  0.4  0.88 0.81]
 - Test Statistic: 10.0000
 - p-value: 0.5781
Result: No significant difference between 'QAE' and 'Isolation Forest' (p ≥ 0.1) → Fail to reject H₀


In [23]:
pipeline.compare_classical_vs_quantum(classical_models_results, quantum_models_results, "Classical", "Quantum")


Comparing 'Quantum Models (avg)' vs 'Classical Models (avg)' using Wilcoxon Signed-Rank Test across metrics:

Metric: f1_score
 - Quantum Models (avg) scores: [0.82666667 0.63666667 0.89166667 0.81833333 0.92333333 0.82833333]
 - Classical Models (avg) scores: [0.74666667 0.56666667 0.79       0.57333333 0.82333333 0.49      ]
 - Test Statistic: 21.0000
 - p-value: 0.0156
Result: 'Quantum Models (avg)' performs significantly better than 'Classical Models (avg)' (p < 0.1) → Reject H₀
Metric: precision
 - Quantum Models (avg) scores: [0.84666667 0.58666667 0.90666667 0.79333333 0.93       0.81333333]
 - Classical Models (avg) scores: [0.7        0.65333333 0.72666667 0.68333333 0.79666667 0.53      ]
 - Test Statistic: 20.0000
 - p-value: 0.0312
Result: 'Quantum Models (avg)' performs significantly better than 'Classical Models (avg)' (p < 0.1) → Reject H₀
Metric: recall
 - Quantum Models (avg) scores: [0.80666667 0.69333333 0.87666667 0.84666667 0.91666667 0.84666667]
 - Classical Mode