## Wilcoxon signed-rank test

### Token-level micro and macro F1

In [3]:
import numpy as np
from scipy.stats import wilcoxon


# Token-level micro- and macro-averaged F1 scores, five values per model.
# NOTE(review): presumably one value per training run / seed / fold -- confirm
# against the experiment logs.
# The Wilcoxon signed-rank test applied below is a paired test, so position i of
# each list is treated as the same pairing unit across models.
bert_micro = [0.9121690255298978, 0.9097311573102187, 0.9100697501185074, 0.912913929708133, 0.9067515405972777]
bert_macro = [0.6870282026473729, 0.6585237595275365, 0.6834733456245836, 0.6679360297319112, 0.6423132830038847]

biobert_micro = [0.9170763260025874, 0.9140362225097025, 0.9163001293661062, 0.9191461836998707, 0.9159767141009055]
biobert_macro = [0.738943917021055, 0.7169422044202867, 0.7219250222154572, 0.7605814749054154, 0.7102863783965648]


pubmedbert_micro = [0.931540536252578, 0.9322544819927019, 0.9337617007774076, 0.9317785181659527, 0.93233380929716]
pubmedbert_macro = [0.7340460466537359, 0.74156121407352, 0.7464635063023489, 0.7571975840982823, 0.7458718161729185]


def perform_wilcoxon_test(scores1, scores2, label1, label2, score_type):
    """Run a paired Wilcoxon signed-rank test on two score lists and print the result.

    Parameters
    ----------
    scores1, scores2 : sequence of float
        Paired measurements of equal length; element ``i`` of ``scores1`` is
        paired with element ``i`` of ``scores2``.
    label1, label2 : str
        Display names of the two systems being compared.
    score_type : str
        Name of the metric, shown in the printed header.

    Returns
    -------
    None
        The statistic and p-value are printed, not returned.

    Notes
    -----
    ``scipy.stats.wilcoxon`` is called with its defaults (two-sided
    alternative). When the exact null distribution is used, the smallest
    attainable two-sided p-value for ``n`` pairs is ``2 / 2**n``; for
    ``n = 5`` that is 0.0625, so such a comparison cannot fall below 0.05.
    """
    stat, p_value = wilcoxon(scores1, scores2)
    print(f"Wilcoxon test for {score_type} scores between {label1} and {label2}:")
    # Fixed four-decimal formatting keeps the report aligned across comparisons
    # (e.g. "0.0000" rather than "0.0") and independent of the float type SciPy returns.
    print(f"Statistic: {stat:.4f}, p-value: {p_value:.4f}\n")

# Pairwise model comparisons on the token-level scores, listed in the order
# in which the results are reported.
token_level_comparisons = [
    (bert_micro,    biobert_micro,    "BERT",    "BioBERT",    "Micro"),
    (biobert_micro, pubmedbert_micro, "BioBERT", "PubMedBERT", "Micro"),
    (bert_macro,    biobert_macro,    "BERT",    "BioBERT",    "Macro"),
    (biobert_macro, pubmedbert_macro, "BioBERT", "PubMedBERT", "Macro"),
    (bert_macro,    pubmedbert_macro, "BERT",    "PubMedBERT", "Macro"),
    (bert_micro,    pubmedbert_micro, "BERT",    "PubMedBERT", "Micro"),
]

for comparison_args in token_level_comparisons:
    perform_wilcoxon_test(*comparison_args)

Wilcoxon test for Micro scores between BERT and BioBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Micro scores between BioBERT and PubMedBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Macro scores between BERT and BioBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Macro scores between BioBERT and PubMedBERT:
Statistic: 3.0000, p-value: 0.3125

Wilcoxon test for Macro scores between BERT and PubMedBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Micro scores between BERT and PubMedBERT:
Statistic: 0.0000, p-value: 0.0625



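#### $\rightarrow$ For token-level micro and macro F1, the performance differences are not statistically significant at the 0.05 level; however, there is a clear trend, as indicated by the average values and by p-values of 0.0625, close to the threshold of 0.05. Note that with only five paired scores per model, 0.0625 ($2/2^5$) is the smallest p-value a two-sided exact Wilcoxon signed-rank test can produce, so significance at 0.05 cannot be reached with this sample size. The BioBERT vs. PubMedBERT macro-F1 comparison (p = 0.3125) is the one case that does not show this trend.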

### Entity-level metrics

In [5]:
# Entity-level precision / recall / F1 (macro- and micro-averaged), five values per model.
# NOTE(review): presumably the same five runs as the token-level scores -- confirm.
# Values are compared pairwise by position in the Wilcoxon tests below.

# BERT
bert_macro_precision = [0.524, 0.5178, 0.5582, 0.5421, 0.5041]
bert_macro_recall    = [0.6206, 0.6172, 0.638,  0.6328, 0.6052]
bert_macro_f1        = [0.5649, 0.5599, 0.5894, 0.5754, 0.5476]
bert_micro_precision = [0.6848, 0.6876, 0.6919, 0.6896, 0.6693]
bert_micro_recall    = [0.7645, 0.7726, 0.7601, 0.7619, 0.761]
bert_micro_f1        = [0.7225, 0.7277, 0.7244, 0.7239, 0.7122]


# BioBERT
biobert_macro_precision = [0.6039, 0.5908, 0.5881, 0.6474, 0.5778]
biobert_macro_recall    = [0.744,  0.7408, 0.7089, 0.7476, 0.6698]
biobert_macro_f1        = [0.6576, 0.6398, 0.6314, 0.6832, 0.614]
biobert_micro_precision = [0.7119, 0.7175, 0.7119, 0.7273, 0.7256]
biobert_micro_recall    = [0.7986, 0.8048, 0.7986, 0.8021, 0.7789]
biobert_micro_f1        = [0.7527, 0.7586, 0.7527, 0.7629, 0.7513]


# PubMedBERT
pubmedbert_macro_precision = [0.6523, 0.6546, 0.679, 0.6731, 0.6877]
pubmedbert_macro_recall    = [0.7522, 0.7439, 0.7603, 0.737,  0.7618]
pubmedbert_macro_f1        = [0.6854, 0.6868, 0.7017, 0.6935, 0.7075]
pubmedbert_micro_precision = [0.7539, 0.7457, 0.7531, 0.7542, 0.7619]
pubmedbert_micro_recall    = [0.812,  0.8192, 0.8192, 0.8129, 0.8138]
pubmedbert_micro_f1        = [0.7819, 0.7807, 0.7847, 0.7824, 0.787]



# One row per entity-level metric: (display name, BERT, BioBERT, PubMedBERT scores).
metrics = [
    ("Macro Precision", bert_macro_precision, biobert_macro_precision, pubmedbert_macro_precision),
    ("Macro Recall", bert_macro_recall, biobert_macro_recall, pubmedbert_macro_recall),
    ("Macro F1", bert_macro_f1, biobert_macro_f1, pubmedbert_macro_f1),
    ("Micro Precision", bert_micro_precision, biobert_micro_precision, pubmedbert_micro_precision),
    ("Micro Recall", bert_micro_recall, biobert_micro_recall, pubmedbert_micro_recall),
    ("Micro F1", bert_micro_f1, biobert_micro_f1, pubmedbert_micro_f1),
]

# For every metric, run the three pairwise model comparisons in a fixed order:
# BERT vs BioBERT, BioBERT vs PubMedBERT, BERT vs PubMedBERT.
for metric_name, bert_scores, biobert_scores, pubmedbert_scores in metrics:
    model_pairs = (
        (bert_scores, biobert_scores, "BERT", "BioBERT"),
        (biobert_scores, pubmedbert_scores, "BioBERT", "PubMedBERT"),
        (bert_scores, pubmedbert_scores, "BERT", "PubMedBERT"),
    )
    for first_scores, second_scores, first_label, second_label in model_pairs:
        perform_wilcoxon_test(first_scores, second_scores, first_label, second_label, metric_name)

Wilcoxon test for Macro Precision between BERT and BioBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Macro Precision between BioBERT and PubMedBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Macro Precision between BERT and PubMedBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Macro Recall between BERT and BioBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Macro Recall between BioBERT and PubMedBERT:
Statistic: 3.0000, p-value: 0.3125

Wilcoxon test for Macro Recall between BERT and PubMedBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Macro F1 between BERT and BioBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Macro F1 between BioBERT and PubMedBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Macro F1 between BERT and PubMedBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Micro Precision between BERT and BioBERT:
Statistic: 0.0000, p-value: 0.0625

Wilcoxon test for Micro Precision between B