In [1]:
from scipy import stats
import numpy as np

# Macro F1 scores of the 3 runs with different seeds, keyed by experiment ID.
scores = {
    "A": [0.2444, 0.2065, 0.2386],  # Baseline
    "B": [0.3124, 0.2569, 0.3067],  # RandAug
    "C": [0.4861, 0.506, 0.5152],  # 11% data
    "D": [0.6152, 0.5954, 0.6356],  # 100% data
}

# Each entry is (first key, second key, row label). The reported delta is
# mean(second) - mean(first), so a positive delta means the second setup
# scored higher.
comparisons = [
    ("A", "B", "Baseline vs. RandAug"),
    ("A", "C", "Baseline vs. 11% Data"),
    ("B", "C", "RandAug vs. 11% Data"),
    ("C", "D", "11% Data vs. 100% Data"),
]

# Upper p-value bounds (exclusive) and their labels, strictest first so that
# the first matching entry wins.
SIGNIFICANCE_LEVELS = (
    (0.001, "JA (***)"),
    (0.01, "JA (**)"),
    (0.05, "JA (*)"),
)

# One template for the header and for every data row, so the column
# separators always line up.
ROW_FORMAT = "{:<25} | {:<8} | {:<10} | {}"


def significance_label(p_val):
    """Return the significance rating for a p-value.

    Returns "JA (***)" below 0.001, "JA (**)" below 0.01, "JA (*)" below
    0.05 and "NEIN" otherwise. A NaN p-value compares false against every
    threshold and therefore also yields "NEIN".
    """
    for threshold, rating in SIGNIFICANCE_LEVELS:
        if p_val < threshold:
            return rating
    return "NEIN"


def compare_runs(first, second):
    """Compare two groups of scores.

    Args:
        first: Scores of the first (reference) setup.
        second: Scores of the second setup.

    Returns:
        A ``(delta, p_value)`` tuple where ``delta`` is
        ``mean(second) - mean(first)`` and ``p_value`` is the two-sided
        p-value of Welch's t-test (unequal variances).
    """
    p_val = stats.ttest_ind(first, second, equal_var=False).pvalue
    delta = np.mean(second) - np.mean(first)
    return float(delta), float(p_val)


def format_row(label, delta, p_val):
    """Render one table row using the same column layout as the header."""
    return ROW_FORMAT.format(
        label,
        f"{delta:+.4f}",
        f"{p_val:.5f}",
        significance_label(p_val),
    )


header_line = ROW_FORMAT.format("Vergleich", "Delta", "p-Value", "Signifikant?")
print(header_line)
# Size the rule from the header instead of hard-coding a width.
print("-" * len(header_line))

# NOTE(review): four tests are reported here without any multiple-comparison
# correction, and every group has only three runs; treat the star ratings as
# indicative rather than conclusive.
for m1, m2, label in comparisons:
    delta, p_val = compare_runs(scores[m1], scores[m2])
    print(format_row(label, delta, p_val))

Vergleich                 | Delta    | p-Value    | Signifikant?
------------------------------------------------------------
Baseline vs. RandAug      | +0.0622   | 0.05055    | NEIN
Baseline vs. 11% Data     | +0.2726   | 0.00009    | JA (***)
RandAug vs. 11% Data      | +0.2104   | 0.00201    | JA (**)
11% Data vs. 100% Data    | +0.1130   | 0.00201    | JA (**)
