In [1]:
import pickle
import pandas as pd
from scipy.stats import wilcoxon

# Pickled result files, one per model family, read from the working directory.
FILES = [
    "ABLE_results.pickle",
    "ML_results.pickle",
    "BILSTM_results.pickle",
    "GRU_results.pickle",
    "CNN_results.pickle",
    "LSTM_results.pickle",
]

# Flatten the contents of every file into a single list of result records.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by a trusted run of this project.
all_results = []
for filename in FILES:
    with open(filename, "rb") as datafile:
        results = pickle.load(datafile)
    all_results += results

In [2]:
# One row per entry of all_results.
df = pd.DataFrame(all_results)

# Sorted unique model names: iterating a set gives an order that can change
# between kernel runs, which would shuffle the rows/columns of the exported
# p-value tables.
model_names = sorted(df["model"].unique())

# Placeholder metric columns, filled in by the next cell. NaN (instead of
# None) keeps them float64 so they can be used directly in numeric tests.
for metric in ("f1", "precision", "recall", "bal_acc"):
    df[metric] = float("nan")

In [3]:
# Keys of the per-class entries inside each report.
# NOTE(review): presumably the seven class labels of the task — confirm.
NUMBERS = ['0', '1', '2', '3', '4', '5', '6']

# Pull the summary metrics out of the nested `report` mapping of every row.
# Whole columns are assigned at once so they end up as float columns instead
# of object columns filled cell by cell.
reports = df['report']
df['f1'] = [float(report['macro avg']['f1-score']) for report in reports]
df['precision'] = [float(report['macro avg']['precision']) for report in reports]
df['recall'] = [float(report['macro avg']['recall']) for report in reports]

# Balanced accuracy = mean of the per-class recalls; the divisor follows
# NUMBERS so the two cannot drift apart.
df['bal_acc'] = [
    float(sum(report[num]['recall'] for num in NUMBERS) / len(NUMBERS))
    for report in reports
]

In [4]:
# Drop the bulky nested columns before exporting. Rebinding (rather than
# inplace=True) together with errors="ignore" keeps this cell re-runnable:
# a second execution no longer fails because the columns are already gone.
df = df.drop(columns=["report", "confusion_matrix"], errors="ignore")
df.to_csv("all_models_results.csv")

In [18]:
# Square model-by-model tables (initially all NaN), one per metric, that will
# receive the pairwise test p-values.
f1_test, precision_test, bal_acc_test, recall_test = (
    pd.DataFrame(index=model_names, columns=model_names) for _ in range(4)
)

In [19]:
# NOTE(review): this looks like a leftover scratch check of the table layout;
# the pairwise loop in the next cell appears to overwrite this entry — confirm
# and delete the cell.
# Written with a single .loc call (row label, column label): chained
# `frame[col][row] = value` assignment is not guaranteed to write through to
# the frame and is a no-op under pandas Copy-on-Write.
f1_test.loc["BILSTM", "RidgeClassifier"] = 1

In [20]:
# Pairwise Wilcoxon signed-rank tests between models, one table per metric.
#
# Fixes relative to the previous version of this cell:
#   * precision p-values were stored in f1_test (overwriting the F1 results
#     and leaving precision_test empty); they now go to precision_test;
#   * the balanced-accuracy test read the "recall" column; it now reads
#     "bal_acc";
#   * results are written with a single .loc call instead of chained
#     indexing, which is not guaranteed to modify the table.

# Metric column in df -> table that receives its p-values.
metric_tables = {
    "f1": f1_test,
    "precision": precision_test,
    "recall": recall_test,
    "bal_acc": bal_acc_test,
}

# Rows of both models are ordered by these keys so that observation k of one
# model is paired with observation k of the other.
pairing_keys = ["sampling", "fold"]

for i, model_a in enumerate(model_names):
    runs_a = df[df["model"] == model_a].sort_values(by=pairing_keys)
    for model_b in model_names[i + 1:]:
        runs_b = df[df["model"] == model_b].sort_values(by=pairing_keys)

        for metric, table in metric_tables.items():
            # Plain float arrays: pairing is purely positional (no pandas
            # index alignment) and the dtype is numeric.
            scores_a = runs_a[metric].to_numpy(dtype=float)
            scores_b = runs_b[metric].to_numpy(dtype=float)
            p = wilcoxon(scores_a, scores_b).pvalue

            # The test is symmetric in its two samples, so mirror the value.
            table.loc[model_b, model_a] = p
            table.loc[model_a, model_b] = p

In [21]:
# Write each p-value table to its own CSV file in the working directory.
for table, csv_name in (
    (f1_test, "f1_test.csv"),
    (precision_test, "prec_test.csv"),
    (bal_acc_test, "bal_acc_test.csv"),
    (recall_test, "recall_test.csv"),
):
    table.to_csv(csv_name)