## Combine CNNs and Naive Bayes predictions

In [1]:
import pandas as pd
import numpy as np
from sklearn.isotonic import IsotonicRegression
from sklearn.neighbors import NearestNeighbors
import networkx as nx
from sklearn.metrics import log_loss


In [2]:
# Calibrated preds of cnn 
cnn_preds_test = pd.read_csv("cnn_predictions_test.csv")
cnn_preds_test.head()

Unnamed: 0,id,cnn_class_0,cnn_class_1,cnn_class_2,cnn_class_3,cnn_class_4,cnn_class_5,cnn_class_6,cnn_class_7,cnn_class_8,cnn_class_9,true_label
0,0,0.7646409,2.929921e-08,0.0003541268,0.0002757727,0.0004640916,1.097169e-12,0.2340744,4.767119e-12,0.0001905905,1.464011e-14,0
1,1,1.475791e-07,0.9999992,1.13826e-10,1.559626e-07,5.492502e-07,7.475159e-13,1.660269e-08,1.402884e-13,3.315164e-10,1.39708e-11,1
2,2,0.01620998,4.26587e-05,0.2199001,0.002774664,0.03708358,8.34472e-06,0.723725,4.747752e-07,0.0002462436,8.926933e-06,2
3,3,0.003039407,0.001616838,0.8089375,0.008918043,0.07218361,6.735617e-09,0.1037089,8.984537e-07,0.001594747,3.810794e-08,2
4,4,0.0002765935,3.107002e-05,0.1534149,0.6109465,0.2351162,1.557162e-07,0.0001786349,8.04075e-08,3.520224e-05,6.593066e-07,3


In [3]:
# The predictions for the Naive Bayes classifier are currently not calibrated - I use it as is just to test my part. 
# Once we have the calibrated preds we can change them.

nb_preds_test = pd.read_csv("nb_predictions_test.csv")
nb_preds_test.head()

Unnamed: 0,id,nb_class_0,nb_class_1,nb_class_2,nb_class_3,nb_class_4,nb_class_5,nb_class_6,nb_class_7,nb_class_8,nb_class_9,true_label
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,3,0.0,0.0,7.248385e-174,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2
4,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3


In [4]:
# Merge the two datasets 
cnn_nb_combined_test = cnn_preds_test.merge(nb_preds_test, on="id")

# Get the probs for the CNN and NB
cnn_probs_test = cnn_nb_combined_test[[f"cnn_class_{i}" for i in range(10)]].values
nb_probs_test = cnn_nb_combined_test[[f"nb_class_{i}" for i in range(10)]].values


1. Accurate Probability Calibration for Multiple Classifiers by Leon Wenliang Zhong and James T. Kwok

In their paper they use soft voting (averaging the probabilities) to get an ensemble starting point for each class. Then they fit an isotonic regression to make the combined probability well-calibrated and finally optimise using alternating direction method of multipliers (ADMM). 

I will try to replicate their method and see if it improves the accuracy. (exact steps on the paper page 1942)

In [5]:
# Sanity checks 

# Check if probs sum to 1 for the CNN and NB
cnn_sum = cnn_nb_combined_test[[f"cnn_class_{i}" for i in range(10)]].sum(axis=1)
print(cnn_sum)

nb_sum = cnn_nb_combined_test[[f"nb_class_{i}" for i in range(10)]].sum(axis=1)
print(nb_sum)

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
9995    1.0
9996    1.0
9997    1.0
9998    1.0
9999    1.0
Length: 10000, dtype: float64
0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
9995    1.0
9996    1.0
9997    1.0
9998    1.0
9999    1.0
Length: 10000, dtype: float64


In [6]:
# Calculate the average prob between CNN and NB 
avg_probs_test = 0.5 * cnn_probs_test + 0.5 * nb_probs_test

# Get the highest (final prediction)
avg_preds_test = np.argmax(avg_probs_test, axis=1)
y_true_test = cnn_nb_combined_test["true_label_y"].values

# Get the accuracy 
accuracy = (avg_preds_test == y_true_test).mean() * 100
print(f"Accuracy Test (average): {accuracy}%")


Accuracy Test (average): 66.8%


In [7]:
# Validation preds of CNN and NB
cnn_preds_val = pd.read_csv("cnn_predictions_val.csv")
cnn_preds_val.head()

nb_preds_val = pd.read_csv("nb_predictions_val.csv")
nb_preds_val.head()

Unnamed: 0,id,nb_class_0,nb_class_1,nb_class_2,nb_class_3,nb_class_4,nb_class_5,nb_class_6,nb_class_7,nb_class_8,nb_class_9,true_label
0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5
4,4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4


In [8]:
# Merge the two datasets 
cnn_nb_combined_val = cnn_preds_val.merge(nb_preds_val, on="id")

# Get the probs for the CNN and NB
cnn_probs_val = cnn_nb_combined_val[[f"cnn_class_{i}" for i in range(10)]].values
nb_probs_val = cnn_nb_combined_val[[f"nb_class_{i}" for i in range(10)]].values

y_true_val = cnn_nb_combined_val["true_label_y"].values


In [9]:
# Repeat for validation
avg_probs_val = 0.5 * cnn_probs_val + 0.5 * nb_probs_val
avg_preds_val = np.argmax(avg_probs_val, axis=1)

accuracy_val = (avg_preds_val == y_true_val).mean() * 100
print(f"Accuracy Val (average): {accuracy_val}%")

Accuracy Val (average): 66.0111111111111%


In [10]:
from sklearn.metrics import log_loss

# Calculate NLL and ECE for validation and testing set 
nll_val = log_loss(y_true_val, avg_probs_val, labels=np.arange(10))
print(f"NLL Val (average): {nll_val}")

nll_test = log_loss(y_true_test, avg_probs_test, labels=np.arange(10))
print(f"NLL Test (average): {nll_test}")

def ece_score(probs, labels, n_bins=15):
    confidences = probs.max(axis=1)
    predictions = probs.argmax(axis=1)
    accuracies = (predictions == labels)

    bins = np.linspace(0, 1, n_bins + 1)
    ece = 0.0
    for i in range(n_bins):
        mask = (confidences > bins[i]) & (confidences <= bins[i+1])
        if mask.any():
            bin_acc = accuracies[mask].mean()
            bin_conf = confidences[mask].mean()
            ece += np.abs(bin_acc - bin_conf) * mask.mean()
    return ece

ece_val = ece_score(avg_probs_val, y_true_val)
print(f"ECE Val (average): {ece_val}")

ece_test = ece_score(avg_probs_test, y_true_test)
print(f"ECE Test (average): {ece_test}")


NLL Val (average): 0.47425961043747994
NLL Test (average): 0.46148764800955333
ECE Val (average): 0.16610358190839664
ECE Test (average): 0.16395596667470533




In [12]:
# All results for simple average (50/50)

print(f"Accuracy Test (average): {accuracy}%")
print(f"Accuracy Val (average): {accuracy_val}%")
print()

print(f"NLL Val (average): {nll_val}")
print(f"NLL Test (average): {nll_test}")
print()

print(f"ECE Val (average): {ece_val}")
print(f"ECE Test (average): {ece_test}")
print()



Accuracy Test (average): 66.8%
Accuracy Val (average): 66.0111111111111%

NLL Val (average): 0.47425961043747994
NLL Test (average): 0.46148764800955333

ECE Val (average): 0.16610358190839664
ECE Test (average): 0.16395596667470533



In [13]:
# Weighted Average using AUC (macro one vs rest) - Eq 7 from the paper
from sklearn.metrics import roc_auc_score

auc_cnn = roc_auc_score(y_true_val, cnn_probs_val, multi_class="ovr", average='macro')
auc_nb = roc_auc_score(y_true_val, nb_probs_val, multi_class="ovr", average='macro')

print(f"AUC CNN Val: {auc_cnn}  and  AUC NB Val: {auc_nb}")

AUC CNN Val: 0.9919948079561042  and  AUC NB Val: 0.828166755829904


In [15]:
# Calculate eta (Eq 7)
# μ is the average of (1 − AUCc) over the C classifiers
m = (1-auc_cnn)+(1-auc_nb)/2
eta_cnn_unnorm = np.exp(-(1-auc_cnn)/2*m)
eta_nb_unnorm = np.exp(-(1-auc_nb)/2*m)

# Z normalizes {ηc}Cc=1 to sum to 1
Z = eta_cnn_unnorm + eta_nb_unnorm

eta_cnn = eta_cnn_unnorm/Z
eta_nb = eta_nb_unnorm/Z

print(f"Eta (weights) for CNN: {eta_cnn_unnorm} and NB: {eta_nb_unnorm}")


Eta (weights) for CNN: 0.9996241395736278 and NB: 0.9919630255627921


In [16]:
# Use the weights to calculate a new wgt avg and pick the highest 
wgt_avg_probs_val = eta_cnn * cnn_probs_val + eta_nb * nb_probs_val
wgt_avg_preds_val = np.argmax(wgt_avg_probs_val, axis=1)

wgt_avg_probs_test = eta_cnn * cnn_probs_test + eta_nb * nb_probs_test
wgt_avg_preds_test = np.argmax(wgt_avg_probs_test, axis=1)


In [17]:
# Calculate the metrics for wgt_val
acc_wgt_avg_val = (wgt_avg_preds_val == y_true_val).mean() * 100
nll_wgt_avg_val = log_loss(y_true_val, wgt_avg_probs_val, labels=np.arange(10))
ece_wgt_avg_val = ece_score(wgt_avg_probs_val, y_true_val)

# Calculate the metrics for wgt_test
acc_wgt_avg_test = (wgt_avg_preds_test == y_true_test).mean() * 100
nll_wgt_avg_test = log_loss(y_true_test, wgt_avg_probs_test, labels=np.arange(10))
ece_wgt_avg_test = ece_score(wgt_avg_probs_test, y_true_test)




In [18]:
# All results for simple average (50/50)

print(f"Accuracy Test (Weighted Average): {acc_wgt_avg_test}%")
print(f"Accuracy Val (Weighted Average): {acc_wgt_avg_val}%")
print()

print(f"NLL Val (Weighted Average): {nll_wgt_avg_val}")
print(f"NLL Test (Weighted Average): {nll_wgt_avg_test}")
print()

print(f"ECE Val (Weighted Average): {ece_wgt_avg_val}")
print(f"ECE Test (Weighted Average): {ece_wgt_avg_test}")
print()

Accuracy Test (Weighted Average): 74.42%
Accuracy Val (Weighted Average): 74.05555555555556%

NLL Val (Weighted Average): 0.47311526051460695
NLL Test (Weighted Average): 0.4603667767382166

ECE Val (Weighted Average): 0.08524692050557625
ECE Test (Weighted Average): 0.08735680565192584



In [None]:
# Multi-Isotonic Calibration Model (MIC)

#  MIC constraints (soft voting) Eq 2 (paper)

# DAG Eq 3 (paper) ?? tree ordering?






2. Applying probability calibration to ensemble methods to predict 2-year mortality in patients with DLBCL
Shuanglong Fan, Zhiqiang Zhao, Hongmei Yu, Lei Wang, Chuchu Zheng, Xueqian Huang,
Zhenhuan Yang, Meng Xing, Qing Lu and Yanhong Luo

The third part is the combination of the base models. We used three methods (simple averaging, weighted averaging, and stacking) to combine the above 5 base models. Stacking or stacked generalization, which takes the outputs of the base models as its inputs, uses another machine learning algorithm (also called a meta-learner)

 https://www.analyticsvidhya.com/blog/2021/08/ensemble-stacking-for-machine-learning-and-deep-learning/ 

In [19]:
from sklearn.linear_model import LogisticRegression

def stacking_multinomial_from_frames(cnn_val_df, nb_val_df, cnn_test_df, nb_test_df):

    # Merge the two prediction files
    val_merged = cnn_val_df.merge(nb_val_df, on="id", suffixes=("_cnn", "_nb"))
    test_merged = cnn_test_df.merge(nb_test_df, on="id", suffixes=("_cnn", "_nb"))

    n_classes = 10  

    # Get the probability columns per class
    cnn_cols = [f"cnn_class_{i}" for i in range(n_classes)]
    nb_cols  = [f"nb_class_{i}"  for i in range(n_classes)]

    # Get the probability values per class
    cnn_val = val_merged[cnn_cols].values
    nb_val  = val_merged[nb_cols].values
    y_val   = val_merged["true_label_cnn"].values  

    cnn_test = test_merged[cnn_cols].values
    nb_test  = test_merged[nb_cols].values
    y_test   = test_merged["true_label_cnn"].values

    # Create the hstack table that will be needed for the regression 
    x_val  = np.hstack([cnn_val, nb_val])
    x_test = np.hstack([cnn_test, nb_test])

    # Train multinomial logistic regression
    stacker = LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        max_iter=1000
    )
    stacker.fit(x_val, y_val)

    # Get the predicted & calibrated probs 
    stacked_val  = stacker.predict_proba(x_val)
    stacked_test = stacker.predict_proba(x_test)

    return stacked_val, stacked_test, y_val, y_test, stacker


In [20]:
# Perform stacking 
stack_val, stack_test, y_val, y_test, stack_model = stacking_multinomial_from_frames(
    cnn_preds_val, nb_preds_val,
    cnn_preds_test, nb_preds_test)

# Calculate metrics for validation 
acc_stack_val = (np.argmax(stack_val, axis=1) == y_val).mean() * 100
nll_stack_val = log_loss(y_val, stack_val, labels=np.arange(stack_val.shape[1]))
ece_stack_val = ece_score(stack_val, y_val)

# Calculate metrics for testing  
acc_stack_test = (np.argmax(stack_test, axis=1) == y_test).mean() * 100
nll_stack_test = log_loss(y_test, stack_test, labels=np.arange(stack_test.shape[1]))
ece_stack_test = ece_score(stack_test, y_test)




In [21]:
# All results for calibration using stacking (meta-learner)

print(f"Accuracy Test (Stacking): {acc_stack_test}%")
print(f"Accuracy Val (Stacking): {acc_stack_val}%")
print()

print(f"NLL Val (Stacking): {nll_stack_val}")
print(f"NLL Test (Stacking): {nll_stack_test}")
print()

print(f"ECE Val (Stacking): {ece_stack_val}")
print(f"ECE Test (Stacking): {ece_stack_test}")
print()

Accuracy Test (Stacking): 89.84%
Accuracy Val (Stacking): 89.92222222222223%

NLL Val (Stacking): 0.30949097078855
NLL Test (Stacking): 0.3100217280631483

ECE Val (Stacking): 0.019311135132109285
ECE Test (Stacking): 0.0193937478421264



## Model Comparison – Fashion MNIST (CNN + Naive Bayes)

- CNN Accuracy on Test set: 89.88%
- NB Accuracy on Test set: 66.80%

### Accuracy Results

| Method               | Accuracy (Validation) | Accuracy (Test) |
|---------------------|---------------------|----------------|
| Average             | **66.01%**          | **66.80%**     |
| Weighted Average    | **74.06%**          | **74.42%**     |
| Stacking            | **89.92%**          | **89.84%**     |



### NLL and ECE Results

| Method               | NLL (Validation) | NLL (Test) | ECE (Validation) | ECE (Test) |
|---------------------|----------------|-----------|----------------|-----------|
| Average             | 0.474         | 0.461    | 0.166         | 0.164    |
| Weighted Average    | 0.473         | 0.460    | 0.085         | 0.087    |
| Stacking            | 0.309         | 0.310    | 0.019         | 0.019    |


Stacking achieves the best results with the highest accuracy and lowest NLL/ECE (accurate and well-calibrated).


3. check bayesian calibration using gaussian process
https://discourse.pymc.io/t/bayesian-model-calibration-with-gaussian-process/948