## Combine CNNs and Naive Bayes predictions

In [11]:
import pandas as pd
import numpy as np
from sklearn.isotonic import IsotonicRegression
from sklearn.neighbors import NearestNeighbors
import networkx as nx
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression

In [14]:
# Which experiment to combine: predictions on dropout images (True) or on full images (False).
USE_DROPOUT = True

# Directory that holds every prediction CSV. It is spelled out once so the notebook
# can be pointed at another data folder by editing a single line.
DATA_DIR = "/Users/ioannaioannidou/Desktop/Uppsala University/Year 2/Semester 1/Project in DS"

# File-name stems differ per model: the full-image CNN files carry no variant tag
# ("cnn_comb_..."), whereas the BLR files always carry one ("blr_full_comb_...").
if USE_DROPOUT:
    _CNN_STEM, _NB_STEM = "cnn_dropout_comb", "blr_dropout_comb"
else:
    _CNN_STEM, _NB_STEM = "cnn_comb", "blr_full_comb"

# CNN predictions are calibrated; BLR predictions are not.
# NOTE: the "*_VAL_PATH" files are the *train* part of the combination split; the
# rest of the notebook refers to it as the validation set.
CNN_TEST_PATH = f"{DATA_DIR}/{_CNN_STEM}_test_calibrated_predictions.csv"
CNN_VAL_PATH  = f"{DATA_DIR}/{_CNN_STEM}_train_calibrated_predictions.csv"
NB_TEST_PATH  = f"{DATA_DIR}/{_NB_STEM}_test_predictions.csv"
NB_VAL_PATH   = f"{DATA_DIR}/{_NB_STEM}_train_predictions.csv"


In [15]:
# Standardise dfs so merging will be easier 
def standardise_df(df, want_prefix, current_true_label_col=None):
    """Return a copy of ``df`` with the column layout the merge cells expect.

    The copy is guaranteed to have an ``id`` column and, when a label column
    can be identified, a ``true_label`` column. Every probability column
    (a name containing ``"_class_"`` and ending in an integer) is renamed to
    ``f"{want_prefix}_class_{k}"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Prediction table; it is not modified.
    want_prefix : str
        Prefix for the renamed probability columns, e.g. ``"cnn"`` or ``"nb"``.
    current_true_label_col : str, optional
        Name of the column holding the ground-truth label. When omitted, a
        column called ``"label"`` is used if present.

    Returns
    -------
    pandas.DataFrame
        The standardised copy. Column order is preserved; only names change
        (plus ``id`` inserted in first position when it was missing).
    """
    df = df.copy()

    # Without an explicit id, rows are numbered 0..n-1, so a later merge on "id"
    # pairs rows purely by their position in the file.
    if "id" not in df.columns:
        df.insert(0, "id", np.arange(len(df)))

    if "true_label" not in df.columns:
        if current_true_label_col:
            label_col = current_true_label_col
        elif "label" in df.columns:
            label_col = "label"
        else:
            label_col = None  # no label available; the frame is left unlabeled
        if label_col is not None:
            df = df.rename(columns={label_col: "true_label"})

    rename_map = {}
    for col in df.columns:
        if "_class_" not in col:
            continue
        try:
            class_idx = int(col.split("_")[-1])
        except ValueError:
            # Not a per-class probability column (e.g. "meta_class_name").
            # Treating it as class 0 would create duplicate "<prefix>_class_0"
            # columns, so it is left untouched.
            continue
        rename_map[col] = f"{want_prefix}_class_{class_idx}"

    return df.rename(columns=rename_map)

1. Accurate Probability Calibration for Multiple Classifiers by Leon Wenliang Zhong and James T. Kwok

In their paper they use soft voting (averaging the probabilities) to get an ensemble starting point for each class. Then they fit an isotonic regression to make the combined probability well-calibrated and finally optimise using alternating direction method of multipliers (ADMM). 

I will try to replicate their method and see if it improves the accuracy. (The exact steps are listed on page 1942 of the paper.)

In [16]:
# TEST DATASET
# CNN predictions (already calibrated), standardised to id / true_label / cnn_class_k
cnn_preds_test = standardise_df(pd.read_csv(CNN_TEST_PATH), want_prefix="cnn")

# The predictions for the Naive Bayes classifier are currently not calibrated - they are
# used as is just to test this part. Once calibrated preds exist they can be swapped in.
# BLR files usually have 'label' and no 'id': standardise_df renames 'label' and numbers
# the rows, so the merge below pairs rows by position (assumes both CSVs list the
# samples in the same order - TODO confirm).
nb_preds_test = standardise_df(pd.read_csv(NB_TEST_PATH), want_prefix="nb")

# Merge the two datasets
cnn_nb_combined_test = cnn_preds_test.merge(nb_preds_test, on="id")

# Per-class probability columns of each model, in class order 0..9
cnn_prob_cols = [f"cnn_class_{i}" for i in range(10)]
nb_prob_cols  = [f"nb_class_{i}"  for i in range(10)]

# Get the probs for the CNN and NB
cnn_probs_test = cnn_nb_combined_test[cnn_prob_cols].values
nb_probs_test  = cnn_nb_combined_test[nb_prob_cols].values

# Sanity checks
# Check if probs sum to 1 for the CNN and NB
cnn_sum = cnn_nb_combined_test[cnn_prob_cols].sum(axis=1)
print(cnn_sum)

nb_sum = cnn_nb_combined_test[nb_prob_cols].sum(axis=1)
print(nb_sum)

# Calculate the average prob between CNN and NB
avg_probs_test = 0.5 * cnn_probs_test + 0.5 * nb_probs_test

# Get the highest (final prediction)
avg_preds_test = np.argmax(avg_probs_test, axis=1)
# Both frames carry 'true_label', so the merge suffixes them: '_x' is the CNN file's
# label and '_y' the NB file's. The NB copy is used here.
y_true_test = cnn_nb_combined_test["true_label_y"].values

# Get the accuracy
accuracy = (avg_preds_test == y_true_test).mean() * 100
print(f"Accuracy Test (average): {accuracy}%")


0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
4795    1.0
4796    1.0
4797    1.0
4798    1.0
4799    1.0
Length: 4800, dtype: float64
0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
4795    1.0
4796    1.0
4797    1.0
4798    1.0
4799    1.0
Length: 4800, dtype: float64
Accuracy Test (average): 81.97916666666667%


In [17]:
# VALIDATION DATASET
# Validation preds of CNN and NB, standardised to id / true_label / <prefix>_class_k
cnn_preds_val = standardise_df(pd.read_csv(CNN_VAL_PATH), want_prefix="cnn")
nb_preds_val = standardise_df(pd.read_csv(NB_VAL_PATH), want_prefix="nb")

# Merge the two datasets
cnn_nb_combined_val = cnn_preds_val.merge(nb_preds_val, on="id")

# Per-class probability columns of each model, in class order 0..9
cnn_prob_cols = [f"cnn_class_{i}" for i in range(10)]
nb_prob_cols  = [f"nb_class_{i}"  for i in range(10)]

# Get the probs for the CNN and NB
cnn_probs_val = cnn_nb_combined_val[cnn_prob_cols].values
nb_probs_val  = cnn_nb_combined_val[nb_prob_cols].values

# '_y' is the suffix the merge gives to the right-hand (NB) frame's 'true_label'
y_true_val = cnn_nb_combined_val["true_label_y"].values

# Same 50/50 average and arg-max prediction as for the test split
avg_probs_val = 0.5 * cnn_probs_val + 0.5 * nb_probs_val
avg_preds_val = np.argmax(avg_probs_val, axis=1)

accuracy_val = (avg_preds_val == y_true_val).mean() * 100
print(f"Accuracy Val (average): {accuracy_val}%")


Accuracy Val (average): 82.359375%


In [18]:
from sklearn.metrics import log_loss

# Negative log-likelihood of the 50/50 average, validation split first, then test.
# labels= is given explicitly so the 10 probability columns are read as classes 0..9.
nll_val = log_loss(y_true=y_true_val, y_pred=avg_probs_val, labels=np.arange(10))
print(f"NLL Val (average): {nll_val}")

nll_test = log_loss(y_true=y_true_test, y_pred=avg_probs_test, labels=np.arange(10))
print(f"NLL Test (average): {nll_test}")


def ece_score(probs, labels, n_bins=15):
    """Expected calibration error of the top-label confidence.

    Samples are grouped into ``n_bins`` equal-width, right-closed confidence
    bins over (0, 1]. For each non-empty bin the absolute gap between its
    accuracy and its mean confidence is weighted by the fraction of samples
    that fall in the bin; the weighted gaps are summed.

    Parameters
    ----------
    probs : numpy.ndarray
        Array indexed as (sample, class) holding predicted probabilities.
    labels : array-like
        True class index of every sample.
    n_bins : int, default 15
        Number of confidence bins.

    Returns
    -------
    float
        The accumulated calibration error (0.0 when no bin is populated).
    """
    top_conf = probs.max(axis=1)
    is_correct = probs.argmax(axis=1) == labels
    edges = np.linspace(0, 1, n_bins + 1)

    total = 0.0
    for lower, upper in zip(edges[:-1], edges[1:]):
        in_bin = (top_conf > lower) & (top_conf <= upper)
        if not in_bin.any():
            continue
        gap = np.abs(is_correct[in_bin].mean() - top_conf[in_bin].mean())
        total += gap * in_bin.mean()
    return total

# Expected calibration error of the 50/50 average on each split (default 15 bins)
ece_val = ece_score(probs=avg_probs_val, labels=y_true_val)
print(f"ECE Val (average): {ece_val}")

ece_test = ece_score(probs=avg_probs_test, labels=y_true_test)
print(f"ECE Test (average): {ece_test}")


NLL Val (average): 0.5498834077136138
NLL Test (average): 0.5515493621529449
ECE Val (average): 0.11573917825881024
ECE Test (average): 0.11386967372517837




In [19]:
# All results for simple average (50/50), one metric group per paragraph
print(
    f"Accuracy Test (average): {accuracy}%",
    f"Accuracy Val (average): {accuracy_val}%",
    "",
    f"NLL Val (average): {nll_val}",
    f"NLL Test (average): {nll_test}",
    "",
    f"ECE Val (average): {ece_val}",
    f"ECE Test (average): {ece_test}",
    "",
    sep="\n",
)

Accuracy Test (average): 81.97916666666667%
Accuracy Val (average): 82.359375%

NLL Val (average): 0.5498834077136138
NLL Test (average): 0.5515493621529449

ECE Val (average): 0.11573917825881024
ECE Test (average): 0.11386967372517837



In [20]:
# Weighted Average using AUC (macro one-vs-rest) - Eq 7 from the paper
from sklearn.metrics import roc_auc_score

# The weights are estimated on the validation split only and reused unchanged on test.
auc_cnn = roc_auc_score(y_true_val, cnn_probs_val, multi_class="ovr", average='macro')
auc_nb  = roc_auc_score(y_true_val, nb_probs_val,  multi_class="ovr", average='macro')

print(f"AUC CNN Val: {auc_cnn}  and  AUC NB Val: {auc_nb}")

# Calculate eta (Eq 7)
# μ (here `m`) is the average of (1 − AUC_c) over the two classifiers
m = ((1 - auc_cnn) + (1 - auc_nb)) / 2.0

# η_c ∝ exp( - (1 - AUC_c) / (2 μ) )  → then normalize to sum to 1
if m > 0:
    eta_cnn_unnorm = np.exp(-(1 - auc_cnn) / (2.0 * m))
    eta_nb_unnorm  = np.exp(-(1 - auc_nb)  / (2.0 * m))
else:
    # Both classifiers reach AUC = 1, so the exponent is 0/0 and would yield NaN
    # weights. Neither model is preferable in that case: weight them equally.
    eta_cnn_unnorm = eta_nb_unnorm = 1.0

# Z normalizes the η_c so that they sum to 1
Z = eta_cnn_unnorm + eta_nb_unnorm

eta_cnn = eta_cnn_unnorm / Z
eta_nb  = eta_nb_unnorm  / Z

print(f"Eta (weights) for CNN: {eta_cnn} and NB: {eta_nb}")

# Use the weights to calculate a new wgt avg and pick the highest
wgt_avg_probs_val  = eta_cnn * cnn_probs_val + eta_nb * nb_probs_val
wgt_avg_preds_val  = np.argmax(wgt_avg_probs_val, axis=1)

wgt_avg_probs_test = eta_cnn * cnn_probs_test + eta_nb * nb_probs_test
wgt_avg_preds_test = np.argmax(wgt_avg_probs_test, axis=1)

# Calculate the metrics for wgt_val
acc_wgt_avg_val  = (wgt_avg_preds_val  == y_true_val).mean() * 100
nll_wgt_avg_val  = log_loss(y_true_val,  wgt_avg_probs_val,  labels=np.arange(10))
ece_wgt_avg_val  = ece_score(wgt_avg_probs_val,  y_true_val)

# Calculate the metrics for wgt_test
acc_wgt_avg_test = (wgt_avg_preds_test == y_true_test).mean() * 100
nll_wgt_avg_test = log_loss(y_true_test, wgt_avg_probs_test, labels=np.arange(10))
ece_wgt_avg_test = ece_score(wgt_avg_probs_test, y_true_test)



AUC CNN Val: 0.9810783683244964  and  AUC NB Val: 0.9486706081732063
Eta (weights) for CNN: 0.6133257779143674 and NB: 0.38667422208563257




In [21]:
# All results for weighted average, one metric group per paragraph
print(
    f"Accuracy Test (Weighted Average): {acc_wgt_avg_test}%",
    f"Accuracy Val (Weighted Average): {acc_wgt_avg_val}%",
    "",
    f"NLL Val (Weighted Average): {nll_wgt_avg_val}",
    f"NLL Test (Weighted Average): {nll_wgt_avg_test}",
    "",
    f"ECE Val (Weighted Average): {ece_wgt_avg_val}",
    f"ECE Test (Weighted Average): {ece_wgt_avg_test}",
    "",
    sep="\n",
)

Accuracy Test (Weighted Average): 81.95833333333333%
Accuracy Val (Weighted Average): 82.328125%

NLL Val (Weighted Average): 0.5215816772425292
NLL Test (Weighted Average): 0.5227144528971687

ECE Val (Weighted Average): 0.08237789081689495
ECE Test (Weighted Average): 0.08091613027857282



In [None]:
# TODO: Multi-Isotonic Calibration Model (MIC) - not implemented yet

# Step 1: MIC constraints (soft voting), Eq. 2 of the paper

# Step 2: DAG constraints, Eq. 3 of the paper (open question: use the tree ordering?)

2. Applying probability calibration to ensemble methods to predict 2-year mortality in patients with DLBCL
Shuanglong Fan, Zhiqiang Zhao, Hongmei Yu, Lei Wang, Chuchu Zheng, Xueqian Huang,
Zhenhuan Yang, Meng Xing, Qing Lu and Yanhong Luo

The third part is the combination of the base models. We used three methods (simple averaging, weighted averaging, and stacking) to combine the above 5 base models. Stacking, or stacked generalization, takes the outputs of the base models as its inputs and uses another machine learning algorithm (also called a meta-learner) to combine them.

In [22]:
# Stacking meta-learner
def stacking_multinomial_from_frames(cnn_val_df, nb_val_df, cnn_test_df, nb_test_df, n_classes=10):
    """Fit a multinomial logistic-regression stacker on CNN + NB probabilities.

    The CNN and NB frames of each split are merged on ``id``. The stacker's
    features are the ``cnn_class_k`` columns followed by the ``nb_class_k``
    columns. It is fitted on the validation frames and then applied to both
    the validation and the test frames, so the validation outputs are
    in-sample predictions.

    Parameters
    ----------
    cnn_val_df, nb_val_df : pandas.DataFrame
        Validation predictions holding ``id``, ``true_label`` and the
        ``cnn_class_k`` / ``nb_class_k`` probability columns.
    cnn_test_df, nb_test_df : pandas.DataFrame
        Test predictions with the same layout.
    n_classes : int, default 10
        Number of classes, i.e. how many probability columns to read per model.

    Returns
    -------
    tuple
        ``(stacked_val, stacked_test, y_val, y_test, stacker)``: the stacked
        probabilities for both splits, the matching true labels, and the
        fitted ``LogisticRegression``.

    Raises
    ------
    KeyError
        If an expected probability or label column is missing after the merge.
    """
    # Merge the two prediction files
    val_merged = cnn_val_df.merge(nb_val_df, on="id", suffixes=("_cnn", "_nb"))
    test_merged = cnn_test_df.merge(nb_test_df, on="id", suffixes=("_cnn", "_nb"))

    # Probability columns per class, CNN first
    cnn_cols = [f"cnn_class_{i}" for i in range(n_classes)]
    nb_cols  = [f"nb_class_{i}"  for i in range(n_classes)]

    def _true_labels(merged):
        # merge() only appends suffixes to names present in BOTH frames, so
        # when the NB frame has no label the CNN label keeps its plain name.
        label_col = "true_label_cnn" if "true_label_cnn" in merged.columns else "true_label"
        return merged[label_col].values

    y_val = _true_labels(val_merged)
    y_test = _true_labels(test_merged)

    # Feature matrices for the regression: [CNN probs | NB probs]
    x_val  = np.hstack([val_merged[cnn_cols].values, val_merged[nb_cols].values])
    x_test = np.hstack([test_merged[cnn_cols].values, test_merged[nb_cols].values])

    # Train multinomial logistic regression
    # NOTE(review): `multi_class` is reported as deprecated in recent scikit-learn
    # releases - confirm against the installed version before upgrading.
    stacker = LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        max_iter=1000
    )
    stacker.fit(x_val, y_val)

    # Stacked probabilities for both splits
    stacked_val  = stacker.predict_proba(x_val)
    stacked_test = stacker.predict_proba(x_test)

    return stacked_val, stacked_test, y_val, y_test, stacker

# Fit the stacker on the validation frames and score both splits
stack_val, stack_test, y_val, y_test, stack_model = stacking_multinomial_from_frames(
    cnn_val_df=cnn_preds_val,
    nb_val_df=nb_preds_val,
    cnn_test_df=cnn_preds_test,
    nb_test_df=nb_preds_test,
)



In [23]:
# Metrics on the validation split (the data the stacker was fitted on)
acc_stack_val = (stack_val.argmax(axis=1) == y_val).mean() * 100
nll_stack_val = log_loss(y_true=y_val, y_pred=stack_val, labels=np.arange(stack_val.shape[1]))
ece_stack_val = ece_score(probs=stack_val, labels=y_val)

# Metrics on the test split
acc_stack_test = (stack_test.argmax(axis=1) == y_test).mean() * 100
nll_stack_test = log_loss(y_true=y_test, y_pred=stack_test, labels=np.arange(stack_test.shape[1]))
ece_stack_test = ece_score(probs=stack_test, labels=y_test)

# All results for calibration using stacking (meta-learner)
print(
    f"Accuracy Test (Stacking): {acc_stack_test}%",
    f"Accuracy Val (Stacking): {acc_stack_val}%",
    "",
    f"NLL Val (Stacking): {nll_stack_val}",
    f"NLL Test (Stacking): {nll_stack_test}",
    "",
    f"ECE Val (Stacking): {ece_stack_val}",
    f"ECE Test (Stacking): {ece_stack_test}",
    "",
    sep="\n",
)

Accuracy Test (Stacking): 84.85416666666666%
Accuracy Val (Stacking): 84.8125%

NLL Val (Stacking): 0.4141678229236834
NLL Test (Stacking): 0.41917520434260797

ECE Val (Stacking): 0.008308374090578658
ECE Test (Stacking): 0.012554928047833465



# SUMMARY FULL IMAGE

### Full Images – Combination Results

| Method                            | Accuracy (Val) | Accuracy (Test) | NLL (Val) | NLL (Test) | ECE (Val) | ECE (Test) |
|----------------------------------|----------------|-----------------|------------|-------------|------------|-------------|
| **Simple Average (50/50)**       | 88.42%         | 88.58%          | 0.4876     | 0.4833      | 0.1781     | 0.1789      |
| **Weighted Average (AUC-based)** | 88.46%         | 88.50%          | 0.4169     | 0.4131      | 0.1209     | 0.1206      |
| **Stacking (Meta-Model)**        | 88.94%         | 88.19%          | 0.3213     | 0.3333      | 0.0138     | 0.0197      |

Stacking achieves lowest ECE and NLL, while accuracy differences are minimal across methods.


# SUMMARY DROPOUT IMAGES

### Dropout Images – Combination Results

| Method                            | Accuracy (Val) | Accuracy (Test) | NLL (Val) | NLL (Test) | ECE (Val) | ECE (Test) |
|----------------------------------|----------------|-----------------|------------|-------------|------------|-------------|
| **Simple Average (50/50)**       | 82.36%         | 81.98%          | 0.5499     | 0.5515      | 0.1157     | 0.1139      |
| **Weighted Average (AUC-based)** | 82.33%         | 81.96%          | 0.5216     | 0.5227      | 0.0824     | 0.0809      |
| **Stacking (Meta-Model)**        | 84.81%         | 84.85%          | 0.4142     | 0.4192      | 0.0083     | 0.0126      |

 Stacking again achieves the best performance, significantly improving both accuracy and ECE compared to averaging methods.


# Train several meta learners 
Important to mention that I combined the predictions of the CNN and the BLR for both full and dropout images: these 4 sets of columns × 10 classes give the 40 features that will be used to train a new meta-learner.

- A static one (Logistic Regression) that will learn the weights for each model and for each image type (full or dropout) and apply them to make new predictions

- A dynamic one (Random Forest and Gradient Boosting) that will learn to adjust the weights based on the patterns observed in the input features

In [24]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import log_loss

In [25]:
# Load the csv files containing the predictions
def load_preds(path, prefix):
    """Read a prediction CSV and normalise its column names.

    A ``label`` column is renamed to ``true_label`` when no ``true_label``
    column exists. Every column whose name contains ``"_class_"`` becomes
    ``f"{prefix}_class_{suffix}"``, where ``suffix`` is the text after the
    last underscore. An ``id`` column numbering the rows 0..n-1 is inserted
    in first position if the file has none.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    prefix : str
        Prefix for the probability columns, e.g. ``"cnn_full"``.

    Returns
    -------
    pandas.DataFrame
        The loaded table with normalised column names.
    """
    preds = pd.read_csv(path).copy()

    needs_label_rename = "true_label" not in preds.columns and "label" in preds.columns
    if needs_label_rename:
        preds = preds.rename(columns={"label": "true_label"})

    # Build the full mapping first, then rename every probability column at once
    renamed = {}
    for col in preds.columns:
        if "_class_" in col:
            suffix = col.split("_")[-1]
            renamed[col] = f"{prefix}_class_{suffix}"
    preds = preds.rename(columns=renamed)

    if "id" not in preds.columns:
        preds.insert(0, "id", np.arange(len(preds)))
    return preds

# One (train, test) pair per model/variant; files are resolved relative to the
# notebook's working directory.
cnn_full_train, cnn_full_test = (
    load_preds(csv_name, "cnn_full")
    for csv_name in (
        "cnn_comb_train_calibrated_predictions.csv",
        "cnn_comb_test_calibrated_predictions.csv",
    )
)

cnn_drop_train, cnn_drop_test = (
    load_preds(csv_name, "cnn_drop")
    for csv_name in (
        "cnn_dropout_comb_train_calibrated_predictions.csv",
        "cnn_dropout_comb_test_calibrated_predictions.csv",
    )
)

blr_full_train, blr_full_test = (
    load_preds(csv_name, "blr_full")
    for csv_name in (
        "blr_full_comb_train_predictions.csv",
        "blr_full_comb_test_predictions.csv",
    )
)

blr_drop_train, blr_drop_test = (
    load_preds(csv_name, "blr_drop")
    for csv_name in (
        "blr_dropout_comb_train_predictions.csv",
        "blr_dropout_comb_test_predictions.csv",
    )
)

In [27]:
# Clean up duplicate true_label columns before merging otherwise python complaints  
def drop_extra_labels(df):
    """Keep only the first column whose name contains ``"true_label"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may hold several label columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without the surplus label columns, or ``df`` itself when
        it has at most one such column.
    """
    label_like = [name for name in df.columns if "true_label" in name]
    surplus = label_like[1:]
    return df.drop(columns=surplus) if surplus else df

# Leave a single true_label column in every frame, train frames first
cnn_full_train, cnn_drop_train, blr_full_train, blr_drop_train = map(
    drop_extra_labels,
    (cnn_full_train, cnn_drop_train, blr_full_train, blr_drop_train),
)
cnn_full_test, cnn_drop_test, blr_full_test, blr_drop_test = map(
    drop_extra_labels,
    (cnn_full_test, cnn_drop_test, blr_full_test, blr_drop_test),
)


def _merge_on_id(first, *others):
    """Chain-merge frames on ``id``; clashing columns of the k-th extra frame get ``_dup<k>``."""
    merged = first
    for position, frame in enumerate(others, start=1):
        merged = merged.merge(frame, on="id", suffixes=("", f"_dup{position}"))
    return merged


# Merge datasets on id
train_df = _merge_on_id(cnn_full_train, cnn_drop_train, blr_full_train, blr_drop_train)
test_df = _merge_on_id(cnn_full_test, cnn_drop_test, blr_full_test, blr_drop_test)

print("Train:", train_df.shape, "Test:", test_df.shape)


Train: (19200, 45) Test: (4800, 45)


In [29]:
# Split features (every per-class probability column) from the target
feature_cols = [c for c in train_df.columns if "_class_" in c]
X_train = train_df[feature_cols].values
y_train = train_df["true_label"].values
X_test  = test_df[feature_cols].values
y_test  = test_df["true_label"].values

print("Features:", len(feature_cols))

# Meta-learners used
models = {
    "LogReg": LogisticRegression(max_iter=1000, multi_class="multinomial"),
    "RF": RandomForestClassifier(n_estimators=200, max_depth=None, random_state=0),
    "GBM": GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, max_depth=3, random_state=0)}

# Fit each model, then score it on the training data and on the test data.
# The "(Val)" entries are therefore IN-SAMPLE figures (predictions on X_train itself),
# not held-out validation results.
# Loop-local names carry a `meta_` prefix so they do not overwrite the notebook-level
# nll_val / nll_test / ece_val / ece_test produced by the simple-average cells.
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    meta_probs_train = model.predict_proba(X_train)
    meta_probs_test = model.predict_proba(X_test)

    results.append({
        "Model": name,
        "Accuracy (Val)": (np.argmax(meta_probs_train, 1) == y_train).mean() * 100,
        "Accuracy (Test)": (np.argmax(meta_probs_test, 1) == y_test).mean() * 100,
        "NLL (Val)": log_loss(y_train, meta_probs_train, labels=np.arange(meta_probs_train.shape[1])),
        "NLL (Test)": log_loss(y_test, meta_probs_test, labels=np.arange(meta_probs_test.shape[1])),
        "ECE (Val)": ece_score(meta_probs_train, y_train),
        "ECE (Test)": ece_score(meta_probs_test, y_test)})


Features: 40




In [30]:
# Summary table: accuracies as percentages with 2 decimals, NLL / ECE with 4 decimals
results_df = pd.DataFrame(results)
column_formats = {
    "Accuracy (Val)": "{:.2f}%",
    "Accuracy (Test)": "{:.2f}%",
    "NLL (Val)": "{:.4f}",
    "NLL (Test)": "{:.4f}",
    "ECE (Val)": "{:.4f}",
    "ECE (Test)": "{:.4f}",
}
styled_results = results_df.style.format(column_formats)
display(styled_results)


Unnamed: 0,Model,Accuracy (Val),Accuracy (Test),NLL (Val),NLL (Test),ECE (Val),ECE (Test)
0,LogReg,89.57%,89.02%,0.299,0.3116,0.0128,0.0142
1,RF,100.00%,89.27%,0.0686,0.3434,0.0607,0.0169
2,GBM,95.17%,88.90%,0.1493,0.3128,0.0351,0.0243


### Meta-Learner (Full & Dropout Images Predictions)

| Model | Accuracy (Val) | Accuracy (Test) | NLL (Val) | NLL (Test) | ECE (Val) | ECE (Test) |
|--------|----------------|-----------------|------------|-------------|------------|-------------|
| **Logistic Regression** | 89.57% | 89.02% | 0.2990 | 0.3116 | 0.0128 | 0.0142 |
| **Random Forest**       | 100.00% | 89.27% | 0.0686 | 0.3434 | 0.0607 | 0.0169 |
| **Gradient Boosting**   | 95.17% | 88.90% | 0.1493 | 0.3128 | 0.0351 | 0.0243 |


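- Logistic Regression provides the best generalization (lowest test NLL, lowest ECE on both val/test, and the smallest val-to-test gap); note that the "Val" columns are computed on the data the meta-learners were fitted on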
- Random Forest clearly overfits (100% Val Acc but drop on Test + higher NLL)
- Gradient Boosting performs well, slightly under LogReg, showing slight overfitting too 

Logistic Regression (the static meta-learner) is the best for now; the nonlinear models (Random Forest, Gradient Boosting) still need tuning to improve their generalisation.
