# Calibration examples

This notebook shows how to calculcate different calibaration errors and apply differen calibration techniques

In the alpaca library following calibration errors are implemented:
- Expected Calibration Error (ECE)
- Static Calibration Error (SCE)
- Adaptive Calibration Error (ACE)
- Thresholded Adaptive Calibration Error (TACE)

As methods we use:
- Temperature Scaling
- Vector Scaling
- Matrix Scaling

In [1]:
import math

import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import accuracy_score
from scipy.special import softmax

from torch.nn import functional as f
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader

from alpaca.utils.datasets.builder import build_dataset
import alpaca.calibrator as calibrator

In [2]:
# Let's build helper function to calculate all the calibration errors
def compute_errors(n_bins, probs, labels, len_dataset, threshold):
    ece = calibrator.compute_ece(n_bins, probs, labels, len_dataset)
    sce = calibrator.compute_sce(n_bins, probs, labels)
    ace = calibrator.compute_ace(n_bins, probs, labels)
    tace = calibrator.compute_tace(threshold, probs, labels, n_bins)
    errors = {
        'ece' : ece,
        'sce' : sce,
        'ace' : ace,
        'tace' : tace
    }
    for name, calibration_error in errors.items():
        print(name, ' = ', calibration_error)

### Model training
We showcase the calibration approaches with simple neural network and MNIST datset. To start with, we'll take the data, build neural net and train it.

In [3]:
mnist = build_dataset('mnist', val_size=10_000)
X_train, y_train = mnist.dataset('train')
X_val, y_val = mnist.dataset('val')
X_cal = X_train[48000:][:]
X_train = X_train[0:48000][:]
y_cal = y_train[48000:][:]
y_train = y_train[0:48000][:]

x_shape = (-1, 1, 28, 28)

train_ds = TensorDataset(torch.FloatTensor(X_train.reshape(x_shape)), torch.LongTensor(y_train))
val_ds = TensorDataset(torch.FloatTensor(X_val.reshape(x_shape)), torch.LongTensor(y_val))
train_loader = DataLoader(train_ds, batch_size=512)
val_loader = DataLoader(val_ds, batch_size=512)
cal_ds = TensorDataset(torch.FloatTensor(X_cal.reshape(x_shape)), torch.LongTensor(y_cal))
cal_loader = DataLoader(cal_ds, batch_size=512)
X_val.shape

(10000, 784)

In [4]:
class Net(nn.Module):   
    def __init__(self):
        super(Net, self).__init__()

        self.cnn_layers = nn.Sequential(
            nn.Conv2d(1, 4, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.linear_layers = nn.Sequential(
            nn.Linear(4 * 7 * 7, 10)
        )
  
    def forward(self, x):
        x = self.cnn_layers(x)
        x = x.view(x.size(0), -1)
        x = self.linear_layers(x)
        return x

In [5]:
model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [6]:
for epoch in range(7):
    for x_batch, y_batch in train_loader: # Train for one epoch
        print('.', end='')
        prediction = model(x_batch)
        optimizer.zero_grad()
        loss = criterion(prediction, y_batch)
        loss.backward()
        optimizer.step()
    print('\nTrain loss on last batch', loss.item())

# Check accuracy
x_batch, y_batch = next(iter(val_loader))


class_preds = f.softmax(model(x_batch), dim=-1).detach().numpy()
predictions = np.argmax(class_preds, axis=-1)
print('Accuracy', accuracy_score(predictions, y_batch))

..............................................................................................
Train loss on last batch 0.5327802896499634
..............................................................................................
Train loss on last batch 0.27037015557289124
..............................................................................................
Train loss on last batch 0.1935376077890396
..............................................................................................
Train loss on last batch 0.15540318191051483
..............................................................................................
Train loss on last batch 0.13320226967334747
..............................................................................................
Train loss on last batch 0.11954189091920853
..............................................................................................
Train loss on last batch 0.11020981520414352
Accuracy 0.97265625


## Calibration

Calibration applies to the logit outputs of the network. Usuall pipeline is as follows:
- Train the model with logits outputs
- Calculate the logits for some calibration dataset (it basically a validation dataset in some sense)
- Wrap the model with calibration model and train it on the calibration logits 

In [7]:
logits_list = []
labels_list = []
for x_batch, y_batch in cal_loader:
    logits_list.append(model(x_batch))
    labels_list.append(y_batch)
logits = torch.cat(logits_list)
labels = torch.cat(labels_list)
logits.detach_()


tensor([[ -2.3510,  -2.6608,  -4.4777,  ...,   9.7287,  -2.8388,   5.9241],
        [  7.5125,  -5.3876,   0.5365,  ...,  -7.2053,  -0.0680,  -1.2508],
        [ -1.4826,   6.4404,  -0.1426,  ...,  -1.9341,  -0.6942,  -1.3237],
        ...,
        [ -2.7429,  -6.0311,  -5.5568,  ...,  -8.2758,   5.0076,   0.1018],
        [ -3.4007,  -6.8022,  -0.8077,  ...,   1.9691,  -1.2460,   8.5202],
        [  8.9903, -10.3975,  -1.4172,  ...,  -7.7311,   0.1962,   0.8563]])

### Temperature Scaling

In [8]:
calibr = calibrator.ModelWithTempScaling(model)

In [9]:
calibr.scaling(logits, labels)

ModelWithTempScaling(
  (model): Net(
    (cnn_layers): Sequential(
      (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (5): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU(inplace=True)
      (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (linear_layers): Sequential(
      (0): Linear(in_features=196, out_features=10, bias=True)
    )
  )
)

In [10]:
val_logits_list = []
val_labels_list = []
for x_batch, y_batch in val_loader:
    val_logits_list.append(model(x_batch))
    val_labels_list.append(y_batch)
val_logits = torch.cat(val_logits_list)
val_labels = torch.cat(val_labels_list)
val_logits.detach_()
probs = f.softmax(val_logits, dim=-1)

In [11]:
compute_errors(n_bins=15, probs=probs.numpy(), labels=val_labels.numpy(),
               len_dataset=np.shape(probs)[0], threshold=0.9)

ece  =  tensor([0.0169])
sce  =  tensor([0.0022])
ace  =  tensor(0.0205)
tace  =  tensor(0.0111)


In [12]:
print(calibr.temperature)

Parameter containing:
tensor([0.6644], requires_grad=True)


In [13]:
temp_scaling_probs_list = []
for x_batch, y_batch in val_loader:
    temp_scaling_probs_list.append(calibr.forward(x_batch))
temp_scaling_probs = torch.cat(temp_scaling_probs_list)
compute_errors(n_bins=15, probs=temp_scaling_probs.detach().numpy(), labels=val_labels.numpy(),
               len_dataset=np.shape(probs)[0], threshold=0.9)

ece  =  tensor([0.0066])
sce  =  tensor([0.0012])
ace  =  tensor(0.0144)
tace  =  tensor(0.0059)


### Vector Scaling

In [14]:
calibr = calibrator.ModelWithVectScaling(model, n_classes=10).float()

In [15]:
calibr.scaling(logits, labels, lr=0.001, max_iter=300)

ModelWithVectScaling(
  (model): Net(
    (cnn_layers): Sequential(
      (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (5): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU(inplace=True)
      (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (linear_layers): Sequential(
      (0): Linear(in_features=196, out_features=10, bias=True)
    )
  )
)

In [16]:
vect_scaling_probs_list = []
for x_batch, y_batch in val_loader:
    vect_scaling_probs_list.append(calibr.forward(x_batch))
vect_scaling_probs = torch.cat(vect_scaling_probs_list)

In [17]:
compute_errors(n_bins=15, probs=vect_scaling_probs.detach().numpy(), labels=val_labels.numpy(),
               len_dataset=np.shape(probs)[0], threshold=0.9)

ece  =  tensor([0.0049])
sce  =  tensor([0.0013])
ace  =  tensor(0.0141)
tace  =  tensor(0.0058)


In [18]:
calibr.W_and_b

Parameter containing:
tensor([ 1.1088,  1.1993,  1.0772,  1.1848,  1.2557,  1.1746,  1.2312,  1.3807,
         1.1934,  1.3136, -0.0117,  0.0208, -0.0524, -0.0219, -0.0316, -0.0127,
         0.0147,  0.0682, -0.0066,  0.0332], requires_grad=True)

### Matrix Scaling

In [19]:
calibr = calibrator.ModelWithMatrScaling(model, n_classes=10).float()
calibr.scaling(logits, labels, lr=0.0001, max_iter=1000)

ModelWithMatrScaling(
  (model): Net(
    (cnn_layers): Sequential(
      (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (4): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (5): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU(inplace=True)
      (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (linear_layers): Sequential(
      (0): Linear(in_features=196, out_features=10, bias=True)
    )
  )
)

In [20]:
matr_scaling_probs_list = []
for x_batch, y_batch in val_loader:
    matr_scaling_probs_list.append(calibr.forward(x_batch))
matr_scaling_probs = torch.cat(matr_scaling_probs_list)

In [21]:
compute_errors(n_bins=15, probs=matr_scaling_probs.detach().numpy(), labels=val_labels.numpy(),
               len_dataset=np.shape(probs)[0], threshold=0.9)

ece  =  tensor([0.0048])
sce  =  tensor([0.0013])
ace  =  tensor(0.0138)
tace  =  tensor(0.0056)


In [22]:
hist_binning_probs = calibrator.multiclass_histogram_binning(15, logits.numpy(), labels.numpy(), val_logits)

In [23]:
compute_errors(n_bins=15, probs=hist_binning_probs, labels=val_labels.numpy(),
               len_dataset=np.shape(probs)[0], threshold=0.9)

ece  =  tensor([0.0074])
sce  =  tensor([0.0015])
ace  =  tensor(0.0150, dtype=torch.float64)
tace  =  tensor(0.0109, dtype=torch.float64)
