In [15]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import numpy as np
import torch
import sys, os
import pickle
from torch import nn
import utils.trainutils as tutils
import utils.datautils as dutils
import utils.uqutils as uqutils
import models
from tqdm import tqdm_notebook
import torch.nn.functional as F
from scipy.special import comb
from torch.nn import CrossEntropyLoss as CE

# SETUP GPU
# Let cuDNN benchmark convolution algorithms and pick the fastest one.
torch.backends.cudnn.benchmark = True
# Use the first CUDA device when one is available; otherwise fall back to the
# CPU so that later `.to(device)` calls do not fail on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Root directory handed to `dutils.return_loaders` as the dataset location.
base = "/home/data/"

def res34(num_class):
    """Build a torchvision ResNet-34 whose final layer emits ``num_class`` outputs.

    Args:
        num_class: Number of output units for the replacement ``fc`` layer.

    Returns:
        The ``torchvision.models.resnet34()`` model (constructed with its
        default arguments) with ``model.fc`` swapped for a fresh ``nn.Linear``.
    """
    # The notebook's import cell never imports torchvision, so without this
    # function-scope import any call to res34 raises NameError.
    import torchvision

    model = torchvision.models.resnet34()
    # Take the head's input width from the stock classifier instead of
    # hard-coding 512, so the two can never drift apart.
    model.fc = nn.Linear(model.fc.in_features, num_class)
    return model

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Dataloader and inference

In the following cell, we define our dataloaders: `start` denotes **ntrain** (the number of training samples) and `end` denotes **nvalid** (the number of validation samples). To allow for various possible choices of validation set, keep `end` much higher than **nvalid**. Make sure that `train_shuffle` and `valid_shuffle` are both `False` so that inference works with Deep Ensembles.

Then define `model_file_pattern` to be the file-name pattern of the models in the trained Deep Ensemble, and instantiate the `model`.

Finally, use the `infer_ensemble` method from `trainutils` to infer on the *train, test, validation* datasets. We use the train and validation datasets without augmentations.

In [4]:
# Build the CIFAR10 loaders with shuffling disabled for the train and valid splits.
loader_dict, num_class = dutils.return_loaders(
    base=base,
    dataset='CIFAR10',
    start=1000,
    end=1500,
    train_shuffle=False,
    valid_shuffle=False,
)
np.random.seed(1)

# Glob-style pattern naming the saved ensemble members, and the network that
# is passed to `infer_ensemble` together with it.
model_file_pattern = 'CIFAR10_ntrain-1000_MixUpAlpha-0.5_id-*.model'
model = models.FastResNet().to(device)

# Keyword arguments common to the three inference passes below.
ensemble_kwargs = dict(model_file_pattern=model_file_pattern, model=model, evalmode=False)

# Run the ensemble over test, then un-augmented train, then un-augmented valid.
# `model_files` is rebound by every call; the value left afterwards is the one
# returned by the validation pass.
test_probs, targets, model_files = tutils.infer_ensemble(
    dataloader=loader_dict['test'], **ensemble_kwargs)
train_probs, train_targets, model_files = tutils.infer_ensemble(
    dataloader=loader_dict['no-augment_train'], **ensemble_kwargs)
valid_probs, valid_targets, model_files = tutils.infer_ensemble(
    dataloader=loader_dict['no-augment_valid'], **ensemble_kwargs)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


  0%|          | 0/5 [00:00<?, ?it/s]

Number of files found: 5


100%|██████████| 5/5 [00:04<00:00,  1.20it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

Number of files found: 5


100%|██████████| 5/5 [00:01<00:00,  2.74it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

Number of files found: 5


100%|██████████| 5/5 [00:01<00:00,  2.65it/s]


Next, we show uncertainty quantification metrics for the individual models and for the **pooled** Deep Ensemble model. In our chosen setup, the pooled model has a higher *Expected Calibration Error* (ECE) than the individual models. On closer inspection, one finds that the pooled model is more under-confident than the individual models.

In [14]:
# Per-member scores: one report line for each model found by the ensemble search.
num_models = len(model_files)
for i in range(num_models):
    ret = uqutils.get_all_scores(test_probs[i], targets)
    print(f'Model: {i+1} => Accuracy: {100*ret[0]:.1f}%, ECE: {100*ret[1]:.2f}%, NLL: {ret[2]:.4f}, Brier: {ret[3]:.4f}')

print('\n')
# Pooled ensemble: average the members' outputs over the model axis (axis 0)
# and score the averaged prediction against the same targets.
pooled_probs = np.mean(test_probs, axis=0)
ret = uqutils.get_all_scores(pooled_probs, targets)
print(f'Pooled model => Accuracy: {100*ret[0]:.1f}%, ECE: {100*ret[1]:.2f}%, NLL: {ret[2]:.4f}, Brier: {ret[3]:.4f}')

Model: 1 => Accuracy: 65.2%, ECE: 3.83%, NLL: 1.0502, Brier: 0.4702
Model: 2 => Accuracy: 65.6%, ECE: 4.58%, NLL: 1.0528, Brier: 0.4697
Model: 3 => Accuracy: 65.5%, ECE: 4.94%, NLL: 1.0579, Brier: 0.4705
Model: 4 => Accuracy: 65.6%, ECE: 4.70%, NLL: 1.0533, Brier: 0.4723
Model: 5 => Accuracy: 65.4%, ECE: 4.46%, NLL: 1.0519, Brier: 0.4724


Pooled model => Accuracy: 68.8%, ECE: 10.44%, NLL: 0.9779, Brier: 0.4394


In [None]:
def find_temperature(logits, targets):
    logits = torch.tensor(logits)
    targets = torch.tensor(targets)
    temps = np.exp(np.linspace(-3, 3, 50))
    losses = [CE(logits, targets) for ]