In [1]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp  # required by the tfp.stats.expected_calibration_error cells

from src.datasets import load_dataset, preprocess_dataset, prefetch_dataset
from src.utils import *

2024-05-22 15:11:14.888661: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-22 15:11:14.888721: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-22 15:11:14.888747: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1537] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-22 15:11:14.896175: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as 

In [25]:
# Experiment configuration; passed to load_dataset / preprocess_dataset and
# used for the prediction batch size in the evaluation cells.
cfg = dict(
    dataset='stl10',
    model='pretrained_inception',
    batch_size=512,
    optimizer='Adam',
    learning_rate=0.001,
    max_epoch=300,
    patience=10,
)

# Short aliases interpolated into the result paths of each run.
dataset_name = cfg['dataset']
model_name = cfg['model']

### Softmax

In [26]:
# Calibration metrics (ECE, NLL, Brier score, classification error) of the
# trained model's own output scores on the test split, over the 5 saved runs.
ds_train, ds_val, ds_test, ds_info = load_dataset(cfg, shuffle=False)
n_classes = ds_info.features['label'].num_classes
ds_test = preprocess_dataset(ds_test, cfg, n_classes, resize=True, normalize=True, onehot=True)

# The labels do not depend on the run, so decode them once instead of
# re-iterating the whole test set inside every loop iteration.
# Assumes the (unshuffled) test pipeline yields the same order on every pass — TODO confirm.
true_y = np.argmax([y for x, y in ds_test], axis=1)

ece_list = []
nll_list = []
bs_list = []
class_error_list = []

for run in range(5):
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    print(f'Run: {run+1}')
    model = tf.keras.models.load_model(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/trained_model.keras')
    preds = model.predict(ds_test.batch(cfg['batch_size']), verbose=0)
    pred_y = np.argmax(preds, axis=1)
    # Confidence of each prediction = the largest output score of that sample.
    softmax_val = np.max(preds, axis=1)

    ece = compute_ece(softmax_val, true_y, pred_y, n_bins=10)
    ece_list.append(ece)
    nll = compute_nll(preds, true_y, n_classes)
    nll_list.append(nll)
    bs = compute_brier_score(preds, true_y, n_classes)
    bs_list.append(bs)
    class_error = compute_classification_error(preds, true_y)
    class_error_list.append(class_error)

    print(f'ECE: {ece:.3f}')
    print(f'NLL: {nll:.3f}')
    print(f'Brier Score: {bs:.3f}')
    print(f'Classification Error: {class_error*100:.3f}')

# Mean and standard deviation of every metric across the runs.
print('---------------------')
print(f'Average ECE: {np.mean(ece_list):.2f}, std: {np.std(ece_list):.2f}')
print(f'Average NLL: {np.mean(nll_list):.3f}, std: {np.std(nll_list):.3f}')
print(f'Average Brier Score: {np.mean(bs_list):.3f}, std: {np.std(bs_list):.3f}')
print(f'Average Classification Error: {np.mean(class_error_list)*100:.2f}, std: {np.std(class_error_list)*100:.2f}')

Run: 1
ECE: 1.064
NLL: 0.155
Brier Score: 0.070
Classification Error: 4.563
Run: 2
ECE: 1.100
NLL: 0.162
Brier Score: 0.071
Classification Error: 4.600
Run: 3
ECE: 1.220
NLL: 0.160
Brier Score: 0.071
Classification Error: 4.763
Run: 4
ECE: 1.558
NLL: 0.162
Brier Score: 0.072
Classification Error: 4.825
Run: 5
ECE: 2.146
NLL: 0.179
Brier Score: 0.074
Classification Error: 4.812
---------------------
Average ECE: 1.42, std: 0.40
Average NLL: 0.164, std: 0.008
Average Brier Score: 0.072, std: 0.001
Average Classification Error: 4.71, std: 0.11


In [None]:
# ECE of the trained model computed with TensorFlow Probability from the
# model's pre-activation outputs, over the 5 saved runs.
ds_train, ds_val, ds_test, ds_info = load_dataset(cfg, shuffle=False)
n_classes = ds_info.features['label'].num_classes
# The images are fed to the model in this cell, so they are preprocessed with
# resize=True, matching the softmax metrics cell that evaluates the same saved models.
ds_test = preprocess_dataset(ds_test, cfg, n_classes, resize=True, normalize=True, onehot=True)

num_bins = 10

# Labels are identical for every run: decode and convert them once.
# Assumes the (unshuffled) test pipeline yields the same order on every pass — TODO confirm.
true_y = np.argmax([y for x, y in ds_test], axis=1)
labels_true = tf.convert_to_tensor(true_y, dtype=tf.int32, name='labels_true')

ece_list = []

for run in range(5):
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    print(f'Run: {run+1}')
    model = tf.keras.models.load_model(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/trained_model.keras')
    # Clear the last layer's activation and rebuild a model ending at that layer
    # so that predictions are logits.
    # NOTE(review): assumes the final layer applies its non-linearity through the
    # `activation` attribute (e.g. a Dense layer with softmax) — confirm.
    logits_layer = model.layers[-1]
    logits_layer.activation = None
    logits_model = tf.keras.models.Model(inputs=model.input, outputs=logits_layer.output)
    preds = logits_model.predict(ds_test.batch(cfg['batch_size']), verbose=0)
    logits = tf.convert_to_tensor(preds, dtype=tf.float32, name='logits')

    ece = tfp.stats.expected_calibration_error(num_bins=num_bins,
                                               logits=logits,
                                               labels_true=labels_true)
    ece_pct = ece.numpy()*100  # reported as a percentage
    ece_list.append(ece_pct)
    print(f'ECE: {ece_pct:.3f}')

print('---------------------')
print(f'Average ECE: {np.mean(ece_list):.2f}, std: {np.std(ece_list):.2f}')

### PMI

In [27]:
# Calibration metrics of the stored per-class PMI scores (softmax-normalised)
# on the test split, over the 5 saved runs.
critic = 'separable'
estimator = 'density_ratio_fitting'

ds_train, ds_val, ds_test, ds_info = load_dataset(cfg, shuffle=False)
n_classes = ds_info.features['label'].num_classes
ds_test = preprocess_dataset(ds_test, cfg, n_classes, resize=True, normalize=True, onehot=True)

# The labels do not depend on the run, so decode them once instead of
# re-iterating the whole test set inside every loop iteration.
# Assumes the (unshuffled) test pipeline yields the same order on every pass — TODO confirm.
true_y = np.argmax([y for x, y in ds_test], axis=1)

ece_list = []
nll_list = []
bs_list = []
class_error_list = []

for run in range(5):
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    print(f'Run: {run+1}')
    model = tf.keras.models.load_model(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/trained_model.keras')
    preds = model.predict(ds_test.batch(cfg['batch_size']), verbose=0)
    pred_y = np.argmax(preds, axis=1)

    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pmi/{critic}_{estimator}'
    pmi_class = np.load(f'{exp_name}/pmi_class_test.npy')
    # Normalise each sample's class scores with `softmax`
    # (not defined in this notebook; presumably provided by `from src.utils import *`).
    pmi_class = np.array([softmax(x) for x in pmi_class])
    # Confidence is read at the class predicted by the model, not at the PMI arg-max.
    pmi = np.array([pmi_value[pred_value] for pmi_value, pred_value in zip(pmi_class, pred_y)])

    ece = compute_ece(pmi, true_y, pred_y, n_bins=10)
    ece_list.append(ece)
    nll = compute_nll(pmi_class, true_y, n_classes)
    nll_list.append(nll)
    bs = compute_brier_score(pmi_class, true_y, n_classes)
    bs_list.append(bs)
    class_error = compute_classification_error(pmi_class, true_y)
    class_error_list.append(class_error)

    print(f'ECE: {ece:.3f}')
    print(f'NLL: {nll:.3f}')
    print(f'Brier Score: {bs:.3f}')
    print(f'Classification Error: {class_error*100:.3f}')

# Mean and standard deviation of every metric across the runs.
print('---------------------')
print(f'Average ECE: {np.mean(ece_list):.2f}, std: {np.std(ece_list):.2f}')
print(f'Average NLL: {np.mean(nll_list):.3f}, std: {np.std(nll_list):.3f}')
print(f'Average Brier Score: {np.mean(bs_list):.3f}, std: {np.std(bs_list):.3f}')
print(f'Average Classification Error: {np.mean(class_error_list)*100:.2f}, std: {np.std(class_error_list)*100:.2f}')

Run: 1
ECE: 2.039
NLL: 0.225
Brier Score: 0.071
Classification Error: 4.675
Run: 2
ECE: 2.443
NLL: 0.244
Brier Score: 0.074
Classification Error: 4.812
Run: 3
ECE: 2.526
NLL: 0.229
Brier Score: 0.072
Classification Error: 4.837
Run: 4
ECE: 1.368
NLL: 0.232
Brier Score: 0.073
Classification Error: 4.950
Run: 5
ECE: 1.043
NLL: 0.222
Brier Score: 0.071
Classification Error: 4.725
---------------------
Average ECE: 1.88, std: 0.59
Average NLL: 0.230, std: 0.008
Average Brier Score: 0.072, std: 0.001
Average Classification Error: 4.80, std: 0.10


In [None]:
# ECE of the stored per-class PMI scores, computed with TensorFlow Probability
# by passing the raw scores as logits, over the 5 saved runs.
critic = 'separable'
estimator = 'density_ratio_fitting'

ds_train, ds_val, ds_test, ds_info = load_dataset(cfg, shuffle=False)
n_classes = ds_info.features['label'].num_classes
# resize=False: only the labels are read from ds_test in this cell.
ds_test = preprocess_dataset(ds_test, cfg, n_classes, resize=False, normalize=True, onehot=True)

# Labels are identical for every run: decode and convert them once.
# Assumes the (unshuffled) test pipeline yields the same order on every pass — TODO confirm.
true_y = np.argmax([y for x, y in ds_test], axis=1)
labels_true = tf.convert_to_tensor(true_y, dtype=tf.int32, name='labels_true')

ece_list = []

for run in range(5):
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    print(f'Run: {run+1}')
    # The trained Keras model is not needed here: the scores come from disk.
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pmi/{critic}_{estimator}'
    pmi_class = np.load(f'{exp_name}/pmi_class_test.npy')
    logits = tf.convert_to_tensor(pmi_class, dtype=tf.float32, name='logits')

    ece = tfp.stats.expected_calibration_error(num_bins=10,
                                               logits=logits,
                                               labels_true=labels_true)
    ece_pct = ece.numpy()*100  # reported as a percentage
    ece_list.append(ece_pct)
    print(f'ECE: {ece_pct:.3f}')

print('---------------------')
print(f'Average ECE: {np.mean(ece_list):.2f}, std: {np.std(ece_list):.2f}')

Run: 1
ECE: 1.543
Run: 2


### PVI

In [28]:
# Calibration metrics of the stored per-class calibrated PVI scores
# (softmax-normalised) on the test split, over the 5 saved runs.
estimator = 'training_from_scratch'

ds_train, ds_val, ds_test, ds_info = load_dataset(cfg, shuffle=False)
n_classes = ds_info.features['label'].num_classes
ds_test = preprocess_dataset(ds_test, cfg, n_classes, resize=True, normalize=True, onehot=True)

# The labels do not depend on the run, so decode them once instead of
# re-iterating the whole test set inside every loop iteration.
# Assumes the (unshuffled) test pipeline yields the same order on every pass — TODO confirm.
true_y = np.argmax([y for x, y in ds_test], axis=1)

ece_list = []
nll_list = []
bs_list = []
class_error_list = []

for run in range(5):
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    print(f'Run: {run+1}')
    model = tf.keras.models.load_model(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/trained_model.keras')
    preds = model.predict(ds_test.batch(cfg['batch_size']), verbose=0)
    pred_y = np.argmax(preds, axis=1)

    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/{estimator}'
    pvi_class = np.load(f'{exp_name}/pvi_calibrated_class_test.npy')
    # Normalise each sample's class scores with `softmax`
    # (not defined in this notebook; presumably provided by `from src.utils import *`).
    pvi_class = np.array([softmax(x) for x in pvi_class])
    # Confidence is read at the class predicted by the model, not at the PVI arg-max.
    pvi = np.array([pvi_value[pred_value] for pvi_value, pred_value in zip(pvi_class, pred_y)])

    ece = compute_ece(pvi, true_y, pred_y, n_bins=10)
    ece_list.append(ece)
    nll = compute_nll(pvi_class, true_y, n_classes)
    nll_list.append(nll)
    bs = compute_brier_score(pvi_class, true_y, n_classes)
    bs_list.append(bs)
    class_error = compute_classification_error(pvi_class, true_y)
    class_error_list.append(class_error)

    print(f'ECE: {ece:.3f}')
    print(f'NLL: {nll:.3f}')
    print(f'Brier Score: {bs:.3f}')
    print(f'Classification Error: {class_error*100:.3f}')

# Mean and standard deviation of every metric across the runs.
print('---------------------')
print(f'Average ECE: {np.mean(ece_list):.2f}, std: {np.std(ece_list):.2f}')
print(f'Average NLL: {np.mean(nll_list):.3f}, std: {np.std(nll_list):.3f}')
print(f'Average Brier Score: {np.mean(bs_list):.3f}, std: {np.std(bs_list):.3f}')
print(f'Average Classification Error: {np.mean(class_error_list)*100:.2f}, std: {np.std(class_error_list)*100:.2f}')

Run: 1
ECE: 2.081
NLL: 0.177
Brier Score: 0.074
Classification Error: 4.688
Run: 2
ECE: 2.380
NLL: 0.179
Brier Score: 0.075
Classification Error: 4.837
Run: 3
ECE: 2.112
NLL: 0.178
Brier Score: 0.074
Classification Error: 4.800
Run: 4
ECE: 2.208
NLL: 0.175
Brier Score: 0.073
Classification Error: 4.700
Run: 5
ECE: 2.091
NLL: 0.184
Brier Score: 0.074
Classification Error: 4.763
---------------------
Average ECE: 2.17, std: 0.11
Average NLL: 0.179, std: 0.003
Average Brier Score: 0.074, std: 0.001
Average Classification Error: 4.76, std: 0.06


In [79]:
# ECE of the stored per-class calibrated PVI scores, computed with TensorFlow
# Probability by passing the raw scores as logits, over the 5 saved runs.
estimator = 'training_from_scratch'

ds_train, ds_val, ds_test, ds_info = load_dataset(cfg, shuffle=False)
n_classes = ds_info.features['label'].num_classes
# resize=False: only the labels are read from ds_test in this cell.
ds_test = preprocess_dataset(ds_test, cfg, n_classes, resize=False, normalize=True, onehot=True)

# Labels are identical for every run: decode and convert them once.
# Assumes the (unshuffled) test pipeline yields the same order on every pass — TODO confirm.
true_y = np.argmax([y for x, y in ds_test], axis=1)
labels_true = tf.convert_to_tensor(true_y, dtype=tf.int32, name='labels_true')

ece_list = []

for run in range(5):
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    print(f'Run: {run+1}')
    # The trained Keras model is not needed here: the scores come from disk.
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/pvi/{estimator}'
    pvi_class = np.load(f'{exp_name}/pvi_calibrated_class_test.npy')
    logits = tf.convert_to_tensor(pvi_class, dtype=tf.float32, name='logits')

    ece = tfp.stats.expected_calibration_error(num_bins=10,
                                               logits=logits,
                                               labels_true=labels_true)
    ece_pct = ece.numpy()*100  # reported as a percentage
    ece_list.append(ece_pct)
    print(f'ECE: {ece_pct:.3f}')

print('---------------------')
print(f'Average ECE: {np.mean(ece_list):.2f}, std: {np.std(ece_list):.2f}')

Run: 1
ECE: 3.110
Run: 2
ECE: 2.775
Run: 3
ECE: 2.387
Run: 4
ECE: 3.267
Run: 5
ECE: 3.007
---------------------
Average ECE: 2.91, std: 0.31


### PSI

In [29]:
# Calibration metrics of the stored per-class PSI scores (softmax-normalised)
# on the test split, over the 5 saved runs.
estimator = 'gaussian'

ds_train, ds_val, ds_test, ds_info = load_dataset(cfg, shuffle=False)
n_classes = ds_info.features['label'].num_classes
ds_test = preprocess_dataset(ds_test, cfg, n_classes, resize=True, normalize=True, onehot=True)

# The labels do not depend on the run, so decode them once instead of
# re-iterating the whole test set inside every loop iteration.
# Assumes the (unshuffled) test pipeline yields the same order on every pass — TODO confirm.
true_y = np.argmax([y for x, y in ds_test], axis=1)

ece_list = []
nll_list = []
bs_list = []
class_error_list = []

for run in range(5):
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    print(f'Run: {run+1}')
    model = tf.keras.models.load_model(f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/trained_model.keras')
    preds = model.predict(ds_test.batch(cfg['batch_size']), verbose=0)
    pred_y = np.argmax(preds, axis=1)

    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/psi/{estimator}'
    psi_class = np.load(f'{exp_name}/psi_class_test.npy')
    # Normalise each sample's class scores with `softmax`
    # (not defined in this notebook; presumably provided by `from src.utils import *`).
    psi_class = np.array([softmax(x) for x in psi_class])
    # Confidence is read at the class predicted by the model, not at the PSI
    # arg-max (i.e. not np.max(psi_class, axis=1)).
    psi = np.array([psi_value[pred_value] for psi_value, pred_value in zip(psi_class, pred_y)])

    ece = compute_ece(psi, true_y, pred_y, n_bins=10)
    ece_list.append(ece)
    nll = compute_nll(psi_class, true_y, n_classes)
    nll_list.append(nll)
    bs = compute_brier_score(psi_class, true_y, n_classes)
    bs_list.append(bs)
    class_error = compute_classification_error(psi_class, true_y)
    class_error_list.append(class_error)

    print(f'ECE: {ece:.3f}')
    print(f'NLL: {nll:.3f}')
    print(f'Brier Score: {bs:.3f}')
    print(f'Classification Error: {class_error*100:.3f}')

# Mean and standard deviation of every metric across the runs.
print('---------------------')
print(f'Average ECE: {np.mean(ece_list):.2f}, std: {np.std(ece_list):.2f}')
print(f'Average NLL: {np.mean(nll_list):.3f}, std: {np.std(nll_list):.3f}')
print(f'Average Brier Score: {np.mean(bs_list):.3f}, std: {np.std(bs_list):.3f}')
print(f'Average Classification Error: {np.mean(class_error_list)*100:.2f}, std: {np.std(class_error_list)*100:.2f}')

Run: 1
ECE: 16.223
NLL: 0.313
Brier Score: 0.116
Classification Error: 5.250
Run: 2
ECE: 14.421
NLL: 0.288
Brier Score: 0.108
Classification Error: 5.112
Run: 3
ECE: 13.635
NLL: 0.283
Brier Score: 0.106
Classification Error: 5.100
Run: 4
ECE: 11.563
NLL: 0.260
Brier Score: 0.098
Classification Error: 5.112
Run: 5
ECE: 12.316
NLL: 0.268
Brier Score: 0.099
Classification Error: 4.975
---------------------
Average ECE: 13.63, std: 1.63
Average NLL: 0.282, std: 0.019
Average Brier Score: 0.105, std: 0.006
Average Classification Error: 5.11, std: 0.09


In [81]:
# ECE of the stored per-class PSI scores, computed with TensorFlow Probability
# by passing the raw scores as logits, over the 5 saved runs.
estimator = 'gaussian'

ds_train, ds_val, ds_test, ds_info = load_dataset(cfg, shuffle=False)
n_classes = ds_info.features['label'].num_classes
# resize=False: only the labels are read from ds_test in this cell.
ds_test = preprocess_dataset(ds_test, cfg, n_classes, resize=False, normalize=True, onehot=True)

# Labels are identical for every run: decode and convert them once.
# Assumes the (unshuffled) test pipeline yields the same order on every pass — TODO confirm.
true_y = np.argmax([y for x, y in ds_test], axis=1)
labels_true = tf.convert_to_tensor(true_y, dtype=tf.int32, name='labels_true')

ece_list = []

for run in range(5):
    tf.keras.utils.set_random_seed(run+10) # set random seed for Python, NumPy, and TensorFlow
    print(f'Run: {run+1}')
    # The trained Keras model is not needed here: the scores come from disk.
    exp_name = f'../results/PI_Explainability/{model_name}_{dataset_name}/run_{run+1}/calibration/psi/{estimator}'
    psi_class = np.load(f'{exp_name}/psi_class_test.npy')
    logits = tf.convert_to_tensor(psi_class, dtype=tf.float32, name='logits')

    ece = tfp.stats.expected_calibration_error(num_bins=10,
                                               logits=logits,
                                               labels_true=labels_true)
    ece_pct = ece.numpy()*100  # reported as a percentage
    ece_list.append(ece_pct)
    print(f'ECE: {ece_pct:.3f}')

print('---------------------')
print(f'Average ECE: {np.mean(ece_list):.2f}, std: {np.std(ece_list):.2f}')

Run: 1
ECE: 28.690
Run: 2
ECE: 30.469
Run: 3
ECE: 28.575
Run: 4
ECE: 27.214
Run: 5
ECE: 29.106
---------------------
Average ECE: 28.81, std: 1.04
