In [None]:
import sys
sys.path.append('..')
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from scipy.special import softmax as softmax
import matplotlib.pyplot as plt

from dataloader.builder import build_dataset
from experiment_setup import build_estimator
from uncertainty_estimator.masks import build_masks, DEFAULT_MASKS
from analysis.metrics import uq_ndcg, ndcg, dcg

from model.cnn import SimpleConv
from model.trainer import Trainer, EnsembleTrainer

In [None]:
# Experiment settings shared by the cells below.
config = {
    'use_cuda': True,
    'batch_size': 256,
    'epochs': 5,            # passed as `epochs=` to every Trainer / EnsembleTrainer fit
    'lr': 1e-2,
    'momentum': 0.5,
    'seed': 1,              # applied to NumPy's global RNG right below
    'log_interval': 10,
    'nn_runs': 100,         # forwarded to build_estimator / build_masks as `nn_runs`
    'dropout_uq': 0.5,      # forwarded to build_estimator as `dropout_rate`
    'train_samples': 500,   # size of the training subset used for the base model

    'n_models': 10          # third argument of EnsembleTrainer
}

# The notebook draws random subsets with np.random.choice; without seeding, every
# "Restart & Run All" would evaluate on different samples.
# NOTE(review): randomness inside the project's Trainer / SimpleConv (e.g. weight
# initialisation) is not controlled by this call — confirm whether they seed themselves.
np.random.seed(config['seed'])



#### Load data and preprocess

In [None]:
# MNIST provides the train and validation splits (10,000 samples held out for validation).
mnist = build_dataset('mnist', val_size=10_000)
(x_train, y_train), (x_val, y_val) = (
    mnist.dataset(split) for split in ('train', 'val')
)

# Fashion-MNIST images are used below under the name 'ood'; its labels are discarded.
ood = build_dataset('fashion_mnist', val_size=0)
x_ood, _ = ood.dataset('train')

In [None]:
# Reshape the images to (N, 1, 28, 28) and divide pixel values by 255.
# The division is done out of place on purpose: `reshape` normally returns a view,
# so an in-place `/=` would also rescale the arrays still held by the dataset
# objects, and re-running the load + preprocess cells could scale the data twice.
x_train = x_train.reshape(-1, 1, 28, 28) / 255.0
x_val = x_val.reshape(-1, 1, 28, 28) / 255.0
x_ood = x_ood.reshape(-1, 1, 28, 28) / 255.0

# Labels become flat integer vectors so they can be used as class indices.
y_train = y_train.astype('long').reshape(-1)
y_val = y_val.astype('long').reshape(-1)


#### Train model

In [None]:
# Label histogram of the training subset: {class label: number of samples}.
n_train = config['train_samples']
unique, counts = np.unique(y_train[:n_train], return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [None]:
# Fit a single SimpleConv on the first `train_samples` training points, then
# display its accuracy on the full validation split.
n_train = config['train_samples']

model = SimpleConv()
trainer = Trainer(model)
trainer.fit(x_train[:n_train], y_train[:n_train], epochs=config['epochs'])

val_predictions = trainer.predict(x_val)
accuracy_score(y_val, val_predictions)


### BALD

##### UQ by different masks

In [None]:
# Build the dropout masks to compare. The cells below iterate over `masks.items()`
# as (name, mask) pairs and hand each mask to `build_estimator` as `dropout_mask`.
# NOTE(review): presumably DEFAULT_MASKS selects which mask types get built —
# confirm in uncertainty_estimator.masks.
masks = build_masks(DEFAULT_MASKS)


In [None]:
# Score the first `estimation_samples` points of the train, val and OOD sets with
# a masked BALD estimator for every mask, collecting the results in long format:
#   uqs       - uncertainty score per sample
#   datasets  - which dataset the sample came from ('train' / 'val' / 'ood')
#   mask_type - name of the mask that produced the score
estimation_samples = config['train_samples']
uq_chunks, dataset_chunks, mask_chunks = [], [], []

for mask_name, mask in masks.items():
    print(mask_name)
    estimator = build_estimator(
        'bald_masked', trainer, nn_runs=config['nn_runs'], dropout_mask=mask,
        dropout_rate=config['dropout_uq'], num_classes=10)

    for data_name, x_current in (('train', x_train), ('val', x_val), ('ood', x_ood)):
        # float64 matches what the previous concatenate-onto-an-empty-list produced.
        uq = np.asarray(estimator.estimate(x_current[:estimation_samples]), dtype=float)
        uq_chunks.append(uq)
        # Label by the number of scores actually returned, so the three columns
        # stay aligned even when a dataset has fewer than `estimation_samples` rows.
        dataset_chunks.append(np.full(len(uq), data_name))
        mask_chunks.append(np.full(len(uq), mask_name))

# Concatenate once at the end instead of re-allocating the arrays on every iteration.
if uq_chunks:
    uqs = np.concatenate(uq_chunks)
    datasets = np.concatenate(dataset_chunks)
    mask_type = np.concatenate(mask_chunks)
else:
    uqs, datasets, mask_type = [], [], []


In [None]:
fig, ax = plt.subplots(figsize=(16, 9))

# One row per scored sample; boxes are grouped by mask and coloured by dataset.
df = pd.DataFrame(dict(uq=uqs, dataset=datasets, mask_type=mask_type))
sns.boxplot(data=df, x='mask_type', y='uq', hue='dataset', ax=ax)



In [None]:
# The pool is the first `pool_size` validation samples together with their labels.
pool_size = 300
x_pool = x_val[:pool_size]
y_pool = y_val[:pool_size]

def ll(trainer, x, y):
    """Return the log-likelihood the model assigns to each sample's given label.

    Args:
        trainer: object with an ``eval()`` method that is callable on ``x``; the
            call result must support ``.detach().cpu().numpy()`` and convert to a
            2-D array of logits with one row per sample.
        x: batch of inputs; ``len(x)`` is the number of samples.
        y: integer class index for each sample, used to pick one column per row.

    Returns:
        1-D numpy array with ``log softmax(logits)[i, y[i]]`` for every sample.
    """
    trainer.eval()
    logits = trainer(x).detach().cpu().numpy()

    # Compute log-softmax directly in log space. Taking log(softmax(logits))
    # underflows to log(0) = -inf when the model is very confident in another
    # class, which would propagate inf/nan into downstream ranking metrics.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    return log_probs[np.arange(len(x)), y]

# Per-sample log-likelihood of the pool labels under the base model; used as the
# y-axis of the scatter plots against each estimator's uncertainty.
pool_ll = ll(trainer, x_pool, y_pool)
    

In [None]:
# Overlay, for every mask, the pool's uncertainty estimates against its log-likelihood.
fig, ax = plt.subplots(figsize=(16, 18))
for mask_name, mask in masks.items():
    estimator = build_estimator(
        'bald_masked',
        trainer,
        nn_runs=config['nn_runs'],
        dropout_mask=mask,
        dropout_rate=config['dropout_uq'],
        num_classes=10,
    )
    estimations = estimator.estimate(x_pool)
    ax.scatter(estimations, pool_ll, label=mask_name, alpha=0.5)
    ax.set_xlabel('Uncertainty estimation')
    ax.set_ylabel('Log likelihood')
ax.legend(loc='lower right')
    

In [None]:
# Same comparison as the overlay plot, but with one panel per mask in a two-column grid.
masks = build_masks(nn_runs=config['nn_runs'])
fig = plt.figure(figsize=(10, 25))
n_rows = (len(masks) + 1) // 2  # two panels per row, rounded up
for panel, (name, mask) in enumerate(masks.items(), start=1):
    ax = fig.add_subplot(n_rows, 2, panel)
    estimator = build_estimator(
        'bald_masked',
        trainer,
        nn_runs=config['nn_runs'],
        dropout_mask=mask,
        dropout_rate=config['dropout_uq'],
        num_classes=10,
    )
    estimations = estimator.estimate(x_pool)
    ax.scatter(estimations, pool_ll, alpha=0.5)
    ax.set_xlabel('Uncertainty estimation')
    ax.set_ylabel('Log likelihood')
    ax.set_title(name)
    
 
 

In [None]:
def retrain(train_samples):
    """Train a fresh single model and a fresh ensemble on one random training subset.

    Draws ``train_samples`` distinct indices from the notebook-level ``x_train`` /
    ``y_train`` and fits both learners on that subset for ``config['epochs']`` epochs.

    Args:
        train_samples: number of training points to sample without replacement.

    Returns:
        Tuple ``(trainer, ensemble)``: the fitted ``Trainer`` wrapping a new
        ``SimpleConv`` and the fitted ``EnsembleTrainer`` built with
        ``config['n_models']``.
    """
    subset = np.random.choice(len(x_train), train_samples, replace=False)
    x_subset, y_subset = x_train[subset], y_train[subset]
    n_epochs = config['epochs']

    single = Trainer(SimpleConv())
    single.fit(x_subset, y_subset, epochs=n_epochs)

    ensemble = EnsembleTrainer(SimpleConv, {}, config['n_models'])
    ensemble.fit(x_subset, y_subset, epochs=n_epochs)

    return single, ensemble

In [None]:
# Compare estimators by NDCG: how well each uncertainty score ranks validation
# samples by negative log-likelihood, for every training-set size in the sweep.
# The masks are built with config['nn_runs'], the same value the estimators receive
# below, instead of a separately hard-coded 100 that could silently drift.
masks = build_masks(nn_runs=config['nn_runs'])
estimation_samples = 3000  # validation points drawn (without replacement) per repeat
ndcgs, estimator_type, train_size = [], [], []

model_runs = 1   # retrainings per training-set size
repeat_runs = 1  # validation resamples per retrained model


# Full sweep: [500, 2000, 5000, 20000, 60000]
for train_samples in [500]:
    for i in range(model_runs):
        # NOTE: rebinds the notebook-level `trainer` from the earlier training cell.
        trainer, ensemble = retrain(train_samples)
        for j in range(repeat_runs):
            idxs = np.random.choice(len(x_val), estimation_samples, replace=False)
            x_current = x_val[idxs]
            y_current = y_val[idxs]

            # masks
            current_ll = ll(trainer, x_current, y_current)
            for mask_name, mask in masks.items():
                estimator = build_estimator(
                    'bald_masked', trainer, nn_runs=config['nn_runs'], dropout_mask=mask,
                    dropout_rate=config['dropout_uq'], num_classes=10)
                uq = estimator.estimate(x_current)
                ndcgs.append(uq_ndcg(-current_ll, uq))
                estimator_type.append(mask_name)
                train_size.append(train_samples)
                estimator.reset()

            # ensemble
            estimator = build_estimator('bald_ensemble', ensemble, num_classes=10)
            # NOTE(review): the ensemble's uncertainty is ranked against the
            # log-likelihood of the single `trainer`, not of `ensemble` — confirm
            # this is the intended reference.
            current_ll = ll(trainer, x_current, y_current)
            uq = estimator.estimate(x_current)
            ndcgs.append(uq_ndcg(-current_ll, uq))
            estimator_type.append('ensemble')
            train_size.append(train_samples)
            
    
    

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("NDCG on different train samples")

# One row per (estimator, training-set size) measurement collected by the experiment loop.
df = pd.DataFrame(dict(ndcg=ndcgs, estimator_type=estimator_type, train_size=train_size))
sns.boxplot(data=df, x='estimator_type', y='ndcg', hue='train_size', ax=ax)
