In [0]:
# !python -m pip install git+https://github.com/stat-ml/alpaca

In [0]:
import os
from pathlib import Path
import random

import matplotlib.pyplot as plt
import pickle
import torch
import numpy as np
from sklearn.model_selection import train_test_split

In [0]:
from alpaca.model.ensemble import MLPEnsemble
from alpaca.model.mlp import MLP
from alpaca.dataloader.builder import build_dataset
from alpaca.uncertainty_estimator import build_estimator
from alpaca.analysis.metrics import get_uq_metrics

In [0]:
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import KFold

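# helpers: standardization (the fitted scaler is returned so values can be scaled back), OOD split, repeated k-fold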
def scale(train, val):
    """Standardize `train` and `val` using statistics fitted on `train` only.

    Returns:
        (train_scaled, val_scaled, scaler) - the fitted `StandardScaler` is
        handed back so callers can later apply `inverse_transform`.
    """
    fitted = StandardScaler().fit(train)
    return fitted.transform(train), fitted.transform(val), fitted


def split_ood(x_all, y_all, percentile=10):
    """Split samples around a percentile threshold of the targets.

    Rows whose target is at or below the `percentile`-th percentile of `y_all`
    form the train part; rows strictly above it form the OOD part.
    Row indices are taken from the first axis of the comparison result
    (presumably `y_all` is 1-D or a single column - verify against callers).

    Returns:
        x_train, y_train, x_ood, y_ood
    """
    cutoff = np.percentile(y_all, percentile)
    rows_above = np.nonzero(y_all > cutoff)[0]
    rows_at_or_below = np.nonzero(y_all <= cutoff)[0]
    return (
        x_all[rows_at_or_below],
        y_all[rows_at_or_below],
        x_all[rows_above],
        y_all[rows_above],
    )


def multiple_kfold(k, data_size, max_iterations):
    """Yield `(train_idx, val_idx)` pairs from repeated k-fold splits.

    A fresh random permutation of `range(data_size)` is drawn (via the global
    numpy RNG) at the start of every round of `k` folds; at most
    `max_iterations` pairs are produced in total.
    """
    splitter = KFold(k)
    produced = 0
    while produced < max_iterations:
        # new shuffle for each full pass over the k folds
        shuffled = np.random.permutation(data_size)
        for train_pos, val_pos in splitter.split(shuffled):
            if produced == max_iterations:
                return
            yield shuffled[train_pos], shuffled[val_pos]
            produced += 1

In [0]:
# reproducibility: seed every RNG the notebook draws from
SEED = 10
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Select the first GPU only when CUDA is present, so this cell does not
# crash on a CPU-only machine.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [0]:
from alpaca.uncertainty_estimator.masks import build_masks, DEFAULT_MASKS
from alpaca.model.ensemble import MLPEnsemble
from alpaca.uncertainty_estimator import build_estimator
from alpaca.analysis.metrics import get_uq_metrics

In [0]:
# estimator that invokes low-level functions
def estimate(name, x_val, y_val, model):
  """Score one dropout-mask uncertainty estimator on a single data split.

  Reads two notebook-level globals: `config` (keys 'dropout_uq' and
  'acc_percentile') and `y_scaler` (the target scaler assigned by the
  training loop before this function is called).

  Args:
    name: mask name, forwarded to `build_estimator` as `dropout_mask`.
    x_val: scaled input features of the split.
    y_val: scaled targets of the split.
    model: trained model; it is called on a CUDA double tensor.

  Returns:
    `[acc, ndcg, ll, rmse, name, ues]`, where `ues` holds all uncertainty
    estimates as a bracketed, whitespace-separated string ('[v1 v2 ...]').
  """
  x_val_tensor = torch.DoubleTensor(x_val).cuda()
  predictions = model(x_val_tensor).cpu().detach().numpy()

  # absolute errors in the scaled target space, used by the UQ metrics
  errors = np.abs(predictions - y_val)

  # RMSE is reported after mapping predictions and targets back through y_scaler
  unscaled_errors = (y_scaler.inverse_transform(predictions)
                     - y_scaler.inverse_transform(y_val))
  rmse = np.sqrt(np.mean(np.square(unscaled_errors)))

  estimator = build_estimator(
      'mcdue_masked', model, nn_runs=100,
      dropout_mask=name, dropout_rate=config['dropout_uq'])

  estimations = estimator.estimate(x_val_tensor)
  acc, ndcg, ll = get_uq_metrics(estimations, errors,
                                 config['acc_percentile'],
                                 bins=[80, 95, 99])

  # str(ndarray) replaces the middle of arrays longer than 1000 elements with
  # '...', which cannot be parsed back into floats. Write every value
  # explicitly, keeping the same '[v1 v2 ...]' layout.
  ues = '[' + ' '.join(f'{float(v):.8g}' for v in np.ravel(estimations)) + ']'
  return [acc, ndcg, ll, rmse, name, ues]

In [0]:
import pandas as pd

In [0]:
# TRAINING

In [0]:
# Train one MLP per (dataset, feature, run) on a median split of that feature
# and score every dropout mask on both halves.
# NOTE(review): `config` (keys 'layers', 'dropout_uq', 'acc_percentile') is not
# defined in any visible cell - it must exist before this cell runs.
res = []
for dataset_name in ['boston_housing', 'concrete',
                     'energy_efficiency', 
                     'kin8nm',
                     'naval_propulsion', 'ccpp',
                     'red_wine', 'yacht_hydrodynamics'
                     ]:
  print(f'=={dataset_name}==')
  dataset = build_dataset(dataset_name, val_split=0.0)
  x_set, y_set = dataset.dataset('train')
  config['layers'][0] = x_set.shape[-1]

  for dim in range(x_set.shape[-1]):
    # the split depends only on the feature, so compute it once per dim
    med = np.median(x_set[:, dim])
    below = x_set[:, dim] < med
    at_or_above = x_set[:, dim] >= med
    if not below.any() or not at_or_above.any():
      # e.g. a feature whose minimum equals its median: one half is empty
      # and the scaler cannot be fitted on it
      print(f'skipping dim {dim}: median split leaves one side empty')
      continue

    for run in range(5):
      x_train, y_train = x_set[below], y_set[below]
      x_val, y_val = x_set[at_or_above], y_set[at_or_above]
      # randomly choose which half is used for training
      if np.random.random() > .5:
        x_train, x_val = x_val, x_train
        y_train, y_val = y_val, y_train

      x_train, x_val, x_scaler = scale(x_train, x_val)
      # y_scaler is read as a global by estimate()
      y_train, y_val, y_scaler = scale(y_train, y_val)

      model = MLP(config['layers'])
      train_opts = {}
      # fitting
      model.fit((x_train, y_train),
                (x_val, y_val),
                **train_opts)
      # UE: every mask is scored on the train half, then on the val half
      for name in DEFAULT_MASKS + ['cov_dpp', 'cov_k_dpp']:
        for split_name, x_part, y_part in (('train', x_train, y_train),
                                           ('val', x_val, y_val)):
          row = estimate(name, x_part, y_part, model)
          res.append(row + [dataset_name, dim, run, split_name])

      # checkpoint all results so far after every run
      dfq = pd.DataFrame(res,
                  columns = ['acc', 'ndcg', 'll',
                              'rmse', 'mask_name', 'ues',
                              'dataset_name', 'dim', 'run', 'split'
                              ])
      dfq.to_csv('fname.csv', index=False)

In [0]:
# MAKING TABLES

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
data = []
erc = 0  # number of groups skipped because their UE strings were unusable


def _parse_ues(ue_str):
    """Parse a '[v1 v2 ...]' string from the `ues` column into a float array."""
    return np.array([float(token) for token in ue_str[1:-1].split()])


# postprocessing UE data: for each (mask, dataset, dim, run) compute the share
# of val estimates that exceed a percentile of the train estimates
for q, dfx in dfq.groupby(['mask_name', 
                          'dataset_name', 'dim', 'run']):
    try:
        vals_train = _parse_ues(dfx.loc[dfx['split'] == 'train', 'ues'].values[0])
        vals_val = _parse_ues(dfx.loc[dfx['split'] == 'val', 'ues'].values[0])
        if len(vals_train) == 0 or len(vals_val) == 0:
            raise ValueError('empty uncertainty estimates')
    except (IndexError, TypeError, ValueError) as e:
        # missing split, non-string cell, or unparsable token (e.g. the '...'
        # numpy inserts when str() summarizes a long array)
        erc += 1
        print(f'skipping {q}: {e}')
        continue

    for perc in [80, 90, 95]:
        # threshold taken from the train-split estimates
        med = np.percentile(vals_train, perc)
        data.append([
            dfx.dataset_name.values[0],
            dfx.dim.values[0],
            dfx.run.values[0],
            dfx.mask_name.values[0],
            perc,
            100 * np.count_nonzero(vals_val > med) / len(vals_val)  # percent of val above threshold
        ])

if erc:
    print(f'{erc} group(s) skipped during postprocessing')

In [0]:
# tabulate the rows collected in `data`
df_perc = pd.DataFrame(
    data=data,
    columns=['dataset_name', 'dim', 'run', 'mask_name', 'perc', 'ratio'],
)

In [0]:
# reformatting for latex tables: one "mean±std" string of `ratio`
# per (dataset, mask, percentile) group
data2 = []
grouped = df_perc.groupby(['dataset_name', 'mask_name', 'perc'])
for (dataset, mask, perc), df_temp in grouped:
    ratios = df_temp['ratio']
    summary = f'{ratios.mean():.1f}±{ratios.std():.1f}'
    data2.append([dataset, mask, perc, summary])
dfres = pd.DataFrame(
    data2,
    columns=['dataset', 'mask', 'percentile', 'ood_ratio'],
)

In [0]:
# output: per dataset, a percentile x mask table restricted to the masks below
shown_masks = ['mc_dropout', 'ht_leverages', 'ht_dpp', 'cov_k_dpp']
for dset in dfres.dataset.unique():
    print(dset)
    # 'mask' must be accessed with [] because DataFrame.mask is a method
    selected = dfres[(dfres.dataset == dset) & (dfres['mask'].isin(shown_masks))]
    table = selected.pivot(index='percentile', columns='mask', values='ood_ratio')
    # fix the column order
    display(table[shown_masks])