# Bayesian Neural Network for obtaining confidence by sampling

This seemed like a nice problem for learning to use Bayesian Neural Networks with Pyro, so I'll try to step through what's going on. With thanks to https://github.com/paraschopra/bayesian-neural-network-mnist/blob/master/bnn.ipynb
https://forum.pyro.ai/t/dealing-with-noise-in-bayesian-neural-network-regression/863
https://github.com/Rachnog/Deep-Trading/blob/master/bayesian/Pyro%20-%20bayesian%20regression.ipynb

https://www.kaggle.com/raghaw/install-segmentation-model-offline-in-infer-kernel

In [None]:
# Create a local directory and copy the Pyro wheel files from the input datasets into it.
!mkdir -p /tmp/pip/cache/
!cp ../input/pyroppl/pyro_ppl-1.3.1-py3-none-any.whl /tmp/pip/cache/
!cp ../input/pyroapi/pyro_api-0.1.2-py3-none-any.whl /tmp/pip/cache/

In [None]:
# Install pyro-ppl 1.3.1 from the local wheel directory only (--no-index skips the package index).
!pip install --no-index --find-links /tmp/pip/cache/ pyro-ppl==1.3.1

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
#Load the dependencies
import pyro
from pyro.infer import EmpiricalMarginal, SVI, Trace_ELBO, TracePredictive
from   torch.distributions import constraints
from pyro.distributions import Normal, Categorical
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam
from torch.autograd import Variable
from tqdm.notebook import tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import pyro.distributions as dist
import torch.nn as nn
from pyro.nn import PyroModule
from torch import optim
import random
from torchvision import models
import torch.multiprocessing as mp
from pyro.nn import PyroSample
from torch.nn import functional as F
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pydicom.pixel_data_handlers.gdcm_handler as gdcm_handler 
import cv2
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import pydicom
import os
from torch.utils.data import DataLoader, Dataset
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Training configuration.
epochs = 100       # number of passes over the training dataloader
batch_size = 8     # rows per DataLoader batch; also caps the plate subsample size in `model`
num_workers = 3    # DataLoader worker processes

# Parameters of the Gamma distribution used for the observation precision
# (prior in `model`, initial values of the learnable parameters in `guide`).
prec_alpha, prec_beta = 3, 1

In [None]:
class Net(torch.nn.Module):
    """Three stacked linear layers mapping ``n_feature`` inputs to one output.

    No activation is applied between the layers, so the network as written
    is an affine function of its input.

    Args:
        n_feature: Number of input features per row.
        n_hidden: Width of both hidden layers.
    """

    def __init__(self, n_feature, n_hidden):
        super().__init__()
        # The attribute names determine the parameter names
        # ('hidden1.weight', 'hidden1.bias', ...) that the prior and guide
        # dictionaries in this file are keyed by.
        self.hidden1 = torch.nn.Linear(n_feature, n_hidden)
        self.hidden2 = torch.nn.Linear(n_hidden, n_hidden)
        self.predict = torch.nn.Linear(n_hidden, 1)

    def forward(self, x):
        """Apply the three layers in order; the last dimension of the result has size 1."""
        return self.predict(self.hidden2(self.hidden1(x)))
    
def model(X, y):
    """Pyro model: Bayesian version of ``regression_model`` with Gaussian noise.

    Every parameter of ``regression_model`` gets an independent standard
    normal prior, and the observation precision gets a
    ``Gamma(prec_alpha, prec_beta)`` prior.

    Args:
        X: Feature tensor whose first dimension indexes rows.
        y: Observed targets, indexed by the same rows as ``X``.
    """
    num_x = X.shape[0]

    # Build one standard-normal prior per parameter, shaped like the parameter.
    # Keying by the module's own parameter names guarantees every parameter is
    # lifted; the hand-written dictionary used 'hidden3.bias', which matches no
    # parameter, so 'hidden2.bias' had no prior while the guide sampled it.
    priors = {
        name: Normal(torch.zeros_like(param), torch.ones_like(param))
        for name, param in regression_model.named_parameters()
    }

    # lift module parameters to random variables sampled from the priors
    lifted_module = pyro.random_module("module", regression_model, priors)
    # sample a regressor (which also samples w and b)
    lifted_reg_model = lifted_module()

    precision = pyro.sample("precision", pyro.distributions.Gamma(prec_alpha, prec_beta))
    noise_scale = 1 / precision.sqrt()

    with pyro.plate("map", len(X), subsample_size=min(num_x, batch_size)) as ind:
        prediction_mean = lifted_reg_model(X[ind]).squeeze(-1)
        # The scalar noise scale broadcasts against the subsampled mean, so
        # the shapes agree even when fewer than len(X) rows are selected.
        pyro.sample(
            "obs",
            pyro.distributions.Normal(prediction_mean, noise_scale),
            obs=y[ind],
        )
        

In [None]:
def _layer_guide_dists(suffix, out_features, in_features):
    """Return the (weight, bias) Normal guide distributions for one linear layer.

    Args:
        suffix: Appended to the param-store names, e.g. ``"2"`` gives
            ``"guide_mean_weight2"``.
        out_features: Number of output units of the layer.
        in_features: Number of input units of the layer.
    """
    # Initial values: random means, scale parameters close to 1 before softplus.
    # pyro.param only uses them the first time a name is registered.
    w_mu = torch.randn(out_features, in_features)
    w_log_sig = torch.ones(out_features, in_features) + 0.05 * torch.randn(out_features, in_features)
    b_mu = torch.randn(out_features)
    b_log_sig = torch.ones(out_features) + 0.05 * torch.randn(out_features)

    # register learnable params in the param store
    mw_param = pyro.param("guide_mean_weight" + suffix, w_mu)
    sw_param = softplus(pyro.param("guide_log_sigma_weight" + suffix, w_log_sig))
    mb_param = pyro.param("guide_mean_bias" + suffix, b_mu)
    sb_param = softplus(pyro.param("guide_log_sigma_bias" + suffix, b_log_sig))

    # gaussian guide distributions for w and b
    return Normal(mw_param, sw_param), Normal(mb_param, sb_param)


def guide(X, y):
    """Variational guide: learnable Gamma over the precision and independent
    Normals over every parameter of ``regression_model``.

    Args:
        X: Unused; present so the signature matches ``model``.
        y: Unused; present so the signature matches ``model``.

    Returns:
        A regressor sampled from the guide distributions.
    """
    # Float initial values: the learnable Gamma parameters must be
    # floating-point tensors (prec_alpha / prec_beta are Python ints).
    alpha = pyro.param("alpha", torch.tensor(float(prec_alpha)), constraint=constraints.positive)
    beta = pyro.param("beta", torch.tensor(float(prec_beta)), constraint=constraints.positive)
    pyro.sample("precision", pyro.distributions.Gamma(alpha, beta))

    w_dist, b_dist = _layer_guide_dists("", n_hidden, n_features)
    w_dist2, b_dist2 = _layer_guide_dists("2", n_hidden, n_hidden)
    w_dist3, b_dist3 = _layer_guide_dists("3", 1, n_hidden)

    dists = {
        'hidden1.weight': w_dist,
        'hidden1.bias': b_dist,
        'hidden2.weight': w_dist2,
        'hidden2.bias': b_dist2,
        'predict.weight': w_dist3,
        'predict.bias': b_dist3,
    }

    # overloading the parameters in the module with random samples from the guide distributions
    lifted_module = pyro.random_module("module", regression_model, dists)
    # sample a regressor
    return lifted_module()

## Metrics
We will use these to measure the model performance in terms we understand, but we will optimize for the Evidence Lower Bound (ELBO).

In [None]:
def metric_loss(pred_fvc, true_fvc, pred_sigma):
    """Per-element evaluation metric, negated so that lower is better.

    Computes ``sqrt(2) * delta / sigma + log(sqrt(2) * sigma)`` where
    ``delta = min(|pred_fvc - true_fvc|, 1000)`` and
    ``sigma = max(pred_sigma, 70)``.

    Args:
        pred_fvc: Predicted FVC tensor.
        true_fvc: True FVC tensor; reshaped to ``pred_fvc``'s shape.
        pred_sigma: Predicted uncertainty tensor.

    Returns:
        Tensor of per-element metric values.
    """
    # A plain Python float follows the device and dtype of the tensors it is
    # combined with, so nothing here depends on a module-level device.
    sqrt_2 = 2.0 ** 0.5
    true_fvc = torch.reshape(true_fvc, pred_fvc.shape)
    sigma_clipped = torch.clamp(pred_sigma, min=70)
    delta = torch.clamp(torch.abs(pred_fvc - true_fvc), max=1000)
    metric = -sqrt_2 * delta / sigma_clipped - torch.log(sqrt_2 * sigma_clipped)
    return -metric

def fvc_loss(pred_fvc, true_fvc):
    """Element-wise absolute error between predicted and true FVC.

    ``true_fvc`` is reshaped to the shape of ``pred_fvc`` before the
    difference is taken.
    """
    target = true_fvc.reshape(pred_fvc.shape)
    return (pred_fvc - target).abs()

In [None]:
def plot_training_loss(train, val, title='loss'):
    """Plot train and validation curves on a new figure and save it as 'training_loss'.

    Args:
        train: Sequence of per-epoch training values.
        val: Sequence of per-epoch validation values.
        title: ``'loss'`` selects the training-loss heading; any other value
            selects the metric-loss heading.
    """
    heading = 'Model Training Loss' if title == 'loss' else 'Model Metric Loss'

    plt.figure()
    for series, name in ((train, 'Train'), (val, 'Val')):
        plt.plot(series, label=name)
    plt.title(heading)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig('training_loss')

## Prepare Data
All in the same way as my tabular pytorch model

### Load Dataframes

In [None]:
# Read the competition's train, test and sample-submission CSV files.
train = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

### Prepare Training Data (Tabular)

In [None]:
# Earliest recorded week for each patient.
train['base_Weeks'] = train.groupby(['Patient'])['Weeks'].transform('min')

# Rows taken at that earliest week supply the baseline FVC / Percent values;
# keep only the first such row per (Patient, Weeks) pair.
base = (
    train[train.Weeks == train.base_Weeks]
    .rename(columns={'FVC': 'base_FVC', 'Percent': 'base_Percent'})
    .drop_duplicates(subset=['Patient', 'Weeks'], keep='first')
)

# Attach the baseline values to every row of the same patient.
train = train.merge(base[['Patient', 'base_FVC', 'base_Percent']], on='Patient', how='left')

# Weeks elapsed since the patient's earliest measurement.
train['Week_passed'] = train['Weeks'] - train['base_Weeks']

### Prepare Test Data (tabular)

In [None]:
# Rename the test columns to the baseline names used in the training frame.
test = test.rename(columns={'Weeks': 'base_Weeks', 'FVC': 'base_FVC', 'Percent': 'base_Percent'})

# Sample submission: one row per Patient_Week to predict.
submission = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/sample_submission.csv")

# Patient_Week has the form '<patient id>_<week>': split it once and keep both parts.
patient_week_parts = submission['Patient_Week'].str.split('_')
submission['Patient'] = patient_week_parts.str[0]
submission['Weeks'] = patient_week_parts.str[1].astype(int)

# Pair every requested week with the patient's baseline row.
test = submission.drop(columns=["FVC", "Confidence"]).merge(test, on='Patient')
test['Week_passed'] = test['Weeks'] - test['base_Weeks']

# Keep the training columns (minus the targets), in the training column order.
test = test[train.columns.drop(['FVC', 'Percent'])]

### OH Encode Sex and Smoking
With thanks to https://www.kaggle.com/ulrich07/osic-keras-starter-with-custom-metrics

In [None]:
COLS = ['Sex', 'SmokingStatus']
for col in COLS:
    # One 0/1 indicator column per category seen in train; test gets the same
    # columns, named after the category value.
    for mod in train[col].unique():
        train[mod] = train[col].eq(mod).astype(int)
        test[mod] = test[col].eq(mod).astype(int)
    # The original categorical column is no longer needed in either frame.
    train.drop(columns=col, inplace=True)
    test.drop(columns=col, inplace=True)

### Rescale based on train data

In [None]:
from sklearn import preprocessing

robust_scaler = preprocessing.RobustScaler()

# Every column except these five is fitted and rescaled on the training frame.
scaled_cols = train.columns.difference(['Patient', 'FVC', 'Percent', 'Weeks', 'base_Weeks'])
train[scaled_cols] = robust_scaler.fit_transform(train[scaled_cols])

### I've used this to avoid the files dcmread can't load (future version with CNN)

In [None]:
import pickle

# NOTE(review): pickle.load can execute code embedded in the file; these
# inputs are assumed to be trusted.
with open('../input/file-dictionary/good_files.pickle', 'rb') as good_fh:
    good_file_dict = pickle.load(good_fh)

with open('../input/file-dictionary/bad_files.pickle', 'rb') as bad_fh:
    bad_file_dict = pickle.load(bad_fh)

In [None]:
class OSIC(Dataset):
    """Tabular dataset over the rows of ``df`` that belong to ``patient_ids``.

    Args:
        patient_ids: Patient identifiers whose rows are kept.
        df: Frame with a ``Patient`` column; ``FVC`` is required when
            ``train`` is True.
        file_dict: Stored on the instance; not read by this class.
        train: When True, items include the ``fvc`` target. When False, the
            feature columns are rescaled with the module-level
            ``robust_scaler`` and items contain features only.
        transform: Accepted but not used.
        nims: Stored on the instance; not read by this class.
    """

    # Columns excluded from rescaling in the non-training branch.
    _NOT_SCALED = ['Patient', 'FVC', 'Percent', 'Weeks', 'base_Weeks']
    # Columns excluded from the feature matrix.
    _NOT_FEATURES = ['FVC', 'Patient', 'Percent']

    def __init__(self, patient_ids, df, file_dict, train=True, transform=None, nims=10):
        # Take an explicit copy: the boolean-mask selection is otherwise
        # treated by pandas as a slice of the caller's frame, and the in-place
        # rescaling below would trip its chained-assignment check.
        self.df = df[df.Patient.isin(patient_ids)].copy()
        self.train = train
        if self.train:
            self.fvc = self.df['FVC'].values
        else:
            scaled_cols = self.df.columns.difference(self._NOT_SCALED)
            self.df[scaled_cols] = robust_scaler.transform(self.df[scaled_cols])

        self.data = self.df[self.df.columns.difference(self._NOT_FEATURES)].values
        self.patients = self.df['Patient'].values
        self.file_dict = file_dict
        self.nims = nims

    def __len__(self):
        """Number of rows kept for the selected patients."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict with ``'data'`` and, when training, ``'fvc'`` for row ``idx``."""
        if self.train:
            return {'fvc': self.fvc[idx], 'data': self.data[idx]}
        return {'data': self.data[idx]}

### Split training data into train and val by patient (90:10)
'ID00011637202177653955184' has no images we can load, so I'm going to drop it.

We shuffle the train data

### Set up datasets and dataloaders

In [None]:
# Unique patients, minus the one that is excluded, in random order.
ids = train.Patient.unique()
ids = list(ids[ids != 'ID00011637202177653955184'])
random.shuffle(ids)
ids = np.array(ids)

# The first 90% of the shuffled patients train the model; the rest validate it.
train_ids, val_ids = np.split(ids, [int(round(0.9 * len(ids), 0))])

train_dataset = OSIC(train_ids, train, good_file_dict)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

val_dataset = OSIC(val_ids, train, good_file_dict)
# Validation batches are read in dataset order: the averaged metrics do not
# depend on order, and predictions gathered from this loader are later
# plotted against val_dataset.fvc, which is in dataset order.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

## Training

Build the model with the correct input shape

In [None]:
# Hidden width is fixed; input width is read from the prepared feature matrix.
n_hidden = 100
n_features = train_dataset.data.shape[1]

# Deterministic network that the model and guide lift to a Bayesian one.
regression_model = Net(n_features, n_hidden)
# Used by the guide to map its scale parameters to positive values.
softplus = nn.Softplus()

n_trainable = sum(p.numel() for p in regression_model.parameters() if p.requires_grad)
print('Number of parameters:')
print(n_trainable)

Set up a prediction function that takes the average of a number of sampled models and returns the mean and standard deviation of the outputs

In [None]:
def predict(x, num_samples):
    """Summarise the predictions of ``num_samples`` networks drawn from the guide.

    Args:
        x: Input tensor passed to every sampled network.
        num_samples: Number of networks to sample.

    Returns:
        Tuple ``(mean, std)`` of the sampled networks' outputs, both reduced
        over the sample dimension.
    """
    # Draw all networks first, then evaluate each one on the same input.
    sampled_models = [guide(None, None) for _ in range(num_samples)]
    stacked = torch.stack([net(x) for net in sampled_models])
    return stacked.mean(dim=0), stacked.std(dim=0)

In [None]:
from pyro.infer import SVI, Trace_ELBO


# Start from an empty parameter store so reruns of this cell train from scratch.
pyro.clear_param_store()
adam = pyro.optim.Adam({"lr": 0.03})
svi = SVI(model, guide, adam, loss=Trace_ELBO())

epoch_val_metric = []
epoch_train_fvc = []
epoch_train_loss = []
#Start by training for fvc
for epoch in range(epochs):
    train_loss = 0
    train_fvc = 0
    val_metric = 0
    val_fvc = 0

    for data in train_dataloader:
        features = data['data'].float()
        # Accumulate the loss over all batches so the reported value is the
        # epoch average rather than the last batch divided by the batch count.
        train_loss += svi.step(features, data['fvc'].float())
        # Monitoring only: no gradients are needed for these predictions.
        with torch.no_grad():
            mean, std = predict(features, 10)
        train_fvc += fvc_loss(mean, data['fvc']).mean().item()

    n_train_batches = len(train_dataloader)
    print('====> Epoch: {} Average train loss ELBO: {:.4f}'.format(
                        epoch, train_loss / n_train_batches))
    print('====> Epoch: {} Average train fvc absolute loss: {:.4f}'.format(
                        epoch, train_fvc / n_train_batches))
    epoch_train_loss.append(train_loss / n_train_batches)
    # Store the per-batch average, matching what is printed above.
    epoch_train_fvc.append(train_fvc / n_train_batches)

    with torch.no_grad():
        for data in val_dataloader:
            mean, std = predict(data['data'].float(), 10)
            val_metric += metric_loss(mean, data['fvc'], std).mean().item()
            val_fvc += fvc_loss(mean, data['fvc']).mean().item()

    n_val_batches = len(val_dataloader)
    print('====> Epoch: {} Average val metric: {:.4f}'.format(
                        epoch, val_metric / n_val_batches))
    print('====> Epoch: {} Average val fvc absolute loss: {:.4f}'.format(
                        epoch, val_fvc / n_val_batches))
    epoch_val_metric.append(val_metric / n_val_batches)
    

In [None]:
# Print every entry of the Pyro parameter store by name.
param_store = pyro.get_param_store()
for name, _ in param_store.items():
    print(name, pyro.param(name))

# Plot training curves

In [None]:
# Per-epoch training ELBO loss on a log-scaled y axis.
plt.plot(epoch_train_loss)
plt.yscale('log')
plt.xlabel('Epochs')
plt.ylabel('Trace ELBO')
plt.title('ELBO loss on train data')

In [None]:
# Per-epoch validation metric.
plt.plot(epoch_val_metric)
plt.xlabel('Epochs')
plt.ylabel('Evaluation Metric')
plt.title('Metric on Validation Data with ' + str(10) + " samples")
# Test Data

In [None]:
# Re-read the sample submission so it holds only its original columns.
submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

In [None]:
# Test rows are read in frame order (no shuffling) so that the predictions
# line up with the rows of `test` when written back below.
test_ids = test.Patient.unique()
test_dataset = OSIC(test_ids, test, good_file_dict, train=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

fvc_pred, sigma_pred = [], []
with torch.no_grad():
    for data in test_dataloader:
        # Mean and spread over 50 networks sampled from the guide.
        mean, std = predict(data['data'].float(), 50)
        fvc_pred.append(mean)
        sigma_pred.append(std)

fvc_pred = torch.cat(fvc_pred, dim=0)
sigma_pred = torch.cat(sigma_pred, dim=0)
test['FVC'] = fvc_pred.cpu().numpy()
test['Confidence'] = sigma_pred.cpu().numpy()


In [None]:
# Rebuild the '<patient id>_<week>' key used by the submission file.
test['Patient_Week'] = test['Patient'] + '_' + test['Weeks'].astype(str)

In [None]:
# Attach the predictions to the submission rows by Patient_Week.
predictions = test[['Patient_Week', 'FVC', 'Confidence']]
submission = submission[['Patient_Week']].merge(predictions, on='Patient_Week')

In [None]:
# Write the submission without the index, floats rounded to one decimal place.
submission.to_csv('submission.csv', float_format='%.1f', index=False)

# Post-Match Analysis

## Test Predictions

In [None]:
# Predicted FVC against predicted confidence for the test rows.
plt.scatter(submission['FVC'], submission['Confidence'])
plt.xlabel('FVC')
plt.ylabel('Confidence')
plt.title('Test')

## Train

In [None]:
# Read the training set in dataset order. The training dataloader shuffles,
# so predictions gathered from it would not line up with train_dataset.fvc
# when the two are compared afterwards.
ordered_train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

fvc_pred = []
sigma_pred = []
with torch.no_grad():
    for data in ordered_train_loader:
        # Mean and spread over 30 networks sampled from the guide.
        mean, std = predict(data['data'].float(), 30)
        fvc_pred.append(mean)
        sigma_pred.append(std)
fvc_pred_train = torch.cat(fvc_pred, dim=0)
sigma_pred_train = torch.cat(sigma_pred, dim=0)

plt.scatter(fvc_pred_train.cpu().numpy(), sigma_pred_train.cpu().numpy())
plt.title('Train Confidence vs. FVC')
plt.xlabel('FVC')
plt.ylabel('Confidence')


In [None]:
# True training FVC against the predicted mean.
plt.scatter(train_dataset.fvc, fvc_pred_train.cpu().numpy())
plt.xlabel('True FVC')
plt.ylabel('Predicted FVC')
plt.title('Train: predicted FVC vs true FVC')

## Val

In [None]:
# Read the validation set in dataset order so the predictions line up with
# val_dataset.fvc when the two are compared afterwards.
ordered_val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

fvc_pred = []
sigma_pred = []
with torch.no_grad():
    for data in ordered_val_loader:
        # Mean and spread over 30 networks sampled from the guide.
        mean, std = predict(data['data'].float(), 30)
        fvc_pred.append(mean)
        sigma_pred.append(std)
fvc_pred_val = torch.cat(fvc_pred, dim=0)
sigma_pred_val = torch.cat(sigma_pred, dim=0)

plt.scatter(fvc_pred_val.cpu().numpy(), sigma_pred_val.cpu().numpy())
plt.title('Val')
plt.xlabel('FVC')
plt.ylabel('Confidence')

In [None]:
# True validation FVC against the predicted mean.
plt.scatter(val_dataset.fvc, fvc_pred_val.cpu().numpy())
plt.xlabel('True FVC')
plt.ylabel('Predicted FVC')
plt.title('Val: predicted FVC vs true FVC')

## All

In [None]:
# Overlaid histograms of the predicted FVC for the three splits.
for split_values, split_name in (
    (submission['FVC'], 'test'),
    (fvc_pred_train.cpu().numpy(), 'train'),
    (fvc_pred_val.cpu().numpy(), 'val'),
):
    plt.hist(split_values, alpha=0.5, label=split_name)
plt.legend()
plt.title('Histogram of FVC predictions')

In [None]:
# Overlaid histograms of the predicted confidence for the three splits.
for split_values, split_name in (
    (submission['Confidence'], 'test'),
    (sigma_pred_train.cpu().numpy(), 'train'),
    (sigma_pred_val.cpu().numpy(), 'val'),
):
    plt.hist(split_values, alpha=0.5, label=split_name)
plt.legend()
plt.title('Histogram of Confidence predictions')