Did you know that the `predict` method of scikit-learn's `BayesianRidge` accepts `return_std=True`, in which case it also returns the standard deviation of the predictive distribution for each sample?


# Libraries

In [None]:
import cv2
import pydicom
import numpy as np
import pandas as pd
import pydicom
import os
import math
import random
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from PIL import Image
from sklearn.metrics import mean_absolute_error
from sklearn import decomposition
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, QuantileTransformer
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
import itertools
from sklearn.linear_model import BayesianRidge

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib_venn import venn2
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('fivethirtyeight')
pd.options.display.max_columns = None

import warnings
warnings.filterwarnings('ignore')

# Config

In [None]:
# CONFIG
# Directory holding train.csv, test.csv and sample_submission.csv.
INPUT_DIR = "../input/osic-pulmonary-fibrosis-progression"
# Seed passed as random_state to the shuffled group K-fold split.
SEED = 42
# Number of cross-validation folds; test predictions are averaged over them.
NFOLD = 10
# Which feature scaler to use: 'MinMax' or 'Standard'.
SCALER = 'MinMax'

# Load data

In [None]:
def read_tabular():
    """Read the three competition tables stored under ``INPUT_DIR``.

    Returns:
        A ``(train, test, sub)`` tuple of DataFrames loaded from
        ``train.csv``, ``test.csv`` and ``sample_submission.csv``.
    """
    suffixes = ('/train.csv', '/test.csv', '/sample_submission.csv')
    train, test, sub = (pd.read_csv(INPUT_DIR + suffix) for suffix in suffixes)
    return train, test, sub
# Load the raw tables once; later cells add columns to these frames.
train, test, sub = read_tabular()

In [None]:
# Quick look at the raw training table: dimensions and first rows.
print(train.shape)
train.head()

In [None]:
# Quick look at the raw test table: dimensions and first rows.
print(test.shape)
test.head()

In [None]:
# 'Patient_Week' is '<patient>_<week>': the first piece is the patient ID and
# the last piece is the week number as an integer.
id_parts = sub['Patient_Week'].str.split('_')
sub['Patient'] = id_parts.str[0]
sub['Weeks'] = id_parts.str[-1].map(int)
print(sub.shape)
sub.head()

# Feature engineering

In [None]:
# Give every submission row the patient's test-table columns; 'Weeks' is
# dropped from the test side so the submission's own 'Weeks' is kept.
sub_keys = sub[['Patient', 'Weeks', 'Confidence', 'Patient_Week']]
sub = sub_keys.merge(test.drop(columns=['Weeks']), on='Patient')

# Tag each frame with its origin so the stacked table can be split again.
for frame, origin in ((train, 'train'), (test, 'test'), (sub, 'sub')):
    frame['where'] = origin

data = pd.concat([train, test, sub], ignore_index=True)
print(data.shape)

In [None]:
# construct train input
def fe(data):
    """Add per-patient baseline features and split the table by origin.

    Added columns:
        min_week:     first observed week of the patient, taken over the
                      'train' and 'test' rows only.
        base_FVC:     FVC recorded at ``min_week``.
        base_Percent: Percent recorded at ``min_week``.
        base_week:    ``Weeks - min_week``.

    Args:
        data: stacked frame with at least the columns 'Patient', 'Weeks',
            'FVC', 'Percent' and 'where' ('train', 'test' or 'sub').
            It is not modified.

    Returns:
        ``(train, test, sub)``: the augmented rows for each 'where' value,
        each with a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    data = data.copy()

    # The 'sub' rows are the prediction grid (one row per requested week), not
    # real visits, so they must not take part in the per-patient minimum.
    # Masking the 'test' rows instead would make the first grid week the
    # baseline of every submission patient, unlike how train rows are built.
    data['min_week'] = data['Weeks'].where(data['where'] != 'sub')
    data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

    # Baseline measurement: FVC / Percent at the first observed week.
    at_baseline = data['Weeks'] == data['min_week']
    base = data.loc[at_baseline, ['Patient', 'FVC', 'Percent']].copy()
    base.columns = ['Patient', 'base_FVC', 'base_Percent']
    # Several rows can match per patient; keep the first.
    base = base.drop_duplicates(subset='Patient', keep='first')

    data = data.merge(base, on='Patient', how='left')
    data['base_week'] = data['Weeks'] - data['min_week']

    train = data.loc[data['where'] == 'train', :].reset_index(drop=True)
    test = data.loc[data['where'] == 'test', :].reset_index(drop=True)
    sub = data.loc[data['where'] == 'sub', :].reset_index(drop=True)

    return train, test, sub
# Rebind train/test/sub to the feature-augmented splits returned by fe().
train, test, sub = fe(data)

In [None]:
# Training table after feature engineering: dimensions and first rows.
print(train.shape)
train.head()

In [None]:
# From here on `test` refers to the submission frame `sub` (the same object,
# not a copy); predictions are made for these rows.
test = sub
print(test.shape)
test.head()

# EDA

In [None]:
# Venn diagram of the patient IDs present in `train` versus `test`.
venn2([set(train['Patient'].values.tolist()), set(test['Patient'].values.tolist())])

In [None]:
# plot weeks vs Percent (and FVC)
def plot_weeks_vs(patient: str):
    """Plot Percent and FVC against Weeks for one patient.

    The rows are taken from the global ``train`` frame. Percent is drawn on
    the left axis and FVC on a twin right axis. The FVC series is drawn at
    ``Weeks + 1``, and its last three points are drawn again with square
    markers in a third colour.

    Args:
        patient: value of the 'Patient' column to select.
    """
    palette = sns.color_palette('deep', 3)

    # Select the patient's rows once and reuse them for every series.
    rows = train.loc[train['Patient'] == patient]
    weeks = rows['Weeks'].values
    percent = rows['Percent'].values
    fvc = rows['FVC'].values

    fig, ax = plt.subplots(1, 1, figsize=(7, 4))

    # left axis: Percent
    ax.plot(weeks, percent, '-o', color=palette[0], alpha=0.4)
    ax.set_ylabel('Percent', color=palette[0])
    ax.set_xlabel('Weeks')
    ax.tick_params(axis='y', labelcolor=palette[0])
    ax.set_title(patient)

    # right axis: FVC, with the three latest measurements highlighted
    ax2 = ax.twinx()
    ax2.plot(weeks + 1, fvc, '-^', color=palette[1], alpha=0.4)
    ax2.plot(weeks[-3:] + 1, fvc[-3:], '-s', color=palette[2])
    ax2.set_ylabel('FVC', color=palette[1])
    ax2.tick_params(axis='y', labelcolor=palette[1])

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()
    
# Example: the first patient ID appearing in `train`.
plot_weeks_vs(train['Patient'].unique()[0])

In [None]:
# Example: the second patient ID appearing in `train`.
plot_weeks_vs(train['Patient'].unique()[1])

In [None]:
# The ID comes from `test`, but plot_weeks_vs reads its rows from `train`,
# so this shows that patient's training records (if any).
plot_weeks_vs(test['Patient'].unique()[0])

In [None]:
# Third patient ID of `test`; as above, the plotted rows come from `train`.
plot_weeks_vs(test['Patient'].unique()[2])

Because FVC and Percent are very strongly positively correlated over Weeks, I will simply use features derived from Weeks, FVC, and Percent.

# Fitting

The class below is a group K-fold splitter that shuffles the groups (`shuffle=True`) before assigning them to folds, which scikit-learn's native `GroupKFold` did not offer at the time of writing.

In [None]:
import random
from collections import Counter, defaultdict
from sklearn import model_selection

# ---- GroupKFold ----
class GroupKFold(object):
    """
    Group-aware K-fold splitter with optional shuffling and a sklearn-like API.

    The unique values of a group column are split with ``KFold``; every row
    whose group falls in a fold's validation groups goes to validation, so
    no group is shared between the train and validation sides of a fold.

    Args:
        n_splits: number of folds.
        shuffle: whether to shuffle the groups before splitting.
        random_state: seed for the shuffle; ignored when ``shuffle`` is False.
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the number of folds; the arguments are ignored."""
        return self.n_splits

    def split(self, X, y, group):
        """Yield ``(train_idx, val_idx)`` positional index arrays per fold.

        Args:
            X: DataFrame containing the group column.
            y: unused; kept for signature compatibility.
            group: name of the column in ``X`` holding the group labels.
        """
        # KFold raises ValueError for a non-None random_state when shuffle is
        # False, so forward the seed only when it is actually used.
        seed = self.random_state if self.shuffle else None
        kf = model_selection.KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=seed)
        unique_ids = X[group].unique()
        for tr_group_idx, va_group_idx in kf.split(unique_ids):
            # map the fold's group positions back to group labels, then rows
            tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
            train_idx = np.where(X[group].isin(tr_group))[0]
            val_idx = np.where(X[group].isin(va_group))[0]
            yield train_idx, val_idx

In [None]:
# to normal
ID = 'Patient_Week'      # row identifier column of the submission
target = 'FVC'           # column the model is fitted to
group = 'Patient'        # grouping column for the cross-validation split
features = ['Weeks', 'Percent', 'min_week', 'base_FVC', 'base_Percent', 'base_week']

# standardize the features
if SCALER == "MinMax":
    scaler = MinMaxScaler()
elif SCALER == "Standard":
    scaler = StandardScaler()
else:
    # Fail with a clear message instead of a NameError on `scaler` below.
    raise ValueError(f"Unknown SCALER {SCALER!r}; expected 'MinMax' or 'Standard'.")
# The scaler is fitted on train only and then applied unchanged to test.
train[features] = scaler.fit_transform(train[features])
test[features] = scaler.transform(test[features])

In [None]:
# Number and names of the model input columns.
print(len(features))
features

# Fitting

In [None]:
def lb_metric(data, oof):
    """Mean Laplace log-likelihood of predictions with a confidence value.

    Per row, with ``sigma = max(Confidence, 70)`` and
    ``delta = min(|FVC - FVC_pred|, 1000)``::

        score = -sqrt(2) * delta / sigma - ln(sqrt(2) * sigma)

    Args:
        data: DataFrame with an 'FVC' column. The columns 'FVC_pred',
            'Confidence', 'sigma_clipped', 'diff', 'delta' and 'score'
            are written into it.
        oof: 2-D array; column 0 is the prediction and the last column
            is the confidence.

    Returns:
        The mean of the per-row scores.
    """
    root2 = math.sqrt(2)

    data['FVC_pred'] = oof[:, 0]
    data['Confidence'] = oof[:, -1]

    # confidence is floored at 70, absolute error is capped at 1000
    data['sigma_clipped'] = data['Confidence'].clip(lower=70)
    data['diff'] = (data['FVC'] - data['FVC_pred']).abs()
    data['delta'] = data['diff'].clip(upper=1000)

    penalty = -root2*data['delta']/data['sigma_clipped']
    data['score'] = penalty - np.log(root2*data['sigma_clipped'])
    return data['score'].mean()

In [None]:
%%time
# Column 0 holds the predicted FVC, column 1 the confidence (2 * predictive std).
ypred = np.zeros((test.shape[0], 2))
oof = np.zeros((train.shape[0], 2))

# The feature / target arrays do not change between folds; extract them once.
X_all = train[features].values
y_all = train[target].values
X_test = test[features].values

kf = GroupKFold(n_splits=NFOLD, shuffle=True, random_state=SEED).split(train, train[target], group)

for cnt, (tr_idx, val_idx) in enumerate(kf):
    print(f"FOLD {cnt}")

    # fit on this fold's training patients
    model = BayesianRidge()
    model.fit(X_all[tr_idx, :], y_all[tr_idx])

    # out-of-fold prediction; return_std=True also yields the predictive std
    val_mean, val_std = model.predict(X_all[val_idx, :], return_std=True)
    oof[val_idx, 0] = val_mean
    oof[val_idx, 1] = 2 * val_std

    # test prediction, averaged over the folds
    test_mean, test_std = model.predict(X_test, return_std=True)
    ypred[:, 0] += test_mean / NFOLD
    ypred[:, 1] += 2 * test_std / NFOLD
    print(r'Fold {}: score = {}'.format(cnt, lb_metric(train.iloc[val_idx], oof[val_idx, :])))

# Results

In [None]:
# Score all out-of-fold predictions at once; lb_metric also writes its
# intermediate columns into `train`.
score = lb_metric(train, oof)
print(f'Overall CV = {score}')

In [None]:
# Distribution of the out-of-fold confidence values (last column of `oof`).
plt.hist(oof[:, -1])
plt.title("uncertainty in prediction")
plt.show()

# Prediction

In [None]:
# Reload the untouched sample submission to get its original columns.
submission = pd.read_csv(INPUT_DIR + '/sample_submission.csv')
submission.head()

In [None]:
# Attach the fold-averaged prediction (column 0) and confidence (column 1).
test['FVC_pred'] = ypred[:, 0]
test['Confidence'] = ypred[:, 1]

In [None]:
# First rows of the identifier, prediction and confidence columns.
test[[ID, 'FVC_pred', 'Confidence']].head()

In [None]:
# Summary statistics of the predictions and confidences, one row per column.
test[['FVC_pred', 'Confidence']].describe().T

In [None]:
# Replace the placeholder FVC / Confidence of the sample submission with the
# model output, matched on 'Patient_Week'.
predicted = test[['Patient_Week', 'FVC_pred', 'Confidence']]
sub = submission.drop(columns=['FVC', 'Confidence']).merge(predicted, on='Patient_Week')
# Restore the sample submission's column names (assigned by position).
sub.columns = submission.columns
sub.to_csv('submission.csv', index=False)
sub.head()