This notebook is based on @ulrich07's https://www.kaggle.com/ulrich07/osic-multiple-quantile-regression-starter, re-implemented in PyTorch. If you are not familiar with TensorFlow, refer to this notebook instead.
By the way, PyTorch is a bit slower than TensorFlow here.

In [None]:
import numpy as np
import pandas as pd
import pydicom
import os
import random
import matplotlib.pyplot as plt
from tqdm import tqdm

from PIL import Image
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")

In [None]:
import torch.nn as nn
import torch
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

In [None]:
def seed_everything(seed=2020):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED`` and, when CUDA is available, seeds the GPU
    generators and sets the cuDNN flags below.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # seed every GPU, not just the current one
    cudnn = torch.backends.cudnn
    cudnn.enabled = True
    cudnn.deterministic = True
    # Benchmark mode speeds up training when input shapes are stable, but it
    # is switched off here together with deterministic=True.
    cudnn.benchmark = False
    
seed_everything(42)

In [None]:
# Directory that holds the competition CSV files read below.
ROOT = "../input/osic-pulmonary-fibrosis-progression"
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Training measurements. Any (Patient, Weeks) pair that occurs more than once
# is dropped entirely (keep=False), not reduced to a single row.
tr = pd.read_csv(f"{ROOT}/train.csv").drop_duplicates(
    subset=['Patient', 'Weeks'], keep=False
)
chunk = pd.read_csv(f"{ROOT}/test.csv")

print("add infos")
sub = pd.read_csv(f"{ROOT}/sample_submission.csv")
# "Patient_Week" is split on "_": the first token is the patient id and the
# last token is the week number.
sub['Patient'] = sub['Patient_Week'].str.split('_').str[0]
sub['Weeks'] = sub['Patient_Week'].str.split('_').str[-1].map(int)
sub = sub[['Patient', 'Weeks', 'Confidence', 'Patient_Week']]
# Bring in the remaining test.csv columns for each patient; test.csv's own
# Weeks column is dropped so it does not clash with the submission week.
sub = sub.merge(chunk.drop(columns='Weeks'), on="Patient")

In [None]:
sub.head()


In [None]:
# Tag each frame with its origin so the combined frame can be split again
# after feature engineering.
tr['WHERE'] = 'train'
chunk['WHERE'] = 'val'
sub['WHERE'] = 'test'
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the same rows
# in the same order and, like append, keeps each frame's original index.
data = pd.concat([tr, chunk, sub])

In [None]:
sub.iloc[:10]

In [None]:
# Sanity check: shape of each frame, then the number of distinct patients in each.
print(tr.shape, chunk.shape, sub.shape, data.shape)
print(tr.Patient.nunique(), chunk.Patient.nunique(), sub.Patient.nunique(), 
      data.Patient.nunique())
#

In [None]:
# Earliest observed week per patient. Rows tagged 'test' are blanked out first
# so they do not take part in the minimum.
# Series.mask produces the NaN-holding column directly. Copying 'Weeks' and
# then writing NaN through .loc relies on silent upcasting when 'Weeks' is an
# integer column, which pandas has deprecated.
data['min_week'] = data['Weeks'].mask(data['WHERE'] == 'test')
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

In [None]:
data.head()

In [None]:
# One baseline FVC per patient: the FVC recorded at that patient's earliest
# week. If several rows share that week, the first one in frame order wins,
# which is what the former counter / cumsum / filter sequence selected.
base = (
    data.loc[data.Weeks == data.min_week, ['Patient', 'FVC']]
    .rename(columns={'FVC': 'min_FVC'})
    .drop_duplicates(subset='Patient', keep='first')
)

In [None]:
# Attach each patient's baseline FVC, then express every row's week relative
# to that patient's earliest week.
data = pd.merge(data, base, how='left', on='Patient')
data['base_week'] = data['Weeks'].sub(data['min_week'])
del base

In [None]:
data.head()

In [None]:
COLS = ['Sex', 'SmokingStatus']  # 'Age' is not one-hot encoded
# FE collects the names of every model input column.
FE = []
for col in COLS:
    levels = data[col].unique()
    FE.extend(levels)
    # one 0/1 indicator column per distinct value, named after the value itself
    for level in levels:
        data[level] = data[col].eq(level).astype(int)
#=================

In [None]:
data.head()

In [None]:
#
# Min-max scale each numeric input to [0, 1] using the min and max of the
# combined frame.
for scaled_name, raw_name in (
    ('age', 'Age'),
    ('BASE', 'min_FVC'),
    ('week', 'base_week'),
    ('percent', 'Percent'),
):
    raw = data[raw_name]
    low, high = raw.min(), raw.max()
    data[scaled_name] = (raw - low) / (high - low)
FE.extend(['age', 'percent', 'week', 'BASE'])

In [None]:
data.head()

In [None]:
# Split the engineered frame back into its three sources. Each piece is an
# explicit copy, so columns added later are written to an independent frame
# rather than to a slice of `data`.
tr = data.loc[data.WHERE == 'train'].copy()
chunk = data.loc[data.WHERE == 'val'].copy()
sub = data.loc[data.WHERE == 'test'].copy()
del data

In [None]:
tr.shape, chunk.shape, sub.shape

### BASELINE NN 

In [None]:
import torch.nn.functional as F

class MishFunction(torch.autograd.Function):
    """Mish activation, ``x * tanh(softplus(x))``, with a hand-written backward.

    Only the input is saved for the backward pass; the intermediate values are
    recomputed there.
    """

    @staticmethod
    def forward(ctx, x):
        """Return ``x * tanh(ln(1 + exp(x)))`` and save ``x`` for backward."""
        ctx.save_for_backward(x)
        return x * torch.tanh(F.softplus(x))

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to ``x``."""
        # `saved_tensors` is the supported accessor; `saved_variables` is a
        # deprecated alias.
        (x,) = ctx.saved_tensors
        sigmoid = torch.sigmoid(x)  # derivative of softplus(x)
        tanh_sp = torch.tanh(F.softplus(x))
        # d/dx [x * tanh(sp)] = tanh(sp) + x * sigmoid(x) * (1 - tanh(sp)^2)
        return grad_output * (tanh_sp + x * sigmoid * (1 - tanh_sp * tanh_sp))

class Mish(nn.Module):
    """Module wrapper that applies ``MishFunction`` to its input."""

    def forward(self, x):
        """Return the Mish activation of ``x``."""
        activated = MishFunction.apply(x)
        return activated

def to_Mish(model):
    """Recursively replace every ``nn.ReLU`` child of ``model`` with ``Mish``.

    The module tree is modified in place; nothing is returned.
    """
    for name, submodule in model.named_children():
        if not isinstance(submodule, nn.ReLU):
            to_Mish(submodule)  # descend into containers and other modules
            continue
        setattr(model, name, Mish())

In [None]:

#C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")
# Bounds used by `score`: the spread is floored at C1 and the absolute error
# is capped at C2. Plain floats keep the metric independent of any device, so
# it works on CPU or GPU tensors alike.
C1, C2 = 70.0, 1000.0
#=============================#
def score(y_true, y_pred):
    """Return the mean of ``sqrt(2) * delta / sigma + log(sqrt(2) * sigma)``.

    Args:
        y_true: Tensor whose first column holds the target values.
        y_pred: Tensor with three columns. Column 1 is the point prediction
            and column 2 minus column 0 is used as the spread ``sigma``.

    Returns:
        Scalar tensor; ``sigma`` is floored at ``C1`` and the absolute error
        ``delta`` is capped at ``C2`` before averaging.
    """
    y_true = y_true.to(torch.float)
    y_pred = y_pred.to(torch.float)

    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]
    # clamp with Python scalars works for inputs on any device
    sigma_clip = torch.clamp(sigma, min=C1)
    delta = torch.clamp(torch.abs(y_true[:, 0] - fvc_pred), max=C2)
    sq2 = 2.0 ** 0.5
    metric = (delta / sigma_clip) * sq2 + torch.log(sigma_clip * sq2)
    return torch.mean(metric)

#============================#
def qloss(y_true, y_pred):
    """Mean pinball (quantile) loss for the 0.2, 0.5 and 0.8 quantiles.

    ``y_pred`` carries one column per quantile and ``y_true`` must broadcast
    against it. The quantile tensor is created on ``y_true``'s device.
    """
    quantiles = torch.tensor(
        [[0.2, 0.50, 0.8]], dtype=torch.float32, device=y_true.device
    )
    residual = y_true - y_pred
    pinball = torch.max(quantiles * residual, (quantiles - 1) * residual)
    return pinball.mean()
#=============================#
def mloss(_lambda):
    """Build a loss that blends ``qloss`` and ``score``.

    Args:
        _lambda: Weight given to ``qloss``; ``score`` receives ``1 - _lambda``.

    Returns:
        A callable ``loss(y_true, y_pred)``.
    """
    def loss(y_true, y_pred):
        quantile_term = qloss(y_true, y_pred)
        metric_term = score(y_true, y_pred)
        return _lambda * quantile_term + (1 - _lambda) * metric_term
    return loss
#=================
class make_model(nn.Module):
    """MLP with two hidden layers and two output heads.

    The output is ``p1 + cumsum(p2)`` along the output dimension, where ``p1``
    is a linear head and ``p2`` is a linear head followed by Mish.
    NOTE(review): Mish can return small negative values, so the cumulative
    increments are not guaranteed to be non-negative.
    """

    def __init__(self, in_ch, out_ch=3):
        """
        Args:
            in_ch: Number of input features.
            out_ch: Number of outputs per sample (default 3).
        """
        super().__init__()
        hidden = 100
        # Attribute names and creation order are kept stable: the names are
        # the state_dict keys and the order fixes how initial weights are drawn.
        self.fc1 = nn.Sequential(nn.Linear(in_ch, hidden), Mish())
        self.fc2 = nn.Sequential(nn.Linear(hidden, hidden), Mish())
        self.fc3_p1 = nn.Linear(hidden, out_ch)
        self.fc3_p2 = nn.Sequential(nn.Linear(hidden, out_ch), Mish())

    def forward(self, x):
        """Return ``fc3_p1(h) + cumsum(fc3_p2(h), dim=1)`` for hidden state ``h``."""
        features = self.fc2(self.fc1(x))
        offsets = self.fc3_p1(features)
        increments = self.fc3_p2(features)
        return offsets + torch.cumsum(increments, dim=1)
    
    #x = L.Dense(100, activation="relu", name="d1")(z)
    #x = L.Dense(100, activation="relu", name="d2")(x)
    #x = L.Dense(100, activation="relu", name="d3")(x)
    #p1 = L.Dense(3, activation="linear", name="p1")(x)
    #p2 = L.Dense(3, activation="relu", name="p2")(x)
    #preds = L.Lambda(lambda x: x[0] + tf.cumsum(x[1], axis=1), 
    #                 name="preds")([p1, p2])
    
    #model.compile(loss=qloss, optimizer="adam", metrics=[score])


In [None]:
# Smoke test: evaluate the combined loss on two hand-written samples.
Mloss = mloss(0.8)
y_true = torch.tensor([[2100], [2300]]).to(device)
pred = torch.tensor([[1800, 2100, 2400], [2100, 2300, 2600]]).to(device)
Mloss(y_true, pred)

In [None]:
NFOLD = 5
kf = KFold(n_splits=NFOLD)
# Folds are assigned to patients, not to individual rows: one row per
# distinct patient id found in the training frame.
pd_patient = pd.DataFrame({"Patient": tr["Patient"].unique()})

for fold_id, (train_rows, holdout_rows) in enumerate(kf.split(pd_patient)):
    # a patient's fold is the split in which it lands in the held-out part
    pd_patient.loc[holdout_rows, "fold"] = fold_id
pd_patient.head()

In [None]:
# Propagate each patient's fold id to all of that patient's rows with a single
# lookup. The former per-patient loop used chained assignment
# (`tr.fold[mask] = ...`), which writes to a temporary under pandas
# copy-on-write, and it rescanned the whole frame once per patient.
fold_by_patient = pd_patient.set_index('Patient')['fold']
# Patients without an assigned fold keep the sentinel -1.
tr['fold'] = tr['Patient'].map(fold_by_patient).fillna(-1).astype(int)
tr.head()

In [None]:
# Feature matrix of the submission rows, cast to float32.
ze = (sub[FE].values).astype(np.float32)
# Accumulator for the fold-averaged predictions: one row per submission row,
# three columns.
pe = np.zeros((ze.shape[0], 3))
# Mini-batch size passed to every DataLoader.
batch = 128

In [None]:
class Data_Generate(Dataset):
    """Dataset over an indexable feature container with optional labels.

    Without labels, indexing returns the feature row alone. With labels, it
    returns ``(features, label)`` where the label gains a leading axis.
    """

    def __init__(self, data, label=None):
        self.data, self.label = data, label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        features = self.data[index]
        if self.label is None:
            return features
        # index with (None,) to prepend an axis to the selected label
        target = self.label[index][(None,)]
        return features, target

In [None]:
# Unlabelled dataset over the submission features; shuffle=False keeps the
# batches in row order.
test_db = Data_Generate(ze)
test_loader = DataLoader(test_db, batch_size=batch, shuffle=False, num_workers=4)

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.Inf was removed in NumPy 2.0; np.inf is the same value.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one validation result.

        Saves a checkpoint when the loss improves; otherwise increments the
        patience counter and sets ``early_stop`` once it reaches ``patience``.

        Args:
            val_loss: Validation loss of the epoch that just finished.
            model: Module whose ``state_dict`` is saved on improvement.
        """
        # Higher is better internally, so the loss is negated.
        current = -val_loss

        if self.best_score is None:
            self.best_score = current
            self.save_checkpoint(val_loss, model)
        elif current < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = current
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves the model's state_dict to ``self.path`` and records the new best loss.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
%%time
cnt = 0
EPOCHS = 1000
criterion = mloss(0.8)
# Per-epoch histories; they keep growing across all folds.
list_train_loss,list_val_loss,list_train_score,list_val_score = [],[],[],[]

for fold in range(NFOLD):

    val_out = []
    print(f"FOLD {fold+1}")
    #==================load data kfold==========================#
    # Rows whose patient belongs to `fold` are held out; the rest are trained on.
    tr_z = tr[FE][tr.fold!=fold].values.astype(np.float32)
    tr_y = tr.FVC[tr.fold!=fold].values.astype(np.float32)
    val_z = tr[FE][tr.fold==fold].values.astype(np.float32)
    val_y = tr.FVC[tr.fold==fold].values.astype(np.float32)
    train_db = Data_Generate(tr_z,tr_y)
    train_loader = DataLoader(train_db, batch_size=batch, shuffle=True, num_workers=4)
    val_db = Data_Generate(val_z,val_y)
    val_loader = DataLoader(val_db, batch_size=batch, shuffle=False, num_workers=4)
    #==================prepare model==========================#
    tr_num_batch = len(train_loader)
    val_num_batch = len(val_loader)
    net = make_model(in_ch=len(FE)).to(device)
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=0.001, weight_decay=5e-4)
    # The `verbose` keyword is deprecated in recent PyTorch releases; leaving
    # it out matches the previous verbose=False behaviour.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20, min_lr=1e-8)
    # The best weights of this fold are checkpointed to a per-fold file.
    early_stopping = EarlyStopping(patience=80,path=f'Osic-NN-fold_{fold}.pth')

    for epoch in tqdm(range(EPOCHS)):
        # Epoch metrics are averaged over batches, each batch weighted equally.
        train_loss,train_score,val_loss,val_score = 0,0,0,0
        #==================train ==========================#
        net.train()
        for idx, sample in enumerate(train_loader):
            data, label = sample
            data, label = data.to(device), label.to(device)
            out = net(data)
            loss = criterion(label, out)
            score_ = score(label ,out)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()/tr_num_batch
            train_score += score_.item()/tr_num_batch
        list_train_loss.append(train_loss)
        list_train_score.append(train_score)
        #==================val ==========================#
        net.eval()
        # Keep only the latest epoch's validation predictions. Appending
        # across epochs grew this list for the whole run without it being read.
        val_out = []
        with torch.no_grad():
            for idx, sample in enumerate(val_loader):
                data, label = sample
                data, label = data.to(device), label.to(device)
                out = net(data)
                val_out.append(out.cpu().numpy())
                loss = criterion(label, out)
                score_ = score(label, out)
                val_loss += loss.item()/val_num_batch
                val_score += score_.item()/val_num_batch
        list_val_loss.append(val_loss)
        list_val_score.append(val_score)
        early_stopping(val_loss, net)
        if early_stopping.early_stop:
            print("Early stopping")
            break
        scheduler.step(val_loss)

    # These are the metrics of the last epoch that ran, which is not
    # necessarily the checkpointed best epoch.
    print(f"train loss: {train_loss}  train score: {train_score}\n \
          val loss: {val_loss} val score: {val_score}\n \
          final lr: {optimizer.param_groups[0]['lr']}"
         )
  

### PREDICTION

In [None]:
print("predict test...")
for k in range(NFOLD):
    pred = []
    net = make_model(in_ch=len(FE)).to(device)
    # map_location lets the checkpoint load onto whichever device is in use.
    net.load_state_dict(torch.load(f"Osic-NN-fold_{k}.pth", map_location=device))
    net.eval()
    with torch.no_grad():
        for sample in test_loader:
            out = net(sample.to(device))
            # np.int was removed in NumPy 1.24. It was an alias of the builtin
            # int, so the truncation to integers is unchanged.
            pred.append(out.cpu().numpy().astype(int))
    pred = np.concatenate(pred)
    # Each fold contributes an equal share to the running average in `pe`.
    pe += pred / NFOLD

In [None]:
# Column 1 of the averaged predictions is stored as the FVC estimate.
sub['FVC1'] = pe[:, 1]
# Confidence is the absolute gap between prediction columns 2 and 0.
sub['Confidence1'] = abs(pe[:, 2] - pe[:, 0])

In [None]:
# Independent copy holding only the row key and the two predicted columns.
subm = sub[['Patient_Week','FVC1','Confidence1']].copy()

In [None]:
subm.loc[~subm.FVC1.isnull()].head(10)

In [None]:
subm.describe().T

In [None]:
# Rename the temporary prediction columns to 'FVC' and 'Confidence', in place.
subm.rename(columns={'FVC1':'FVC','Confidence1':'Confidence'},inplace=True) 

In [None]:
# Write the three columns to submission.csv without the index.
subm[["Patient_Week","FVC","Confidence"]].to_csv("submission.csv", index=False)