# IMPORT

In [None]:
import copy
import random
import os

from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm import tqdm
from tqdm.notebook import trange
from time import time

import torch
from torch.utils.data import DataLoader, Subset
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.autograd import Variable

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GroupKFold

# When True, the training loop creates one CosineAnnealingLR scheduler per
# quantile model and steps it once per epoch.
LR_SCHEDULE = True

# GLOBAL VARIABLES

In [None]:
class CFG:
    # Central container for paths and hyper-parameters used across the notebook.

    # Directory holding the competition files (train.csv is read from here)
    root_dir = Path('/kaggle/input/osic-pulmonary-fibrosis-progression')
    # Output directory -- presumably where trained models are saved; not used
    # in the visible cells (TODO confirm)
    model_dir = Path('/kaggle/working')
    # Number of GroupKFold splits
    num_kfolds = 5
    # num_workers passed to every DataLoader
    cpu_workers = 4
    # DataLoader batch size
    batch_size = 1
    # Initial learning rate of each Adam optimizer
    learning_rate = 1e-2
    # Epochs per fold; also the T_max of the cosine LR schedulers
    num_epochs = 100
    # One model is trained per quantile with the pinball loss
    quantiles = [0.2, 0.5, 0.8]
    # LSTM parameters
    # Number of time steps per patient sequence; also used as the LSTM hidden
    # size so that the final hidden state has one value per time step
    seq_length = 146
    # Number of feature columns per time step
    n_features = 8
    # Number of stacked LSTM layers
    n_layers = 2

# UTILS

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for reproducible runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed any additional GPUs; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select a different algorithm from run to run,
    # which defeats the deterministic flag above, so it has to be disabled.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = True
    
    
# Helper generator that yields kfold PyTorch datasets
def group_kfold(dataset, groups, n_splits):
    """Yield ``(train, val)`` ``Subset`` pairs, one per GroupKFold split.

    Args:
        dataset: Indexable dataset to split; it is handed to the splitter as
            both the samples and the targets.
        groups: Group label of every sample, passed to ``GroupKFold.split``.
        n_splits: Number of folds.

    Yields:
        Tuple ``(train_subset, val_subset)`` wrapping ``dataset``.
    """
    splitter = GroupKFold(n_splits=n_splits)
    folds = splitter.split(dataset, dataset, groups)
    yield from (
        (Subset(dataset, fit_ids), Subset(dataset, holdout_ids))
        for fit_ids, holdout_ids in folds
    )
        
            
# Helper function with competition metric
def metric(p0, p1, p2, targets):
    """Score quantile predictions with the Laplace log-likelihood formula.

    Args:
        p0: Lower-quantile predictions, array with at least two axes.
        p1: Central predictions, same shape as ``p0``.
        p2: Upper-quantile predictions, same shape as ``p0``.
        targets: Ground-truth values, same shape as ``p0``.

    Returns:
        Array with the score averaged over axis 1.
    """
    root2 = np.sqrt(2)
    # Spread between the outer quantiles, floored at 70
    spread = np.maximum(p2 - p0, 70)
    # Absolute error of the central prediction, capped at 1000
    error = np.minimum(np.absolute(p1 - targets), 1000)
    per_point = -root2 * error / spread - np.log(root2 * spread)
    return np.mean(per_point, 1)

# Loss
def pinball_loss(preds, targets, q):
    """Compute the pinball (quantile) loss for quantile ``q``.

    Args:
        preds: Predicted values, tensor with at least two dimensions.
        targets: Ground-truth tensor matching ``preds``; must not require grad.
        q: Target quantile.

    Returns:
        Scalar tensor: per-element losses summed over dim 1, then averaged.
    """
    assert not targets.requires_grad
    assert preds.size(0) == targets.size(0)
    residual = targets - preds
    # Under-prediction is weighted by q, over-prediction by (1 - q)
    under = q * residual
    over = (q - 1) * residual
    per_element = torch.max(over, under)
    return per_element.sum(dim=1).mean()

# Configure tabular data

In [None]:
# One independent scaler per continuous column. They stay at module level
# because the FVC scaler is needed later to map predictions back to raw units.
scaler_fvc = MinMaxScaler()
scaler_percent = MinMaxScaler()
scaler_age = MinMaxScaler()

# Load the training table and discard every (Patient, Weeks) pair that occurs
# more than once (keep=False removes all copies, not only the extras)
train_df = pd.read_csv(CFG.root_dir / "train.csv")
train_df.drop_duplicates(subset=['Patient', 'Weeks'], keep=False, inplace=True)

# Min-max scale the continuous features
for column, scaler in (('FVC', scaler_fvc),
                       ('Percent', scaler_percent),
                       ('Age', scaler_age)):
    train_df[[column]] = scaler.fit_transform(train_df[[column]])

# Build one row per patient: list-valued columns for the time series first,
# then the per-patient constants. The resulting column order matters because
# ClinicalDataset reads the row positionally.
per_patient = train_df.groupby('Patient')
df = per_patient['Weeks'].apply(list).to_frame().reset_index()
for series_col in ('FVC', 'Percent'):
    df = pd.merge(df, per_patient[series_col].apply(list).to_frame().reset_index(), on="Patient")
for static_col in ('Age', 'Sex', 'SmokingStatus'):
    df = pd.merge(df, per_patient[static_col].first(), on="Patient")

# One-hot encode sex and smoking status; one 0/1 column is appended per
# category, in order of first appearance
COLS = ['Sex', 'SmokingStatus']
for col in COLS:
    categories = df[col].unique()
    for mod in categories:
        df[mod] = df[col].eq(mod).astype(int)

# The original categorical columns are no longer needed
df.drop(columns=COLS, inplace=True)

df.head()

In [None]:
# Quick visual check: plot the (scaled) FVC series stored in row 90 of df
t = df['FVC'].iloc[90]
plt.plot(t)

# DEFINE DATASET

In [None]:
class ClinicalDataset(Dataset):
    """Per-patient sequence dataset built from the aggregated dataframe.

    Rows are read positionally: column 0 is the patient id, columns 1-3 hold
    the lists of weeks, FVC and Percent values, and columns 4-9 hold the
    per-patient constants (age followed by the one-hot encoded columns).

    Each item is a tuple ``(seq, gt, out_ids)``:
        seq: tensor ``[seq_length, n_features]``. Columns 0 and 1 hold the
            first FVC and Percent values at time step 0 (zero elsewhere);
            columns 2-7 repeat the per-patient constants on every time step.
        gt: tensor ``[seq_length]``, zero everywhere except at ``out_ids``,
            where it holds the follow-up FVC values.
        out_ids: list of time-step indices of the follow-up measurements,
            i.e. weeks elapsed since the first measurement minus one.
    """

    def __init__(self, df):
        self.df = df
        self.N = CFG.seq_length
        self.feat = CFG.n_features

    def __len__(self):
        return len(self.df['Patient'].values)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Positional view of the requested dataframe row
        record = self.df.iloc[idx].values

        weeks = np.array(record[1])
        fvc = np.array(record[2])
        percent = np.array(record[3])

        # Input sequence: baseline measurements on the first step only,
        # per-patient constants broadcast over every step
        seq = np.zeros((self.N, self.feat))
        seq[0, 0] = fvc[0]
        seq[0, 1] = percent[0]
        for column, value in enumerate(record[4:10], start=2):
            seq[:, column] = value

        # Expected FVCs, placed relative to the first measurement (only week
        # differences matter, so no absolute shift of the weeks is needed)
        out_ids = weeks[1:] - (weeks[0] + 1)
        gt = np.zeros(self.N)
        gt[out_ids] = fvc[1:]

        return torch.tensor(seq), torch.tensor(gt), out_ids.tolist()

**Check dataset**

In [None]:
# Display the raw values of the first patient row; ClinicalDataset reads these
# values by position
df.iloc[0].values

In [None]:
# Build the dataset on the full dataframe and print the three parts of the
# first sample
dataset = ClinicalDataset(df)

first_sample = dataset[0]
headings = ('Input sequence \n', '\nOutput sequence \n', '\nOutput ids \n')
for heading, part in zip(headings, first_sample):
    print(heading, part)

# DEFINE NETWORK AND LOSS

In [None]:
class LSTM(nn.Module):
    """Stacked LSTM that returns the final hidden state of its last layer.

    Args:
        device: Device the model is meant to run on. Kept as an attribute for
            backward compatibility; the forward pass no longer depends on it.
        input_size: Number of features per time step.
        hidden_size: Size of the hidden state, which is also the width of the
            returned vector.
        num_layers: Number of stacked LSTM layers (dropout of 0.5 is applied
            between layers).
    """

    def __init__(self, device, input_size, hidden_size, num_layers):
        super().__init__()

        self.device = device
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True, dropout=0.5)

    def forward(self, x):
        """Run the LSTM over ``x`` and return the last layer's final state.

        Args:
            x: Batched input of shape ``[batch, seq_len, input_size]``.

        Returns:
            Tensor of shape ``[batch, hidden_size]``.
        """
        # nn.LSTM initialises (h_0, c_0) to zeros on the input's device and
        # dtype when no state is passed, so no explicit zero tensors are needed.
        _, (hn, _) = self.lstm(x)

        # hn is [num_layers, batch, hidden_size]; keep the top layer only
        return hn[-1]

# TRAIN

In [None]:
# Get current device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

seed_everything()

models = []
scores = []

# KFold: every dataframe row (one per patient) is assigned to exactly one
# validation fold, stored in the 'fold' column
gkf = GroupKFold(n_splits=CFG.num_kfolds)

for i, (train_idx, val_idx) in enumerate(gkf.split(df.values, df.values, df['Patient'].values)):
    df.loc[val_idx, 'fold'] = i

df['fold'] = df['fold'].astype(int)

t0 = time()

# Loop through folds
for fold in range(CFG.num_kfolds):

    # Get fold ids
    trn_idx = df[df['fold'] != fold].index
    val_idx = df[df['fold'] == fold].index

    df_train = df.iloc[trn_idx].reset_index(drop=True)
    df_valid = df.iloc[val_idx].reset_index(drop=True)

    # Create dataset
    train_dataset = ClinicalDataset(df_train)
    valid_dataset = ClinicalDataset(df_valid)

    # Create dataloaders
    dataset_sizes = {'train': len(train_dataset),
                     'val': len(valid_dataset)}

    dataloaders = {
        'train': DataLoader(train_dataset, batch_size=CFG.batch_size,
                            shuffle=True, num_workers=CFG.cpu_workers),
        'val': DataLoader(valid_dataset, batch_size=CFG.batch_size,
                          shuffle=False, num_workers=CFG.cpu_workers)
    }
    # Create the models: one independent network per quantile. The hidden size
    # equals the sequence length so the final hidden state holds one value per
    # time step.
    modelq0 = LSTM(device=device, input_size=CFG.n_features, hidden_size=CFG.seq_length, num_layers=CFG.n_layers)
    modelq1 = LSTM(device=device, input_size=CFG.n_features, hidden_size=CFG.seq_length, num_layers=CFG.n_layers)
    modelq2 = LSTM(device=device, input_size=CFG.n_features, hidden_size=CFG.seq_length, num_layers=CFG.n_layers)

    modelq0.to(device)
    modelq1.to(device)
    modelq2.to(device)

    # Optimizer
    optimizerq0 = torch.optim.Adam(modelq0.parameters(), lr=CFG.learning_rate)
    optimizerq1 = torch.optim.Adam(modelq1.parameters(), lr=CFG.learning_rate)
    optimizerq2 = torch.optim.Adam(modelq2.parameters(), lr=CFG.learning_rate)

    # Schedulers
    if LR_SCHEDULE:
        schedulerq0 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizerq0, CFG.num_epochs)
        schedulerq1 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizerq1, CFG.num_epochs)
        schedulerq2 = torch.optim.lr_scheduler.CosineAnnealingLR(optimizerq2, CFG.num_epochs)

    # Per-epoch histories (average loss per sample) for the three quantile
    # models, plus the validation metric
    epoch_loss_train_0 = np.zeros(CFG.num_epochs)
    epoch_loss_train_1 = np.zeros(CFG.num_epochs)
    epoch_loss_train_2 = np.zeros(CFG.num_epochs)

    epoch_loss_val_0 = np.zeros(CFG.num_epochs)
    epoch_loss_val_1 = np.zeros(CFG.num_epochs)
    epoch_loss_val_2 = np.zeros(CFG.num_epochs)
    m = np.zeros(CFG.num_epochs)

    # Loop through epochs
    bar = trange(CFG.num_epochs, desc=f'Training fold {fold + 1}')
    for epoch in bar:
        # Train
        modelq0.train()
        modelq1.train()
        modelq2.train()

        # `mask` holds the time-step indices that carry a ground-truth FVC;
        # the losses are computed on those positions only
        for in_seq, out_seq, mask in dataloaders['train']:

            # Get input and target sequences
            inputs = in_seq.float().to(device) # [bs, N, n_feat]
            targets = out_seq.to(device) # [bs, N]

            # First quantile model
            optimizerq0.zero_grad()
            preds = modelq0(inputs) # [bs, N]
            loss0 = pinball_loss(preds[:, mask], targets[:, mask], CFG.quantiles[0])
            loss0.backward()
            optimizerq0.step()
            epoch_loss_train_0[epoch] += loss0.item()

            # Second quantile model
            optimizerq1.zero_grad()
            preds = modelq1(inputs) # [bs, N]
            loss1 = pinball_loss(preds[:, mask], targets[:, mask], CFG.quantiles[1])
            loss1.backward()
            optimizerq1.step()
            epoch_loss_train_1[epoch] += loss1.item()

            # Third quantile model
            optimizerq2.zero_grad()
            preds = modelq2(inputs) # [bs, N]
            loss2 = pinball_loss(preds[:, mask], targets[:, mask], CFG.quantiles[2])
            loss2.backward()
            optimizerq2.step()
            epoch_loss_train_2[epoch] += loss2.item()

        # Epoch losses
        epoch_loss_train_0[epoch] = epoch_loss_train_0[epoch] / dataset_sizes['train']
        epoch_loss_train_1[epoch] = epoch_loss_train_1[epoch] / dataset_sizes['train']
        epoch_loss_train_2[epoch] = epoch_loss_train_2[epoch] / dataset_sizes['train']

        # Validate
        modelq0.eval()
        modelq1.eval()
        modelq2.eval()

        # No gradients are needed for validation, so skip autograd bookkeeping
        with torch.no_grad():
            for in_seq, out_seq, mask in dataloaders['val']:

                # Get input and target sequences
                inputs = in_seq.float().to(device) # [bs, N, n_feat]
                targets = out_seq.to(device) # [bs, N]

                # Inference
                preds0 = modelq0(inputs) # [bs, N]
                preds1 = modelq1(inputs) # [bs, N]
                preds2 = modelq2(inputs) # [bs, N]

                # Losses
                epoch_loss_val_0[epoch] += pinball_loss(preds0[:, mask], targets[:, mask], CFG.quantiles[0]).item()
                epoch_loss_val_1[epoch] += pinball_loss(preds1[:, mask], targets[:, mask], CFG.quantiles[1]).item()
                epoch_loss_val_2[epoch] += pinball_loss(preds2[:, mask], targets[:, mask], CFG.quantiles[2]).item()

                # Metric, computed on FVC values mapped back to raw units
                p0 = scaler_fvc.inverse_transform(preds0[:, mask].cpu().detach().numpy())
                p1 = scaler_fvc.inverse_transform(preds1[:, mask].cpu().detach().numpy())
                p2 = scaler_fvc.inverse_transform(preds2[:, mask].cpu().detach().numpy())
                gt = scaler_fvc.inverse_transform(targets[:, mask].cpu().detach().numpy())
                # Evaluate on the last 3 FVC measurements: the slice must be
                # [-3:]; [:-3] would select everything except the last three
                m[epoch] += (metric(p0[:, -3:], p1[:, -3:], p2[:, -3:], gt[:, -3:]).sum())

        if LR_SCHEDULE:
            schedulerq0.step()
            schedulerq1.step()
            schedulerq2.step()

        # Epoch losses
        epoch_loss_val_0[epoch] = epoch_loss_val_0[epoch] / dataset_sizes['val']
        epoch_loss_val_1[epoch] = epoch_loss_val_1[epoch] / dataset_sizes['val']
        epoch_loss_val_2[epoch] = epoch_loss_val_2[epoch] / dataset_sizes['val']
        m[epoch] = m[epoch] / dataset_sizes['val']

        # Update progress bar
        bar.set_postfix(q0_loss_train=f'{epoch_loss_train_0[epoch]:0.4f}', q1_loss_train=f'{epoch_loss_train_1[epoch]:0.4f}', q2_loss_train=f'{epoch_loss_train_2[epoch]:0.4f}', \
                       q0_loss_val=f'{epoch_loss_val_0[epoch]:0.4f}', q1_loss_val=f'{epoch_loss_val_1[epoch]:0.4f}', q2_loss_val=f'{epoch_loss_val_2[epoch]:0.4f}', \
                       metric=f'{m[epoch]:0.4}')
    # Plot losses (one panel per quantile model) and the validation metric
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20,5))

    ax1.plot(epoch_loss_train_0, color='blue', label='train loss')
    ax1.plot(epoch_loss_val_0, color='red', label='val loss')
    handles, labels = ax1.get_legend_handles_labels()
    ax1.legend(handles, labels, loc='upper left')

    ax2.plot(epoch_loss_train_1, color='blue', label='train loss')
    ax2.plot(epoch_loss_val_1, color='red', label='val loss')
    handles, labels = ax2.get_legend_handles_labels()
    ax2.legend(handles, labels, loc='upper left')

    ax3.plot(epoch_loss_train_2, color='blue', label='train loss')
    # Label corrected from 'train val' to match the other panels
    ax3.plot(epoch_loss_val_2, color='red', label='val loss')
    handles, labels = ax3.get_legend_handles_labels()
    ax3.legend(handles, labels, loc='upper left')

    ax4.plot(m, color='green', label='metric val')
    handles, labels = ax4.get_legend_handles_labels()
    ax4.legend(handles, labels, loc='upper left')

    plt.show()