# Libraries

In [None]:
# Install the GDCM build shipped as a .deb in the attached `python3gdcm` input, then run
# `apt-get install -f` so apt can repair any dependencies that dpkg left unresolved.
#%cd ../input/python3gdcm
!dpkg -i ../input/python3gdcm/build_1-1_amd64.deb
!apt-get install -f

In [None]:
# Copy the GDCM Python bindings and shared libraries from /usr/local/lib into the conda
# site-packages directory, then refresh the dynamic linker cache.
# NOTE(review): the destination hard-codes python3.7 — confirm it matches the kernel's Python.
!cp /usr/local/lib/gdcm.py /opt/conda/lib/python3.7/site-packages/.
!cp /usr/local/lib/gdcmswig.py /opt/conda/lib/python3.7/site-packages/.
!cp /usr/local/lib/_gdcmswig.so /opt/conda/lib/python3.7/site-packages/.
!cp /usr/local/lib/libgdcm* /opt/conda/lib/python3.7/site-packages/.
!ldconfig

In [None]:
import cv2
import numpy as np
import pandas as pd
import pickle
import glob
from pathlib import Path
import os
import math
import random
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from PIL import Image
from sklearn.metrics import mean_absolute_error
from sklearn import decomposition
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, QuantileTransformer
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit
import itertools
from functools import partial
import umap
import gdcm
import scipy as sp

import pydicom
from pydicom.tag import Tag

import scipy.ndimage as ndimage
from scipy.ndimage import zoom
from scipy.stats import kurtosis
from scipy.stats import skew

from sklearn.cluster import KMeans
from skimage import measure, morphology, segmentation

from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso, BayesianRidge, ElasticNet
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib_venn import venn2
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
style.use('seaborn-colorblind')
pd.options.display.max_columns = None

import warnings
warnings.filterwarnings('ignore')

In [None]:
# keras
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import callbacks
from tensorflow.keras import optimizers
from tensorflow.keras import models
from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras import layers
import tensorflow_addons as tfa
from tensorflow.keras.utils import Sequence
import tensorflow.keras.backend as K 

def seed_everything(seed : int) -> None:
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and TensorFlow's
    global generator, and exports ``PYTHONHASHSEED`` to the environment.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Exported for any process started later; the hash seed of the interpreter that is
    # already running is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    
# NOTE(review): seeds with the literal 64, whereas the config cell defines SEED = 42 — confirm this is intended.
seed_everything(64)

# Config

In [None]:
# CONFIG
# ---- run-level settings ----------------------------------------------------
INPUT_DIR = "../input/osic-pulmonary-fibrosis-progression"  # root of the competition data
SEED = 42                           # shared by the KFold splitters and the model parameter dicts
NFOLD = 5                           # number of cross-validation folds
SCALER = 'MinMax'                   # 'MinMax' or 'Standard'
SAVE_BEST = True                    # forwarded to ModelCheckpoint(save_best_only=...)
SA = 3
FOLD_TYPE = 'StratifiedGroupKFold'  # 'GroupKFold' | 'KFold' | 'StratifiedKFold' | 'StratifiedGroupKFold'
EARLY_STOP = 80                     # early-stopping rounds for the LightGBM / CatBoost dicts
verbosity = 1000
VAL_STRATEGY = 'cv'
EPOCHS = 16
BATCH_SIZE = 32
LR = 0.001                          # learning rate of the CNN optimizer
w = [1, 1]
MAGIC = 1
DEBUG = False

# ---- MLP ---------------------------------------------------------------------
nn_params = {
    'input_dropout': 0.0,
    'hidden_layers': 3,
    'hidden_units': 128,
    'embedding_out_dim': 4,
    'hidden_activation': 'Mish',
    'hidden_dropout': 0.32,
    'gauss_noise': 0.0001,
    'norm_type': 'none',  # 'batch', 'layer', or anything else for no normalisation
    'optimizer': {'type': 'radam', 'lr': 1e-1},
    'batch_size': 128,
    'epochs': 80,
}

# ---- small MLP ---------------------------------------------------------------
snn_params = {
    'input_dropout': 0.0,
    'hidden_layers': 2,
    'hidden_units': 64,
    'embedding_out_dim': 4,
    'hidden_activation': 'relu',
    'hidden_dropout': 0.16,
    'gauss_noise': 0.0001,
    'norm_type': 'none',  # 'batch', 'layer', or anything else for no normalisation
    'optimizer': {'type': 'adam', 'lr': 1e-1},
    'batch_size': 128,
    'epochs': 80,
}

# ---- XGBoost -----------------------------------------------------------------
xgb_params = {
    'colsample_bytree': 0.32,
    'learning_rate': 0.04,
    'max_depth': 4,
    'subsample': 1,
    'min_child_weight': 4,
    'gamma': 0.24,
    'alpha': 0,
    'lambda': 1,
    'seed': SEED,
    'n_estimators': 240000,
    'objective': 'reg:squarederror',
    'eval_metric': "rmse",
}

# ---- LightGBM ----------------------------------------------------------------
lgb_params = {
    'n_estimators': 240000,
    'boosting_type': 'gbdt',
    'max_depth': 3,
    'learning_rate': 0.08,
    'subsample': 0.72,
    'subsample_freq': 4,
    'feature_fraction': 0.24,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'seed': SEED,
    'early_stopping_rounds': EARLY_STOP,
    'objective': 'huber',
    'metric': "huber",
}

# ---- CatBoost ----------------------------------------------------------------
catb_params = {
    'task_type': "CPU",
    'learning_rate': 0.04,
    'iterations': 240000,
    'colsample_bylevel': 0.01,
    'random_seed': SEED,
    'use_best_model': True,
    'early_stopping_rounds': EARLY_STOP,
    'loss_function': "MAE",
    'eval_metric': "MAE",
}

# ---- Linear ------------------------------------------------------------------
lin_params = {
    'alpha': 8,
    'fit_intercept': True,
    'max_iter': 8000,
    'tol': 1e-06,
    'random_state': SEED,
}

# ---- BayesianRidge -----------------------------------------------------------
br_params = {
    'n_iter': 8000,
    'fit_intercept': True,
    'tol': 1e-06,
}

# ---- SVR ---------------------------------------------------------------------
svm_params = {
    'C': 8,
    'cache_size': 2400.0,
    'max_iter': 8000,
    'verbose': True,
}


# Load data

In [None]:
def read_tabular():
    """Load the three competition CSV files found under INPUT_DIR.

    Returns:
        Tuple ``(train, test, sub)`` of DataFrames read from ``train.csv``,
        ``test.csv`` and ``sample_submission.csv`` respectively.
    """
    csv_names = ('/train.csv', '/test.csv', '/sample_submission.csv')
    train, test, sub = (pd.read_csv(INPUT_DIR + name) for name in csv_names)
    return train, test, sub
train, test, sub = read_tabular()

In [None]:
# keep=False removes every row of a duplicated (Patient, Weeks) pair, so no copy of a
# repeated measurement is retained. The drop is applied to `train` in place.
train.drop_duplicates(keep=False, inplace=True, subset=['Patient','Weeks'])
print(train.shape)
train.head()

In [None]:
# Shape and first rows of the raw test table.
print(test.shape)
test.head()

In [None]:
# Split every "<patient id>_<week>" identifier into its two parts: the text before the
# first underscore is the patient, the text after the last underscore is the week number.
patient_week = sub['Patient_Week']
sub['Patient'] = [pw.split('_')[0] for pw in patient_week]
sub['Weeks'] = [int(pw.split('_')[-1]) for pw in patient_week]
print(sub.shape)
sub.head()

# Feature engineering

In [None]:
# Give every submission row the attributes of its patient from the test table (test's own
# Weeks column is dropped so the submission week is the one kept), tag each frame with
# its origin in a 'where' column, and stack the three frames into one.
sub = pd.merge(sub[['Patient','Weeks','Confidence','Patient_Week']], test.drop(columns=['Weeks']), on='Patient')
train['where'] = 'train'
test['where'] = 'test'
sub['where'] = 'sub'
data = pd.concat([train, test, sub], ignore_index=True)
print(data.shape)

In [None]:
# construct train input
def fe(data):
    """Attach per-patient baseline features and split the frame back into its parts.

    Adds four columns:
      * ``min_week``     - earliest week with a real measurement for the patient,
      * ``base_FVC``     - FVC recorded at that week,
      * ``base_Percent`` - Percent recorded at that week,
      * ``base_week``    - ``Weeks - min_week``.

    Args:
        data: Concatenated frame with ``Patient``, ``Weeks``, ``FVC``, ``Percent`` and a
            ``where`` column holding 'train', 'test' or 'sub'. A ``min_week`` column is
            added to it in place.

    Returns:
        Tuple ``(train, test, sub)``: the rows of each origin with a fresh index.
    """
    data['min_week'] = data['Weeks']
    # Submission rows list every week that has to be predicted, not visits that took
    # place, so they must not take part in finding the baseline week. Masking the
    # 'test' rows instead (as before) discarded the only real visit of a test patient
    # and made the earliest requested week the baseline.
    data.loc[data['where'] == 'sub', 'min_week'] = np.nan
    data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

    # One baseline row per patient: the first row recorded at the baseline week.
    base = (
        data.loc[data['Weeks'] == data['min_week'], ['Patient', 'FVC', 'Percent']]
        .drop_duplicates(subset='Patient', keep='first')
        .rename(columns={'FVC': 'base_FVC', 'Percent': 'base_Percent'})
    )

    data = data.merge(base, on='Patient', how='left')
    data['base_week'] = data['Weeks'] - data['min_week']

    train = data.loc[data['where'] == 'train', :].reset_index(drop=True)
    test = data.loc[data['where'] == 'test', :].reset_index(drop=True)
    sub = data.loc[data['where'] == 'sub', :].reset_index(drop=True)

    return train, test, sub
train, test, sub = fe(data)

In [None]:
# Shape and first rows of the training frame after the baseline features were added.
print(train.shape)
train.head()

In [None]:
# From here on `test` is the submission-row frame returned by fe(); the name no longer
# refers to the frame built from the 'test' rows.
test = sub
print(test.shape)
test.head()

# CNN

In [None]:
def load_image(path):
    """Read one DICOM file and return it as a 512x512 float image scaled to 0-255.

    Args:
        path: Path of the DICOM file to read.

    Returns:
        2-D float array of shape (512, 512). Negative values are clipped to 0 and the
        result is divided by the image maximum, so the brightest pixel maps to 255.
        An image whose maximum is not positive is returned as all zeros.
    """
    ds = pydicom.dcmread(path)
    resized = cv2.resize(ds.pixel_array / 2**11, (512, 512))
    img_2d = resized.astype(float)  # float avoids overflow/underflow in the rescaling below

    peak = img_2d.max()
    if not peak > 0:
        # Blank image: dividing by a zero maximum would fill the whole array with NaN.
        return np.zeros_like(img_2d)

    # Rescale the grey values to the 0-255 range.
    return (np.maximum(img_2d, 0) / peak) * 255.0

def find_contour(img_thr, find_max_con=False):
    """Extract contour masks from a thresholded image.

    Args:
        img_thr: Thresholded single-channel image passed to ``cv2.findContours``.
        find_max_con: When True, keep only the region of the largest-area contour.

    Returns:
        * ``find_max_con=True``: ``img_thr`` multiplied by a 0/1 mask of the largest
          contour (all zeros when no contour exists).
        * otherwise: tuple ``(ext_contours, int_contours)`` of float images in which
          the outer contours, respectively the inner (hole) contours, are filled
          with 255.
    """
    # OpenCV 4 returns (contours, hierarchy) and OpenCV 3 returns (image, contours,
    # hierarchy); the last two items are the same in both.
    contours, hierarchy = cv2.findContours(img_thr, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2:]

    # Max area contour
    if find_max_con:
        mask = np.zeros(img_thr.shape)
        if len(contours) > 0:
            max_con = max(contours, key=cv2.contourArea)
            # NOTE(review): fillConvexPoly is meant for convex polygons; confirm it is
            # appropriate for the contours produced here.
            mask = cv2.fillConvexPoly(mask, max_con, 1.0)
        return img_thr * mask

    ext_contours = np.zeros(img_thr.shape)
    int_contours = np.zeros(img_thr.shape)
    # hierarchy is None when no contour was found; both masks then stay empty.
    if hierarchy is not None:
        for i in range(len(contours)):
            # A parent index of -1 marks an external contour; anything else is internal.
            canvas = ext_contours if hierarchy[0][i][3] == -1 else int_contours
            cv2.drawContours(canvas, contours, i, 255, -1)
    return ext_contours, int_contours

def seg_lung(org_img, in_contours_img):
    """Combine an image with the inverted, cleaned-up version of a contour mask.

    The mask is opened with a 5x5 rectangular kernel, dilated twice with the same
    kernel, converted to uint8 and inverted; the result is OR-ed bitwise with the
    uint8 version of ``org_img``.

    Args:
        org_img: Image to combine with the mask (cast to uint8).
        in_contours_img: Mask image, e.g. the internal contours of ``find_contour``.

    Returns:
        uint8 image ``org_img | ~mask``.
    """
    square5 = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))

    opened = cv2.morphologyEx(in_contours_img, cv2.MORPH_OPEN, square5)
    grown = cv2.dilate(opened, square5, iterations=2)

    inverted = cv2.bitwise_not(grown.astype('uint8'))
    return cv2.bitwise_or(org_img.astype('uint8'), inverted)

In [None]:
# Per-patient least-squares line FVC ~ a * Weeks + b; the slope `a` and the intercept `b`
# are written to every row of the patient.
train['a'] = 0.0  # float from the start, since the fitted coefficients are floats
train['b'] = 0.0
for p in tqdm(train.Patient.unique()):
    is_patient = train['Patient'] == p
    # Named `patient_rows` (not `sub`) so the submission frame `sub` is not overwritten.
    patient_rows = train.loc[is_patient, :]
    fvc = patient_rows.FVC.values
    weeks = patient_rows.Weeks.values
    c = np.vstack([weeks, np.ones(len(weeks))]).T
    # rcond=None selects NumPy's current default cutoff and avoids the FutureWarning.
    a, b = np.linalg.lstsq(c, fvc, rcond=None)[0]

    train.loc[is_patient, 'a'] = a
    train.loc[is_patient, 'b'] = b

In [None]:
def load_imgs(p, data='train'):
    """Load the middle slices of one patient's scan.

    A file is kept when its numeric name (the file name without its last four
    characters), divided by the number of files in the directory, lies strictly
    between 0.3 and 0.7.

    Args:
        p: Patient id, i.e. the name of the patient's directory.
        data: Sub-directory of the competition data to read from ('train' or 'test').

    Returns:
        Array of the loaded images with a trailing channel axis added. When no file
        qualifies the result is an empty array of shape (0, 1).
    """
    patient_dir = f'../input/osic-pulmonary-fibrosis-progression/{data}/{p}/'
    ldir = os.listdir(patient_dir)
    n_files = len(ldir)

    x = []
    for fname in ldir:
        position = int(fname[:-4]) / n_files
        if 0.3 < position < 0.7:
            x.append(load_image(patient_dir + fname))

    return np.expand_dims(x, axis=-1)

In [None]:
def get_model(shape=(512, 512, 1)):
    """Build and compile the 2-D CNN that maps one image to a single regression output.

    Layout: two 32-filter conv / batch-norm / LeakyReLU layers, then five stages of a
    3x3 convolution followed by residual blocks (8, 16, 32, 64 and 128 filters), with
    2x2 average pooling in front of each stage. The stages are followed by global
    average pooling (layer named 'hidden', 128 values), dropout 0.4 and a one-unit
    dense head.

    Args:
        shape: Input shape (height, width, channels).

    Returns:
        Model compiled with MAE loss and a RectifiedAdam optimizer wrapped in SWA.
    """
    def res_block(x, n_features):
        # Pre-activation residual block: BN -> LeakyReLU -> 3x3 conv, added to the input.
        # The Add requires x to already have n_features channels.
        _x = x
        x = layers.BatchNormalization()(x)
        x = layers.LeakyReLU(0.05)(x)
    
        x = layers.Conv2D(n_features, kernel_size=(3, 3), strides=(1, 1), padding='same')(x)
        x = layers.Add()([_x, x])
        return x
    
    inp = layers.Input(shape=shape)
    
    # The "# NNN" markers below give the spatial size of the feature map for the default
    # 512x512 input; it halves at every average pooling.
    # 512
    x = layers.Conv2D(32, kernel_size=(3, 3), strides=(1, 1), padding='same', input_shape=shape)(inp)
    x = layers.BatchNormalization()(x)
    x = layers.LeakyReLU(0.05)(x)
    
    x = layers.Conv2D(32, kernel_size=(3, 3), strides=(1, 1), padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.LeakyReLU(0.05)(x)
    
    x = layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2))(x)
    
    # 256
    x = layers.Conv2D(8, kernel_size=(3, 3), strides=(1, 1), padding='same')(x)
    for _ in range(2):
        x = res_block(x, 8)
    x = layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2))(x)
    
    # 128
    x = layers.Conv2D(16, kernel_size=(3, 3), strides=(1, 1), padding='same')(x)
    for _ in range(2):
        x = res_block(x, 16)
    x = layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2))(x)
    
    # 64
    x = layers.Conv2D(32, kernel_size=(3, 3), strides=(1, 1), padding='same')(x)
    for _ in range(3):
        x = res_block(x, 32)
    x = layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2))(x)
    
    # 32
    x = layers.Conv2D(64, kernel_size=(3, 3), strides=(1, 1), padding='same')(x)
    for _ in range(3):
        x = res_block(x, 64)
    x = layers.AveragePooling2D(pool_size=(2, 2), strides=(2, 2))(x)    
    
    # 16
    x = layers.Conv2D(128, kernel_size=(3, 3), strides=(1, 1), padding='same')(x)
    for _ in range(3):
        x = res_block(x, 128)
        
    # pool: the 'hidden' layer output is what the feature-extraction cell reads back
    x = layers.GlobalAveragePooling2D(name='hidden')(x)
    x = layers.Dropout(0.4)(x) 
    out = layers.Dense(1)(x)
    
    # compile
    model = models.Model(inputs=inp, outputs=out)
    # NOTE(review): `lr` is the legacy spelling of `learning_rate`; confirm the installed
    # TensorFlow / tensorflow_addons versions still accept it.
    opt = tfa.optimizers.RectifiedAdam(lr=LR)
    opt = tfa.optimizers.SWA(opt)
    model.compile(optimizer=opt, loss="mae")
    return model

In [None]:
# Build the CNN once to print its layer summary.
model = get_model()
model.summary()

In [None]:
# Patient-level K-fold over the unique patient ids.
# NOTE(review): the pretrained fold-{k}.h5 weights are only out-of-fold for the validation
# patients if they were trained with this same split — confirm.
kf = KFold(n_splits=NFOLD, random_state=42, shuffle=True)
ndim = 128  # width of the 'hidden' layer output
train_patients = train['Patient'].unique()
test_patients = test['Patient'].unique()

# Per-patient feature tables. Train starts as NaN (filled when the patient is a validation
# patient of some fold); test starts at zero and accumulates the average over folds.
tr_vec = {
    'average': pd.DataFrame(np.nan * np.ones((len(train_patients), ndim))),
    'max': pd.DataFrame(np.nan * np.ones((len(train_patients), ndim)))
         }
te_vec = {
    'average': pd.DataFrame(np.zeros((len(test_patients), ndim))),
    'max': pd.DataFrame(np.zeros((len(test_patients), ndim)))
    }
for t in ['average', 'max']:
    tr_vec[t]['Patient'] = train_patients
    te_vec[t]['Patient'] = test_patients

# Float from the start: the predictions written below are floats.
train['cnn'] = 0.0
test['cnn'] = 0.0

for fold, (tr_idx, val_idx) in enumerate(kf.split(train_patients)):
    print('#####################')
    print('####### Fold %i ######'%fold)
    print('#####################')
    print('Training...')

    # These callbacks are created but not passed to any fit() call in this cell; the
    # weights are loaded from disk below.
    er = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta=1e-3,
        patience=10,
        verbose=1,
        mode="auto",
        baseline=None,
        restore_best_weights=True,
    )

    cpt = tf.keras.callbacks.ModelCheckpoint(
        filepath='fold-%i.h5'%fold,
        monitor='val_loss',
        verbose=1,
        save_best_only=SAVE_BEST,
        mode='auto'
    )

    rlp = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        verbose=1,
        min_lr=1e-8
    )
    model = get_model()

    # load pretrained weights
    model.load_weights(f'../input/osic-2dcnn/fold-{fold}.h5')

    # extract CNN features from the global-average-pooling layer
    hidden_model = models.Model(inputs=model.input,
                         outputs=model.get_layer('hidden').output)

    # val
    for p in tqdm(train_patients[val_idx]):
        x = load_imgs(p, data='train')

        # CNN features: mean and max over the patient's slices
        try:
            cnn_vec = hidden_model.predict(x)
            me = np.mean(cnn_vec, axis=0)
            ma = np.max(cnn_vec, axis=0)
            # Both tables share the same patient order and index, so one mask serves both.
            row = tr_vec['average']['Patient'] == p
            for k in range(ndim):
                tr_vec['average'].loc[row, k] = me[k]
                tr_vec['max'].loc[row, k] = ma[k]
        except Exception as err:
            # Best effort: the patient keeps NaN features. Report why, and let
            # KeyboardInterrupt / SystemExit propagate.
            print(p, ' cnn feats error', repr(err))
            continue

        # true model: the median slice prediction is used as the FVC slope per week
        try:
            a = model.predict(x)
            cond = train['Patient'] == p
            train.loc[cond, 'cnn'] = np.median(a) * (train.loc[cond, 'Weeks'] - train.loc[cond, 'min_week']) + train.loc[cond, 'base_FVC']

        except Exception as err:
            print(p, ' cnn pred error', repr(err))
            continue

    # test: every fold's model contributes 1 / NFOLD of the final value
    for p in tqdm(test_patients):
        x = load_imgs(p, data='test')

        # CNN features
        cnn_vec = hidden_model.predict(x)
        me = np.mean(cnn_vec, axis=0)
        ma = np.max(cnn_vec, axis=0)
        row = te_vec['average']['Patient'] == p
        for k in range(ndim):
            te_vec['average'].loc[row, k] += me[k] / NFOLD
            te_vec['max'].loc[row, k] += ma[k] / NFOLD

        # true model
        a = model.predict(x)
        cond = test['Patient'] == p
        test.loc[cond, 'cnn'] += (np.median(a) * (test.loc[cond, 'Weeks'] - test.loc[cond, 'min_week']) + test.loc[cond, 'base_FVC']) / NFOLD
        

In [None]:
%%time

# CNN feats
ndim = 8  # number of PCA / UMAP components kept per pooling type
tr_cnn_df = pd.DataFrame()
te_cnn_df = pd.DataFrame()
tr_cnn_df['Patient'] = tr_vec['average']['Patient'].values
te_cnn_df['Patient'] = te_vec['average']['Patient'].values

# Integer column labels of the 128 CNN features in tr_vec / te_vec.
feats = list(range(128))

# pca
for t in ['average', 'max']:
    # fillna: missing features are replaced by the train median. numeric_only=True skips
    # the string 'Patient' column, which recent pandas versions refuse to take a median of.
    train_median = tr_vec[t].median(numeric_only=True)
    te_vec[t] = te_vec[t].fillna(train_median)
    tr_vec[t] = tr_vec[t].fillna(train_median)

    # scaling (fitted on train only)
    scaler = StandardScaler()
    tr_vec[t][feats] = scaler.fit_transform(tr_vec[t][feats].values)
    te_vec[t][feats] = scaler.transform(te_vec[t][feats].values)

    # dimensionality reduction: PCA to 80 components, then UMAP on the PCA output
    trans = decomposition.PCA(n_components=80)
    train_dist = trans.fit_transform(tr_vec[t][feats].values)
    test_dist = trans.transform(te_vec[t][feats].values)

    trans = umap.UMAP(n_components=ndim, random_state=SEED)
    train_dist2 = trans.fit_transform(train_dist)
    test_dist2 = trans.transform(test_dist)

    # add to df: only the first `ndim` PCA components are kept
    for k in range(ndim):
        tr_cnn_df['pca_' + t + str(k)] = train_dist[:, k]
        te_cnn_df['pca_' + t + str(k)] = test_dist[:, k]
        tr_cnn_df['umap_' + t + str(k)] = train_dist2[:, k]
        te_cnn_df['umap_' + t + str(k)] = test_dist2[:, k]

In [None]:
# add cnn feats: left-join the per-patient PCA / UMAP columns onto every row of the patient
train = pd.merge(train, tr_cnn_df, how='left', on='Patient')
test = pd.merge(test, te_cnn_df, how='left', on='Patient')

In [None]:
# Shape and first rows of train after the CNN features were merged in.
print(train.shape)
train.head()

In [None]:
# Shape and first rows of test after the CNN features were merged in.
print(test.shape)
test.head()

# Final feature engineering

In [None]:
def add_lasts(train):
    """Flag each patient's last three rows in a new ``validation`` column.

    For every patient the weeks of the last three rows (in row order) get the values
    1, 2 and 3; all other rows get 0. Patients with fewer than three rows use only the
    first values of that sequence.

    Args:
        train: Frame with ``Patient`` and ``Weeks`` columns; modified in place.

    Returns:
        The same frame, with the ``validation`` column added.
    """
    train['validation'] = 0
    for patient in train['Patient'].unique():
        is_patient = train['Patient'] == patient
        tail_weeks = train.loc[is_patient, 'Weeks'].values[-3:]
        for rank, week in enumerate(tail_weeks, start=1):
            train.loc[is_patient & (train['Weeks'] == week), 'validation'] = rank
    return train

train = add_lasts(train)

def calculate_height(row):
    """Derive a height-like value from baseline FVC, age and sex.

    Computes ``base_FVC / (c0 - c1 * Age)`` with ``(c0, c1) = (27.63, 0.112)`` when
    ``Sex`` is 'Male' and ``(21.78, 0.101)`` for any other value.

    Args:
        row: Mapping or Series with ``Sex``, ``Age`` and ``base_FVC``.

    Returns:
        The computed value.
    """
    intercept, age_coef = (27.63, 0.112) if row['Sex'] == 'Male' else (21.78, 0.101)
    return row['base_FVC'] / (intercept - age_coef * row['Age'])

def fe2(data):
    """Add demographic, smoking-status and interaction features.

    New columns, in the order they are created:
      * ``height`` from ``calculate_height``,
      * ``is_exsmoker`` / ``is_neversmoker`` / ``is_nowsmoker`` flags and their
        combination ``is_smoker`` (ex + current),
      * ``base_FVCxPercent`` (product) and ``base_FVCwPercent`` (Percent / FVC),
      * ``Sex_*``, ``Age_*`` and ``Age_Sex_*`` products.
    ``Sex`` is re-encoded as Male=1 / Female=0 and ``SmokingStatus`` is dropped.

    Args:
        data: Frame with ``Sex``, ``Age``, ``SmokingStatus``, ``base_FVC`` and
            ``base_Percent``; modified in place.

    Returns:
        The same frame with the features added.
    """
    # Must run before Sex is re-encoded: calculate_height compares Sex with 'Male'.
    data['height'] = data.apply(calculate_height, axis=1)

    status_flags = (
        ('is_exsmoker', 'Ex-smoker'),
        ('is_neversmoker', 'Never smoked'),
        ('is_nowsmoker', 'Currently smokes'),
    )
    for flag, label in status_flags:
        data[flag] = data['SmokingStatus'].apply(lambda status, label=label: 1 if label in status else 0)
    data['is_smoker'] = data['is_exsmoker'] + data['is_nowsmoker']
    data['Sex'] = data['Sex'].map({'Male': 1, 'Female': 0})

    data['base_FVCxPercent'] = data['base_Percent'] * data['base_FVC']
    data['base_FVCwPercent'] = data['base_Percent'] / data['base_FVC']

    # interaction
    smoking = ['is_exsmoker', 'is_neversmoker', 'is_nowsmoker', 'is_smoker']
    for col in smoking + ['base_FVC', 'base_Percent']:
        data['Sex_' + col] = data['Sex'] * data[col]
    for col in ['base_FVC', 'Sex'] + smoking + ['base_Percent']:
        data['Age_' + col] = data['Age'] * data[col]
    for col in ['base_FVC'] + smoking + ['base_Percent']:
        data['Age_Sex_' + col] = data['Age'] * data['Sex'] * data[col]

    # drops
    data.drop(columns=['SmokingStatus'], inplace=True)

    return data

train = fe2(train)
test = fe2(test)

def fe3(train, test):
    """Add features measuring how far ``base_FVC`` is from its category statistics.

    For each grouping of the categorical columns, the mean, max and min of
    ``base_FVC`` are computed on ``train`` and merged into both frames; each merged
    column is then replaced by ``base_FVC - statistic``. Column names follow
    ``base_FVC_<stat>_<category columns joined by '_'>``.

    Args:
        train: Training frame; also the source of the statistics.
        test: Frame that receives the same statistics.

    Returns:
        Tuple ``(train, test)`` of new frames with the columns added.
    """
    target_feats = ['base_FVC']
    cat_feats = ['Sex', 'is_smoker']
    stats = ['mean', 'max', 'min']

    # NOTE(review): range(len(cat_feats) - 1) only yields r = 0, so only single-column
    # groupings are built and the ('Sex', 'is_smoker') pair never is. Kept as is,
    # because changing it would alter the feature set — confirm the intent.
    for r in range(len(cat_feats)-1):
        for c in itertools.combinations(cat_feats, r+1):
            keys = list(c)
            suf = '_'.join(keys)
            for t in target_feats:
                names = {s: t + '_' + s + '_' + suf for s in stats}
                tmp = train.groupby(keys)[t].agg(stats).reset_index().rename(columns=names)
                train = pd.merge(train, tmp, how='left', on=keys)
                test = pd.merge(test, tmp, how='left', on=keys)

                # diff: turn each statistic into the row's distance from it
                for col in names.values():
                    train[col] = train[t] - train[col]
                    test[col] = test[t] - test[col]
    return train, test

train, test = fe3(train, test)

In [None]:
# Distribution of the per-patient slope `a` (presumably from train['a'].describe()):
# min      -28.182575
# 25%       -7.588524
# 50%       -3.909110
# 75%       -0.950083
# max       14.682612
# group by a: four buckets cut at roughly the quartiles; slopes below -7.58 keep group 0.
train['group_a'] = 0
train.loc[(-7.58 <= train['a']) & (train['a'] < -3.9), 'group_a'] = 1
train.loc[(-3.9 <= train['a']) & (train['a'] < -0.95), 'group_a'] = 2
# `<=` so a slope of exactly -0.95 lands in group 3 instead of matching no bucket and
# falling back to group 0.
train.loc[-0.95 <= train['a'], 'group_a'] = 3
train['group_a'].hist()

In [None]:
# Shape and first rows of train after the final feature engineering.
print(train.shape)
train.head()

In [None]:
# Shape and first rows of test after the final feature engineering.
print(test.shape)
test.head()

# Model

In [None]:
import random
from collections import Counter, defaultdict
from sklearn import model_selection

# ---- GroupKFold ----
class GroupKFold(object):
    """
    K-fold splitter that keeps all rows of a group in the same fold.

    The unique values of the group column are split with sklearn's KFold
    (optionally shuffled), and every row follows its group.
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits, self.shuffle, self.random_state = n_splits, shuffle, random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the configured number of folds; the arguments are ignored."""
        return self.n_splits

    def split(self, X, y, group):
        """Yield ``(train_idx, val_idx)`` arrays of row positions for each fold.

        Args:
            X: DataFrame containing the group column.
            y: Unused; present for a sklearn-like signature.
            group: Name of the column in ``X`` that identifies the groups.
        """
        group_col = X[group]
        unique_ids = group_col.unique()
        splitter = model_selection.KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state)
        for tr_pos, va_pos in splitter.split(unique_ids):
            # Map the fold's group ids back to the row positions that carry them.
            in_train = group_col.isin(unique_ids[tr_pos])
            in_valid = group_col.isin(unique_ids[va_pos])
            yield np.where(in_train)[0], np.where(in_valid)[0]
            
# ---- StratifiedGroupKFold ----
class StratifiedGroupKFold(object):
    """
    Group K-fold that also balances the label distribution across folds.

    Whole groups are assigned greedily, one at a time, to the fold that keeps the
    per-label share of samples most even across folds. Groups are shuffled with
    ``random_state`` and then processed in order of decreasing spread of their label
    counts. The ``shuffle`` attribute is stored but the shuffle is always performed.
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits, self.shuffle, self.random_state = n_splits, shuffle, random_state

    def get_n_splits(self, X=None, y=None, group=None):
        """Return the configured number of folds; the arguments are ignored."""
        return self.n_splits

    def split(self, X, y, group):
        """Yield ``(train_idx, test_idx)`` lists of row positions for each fold.

        Args:
            X: DataFrame containing the group column.
            y: Integer labels (0 .. max) aligned with the rows of ``X``.
            group: Name of the column in ``X`` that identifies the groups.
        """
        n_labels = np.max(y) + 1
        group_ids = X[group].values

        # Label histogram of every group, plus the overall label totals.
        hist_by_group = defaultdict(lambda: np.zeros(n_labels))
        label_totals = Counter()
        for lbl, gid in zip(y, group_ids):
            hist_by_group[gid][lbl] += 1
            label_totals[lbl] += 1

        hist_by_fold = defaultdict(lambda: np.zeros(n_labels))
        members_by_fold = defaultdict(set)

        def imbalance_if_added(hist, fold):
            # Tentatively add the group to `fold`, measure the mean (over labels) of the
            # standard deviation of the per-fold label shares, then undo the addition.
            hist_by_fold[fold] += hist
            spreads = []
            for lbl in range(n_labels):
                shares = [hist_by_fold[k][lbl] / label_totals[lbl] for k in range(self.n_splits)]
                spreads.append(np.std(shares))
            hist_by_fold[fold] -= hist
            return np.mean(spreads)

        candidates = list(hist_by_group.items())
        random.Random(self.random_state).shuffle(candidates)
        candidates = sorted(candidates, key=lambda item: -np.std(item[1]))

        for gid, hist in candidates:
            chosen, lowest = None, None
            for fold in range(self.n_splits):
                cost = imbalance_if_added(hist, fold)
                # Strict comparison: on ties the lowest fold index wins.
                if lowest is None or cost < lowest:
                    chosen, lowest = fold, cost
            hist_by_fold[chosen] += hist
            members_by_fold[chosen].add(gid)

        every_group = set(group_ids)
        for fold in range(self.n_splits):
            kept = every_group - members_by_fold[fold]
            held_out = members_by_fold[fold]

            train_idx = [pos for pos, gid in enumerate(group_ids) if gid in kept]
            test_idx = [pos for pos, gid in enumerate(group_ids) if gid in held_out]

            yield train_idx, test_idx

In [None]:
# Column roles used by the tabular models.
ID = 'Patient_Week'
target = 'FVC'
group = 'Patient'
# Columns that are not model inputs. Names that do not exist in `train` are harmless:
# the list is only used as a filter.
dropcols = [
    ID, 'test_patient', 'where', 'Confidence', 'validation', 'a', 'b', 'group_a',
    target, group, 'Percent', 'cnn',
]
# Every remaining column of train, in its original order, is a model input.
features = [col for col in train.columns.values.tolist() if col not in dropcols]

In [None]:
# Number and names of the model input columns.
print(len(features))
features

# Linear decay

In [None]:
# Constants of the `score` metric: C1 is the lower bound applied to sigma, C2 the upper
# bound applied to the absolute FVC error.
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")

from tensorflow.keras.layers import Activation
from tensorflow.keras.utils import get_custom_objects

# mish
class Mish(Activation):
    '''
    Mish Activation Function.
    .. math::
        mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^{x}))
    Shape:
        - Input: Arbitrary. Use the keyword argument `input_shape`
        (tuple of integers, does not include the samples axis)
        when using this layer as the first layer in a model.
        - Output: Same shape as the input.
    Examples:
        >>> X = Activation('Mish', name="conv1_act")(X_input)
    '''

    def __init__(self, activation, **kwargs):
        # Thin wrapper around keras' Activation layer that also carries the name 'Mish'.
        super(Mish, self).__init__(activation, **kwargs)
        self.__name__ = 'Mish'

def mish(inputs):
    """Element-wise mish: ``inputs * tanh(softplus(inputs))``."""
    return inputs * tf.math.tanh(tf.math.softplus(inputs))

# Register under the name 'Mish' so that the string 'Mish' can be used as an activation
# (nn_params['hidden_activation'] relies on this).
get_custom_objects().update({'Mish': Mish(mish)})

#=============================#
def score(y_true, y_pred):
    """Interval-aware error for three-output predictions (lower is better).

    Computes the mean of ``sqrt(2) * delta / sigma + log(sqrt(2) * sigma)`` where
    ``sigma = max(y_pred[:, 2] - y_pred[:, 0], C1)`` and
    ``delta = min(|y_true[:, 0] - y_pred[:, 1]|, C2)``.

    Args:
        y_true: Tensor whose first column holds the true FVC.
        y_pred: Tensor with three columns; column 1 is the FVC estimate and
            columns 0 and 2 bound the interval used as sigma.

    Returns:
        Scalar tensor: the mean metric over the batch.
    """
    # The casts must be assigned: tf.dtypes.cast returns a new tensor and leaves its
    # argument untouched.
    y_true = tf.dtypes.cast(y_true, tf.float32)
    y_pred = tf.dtypes.cast(y_pred, tf.float32)
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = tf.maximum(sigma, C1)
    delta = tf.abs(y_true[:, 0] - fvc_pred)
    delta = tf.minimum(delta, C2)
    sq2 = tf.sqrt(tf.dtypes.cast(2, dtype=tf.float32))
    metric = (delta / sigma_clip) * sq2 + tf.math.log(sigma_clip * sq2)

    return K.mean(metric)

#============================#
def qloss(y_true, y_pred):
    """Pinball (quantile) loss averaged over all quantiles and samples.

    Args:
        y_true: True values; broadcast against the columns of ``y_pred``.
        y_pred: Predictions with one column per quantile.

    Returns:
        Scalar tensor: mean of ``max(q * e, (q - 1) * e)`` with ``e = y_true - y_pred``.
    """
    # The casts must be assigned: tf.dtypes.cast returns a new tensor and leaves its
    # argument untouched.
    y_true = tf.dtypes.cast(y_true, tf.float32)
    y_pred = tf.dtypes.cast(y_pred, tf.float32)
    # `qs` is a module-level name defined outside this function — presumably the list of
    # quantile levels, one per column of y_pred; verify where it is set.
    q = tf.constant(np.array([qs]), dtype=tf.float32)
    e = y_true - y_pred
    v = tf.maximum(q*e, (q-1)*e)
    return K.mean(v)

#=============================#
def mloss(_lambda):
    """Build a loss that blends the pinball loss with the ``score`` metric.

    Args:
        _lambda: Weight of ``qloss``; ``score`` receives ``1 - _lambda``.

    Returns:
        Function ``loss(y_true, y_pred)`` that casts both tensors to float32 and
        returns the weighted sum.
    """
    def loss(y_true, y_pred):
        truth = tf.dtypes.cast(y_true, tf.float32)
        pred = tf.dtypes.cast(y_pred, tf.float32)
        pinball = qloss(truth, pred)
        interval = score(truth, pred)
        return _lambda * pinball + (1 - _lambda) * interval
    return loss


def nn_model(L, params, obj='regression', mloss_param=0.8):
    """Build and compile the tabular MLP.

    The first hidden layer has ``params['hidden_units']`` units; hidden layer ``i``
    (1-based, after the first) has ``hidden_units // (2 * i)`` units. Every hidden
    layer is followed by dropout and the optional normalisation.

    Args:
        L: Number of input features.
        params: Dict with 'hidden_units', 'hidden_layers', 'hidden_activation',
            'hidden_dropout', 'norm_type' ('batch', 'layer', or anything else for none)
            and 'optimizer' ({'type': 'adam' | 'sgd' | 'radam', 'lr': float}).
        obj: 'regression' for a single linear output trained with MAE, or 'quantile'
            for three ordered outputs trained with ``mloss``.
        mloss_param: Weight passed to ``mloss`` (quantile objective only).

    Returns:
        The compiled model; the optimizer is wrapped in SWA.

    Raises:
        ValueError: If ``obj`` or the optimizer type is not one of the supported values.
    """
    # Fail early with a clear message instead of an UnboundLocalError further down.
    if obj not in ('quantile', 'regression'):
        raise ValueError(f"unknown obj {obj!r}; expected 'quantile' or 'regression'")
    opt_type = params['optimizer']['type']
    if opt_type not in ('adam', 'sgd', 'radam'):
        raise ValueError(f"unknown optimizer type {opt_type!r}; expected 'adam', 'sgd' or 'radam'")

    def normalize(x):
        # Optional normalisation applied after each dropout.
        if params['norm_type'] == 'batch':
            return layers.BatchNormalization()(x)
        if params['norm_type'] == 'layer':
            return layers.LayerNormalization()(x)
        return x

    inputs = layers.Input((L, ), name='Patient')
    x = layers.Dense(params['hidden_units'], activation=params['hidden_activation'])(inputs)
    x = layers.Dropout(params['hidden_dropout'])(x)
    x = normalize(x)

    # more layers
    for i in np.arange(params['hidden_layers'] - 1):
        x = layers.Dense(params['hidden_units'] // (2 * (i+1)), activation=params['hidden_activation'])(x)
        x = layers.Dropout(params['hidden_dropout'])(x)
        x = normalize(x)

    if obj == 'quantile':
        # p2 is non-negative (relu), so its cumulative sum keeps the three outputs ordered.
        p1 = layers.Dense(3, activation="linear", name="p1")(x)
        p2 = layers.Dense(3, activation="relu", name="p2")(x)
        preds = layers.Lambda(lambda x: x[0] + tf.cumsum(x[1], axis=1),
                         name="preds")([p1, p2])
    else:
        preds = layers.Dense(1, activation="linear", name="p1")(x)

    model = models.Model(inputs, preds, name="NN")

    # `learning_rate` is the current keyword; `lr` is its deprecated alias.
    learning_rate = params['optimizer']['lr']
    if opt_type == 'adam':
        opt = optimizers.Adam(learning_rate=learning_rate)
    elif opt_type == 'sgd':
        opt = optimizers.SGD(learning_rate=learning_rate, decay=1e-6, momentum=0.9)
    else:
        opt = tfa.optimizers.RectifiedAdam(learning_rate=learning_rate)
    opt = tfa.optimizers.SWA(opt)

    if obj == 'quantile':
        model.compile(loss=mloss(mloss_param), optimizer=opt, metrics=[score])
    else:
        model.compile(loss=tf.keras.losses.MeanAbsoluteError(), optimizer=opt)

    return model

# Regression

In [None]:
def nn_mae(train, test, features, target, seed=SEED):
    """Cross-validated MLP regression returning out-of-fold and test predictions.

    The feature columns of ``train`` and ``test`` are scaled together (the
    scaler is fit on the concatenation of both), one
    ``nn_model(len(features), nn_params, 'regression')`` network is trained
    per fold with early stopping on the fold's validation loss, and the test
    predictions of the folds are averaged with weight ``1 / NFOLD``.

    Reads the notebook globals ``FOLD_TYPE``, ``NFOLD``, ``SCALER``,
    ``nn_params`` and, for the group-based splitters, ``group``.

    Args:
        train: DataFrame with ``features``, ``target`` and the column the
            selected splitter stratifies on ('test_patient' or 'group_a').
        test: DataFrame with ``features``.
        features: list of feature column names.
        target: name of the target column in ``train``.
        seed: ``random_state`` of the fold splitter (the network itself is
            not seeded here).

    Returns:
        ``(oof, ypred, history)``: out-of-fold predictions for ``train``,
        fold-averaged predictions for ``test``, and the Keras ``History`` of
        the last fold trained (``None`` if the splitter yielded no fold).

    Raises:
        ValueError: if ``FOLD_TYPE`` or ``SCALER`` is not a supported value.
    """
    ypred = np.zeros(test.shape[0])
    oof = np.zeros(train.shape[0])

    if FOLD_TYPE == 'GroupKFold':
        kf = GroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train[target], group)
    elif FOLD_TYPE == 'KFold':
        kf = KFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train)
    elif FOLD_TYPE == 'StratifiedKFold':
        kf = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['test_patient'].astype(int))
    elif FOLD_TYPE == 'StratifiedGroupKFold':
        kf = StratifiedGroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['group_a'].astype(int), group)
    else:
        raise ValueError(f"Unsupported FOLD_TYPE: {FOLD_TYPE!r}")

    # scaling
    if SCALER == "MinMax":
        scaler = MinMaxScaler()
    elif SCALER == "Standard":
        scaler = StandardScaler()
    else:
        raise ValueError(f"Unsupported SCALER: {SCALER!r}")
    # The scaler sees train and test rows at once; rows are split back by position.
    df = pd.concat([train[features], test[features]])
    df[features] = scaler.fit_transform(df[features])
    trs = df.iloc[:train.shape[0]].values
    tes = df.iloc[train.shape[0]:].values

    y = train[target].values
    history = None
    for cnt, (tr_idx, val_idx) in enumerate(kf):
        print(f"FOLD {cnt}")

        # --------------------------
        # MLP ----------------------
        # --------------------------
        model = nn_model(len(features), nn_params, 'regression')
        early_stop = callbacks.EarlyStopping(patience=8, restore_best_weights=True, monitor='val_loss')
        lr_schedule = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=8, verbose=2, mode='min')
        history = model.fit(trs[tr_idx, :], y[tr_idx], callbacks=[early_stop, lr_schedule],
                            epochs=nn_params['epochs'], batch_size=nn_params['batch_size'],
                            validation_data=(trs[val_idx, :], y[val_idx]), verbose=0)

        oof[val_idx] = model.predict(trs[val_idx, :]).ravel()
        ypred += model.predict(tes).ravel() / NFOLD

    return oof, ypred, history

In [None]:
def snn_mae(train, test, features, target, seed=SEED):
    """Cross-validated "small" MLP regression (also used as the stacking model).

    Same procedure as ``nn_mae`` but the network is built from the global
    ``snn_params``. The feature columns of ``train`` and ``test`` are scaled
    together, one network is trained per fold with early stopping on the
    fold's validation loss, and test predictions are averaged over folds.

    Reads the notebook globals ``FOLD_TYPE``, ``NFOLD``, ``SCALER``,
    ``snn_params``, ``nn_params`` (fallback only) and, for the group-based
    splitters, ``group``.

    Args:
        train: DataFrame with ``features``, ``target`` and the column the
            selected splitter stratifies on ('test_patient' or 'group_a').
        test: DataFrame with ``features``.
        features: list of feature column names.
        target: name of the target column in ``train``.
        seed: ``random_state`` of the fold splitter.

    Returns:
        ``(oof, ypred, history)``: out-of-fold predictions, fold-averaged test
        predictions, and the Keras ``History`` of the last fold trained
        (``None`` if the splitter yielded no fold).

    Raises:
        ValueError: if ``FOLD_TYPE`` or ``SCALER`` is not a supported value.
    """
    ypred = np.zeros(test.shape[0])
    oof = np.zeros(train.shape[0])

    if FOLD_TYPE == 'GroupKFold':
        kf = GroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train[target], group)
    elif FOLD_TYPE == 'KFold':
        kf = KFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train)
    elif FOLD_TYPE == 'StratifiedKFold':
        kf = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['test_patient'].astype(int))
    elif FOLD_TYPE == 'StratifiedGroupKFold':
        kf = StratifiedGroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['group_a'].astype(int), group)
    else:
        raise ValueError(f"Unsupported FOLD_TYPE: {FOLD_TYPE!r}")

    # scaling
    if SCALER == "MinMax":
        scaler = MinMaxScaler()
    elif SCALER == "Standard":
        scaler = StandardScaler()
    else:
        raise ValueError(f"Unsupported SCALER: {SCALER!r}")
    # The scaler sees train and test rows at once; rows are split back by position.
    df = pd.concat([train[features], test[features]])
    df[features] = scaler.fit_transform(df[features])
    trs = df.iloc[:train.shape[0]].values
    tes = df.iloc[train.shape[0]:].values

    # The training schedule now comes from snn_params, matching the model that
    # is built below; the previous code always used nn_params here. The
    # nn_params value is kept as a fallback in case snn_params omits a key.
    epochs = snn_params.get('epochs', nn_params['epochs'])
    batch_size = snn_params.get('batch_size', nn_params['batch_size'])

    y = train[target].values
    history = None
    for cnt, (tr_idx, val_idx) in enumerate(kf):
        print(f"FOLD {cnt}")

        # --------------------------
        # MLP ----------------------
        # --------------------------
        model = nn_model(len(features), snn_params, 'regression')
        early_stop = callbacks.EarlyStopping(patience=8, restore_best_weights=True, monitor='val_loss')
        lr_schedule = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=8, verbose=2, mode='min')
        history = model.fit(trs[tr_idx, :], y[tr_idx], callbacks=[early_stop, lr_schedule],
                            epochs=epochs, batch_size=batch_size,
                            validation_data=(trs[val_idx, :], y[val_idx]), verbose=0)

        oof[val_idx] = model.predict(trs[val_idx, :]).ravel()
        ypred += model.predict(tes).ravel() / NFOLD

    return oof, ypred, history

In [None]:
def xgb_mae(train, test, features, target, seed=SEED):
    """Cross-validated XGBoost regression returning OOF and test predictions.

    One ``xgb.XGBRegressor(**xgb_params)`` is fit per fold with early stopping
    (``EARLY_STOP`` rounds) on the fold's validation rows; test predictions
    are averaged over folds with weight ``1 / NFOLD``.

    Reads the notebook globals ``FOLD_TYPE``, ``NFOLD``, ``EARLY_STOP``,
    ``xgb_params`` and, for the group-based splitters, ``group``. Side effect:
    ``xgb_params['seed']`` is overwritten with ``seed``.

    Args:
        train: DataFrame with ``features``, ``target`` and the column the
            selected splitter stratifies on ('test_patient' or 'group_a').
        test: DataFrame with ``features``.
        features: list of feature column names.
        target: name of the target column in ``train``.
        seed: used for the fold splitter and as the XGBoost seed.

    Returns:
        ``(oof, ypred)``: out-of-fold predictions for ``train`` and
        fold-averaged predictions for ``test``.

    Raises:
        ValueError: if ``FOLD_TYPE`` is not a supported value.
    """
    ypred = np.zeros(test.shape[0])
    oof = np.zeros(train.shape[0])

    if FOLD_TYPE == 'GroupKFold':
        kf = GroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train[target], group)
    elif FOLD_TYPE == 'KFold':
        kf = KFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train)
    elif FOLD_TYPE == 'StratifiedKFold':
        kf = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['test_patient'].astype(int))
    elif FOLD_TYPE == 'StratifiedGroupKFold':
        kf = StratifiedGroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['group_a'].astype(int), group)
    else:
        raise ValueError(f"Unsupported FOLD_TYPE: {FOLD_TYPE!r}")

    X, y = train[features], train[target]
    for cnt, (tr_idx, val_idx) in enumerate(kf):
        print(f"FOLD {cnt}")

        # --------------------------
        # XGB ----------------------
        # --------------------------
        xgb_params['seed'] = seed
        model = xgb.XGBRegressor(**xgb_params)
        model.fit(X.iloc[tr_idx], y.iloc[tr_idx],
                  eval_set=[(X.iloc[val_idx], y.iloc[val_idx])],
                  early_stopping_rounds=EARLY_STOP, verbose=1000)

        oof[val_idx] = model.predict(X.iloc[val_idx])
        ypred += model.predict(test[features]) / NFOLD

    return oof, ypred

In [None]:
def lgb_mae(train, test, features, target, seed=SEED):
    """Cross-validated LightGBM regression.

    Fits one ``lgb.LGBMRegressor(**lgb_params)`` per fold, using the fold's
    held-out rows as ``eval_set``. Returns the out-of-fold predictions, the
    test predictions averaged over folds, and a frame of gain importances
    averaged over folds. ``lgb_params['seed']`` is overwritten with ``seed``.
    """
    oof = np.zeros(train.shape[0])
    ypred = np.zeros(test.shape[0])
    fi = pd.DataFrame({'features': features, 'importance': 0})

    # Pick the fold generator named by the global FOLD_TYPE.
    if FOLD_TYPE == 'GroupKFold':
        folds = GroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed).split(
            train, train[target], group)
    elif FOLD_TYPE == 'KFold':
        folds = KFold(n_splits=NFOLD, shuffle=True, random_state=seed).split(train)
    elif FOLD_TYPE == 'StratifiedKFold':
        folds = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=seed).split(
            train, train['test_patient'].astype(int))
    elif FOLD_TYPE == 'StratifiedGroupKFold':
        folds = StratifiedGroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed).split(
            train, train['group_a'].astype(int), group)

    X, y = train[features], train[target]
    for fold_no, (fit_idx, hold_idx) in enumerate(folds):
        print(f"FOLD {fold_no}")

        # --------------------------
        # LGB ----------------------
        # --------------------------
        lgb_params['seed'] = seed
        model = lgb.LGBMRegressor(**lgb_params)
        X_hold = X.iloc[hold_idx]
        model.fit(X.iloc[fit_idx], y.iloc[fit_idx],
                  eval_set=[(X_hold, y.iloc[hold_idx])],
                  verbose=1000)

        # Accumulate each fold's share of the averages.
        fi['importance'] += model.booster_.feature_importance(importance_type="gain") / NFOLD
        oof[hold_idx] = model.predict(X_hold)
        ypred += model.predict(test[features]) / NFOLD

    return oof, ypred, fi

In [None]:
def catb_mae(train, test, features, target, seed=SEED):
    """Cross-validated CatBoost regression.

    Fits one ``CatBoostRegressor(**catb_params)`` per fold (no categorical
    features are declared), using the fold's held-out rows as ``eval_set``.
    Returns the out-of-fold predictions, the test predictions averaged over
    folds, and a frame of feature importances averaged over folds.
    ``catb_params['random_seed']`` is overwritten with ``seed``.
    """
    oof = np.zeros(train.shape[0])
    ypred = np.zeros(test.shape[0])
    fi = pd.DataFrame({'features': features, 'importance': 0})

    # Pick the fold generator named by the global FOLD_TYPE.
    if FOLD_TYPE == 'GroupKFold':
        folds = GroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed).split(
            train, train[target], group)
    elif FOLD_TYPE == 'KFold':
        folds = KFold(n_splits=NFOLD, shuffle=True, random_state=seed).split(train)
    elif FOLD_TYPE == 'StratifiedKFold':
        folds = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=seed).split(
            train, train['test_patient'].astype(int))
    elif FOLD_TYPE == 'StratifiedGroupKFold':
        folds = StratifiedGroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed).split(
            train, train['group_a'].astype(int), group)

    X, y = train[features], train[target]
    for fold_no, (fit_idx, hold_idx) in enumerate(folds):
        print(f"FOLD {fold_no}")

        # --------------------------
        # CatB ----------------------
        # --------------------------
        catb_params['random_seed'] = seed
        model = CatBoostRegressor(**catb_params)
        X_hold = X.iloc[hold_idx]
        model.fit(X.iloc[fit_idx], y.iloc[fit_idx],
                  eval_set=(X_hold, y.iloc[hold_idx]),
                  verbose=1000, cat_features=[])

        # Accumulate each fold's share of the averages.
        fi['importance'] += model.get_feature_importance() / NFOLD
        oof[hold_idx] = model.predict(X_hold)
        ypred += model.predict(test[features]) / NFOLD

    return oof, ypred, fi

In [None]:
def lin_mae(train, test, features, target, seed=SEED):
    """Cross-validated Ridge regression on jointly scaled features.

    The feature columns of ``train`` and ``test`` are scaled together, one
    ``Ridge(**lin_params)`` is fit per fold, and test predictions are averaged
    over folds with weight ``1 / NFOLD``.

    Reads the notebook globals ``FOLD_TYPE``, ``NFOLD``, ``SCALER``,
    ``lin_params`` and, for the group-based splitters, ``group``. Side effect:
    ``lin_params['random_state']`` is overwritten with ``seed``.

    Args:
        train: DataFrame with ``features``, ``target`` and the column the
            selected splitter stratifies on ('test_patient' or 'group_a').
        test: DataFrame with ``features``.
        features: list of feature column names.
        target: name of the target column in ``train``.
        seed: used for the fold splitter and as Ridge ``random_state``.

    Returns:
        ``(oof, ypred, fi)``: out-of-fold predictions, fold-averaged test
        predictions, and a DataFrame with columns 'features' and 'importance',
        where importance is the signed Ridge coefficient averaged over folds.

    Raises:
        ValueError: if ``FOLD_TYPE`` or ``SCALER`` is not a supported value.
    """
    ypred = np.zeros(test.shape[0])
    oof = np.zeros(train.shape[0])
    fi = pd.DataFrame()
    fi['features'] = features
    fi['importance'] = 0

    if FOLD_TYPE == 'GroupKFold':
        kf = GroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train[target], group)
    elif FOLD_TYPE == 'KFold':
        kf = KFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train)
    elif FOLD_TYPE == 'StratifiedKFold':
        kf = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['test_patient'].astype(int))
    elif FOLD_TYPE == 'StratifiedGroupKFold':
        kf = StratifiedGroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['group_a'].astype(int), group)
    else:
        raise ValueError(f"Unsupported FOLD_TYPE: {FOLD_TYPE!r}")

    # scaling
    if SCALER == "MinMax":
        scaler = MinMaxScaler()
    elif SCALER == "Standard":
        scaler = StandardScaler()
    else:
        raise ValueError(f"Unsupported SCALER: {SCALER!r}")
    # The scaler sees train and test rows at once; rows are split back by position.
    df = pd.concat([train[features], test[features]])
    df[features] = scaler.fit_transform(df[features])
    trs = df.iloc[:train.shape[0]].values
    tes = df.iloc[train.shape[0]:].values

    y = train[target]
    for cnt, (tr_idx, val_idx) in enumerate(kf):
        print(f"FOLD {cnt}")

        # --------------------------
        # Linear ----------------------
        # --------------------------
        lin_params['random_state'] = seed
        model = Ridge(**lin_params)
        model.fit(trs[tr_idx, :], y.iloc[tr_idx])
        # Average (not sum) over folds, consistent with lgb_mae / catb_mae.
        fi['importance'] += model.coef_.ravel() / NFOLD

        oof[val_idx] = model.predict(trs[val_idx, :])
        ypred += model.predict(tes) / NFOLD

    return oof, ypred, fi

In [None]:
def lasso_mae(train, test, features, target, seed=SEED):
    """Cross-validated Lasso regression on jointly scaled features.

    The feature columns of ``train`` and ``test`` are scaled together, one
    ``Lasso(**lin_params)`` is fit per fold, and test predictions are averaged
    over folds with weight ``1 / NFOLD``.

    Reads the notebook globals ``FOLD_TYPE``, ``NFOLD``, ``SCALER``,
    ``lin_params`` and, for the group-based splitters, ``group``. Side effect:
    ``lin_params['random_state']`` is overwritten with ``seed``.
    NOTE(review): the hyper-parameters are the same ``lin_params`` dict that
    ``lin_mae`` passes to Ridge - confirm that sharing them is intended.

    Args:
        train: DataFrame with ``features``, ``target`` and the column the
            selected splitter stratifies on ('test_patient' or 'group_a').
        test: DataFrame with ``features``.
        features: list of feature column names.
        target: name of the target column in ``train``.
        seed: used for the fold splitter and as Lasso ``random_state``.

    Returns:
        ``(oof, ypred)``: out-of-fold predictions for ``train`` and
        fold-averaged predictions for ``test``.

    Raises:
        ValueError: if ``FOLD_TYPE`` or ``SCALER`` is not a supported value.
    """
    ypred = np.zeros(test.shape[0])
    oof = np.zeros(train.shape[0])

    if FOLD_TYPE == 'GroupKFold':
        kf = GroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train[target], group)
    elif FOLD_TYPE == 'KFold':
        kf = KFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train)
    elif FOLD_TYPE == 'StratifiedKFold':
        kf = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['test_patient'].astype(int))
    elif FOLD_TYPE == 'StratifiedGroupKFold':
        kf = StratifiedGroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['group_a'].astype(int), group)
    else:
        raise ValueError(f"Unsupported FOLD_TYPE: {FOLD_TYPE!r}")

    # scaling
    if SCALER == "MinMax":
        scaler = MinMaxScaler()
    elif SCALER == "Standard":
        scaler = StandardScaler()
    else:
        raise ValueError(f"Unsupported SCALER: {SCALER!r}")
    # The scaler sees train and test rows at once; rows are split back by position.
    df = pd.concat([train[features], test[features]])
    df[features] = scaler.fit_transform(df[features])
    trs = df.iloc[:train.shape[0]].values
    tes = df.iloc[train.shape[0]:].values

    y = train[target]
    for cnt, (tr_idx, val_idx) in enumerate(kf):
        print(f"FOLD {cnt}")

        # --------------------------
        # Lasso --------------------
        # --------------------------
        lin_params['random_state'] = seed
        model = Lasso(**lin_params)
        model.fit(trs[tr_idx, :], y.iloc[tr_idx])

        oof[val_idx] = model.predict(trs[val_idx, :])
        ypred += model.predict(tes) / NFOLD

    return oof, ypred

In [None]:
def br_mae(train, test, features, target, seed=SEED):
    """Cross-validated Bayesian ridge regression on jointly scaled features.

    The feature columns of ``train`` and ``test`` are scaled together, one
    ``BayesianRidge(**br_params)`` is fit per fold, and test predictions are
    averaged over folds with weight ``1 / NFOLD``.

    Reads the notebook globals ``FOLD_TYPE``, ``NFOLD``, ``SCALER``,
    ``br_params`` and, for the group-based splitters, ``group``.

    Args:
        train: DataFrame with ``features``, ``target`` and the column the
            selected splitter stratifies on ('test_patient' or 'group_a').
        test: DataFrame with ``features``.
        features: list of feature column names.
        target: name of the target column in ``train``.
        seed: ``random_state`` of the fold splitter (the model is not seeded).

    Returns:
        ``(oof, ypred, fi)``: out-of-fold predictions, fold-averaged test
        predictions, and a DataFrame with columns 'features' and 'importance',
        where importance is the signed coefficient averaged over folds.

    Raises:
        ValueError: if ``FOLD_TYPE`` or ``SCALER`` is not a supported value.
    """
    ypred = np.zeros(test.shape[0])
    oof = np.zeros(train.shape[0])
    fi = pd.DataFrame()
    fi['features'] = features
    fi['importance'] = 0

    if FOLD_TYPE == 'GroupKFold':
        kf = GroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train[target], group)
    elif FOLD_TYPE == 'KFold':
        kf = KFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train)
    elif FOLD_TYPE == 'StratifiedKFold':
        kf = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['test_patient'].astype(int))
    elif FOLD_TYPE == 'StratifiedGroupKFold':
        kf = StratifiedGroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['group_a'].astype(int), group)
    else:
        raise ValueError(f"Unsupported FOLD_TYPE: {FOLD_TYPE!r}")

    # scaling
    if SCALER == "MinMax":
        scaler = MinMaxScaler()
    elif SCALER == "Standard":
        scaler = StandardScaler()
    else:
        raise ValueError(f"Unsupported SCALER: {SCALER!r}")
    # The scaler sees train and test rows at once; rows are split back by position.
    df = pd.concat([train[features], test[features]])
    df[features] = scaler.fit_transform(df[features])
    trs = df.iloc[:train.shape[0]].values
    tes = df.iloc[train.shape[0]:].values

    y = train[target]
    for cnt, (tr_idx, val_idx) in enumerate(kf):
        print(f"FOLD {cnt}")

        # --------------------------
        # BayesianRidge ------------
        # --------------------------
        model = BayesianRidge(**br_params)
        model.fit(trs[tr_idx, :], y.iloc[tr_idx])
        # Average (not sum) over folds, consistent with lgb_mae / catb_mae.
        fi['importance'] += model.coef_.ravel() / NFOLD

        oof[val_idx] = model.predict(trs[val_idx, :])
        ypred += model.predict(tes) / NFOLD

    return oof, ypred, fi

In [None]:
def svr_mae(train, test, features, target, seed=SEED):
    """Cross-validated support-vector regression on jointly scaled features.

    The feature columns of ``train`` and ``test`` are scaled together, one
    ``SVR(**svm_params)`` is fit per fold, and test predictions are averaged
    over folds with weight ``1 / NFOLD``.

    Reads the notebook globals ``FOLD_TYPE``, ``NFOLD``, ``SCALER``,
    ``svm_params`` and, for the group-based splitters, ``group``.

    Args:
        train: DataFrame with ``features``, ``target`` and the column the
            selected splitter stratifies on ('test_patient' or 'group_a').
        test: DataFrame with ``features``.
        features: list of feature column names.
        target: name of the target column in ``train``.
        seed: ``random_state`` of the fold splitter (the model is not seeded).

    Returns:
        ``(oof, ypred)``: out-of-fold predictions for ``train`` and
        fold-averaged predictions for ``test``.

    Raises:
        ValueError: if ``FOLD_TYPE`` or ``SCALER`` is not a supported value.
    """
    ypred = np.zeros(test.shape[0])
    oof = np.zeros(train.shape[0])

    if FOLD_TYPE == 'GroupKFold':
        kf = GroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train[target], group)
    elif FOLD_TYPE == 'KFold':
        kf = KFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train)
    elif FOLD_TYPE == 'StratifiedKFold':
        kf = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['test_patient'].astype(int))
    elif FOLD_TYPE == 'StratifiedGroupKFold':
        kf = StratifiedGroupKFold(n_splits=NFOLD, shuffle=True, random_state=seed)
        kf = kf.split(train, train['group_a'].astype(int), group)
    else:
        raise ValueError(f"Unsupported FOLD_TYPE: {FOLD_TYPE!r}")

    # scaling
    if SCALER == "MinMax":
        scaler = MinMaxScaler()
    elif SCALER == "Standard":
        scaler = StandardScaler()
    else:
        raise ValueError(f"Unsupported SCALER: {SCALER!r}")
    # The scaler sees train and test rows at once; rows are split back by position.
    df = pd.concat([train[features], test[features]])
    df[features] = scaler.fit_transform(df[features])
    trs = df.iloc[:train.shape[0]].values
    tes = df.iloc[train.shape[0]:].values

    y = train[target]
    for cnt, (tr_idx, val_idx) in enumerate(kf):
        print(f"FOLD {cnt}")

        # --------------------------
        # SVR ----------------------
        # --------------------------
        model = SVR(**svm_params)
        model.fit(trs[tr_idx, :], y.iloc[tr_idx])

        oof[val_idx] = model.predict(trs[val_idx, :])
        ypred += model.predict(tes) / NFOLD

    return oof, ypred

## NN

In [None]:
# NN on FVC: start the per-model prediction tables and add the seed-averaged MLP
target = 'FVC'
oof_df, ypred_df = pd.DataFrame(), pd.DataFrame()
if 'cnn' in test.columns:
    # Carry over pre-computed CNN predictions as an extra base model.
    oof_df['cnn'] = train['cnn'].values
    ypred_df['cnn'] = test['cnn'].values

oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, history = nn_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['nn'], ypred_df['nn'] = oof, ypred

In [None]:
# Plot training & validation loss values
def plot_history(history):
    """Draw the per-epoch training and validation loss of a Keras History."""
    for curve in ('loss', 'val_loss'):
        plt.plot(history.history[curve])
    plt.title('Model loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(['Train', 'Val'], loc='upper right', frameon=False)
    plt.show()

plot_history(history)

## Small NN

In [None]:
# Small NN on FVC: average out-of-fold and test predictions over SA seeds
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, history = snn_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['snn'], ypred_df['snn'] = oof, ypred

In [None]:
# Loss curves of the most recent fit only: `history` is overwritten on every seed and fold.
plot_history(history)

## XGB

In [None]:
# XGBoost on FVC: average out-of-fold and test predictions over SA seeds
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred = xgb_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['xgb'], ypred_df['xgb'] = oof, ypred

## LGB

In [None]:
# LightGBM on FVC: average out-of-fold and test predictions over SA seeds
# (`fi` keeps the importances of the last seed only)
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, fi = lgb_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['lgb'], ypred_df['lgb'] = oof, ypred

In [None]:
# Top 20 features by LightGBM importance; `fi` holds the last seed's values only.
fig, ax = plt.subplots(1, 1, figsize=(8, 12))
sns.barplot(x='importance', y='features', data=fi.sort_values(by='importance', ascending=False).iloc[:20], ax=ax)

## CatB

In [None]:
# CatBoost on FVC: average out-of-fold and test predictions over SA seeds
# (`fi` keeps the importances of the last seed only)
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, fi = catb_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['catb'], ypred_df['catb'] = oof, ypred

In [None]:
# Top 20 features by CatBoost importance; `fi` holds the last seed's values only.
fig, ax = plt.subplots(1, 1, figsize=(8, 12))
sns.barplot(x='importance', y='features', data=fi.sort_values(by='importance', ascending=False).iloc[:20], ax=ax)

## Linear

In [None]:
# Ridge on FVC: average out-of-fold and test predictions over SA seeds
# (`fi` keeps the coefficients of the last seed only)
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, fi = lin_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['lin'], ypred_df['lin'] = oof, ypred

In [None]:
# 20 largest Ridge coefficients. The sort is on the signed value, so strongly
# negative coefficients are not shown.
fig, ax = plt.subplots(1, 1, figsize=(8, 12))
sns.barplot(x='importance', y='features', data=fi.sort_values(by='importance', ascending=False).iloc[:20], ax=ax)

## Lasso

In [None]:
# Lasso on FVC: average out-of-fold and test predictions over SA seeds
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred = lasso_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['lasso'], ypred_df['lasso'] = oof, ypred

## BayesianRidge

In [None]:
# BayesianRidge on FVC: average out-of-fold and test predictions over SA seeds
# (`fi` keeps the coefficients of the last seed only)
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, fi = br_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['br'], ypred_df['br'] = oof, ypred

## SVR

In [None]:
# ypred = np.zeros(test.shape[0])
# oof = np.zeros(train.shape[0])
# for s in range(SA): # seed average
#     oof_, ypred_ = svr_mae(train, test, features, target, seed=SEED+s)
#     oof += oof_ / SA
#     ypred += ypred_ / SA
# oof_df['svm'] = oof
# ypred_df['svm'] = ypred

## Stacking

In [None]:
# CV: per-model MAE on all rows and on the rows flagged validation > 0,
# plus overlaid histograms of the out-of-fold predictions
idx = train['validation'] > 0
for m in list(oof_df.columns):
    mae_all = mean_absolute_error(train[target], oof_df[m])
    mae_last3 = mean_absolute_error(train.loc[idx, target], oof_df.loc[idx, m])
    print('{}: MAE = {:.3f}, MAE (last 3) = {:.3f}'.format(m, mae_all, mae_last3))
    plt.hist(oof_df[m].values, alpha=0.4, label=m)
plt.legend(frameon=False)

In [None]:
# Peek at the per-model out-of-fold predictions (one column per base model).
oof_df.head()

In [None]:
# Pairwise correlation between the base models' out-of-fold predictions.
fig, ax = plt.subplots(1, 1, figsize=(12, 12))
sns.heatmap(oof_df.corr(), annot=True, ax=ax)

In [None]:
# stacking: fit the small NN on the base models' predictions.
# oof_df also needs the target and the columns the fold splitter may read.
for col in (target, 'group_a', 'Patient'):
    oof_df[col] = train[col].values

stack_features = ypred_df.columns.values.tolist()  # one meta-feature per base model
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, history = snn_mae(oof_df, ypred_df, stack_features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
# oof, ypred, history = snn_mae(oof_df, ypred_df, ypred_df.columns.values.tolist(), target, seed=SEED)

In [None]:
# Loss curves of the stacking model's most recent fit (last seed, last fold).
plot_history(history)

In [None]:
# fig, ax = plt.subplots(1, 1, figsize=(8, 12))
# sns.barplot(x='importance', y='features', data=fi.sort_values(by='importance', ascending=False).iloc[:20], ax=ax)

# FVC CV score

In [None]:
# FVC mae CV of the stacked predictions: all rows first, then only the rows
# flagged validation > 0.
# NOTE(review): `score` is also the name nn_model passes as a Keras metric in
# quantile mode; rebinding it to a float here would break that path - confirm.
idx = train['validation'] > 0

score = mean_absolute_error(train['FVC'], oof)
print(f'MAE all = {score}')

score = mean_absolute_error(train.loc[idx, 'FVC'], oof[idx])
print(f'MAE last 3 = {score}')

# Optimize Confidence

In [None]:
# Store the stacked FVC predictions (out-of-fold for train, averaged for test);
# the confidence optimisation below reads the 'FVC_pred' column.
train['FVC_pred'] = oof
test['FVC_pred'] = ypred

In [None]:
import scipy as sp
from functools import partial

def loss_func(weight, row):
    """Negated per-row metric for a candidate confidence value.

    ``weight`` is the candidate confidence (floored at 70); ``row`` must
    provide 'FVC' and 'FVC_pred'. The absolute error is capped at 1000. The
    return value is the negative of
    ``-sqrt(2) * delta / sigma - log(sqrt(2) * sigma)``, so minimising it
    maximises the metric.
    """
    sigma = max(weight, 70)
    capped_error = min(abs(row['FVC'] - row['FVC_pred']), 1000)
    root2 = math.sqrt(2)
    return root2 * capped_error / sigma + np.log(root2 * sigma)

# For every training row, find the confidence that minimises loss_func,
# starting Nelder-Mead from 281. (A bounded SLSQP variant with
# bounds = [(70, 100)] was tried previously and is not used.)
results = [
    sp.optimize.minimize(partial(loss_func, row=row), [281], method='Nelder-Mead')['x'][0]
    for _, row in tqdm(train.iterrows(), total=len(train))
]

In [None]:
# optimized score: metric obtained when every row uses its individually
# optimised confidence, over all rows and over the rows with validation > 0
root2 = math.sqrt(2)
train['Confidence'] = results
train['sigma_clipped'] = train['Confidence'].map(lambda c: max(c, 70))
train['diff'] = (train['FVC'] - train['FVC_pred']).abs()
train['delta'] = train['diff'].map(lambda d: min(d, 1000))
train['score'] = -root2 * train['delta'] / train['sigma_clipped'] - np.log(root2 * train['sigma_clipped'])
score = train['score'].mean()
score2 = train.loc[train['validation'] > 0, 'score'].mean()
print(score, score2)

In [None]:
# Overlay the raw optimised confidence and its clipped (>= 70) version.
train['Confidence'].hist(alpha=0.4)
train['sigma_clipped'].hist(alpha=0.4)

In [None]:
# all last 3: for every patient, replace the confidence of the rows with
# validation == 0 by values drawn (with replacement) from that patient's
# rows with validation > 0.
np.random.seed(SEED)
is_validation = train['validation'] > 0
for p in tqdm(train['Patient'].unique()):
    is_patient = train['Patient'] == p
    lasts = train.loc[is_patient & is_validation, 'Confidence'].values
    if lasts.size == 0:
        # No validation rows to sample from: np.random.choice would raise on
        # an empty pool, so this patient keeps its per-row optimised values.
        continue
    cond = is_patient & (train['validation'] == 0)
    train.loc[cond, 'Confidence'] = np.random.choice(lasts, size=np.sum(cond))

In [None]:
# Distribution of the confidence target after the per-patient resampling above.
train['Confidence'].hist(alpha=0.4)

In [None]:
# # Confidence to normal distribution
# train['normal_Confidence'] = train['sigma_clipped'].values
# train.loc[train['normal_Confidence'] <= 0, 'normal_Confidence'] = 0
# train['normal_Confidence'] = np.log1p(train['normal_Confidence'].values)

# train['normal_Confidence'].hist(alpha=0.4)

## NN

In [None]:
# NN on Confidence: restart the per-model prediction tables for the new target
target = 'Confidence'
oof_df, ypred_df = pd.DataFrame(), pd.DataFrame()

oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, history = nn_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['nn'], ypred_df['nn'] = oof, ypred

In [None]:
# Plot training & validation loss values
# (only the most recent fit: `history` is overwritten on every seed and fold)
plot_history(history)

## Small NN

In [None]:
# Small NN on Confidence: average out-of-fold and test predictions over SA seeds
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, history = snn_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['snn'], ypred_df['snn'] = oof, ypred

In [None]:
# Loss curves of the most recent fit only: `history` is overwritten on every seed and fold.
plot_history(history)

## XGB

In [None]:
# XGBoost on Confidence: average out-of-fold and test predictions over SA seeds
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred = xgb_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['xgb'], ypred_df['xgb'] = oof, ypred

## LGB

In [None]:
# LightGBM on Confidence: average out-of-fold and test predictions over SA seeds
# (`fi` keeps the importances of the last seed only)
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, fi = lgb_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['lgb'], ypred_df['lgb'] = oof, ypred

In [None]:
# Top 20 features by LightGBM importance for the Confidence target (last seed only).
fig, ax = plt.subplots(1, 1, figsize=(8, 12))
sns.barplot(x='importance', y='features', data=fi.sort_values(by='importance', ascending=False).iloc[:20], ax=ax)

## CatB

In [None]:
# CatBoost on Confidence: average out-of-fold and test predictions over SA seeds
# (`fi` keeps the importances of the last seed only)
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, fi = catb_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['catb'], ypred_df['catb'] = oof, ypred

In [None]:
# Top 20 features by CatBoost importance for the Confidence target (last seed only).
fig, ax = plt.subplots(1, 1, figsize=(8, 12))
sns.barplot(x='importance', y='features', data=fi.sort_values(by='importance', ascending=False).iloc[:20], ax=ax)

## Linear

In [None]:
# Ridge on Confidence: average out-of-fold and test predictions over SA seeds
# (`fi` keeps the coefficients of the last seed only)
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, fi = lin_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['lin'], ypred_df['lin'] = oof, ypred

In [None]:
# 20 largest Ridge coefficients for the Confidence target. The sort is on the
# signed value, so strongly negative coefficients are not shown.
fig, ax = plt.subplots(1, 1, figsize=(8, 12))
sns.barplot(x='importance', y='features', data=fi.sort_values(by='importance', ascending=False).iloc[:20], ax=ax)

## Lasso

In [None]:
# Lasso on Confidence: average out-of-fold and test predictions over SA seeds
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred = lasso_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['lasso'], ypred_df['lasso'] = oof, ypred

## BayesianRidge

In [None]:
# BayesianRidge on Confidence: average out-of-fold and test predictions over SA seeds
# (`fi` keeps the coefficients of the last seed only)
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, fi = br_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['br'], ypred_df['br'] = oof, ypred

## SVR

In [None]:
# SVR on Confidence: average out-of-fold and test predictions over SA seeds
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred = svr_mae(train, test, features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
oof_df['svm'], ypred_df['svm'] = oof, ypred

## Stacking

In [None]:
# CV: per-model MAE on all rows and on the rows flagged validation > 0,
# plus overlaid histograms of the out-of-fold predictions
idx = train['validation'] > 0
for m in list(oof_df.columns):
    mae_all = mean_absolute_error(train[target], oof_df[m])
    mae_last3 = mean_absolute_error(train.loc[idx, target], oof_df.loc[idx, m])
    print('{}: MAE = {:.3f}, MAE (last 3) = {:.3f}'.format(m, mae_all, mae_last3))
    plt.hist(oof_df[m].values, alpha=0.4, label=m)
plt.legend(frameon=False)

In [None]:
# Pairwise correlation between the base models' out-of-fold Confidence predictions.
fig, ax = plt.subplots(1, 1, figsize=(12, 12))
sns.heatmap(oof_df.corr(), annot=True, ax=ax)

In [None]:
# stacking: fit the small NN on the base models' Confidence predictions.
# oof_df also needs the target and the columns the fold splitter may read.
for col in (target, 'group_a', 'Patient'):
    oof_df[col] = train[col].values

stack_features = ypred_df.columns.values.tolist()  # one meta-feature per base model
oof, ypred = np.zeros(len(train)), np.zeros(len(test))
for seed_offset in range(SA):  # seed average
    seed_oof, seed_pred, history = snn_mae(oof_df, ypred_df, stack_features, target, seed=SEED + seed_offset)
    oof = oof + seed_oof / SA
    ypred = ypred + seed_pred / SA
# oof, ypred, history = snn_mae(oof_df, ypred_df, ypred_df.columns.values.tolist(), target, seed=SEED)

In [None]:
# Loss curves of the stacking model's most recent fit (last seed, last fold).
plot_history(history)

In [None]:
# fig, ax = plt.subplots(1, 1, figsize=(8, 12))
# sns.barplot(x='importance', y='features', data=fi.sort_values(by='importance', ascending=False).iloc[:20], ax=ax)

In [None]:
# oof = oof_df.median(axis=1)
# ypred = ypred_df.median(axis=1)

# Final CV score

In [None]:
# Replace the optimised confidence target with the stacked model's predictions
# (out-of-fold for train, averaged for test).
train['Confidence'] = oof
test['Confidence'] = ypred

# Alternative for a log1p-transformed target (not used):
# train['Confidence'] = np.expm1(oof)
# test['Confidence'] = np.expm1(ypred)

In [None]:
def lb_metric(data):
    """Score predictions with the clipped log-likelihood style metric.

    ``data`` must have the columns 'Confidence', 'FVC', 'FVC_pred' and
    'validation'. Confidence is floored at 70 and the absolute error capped at
    1000; the per-row value is
    ``-sqrt(2) * delta / sigma - log(sqrt(2) * sigma)``.

    Side effect: the columns 'sigma_clipped', 'diff', 'delta' and 'score' are
    written onto ``data``.

    Returns:
        (mean score over all rows, mean score over rows with validation > 0)
    """
    root2 = math.sqrt(2)
    data['sigma_clipped'] = data['Confidence'].map(lambda c: max(c, 70))
    data['diff'] = (data['FVC'] - data['FVC_pred']).abs()
    data['delta'] = data['diff'].map(lambda d: min(d, 1000))
    data['score'] = -root2 * data['delta'] / data['sigma_clipped'] - np.log(root2 * data['sigma_clipped'])
    in_validation = data['validation'] > 0
    return data['score'].mean(), data.loc[in_validation, 'score'].mean()

# First value: mean metric over all training rows; second: rows with validation > 0 only.
score, score2 = lb_metric(train)
print(f'Overall CV = {score}, {score2}')

# Examples

In [None]:
def prediction_example(train, patient='ID00419637202311204720264'):
    """Plot one patient's true FVC, predicted FVC and the +/- confidence band.

    Args:
        train: DataFrame with the columns 'Patient', 'Weeks', 'FVC',
            'FVC_pred', 'Confidence' and 'validation'.
        patient: value of the 'Patient' column to plot.

    The title shows the patient id and the metric computed by ``lb_metric``
    on that patient's rows with validation > 0. Nothing is returned.
    """
    # Take an explicit copy of the patient's rows: lb_metric writes helper
    # columns onto the frame it receives, which must not target a slice of
    # the caller's DataFrame.
    rows = train.loc[train['Patient'] == patient, :].copy()
    weeks = rows['Weeks'].values
    fvc_pred = rows['FVC_pred'].values
    confidence = rows['Confidence'].values

    fig, ax = plt.subplots(1, 1, figsize=(8, 5))
    ax.plot(weeks, fvc_pred, color='k', lw=4, alpha=0.4, label='FVC prediction')
    ax.plot(weeks, fvc_pred + confidence, color='r', lw=4, alpha=0.4, label='upper confidence')
    ax.plot(weeks, fvc_pred - confidence, color='r', lw=4, alpha=0.4, label='lower confidence')
    ax.plot(weeks, rows['FVC'].values, '-og', label='True')
    _, score = lb_metric(rows)
    ax.legend(frameon=False, loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_xlabel('weeks')
    ax.set_ylabel('FVC')
    ax.set_title('{}\nscore={}'.format(patient, score))
prediction_example(train, patient=test['Patient'].unique()[0])

In [None]:
# Second test patient; the curves are drawn from that patient's rows in `train`.
prediction_example(train, patient=test['Patient'].unique()[1])

In [None]:
# Third test patient; the curves are drawn from that patient's rows in `train`.
prediction_example(train, patient=test['Patient'].unique()[2])

In [None]:
# Fourth test patient; the curves are drawn from that patient's rows in `train`.
prediction_example(train, patient=test['Patient'].unique()[3])

In [None]:
# Fifth test patient; the curves are drawn from that patient's rows in `train`.
prediction_example(train, patient=test['Patient'].unique()[4])

# Prediction

In [None]:
# Load the submission template; its 'FVC' and 'Confidence' columns are replaced below.
submission = pd.read_csv(INPUT_DIR + '/sample_submission.csv')
submission.head()

In [None]:
# Peek at the test predictions keyed by the ID column.
test[[ID, 'FVC_pred', 'Confidence']].head()

In [None]:
# Summary statistics of the test predictions as a sanity check before submitting.
test[['FVC_pred', 'Confidence']].describe().T

In [None]:
# Swap the template's placeholder columns for the model predictions, floor the
# confidence at 70, restore the template's column names and write the file.
test_preds = test[['Patient_Week', 'FVC_pred', 'Confidence']]
sub = submission.drop(columns=['FVC', 'Confidence']).merge(test_preds, on='Patient_Week')
below_floor = sub['Confidence'] < 70
sub.loc[below_floor, 'Confidence'] = 70
sub.columns = submission.columns  # positional rename: 'FVC_pred' takes the template's second column name
sub.to_csv('submission.csv', index=False)
sub.head()