## Intuition of Data Augmentation

### Set-up

In this challenge, each patient comes with both patient-specific and time-varying information. For the time being, we will focus only on tabular data.

More precisely, in the training dataset, for a patient $i$ at week $t$ we are given time-varying information $x_{it}$ and patient-specific traits $p_i$.
* $x_{it}$ encompasses: `Percent`, `FVC`, and `Weeks`, i.e. $$ x_{it} = (\text{FVC}_{it},\text{Percent}_{it}, \text{Weeks}_{it})  $$
* $p_{i}$ encompasses: `Sex`, `Age`, and `SmokingStatus`

In the test dataset, however, we observe the patient-specific traits $p_i$ but the time-varying features for only one week, say $\tau_i$; that is, we only know $x_{i \tau_i}$.

### Augmentation Strategy

Therefore, to line up with the test dataset set-up, we augment the training dataset with the following strategy. We build a dataset with a three-level key: $i$ for the patient, $t$ for the target week and $\tau$ for the base week. We keep using the patient-specific traits $p_i$ and the time-varying features $x_{it}^{\tau}$, where:
$$  x_{it}^{\tau} = (\text{FVC}_{i \tau},\text{Percent}_{i \tau}, \text{Weeks}_{i \tau}, \text{Weeks}_{it} - \text{Weeks}_{i \tau})$$


In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import KFold,StratifiedKFold
import os

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M

In [None]:
def seed_everything(seed=2020):
    """Seed every random number generator this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    ``random`` module, NumPy's global generator and TensorFlow's global
    generator with the same value.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The three seeders are independent of each other, so one loop suffices.
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
    
seed_everything(42)

## Preprocessing

In [None]:
# Directory that holds train.csv, test.csv and sample_submission.csv.
ROOT = "../input/osic-pulmonary-fibrosis-progression"

In [None]:
# Load the raw competition tables:
#   df_tr - training measurements
#   chunk - test measurements
#   te    - submission template; only the `Patient_Week` key column is read
df_tr = pd.read_csv(f"{ROOT}/train.csv")
chunk = pd.read_csv(f"{ROOT}/test.csv")
te = pd.read_csv(f"{ROOT}/sample_submission.csv", usecols=['Patient_Week'])
#===
# "doublon" = duplicate. keep=False removes EVERY row that shares its key
# with another row (no representative is kept), hence "naive".
print("Naive doublon handling...")
chunk.drop_duplicates(keep=False, inplace=True, subset=['Patient'])
df_tr.drop_duplicates(keep=False, inplace=True, subset=['Patient','Weeks'])
#===

In [None]:
# Split each `Patient_Week` key on "_": the first piece is the patient id and
# the last piece is the (possibly negative) week number to predict.
te['Patient'] = [key.split('_')[0] for key in te['Patient_Week']]
te['Weeks'] = [int(key.split('_')[-1]) for key in te['Patient_Week']]
# Slim copy of the training measurements; used as the left side of the
# augmentation merge.
piv = df_tr.loc[:, ['Patient', 'Weeks', 'FVC', 'Percent']].copy()

## Augmentation

In [None]:
# Sanity check: shapes of the train, test and submission-template frames.
print(df_tr.shape, chunk.shape, te.shape)

In [None]:
# Build the augmented training frame `tr` and the test frame `te`.
#
# Every measurement of a patient is paired with every measurement of the same
# patient acting as the "base" visit. The base visit's columns are renamed
# with a `base_` prefix so they can sit next to the target visit's
# `Weeks` / `FVC` / `Percent`.
print("Rename columns for pivot dataframes")
ren_dct = {"Weeks":"base_Weeks","FVC":"base_FVC","Percent":"base_Percent"}
df_tr = df_tr.rename(columns=ren_dct)
chunk = chunk.rename(columns=ren_dct)
print("Test handling...")
# Attach each patient's base measurement to every row of the submission
# template.
te = te.merge(chunk, on="Patient", how="left")
del chunk
print("Train handling...")
WEEKS = df_tr.base_Weeks.unique()
# One chunk per base week: all target rows (piv) joined to the rows measured
# at that base week. The per-week loop is kept so the row order of `tr` stays
# the same as before; the unshuffled CV split depends on it.
CHUNKS = [
    piv.merge(df_tr.loc[df_tr.base_Weeks == week], on="Patient", how="inner")
    for week in tqdm(WEEKS)
]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. ignore_index gives `tr` a unique RangeIndex instead of the
# repeated per-chunk indices.
tr = pd.concat(CHUNKS, ignore_index=True)
print("original training dataset", df_tr.shape)
print("augmented training dataset", tr.shape)
del WEEKS, CHUNKS, df_tr, piv
#

## Feature Engineering (FE)

In [None]:
# Create a `Percent` column on the test frame by copying the base value.
te['Percent'] = te['base_Percent']

In [None]:
# Notebook display: shapes of the augmented train frame and the test frame.
tr.shape, te.shape

In [None]:
# Integer code per patient; used later as the label handed to the CV splitter.
tr["CLUSTER"] = tr["Patient"].astype('category').cat.codes
# Week features, built the same way for train and test:
#   wk1 - absolute week of the target visit
#   wk2 - offset of the target visit from the base visit
for frame in (tr, te):
    frame["wk1"] = frame["Weeks"]
    frame["wk2"] = frame["Weeks"] - frame["base_Weeks"]

In [None]:
# Assemble the model's feature list FE: one 0/1 indicator column per category
# level seen in the training data, followed by the numeric columns.
CATCOLS = ["Sex", "SmokingStatus"]
# The target-visit "Percent" is deliberately left out of the numeric features.
NUMCOLS = ["base_Weeks", "base_FVC", "wk1", "wk2", "Age", "base_Percent"]
FE = []
for cat_col in CATCOLS:
    for level in tr[cat_col].unique():
        # The indicator column is named after the level itself.
        tr[level] = (tr[cat_col] == level).astype(int)
        te[level] = (te[cat_col] == level).astype(int)
        FE.append(level)
#=================
FE.extend(NUMCOLS)

In [None]:
# Show the final ordered list of feature column names.
print(FE)

## Model

In [None]:
def metric(trueFVC, predFVC, predSTD):
    """Return the mean clipped Laplace log-likelihood score (higher is better).

    Args:
        trueFVC: Array of observed FVC values.
        predFVC: Array of predicted FVC values.
        predSTD: Array of predicted uncertainties (sigma).

    Returns:
        Mean over all rows of
        ``-sqrt(2) * |true - pred| / sigma - log(sqrt(2) * sigma)``,
        with sigma floored at 70 and the absolute error capped at 1000.
    """
    sigma = np.clip(predSTD, 70, 9e9)
    abs_err = np.clip(np.abs(trueFVC - predFVC), 0, 1000)
    root2 = np.sqrt(2)
    per_row = -1 * (root2 * abs_err / sigma) - np.log(root2 * sigma)
    return np.mean(per_row)
#

In [None]:
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

In [None]:
# Clipping constants used by `score`: C1 is the floor applied to sigma,
# C2 is the cap applied to the absolute FVC error.
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")
#=============================#
def score(y_true, y_pred):
    """Return the mean clipped Laplace negative log-likelihood (lower is better).

    This is the TensorFlow counterpart of the NumPy ``metric`` function with
    the opposite sign, so it can be minimised as a loss and monitored with
    ``mode='min'``.

    Args:
        y_true: Tensor whose column 0 holds the observed FVC.
        y_pred: Tensor with three columns (lower, middle, upper prediction).
            Column 1 is used as the FVC prediction and ``column 2 - column 0``
            as sigma.

    Returns:
        Scalar tensor: mean of
        ``sqrt(2) * delta / sigma + log(sqrt(2) * sigma)``,
        with sigma floored at C1 and delta capped at C2.
    """
    # The cast results must be kept: the original code discarded them, so the
    # casts had no effect.
    y_true = tf.dtypes.cast(y_true, tf.float32)
    y_pred = tf.dtypes.cast(y_pred, tf.float32)
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = tf.maximum(sigma, C1)
    delta = tf.abs(y_true[:, 0] - fvc_pred)
    delta = tf.minimum(delta, C2)
    sq2 = tf.sqrt(tf.dtypes.cast(2, dtype=tf.float32))
    # Named `penalty` rather than `metric` so it does not shadow the
    # module-level NumPy `metric` function.
    penalty = (delta / sigma_clip) * sq2 + tf.math.log(sigma_clip * sq2)
    return K.mean(penalty)
#============================#
def qloss(y_true, y_pred):
    """Return the mean pinball (quantile) loss for quantiles 0.2, 0.5 and 0.8.

    Args:
        y_true: Tensor of observed values; it is broadcast against ``y_pred``.
        y_pred: Tensor with one column per quantile, in the order
            0.2, 0.5, 0.8.

    Returns:
        Scalar tensor: mean pinball loss over all rows and quantiles.
    """
    quantiles = tf.constant(np.array([[0.2, 0.50, 0.8]]), dtype=tf.float32)
    residual = y_true - y_pred
    # Pinball loss: q * e when the residual is positive, (q - 1) * e otherwise.
    when_under = quantiles * residual
    when_over = (quantiles - 1) * residual
    return K.mean(tf.maximum(when_under, when_over))
#=============================#
def mloss(_lambda):
    """Build a loss that blends the quantile loss with the metric loss.

    Args:
        _lambda: Weight given to ``qloss``; ``score`` receives
            ``1 - _lambda``.

    Returns:
        A function ``loss(y_true, y_pred)`` computing
        ``_lambda * qloss + (1 - _lambda) * score``.
    """
    def loss(y_true, y_pred):
        quantile_term = qloss(y_true, y_pred)
        metric_term = score(y_true, y_pred)
        return _lambda * quantile_term + (1 - _lambda) * metric_term
    return loss
#=================
def make_model(nh):
    """Build and compile the three-output quantile regression network.

    The network has three hidden Dense layers of 100 ReLU units. Two heads
    are combined so that the three outputs are non-decreasing: ``p1`` is a
    free linear offset and ``p2`` is non-negative (ReLU), and the output is
    ``p1 + cumsum(p2)``.

    Args:
        nh: Number of input features.

    Returns:
        A compiled Keras model trained with ``mloss(0.8)`` and reporting
        ``score`` as a metric.
    """
    z = L.Input((nh,), name="Patient")
    x = L.Dense(100, activation="relu", name="d1")(z)
    x = L.Dense(100, activation="relu", name="d2")(x)
    x = L.Dense(100, activation="relu", name="d3")(x)
    p1 = L.Dense(3, activation="linear", name="p1")(x)
    p2 = L.Dense(3, activation="relu", name="p2")(x)
    preds = L.Lambda(lambda x: x[0] + tf.cumsum(x[1], axis=1),
                     name="preds")([p1, p2])

    model = M.Model(z, preds, name="CNN")
    # The `lr` / `decay` keyword arguments are deprecated or rejected by
    # current Keras optimizers. InverseTimeDecay with decay_steps=1 gives the
    # same schedule as the legacy `decay` argument:
    #     lr_t = 0.1 / (1 + 0.01 * step)
    lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate=0.1, decay_steps=1, decay_rate=0.01
    )
    # epsilon=None used to fall back to the backend epsilon; pass it
    # explicitly.
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=lr_schedule,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=K.epsilon(),
        amsgrad=False,
    )
    model.compile(loss=mloss(0.8), optimizer=optimizer, metrics=[score])
    return model

### Training

In [None]:
# Pull plain NumPy arrays out of the frames for training.
y = tr['FVC'].values        # regression target
z = tr[FE].values           # training feature matrix
ze = te[FE].values          # test feature matrix (same column order)
cl = tr['CLUSTER'].values   # per-row patient code, used as the CV split label

In [None]:
# Rescale every feature with a MinMaxScaler fitted on the training matrix,
# then apply the same fitted scaler to the test matrix.
sc = MinMaxScaler()
z = sc.fit_transform(z)
ze = sc.transform(ze)

In [None]:
# Cross-validation set-up: NFOLD stratified folds, no shuffling.
# NOTE(review): the label later passed to kf.split is the per-row patient
# code, so stratification spreads each patient's rows across the folds and a
# patient can appear in both the training and validation parts. Confirm this
# is intended rather than a patient-grouped split.
NFOLD = 10
#kf = KFold(n_splits=NFOLD)
kf = StratifiedKFold(n_splits=NFOLD)

In [None]:
%%time
# K-fold training loop.
#   pe   - test predictions, averaged over the folds (3 columns)
#   pred - out-of-fold predictions for every training row (3 columns)
nh = len(FE)
BATCH_SIZE=500
pe = np.zeros((ze.shape[0], 3))
pred = np.zeros((z.shape[0], 3))

cnt = 0
EPOCHS = 250#500
for tr_idx, val_idx in kf.split(z, cl):
    cnt += 1
    print(f"FOLD {cnt}")
    net = make_model(nh)
    # Keep only the weights of the epoch with the lowest validation `score`.
    # The same file name is reused for every fold.
    ckpt = ModelCheckpoint("w.h5", monitor='val_score', verbose=0, save_best_only=True,mode='min')
    net.fit(z[tr_idx], y[tr_idx], batch_size=BATCH_SIZE, epochs=EPOCHS, 
            validation_data=(z[val_idx], y[val_idx]), verbose=0, callbacks=[ckpt]) #
    # Rebuild a fresh model and restore the best checkpoint, so evaluation and
    # prediction use the best epoch rather than the last one.
    net = make_model(nh)
    net.load_weights("w.h5")
    print("train", net.evaluate(z[tr_idx], y[tr_idx], verbose=0, batch_size=BATCH_SIZE))
    print("val", net.evaluate(z[val_idx], y[val_idx], verbose=0, batch_size=BATCH_SIZE))
    print("predict val...")
    pred[val_idx] = net.predict(z[val_idx], batch_size=BATCH_SIZE, verbose=0)
    print("predict test...")
    # Each fold contributes 1/NFOLD of the final test prediction.
    pe += net.predict(ze, batch_size=BATCH_SIZE, verbose=0) / NFOLD
#==============

In [None]:
# Out-of-fold competition metric: the middle column is the FVC prediction and
# (upper - lower) is the sigma.
print("oof", metric( y, pred[:, 1], pred[:, 2] - pred[:, 0] ))

In [None]:
# Compare the mean absolute error of the middle prediction with the mean
# predicted spread (upper minus lower output).
sigma_opt = mean_absolute_error(y, pred[:, 1])
unc = pred[:,2] - pred[:, 0]
sigma_mean = np.mean(unc)
print(sigma_opt, sigma_mean)

In [None]:
# Plot the ground truth against the three out-of-fold outputs for 50 randomly
# drawn training rows.
idxs = np.random.randint(0, y.shape[0], 50)
plt.plot(y[idxs], label="ground truth")
# The legend labels follow the quantiles the model is trained on
# (0.2 / 0.5 / 0.8); the earlier labels said q25 / q75.
for column, quantile_label in enumerate(("q20", "q50", "q80")):
    plt.plot(pred[idxs, column], label=quantile_label)
plt.legend(loc="best")
plt.show()

In [None]:
# Histogram of the out-of-fold spread (upper minus lower output).
plt.hist(unc)
plt.title("uncertainty in prediction")
plt.show()

## Prediction

In [None]:
# Fold-averaged test predictions: the middle output is the FVC estimate and
# the spread between upper and lower outputs is the confidence.
te['FVC'] = pe[:, 1]
te['Confidence'] = pe[:, 2] - pe[:, 0]

In [None]:
# Keep only the three columns written to the submission file.
subm = te[['Patient_Week','FVC','Confidence']].copy()

In [None]:
# For every measurement present in test.csv, overwrite the matching submission
# row with the observed FVC and a fixed Confidence of 0.1.
otest = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
for known in otest.itertuples(index=False):
    # Build the `Patient_Week` key once and reuse the mask for both columns.
    is_known_row = subm['Patient_Week'] == known.Patient + '_' + str(known.Weeks)
    subm.loc[is_known_row, 'FVC'] = known.FVC
    subm.loc[is_known_row, 'Confidence'] = 0.1

In [None]:
# Notebook display: first rows of the submission frame.
subm.head()

In [None]:
# Notebook display: summary statistics of the submission's numeric columns.
subm.describe().T

In [None]:
# Write the submission file without the DataFrame index.
subm.to_csv("submission.csv", index=False)