## Basic imports

In [None]:
import numpy as np
import pandas as pd
import os
import random
from tqdm.auto import tqdm


import matplotlib.pyplot as plt
import cv2


In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, RepeatedKFold

import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M

In [None]:
seed = 123


def seed_everything(seed=123):
    """Seed every random number generator this notebook relies on.

    Sets ``PYTHONHASHSEED`` and seeds Python's ``random`` module, NumPy's
    global generator and TensorFlow's global generator with the same value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


seed_everything(seed)

In [None]:
# data params
# `mean` is used further down as the per-week slope that extrapolates
# `base_Percent` into `est_Percent`.
# NOTE(review): `std` is not referenced anywhere else in the visible notebook.
mean = -0.13859619186395233 #from training notebook
std = 0.1546567782480763 #from training notebook
# Cross-validation layout handed to RepeatedKFold: splits per repeat, and
# how many times the whole split is repeated.
num_cv_folds = 4
num_repeats = 50

## Quantile Regression
See: https://www.kaggle.com/ulrich07/osic-multiple-quantile-regression-starter

**Data preparation**

See: https://www.kaggle.com/mattbast/feature-engineering-with-a-linear-model

In [None]:
## construct train input
train_data = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/train.csv")
print(train_data.shape)

# merge "base values": each patient's earliest visit (smallest Weeks) becomes
# the baseline that is attached to every row of that patient
base_df = train_data.copy()[['Patient', 'FVC', 'Percent', 'Weeks']].sort_values(['Patient','Weeks']).groupby('Patient').head(1)
base_df.columns = ['Patient', 'base_FVC', 'base_Percent', 'base_Weeks']
train_data = train_data.merge(base_df, on='Patient')
print(train_data.shape)


# keep only visits strictly after the baseline visit
train_data['Weeks_passed'] = train_data.Weeks-train_data.base_Weeks
train_data = train_data[train_data.Weeks_passed>0]
#train_data = train_data.groupby(['Patient','base_Weeks']).tail(10)
train_data = train_data.drop_duplicates()
print(train_data.shape)


# folds = pd.read_csv('../input/osic-targets/train_folds.csv')[['Patient', 'split-all']].drop_duplicates()
# folds.columns = ['Patient', 'fold']
# folds['fold'] = folds['fold'].astype(int)
# train_data = train_data.merge(folds)
# print(train_data.shape)

# Linear extrapolation of Percent from the baseline using the fixed weekly
# slope `mean`. Weeks_passed is exactly Weeks - base_Weeks, so this vectorised
# form yields the same values as a row-by-row apply without the Python loop.
train_data['est_Percent'] = mean*train_data['Weeks_passed'] + train_data['base_Percent']
# Rescale the baseline FVC by the estimated change in Percent.
train_data['est_FVC'] = train_data['base_FVC']/train_data['base_Percent']*train_data['est_Percent']

train_data = train_data[['Patient', 'Weeks', 'FVC', 'Percent', 'Age', 'Sex', 'SmokingStatus', 
                         'base_FVC', 'base_Percent', 'base_Weeks', 'Weeks_passed', #'fold',
                         'est_Percent', 'est_FVC',
                        ]].drop_duplicates()
# marker used later to split the combined frame back into train and test
train_data['WHERE'] = 'train'

print(train_data.shape)
train_data.head()

In [None]:
# Transposed summary statistics of the training frame.
train_data.describe().T

In [None]:
# Load the submission template and break every Patient_Week key apart at "_".
sample_sub = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/sample_submission.csv")
patient_week_parts = sample_sub['Patient_Week'].apply(lambda key: key.split("_"))
sample_sub['Patient'] = patient_week_parts.apply(lambda parts: parts[0])
# Weeks is still a string at this point; it is cast to int after the merge below.
sample_sub['Weeks'] = patient_week_parts.apply(lambda parts: parts[1])
sample_sub.head()

In [None]:
# construct test_data input
test_data = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
# One baseline row per patient. 'Percent' is selected twice on purpose of the
# rename below: once as base_Percent and once under its own name (the latter
# is dropped again by the final column selection).
test_data = test_data.groupby('Patient').first()[['FVC', 'Percent', 'Weeks', 'Percent', 'Age', 'Sex', 'SmokingStatus',]]
test_data.columns = ['base_FVC', 'base_Percent', 'base_Weeks', 'Percent', 'Age', 'Sex', 'SmokingStatus',]

# Expand to one row per (patient, week) requested by the submission template.
test_data = test_data.merge(sample_sub[['Patient','Weeks']], on='Patient')
test_data['Weeks'] = test_data.Weeks.astype(int)
test_data['Weeks_passed'] = test_data.Weeks-test_data.base_Weeks

print(test_data.shape)
# Same linear extrapolation as for the training frame. Weeks_passed equals
# Weeks - base_Weeks, so the vectorised expression matches a row-wise apply.
test_data['est_Percent'] = mean*test_data['Weeks_passed'] + test_data['base_Percent']
test_data['est_FVC'] = test_data['base_FVC']/test_data['base_Percent']*test_data['est_Percent']

print(test_data.shape)

test_data = test_data[['Patient', 'Weeks', 'Age', 'Sex', 'SmokingStatus', 
                         'base_FVC', 'base_Percent', 'base_Weeks', 'Weeks_passed', 
                         'est_Percent', 'est_FVC',]].drop_duplicates()
# marker used later to split the combined frame back into train and test
test_data['WHERE'] = 'test'
print(test_data.shape)
test_data.head()

In [None]:
# Transposed summary statistics of the test frame.
test_data.describe().T

**Combine datasets**

In [None]:
def calculate_height(row):
    """Return ``base_FVC`` divided by a sex-specific linear function of ``Age``.

    ``row`` must support lookup by the keys ``'Sex'``, ``'Age'`` and
    ``'base_FVC'``. The first coefficient pair is used when ``row['Sex']``
    equals ``'Male'``; every other value falls through to the second pair.
    """
    intercept, slope = (27.63, 0.112) if row['Sex'] == 'Male' else (21.78, 0.101)
    return row['base_FVC'] / (intercept - slope * row['Age'])
    
def scale_feature(series):
    """Min-max scale ``series`` so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    series : pandas.Series or numpy.ndarray
        Numeric values to rescale.

    Returns
    -------
    Same container type as ``series`` holding the rescaled values. When all
    values are equal the result is all zeros instead of the NaNs a plain
    0/0 division would produce.
    """
    low = series.min()
    span = series.max() - low
    if span == 0:
        # Constant input: every value equals the minimum, so it maps to 0.
        return (series - low).astype(float)
    return (series - low) / span

In [None]:
# Stack train and test rows so that encoding and scaling see both at once.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# original (non-reset) index is kept, exactly as append did.
all_data = pd.concat([train_data, test_data])
all_data['Height'] = all_data.apply(calculate_height, axis=1)

# to categorical
# dtype is pinned to uint8 (the historical get_dummies default): newer pandas
# returns bool columns, and mixing bool with float columns would turn the
# later `.values` feature matrix into an object array.
all_data = pd.concat([
    all_data,
    pd.get_dummies(all_data.Sex, dtype=np.uint8),
    pd.get_dummies(all_data.SmokingStatus, dtype=np.uint8)
], axis=1)

all_data = all_data.drop(columns=['Sex', 'SmokingStatus'])


all_data.head()

In [None]:
# normalize
# (target column, source column) pairs; each source is min-max scaled over
# the combined train + test frame and stored under the target name.
NORMALIZED_COLUMNS = (
    ('normalized_Weeks', 'Weeks'),
    ('normalized_base_Percent', 'base_Percent'),
    ('normalized_Age', 'Age'),
    ('normalized_base_Weeks', 'base_Weeks'),
    ('normalized_base_FVC', 'base_FVC'),
    ('normalized_Weeks_passed', 'Weeks_passed'),
    ('normalized_Height', 'Height'),
    ('normalized_est_FVC', 'est_FVC'),
    ('normalized_est_Percent', 'est_Percent'),
)

for normalized_name, raw_name in NORMALIZED_COLUMNS:
    all_data[normalized_name] = scale_feature(all_data[raw_name])

all_data.head()

In [None]:
# Columns fed to the network, in input order. The first five names are the
# one-hot columns created by get_dummies on Sex and SmokingStatus
# (presumably the category values present in the data - verify against the CSVs).
# Height and the est_* features are computed above but currently left out.
FEATURE_COLS = ['Female', 'Male', 
                'Currently smokes', 'Ex-smoker', 'Never smoked',
                'normalized_Weeks', 'normalized_base_Weeks', 'normalized_Weeks_passed', 
                'normalized_base_Percent',
                'normalized_base_FVC', 
                'normalized_Age', 
                #'normalized_Height',
               # 'normalized_est_FVC', 'normalized_est_Percent',
               ]

In [None]:
# Split the combined frame back into its train / test parts using the WHERE
# marker, giving each part a fresh 0..n-1 index.
is_train = all_data['WHERE'] == 'train'
is_test = all_data['WHERE'] == 'test'
tr = all_data[is_train].reset_index(drop=True)
te = all_data[is_test].reset_index(drop=True)

tr.shape, te.shape

In [None]:
# Lower bound for the predicted spread and upper bound for the absolute error.
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")
#=============================#
def score(y_true, y_pred):
    """Keras metric: negated mean penalty built from error and predicted spread.

    Parameters
    ----------
    y_true : tensor whose column 0 holds the true FVC.
    y_pred : tensor with three columns; column 1 is used as the FVC
        prediction and ``column 2 - column 0`` as the spread (sigma).

    Returns
    -------
    Scalar tensor ``-mean(sqrt(2) * delta / sigma_clip + log(sqrt(2) * sigma_clip))``
    where ``sigma_clip = max(sigma, 70)`` and ``delta = min(|y - fvc_pred|, 1000)``.
    Larger (closer to zero) is better.
    """
    # The casts must be assigned: tf.dtypes.cast returns a new tensor and
    # leaves its argument untouched.
    y_true = tf.dtypes.cast(y_true, tf.float32)
    y_pred = tf.dtypes.cast(y_pred, tf.float32)
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = tf.maximum(sigma, C1)
    delta = tf.abs(y_true[:, 0] - fvc_pred)
    delta = tf.minimum(delta, C2)
    sq2 = tf.sqrt( tf.dtypes.cast(2, dtype=tf.float32) )
    metric = (delta / sigma_clip)*sq2 + tf.math.log(sigma_clip* sq2)
    return -K.mean(metric)

def qloss(y_true, y_pred):
    """Pinball loss averaged over the 0.2, 0.5 and 0.8 quantiles.

    ``y_pred`` carries one column per quantile; ``y_true`` is broadcast
    against it. For each quantile ``q`` and residual ``e = y_true - y_pred``
    the element-wise loss is ``max(q * e, (q - 1) * e)``; the mean over all
    elements is returned.
    """
    quantiles = tf.constant([[0.2, 0.50, 0.8]], dtype=tf.float32)
    residual = y_true - y_pred
    pinball = tf.maximum(quantiles * residual, (quantiles - 1) * residual)
    return K.mean(pinball)

def mloss(_lambda):
    """Return a Keras loss that blends the pinball loss with the score penalty.

    Parameters
    ----------
    _lambda : float
        Weight of ``qloss``; the score term receives ``1 - _lambda``.

    Returns
    -------
    ``loss(y_true, y_pred)`` suitable for ``model.compile``.
    """
    def loss(y_true, y_pred):
        # `score` is the *negated* penalty (higher is better), so it has to be
        # subtracted: adding it would reward the optimiser for making the
        # metric worse while it minimises this loss.
        return _lambda * qloss(y_true, y_pred) - (1 - _lambda)*score(y_true, y_pred)
    return loss

#=================
def make_model(nh):
    """Build and compile the three-output regression network.

    Parameters
    ----------
    nh : int
        Number of input features per row.

    Returns
    -------
    A compiled Keras model with two 100-unit ReLU hidden layers and three
    outputs per row, trained with ``mloss(0.65)`` and reporting ``score``.
    """
    z = L.Input((nh,), name="Patient")
    x = L.Dense(100, activation="relu", name="d1")(z)
    x = L.Dense(100, activation="relu", name="d2")(x)
    # p1 is an unconstrained 3-vector; p2 is non-negative (ReLU) and its
    # cumulative sum is added on top. Because p1's three components are
    # independent, the outputs are not guaranteed to be ordered.
    p1 = L.Dense(3, activation="linear", name="p1")(x)
    p2 = L.Dense(3, activation="relu", name="p2")(x)
    preds = L.Lambda(lambda x: x[0] + tf.cumsum(x[1], axis=1), 
                     name="preds")([p1, p2])
    
    # The model name is kept as-is even though the network is fully dense.
    model = M.Model(z, preds, name="CNN")
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    # NOTE(review): `decay` and `epsilon=None` are legacy arguments as well and
    # may need replacing depending on the installed TensorFlow/Keras version.
    model.compile(loss=mloss(0.65), optimizer=tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.01, amsgrad=False), metrics=[score])
    return model

In [None]:
# Build one model up front just to inspect its architecture.
nh = len(FEATURE_COLS)
net = make_model(nh)
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
net.summary()

In [None]:
# Columns that receive multiplicative noise when artificial training rows are
# generated inside the cross-validation loop. 'FVC' (the regression target) is
# included, and 'normalized_Height' is listed although FEATURE_COLS currently
# has it commented out.
NOISE_ON_FEATURES = ['FVC',
                     'normalized_Weeks', 'normalized_base_Weeks', 'normalized_Weeks_passed',
                     'normalized_base_Percent', 'normalized_base_FVC', 
                     'normalized_Age', 'normalized_Height',
                     #'normalized_est_FVC', 'normalized_est_Percent'
                    ]

# Largest relative perturbation applied by noise_generator (0.05 == 5 %).
NOISE_MAX_PCT = 0.05


def noise_generator(x):
    """Scale ``x`` by independent random factors within +/- ``NOISE_MAX_PCT``.

    Parameters
    ----------
    x : scalar, numpy.ndarray, pandas.Series or pandas.DataFrame
        Values to perturb.

    Returns
    -------
    ``x * (1 + jitter)`` where ``jitter`` is drawn per element, in steps of
    0.001, from ``[-NOISE_MAX_PCT, NOISE_MAX_PCT)``.

    One factor is drawn for every element. Drawing a single scalar would,
    under ``DataFrame.apply`` (which passes whole columns), rescale an entire
    column by the same amount instead of adding per-row noise.
    """
    # randint needs integer bounds; work in thousandths and scale back.
    bound = int(round(NOISE_MAX_PCT * 1000))
    jitter = np.random.randint(-bound, bound, size=np.shape(x)) / 1000
    return x * (1 + jitter)

# Number of noisy rows sampled (with replacement) per fold in the CV loop;
# set to the size of the full training frame.
N_ARTIFICIAL_SAMPLES = len(tr)

In [None]:
# Batch size used for fit / evaluate / predict, and the number of training
# epochs per fold, in the cross-validation loop.
BATCH_SIZE = 128
EPOCHS = 200

In [None]:
# pred accumulates out-of-fold predictions for the training rows;
# pe accumulates the fold-averaged predictions for the test rows.
pred = np.zeros((len(tr), 3))
pe = np.zeros((len(te[FEATURE_COLS]), 3))

# The test feature matrix is the same for every fold, so build it once.
X_te = te[FEATURE_COLS].values

rkf = RepeatedKFold(n_splits=num_cv_folds, n_repeats=num_repeats, random_state=seed)
for fold, (tr_idx, val_idx) in enumerate(rkf.split(tr)):
    print(f"FOLD {fold}")
    X_tr = tr.loc[tr_idx]
    # Augment the fold with noisy copies of resampled training rows.
    X_tr_artif = X_tr.sample(n=N_ARTIFICIAL_SAMPLES, replace=True)
    X_tr_artif[NOISE_ON_FEATURES] = X_tr_artif[NOISE_ON_FEATURES].apply(noise_generator)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    X_tr = pd.concat([X_tr, X_tr_artif])
    y_tr = X_tr['FVC'].values
    X_tr = X_tr[FEATURE_COLS].values

    # Validation rows are left untouched (no noise).
    X_val = tr.loc[val_idx]
    y_val = X_val['FVC'].values
    X_val = X_val[FEATURE_COLS].values

    # A fresh model per fold so no weights leak between folds.
    net = make_model(nh)
    net.fit(X_tr, y_tr, 
            batch_size=BATCH_SIZE, epochs=EPOCHS, 
            validation_data=(X_val, y_val), verbose=0) #

    print("train", net.evaluate(X_tr, y_tr, verbose=0, batch_size=BATCH_SIZE))
    print("val", net.evaluate(X_val, y_val, verbose=0, batch_size=BATCH_SIZE))
    print("predict val...", end=" ")
    pred[val_idx] += net.predict(X_val, batch_size=BATCH_SIZE, verbose=0)
    print("done")

    print("predict test...", end=" ")
    # Every fold of every repeat contributes equally to the test prediction.
    pe += net.predict(X_te, batch_size=BATCH_SIZE, verbose=0) / (num_cv_folds*num_repeats)
    print("done")
#==============

# Each training row lands in the validation split once per repeat, so the
# accumulated out-of-fold sum is averaged over the repeats.
pred = pred/num_repeats

In [None]:
# Out-of-fold diagnostics: mean absolute error of the middle output and the
# average spread between the outer outputs.
X = tr[FEATURE_COLS].values
y = tr['FVC'].values

unc = pred[:, 2] - pred[:, 0]
sigma_opt = mean_absolute_error(y, pred[:, 1])
sigma_mean = unc.mean()
print(sigma_opt, sigma_mean)

In [None]:
# Attach the averaged out-of-fold middle output and the spread between the
# outer outputs to the training frame; lb_metric() reads both columns.
tr['FVC_pred'] = pred[:, 1]
tr['Confidence'] = pred[:,2] - pred[:, 0]

In [None]:
def lb_metric(train):
    """Return the mean per-row score computed from FVC, FVC_pred and Confidence.

    Reads the columns ``'FVC'``, ``'FVC_pred'`` and ``'Confidence'`` of
    ``train`` and writes the helper columns ``'sigma_clipped'`` (Confidence
    floored at 70), ``'diff'`` (absolute error), ``'delta'`` (error capped at
    1000) and ``'score'`` back into the same frame.

    Per row: ``-sqrt(2) * delta / sigma_clipped - log(sqrt(2) * sigma_clipped)``.
    """
    sqrt_two = np.sqrt(2)
    train['sigma_clipped'] = train['Confidence'].clip(lower=70)
    train['diff'] = (train['FVC'] - train['FVC_pred']).abs()
    train['delta'] = train['diff'].clip(upper=1000)
    train['score'] = -sqrt_two*train['delta']/train['sigma_clipped'] - np.log(sqrt_two*train['sigma_clipped'])
    return train['score'].mean()


# Stored under its own name: rebinding the global `score` here would replace
# the Keras metric function of the same name that make_model() and the loss
# returned by mloss() look up, breaking any later model build or training run.
local_score = lb_metric(tr)
print(f'Local Score: {local_score}')

In [None]:
# Plot the three out-of-fold outputs against the ground truth for 100
# randomly drawn training rows (drawn with replacement).
idxs = np.random.randint(0, y.shape[0], 100)
plt.figure(figsize=(16,9))
plt.plot(y[idxs], label="ground truth")
# Legend names follow the quantiles the pinball loss is trained on (0.2, 0.5, 0.8).
plt.plot(pred[idxs, 0], label="q20")
plt.plot(pred[idxs, 1], label="q50")
plt.plot(pred[idxs, 2], label="q80")
plt.legend(loc="best")
plt.show()

In [None]:
# Minimum, mean and maximum of the predicted spread, plus the fraction of
# rows where the spread is non-negative.
print(unc.min(), unc.mean(), unc.max(), (unc>=0).mean())

In [None]:
# Histogram of the out-of-fold spread (pred[:, 2] - pred[:, 0]).
plt.hist(unc)
plt.title("Uncertainty in prediction")
plt.show()

In [None]:
fig, ax = plt.subplots(5, 1, figsize=(10, 20))

# Five entries are drawn from the row-level Patient column (with replacement),
# one per subplot; truth and prediction are drawn against Weeks_passed.
for panel, pt in zip(ax, np.random.choice(tr['Patient'], 5)):
    patient_log = tr[tr['Patient'] == pt]
    weeks_passed = patient_log['Weeks_passed']

    panel.set_title(pt)
    for column, legend_label in (('FVC', 'truth'), ('FVC_pred', 'prediction')):
        panel.scatter(weeks_passed, patient_log[column], label=legend_label)
    panel.legend()

plt.show()

In [None]:
# Grid-search one constant Confidence value (70..1000) and keep the one with
# the best local score.
xx = np.linspace(70, 1000, 100)
# Start from -inf with a defined best_x: a hard-coded floor would leave
# best_x unset whenever no candidate beats it.
best_y = -np.inf
best_x = xx[0]

# lb_metric() writes into the frame it receives and the loop overwrites
# 'Confidence', so work on a copy to keep the model's confidences in `tr`.
fixed_conf = tr.copy()

for x in xx:
    fixed_conf['Confidence'] = x
    yy = lb_metric(fixed_conf)
    plt.scatter(x, yy, color='k')
    
    if best_y<yy:
        best_y=yy
        best_x=x

plt.axhline(best_y, linestyle=":")
plt.title('Fixed confidence score')
plt.xlabel('Confidence (ml)')
plt.ylabel('Score')
plt.show()

print(f'Best local Score: {best_y} | Confidence: {best_x}')

In [None]:
# Assemble the submission from the fold-averaged test predictions.
sub = te[['Patient', 'Weeks', 'base_FVC', 'base_Weeks']].copy()
sub['FVC'] = pe[:, 1]

# Rows whose requested week is the patient's baseline week: FVC is known there.
at_baseline = sub.base_Weeks == sub.Weeks

# Global calibration factor: mean ratio of known baseline FVC to predicted FVC.
c = (sub.loc[at_baseline, 'base_FVC']/sub.loc[at_baseline, 'FVC']).mean()
if not np.isfinite(c):
    # No baseline rows (mean of nothing is NaN) or a zero prediction (inf):
    # fall back to uncalibrated predictions rather than turning every FVC
    # into NaN/inf.
    c = 1.0
sub['FVC'] = c*pe[:, 1]
sub['Patient_Week'] = sub['Patient'] +"_"+sub['Weeks'].astype(str)

# At the baseline week use the measured value with the minimum confidence.
sub.loc[at_baseline, 'FVC'] = sub.loc[at_baseline, 'base_FVC']
sub['Confidence'] = pe[:, 2] - pe[:, 0]
sub.loc[at_baseline, 'Confidence'] = 70
# Keep Confidence within [70, best fixed confidence + 140].
sub['Confidence'] = sub.Confidence.fillna(70).clip(70, best_x+140)

sub = sub[['Patient_Week','FVC','Confidence']]
sub.head(20)

In [None]:
# Transposed summary statistics of the submission frame, as a sanity check.
sub.describe().T

In [None]:
# Write the submission file without the index column.
sub.to_csv('submission.csv', index=False)