In [None]:
import numpy as np
import pandas as pd
import pydicom
import os
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, GroupKFold

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M

In [None]:
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

In [None]:
def seed_everything(seed=2020):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed handed to Python's ``random``, NumPy and TensorFlow,
            and exported (as a string) in the ``PYTHONHASHSEED`` variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The three seeders are independent of each other, so order is irrelevant.
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
    
seed_everything(42)

In [None]:
ROOT = "../input/osic-pulmonary-fibrosis-progression"
BATCH_SIZE= 128

In [None]:
# Training measurements. keep=False removes *every* row of a (Patient, Weeks)
# pair that occurs more than once, not just the repeats.
tr = pd.read_csv(f"{ROOT}/train.csv")
tr = tr.drop_duplicates(subset=['Patient', 'Weeks'], keep=False)
chunk = pd.read_csv(f"{ROOT}/test.csv")

print("add infos")
sub = pd.read_csv(f"{ROOT}/sample_submission.csv")
# Patient_Week is split on "_": first piece -> patient id, last piece -> week.
sub['Patient'] = [pw.split('_')[0] for pw in sub['Patient_Week']]
sub['Weeks'] = [int(pw.split('_')[-1]) for pw in sub['Patient_Week']]
sub = sub.loc[:, ['Patient', 'Weeks', 'Confidence', 'Patient_Week']]
# Attach the per-patient columns of test.csv (its own Weeks column is dropped
# so the submission week is kept).
test_info = chunk.drop('Weeks', axis=1)
sub = pd.merge(sub, test_info, on="Patient")

In [None]:
# Tag each row with its origin so the combined frame can be split back later.
tr['WHERE'] = 'train'
chunk['WHERE'] = 'val'
sub['WHERE'] = 'test'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat
# stacks the frames the same way (row labels of each frame are kept).
data = pd.concat([tr, chunk, sub])

In [None]:
print(tr.shape, chunk.shape, sub.shape, data.shape)
print(tr.Patient.nunique(), chunk.Patient.nunique(), sub.Patient.nunique(), 
      data.Patient.nunique())
#

In [None]:
# Week of each row, blanked (NaN) for sample-submission rows so that only the
# 'train' and 'val' rows contribute to the per-patient minimum below.
data['min_week'] = np.where(data['WHERE'] == 'test', np.nan, data['Weeks'])
# Broadcast each patient's earliest remaining week onto all of that patient's rows.
earliest_week = data.groupby('Patient')['min_week'].transform('min')
data['min_week'] = earliest_week

In [None]:
# One row per patient holding the FVC measured at that patient's min_week.
# When several rows match, the first one in frame order is kept.
at_min_week = data['Weeks'] == data['min_week']
base = (
    data.loc[at_min_week, ['Patient', 'FVC']]
    .rename(columns={'FVC': 'min_FVC'})
    .drop_duplicates(subset='Patient', keep='first')
)

In [None]:
# Give every row its patient's baseline FVC, then express the week relative to
# the patient's first observed week.
data = pd.merge(data, base, how='left', on='Patient')
data['base_week'] = data['Weeks'].sub(data['min_week'])
del base

In [None]:
# One indicator column per distinct value of each categorical column. The new
# columns are named after the values themselves and collected in FE.
COLS = ['Sex','SmokingStatus']
FE = []
for col in COLS:
    levels = list(data[col].unique())
    FE.extend(levels)
    for level in levels:
        data[level] = data[col].eq(level).astype(int)

In [None]:
# Min-max scale the numeric inputs to [0, 1]. The min and max are taken over
# the whole combined frame (train, val and test rows together).
for scaled_col, raw_col in (('age', 'Age'), ('BASE', 'min_FVC'),
                            ('week', 'base_week'), ('percent', 'Percent')):
    raw = data[raw_col]
    lo, hi = raw.min(), raw.max()
    data[scaled_col] = (raw - lo) / (hi - lo)
FE.extend(['age', 'percent', 'week', 'BASE'])

In [None]:
# Undo the stacking: recover the three frames from their WHERE tag.
origin = data['WHERE']
tr = data[origin == 'train']
chunk = data[origin == 'val']
sub = data[origin == 'test']
del data

In [None]:
tr.shape, chunk.shape, sub.shape


In [None]:
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")
#=============================#
def score(y_true, y_pred):
    """Keras metric: mean of ``sqrt(2)*delta/sigma + log(sqrt(2)*sigma)``.

    ``sigma`` is the spread ``y_pred[:, 2] - y_pred[:, 0]`` floored at ``C1``
    and ``delta`` is ``|y_true[:, 0] - y_pred[:, 1]|`` capped at ``C2``.

    Args:
        y_true: Target tensor; column 0 is read (assumes a 2-D tensor as
            supplied by Keras - TODO confirm for direct calls).
        y_pred: Prediction tensor with at least three columns.

    Returns:
        Scalar tensor, the batch mean of the per-row value (lower is better).
    """
    # The original called tf.dtypes.cast without keeping the result, so no
    # conversion actually happened. Bind the casted tensors.
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = tf.maximum(sigma, C1)
    delta = tf.minimum(tf.abs(y_true[:, 0] - fvc_pred), C2)
    sq2 = tf.sqrt(tf.constant(2, dtype=tf.float32))
    metric = (delta / sigma_clip) * sq2 + tf.math.log(sigma_clip * sq2)
    return K.mean(metric)
#============================#
def qloss(y_true, y_pred):
    """Pinball loss averaged over the quantiles 0.2, 0.5 and 0.8.

    Args:
        y_true: Target tensor (presumably shape (batch, 1), broadcast against
            the three prediction columns - TODO confirm).
        y_pred: Prediction tensor whose three columns pair with the quantiles.

    Returns:
        Scalar tensor: mean of ``max(q*e, (q-1)*e)`` with ``e = y_true - y_pred``.
    """
    quantiles = tf.constant(np.array([[0.2, 0.5, 0.8]]), dtype=tf.float32)
    residual = y_true - y_pred
    under_term = quantiles * residual
    over_term = (quantiles - 1) * residual
    return K.mean(tf.maximum(under_term, over_term))
#=============================#
def mloss(_lambda):
    """Return a loss that blends `qloss` and `score`.

    Args:
        _lambda: Weight of the `qloss` term; `score` gets ``1 - _lambda``.

    Returns:
        A ``loss(y_true, y_pred)`` callable.
    """
    def loss(y_true, y_pred):
        pinball_part = _lambda * qloss(y_true, y_pred)
        metric_part = (1 - _lambda) * score(y_true, y_pred)
        return pinball_part + metric_part
    return loss
#=================
def make_model():
    """Build and compile the three-output dense network.

    Nine inputs feed two 100-unit ReLU layers. Two 3-unit heads follow: `p1`
    (linear) and `p2` (ReLU, hence non-negative). The model output is
    ``p1 + cumsum(p2, axis=1)``.

    Returns:
        A compiled Keras model using ``mloss(0.775)`` as loss and `score` as
        metric.
    """
    z = L.Input((9,), name="Patient")
    x = L.Dense(100, activation="relu", name="d1")(z)
    x = L.Dense(100, activation="relu", name="d2")(x)
    p1 = L.Dense(3, activation="linear", name="p1")(x)
    p2 = L.Dense(3, activation="relu", name="p2")(x)
    preds = L.Lambda(lambda heads: heads[0] + tf.cumsum(heads[1], axis=1),
                     name="preds")([p1, p2])

    model = M.Model(z, preds, name="CNN")
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    # NOTE(review): `decay=` and `epsilon=None` are kept as in the original and
    # may be rejected by newer Keras optimizers - verify against the installed
    # TensorFlow version.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9, beta_2=0.999,
                                         epsilon=None, decay=0.01, amsgrad=False)
    model.compile(loss=mloss(0.775), optimizer=optimizer, metrics=[score])
    return model

In [None]:
net = make_model()
print(net.summary())
print(net.count_params())

In [None]:
# Targets and feature matrices as plain NumPy arrays.
y = tr['FVC'].values
z = tr[FE].values
ze = sub[FE].values
n_train, n_test = z.shape[0], ze.shape[0]
# Accumulators with one column per model output:
pe = np.zeros((n_test, 3))      # fold-averaged predictions for the submission rows
pred = np.zeros((n_train, 3))   # predictions for each row from the fold that held it out
delta = np.zeros((n_train, 3))  # fold-averaged predictions for all training rows

In [None]:
NFOLD =5
kf = GroupKFold(n_splits=NFOLD)
split_groups = tr['Patient']

In [None]:
%%time
cnt = 0
for cnt, (tr_idx, val_idx) in enumerate(kf.split(z, groups=split_groups), start=1):
    print(f"FOLD {cnt}")

    # Each fold trains a freshly built network.
    net = make_model()
    z_fit, y_fit = z[tr_idx], y[tr_idx]
    z_val, y_val = z[val_idx], y[val_idx]
    net.fit(z_fit, y_fit, batch_size=BATCH_SIZE, epochs=800,
            validation_data=(z_val, y_val), verbose=0)

    print("train", net.evaluate(z_fit, y_fit, verbose=0, batch_size=BATCH_SIZE))
    print("val", net.evaluate(z_val, y_val, verbose=0, batch_size=BATCH_SIZE))
    print("predict val...")
    pred[val_idx] = net.predict(z_val, batch_size=BATCH_SIZE, verbose=0)
    print("predict test...")
    pe += net.predict(ze, batch_size=BATCH_SIZE, verbose=0) / NFOLD

    # Fold-averaged prediction on every training row (held-out or not).
    delta += net.predict(z) / NFOLD
    
#==============

In [None]:
sigma_opt = mean_absolute_error(y, pred[:, 1])
unc = pred[:,2] - pred[:, 0]
sigma_mean = np.mean(unc)
print(sigma_opt, sigma_mean)

In [None]:
# Scoring

# `delta` holds the fold-averaged predictions for all training rows.
o_clipped = np.maximum(delta[:, 2] - delta[:, 0], 70)
abs_error = np.minimum(np.abs(delta[:, 1] - y), 1000)
sqrt = np.sqrt(2)
# The result lives in its own name: the original rebound `score`, shadowing the
# Keras metric function that make_model() passes to compile(), and overwrote
# `delta`, which made this cell fail when executed a second time. np.log
# replaces tf.math.log because every operand here is a NumPy array.
row_scores = -(sqrt * abs_error) / o_clipped - np.log(sqrt * o_clipped)

print(np.mean(row_scores))

In [None]:
print(unc.min(), unc.mean(), unc.max(), (unc>=0).mean())

In [None]:
# Plot 100 randomly drawn training rows: the target and the three outputs.
idxs = np.random.randint(0, y.shape[0], 100)
plt.plot(y[idxs], label="ground truth")
# Legend follows the quantiles trained in qloss (0.2 / 0.5 / 0.8); the
# original labels said q25 / q75.
plt.plot(pred[idxs, 0], label="q20")
plt.plot(pred[idxs, 1], label="q50")
plt.plot(pred[idxs, 2], label="q80")
plt.legend(loc="best")
plt.show()

In [None]:
sub['FVC1'] = pe[:, 1]
sub['Confidence1'] = pe[:, 2] - pe[:, 0]

In [None]:
subm = sub[['Patient_Week','FVC','Confidence','FVC1','Confidence1']].copy()

In [None]:
subm.loc[~subm.FVC1.isnull()].head(10)

In [None]:
# Rows for which the model produced a prediction.
has_pred = subm['FVC1'].notnull()
subm.loc[has_pred, 'FVC'] = subm.loc[has_pred, 'FVC1']
if sigma_mean < 70:
    # Average predicted spread is below 70: use the constant out-of-fold MAE
    # as confidence for every row instead.
    subm['Confidence'] = sigma_opt
else:
    subm.loc[has_pred, 'Confidence'] = subm.loc[has_pred, 'Confidence1']

In [None]:
# Where test.csv already holds a measurement for a Patient_Week, write that
# value into the submission with a fixed confidence of 0.1.
otest = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
for patient, week, fvc in zip(otest['Patient'], otest['Weeks'], otest['FVC']):
    is_known = subm['Patient_Week'] == patient + '_' + str(week)
    subm.loc[is_known, 'FVC'] = fvc
    subm.loc[is_known, 'Confidence'] = 0.1

In [None]:
subm[["Patient_Week","FVC","Confidence"]].to_csv("submission_Quantile_Regression.csv", index=False)

# QUANTILE REGRESSION 

In [None]:
tr = pd.read_csv(f"{ROOT}/train.csv")
tr.drop_duplicates(keep=False, inplace=True, subset=['Patient','Weeks'])
chunk = pd.read_csv(f"{ROOT}/test.csv")

print("add infos")
submission = pd.read_csv(f"{ROOT}/sample_submission.csv")
submission['Patient'] = submission['Patient_Week'].apply(lambda x:x.split('_')[0])
submission['Weeks'] = submission['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
submission =  submission[['Patient','Weeks','Confidence','Patient_Week']]
submission = submission.merge(chunk.drop('Weeks', axis=1), on="Patient")

In [None]:
# Tag each row with its origin so the combined frame can be split back later.
tr['WHERE'] = 'train'
chunk['WHERE'] = 'val'
submission['WHERE'] = 'test'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat
# stacks the frames the same way (row labels of each frame are kept).
data = pd.concat([tr, chunk, submission])

In [None]:
data['min_week'] = data['Weeks']
data.loc[data.WHERE=='test','min_week'] = np.nan
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

base = data.loc[data.Weeks == data.min_week]
base = base[['Patient','FVC']].copy()
base.columns = ['Patient','min_FVC']
base['nb'] = 1
base['nb'] = base.groupby('Patient')['nb'].transform('cumsum')
base = base[base.nb==1]
base.drop('nb', axis=1, inplace=True)


In [None]:
data = data.merge(base, on='Patient', how='left')
data['base_week'] = data['Weeks'] - data['min_week']
del base



In [None]:
COLS = ['Sex','SmokingStatus']
FE = []
for col in COLS:
    for mod in data[col].unique():
        FE.append(mod)
        data[mod] = (data[col] == mod).astype(int)


In [None]:
data['age'] = (data['Age'] - data['Age'].min() ) / ( data['Age'].max() - data['Age'].min() )
data['BASE'] = (data['min_FVC'] - data['min_FVC'].min() ) / ( data['min_FVC'].max() - data['min_FVC'].min() )
data['week'] = (data['base_week'] - data['base_week'].min() ) / ( data['base_week'].max() - data['base_week'].min() )
data['percent'] = (data['Percent'] - data['Percent'].min() ) / ( data['Percent'].max() - data['Percent'].min() )
FE += ['age','percent','week','BASE']


In [None]:

tr = data.loc[data.WHERE=='train']
chunk = data.loc[data.WHERE=='val']
submission = data.loc[data.WHERE=='test']
del data

In [None]:
tr.shape, chunk.shape, submission.shape


In [None]:
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")
#=============================#
def score(y_true, y_pred):
    """Keras metric: mean of ``sqrt(2)*delta/sigma + log(sqrt(2)*sigma)``.

    ``sigma`` is the spread ``y_pred[:, 2] - y_pred[:, 0]`` floored at ``C1``
    and ``delta`` is ``|y_true[:, 0] - y_pred[:, 1]|`` capped at ``C2``.

    Args:
        y_true: Target tensor; column 0 is read (assumes a 2-D tensor as
            supplied by Keras - TODO confirm for direct calls).
        y_pred: Prediction tensor with at least three columns.

    Returns:
        Scalar tensor, the batch mean of the per-row value (lower is better).
    """
    # The original called tf.dtypes.cast without keeping the result, so no
    # conversion actually happened. Bind the casted tensors.
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = tf.maximum(sigma, C1)
    delta = tf.minimum(tf.abs(y_true[:, 0] - fvc_pred), C2)
    sq2 = tf.sqrt(tf.constant(2, dtype=tf.float32))
    metric = (delta / sigma_clip) * sq2 + tf.math.log(sigma_clip * sq2)
    return K.mean(metric)
#============================#
def qloss(y_true, y_pred):
    """Pinball loss averaged over the quantiles 0.2, 0.5 and 0.8.

    Args:
        y_true: Target tensor (presumably shape (batch, 1), broadcast against
            the three prediction columns - TODO confirm).
        y_pred: Prediction tensor whose three columns pair with the quantiles.

    Returns:
        Scalar tensor: mean of ``max(q*e, (q-1)*e)`` with ``e = y_true - y_pred``.
    """
    quantiles = tf.constant(np.array([[0.2, 0.5, 0.8]]), dtype=tf.float32)
    residual = y_true - y_pred
    under_term = quantiles * residual
    over_term = (quantiles - 1) * residual
    return K.mean(tf.maximum(under_term, over_term))
#=============================#
def mloss(_lambda):
    """Return a loss that blends `qloss` and `score`.

    Args:
        _lambda: Weight of the `qloss` term; `score` gets ``1 - _lambda``.

    Returns:
        A ``loss(y_true, y_pred)`` callable.
    """
    def loss(y_true, y_pred):
        pinball_part = _lambda * qloss(y_true, y_pred)
        metric_part = (1 - _lambda) * score(y_true, y_pred)
        return pinball_part + metric_part
    return loss
#=================
def make_model():
    """Build and compile the three-hidden-layer variant of the network.

    Nine inputs feed three stacked 100-unit layers (ReLU, ELU, ReLU). Two
    3-unit heads follow: `p1` (linear) and `p2` (ReLU, hence non-negative). The
    model output is ``p1 + cumsum(p2, axis=1)``.

    Returns:
        A compiled Keras model using ``mloss(0.8)`` as loss and `score` as
        metric.
    """
    z = L.Input((9,), name="Patient")
    # The original applied d2 and d3 directly to the input `z`, so d1 and d2
    # never reached the output and the network had a single hidden layer.
    # Chain the layers as their numbering indicates.
    x = L.Dense(100, activation="relu", name="d1")(z)
    x = L.Dense(100, activation="elu", name="d2")(x)
    x = L.Dense(100, activation="relu", name="d3")(x)
    p1 = L.Dense(3, activation="linear", name="p1")(x)
    p2 = L.Dense(3, activation="relu", name="p2")(x)
    preds = L.Lambda(lambda heads: heads[0] + tf.cumsum(heads[1], axis=1),
                     name="preds")([p1, p2])

    model = M.Model(z, preds, name="CNN")
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    # NOTE(review): `decay=` and `epsilon=None` are kept as in the original and
    # may be rejected by newer Keras optimizers - verify against the installed
    # TensorFlow version.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9, beta_2=0.999,
                                         epsilon=None, decay=0.01, amsgrad=False)
    model.compile(loss=mloss(0.8), optimizer=optimizer, metrics=[score])
    return model

In [None]:
y = tr['FVC'].values
z = tr[FE].values
ze = submission[FE].values
pe = np.zeros((ze.shape[0], 3))
pred = np.zeros((z.shape[0], 3))
delta = np.zeros((z.shape[0], 3))

In [None]:
NFOLD =5
kf = GroupKFold(n_splits=NFOLD)
split_groups = tr['Patient']


In [None]:
%%time
cnt = 0
for cnt, (tr_idx, val_idx) in enumerate(kf.split(z, groups=split_groups), start=1):
    print(f"FOLD {cnt}")

    # Each fold trains a freshly built network.
    net = make_model()
    z_fit, y_fit = z[tr_idx], y[tr_idx]
    z_val, y_val = z[val_idx], y[val_idx]
    net.fit(z_fit, y_fit, batch_size=BATCH_SIZE, epochs=800,
            validation_data=(z_val, y_val), verbose=0)

    print("train", net.evaluate(z_fit, y_fit, verbose=0, batch_size=BATCH_SIZE))
    print("val", net.evaluate(z_val, y_val, verbose=0, batch_size=BATCH_SIZE))
    print("predict val...")
    pred[val_idx] = net.predict(z_val, batch_size=BATCH_SIZE, verbose=0)
    print("predict test...")
    pe += net.predict(ze, batch_size=BATCH_SIZE, verbose=0) / NFOLD

    # Fold-averaged prediction on every training row (held-out or not).
    delta += net.predict(z) / NFOLD
    
#==============

In [None]:
sigma_opt = mean_absolute_error(y, pred[:, 1])
unc = pred[:,2] - pred[:, 0]
sigma_mean = np.mean(unc)
print(sigma_opt, sigma_mean)

In [None]:
# Scoring

# `delta` holds the fold-averaged predictions for all training rows.
o_clipped = np.maximum(delta[:, 2] - delta[:, 0], 70)
abs_error = np.minimum(np.abs(delta[:, 1] - y), 1000)
sqrt = np.sqrt(2)
# The result lives in its own name: the original rebound `score`, shadowing the
# Keras metric function that make_model() passes to compile(), and overwrote
# `delta`, which made this cell fail when executed a second time. np.log
# replaces tf.math.log because every operand here is a NumPy array.
row_scores = -(sqrt * abs_error) / o_clipped - np.log(sqrt * o_clipped)

print(np.mean(row_scores))

In [None]:
print(unc.min(), unc.mean(), unc.max(), (unc>=0).mean())

In [None]:
submission['FVC1'] = pe[:, 1]
submission['Confidence1'] = pe[:, 2] - pe[:, 0]

In [None]:
submission1 = submission[['Patient_Week','FVC','Confidence','FVC1','Confidence1']].copy()

In [None]:
# Rows for which this model produced a prediction. The original built the
# right-hand mask from `subm`, the first model's frame, which made this cell
# depend on that unrelated frame. Use `submission1` on both sides.
has_pred = submission1['FVC1'].notnull()
submission1.loc[has_pred, 'FVC'] = submission1.loc[has_pred, 'FVC1']
if sigma_mean<70:
    # Average predicted spread is below 70: use the constant out-of-fold MAE
    # as confidence for every row instead.
    submission1['Confidence'] = sigma_opt
else:
    submission1.loc[has_pred, 'Confidence'] = submission1.loc[has_pred, 'Confidence1']

In [None]:
# Where test.csv already holds a measurement for a Patient_Week, write that
# value into the submission with a fixed confidence of 0.1.
otest = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
for patient, week, fvc in zip(otest['Patient'], otest['Weeks'], otest['FVC']):
    is_known = submission1['Patient_Week'] == patient + '_' + str(week)
    submission1.loc[is_known, 'FVC'] = fvc
    submission1.loc[is_known, 'Confidence'] = 0.1

In [None]:
submission1[["Patient_Week","FVC","Confidence"]].to_csv("submission_Quantile_Regression_V2.csv", index=False)

In [None]:
subm

In [None]:
submission1

# QUANTILE REGRESSION MODEL 3

In [None]:
train = pd.read_csv(f"{ROOT}/train.csv")
train.drop_duplicates(keep=False, inplace=True, subset=['Patient','Weeks'])
test = pd.read_csv(f"{ROOT}/test.csv")

print("add infos")
submission2 = pd.read_csv(f"{ROOT}/sample_submission.csv")
submission2['Patient'] = submission2['Patient_Week'].apply(lambda x:x.split('_')[0])
submission2['Weeks'] = submission2['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
submission2 =  submission2[['Patient','Weeks','Confidence','Patient_Week']]
submission2 = submission2.merge(test.drop('Weeks', axis=1), on="Patient")

In [None]:
# Tag each row with its origin so the combined frame can be split back later.
train['WHERE'] = 'train'
test['WHERE'] = 'val'
submission2['WHERE'] = 'test'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat
# stacks the frames the same way (row labels of each frame are kept).
data = pd.concat([train, test, submission2])

In [None]:
data['min_week'] = data['Weeks']
data.loc[data.WHERE=='test','min_week'] = np.nan
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

In [None]:
base = data.loc[data.Weeks == data.min_week]
base = base[['Patient','FVC']].copy()
base.columns = ['Patient','min_FVC']
base['nb'] = 1
base['nb'] = base.groupby('Patient')['nb'].transform('cumsum')
base = base[base.nb==1]
base.drop('nb', axis=1, inplace=True)

In [None]:
data = data.merge(base, on='Patient', how='left')
data['base_week'] = data['Weeks'] - data['min_week']
del base

In [None]:
COLS = ['Sex','SmokingStatus']
FE = []
for col in COLS:
    for mod in data[col].unique():
        FE.append(mod)
        data[mod] = (data[col] == mod).astype(int)

In [None]:
data['age'] = (data['Age'] - data['Age'].min() ) / ( data['Age'].max() - data['Age'].min() )
data['BASE'] = (data['min_FVC'] - data['min_FVC'].min() ) / ( data['min_FVC'].max() - data['min_FVC'].min() )
data['week'] = (data['base_week'] - data['base_week'].min() ) / ( data['base_week'].max() - data['base_week'].min() )
data['percent'] = (data['Percent'] - data['Percent'].min() ) / ( data['Percent'].max() - data['Percent'].min() )
FE += ['age','percent','week','BASE']

In [None]:
train = data.loc[data.WHERE=='train']
test = data.loc[data.WHERE=='val']
submission2 = data.loc[data.WHERE=='test']
del data

In [None]:
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")
#=============================#
def score(y_true, y_pred):
    """Keras metric: mean of ``sqrt(2)*delta/sigma + log(sqrt(2)*sigma)``.

    ``sigma`` is the spread ``y_pred[:, 2] - y_pred[:, 0]`` floored at ``C1``
    and ``delta`` is ``|y_true[:, 0] - y_pred[:, 1]|`` capped at ``C2``.

    Args:
        y_true: Target tensor; column 0 is read (assumes a 2-D tensor as
            supplied by Keras - TODO confirm for direct calls).
        y_pred: Prediction tensor with at least three columns.

    Returns:
        Scalar tensor, the batch mean of the per-row value (lower is better).
    """
    # The original called tf.dtypes.cast without keeping the result, so no
    # conversion actually happened. Bind the casted tensors.
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = tf.maximum(sigma, C1)
    delta = tf.minimum(tf.abs(y_true[:, 0] - fvc_pred), C2)
    sq2 = tf.sqrt(tf.constant(2, dtype=tf.float32))
    metric = (delta / sigma_clip) * sq2 + tf.math.log(sigma_clip * sq2)
    return K.mean(metric)
#============================#
def qloss(y_true, y_pred):
    """Pinball loss averaged over the quantiles 0.2, 0.5 and 0.8.

    Args:
        y_true: Target tensor (presumably shape (batch, 1), broadcast against
            the three prediction columns - TODO confirm).
        y_pred: Prediction tensor whose three columns pair with the quantiles.

    Returns:
        Scalar tensor: mean of ``max(q*e, (q-1)*e)`` with ``e = y_true - y_pred``.
    """
    quantiles = tf.constant(np.array([[0.2, 0.50, 0.8]]), dtype=tf.float32)
    residual = y_true - y_pred
    under_term = quantiles * residual
    over_term = (quantiles - 1) * residual
    return K.mean(tf.maximum(under_term, over_term))
#=============================#
def mloss(_lambda):
    """Return a loss that blends `qloss` and `score`.

    Args:
        _lambda: Weight of the `qloss` term; `score` gets ``1 - _lambda``.

    Returns:
        A ``loss(y_true, y_pred)`` callable.
    """
    def loss(y_true, y_pred):
        pinball_part = _lambda * qloss(y_true, y_pred)
        metric_part = (1 - _lambda) * score(y_true, y_pred)
        return pinball_part + metric_part
    return loss
#=================
def make_model():
    """Build and compile the three-output dense network.

    Nine inputs feed two 100-unit ReLU layers. Two 3-unit heads follow: `p1`
    (linear) and `p2` (ReLU, hence non-negative). The model output is
    ``p1 + cumsum(p2, axis=1)``.

    Returns:
        A compiled Keras model using ``mloss(0.65)`` as loss and `score` as
        metric.
    """
    z = L.Input((9,), name="Patient")
    x = L.Dense(100, activation="relu", name="d1")(z)
    x = L.Dense(100, activation="relu", name="d2")(x)
    p1 = L.Dense(3, activation="linear", name="p1")(x)
    p2 = L.Dense(3, activation="relu", name="p2")(x)
    preds = L.Lambda(lambda heads: heads[0] + tf.cumsum(heads[1], axis=1),
                     name="preds")([p1, p2])

    model = M.Model(z, preds, name="CNN")
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    # NOTE(review): `decay=` and `epsilon=None` are kept as in the original and
    # may be rejected by newer Keras optimizers - verify against the installed
    # TensorFlow version.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.11, beta_1=0.9, beta_2=0.999,
                                         epsilon=None, decay=0.01, amsgrad=False)
    model.compile(loss=mloss(0.65), optimizer=optimizer, metrics=[score])
    return model


In [None]:
net = make_model()
print(net.summary())
print(net.count_params())

In [None]:
# Build the arrays from this section's own frames (`train`, `submission2`).
# The original read `tr` and `sub`, frames left over from the earlier models,
# so the cell only worked if those sections had been executed first.
y = train['FVC'].values
z = train[FE].values
ze = submission2[FE].values
# Accumulators with one column per model output:
pe = np.zeros((ze.shape[0], 3))     # fold-averaged predictions for the submission rows
pred = np.zeros((z.shape[0], 3))    # predictions for each row from the fold that held it out
delta = np.zeros((z.shape[0], 3))   # fold-averaged predictions for all training rows

In [None]:
NFOLD = 5
# Group by patient so that all rows of one patient land in the same fold.
kf = GroupKFold(n_splits=NFOLD)
# Take the groups from this section's `train` frame; the original used `tr`,
# a frame left over from an earlier model.
split_groups = train['Patient']

In [None]:
%%time
cnt = 0
for cnt, (tr_idx, val_idx) in enumerate(kf.split(z, groups=split_groups), start=1):
    print(f"FOLD {cnt}")

    # Each fold trains a freshly built network.
    net = make_model()
    z_fit, y_fit = z[tr_idx], y[tr_idx]
    z_val, y_val = z[val_idx], y[val_idx]
    net.fit(z_fit, y_fit, batch_size=BATCH_SIZE, epochs=800,
            validation_data=(z_val, y_val), verbose=0)

    print("train", net.evaluate(z_fit, y_fit, verbose=0, batch_size=BATCH_SIZE))
    print("val", net.evaluate(z_val, y_val, verbose=0, batch_size=BATCH_SIZE))
    print("predict val...")
    pred[val_idx] = net.predict(z_val, batch_size=BATCH_SIZE, verbose=0)
    print("predict test...")
    pe += net.predict(ze, batch_size=BATCH_SIZE, verbose=0) / NFOLD

    # Fold-averaged prediction on every training row (held-out or not).
    delta += net.predict(z) / NFOLD
    
#==============

In [None]:
# Scoring

# `delta` holds the fold-averaged predictions for all training rows.
o_clipped = np.maximum(delta[:, 2] - delta[:, 0], 70)
abs_error = np.minimum(np.abs(delta[:, 1] - y), 1000)
sqrt = np.sqrt(2)
# The result lives in its own name: the original rebound `score`, shadowing the
# Keras metric function that make_model() passes to compile(), and overwrote
# `delta`, which made this cell fail when executed a second time. np.log
# replaces tf.math.log because every operand here is a NumPy array.
row_scores = -(sqrt * abs_error) / o_clipped - np.log(sqrt * o_clipped)

print(np.mean(row_scores))

In [None]:
submission2['FVC1'] = pe[:, 1]
submission2['Confidence1'] = pe[:, 2] - pe[:, 0]

In [None]:
submission2 = submission2[['Patient_Week','FVC','Confidence','FVC1','Confidence1']].copy()

In [None]:
# Rows for which this model produced a prediction.
has_pred = submission2['FVC1'].notnull()
submission2.loc[has_pred, 'FVC'] = submission2.loc[has_pred, 'FVC1']
# This section never computed sigma_opt / sigma_mean, so the original silently
# reused the values of an earlier model. Derive them from this model's
# out-of-fold predictions before branching on them.
sigma_opt = mean_absolute_error(y, pred[:, 1])
sigma_mean = np.mean(pred[:, 2] - pred[:, 0])
if sigma_mean<70:
    submission2['Confidence'] = sigma_opt
else:
    submission2.loc[has_pred, 'Confidence'] = submission2.loc[has_pred, 'Confidence1']

In [None]:
# Where test.csv already holds a measurement for a Patient_Week, write that
# value into the submission with a fixed confidence of 0.1.
otest = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
for patient, week, fvc in zip(otest['Patient'], otest['Weeks'], otest['FVC']):
    is_known = submission2['Patient_Week'] == patient + '_' + str(week)
    submission2.loc[is_known, 'FVC'] = fvc
    submission2.loc[is_known, 'Confidence'] = 0.1

In [None]:
submission2[["Patient_Week","FVC","Confidence"]].to_csv("submission_Quantile_Regression_V3.csv", index=False)


# QUANTILE REGRESSION v4

In [None]:
tr = pd.read_csv(f"{ROOT}/train.csv")
tr.drop_duplicates(keep=False, inplace=True, subset=['Patient','Weeks'])
chunk = pd.read_csv(f"{ROOT}/test.csv")

print("add infos")
submission3 = pd.read_csv(f"{ROOT}/sample_submission.csv")
submission3['Patient'] = submission3['Patient_Week'].apply(lambda x:x.split('_')[0])
submission3['Weeks'] = submission3['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
submission3 =  submission3[['Patient','Weeks','Confidence','Patient_Week']]
submission3 = submission3.merge(chunk.drop('Weeks', axis=1), on="Patient")

In [None]:
# Tag each row with its origin so the combined frame can be split back later.
tr['WHERE'] = 'train'
chunk['WHERE'] = 'val'
submission3['WHERE'] = 'test'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat
# stacks the frames the same way (row labels of each frame are kept).
data = pd.concat([tr, chunk, submission3])

In [None]:
data['min_week'] = data['Weeks']
data.loc[data.WHERE=='test','min_week'] = np.nan
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

In [None]:
base = data.loc[data.Weeks == data.min_week]
base = base[['Patient','FVC']].copy()
base.columns = ['Patient','min_FVC']
base['nb'] = 1
base['nb'] = base.groupby('Patient')['nb'].transform('cumsum')
base = base[base.nb==1]
base.drop('nb', axis=1, inplace=True)
data = data.merge(base, on='Patient', how='left')
data['base_week'] = data['Weeks'] - data['min_week']
del base

In [None]:
COLS = ['Sex','SmokingStatus']
FE = []
for col in COLS:
    for mod in data[col].unique():
        FE.append(mod)
        data[mod] = (data[col] == mod).astype(int)
data['age'] = (data['Age'] - data['Age'].min() ) / ( data['Age'].max() - data['Age'].min() )
data['BASE'] = (data['min_FVC'] - data['min_FVC'].min() ) / ( data['min_FVC'].max() - data['min_FVC'].min() )
data['week'] = (data['base_week'] - data['base_week'].min() ) / ( data['base_week'].max() - data['base_week'].min() )
data['percent'] = (data['Percent'] - data['Percent'].min() ) / ( data['Percent'].max() - data['Percent'].min() )
FE += ['age','percent','week','BASE']
tr = data.loc[data.WHERE=='train']
chunk = data.loc[data.WHERE=='val']
submission3 = data.loc[data.WHERE=='test']
del data
tr.shape, chunk.shape, submission3.shape

In [None]:
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")
#=============================#
def score(y_true, y_pred):
    """Keras metric: mean of ``sqrt(2)*delta/sigma + log(sqrt(2)*sigma)``.

    ``sigma`` is the spread ``y_pred[:, 2] - y_pred[:, 0]`` floored at ``C1``
    and ``delta`` is ``|y_true[:, 0] - y_pred[:, 1]|`` capped at ``C2``.

    Args:
        y_true: Target tensor; column 0 is read (assumes a 2-D tensor as
            supplied by Keras - TODO confirm for direct calls).
        y_pred: Prediction tensor with at least three columns.

    Returns:
        Scalar tensor, the batch mean of the per-row value (lower is better).
    """
    # The original called tf.dtypes.cast without keeping the result, so no
    # conversion actually happened. Bind the casted tensors.
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = tf.maximum(sigma, C1)
    delta = tf.minimum(tf.abs(y_true[:, 0] - fvc_pred), C2)
    sq2 = tf.sqrt(tf.constant(2, dtype=tf.float32))
    metric = (delta / sigma_clip) * sq2 + tf.math.log(sigma_clip * sq2)
    return K.mean(metric)
#============================#
def qloss(y_true, y_pred):
    """Pinball loss averaged over the quantiles 0.2, 0.5 and 0.8.

    Args:
        y_true: Target tensor (presumably shape (batch, 1), broadcast against
            the three prediction columns - TODO confirm).
        y_pred: Prediction tensor whose three columns pair with the quantiles.

    Returns:
        Scalar tensor: mean of ``max(q*e, (q-1)*e)`` with ``e = y_true - y_pred``.
    """
    quantiles = tf.constant(np.array([[0.2, 0.5, 0.8]]), dtype=tf.float32)
    residual = y_true - y_pred
    under_term = quantiles * residual
    over_term = (quantiles - 1) * residual
    return K.mean(tf.maximum(under_term, over_term))
#=============================#
def mloss(_lambda):
    """Return a loss that blends `qloss` and `score`.

    Args:
        _lambda: Weight of the `qloss` term; `score` gets ``1 - _lambda``.

    Returns:
        A ``loss(y_true, y_pred)`` callable.
    """
    def loss(y_true, y_pred):
        pinball_part = _lambda * qloss(y_true, y_pred)
        metric_part = (1 - _lambda) * score(y_true, y_pred)
        return pinball_part + metric_part
    return loss
#=================
def make_model():
    """Build and compile the wider (200-unit) variant of the network.

    Nine inputs feed two 200-unit ReLU layers. Two 3-unit heads follow: `p1`
    (linear) and `p2` (ReLU, hence non-negative). The model output is
    ``p1 + cumsum(p2, axis=1)``.

    Returns:
        A compiled Keras model using ``mloss(0.775)`` as loss and `score` as
        metric.
    """
    z = L.Input((9,), name="Patient")
    x = L.Dense(200, activation="relu", name="d1")(z)
    x = L.Dense(200, activation="relu", name="d2")(x)
    p1 = L.Dense(3, activation="linear", name="p1")(x)
    p2 = L.Dense(3, activation="relu", name="p2")(x)
    preds = L.Lambda(lambda heads: heads[0] + tf.cumsum(heads[1], axis=1),
                     name="preds")([p1, p2])

    model = M.Model(z, preds, name="CNN")
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    # NOTE(review): `decay=` and `epsilon=None` are kept as in the original and
    # may be rejected by newer Keras optimizers - verify against the installed
    # TensorFlow version.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9, beta_2=0.999,
                                         epsilon=None, decay=0.01, amsgrad=False)
    model.compile(loss=mloss(0.775), optimizer=optimizer, metrics=[score])
    return model


In [None]:
net = make_model()
print(net.summary())
print(net.count_params())

In [None]:
y = tr['FVC'].values
z = tr[FE].values
# Use this section's own submission frame. The original read `sub`, the frame
# left over from the first model, so the cell only worked if that section had
# been executed first.
ze = submission3[FE].values
# Accumulators with one column per model output:
pe = np.zeros((ze.shape[0], 3))     # fold-averaged predictions for the submission rows
pred = np.zeros((z.shape[0], 3))    # predictions for each row from the fold that held it out
delta = np.zeros((z.shape[0], 3))   # fold-averaged predictions for all training rows

In [None]:
NFOLD =5
kf = GroupKFold(n_splits=NFOLD)
split_groups = tr['Patient']


In [None]:
%%time
cnt = 0
for cnt, (tr_idx, val_idx) in enumerate(kf.split(z, groups=split_groups), start=1):
    print(f"FOLD {cnt}")

    # Each fold trains a freshly built network.
    net = make_model()
    z_fit, y_fit = z[tr_idx], y[tr_idx]
    z_val, y_val = z[val_idx], y[val_idx]
    net.fit(z_fit, y_fit, batch_size=BATCH_SIZE, epochs=800,
            validation_data=(z_val, y_val), verbose=0)

    print("train", net.evaluate(z_fit, y_fit, verbose=0, batch_size=BATCH_SIZE))
    print("val", net.evaluate(z_val, y_val, verbose=0, batch_size=BATCH_SIZE))
    print("predict val...")
    pred[val_idx] = net.predict(z_val, batch_size=BATCH_SIZE, verbose=0)
    print("predict test...")
    pe += net.predict(ze, batch_size=BATCH_SIZE, verbose=0) / NFOLD

    # Fold-averaged prediction on every training row (held-out or not).
    delta += net.predict(z) / NFOLD
    
#==============

In [None]:
# Scoring

# `delta` holds the fold-averaged predictions for all training rows.
o_clipped = np.maximum(delta[:, 2] - delta[:, 0], 70)
abs_error = np.minimum(np.abs(delta[:, 1] - y), 1000)
sqrt = np.sqrt(2)
# The result lives in its own name: the original rebound `score`, shadowing the
# Keras metric function that make_model() passes to compile(), and overwrote
# `delta`, which made this cell fail when executed a second time. np.log
# replaces tf.math.log because every operand here is a NumPy array.
row_scores = -(sqrt * abs_error) / o_clipped - np.log(sqrt * o_clipped)

print(np.mean(row_scores))

In [None]:
submission3['FVC1'] = pe[:, 1]
submission3['Confidence1'] = pe[:, 2] - pe[:, 0]
submission3 = submission3[['Patient_Week','FVC','Confidence','FVC1','Confidence1']].copy()

In [None]:
# Rows for which this model produced a prediction.
has_pred = submission3['FVC1'].notnull()
submission3.loc[has_pred, 'FVC'] = submission3.loc[has_pred, 'FVC1']
# This section never computed sigma_opt / sigma_mean, so the original silently
# reused the values of an earlier model. Derive them from this model's
# out-of-fold predictions before branching on them.
sigma_opt = mean_absolute_error(y, pred[:, 1])
sigma_mean = np.mean(pred[:, 2] - pred[:, 0])
if sigma_mean<70:
    submission3['Confidence'] = sigma_opt
else:
    submission3.loc[has_pred, 'Confidence'] = submission3.loc[has_pred, 'Confidence1']
# Where test.csv already holds a measurement for a Patient_Week, write that
# value into the submission with a fixed confidence of 0.1.
otest = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
for i in range(len(otest)):
    is_known = submission3['Patient_Week']==otest.Patient[i]+'_'+str(otest.Weeks[i])
    submission3.loc[is_known, 'FVC'] = otest.FVC[i]
    submission3.loc[is_known, 'Confidence'] = 0.1
submission3[["Patient_Week","FVC","Confidence"]].to_csv("submission_Quantile_Regression_v4.csv", index=False)

# QUANTILE REGRESSION V5

In [None]:
tr = pd.read_csv(f"{ROOT}/train.csv")
tr.drop_duplicates(keep=False, inplace=True, subset=['Patient','Weeks'])
chunk = pd.read_csv(f"{ROOT}/test.csv")
submission4 = pd.read_csv(f"{ROOT}/sample_submission.csv")

submission4['Patient'] = submission4['Patient_Week'].apply(lambda x:x.split('_')[0])
submission4['Weeks'] = submission4['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
submission4 =  submission4[['Patient','Weeks','Confidence','Patient_Week']]
submission4 = submission4.merge(chunk.drop('Weeks', axis=1), on="Patient")

In [None]:
# Tag each row with its origin so the combined frame can be split back later.
tr['WHERE'] = 'train'
chunk['WHERE'] = 'val'
submission4['WHERE'] = 'test'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. pd.concat
# stacks the frames the same way (row labels of each frame are kept).
data = pd.concat([tr, chunk, submission4])

In [None]:
data['min_week'] = data['Weeks']
data.loc[data.WHERE=='test','min_week'] = np.nan
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')
base = data.loc[data.Weeks == data.min_week]
base = base[['Patient','FVC']].copy()
base.columns = ['Patient','min_FVC']
base['nb'] = 1
base['nb'] = base.groupby('Patient')['nb'].transform('cumsum')
base = base[base.nb==1]
base.drop('nb', axis=1, inplace=True)
data = data.merge(base, on='Patient', how='left')
data['base_week'] = data['Weeks'] - data['min_week']
del base
COLS = ['Sex','SmokingStatus']
FE = []
for col in COLS:
    for mod in data[col].unique():
        FE.append(mod)
        data[mod] = (data[col] == mod).astype(int)
data['age'] = (data['Age'] - data['Age'].min() ) / ( data['Age'].max() - data['Age'].min() )
data['BASE'] = (data['min_FVC'] - data['min_FVC'].min() ) / ( data['min_FVC'].max() - data['min_FVC'].min() )
data['week'] = (data['base_week'] - data['base_week'].min() ) / ( data['base_week'].max() - data['base_week'].min() )
data['percent'] = (data['Percent'] - data['Percent'].min() ) / ( data['Percent'].max() - data['Percent'].min() )
FE += ['age','percent','week','BASE']
# Undo the stacking: recover the three frames from their WHERE tag.
tr = data.loc[data.WHERE=='train']
chunk = data.loc[data.WHERE=='val']
submission4 = data.loc[data.WHERE=='test']
del data
# The original had a bare `tr.shape, chunk.shape, sub.shape` expression here.
# In the middle of a cell it displays nothing, and it referenced `sub` from an
# earlier model. It was followed by a pasted output tuple
# ((1535, 22), (5, 22), (730, 22)) left in as code. Print this section's
# shapes explicitly instead.
print(tr.shape, chunk.shape, submission4.shape)
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")
#=============================#
def score(y_true, y_pred):
    """Keras metric: mean of ``sqrt(2)*delta/sigma + log(sqrt(2)*sigma)``.

    ``sigma`` is the spread ``y_pred[:, 2] - y_pred[:, 0]`` floored at ``C1``
    and ``delta`` is ``|y_true[:, 0] - y_pred[:, 1]|`` capped at ``C2``.

    Args:
        y_true: Target tensor; column 0 is read (assumes a 2-D tensor as
            supplied by Keras - TODO confirm for direct calls).
        y_pred: Prediction tensor with at least three columns.

    Returns:
        Scalar tensor, the batch mean of the per-row value (lower is better).
    """
    # The original called tf.dtypes.cast without keeping the result, so no
    # conversion actually happened. Bind the casted tensors.
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = tf.maximum(sigma, C1)
    delta = tf.minimum(tf.abs(y_true[:, 0] - fvc_pred), C2)
    sq2 = tf.sqrt(tf.constant(2, dtype=tf.float32))
    metric = (delta / sigma_clip) * sq2 + tf.math.log(sigma_clip * sq2)
    return K.mean(metric)
#============================#
def qloss(y_true, y_pred):
    """Pinball loss averaged over the quantiles 0.2, 0.5 and 0.8.

    Args:
        y_true: Target tensor (presumably shape (batch, 1), broadcast against
            the three prediction columns - TODO confirm).
        y_pred: Prediction tensor whose three columns pair with the quantiles.

    Returns:
        Scalar tensor: mean of ``max(q*e, (q-1)*e)`` with ``e = y_true - y_pred``.
    """
    quantiles = tf.constant(np.array([[0.2, 0.5, 0.8]]), dtype=tf.float32)
    residual = y_true - y_pred
    under_term = quantiles * residual
    over_term = (quantiles - 1) * residual
    return K.mean(tf.maximum(under_term, over_term))
#=============================#
def mloss(_lambda):
    """Return a loss that blends `qloss` and `score`.

    Args:
        _lambda: Weight of the `qloss` term; `score` gets ``1 - _lambda``.

    Returns:
        A ``loss(y_true, y_pred)`` callable.
    """
    def loss(y_true, y_pred):
        pinball_part = _lambda * qloss(y_true, y_pred)
        metric_part = (1 - _lambda) * score(y_true, y_pred)
        return pinball_part + metric_part
    return loss
#=================
def make_model():
    """Build and compile the three-output dense network.

    Nine inputs feed two 100-unit ReLU layers. Two 3-unit heads follow: `p1`
    (linear) and `p2` (ReLU, hence non-negative). The model output is
    ``p1 + cumsum(p2, axis=1)``.

    Returns:
        A compiled Keras model using ``mloss(0.65)`` as loss and `score` as
        metric.
    """
    z = L.Input((9,), name="Patient")
    x = L.Dense(100, activation="relu", name="d1")(z)
    x = L.Dense(100, activation="relu", name="d2")(x)
    p1 = L.Dense(3, activation="linear", name="p1")(x)
    p2 = L.Dense(3, activation="relu", name="p2")(x)
    preds = L.Lambda(lambda heads: heads[0] + tf.cumsum(heads[1], axis=1),
                     name="preds")([p1, p2])

    model = M.Model(z, preds, name="CNN")
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    # NOTE(review): `decay=` and `epsilon=None` are kept as in the original and
    # may be rejected by newer Keras optimizers - verify against the installed
    # TensorFlow version.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1, beta_1=0.9, beta_2=0.999,
                                         epsilon=None, decay=0.01, amsgrad=False)
    model.compile(loss=mloss(0.65), optimizer=optimizer, metrics=[score])
    return model

In [None]:
y = tr['FVC'].values
z = tr[FE].values
# Use this section's own submission frame. The original read `sub`, the frame
# left over from the first model, so the cell only worked if that section had
# been executed first.
ze = submission4[FE].values
# Accumulators with one column per model output:
pe = np.zeros((ze.shape[0], 3))     # fold-averaged predictions for the submission rows
pred = np.zeros((z.shape[0], 3))    # predictions for each row from the fold that held it out
delta = np.zeros((z.shape[0], 3))   # fold-averaged predictions for all training rows
NFOLD = 5
# Group by patient so that all rows of one patient land in the same fold.
kf = GroupKFold(n_splits=NFOLD)
split_groups = tr['Patient']


In [None]:
%%time
cnt = 0
for cnt, (tr_idx, val_idx) in enumerate(kf.split(z, groups=split_groups), start=1):
    print(f"FOLD {cnt}")

    # Each fold trains a freshly built network.
    net = make_model()
    z_fit, y_fit = z[tr_idx], y[tr_idx]
    z_val, y_val = z[val_idx], y[val_idx]
    net.fit(z_fit, y_fit, batch_size=BATCH_SIZE, epochs=800,
            validation_data=(z_val, y_val), verbose=0)

    print("train", net.evaluate(z_fit, y_fit, verbose=0, batch_size=BATCH_SIZE))
    print("val", net.evaluate(z_val, y_val, verbose=0, batch_size=BATCH_SIZE))
    print("predict val...")
    pred[val_idx] = net.predict(z_val, batch_size=BATCH_SIZE, verbose=0)
    print("predict test...")
    pe += net.predict(ze, batch_size=BATCH_SIZE, verbose=0) / NFOLD

    # Fold-averaged prediction on every training row (held-out or not).
    delta += net.predict(z) / NFOLD
    
#==============

In [None]:
# Scoring

# `delta` holds the fold-averaged predictions for all training rows.
o_clipped = np.maximum(delta[:, 2] - delta[:, 0], 70)
abs_error = np.minimum(np.abs(delta[:, 1] - y), 1000)
sqrt = np.sqrt(2)
# The result lives in its own name: the original rebound `score`, shadowing the
# Keras metric function that make_model() passes to compile(), and overwrote
# `delta`, which made this cell fail when executed a second time. np.log
# replaces tf.math.log because every operand here is a NumPy array.
row_scores = -(sqrt * abs_error) / o_clipped - np.log(sqrt * o_clipped)

print(np.mean(row_scores))

In [None]:
submission4['FVC1'] = pe[:, 1]
submission4['Confidence1'] = pe[:, 2] - pe[:, 0]
submission4 = submission4[['Patient_Week','FVC','Confidence','FVC1','Confidence1']].copy()

# Rows for which this model produced a prediction.
has_pred = submission4['FVC1'].notnull()
submission4.loc[has_pred, 'FVC'] = submission4.loc[has_pred, 'FVC1']
# This section never computed sigma_opt / sigma_mean, so the original silently
# reused the values of an earlier model. Derive them from this model's
# out-of-fold predictions before branching on them.
sigma_opt = mean_absolute_error(y, pred[:, 1])
sigma_mean = np.mean(pred[:, 2] - pred[:, 0])
if sigma_mean<70:
    submission4['Confidence'] = sigma_opt
else:
    submission4.loc[has_pred, 'Confidence'] = submission4.loc[has_pred, 'Confidence1']

In [None]:
# Where test.csv already holds a measurement for a Patient_Week, write that
# value into the submission with a fixed confidence of 0.1.
otest = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
for patient, week, fvc in zip(otest['Patient'], otest['Weeks'], otest['FVC']):
    is_known = submission4['Patient_Week'] == patient + '_' + str(week)
    submission4.loc[is_known, 'FVC'] = fvc
    submission4.loc[is_known, 'Confidence'] = 0.1

In [None]:
submission4[["Patient_Week","FVC","Confidence"]].to_csv("submission_Quantile_Regression_v5.csv", index=False)

# BAYESIAN RIDGE

In [None]:
df_train = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
df_test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
sub = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

print('Train shape: ', df_train.shape)
print('Number of unique customers in train: {}'.format(df_train['Patient'].nunique()))
print('Test shape:', df_test.shape)

In [None]:
df_base = df_train.drop_duplicates(subset='Patient', keep='first')
df_base = df_base[['Patient', 'Weeks', 'FVC', 
                   'Percent', 'Age']].rename(columns={'Weeks': 'base_week',
                                                      'Percent': 'base_percent',
                                                      'Age': 'base_age',
                                                      'FVC': 'base_FVC'})
df_base.head(3)

In [None]:
df_train['visit'] = 1
df_train['visit'] = df_train[['Patient', 'visit']].groupby('Patient').cumsum()
df_train = df_train.loc[df_train['visit'] > 0, :]

In [None]:
# Merge with base info
df_train = pd.merge(df_train,
                    df_base,
                    on='Patient',
                    how='left')
print(df_train.shape)
df_train.head(3)

In [None]:
df_train['weeks_passed'] = df_train['Weeks'] - df_train['base_week']
df_train = pd.get_dummies(df_train, columns=['Sex', 'SmokingStatus'])
sub['Patient'] = sub['Patient_Week'].apply(lambda x: x.split('_')[0])
sub['Weeks'] = sub['Patient_Week'].apply(lambda x: x.split('_')[1]).astype(int)
sub.head()

In [None]:
df_test = df_test.rename(columns={'Weeks': 'base_week', 
                                  'Percent': 'base_percent',
                                  'Age': 'base_age',
                                  'FVC': 'base_FVC'})
df_test = pd.merge(sub,
                   df_test,
                   on='Patient',
                   how='right')
df_test = pd.get_dummies(df_test, columns=['Sex', 'SmokingStatus'])
df_test['weeks_passed'] = df_test['Weeks'] - df_test['base_week']
df_test.head()

In [None]:
# Columns present in the training frame (minus identifiers, target and helper
# columns) but absent from the test frame.
missing_columns = np.setdiff1d(df_train.drop(['Patient', 'FVC', 'Percent', 'Age', 'visit'], axis = 1).columns, df_test.columns)
if len(missing_columns) > 0:
    # '\ ' is not a valid escape sequence and triggers a warning on current
    # Python versions; escaping the backslash prints the same text.
    print('/!\\ Missing columns in test: ', missing_columns)
    # Add each missing column, filled with 0.
    for col in missing_columns:
        df_test[col] = 0

In [None]:
def OSIC_metric(y_true, y_pred, y_pred_std):
    """Mean of ``-sqrt(2) * delta / sigma - log(sqrt(2) * sigma)`` over all rows.

    ``delta`` is the absolute error ``|y_true - y_pred|`` capped at 1000 and
    ``sigma`` is ``y_pred_std`` raised to at least 70. Higher is better.
    """
    root_two = np.sqrt(2)
    sigma = np.clip(y_pred_std, 70, np.inf)
    abs_err = np.clip(abs(y_true - y_pred), 0, 1000)
    per_row = -(root_two*abs_err/sigma) - np.log(root_two*sigma)
    return np.mean(per_row)

In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge

class Model():
    """Patient-grouped cross-validation wrapper around a regressor with predictive std.

    The wrapped regressor must accept ``predict(X, return_std=True)`` and return a
    ``(mean, std)`` pair, as scikit-learn's ``BayesianRidge`` does.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, model=None, n_splits=2):
        """Create the wrapper.

        Args:
            model: Regressor to wrap. When ``None`` a ``BayesianRidge`` with
                ``alpha_1=0.1, alpha_2=0.1, lambda_1=0.03, lambda_2=0.01`` is created.
            n_splits: Number of ``GroupKFold`` splits.
        """
        # Build the default per instance. A default-argument estimator would be created
        # once at definition time and then shared (and refit) by every Model.
        if model is None:
            model = BayesianRidge(alpha_1=0.1, alpha_2=0.1, lambda_1=0.03, lambda_2=0.01)
        self.regressor = model
        self.n_splits = n_splits
        # Splitter that keeps all rows of a group in the same fold.
        self.gkf = GroupKFold(n_splits=n_splits)
        # Feature columns read from the frames passed to fit_predict_cv.
        self.train_cols = ['Weeks', 'base_week', 'base_FVC',
                           'base_percent', 'base_age', 'weeks_passed', 'Sex_Female',
                           'Sex_Male', 'SmokingStatus_Currently smokes',
                           'SmokingStatus_Ex-smoker', 'SmokingStatus_Never smoked']

    def fit(self, X, y):
        """Fit the wrapped regressor on ``X`` and ``y``."""
        self.regressor.fit(X, y)

    def predict(self, X):
        """Return the regressor's ``(mean, std)`` predictions for ``X``."""
        return self.regressor.predict(X, return_std=True)

    def fit_predict_cv(self, df, df_test=None):
        """Run the grouped cross-validation and optionally predict a test frame.

        Args:
            df: Training frame with ``self.train_cols``, an ``'FVC'`` target column and a
                ``'Patient'`` column used as the grouping key.
            df_test: Optional frame with ``self.train_cols``. ``None`` or an empty frame
                skips test prediction.

        Returns:
            dict with ``'oof'`` and ``'oof_std'`` (one out-of-fold value per row of
            ``df``) and, when a test frame was given, ``'pred_sub'`` and
            ``'pred_sub_std'`` averaged over the folds.
        """
        has_test = df_test is not None and len(df_test) > 0
        target = 'FVC'

        oof = np.zeros((len(df), ))
        oof_std = np.zeros_like(oof)

        if has_test:
            X_test = df_test[self.train_cols]
            pred_sub = np.zeros((len(df_test), self.n_splits))
            pred_sub_std = np.zeros_like(pred_sub)

        for i, (train_idx, val_idx) in enumerate(self.gkf.split(df, groups=df['Patient'])):
            # The splitter yields row positions, so select with .iloc. Label-based .loc
            # only gives the same rows when the index happens to be 0..n-1.
            fold_train = df.iloc[train_idx]
            fold_val = df.iloc[val_idx]
            X_train, y_train = fold_train[self.train_cols], fold_train[target]
            X_val, y_val = fold_val[self.train_cols], fold_val[target]

            self.fit(X_train, y_train)

            pred_train, pred_train_std = self.predict(X_train)
            pred_val, pred_val_std = self.predict(X_val)

            if has_test:
                pred_sub[:, i], pred_sub_std[:, i] = self.predict(X_test)

            oof[val_idx] = pred_val
            oof_std[val_idx] = pred_val_std
            print('Train score: {0:.2f} | Test score: {1:.2f}'.format(OSIC_metric(y_train, pred_train, pred_train_std),
                                                                    OSIC_metric(y_val, pred_val, pred_val_std)))
        print('OOF score: {0:.4f}'.format(OSIC_metric(df[target], oof, oof_std)))

        res = {'oof': oof, 'oof_std': oof_std}
        if has_test:
            # Average the per-fold test predictions and standard deviations.
            res['pred_sub'] = pred_sub.mean(axis=1)
            res['pred_sub_std'] = pred_sub_std.mean(axis=1)

        return res
# Run the grouped cross-validation on the training frame and predict the test frame.
fvc_model = Model()
res = fvc_model.fit_predict_cv(df_train, df_test)


In [None]:
# Store the fold-averaged predictions and standard deviations on the test frame.
df_test['FVC'] = res['pred_sub']
df_test['Confidence'] = res['pred_sub_std']

# Line the predictions up against the Patient_Week keys of `sub`.
submission5 = sub[['Patient_Week']].merge(
    df_test[['Patient_Week', 'FVC', 'Confidence']],
    how='left',
    on='Patient_Week',
)
submission5.head()


# RIDGE

In [None]:
import os
import numpy as np
import pandas as pd
import random
import math

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.metrics import mean_squared_error
import category_encoders as ce

from sklearn.linear_model import Ridge
from functools import partial
import scipy as sp

import warnings
warnings.filterwarnings("ignore")

In [None]:
def seed_everything(seed=777):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED`` with ``seed``.

    This definition replaces the ``seed_everything`` from the top of the notebook and,
    unlike that one, does not seed TensorFlow.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [None]:
# NOTE(review): despite the "DICT" in its name the value looks like an output directory
# path, and it is not used in this section — confirm.
OUTPUT_DICT = './'

# Name of the key column of the submission file.
ID = 'Patient_Week'
# Column the first Ridge model is trained to predict.
TARGET = 'FVC'
SEED = 37
seed_everything(seed=SEED)

# Number of GroupKFold splits used below.
N_FOLD = 5

In [None]:
# Raw competition files. `otest` keeps the untouched test.csv so it can be reused later.
train = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
otest = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

In [None]:
# construct train input
# For every patient, pair each measurement (as the baseline) with each of that patient's
# measurements (as the week to predict). Pairs with zero elapsed weeks are dropped.
train = pd.concat([train,otest])
rename_cols = {'Weeks': 'base_Week', 'FVC': 'base_FVC', 'Age': 'base_Age'}
drop_cols = ['Age', 'Sex', 'SmokingStatus', 'Percent']
gb = train.groupby('Patient')
tk0 = tqdm(gb, total=len(gb))
# Collect the pieces and concatenate once. Growing a frame with pd.concat inside the loop
# re-copies everything accumulated so far and starts from an empty DataFrame.
pair_frames = []
for _, usr_df in tk0:
    # Prediction side of the pairs: the same for every baseline week of this patient.
    predict_side = usr_df.drop(columns=drop_cols).rename(columns={'Weeks': 'predict_Week'})
    for week, tmp in usr_df.groupby('Weeks'):
        tmp = tmp.rename(columns=rename_cols)
        pairs = predict_side.merge(tmp, on='Patient')
        pairs['Week_passed'] = pairs['predict_Week'] - pairs['base_Week']
        pair_frames.append(pairs)
output = pd.concat(pair_frames)

train = output[output['Week_passed']!=0].reset_index(drop=True)

In [None]:
# construct test input
# The measurement stored in test.csv is used as each test patient's baseline.
test = otest.rename(columns={'Weeks': 'base_Week', 'FVC': 'base_FVC', 'Age': 'base_Age'})
submission6 = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')
# Split the 'Patient_Week' key on '_' into the patient id and the integer week to predict.
submission6['Patient'] = submission6['Patient_Week'].str.split('_').str[0]
submission6['predict_Week'] = submission6['Patient_Week'].str.split('_').str[1].astype(int)
test = submission6.drop(columns=['FVC', 'Confidence']).merge(test, on='Patient')
test['Week_passed'] = test['predict_Week'] - test['base_Week']
test = test.set_index('Patient_Week')

In [None]:
# Assign every training row to a validation fold, keeping each patient's rows together.
folds = train[['Patient', TARGET]].copy()
Fold = GroupKFold(n_splits=N_FOLD)
groups = folds['Patient'].values
# The splitter yields row positions, so the fold ids are written into a positional array.
# Writing them through label-based .loc is only correct for a 0..n-1 index and goes
# through a float column holding NaN until every fold is filled.
fold_ids = np.zeros(len(folds), dtype=int)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds[TARGET], groups)):
    fold_ids[val_index] = n
folds['fold'] = fold_ids

In [None]:
#===========================================================
# model
#===========================================================
def run_single_model(clf, train_df, test_df, folds, features, target, fold_num=0):
    """Fit `clf` on every fold except `fold_num`, then predict the held-out fold and test set.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refit in place.
        train_df: Training frame containing the `features` columns.
        test_df: Test frame containing the `features` columns.
        folds: Frame row-aligned with `train_df` whose ``fold`` column holds each row's
            validation-fold number.
        features: List of column names used as model inputs.
        target: Series of target values, row-aligned with `train_df`.
        fold_num: Fold held out for validation.

    Returns:
        Tuple ``(oof, predictions)``. `oof` has one entry per training row and is zero
        except on the held-out rows. `predictions` has one entry per test row.
    """
    # Work with row positions: .iloc and the numpy `oof` buffer are positional, so index
    # labels taken from `folds` are only valid when its index is exactly 0..n-1.
    is_val = (folds['fold'] == fold_num).to_numpy()
    val_idx = np.flatnonzero(is_val)
    trn_idx = np.flatnonzero(~is_val)

    X_tr = train_df.iloc[trn_idx][features].values
    y_tr = target.iloc[trn_idx].values
    X_val = train_df.iloc[val_idx][features].values

    clf.fit(X_tr, y_tr)

    oof = np.zeros(len(train_df))
    oof[val_idx] = clf.predict(X_val)

    # Predict from a plain array, matching the array the model was fitted on.
    predictions = np.zeros(len(test_df))
    predictions += clf.predict(test_df[features].values)
    return oof, predictions


def run_kfold_model(clf, train, test, folds, features, target, n_fold=5):
    """Run `run_single_model` for folds ``0 .. n_fold-1`` and combine the results.

    Args:
        clf: Estimator with ``fit`` and ``predict``. The same object is refit for each fold.
        train: Training frame containing the `features` columns.
        test: Test frame containing the `features` columns.
        folds: Frame row-aligned with `train` carrying a ``fold`` column.
        features: List of column names used as model inputs.
        target: Series of target values, row-aligned with `train`.
        n_fold: Number of folds to iterate over.

    Returns:
        Tuple ``(oof, predictions)``. `oof` is the sum of the per-fold out-of-fold arrays
        and `predictions` is the mean of the per-fold test predictions.
    """
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))

    for fold_ in range(n_fold):
        _oof, _predictions = run_single_model(clf, train, test, folds,
                                              features, target, fold_num=fold_)
        # Each per-fold array is zero outside its own held-out rows, so summing fills
        # in the out-of-fold predictions fold by fold.
        oof += _oof
        predictions += _predictions/n_fold

    return oof, predictions

In [None]:
target = train[TARGET]
test[TARGET] = np.nan

# features
# Inputs are the non-object test columns outside drop_features, plus the two categoricals.
cat_features = ['Sex', 'SmokingStatus']
drop_features = [TARGET, 'predict_Week', 'Percent', 'base_Week']
num_features = [c for c in test.columns if c not in cat_features and test.dtypes[c] != 'object']
features = [c for c in num_features + cat_features if c not in drop_features]

if cat_features:
    # Ordinal-encode the categoricals: the encoder is fitted on train and applied to both frames.
    ce_oe = ce.OrdinalEncoder(cols=cat_features, handle_unknown='impute')
    ce_oe.fit(train)
    train, test = ce_oe.transform(train), ce_oe.transform(test)

In [None]:
# First-stage model: Ridge regression on FVC, evaluated with the patient-grouped folds.
clf = Ridge(alpha=0.1)
oof, predictions = run_kfold_model(clf, train, test, folds, features, target, n_fold=N_FOLD)

# Keep the out-of-fold and test predictions next to the data they belong to.
train['FVC_pred'] = oof
test['FVC_pred'] = predictions

In [None]:
# baseline score
# Score the out-of-fold FVC predictions using one fixed Confidence of 100 for every row.
train['Confidence'] = 100
# Vectorised clipping replaces the row-wise apply(lambda ...) calls and gives the same values.
train['sigma_clipped'] = train['Confidence'].clip(lower=70)
train['diff'] = (train['FVC'] - train['FVC_pred']).abs()
train['delta'] = train['diff'].clip(upper=1000)
train['score'] = -math.sqrt(2)*train['delta']/train['sigma_clipped'] - np.log(math.sqrt(2)*train['sigma_clipped'])
score = train['score'].mean()
print(score)

In [None]:
def loss_func(weight, row):
    """Return the negated metric for one row when `weight` is used as its Confidence.

    `weight` is raised to at least 70 and the absolute error between ``row['FVC']`` and
    ``row['FVC_pred']`` is capped at 1000 before the score is computed. The negation
    turns the score into a loss suitable for a minimiser.
    """
    sigma_clipped = max(weight, 70)
    delta = min(abs(row['FVC'] - row['FVC_pred']), 1000)
    root_two = math.sqrt(2)
    return root_two*delta/sigma_clipped + np.log(root_two*sigma_clipped)

# For every training row, search for the Confidence that minimises loss_func, starting
# from 100. The optimised value of each row is collected in `results`.
# NOTE(review): `sp.optimize` relies on the bare `import scipy as sp` exposing the
# `optimize` submodule — confirm an explicit `import scipy.optimize` is not needed.
results = []
tk0 = tqdm(train.iterrows(), total=len(train))
for _, row in tk0:
    # Bind the current row so the optimiser sees a function of the weight only.
    loss_partial = partial(loss_func, row=row)
    weight = [100]
    result = sp.optimize.minimize(loss_partial, weight, method='SLSQP')
    x = result['x']
    results.append(x[0])

In [None]:
# optimized score
# Same score as the baseline cell, now using the per-row optimised Confidence values.
train['Confidence'] = results
# Vectorised clipping replaces the row-wise apply(lambda ...) calls and gives the same values.
train['sigma_clipped'] = train['Confidence'].clip(lower=70)
train['diff'] = (train['FVC'] - train['FVC_pred']).abs()
train['delta'] = train['diff'].clip(upper=1000)
train['score'] = -math.sqrt(2)*train['delta']/train['sigma_clipped'] - np.log(math.sqrt(2)*train['sigma_clipped'])
score = train['score'].mean()
print(score)

In [None]:
# Second-stage model: predict the per-row optimised Confidence instead of FVC.
TARGET = 'Confidence'

target = train[TARGET]
test[TARGET] = np.nan

# features
# Inputs are the non-object test columns outside drop_features, plus the two categoricals.
cat_features = ['Sex', 'SmokingStatus']
drop_features = [ID, TARGET, 'predict_Week', 'base_Week', 'FVC', 'FVC_pred']
num_features = [c for c in test.columns if c not in cat_features and test.dtypes[c] != 'object']
features = [c for c in num_features + cat_features if c not in drop_features]

oof, predictions = run_kfold_model(clf, train, test, folds, features, target, n_fold=N_FOLD)

In [None]:
# Score the FVC predictions again, this time using the out-of-fold predicted Confidence.
train['Confidence'] = oof
# Vectorised clipping replaces the row-wise apply(lambda ...) calls and gives the same values.
train['sigma_clipped'] = train['Confidence'].clip(lower=70)
train['diff'] = (train['FVC'] - train['FVC_pred']).abs()
train['delta'] = train['diff'].clip(upper=1000)
train['score'] = -math.sqrt(2)*train['delta']/train['sigma_clipped'] - np.log(math.sqrt(2)*train['sigma_clipped'])
score = train['score'].mean()
print(score)

In [None]:
# Store the predicted Confidence, then turn the 'Patient_Week' index back into a column
# so it can be used as a merge key.
test['Confidence'] = predictions
test = test.reset_index()

In [None]:
# Keep the sample submission's key column and attach the Ridge predictions to it.
submission6 = (
    submission6[['Patient_Week']]
    .merge(test[['Patient_Week', 'FVC_pred', 'Confidence']], on='Patient_Week')
    .rename(columns={'FVC_pred': 'FVC'})
)

In [None]:
# Rows whose Patient_Week matches a measurement already present in test.csv get that
# measured FVC and a fixed Confidence of 0.1.
for i in range(len(otest)):
    known_key = otest.Patient[i] + '_' + str(otest.Weeks[i])
    is_known = submission6['Patient_Week'] == known_key
    submission6.loc[is_known, 'FVC'] = otest.FVC[i]
    submission6.loc[is_known, 'Confidence'] = 0.1

In [None]:
# Write the Ridge submission with values rounded to one decimal place.
submission6.to_csv('submission_Ridge.csv', index=False, float_format='%.1f')


In [None]:
# Display `subm`, a candidate submission defined outside this section of the notebook.
subm

In [None]:
# Display `submission1`, a candidate submission defined outside this section of the notebook.
submission1

In [None]:
# Display `submission2`, a candidate submission defined outside this section of the notebook.
submission2

In [None]:
# Display `submission3`, a candidate submission defined outside this section of the notebook.
submission3

In [None]:
# Display `submission4`, a candidate submission defined outside this section of the notebook.
submission4

In [None]:
# Display `submission5`, the submission built from the BayesianRidge cross-validation.
submission5

In [None]:
# Display `submission6`, the submission built from the two Ridge models.
submission6

In [None]:
# Element-wise mean of five submissions. The frames are added by index and column label.
# NOTE(review): assumes all five frames share the same row order — confirm.
submission_final = (
    subm.drop(columns=['Patient_Week'])
    + submission1.drop(columns=['Patient_Week'])
    + submission2.drop(columns=['Patient_Week'])
    + submission3.drop(columns=['Patient_Week'])
    + submission4.drop(columns=['Patient_Week'])
) / 5


In [None]:
# Put the key column, which was dropped before averaging, back onto the blended frame.
submission_final['Patient_Week'] = subm['Patient_Week']


In [None]:
# Move the last column to the front.
# NOTE(review): presumably that column is 'Patient_Week', appended just before — confirm.
last_col = submission_final.columns[-1]
cols = [last_col] + submission_final.columns[:-1].tolist()
submission_final = submission_final[cols]
submission_final.head()

In [None]:
# Remove the auxiliary 'FVC1' / 'Confidence1' columns when they are present.
# errors='ignore' keeps the cell re-runnable: a plain drop raises KeyError as soon as the
# columns are already gone, for example when this cell is executed a second time.
# NOTE(review): nothing in this section creates these columns before this cell;
# presumably they arrive through `subm` — confirm.
submission_final = submission_final.drop(columns=['FVC1', 'Confidence1'], errors='ignore')

In [None]:
# Display the blended frame before the two Ridge-based submissions are merged in.
submission_final

In [None]:
# Join the blend with submission5 and submission6 on the Patient_Week key.
# NOTE(review): assuming the blend carries 'FVC' and 'Confidence' at this point, the first
# merge suffixes them to *_x (blend) and *_y (submission5), and the rename then marks
# submission6's columns as 'FVC1' / 'Confidence1' — confirm.
submission_final = (
    submission_final
    .merge(submission5, on='Patient_Week', how='inner')
    .merge(submission6, on='Patient_Week', how='inner')
    .rename(columns={'FVC': 'FVC1', 'Confidence': 'Confidence1'})
)

In [None]:
# Equal-weight average of the three FVC columns left by the merges.
submission_final['FVC']=(submission_final['FVC_x']+submission_final['FVC_y']+submission_final['FVC1'])/3

# Same equal-weight average for the three Confidence columns.
submission_final['Confidence'] = (submission_final['Confidence1']+submission_final['Confidence_x']+submission_final['Confidence_y'])/3

In [None]:
# Display the final frame, including the intermediate merge columns.
submission_final

In [None]:
# Write only the three columns of the submission format, without the index.
submission_final[['Patient_Week','FVC','Confidence']].to_csv("submission.csv",index=False)