In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import GradientBoostingRegressor

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Notebook-wide settings.
VERBOSE = True  # print the fold index and draw the feature-importance plot per fold
SEED = 2020     # random_state for the fold splitter and the gradient-boosting models
FOLDS = 5       # number of cross-validation folds
ALPHA = 0.8     # upper quantile to fit; the lower quantile is 1 - ALPHA

In [None]:
def metric(preds, confidence, targets):
    """Per-sample Laplace log-likelihood with clipped error and confidence.

    Parameters
    ----------
    preds : array-like or scalar
        Predicted values.
    confidence : array-like or scalar
        Predicted uncertainty (sigma). Values below 70 are treated as 70.
    targets : array-like or scalar
        Ground-truth values.

    Returns
    -------
    numpy.ndarray
        ``-sqrt(2) * delta / sigma - log(sqrt(2) * sigma)`` element-wise, where
        ``delta = min(|preds - targets|, 1000)`` and ``sigma = max(confidence, 70)``.
        Higher is better.

    Notes
    -----
    The inputs are never modified; clipping is applied to fresh arrays so the
    caller's ``confidence`` keeps its original values.
    """
    sigma = np.maximum(np.asarray(confidence, dtype=float), 70)
    error = np.abs(np.asarray(preds, dtype=float) - np.asarray(targets, dtype=float))
    delta = np.minimum(error, 1000)
    return -np.sqrt(2) * delta / sigma - np.log(np.sqrt(2) * sigma)

In [None]:
# Raw competition training table; the cells below read its Patient, Weeks, FVC,
# Percent, Age, Sex and SmokingStatus columns.
train_df = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')

## Creating Folds

In [None]:
# One row per patient with the static attributes used for stratification.
# A stable sort on Patient keeps patients in sorted-ID order with their rows in
# file order, so the factorize codes and the seeded split below are reproducible.
patient_cols = ['Patient', 'Age', 'Sex', 'SmokingStatus']
patient_df = (
    train_df.sort_values('Patient', kind='mergesort')[patient_cols]
    .drop_duplicates()
    .reset_index(drop=True)
)
patient_df['Sex'] = patient_df['Sex'].factorize()[0]
patient_df['SmokingStatus'] = patient_df['SmokingStatus'].factorize()[0]

# Stratification label: every (Sex, SmokingStatus) combination is its own class.
patient_df['SS'] = (
    patient_df['Sex'].astype(str) + '-' + patient_df['SmokingStatus'].astype(str)
).astype('category')

# Assign each patient to exactly one fold. Folds are collected in a plain array
# and attached once: writing through `patient_df['fold'].iloc[...]` is chained
# assignment, which does not update the frame under pandas Copy-on-Write.
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
fold_of_patient = np.zeros(len(patient_df), dtype='int64')
splits = kf.split(patient_df[['Age', 'Sex', 'SmokingStatus']], patient_df['SS'])
for fold, (train_index, test_index) in enumerate(splits):
    fold_of_patient[test_index] = fold
patient_df['fold'] = fold_of_patient

## Creating Training Data

In [None]:
train = train_df.merge(patient_df[['Patient', 'fold']], on='Patient')

# Winsorize FVC: clip to mean +/- 2 standard deviations.
all_fvc = train.FVC.mean()
train.FVC = train.FVC.clip(all_fvc - 2 * train.FVC.std(), all_fvc + 2 * train.FVC.std())

# Winsorize Age the same way.
all_age = train.Age.mean()
train.Age = train.Age.clip(all_age - 2 * train.Age.std(), all_age + 2 * train.Age.std())

# Build every (base visit -> predicted visit) pair within a patient.
# The base visit supplies the base_* features and the static columns; the
# predicted visit supplies predict_Week and the FVC target.
rename_cols = {'Weeks': 'base_Week', 'FVC': 'base_FVC', 'Percent': 'base_Percent', 'Age': 'base_Age'}
drop_cols = ['Age', 'Sex', 'SmokingStatus', 'Percent', 'fold']

# Frames are collected in a list and concatenated once; concatenating inside
# the loops re-copies the accumulated rows on every iteration.
pair_frames = []
for patient_id, patient in train.groupby('Patient'):
    # Loop-invariant for this patient: the rows to be predicted.
    predicted = patient.drop(columns=drop_cols).rename(columns={'Weeks': 'predict_Week'})

    for week, base_rows in patient.groupby('Weeks'):
        pairs = predicted.merge(base_rows.rename(columns=rename_cols), on='Patient')
        pairs['Week_passed'] = pairs['predict_Week'] - pairs['base_Week']

        # Flag the last three rows of each base-visit group as validation rows.
        # The write has to go through .iloc on the frame itself:
        # `pairs.tail(3)['val'] = 1` assigns to a temporary and leaves `val` at 0.
        pairs['val'] = 0
        pairs.iloc[-3:, pairs.columns.get_loc('val')] = 1

        pair_frames.append(pairs)

output = pd.concat(pair_frames)

# A visit paired with itself carries no information about progression.
train = output[output['Week_passed'] != 0].reset_index(drop=True)

# Integer-encode the categoricals; codes follow order of first appearance.
train['Sex'] = train['Sex'].factorize()[0]
train['SmokingStatus'] = train['SmokingStatus'].factorize()[0]

## Creating Test Data

In [None]:
# The single row per test patient is that patient's base visit.
test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')\
        .rename(columns={'Weeks': 'base_Week', 'FVC': 'base_FVC', 'Percent': 'base_Percent', 'Age': 'base_Age'})
submission = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

# Patient_Week has the form "<Patient>_<week>"; split it into its two parts.
patient_week_parts = submission['Patient_Week'].str.split('_')
submission['Patient'] = patient_week_parts.str[0]
submission['predict_Week'] = patient_week_parts.str[1].astype(int)

# One row per requested (patient, week), carrying the patient's base-visit features.
test = submission.drop(columns=['FVC', 'Confidence']).merge(test, on='Patient')
test['Week_passed'] = test['predict_Week'] - test['base_Week']

# Encode the categoricals with the SAME codes the training frame uses.
# factorize() numbers values by order of first appearance, so running it
# independently on test can assign different integers to the same category.
# The training frame is ordered by Patient, so the category order is rebuilt
# from train_df in that order.
# NOTE(review): assumes every training patient has at least two distinct visit
# weeks, so no patient drops out of the training frame - confirm.
# Categories never seen in training are encoded as -1.
train_order = train_df.sort_values('Patient', kind='mergesort')
for col in ('Sex', 'SmokingStatus'):
    train_categories = train_order[col].dropna().unique()
    test[col] = pd.Categorical(test[col], categories=train_categories).codes.astype('int64')

## Training and Predicting Using GradientBoostingRegressor with Quantile Loss

In [None]:
%%time
# Feature and Target columns that are used in Test as well
x_cols = ['base_Age', 'Sex', 'SmokingStatus', 'base_Week', 'base_FVC', 'base_Percent', 'Week_passed']
y_cols = ['FVC']

fvc_up_prediction = np.zeros(len(train))
fvc_md_prediction = np.zeros(len(train))
fvc_lw_prediction = np.zeros(len(train))

tst_up = []
tst_md = []
tst_lw = []

# Training Cycle
for fold in range(FOLDS):
    if VERBOSE: print("Fold: ", fold)
    # Currently using only testing last 3 weeks in OOF
    # Base FVC has the highest feature importance
    # therefore not going to include earlier weeks in same fold
    # as this seems to introduce a lot of leak. Makes sense
    # since there's only so much the lungs can decrease by.
    
    #trn = train[(train['fold']!=fold)|(train['val']==0)]
    val = train[(train['fold']==fold)&(train['val']==1)]

    trn = train[(train['fold']!=fold)]
    #val = train[(train['fold']==fold)]
    
    trn_y = trn[y_cols]
    trn_x = trn[x_cols]

    val_y = val[y_cols]
    val_x = val[x_cols]
    
    # For Test predictions
    tst_x = test[x_cols]

    fvc_model = GradientBoostingRegressor(loss='quantile', 
                                          alpha=ALPHA,
                                          n_estimators=250, 
                                          max_depth=3,
                                          learning_rate=.05, 
                                          min_samples_leaf=9,
                                          min_samples_split=9,
                                          subsample=0.5,
                                          random_state=SEED)
    
    # Fit on the upper bound
    fvc_model.fit(trn_x, trn_y)
    trn_upper = fvc_model.predict(trn_x)
    fvc_upper = fvc_model.predict(val_x)
    tst_upper = fvc_model.predict(tst_x)

    # Fit on the lower bound
    fvc_model.set_params(alpha=1.0 - ALPHA)
    fvc_model.fit(trn_x, trn_y)
    trn_lower = fvc_model.predict(trn_x)
    fvc_lower = fvc_model.predict(val_x)
    tst_lower = fvc_model.predict(tst_x)

    # Get the median
    fvc_model.set_params(loss='ls')
    fvc_model.fit(trn_x, trn_y)
    trn_pred = fvc_model.predict(trn_x)
    fvc_pred = fvc_model.predict(val_x)
    tst_pred = fvc_model.predict(tst_x)

    # Set the OOF predictions
    fvc_up_prediction[val.index] = fvc_upper
    fvc_md_prediction[val.index] = fvc_pred
    fvc_lw_prediction[val.index] = fvc_lower
    
    # Get the Fold prediction score
    print("Train Score: ", np.mean(metric(trn_pred, trn_upper - trn_lower, trn_y.FVC.tolist())))
    print("Fold Score: ", np.mean(metric(fvc_pred, fvc_upper - fvc_lower, val_y.FVC.tolist())))
    print()
    
    # Save the Test predictions
    tst_up.append(tst_upper)
    tst_md.append(tst_pred)
    tst_lw.append(tst_lower)
    
    if VERBOSE:
        # Plot Feature Importance
        feature_importance = fvc_model.feature_importances_
        sorted_idx = np.argsort(feature_importance)
        pos = np.arange(sorted_idx.shape[0]) + .5
        fig = plt.figure(figsize=(12, 6))
        plt.subplot(1, 2, 1)
        plt.barh(pos, feature_importance[sorted_idx], align='center')
        plt.yticks(pos, np.array(x_cols)[sorted_idx])
        plt.title(f'Fold {fold}: Feature Importance')

    
# Get the mean over the Folds
tst_up_predictions = np.mean(tst_up, axis=0)
tst_md_predictions = np.mean(tst_md, axis=0)
tst_lw_predictions = np.mean(tst_lw, axis=0)

# OOF Score
print("=" * 40)

val_idx = train[train['val']==1].index
print("OOF Score: ", np.mean(metric(fvc_md_prediction[val_idx], fvc_up_prediction[val_idx] - fvc_lw_prediction[val_idx], train.iloc[val_idx]['FVC'].tolist())))

## Submission

In [None]:
# Submission frame: one row per Patient_Week with the fold-averaged point
# estimate as FVC and the width of the fold-averaged quantile band as Confidence.
sub = test[['Patient_Week']].assign(
    FVC=tst_md_predictions,
    Confidence=tst_up_predictions - tst_lw_predictions,
)

In [None]:
# Show the submission frame as the cell output for a visual sanity check.
sub

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv', index=False)