# Linear Decay and Quantile Regression Inference Notebook


This version started Sep 23, 2020. Submission: Pending
- 2 models, one linear decay, and one quantile regression, trained on 5 different strategized Kfolds each.
- Use tabular features only. 
- Linear Decay Model predicts the slope of the fvc vs weeks line, then makes inference on fvc values
    - The slope is also used to build a heuristic confidence band (a.k.a. sigma) around the fvc predictions
    - A customized `sigma` function applies to predictions from each fold. 
    - 8 features used in all 5 folds.
    
- Quantile Regression Model predicts FVC directly. 
    - 7 or 8 features for each fold, depending on performance during development. 
    
    
    
- Models are ready, no need to pre-build. 
    - Quantile regression models need to be loaded with `compile = False` and compiled separately

reference: colab notebooks `OSIC_linear_decay_and_quantile_02.ipynb` and `OSIC_quantile_02Sep.ipynb`

To DO:

- submit predictions of Linear Decay Model trained with fold 0 [done]
- submit predictions of Linear Decay Model trained with fold 1 [done]
- submit predictions of Linear Decay Model trained with fold 2 [done]
- submit predictions of Linear Decay Model trained with fold 3 [done]
- submit predictions of Linear Decay Model trained with fold 4 [done]

Sep 24
- submit predictions of Quantile Regression Model trained with fold 0 [done]
- submit predictions of Quantile Regression Model trained with fold 1 [done]
- submit predictions of Quantile Regression Model trained with fold 2 [done]
- submit predictions of Quantile Regression Model trained with fold 3 [done]
- submit predictions of Quantile Regression Model trained with fold 4 [done]

Sep 25
- submit simple mean of Quantile Regression Model, all folds [done]
- submit simple mean of Linear Decay Model, all folds [done]
- submit simple mean of Linear Decay Model (all) + Quantile Reg (all), all folds [this run]

In [None]:
import os, sys
import numpy as np
import pandas as pd

import tensorflow as tf

from IPython.display import display
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

# Keep the TensorFlow version string around and print it for the run log.
tf_version = tf.__version__
print("\nTensorflow version " + tf_version)

## Initial Data Preprocessing

In [None]:
# Directory the train / test / sample_submission CSV files are read from.
input_path = '../input/osic-pulmonary-fibrosis-progression'
# Directory the saved fold models and the per-fold inference CSV tables are read from.
pretrained_path = '../input/osic-linear-decay-and-quant-reg-base/pretrained_weights'

In [None]:
def height_proxy(fvc_e, age, sex):
    """Return a height proxy computed from an expected FVC value.

    The expected FVC is divided by an age-dependent factor whose
    coefficients depend on ``sex``.
    Coefficient source: https://en.wikipedia.org/wiki/Vital_capacity

    Args:
        fvc_e: Expected FVC value (the numerator).
        age: Age used in the divisor.
        sex: ``'Female'`` selects the first coefficient pair; every other
            value falls through to the second pair.

    Returns:
        ``fvc_e`` divided by the selected age-dependent factor.
    """
    divisor = (21.78 - 0.101 * age) if sex == 'Female' else (27.63 - 0.112 * age)
    return fvc_e / divisor

def process_init_week(df, train_df = False):
    """Attach per-patient baseline columns and a baseline-relative week.

    The baseline rows are those where ``Weeks == min_week``. From them the
    function derives, per patient, ``FVC_init_avg`` (mean FVC truncated to
    int), ``Percent_init`` (mean Percent) and ``Height_proxy``, and merges
    these back onto every row of ``df``.

    Args:
        df: Frame with ``Patient``, ``Weeks``, ``FVC``, ``Percent``, ``Age``
            and ``Sex`` columns. When ``train_df`` is False it must already
            carry a ``min_week`` column.
        train_df: When True, ``min_week`` is computed as each patient's
            smallest ``Weeks`` value and written onto ``df`` itself.

    Returns:
        A new frame: ``df`` left-merged with the baseline columns, plus
        ``init_week = Weeks - min_week``.
    """
    if train_df:
        # Written onto the caller's frame (in place), then used below.
        df['min_week'] = df.groupby('Patient')['Weeks'].transform('min')

    # Rows measured at each patient's baseline week.
    first_visit = df.loc[df['Weeks'] == df['min_week'],
                         ['Patient', 'FVC', 'Percent', 'Age', 'Sex']]
    by_patient = first_visit.groupby('Patient')
    # A patient can have several baseline rows, hence the per-patient means.
    # (Alternative noted by the author: use the 'first' FVC instead of the mean.)
    base = first_visit.assign(
        FVC_init_avg=by_patient['FVC'].transform('mean').astype(int),
        Percent_init=by_patient['Percent'].transform('mean'),
    )
    base = base[['Patient', 'FVC_init_avg', 'Percent_init', 'Age', 'Sex']].drop_duplicates()

    # Percent is treated as FVC relative to the expected FVC, in percent.
    base = base.assign(FVC_expected=base['FVC_init_avg'] / (base['Percent_init']/100))
    base['Height_proxy'] = base.apply(
        lambda row: height_proxy(row.FVC_expected, row.Age, row.Sex), axis=1)

    baseline_cols = base[['Patient', 'Height_proxy', 'FVC_init_avg', 'Percent_init']]
    merged = df.merge(baseline_cols, on='Patient', how='left')
    merged['init_week'] = merged['Weeks'] - merged['min_week']
    return merged

In [None]:
# Load the training data and attach the per-patient baseline columns.
train = pd.read_csv(input_path + '/train.csv')
train = process_init_week(train, train_df = True)
# Duplicates are removed only after the baseline means were computed above;
# from here on only the first row of each (Patient, Weeks) pair is kept.
train.drop_duplicates(keep='first', inplace=True, subset=['Patient','Weeks'])
train.head(3)

In [None]:
# Load the submission template and the test set.
sub = pd.read_csv(input_path + '/sample_submission.csv') 
test = pd.read_csv(input_path + '/test.csv')

# Patient_Week is split on '_': the first piece is the patient id and the
# last piece is the week number to predict.
sub['Patient'] = sub['Patient_Week'].apply(lambda x:x.split('_')[0])
sub['Weeks'] = sub['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
sub =  sub[['Patient','Weeks','Confidence','Patient_Week']]

# The week recorded in test.csv is renamed to 'min_week' so that
# process_init_week (called with train_df=False) uses it as the baseline week.
test = test.rename(columns={'Weeks': 'min_week'})
sub = sub.merge(test, on='Patient')

sub = process_init_week(sub, train_df = False)
sub.head(3)

In [None]:
def scale_fn(var_name, df=None):
    """Build a min-max scaler for one column.

    Args:
        var_name: Name of the column whose minimum and maximum define the
            scaling range.
        df: Optional DataFrame to read the column from. When omitted, the
            module-level ``train`` frame is used (the original behaviour).

    Returns:
        A function mapping ``x`` to ``(x - min) / (max - min)``. Values
        outside the column's range map outside ``[0, 1]``.
    """
    col = (train if df is None else df)[var_name]
    # Compute the bounds once. The returned function is applied element by
    # element via Series.map, so recomputing min/max inside it would rescan
    # the whole column for every single value.
    lower = col.min()
    span = col.max() - lower
    return lambda x: (x - lower)/span

# Min-max scalers, one per training column.
scale_age = scale_fn('Age')
scale_height = scale_fn('Height_proxy')
scale_percent = scale_fn('Percent')
scale_fvc = scale_fn('FVC_init_avg')


def scale_week(x):
    """Scale a week number with hand-set bounds.

    Weeks are scaled manually: weeks on test data go from -12 to 133.
    """
    return (x - (-12))/(133-(-12))

def transform_features(df):
    """Return a copy of ``df`` with encoded and scaled model features added.

    Added columns: ``sex_code``, ``ex_smoker``, ``never_smoked``,
    ``current_smoker`` (0/1 indicators), ``has_smoked`` (ex + current), and
    the scaled ``age``, ``height``, ``percent``, ``percent_init``, ``week``
    and ``fvc_init``.

    Args:
        df: Frame with ``Sex``, ``SmokingStatus``, ``Age``, ``Height_proxy``,
            ``Percent``, ``Percent_init``, ``Weeks`` and ``FVC_init_avg``.

    Returns:
        A new frame; the input frame is not modified.
    """
    smoking = df['SmokingStatus']
    out = df.assign(
        sex_code=np.where(df['Sex'] == 'Female', 1, 0),
        ex_smoker=np.where(smoking == 'Ex-smoker', 1, 0),
        never_smoked=np.where(smoking == 'Never smoked', 1, 0),
        current_smoker=np.where(smoking == 'Currently smokes', 1, 0),
    )
    out['has_smoked'] = out['ex_smoker'] + out['current_smoker']

    # (new column, source column, scaler), applied in this order.
    scaled_columns = (
        ('age', 'Age', scale_age),
        ('height', 'Height_proxy', scale_height),
        ('percent', 'Percent', scale_percent),
        # Percent_init is scaled the same as Percent.
        ('percent_init', 'Percent_init', scale_percent),
        # Original week number; use init_week for validation analysis.
        ('week', 'Weeks', scale_week),
        # 'FVC_init_avg' could be swapped for 'FVC_init_first'
        # (see data exploration notes).
        ('fvc_init', 'FVC_init_avg', scale_fvc),
    )
    for target, source, scaler in scaled_columns:
        out[target] = out[source].map(scaler)
    return out

In [None]:
# Add the encoded and scaled model features to the training frame.
train = transform_features(train)
# The earlier drop_duplicates may have left gaps in the index; renumber it.
train.reset_index(inplace=True, drop = True)
train.head(3)

In [None]:
# Same feature encoding/scaling for the submission frame (scalers come from train).
sub = transform_features(sub)
sub.head(3)

## Data for Linear Decay Model

- Gathers the features using only the initial values, that is, the values from the week in which fvc was first measured, i.e. `init_week == 0`.
- 1 row per patient

In [None]:
# Input columns of the linear decay model, in the order they are fed to it.
# NOTE(review): the notebook header mentions 8 features for this model, but
# this list holds 7 -- confirm which one is current.
linear_decay_features = ['age', 'sex_code', 'has_smoked', 'current_smoker', 'height', 'percent_init', 'fvc_init']

def get_patient_tab(df): # df is either train or sub
    """Return the baseline-week feature table, indexed by patient.

    Keeps only the rows where ``init_week == 0`` and only the
    ``linear_decay_features`` columns.

    Args:
        df: Frame with ``Patient``, ``init_week`` and the feature columns.

    Returns:
        A new frame indexed by ``Patient`` with one column per feature.
    """
    at_baseline = df['init_week'] == 0
    wanted = ['Patient'] + linear_decay_features
    return df.loc[at_baseline, wanted].set_index('Patient')

# Baseline feature tables (rows with init_week == 0) for train and submission.
patients_tab_train = get_patient_tab(train)
patients_tab_test = get_patient_tab(sub)
print(patients_tab_train.shape)
display(patients_tab_train.head(3))
patients_tab_test

# Inference

In [None]:
# Start a dataframe to collect all predictions from both models.
# Copied so that adding prediction columns does not touch `sub`.
PREDICTIONS = sub[['Patient', 'Weeks', 'Patient_Week']].copy()
PREDICTIONS.head(5)

## Inference Linear Decay

In [None]:
# Per-fold inference table for the linear decay models; the fold loop below
# reads its `prefix` and `alt_sigma_param` columns.
LD_inference = pd.read_csv(pretrained_path + '/inference_linear_decay_2020Sep19.csv')
LD_inference

In [None]:
# Baseline features of the submission patients, with Patient back as a column.
LD_test = patients_tab_test.reset_index()
# Columns of `sub` that pred_test needs to build the per-week predictions.
pred_cols = ['Patient', 'Weeks', 'Patient_Week', 'FVC_init_avg', 'init_week']
# Columns pred_test returns.
return_cols = ['Patient', 'Weeks', 'Patient_Week', 'FVC_hat', 'sigma']

def get_sigma_function(s_intercept, s_multiplier, s_power):
    """Build a sigma function from three parameters.

    Args:
        s_intercept: Constant term of sigma.
        s_multiplier: Factor applied to the slope/week term.
        s_power: Exponent applied to the absolute week distance.

    Returns:
        A function ``alt_sigma(coeff, init_week)`` computing
        ``s_intercept + s_multiplier * |coeff| * |init_week| ** s_power``.
    """
    def alt_sigma(coeff, init_week):
        # Only magnitudes matter: the sign of the slope and the direction of
        # the week offset are both discarded.
        return s_intercept + s_multiplier * abs(coeff) * (abs(init_week) ** s_power)

    return alt_sigma

def pred_test(model, sigma_fn):
    """Predict FVC and sigma for every submission row with one fold model.

    The model predicts one slope per patient from the baseline features in
    ``LD_test``. FVC is then extrapolated linearly from ``FVC_init_avg`` over
    ``init_week``, and ``sigma_fn`` is evaluated per row.

    Args:
        model: Object whose ``predict(X, batch_size=...)`` returns one value
            per row of ``X``.
        sigma_fn: Callable ``(coeff, init_week) -> sigma``.

    Returns:
        A frame with the ``return_cols`` columns, one row per row of ``sub``.
    """
    # One predicted slope per patient.
    slopes = LD_test[['Patient']].copy()
    slopes['coeff_pred'] = model.predict(LD_test[linear_decay_features].copy(), batch_size = 32)

    # Broadcast each patient's slope onto all of that patient's weeks.
    frame = sub[pred_cols].merge(slopes, how='left', on='Patient')

    frame['FVC_hat'] = frame['FVC_init_avg'] + (frame['coeff_pred'] * frame['init_week'])
    frame['sigma'] = frame.apply(lambda row: sigma_fn(row.coeff_pred, row.init_week), axis = 1)
    return frame[return_cols]

In [None]:
# Run the linear decay model of each of the 5 folds and store its FVC estimate
# and sigma as separate columns of PREDICTIONS.
for fold_num in range(5):
    # Row `fold_num` of the inference table names the model file and holds
    # the sigma parameters for that fold.
    prefix = LD_inference.loc[fold_num].prefix
    fname = '{}/{}_weights.h5'.format(pretrained_path, prefix)
    # NOTE(review): eval() executes whatever text is stored in this CSV cell.
    # If the cell only ever holds a literal tuple of numbers,
    # ast.literal_eval would parse it without executing code -- confirm the
    # cell format before switching.
    s_intercept, s_multiplier, s_power = eval(LD_inference.loc[fold_num].alt_sigma_param)
    f_sigma = get_sigma_function(s_intercept, s_multiplier, s_power)

    model = tf.keras.models.load_model(fname)
    P = pred_test(model, sigma_fn = f_sigma)
    # Column assignment aligns on the row index.
    # NOTE(review): assumes P and PREDICTIONS share the same index -- confirm.
    PREDICTIONS['FVC_LD{}'.format(fold_num)] = P['FVC_hat']
    PREDICTIONS['Confidence_LD{}'.format(fold_num)] = P['sigma']
    
# Remove the loop temporaries from the notebook namespace.
del P, fold_num, fname, model

In [None]:
# Display the predictions collected so far (linear decay columns).
PREDICTIONS

## Inference Quantile Regression

In [None]:
# Per-fold inference table for the quantile regression models; the fold loop
# below reads its `prefix` and `num_features` columns.
QR_inference = pd.read_csv(pretrained_path + '/inference_quant_reg_2020Sep23.csv')
QR_inference

In [None]:
# Input columns for the quantile regression models. The 7-feature variant is
# the 8-feature list without 'height'; which one a fold uses is read from the
# inference table.
qr_features8 = ['fvc_init','week', 'sex_code', 'age', 'height', 'has_smoked', 'current_smoker', 'percent_init']
qr_features7 = ['fvc_init','week', 'sex_code', 'age', 'has_smoked', 'current_smoker', 'percent_init']

In [None]:
# Run the quantile regression model of each of the 5 folds and store its FVC
# estimate and confidence as separate columns of PREDICTIONS.
for fold_num in range(5):
    prefix = QR_inference.loc[fold_num].prefix
    fname = '{}/{}_weights.h5'.format(pretrained_path, prefix)
    # Loaded uncompiled, then compiled separately with a stock loss.
    # NOTE(review): presumably the training-time loss is custom and cannot be
    # deserialized here -- confirm against the training notebook.
    model = tf.keras.models.load_model(fname, compile = False)
    model.compile(loss='mae', optimizer='adam', metrics=['mae'])

    # Each fold was trained with either 7 or 8 input features.
    num_features = QR_inference.loc[fold_num].num_features
    if num_features == 7: features = qr_features7
    else: features = qr_features8

    X = sub[features].copy()
    preds = model.predict(X)

    # Output column 1 is used as the FVC estimate; the spread between output
    # columns 2 and 0 is used as the confidence.
    # NOTE(review): presumably the three outputs are lower / middle / upper
    # quantiles -- confirm against the training notebook.
    PREDICTIONS['FVC_QR{}'.format(fold_num)] = preds[:,1]
    PREDICTIONS['Confidence_QR{}'.format(fold_num)] = preds[:,2] - preds[:,0]

# Remove the loop temporaries from the notebook namespace.
del X, fold_num, fname, model, prefix, num_features, features, preds

## Get FVC combinations (mean, weighted average, etc)

In [None]:
# Collect the per-fold columns of each model by name fragment. The mean
# columns created below do not contain these fragments, so re-running this
# cell selects the same per-fold columns.
QR_FVC_cols = [col for col in PREDICTIONS.columns if 'FVC_QR' in col]
LD_FVC_cols = [col for col in PREDICTIONS.columns if 'FVC_LD' in col]
QR_Conf_cols = [col for col in PREDICTIONS.columns if 'Confidence_QR' in col]
LD_Conf_cols = [col for col in PREDICTIONS.columns if 'Confidence_LD' in col]

# Row-wise simple mean over the folds, per model.
PREDICTIONS['QR_FVC_mean'] = PREDICTIONS[QR_FVC_cols].mean(axis = 1)
PREDICTIONS['QR_Conf_mean'] = PREDICTIONS[QR_Conf_cols].mean(axis = 1)
PREDICTIONS['LD_FVC_mean'] = PREDICTIONS[LD_FVC_cols].mean(axis = 1)
PREDICTIONS['LD_Conf_mean'] = PREDICTIONS[LD_Conf_cols].mean(axis = 1)

# mean of all LD + QR
# Mean of the two per-model means, so each model carries equal weight.
PREDICTIONS['QR_LD_FVC_mean'] = PREDICTIONS[['QR_FVC_mean','LD_FVC_mean']].mean(axis = 1)
PREDICTIONS['QR_LD_Conf_mean'] = PREDICTIONS[['QR_Conf_mean','LD_Conf_mean']].mean(axis = 1)

In [None]:
# Display all per-fold predictions together with the combined means.
PREDICTIONS

## Readying submission

In [None]:
# Submit the combined (linear decay + quantile regression) means, renamed to
# the column names used in the submission file.
to_submit = PREDICTIONS[['Patient_Week','QR_LD_FVC_mean', 'QR_LD_Conf_mean']]
to_submit.columns = ['Patient_Week','FVC','Confidence']
to_submit

In [None]:
# Summary statistics of the submitted columns, one row per column.
to_submit.describe().T

In [None]:
# Write the submission file without the DataFrame index column.
to_submit.to_csv('submission.csv', index=False)

In [None]:
# Shell escape: print the first 3 lines of the written file as a quick check.
!head -3 submission.csv