Main ideas

* Easily create subsets of the data to train and validate on (using `train_mask` and `valid_mask`)
* transform the target variable with `transform` and `inverse_transform`

In [None]:
import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import xgboost
import lightgbm
import catboost

from sklearn import ensemble, metrics, model_selection, linear_model, preprocessing, utils

import tensorflow as tf
from tensorflow import keras

In [None]:
def laplace_likelihood(y, p):
    """Return the mean clipped Laplace log-likelihood of `y`.

    `p` is a pair `(m, s)` holding the predicted values and the predicted
    scales. Absolute errors are capped at 1000 and scales are floored at 70
    before the per-row score is computed and averaged.
    """
    location, scale = p
    root2 = np.sqrt(2)
    abs_err = np.minimum(np.abs(y - location), 1000)
    scale = np.maximum(scale, 70)
    per_row = -root2 * abs_err / scale - np.log(root2 * scale)
    return np.mean(per_row)


def laplace_likelihood_bound(y, p):
    """Return the mean clipped Laplace log-likelihood with a per-row scale derived from the error.

    `p` holds the predicted values only. For every row the scale is set to
    `sqrt(2) * |error|` (error capped at 1000), floored at 70.
    """
    root2 = np.sqrt(2)
    abs_err = np.minimum(np.abs(y - p), 1000)
    scale = np.maximum(root2 * abs_err, 70)
    per_row = -root2 * abs_err / scale - np.log(root2 * scale)
    return np.mean(per_row)


def laplace_likelihood_avg(y, p):
    """Return the mean clipped Laplace log-likelihood using one shared scale.

    `p` holds the predicted values only. The single scale is
    `sqrt(2) * mean_absolute_error(y, p)`; the MAE is computed on the raw
    (unclipped) errors and, unlike the sibling metrics, no floor of 70 is
    applied here. Per-row absolute errors are capped at 1000.
    """
    root2 = np.sqrt(2)
    abs_err = np.minimum(np.abs(y - p), 1000)
    scale = root2 * metrics.mean_absolute_error(y, p)
    per_row = -root2 * abs_err / scale - np.log(root2 * scale)
    return np.mean(per_row)

In [None]:
def build_folds(X, y, group=None, k=5, shuffle=False, train_mask=None, valid_mask=None):
    """Build `k` cross-validation folds, optionally grouped and restricted by row masks.

    Parameters
    ----------
    X, y : array-like (pandas objects are converted to their `.values`)
    group : array-like or None
        When given, `GroupKFold` is used so a group never appears on both
        sides of a fold. When None, a shuffled `KFold` is used.
    k : int
        Number of folds.
    shuffle : bool
        Only used when `group` is given: rows are permuted before the grouped
        split. The ungrouped `KFold` always shuffles, whatever this flag says.
    train_mask, valid_mask : boolean array-like or None
        Positional row masks. Training (resp. validation) indices whose mask
        entry is False are dropped from every fold.

    Returns
    -------
    list of (train_idx, valid_idx) tuples of integer position arrays that
    index the rows of the original, unshuffled `X`.
    """
    def _values(obj):
        # Work on raw arrays so that every lookup below is positional.
        return obj.values if isinstance(obj, (pd.DataFrame, pd.Series)) else obj

    X, y, group = _values(X), _values(y), _values(group)
    train_mask, valid_mask = _values(train_mask), _values(valid_mask)

    if group is None:
        folds = list(model_selection.KFold(k, shuffle=True).split(X, y))
    elif shuffle:
        idx = utils.shuffle(np.arange(X.shape[0]))
        Xs = np.asarray(X)[idx]
        ys = None if y is None else np.asarray(y)[idx]
        groups = np.asarray(group)[idx]
        # The splitter yields positions in the shuffled arrays; translate them
        # back through `idx` so they address rows of the original data.
        folds = [
            (np.sort(idx[tr]), np.sort(idx[va]))
            for tr, va in model_selection.GroupKFold(k).split(Xs, ys, groups)
        ]
    else:
        folds = list(model_selection.GroupKFold(k).split(X, y, group))

    if train_mask is not None:
        keep = np.asarray(train_mask, dtype=bool).ravel()
        # Boolean filtering keeps the integer dtype even when nothing survives.
        folds = [(tr[keep[tr]], va) for tr, va in folds]
    if valid_mask is not None:
        keep = np.asarray(valid_mask, dtype=bool).ravel()
        folds = [(tr, va[keep[va]]) for tr, va in folds]
    return folds

In [None]:
class GroupMeanTransformer:
    """Add per-group mean features, and the deviation from them, to a DataFrame.

    For every entry of `grouper` (a column name, or a list of column names)
    and every column in `cols`, `fit` stores the group means and `transform`
    adds two columns:

    * ``<col>_mean_groupby_<group>``: the fitted mean of the row's group
    * ``delta_<col>_mean_groupby_<group>``: the row's value minus that mean

    `generated_features` lists the names of all columns that `transform` adds.
    """

    def __init__(self, grouper, cols):
        self.grouper = grouper
        self.cols = cols
        self.generated_features = []
        self.means = dict()

    def fit_transform(self, x, y=None):
        """Fit on `x` and return the transformed copy of `x`."""
        self.fit(x, y)
        return self.transform(x)

    def fit(self, x, y=None):
        """Compute and store the group means of `x`; `y` is ignored."""
        # Start from a clean state so that refitting does not duplicate
        # feature names or keep means from an earlier fit.
        self.generated_features = []
        self.means = dict()
        for gr in self.grouper:
            group_name = '_'.join(gr) if isinstance(gr, list) else gr
            for col in self.cols:
                feature = '%s_mean_groupby_%s' % (col, group_name)
                self.means[(group_name, col)] = x.groupby(gr)[col].mean().rename(feature)
                self.generated_features += [feature, 'delta_' + feature]

    def transform(self, x):
        """Return a copy of `x` with the mean and delta columns added.

        Rows whose group was not seen during `fit` get NaN in both columns.
        """
        x = x.copy()
        for (_, col), m in self.means.items():
            keys = list(m.index.names)
            # `join(..., on=keys)` is a left join that keeps one output row per
            # input row in the input order, so the values can be assigned
            # positionally whatever the index of `x` looks like.
            group_mean = x[keys].join(m, on=keys)[m.name].values
            x[m.name] = group_mean
            x['delta_' + m.name] = x[col] - group_mean
        return x

In [None]:
def _transform(x, mode: str=None, df=None):
    if mode is None: return x
    if mode.upper() == "DIFF": return x - df.FVC_base
    if mode.upper() == "DIFF_FVC": return np.where(df.n_weeks == 0, 0.0, x - df.FVC_base)
    if mode.upper() == "PERC": return x / df.FVC_base
    if mode.upper() == "PERC_FVC": return np.where(df.n_weeks == 0, 1.0, x / df.FVC_base)
    if mode.upper() == "WEEKLY": return x / np.maximum(1.0, np.abs(df.n_weeks))
    if mode.upper() == "SYMLOG_WEEKLY": return x / np.log(1 + np.maximum(1.0, np.abs(df.n_weeks)))
    if re.match("^LOG", mode.upper()): 
        gamma = re.sub("^LOG[:]*", "", mode.upper())
        gamma = float(gamma) if len(gamma) > 0 else 0.0
        return np.log(gamma + x)
    if re.match("^BOXCOX", mode.upper()):
        gamma = re.sub("^BOXCOX[:]*", "", mode.upper())
        gamma = float(gamma) if len(gamma) > 0 else 1.0
        return (x ** gamma - 1.0) / gamma
    return None


def _inverse_transform(x, mode=None, df=None):
    if mode is None: return x
    if mode.upper() == "DIFF": return x + df.FVC_base
    if mode.upper() == "DIFF_FVC": return np.where(df.n_weeks == 0, df.FVC_base, x + df.FVC_base)
    if mode.upper() == "PERC": return x * df.FVC_base
    if mode.upper() == "PERC_FVC": return np.where(df.n_weeks == 0, df.FVC_base, x * df.FVC_base)
    if mode.upper() == "WEEKLY": return x * np.maximum(1.0, np.abs(df.n_weeks))
    if mode.upper() == "SYMLOG_WEEKLY": return x * np.log(1 + np.maximum(1.0, np.abs(df.n_weeks)))
    if re.match("^LOG", mode.upper()): 
        gamma = re.sub("^LOG[:]*", "", mode.upper())
        gamma = float(gamma) if len(gamma) > 0 else 0.0
        return np.exp(x) - gamma
    if re.match("^BOXCOX", mode.upper()):
        gamma = re.sub("^BOXCOX[:]*", "", mode.upper())
        gamma = float(gamma) if len(gamma) > 0 else 1.0
        return (1.0 + x * gamma) ** (1 / gamma)
    return None
    

def transform(x, mode=None, df=None):
    """Apply one transform, or a list of transforms in order, to a copy of `x`.

    `mode` may be a single mode name or a list of mode names; each step is
    delegated to `_transform`. Any other `mode` (including None) returns the
    plain copy of `x`.
    """
    if isinstance(mode, str):
        steps = [mode]
    elif isinstance(mode, list):
        steps = mode
    else:
        steps = []

    result = x.copy()
    for step in steps:
        result = _transform(result, mode=step, df=df)
    return result


def inverse_transform(x, mode=None, df=None):
    """Undo one transform, or a list of transforms in reverse order, on a copy of `x`.

    `mode` may be a single mode name or a list of mode names; a list is
    walked from its last entry to its first and each step is delegated to
    `_inverse_transform`. Any other `mode` (including None) returns the plain
    copy of `x`.
    """
    if isinstance(mode, str):
        steps = [mode]
    elif isinstance(mode, list):
        steps = mode[::-1]
    else:
        steps = []

    result = x.copy()
    for step in steps:
        result = _inverse_transform(result, mode=step, df=df)
    return result

# Data

In [None]:
def feature_eng(df):
    """Return a copy of `df` with the engineered time and demographic features.

    Reads the `Weeks_target`, `Weeks_base`, `Sex` and `SmokingStatus` columns
    and adds, in this order: `n_weeks`, `symlog_n_weeks`, `symlog_n_weeks2`,
    `expdecay_n_weeks`, `Sex_female`, `Smoking_ex`, `Smoking_currently`.
    The input frame is not modified.
    """
    out = df.copy()

    # Signed number of weeks between the base and the target measurement.
    out['n_weeks'] = out['Weeks_target'] - out['Weeks_base']
    elapsed = out['n_weeks']
    direction, distance = np.sign(elapsed), np.abs(elapsed)

    out['symlog_n_weeks'] = direction * np.log(1 + distance)
    out['symlog_n_weeks2'] = direction * np.log(1 + distance ** 2)
    out['expdecay_n_weeks'] = np.exp(-distance)

    # Indicator columns for the categorical inputs.
    out['Sex_female'] = (out['Sex'] == 'Female').astype('float')
    smoking = out['SmokingStatus']
    out['Smoking_ex'] = (smoking == 'Ex-smoker').astype(int)
    out['Smoking_currently'] = (smoking == 'Currently smokes').astype(int)
    return out

In [None]:
# Directory that holds the input CSV files (train.csv and test.csv are read from it below).
data_folder = "../input/osic-pulmonary-fibrosis-progression"

In [None]:
# Load the training visits, discarding every (Patient, Weeks) pair that occurs more than once.
df_train = pd.read_csv(os.path.join(data_folder, "train.csv")).drop_duplicates(keep=False, subset=['Patient', 'Weeks'])
# Per-patient visit counter (in file order) and the number of visits that follow each one.
df_train['n_obs'] = df_train.groupby('Patient').Weeks.cumcount()
df_train['till_last'] = df_train.groupby('Patient').Weeks.transform('count') - 1 - df_train['n_obs']
# Self-merge on Patient: every visit ("_base") is paired with every visit of the same
# patient ("_target"). `columns=` is used because pandas 2.0 no longer accepts the
# axis as a second positional argument of `drop`.
df_train = df_train.merge(
    df_train.drop(columns=['Age', 'Sex', 'Percent', 'SmokingStatus']),
    on='Patient',
    suffixes=['_base', '_target'],
)
# Model inputs: three raw columns plus every column that `feature_eng` adds.
cols_before_fe = df_train.columns
df_train = feature_eng(df_train)
FEATURES = ['FVC_base', 'Percent', 'Age'] + [c for c in df_train.columns if c not in cols_before_fe]
df_train[FEATURES].head(2)

In [None]:
# Row filters over the (base, target) pairs. Both keep only targets that lie after the base visit.
# Training pairs: base is one of the patient's first six visits, target one of the last four.
train_mask = (
    df_train.n_weeks.gt(0)
    & df_train.n_obs_base.between(0, 5)
    & df_train.till_last_target.between(0, 3)
)
# Validation pairs: base is the patient's first visit, target one of the last three.
valid_mask = (
    df_train.n_weeks.gt(0)
    & df_train.n_obs_base.between(0, 0)
    & df_train.till_last_target.between(0, 2)
)

In [None]:
# Load the test baseline rows and rename their measurement columns to the "_base" naming.
df_test = pd.read_csv(os.path.join(data_folder, "test.csv")).rename(columns={'Weeks': 'Weeks_base', 'FVC': 'FVC_base'})
# Cross join every test row with each target week from -12 to 133 through a constant
# helper key. `columns=` is used because pandas 2.0 no longer accepts the axis as a
# second positional argument of `drop`.
df_test = (
    df_test.assign(k=0)
    .merge(pd.DataFrame({'Weeks_target': np.arange(-12, 133 + 1), 'k': 0}))
    .drop(columns='k')
)
df_test = feature_eng(df_test)
df_test.head(2)

In [None]:
# Row and patient counts for the training subset, the validation subset and the test set.
_rule = 50 * "#"
print(_rule)
for _fmt, _frame in (
    ("##  Train:  %4d rows with %3d unique patients  ##", df_train[train_mask]),
    ("##  Valid:  %4d rows with %3d unique patients  ##", df_train[valid_mask]),
    ("##  Test:   %4d rows with %3d unique patients  ##", df_test),
):
    print(_fmt % (_frame.shape[0], _frame.Patient.nunique()))
print(_rule)

In [None]:
# Mean FVC_base per Sex and per SmokingStatus, fitted on the training pairs and
# applied to both frames. Only the mean columns become model features; the
# "delta_" columns are added to the frames but left out of FEATURES.
gr = GroupMeanTransformer(grouper=['Sex', 'SmokingStatus'], cols=['FVC_base'])
df_train = gr.fit_transform(df_train)
df_test = gr.transform(df_test)
FEATURES.extend(f for f in gr.generated_features if not re.match("^delta_", f))

In [None]:
# Transform chains handed to `transform` / `inverse_transform` below:
# MODE is used for the FVC target, SIGMA_MODE for the uncertainty target.
MODE = ["PERC_FVC", "LOG"]
SIGMA_MODE = ["PERC"]

# Plain arrays for the models: feature matrix, raw target and patient ids of the
# training pairs, plus the feature matrix of the test grid.
X = df_train[FEATURES].copy().values
y = df_train['FVC_target'].copy().values
group = df_train['Patient'].values
X_test = df_test[FEATURES].copy().values

# Training

In [None]:
N_FOLDS = 10
# Patient-grouped folds, restricted to the training / validation row subsets.
folds = build_folds(X, y, group=group, k=N_FOLDS, train_mask=train_mask, valid_mask=valid_mask)
# Lookup table: row position -> number of the fold that validates on that row.
valid_folds = pd.DataFrame({
    'idx': np.concatenate([np.array(_val_idx) for _, _val_idx in folds]),
    'fold': np.concatenate([np.full(len(_val_idx), _fold_no) for _fold_no, (_, _val_idx) in enumerate(folds)]),
}).set_index('idx')

# NOTE(review): Normalizer rescales each sample (row) rather than each feature
# (column); confirm that this is the intended preprocessing.
prep = preprocessing.Normalizer()
Z, Z_test = prep.fit_transform(X), prep.transform(X_test)

# Transformed FVC target that the mean model is trained on.
mean_target = transform(y, mode=MODE, df=df_train)
plt.hist(mean_target, 40);

In [None]:
# One LassoCV model per fold for the transformed FVC target. Each model predicts
# every training-frame row and the test grid; predictions for the rows it was
# trained on are blanked so they do not enter the average.
pred_oof = np.full((N_FOLDS, X.shape[0]), np.nan)
pred_test = np.full((N_FOLDS, X_test.shape[0]), np.nan)
for i, (idxT, idxV) in enumerate(tqdm(folds)):
    model = linear_model.LassoCV().fit(Z[idxT], mean_target[idxT])
    _fold_pred = model.predict(Z)
    _fold_pred[idxT] = np.nan
    pred_oof[i] = _fold_pred
    pred_test[i] = model.predict(Z_test)
# Average over folds (ignoring the blanked entries) and map back to the FVC scale.
pred_mean = inverse_transform(np.nanmean(pred_oof, axis=0), mode=MODE, df=df_train)
test_mean = inverse_transform(np.nanmean(pred_test, axis=0), mode=MODE, df=df_test)

In [None]:
# Per-row target for the uncertainty model: sqrt(2) times the absolute error of the
# mean prediction, with the error capped at 1000 (the same construction that
# `laplace_likelihood_bound` uses for its scale).
opt_sigma = np.sqrt(2) * np.minimum(np.abs(y - pred_mean), 1000)
sigma_target = transform(opt_sigma, mode=SIGMA_MODE, df=df_train)
plt.hist(sigma_target[train_mask | valid_mask], bins=50);

In [None]:
# Same fold scheme as the mean model, now fitting a plain linear regression on the
# transformed uncertainty target. Predictions for a model's own training rows are
# blanked before averaging.
pred_oof = np.full((N_FOLDS, X.shape[0]), np.nan)
pred_test = np.full((N_FOLDS, X_test.shape[0]), np.nan)
for i, (idxT, idxV) in enumerate(tqdm(folds)):
    model = linear_model.LinearRegression().fit(Z[idxT], sigma_target[idxT])
    _fold_pred = model.predict(Z)
    _fold_pred[idxT] = np.nan
    pred_oof[i] = _fold_pred
    pred_test[i] = model.predict(Z_test)
# Average over folds (ignoring the blanked entries) and undo the sigma transform.
pred_sigma = inverse_transform(np.nanmean(pred_oof, axis=0), mode=SIGMA_MODE, df=df_train)
test_sigma = inverse_transform(np.nanmean(pred_test, axis=0), mode=SIGMA_MODE, df=df_test)

# Monitoring

In [None]:
# Metrics over all validation rows at once.
_y_val, _p_val = y[valid_mask], pred_mean[valid_mask]
_so_val, _s_val = opt_sigma[valid_mask], pred_sigma[valid_mask]
lll_overall = laplace_likelihood(_y_val, [_p_val, _s_val])
lllb_overall = laplace_likelihood_bound(_y_val, _p_val)
y_rmse_overall = metrics.mean_squared_error(_y_val, _p_val) ** 0.5
y_mae_overall = metrics.mean_absolute_error(_y_val, _p_val)
s_rmse_overall = metrics.mean_squared_error(_so_val, _s_val) ** 0.5
s_mae_overall = metrics.mean_absolute_error(_so_val, _s_val)


# The same metrics per validation fold; the merge keeps only rows listed in `valid_folds`.
_df_base = pd.DataFrame({'y': y, 'so': opt_sigma, 'p': pred_mean, 's': pred_sigma}).merge(valid_folds, left_index=True, right_index=True)
_per_fold = _df_base.groupby('fold')
lllb   = _per_fold.apply(lambda g: laplace_likelihood_bound(g['y'], g['p']))
lll    = _per_fold.apply(lambda g: laplace_likelihood(g['y'], [g['p'], g['s']]))
y_rmse = _per_fold.apply(lambda g: metrics.mean_squared_error(g['y'], g['p']) ** 0.5)
y_mae  = _per_fold.apply(lambda g: metrics.mean_absolute_error(g['y'], g['p']))
s_rmse = _per_fold.apply(lambda g: metrics.mean_squared_error(g['so'], g['s']) ** 0.5)
s_mae  = _per_fold.apply(lambda g: metrics.mean_absolute_error(g['so'], g['s']))

# Report: overall value next to the fold-wise mean and twice the fold-wise standard deviation.
print("        OVERALL  ||  FOLD-WISE")
for _fmt, _overall, _by_fold in (
    ("Bound:  %7.4f  ||  %7.4f  (+- %7.4f)", lllb_overall, lllb),
    ("Score:  %7.4f  ||  %7.4f  (+- %7.4f)", lll_overall, lll),
    ("yRMSE:  %7.2f  ||  %7.2f  (+- %7.2f)", y_rmse_overall, y_rmse),
    ("y-MAE:  %7.2f  ||  %7.2f  (+- %7.2f)", y_mae_overall, y_mae),
    ("sRMSE:  %7.2f  ||  %7.2f  (+- %7.2f)", s_rmse_overall, s_rmse),
    ("s-MAE:  %7.2f  ||  %7.2f  (+- %7.2f)", s_mae_overall, s_mae),
):
    print(_fmt % (_overall, _by_fold.mean(), 2 * _by_fold.std()))

In [None]:
# Spread of the fold-wise score (LLL) next to the fold-wise bound.
pd.DataFrame({'LLL': lll, 'Bound': lllb}).boxplot(figsize=(12, 4));

In [None]:
# Predicted vs. actual on the validation rows: FVC on the left, sigma on the right.
plt.figure(figsize=(16, 6))
plt.subplot(1, 2, 1)
plt.scatter(y[valid_mask], pred_mean[valid_mask], alpha=0.2);
# Give both axes the same range (union of the current x and y limits); the second
# call already sees the widened x limits set by the first.
plt.xlim(min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1]))
plt.ylim(min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1]))
# Dashed diagonal: where prediction equals the actual value.
plt.plot(plt.xlim(), plt.ylim(), color='tab:red', alpha=0.5, linestyle='--')
plt.subplot(1, 2, 2)
plt.scatter(opt_sigma[valid_mask], pred_sigma[valid_mask], alpha=0.2)
# Same axis equalisation and diagonal for the sigma panel.
plt.xlim(min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1]))
plt.ylim(min(plt.xlim()[0], plt.ylim()[0]), max(plt.xlim()[1], plt.ylim()[1]))
plt.plot(plt.xlim(), plt.ylim(), color='tab:red', alpha=0.5, linestyle='--');

In [None]:
# Validation predictions against the target week: mean on the left, sigma on the right.
plt.figure(figsize=(16, 6))
_weeks = df_train.Weeks_target[valid_mask]
for _pos, _pred in enumerate((pred_mean, pred_sigma), start=1):
    plt.subplot(1, 2, _pos)
    plt.scatter(_weeks, _pred[valid_mask], alpha=0.5)

In [None]:
# Trajectories for the first 12 validation patients, using their first visit as base:
# predicted mean (blue crosses) with +-1 and +-2 sigma bands, and measured FVC (red dots).
plt.figure(figsize=(16, 8))
for i, pid in enumerate(df_train[valid_mask].Patient.unique()[:12]):
    plt.subplot(3, 4, i + 1)
    idx = (df_train.Patient == pid) & (df_train.n_obs_base == 0)
    _weeks, _mu, _sd = df_train[idx].Weeks_target, pred_mean[idx], pred_sigma[idx]
    for _width in (1, 2):
        plt.fill_between(_weeks, _mu - _width * _sd, _mu + _width * _sd, color='tab:blue', alpha=0.1)
    plt.plot(_weeks, _mu, marker='x', color='tab:blue')
    plt.plot(_weeks, df_train[idx].FVC_target, marker='o', color='tab:red')

In [None]:
# One line per validation patient: predicted FVC relative to the first-visit FVC,
# plotted against the number of weeks since that visit.
_first_visit = df_train.n_obs_base == 0
for pid in df_train[valid_mask].Patient.unique():
    idx = (df_train.Patient == pid) & _first_visit
    _rows = df_train[idx]
    plt.plot(_rows.n_weeks, pred_mean[idx] / _rows.FVC_base, color='tab:blue', alpha=0.2)

# Submission

In [None]:
# Submission frame: one row per (patient, target week) with the predicted FVC and
# its confidence, keyed by "<Patient>_<Weeks_target>".
submission = (
    df_test[['Patient', 'Weeks_target']]
    .assign(
        Patient_Week=lambda d: d['Patient'] + "_" + d['Weeks_target'].astype('str'),
        FVC=test_mean,
        Confidence=test_sigma,
    )
    .sort_values(['Weeks_target', 'Patient'])
    [['Patient_Week', 'FVC', 'Confidence']]
)
submission.head()

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv("submission.csv", index=False)

In [None]:
# Test-set trajectories: predicted mean with +-1 and +-2 sigma bands (blue) and the
# single known baseline measurement (red dot).
plt.figure(figsize=(16, 8))
# Take the patients from df_test itself: `valid_mask` is a row mask of df_train and
# must not be used to filter df_test. The figure is a fixed 2x3 grid, so at most
# six patients are drawn.
for i, pid in enumerate(df_test.Patient.unique()[:6]):
    plt.subplot(2, 3, i + 1)
    idx = (df_test.Patient == pid)
    for n_sigma in (1, 2):
        plt.fill_between(
            df_test[idx].Weeks_target,
            test_mean[idx] - n_sigma * test_sigma[idx],
            test_mean[idx] + n_sigma * test_sigma[idx],
            color='tab:blue', alpha=0.1
        )
    plt.plot(df_test[idx].Weeks_target, test_mean[idx], color='tab:blue')
    plt.plot(df_test[idx].Weeks_base, df_test[idx].FVC_base, marker='o', color='tab:red')