In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import glob

from tqdm import tqdm

from sklearn.model_selection import (
    cross_val_score, GroupKFold, GridSearchCV, 
    cross_val_predict, RandomizedSearchCV)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.base import RegressorMixin, BaseEstimator
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, OrdinalEncoder

plt.style.use("dark_background")
%matplotlib inline

In [None]:
# Root directory of the competition data (path as mounted on Kaggle).
main_dir = "../input/osic-pulmonary-fibrosis-progression"

# IPython shell escape: list the contents of the data directory.
!ls {main_dir}

In [None]:
# Paths of every file two levels below train/ and test/ (only their counts are used here).
train_files = tf.io.gfile.glob(main_dir+"/train/*/*")
test_files = tf.io.gfile.glob(main_dir+"/test/*/*")

# Tabular data: submission template plus the train / test tables.
sample_sub = pd.read_csv(main_dir + "/sample_submission.csv")
train = pd.read_csv(main_dir + "/train.csv")
test = pd.read_csv(main_dir + "/test.csv")

print ("Number of train patients: {}\nNumber of test patients: {:4}"
       .format(train.Patient.nunique(), test.Patient.nunique()))

# NOTE: these are counts of globbed files, one per matched path.
print ("\nTotal number of Train patient records: {}\nTotal number of Test patient records: {:6}"
       .format(len(train_files), len(test_files)))

# last expression of the cell: shapes of the three tables
train.shape, test.shape, sample_sub.shape

In [None]:
def laplace_log_likelihood(y_true, y_pred, sigma=70):
    """Mean Laplace log-likelihood of predictions under a confidence `sigma`.

    `sigma` is floored at 70 and the absolute error is capped at 1000 before the
    per-sample score ``-sqrt(2) * |error| / sigma - log(sqrt(2) * sigma)`` is
    computed in float32. Returns the mean score as a scalar tensor.
    """
    root_two = tf.sqrt(2.0)

    # confidence values below 70 are raised to 70
    sigma_f = tf.cast(tf.maximum(sigma, 70), dtype=tf.float32)

    # absolute errors above 1000 are capped at 1000
    abs_err = tf.cast(tf.minimum(tf.abs(y_true - y_pred), 1000), dtype=tf.float32)

    error_term = - root_two * abs_err / sigma_f
    spread_term = tf.math.log(root_two * sigma_f)

    return tf.reduce_mean(error_term - spread_term)

In [None]:
# create a simple scorer function that can alter the value of sigma
def l1(s):
    """Build an sklearn scorer around `laplace_log_likelihood`.

    Parameters
    ----------
    s : confidence (sigma) used by default when the scorer is evaluated.

    Returns
    -------
    An object produced by `make_scorer`, usable as `scoring=` in sklearn's
    cross-validation helpers.
    """
    def scorer_func(x, y, sigma=s):
        # use the `sigma` argument (defaulting to `s`) instead of silently ignoring it
        return laplace_log_likelihood(x, y, sigma=sigma).numpy()

    # greater_is_better=False makes sklearn flip the sign of the metric, so the
    # reported value matches the `-laplace_log_likelihood(...)` numbers printed
    # elsewhere in this notebook.
    return make_scorer(scorer_func, greater_is_better=False)

In [None]:
def base_shift(data, q=50):
    """Attach a per-patient baseline reading and the week offset from it.

    Parameters
    ----------
    data : DataFrame with at least Patient, Weeks, FVC and Percent columns.
    q : percentile (0-100) of each patient's row *positions* used to pick the
        baseline row; q=0 picks the patient's first row.

    Returns
    -------
    A copy of `data` with Base_Week, Base_FVC, Base_Percent (taken from the
    baseline row) and Week_Offset = Weeks - Base_Week added.
    """
    x = data.copy()

    def _baseline_row(patient_rows):
        # Pick by position, not by index label: int(percentile(labels)) is only a
        # valid label when the patient's index labels are contiguous integers.
        pos = int(np.percentile(np.arange(len(patient_rows)), q=q))
        return patient_rows.iloc[pos][["Weeks", "FVC", "Percent"]]

    # one baseline row per patient, indexed by Patient
    temp = x.groupby("Patient").apply(_baseline_row)

    temp = temp.rename(
        columns={"Weeks": "Base_Week",
                 "FVC": "Base_FVC",
                 "Percent": "Base_Percent"})

    # broadcast the baseline values onto every row of the patient
    x = x.merge(temp, on='Patient')

    # create week offsets
    x['Week_Offset'] = x['Weeks'] - x['Base_Week']

    return x

In [None]:
def multi_baseweek_frame(data, display=True):
    '''Pair every reading of a patient with every other reading of that patient.

    Each row of the result treats one reading as the baseline (Base_Week,
    Base_FVC, Base_Percent) for another reading of the same patient; pairs with a
    zero week offset are discarded. When `display` is true, the row/column counts
    before and after are printed. The result is sorted by Patient and Base_Week
    with a fresh index.'''

    baseline_side = data[['Patient', 'Weeks', 'FVC', 'Percent']].rename(
        columns={"Weeks": "Base_Week",
                 "FVC": "Base_FVC",
                 "Percent": "Base_Percent"})

    # self-join on Patient: all (reading, baseline) combinations
    paired = data.merge(baseline_side, on='Patient')

    # distance in weeks between the reading and its baseline
    paired['Week_Offset'] = paired['Weeks'] - paired['Base_Week']

    # a reading is not used as its own baseline
    nonzero_offset = paired['Week_Offset'] != 0
    paired = paired[nonzero_offset]

    if display:
        n_rows_in, n_cols_in = data.shape[0], data.shape[1]
        n_rows_out, n_cols_out = paired.shape[0], paired.shape[1]
        print ("Number of Samples:{:5} -> {:5}\nNumber of Columns:{:5} -> {:5}".format(
            n_rows_in, n_rows_out, n_cols_in, n_cols_out))

    ordered = paired.sort_values(by=['Patient', 'Base_Week'])
    return ordered.reset_index(drop=True)

In [None]:
def get_model_data(data, cat_cols, num_cols, to_drop, cat_method='1h', transform_stats=None,
                   age_bins=None, train=True, display_stats=True, math=None, factor=False):

    '''Feature engineering, categorical encoding and min-max scaling for model input.

    State fitted during a `train=True` call (the encoders, the scaling statistics and
    the output column order) is stored in the module-level globals `ordenc`, `onehenc`,
    `cat_comb`, `stats` and `x_cols`, and is reused by later `train=False` calls, so a
    training call has to come first.

    Parameters
    ----------
    data : input DataFrame; must contain 'FVC' when `train` is true.
    cat_cols : list of categorical columns to encode; they are dropped afterwards.
    num_cols : columns scaled as (value - min) / (max - min) using `stats`.
    to_drop : columns removed from the returned feature frame.
    cat_method : 'ord' (ordinal codes, "<col>_ord"), '1h' (one-hot, one column per
        category) or 'poly' (one indicator column per combination of categories).
    transform_stats : optional frame with 'min' and 'max' columns indexed by column
        name (e.g. `df.describe().T`); when None the stats come from `data` itself.
        Only consulted when `train` is true.
    age_bins : if truthy, adds 'binned_age' built from 'Age' and drops 'Age'.
        Must be >= 2 because the bin width is 100 // (age_bins - 1).
    train : fit and store state and return (X, Y); otherwise reuse state and return X.
    display_stats : when training, print absolute correlations of columns with FVC.
    math : iterable containing any of 'sin', 'cos', 'tan'; adds the corresponding
        function of 'Weeks', plus their product 'Math_Prod' when len(math) > 1.
    factor : add 'factor' = Base_FVC / Base_Percent.

    Returns
    -------
    (X, Y) when `train` is true, where Y is the non-null 'FVC' column; X alone
    (restricted to the training columns) otherwise.
    '''

    # fitted state shared between the training call and later inference calls
    global ordenc, onehenc, cat_comb, stats, x_cols

    X = data.copy().reset_index(drop=True)

    ##########################################################################

    if age_bins:
        X['binned_age'] = pd.cut(X['Age'], bins=range(0, 101, 100//(age_bins-1))).cat.codes / age_bins
        to_drop = to_drop + ['Age']

    if math:
        # The functions are elementwise, so they are applied to the column directly
        # rather than per-patient through groupby.apply (same values).
        prod = np.ones(X.shape[0])
        if 'sin' in math:
            X['Sin_week'] = np.sin(X['Weeks'])
            prod = prod * X['Sin_week']
        if 'cos' in math:
            X['Cos_week'] = np.cos(X['Weeks'])
            prod = prod * X['Cos_week']
        if 'tan' in math:
            X['Tan_week'] = np.tan(X['Weeks'])
            # multiply by the tangent column (the cosine column was used here before,
            # which was wrong and failed outright when 'cos' was not requested)
            prod = prod * X['Tan_week']

        if len(math) > 1:
            X['Math_Prod'] = prod

    if factor:
        X['factor'] = X['Base_FVC'] / X['Base_Percent']

    ##########################################################################

    if len(cat_cols) > 0:

        if cat_method == 'ord': # ordinal encoding for tree based models
            if train:
                ordenc = OrdinalEncoder()
                codes = ordenc.fit_transform(X[cat_cols])
            else:
                codes = ordenc.transform(X[cat_cols])

            X = X.merge(
                pd.DataFrame(
                    codes.astype(int),
                    columns=[col + "_ord" for col in cat_cols]),
                left_index=True, right_index=True)

        elif cat_method == '1h': # one hot encoding
            if train:
                onehenc = OneHotEncoder()
                indicators = onehenc.fit_transform(X[cat_cols])
            else:
                indicators = onehenc.transform(X[cat_cols])

            # one output column per category value, named after the category
            X = X.merge(
                pd.DataFrame(
                    indicators.toarray(),
                    columns=[*np.concatenate(onehenc.categories_)]),
                left_index=True, right_index=True)

        elif cat_method == 'poly': # polynomial feature encoding
            if train:
                # every combination of the observed category values
                cat_comb = np.array(
                    np.meshgrid(*[X[cat].unique() for cat in cat_cols])
                ).T.reshape(-1, len(cat_cols))

            for combination in cat_comb:
                name = "_".join(map(str, combination))
                # indicator is 1 only where every categorical column matches
                X[name] = 1
                for i in range(len(cat_cols)):
                    X[name] = X[name] & (X[cat_cols[i]] == combination[i]).astype(int)

    # drop the columns after they have been encoded
    to_drop = to_drop + list(cat_cols)

    ##########################################################################

    if train:
        if transform_stats is None:
            # saving stats for the future
            stats = X.describe().T
        else:
            stats = transform_stats

    # lets scale the numeric columns (We scale it with max possible values)
    for col in num_cols:

        # at inference time a numeric column may legitimately be absent
        if (not train) and (col not in X.columns):
            continue

        X[col] = (X[col] - stats.loc[col, 'min']) / (stats.loc[col, 'max'] - stats.loc[col, 'min'])

    ##########################################################################

    if train:

        Y = X['FVC'].dropna()

        if display_stats:

            # print out how well our features would do
            print (X.corr()['FVC'].abs().sort_values(ascending=False)[1:])

        X = X.drop(to_drop, axis=1)
        # remember the feature columns so inference frames get the same layout
        x_cols = X.columns

        return X, Y

    else:

        X = X.drop(to_drop, axis=1, errors='ignore')
        X = X[x_cols]

        return X

In [None]:
def augment_train_cosine(data, n_similar=3, threshold=0.25, display_sample=True):

    '''
    Augment each patient's records with readings borrowed from similar patients.

    Patients are described by FVC-slope statistics and Percent trends, grouped with
    their most similar patients (cosine similarity), and the mean shifted Percent
    curve of each group is used to fill weeks a patient has no reading for. FVC for
    those new rows is predicted from Percent with a linear regression.

    - `data`: frame with Patient, Weeks, FVC, Percent, Age, Sex and SmokingStatus.
    - `n_similar` is the number of patients clustered with each patient; the larger
      the cluster, the more erratic the result gets.
    - `threshold` is used as a measure to counter the influence of "outliers": a group
      mean is kept only for weeks with more than `(n_similar + 1) * threshold` readings.
    - `display_sample`: show a progress bar, print the augmentation factor and plot
      the first four patients.

    Returns a frame with Patient, Weeks, Percent, FVC, Sex, Age and SmokingStatus.
    '''

    from sklearn.metrics.pairwise import cosine_similarity

    feats = base_shift(data.copy(), q=0)

    # feature engineering -> similarity of percentage changes across patients
    # we use slopes and the way the Percent features vary from Base_Percent
    # as predictive features for clustering patients together.

    feats['present_minus_past'] = feats.groupby("Patient")['FVC'].transform('diff').fillna(0)
    feats['Week_diff'] = feats.groupby("Patient")['Weeks'].transform('diff').fillna(0)
    # per-step FVC slope; 0/0 (first row) gives NaN and x/0 (repeated week) gives
    # +/-inf. Both are mapped to a slope of 0. The replacement value is explicit:
    # without it pandas forward-fills the infinities instead.
    feats['pms'] = (
        (feats['present_minus_past'] / feats['Week_diff'])
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0))
    feats['pms_min']  = feats.groupby("Patient")['pms'].transform('min')
    feats['pms_25']   = feats.groupby("Patient")['pms'].transform(lambda x: np.percentile(x, q=25))
    feats['pms_mean'] = feats.groupby("Patient")['pms'].transform('mean')
    feats['pms_75']   = feats.groupby("Patient")['pms'].transform(lambda x: np.percentile(x, q=75))
    feats['pms_max']  = feats.groupby("Patient")['pms'].transform('max')
    feats['pms_sum']  = feats.groupby("Patient")['pms'].transform('sum')

    feats['pmb_avg'] = (feats['Percent'] - feats['Base_Percent']).groupby(feats.Patient).transform("mean")
    feats['p_std']    = feats.groupby("Patient")['Percent'].transform('std')

    feats = feats.merge(
        pd.concat([
            # overall Percent slope between the first and the last reading
            feats.groupby("Patient").apply(
                lambda x: (x['Percent'].values[-1] - x['Percent'].values[0]) /
                (x['Weeks'].values[-1] - x['Weeks'].values[0])).rename("Slope"),

            # week offsets at which the steepest rise / fall happened
            feats.groupby("Patient").apply(
                lambda x: x['Week_Offset'].iloc[np.argmax(x['pms'])]).rename("pmsw_max"),
            feats.groupby("Patient").apply(
                lambda x: x['Week_Offset'].iloc[np.argmin(x['pms'])]).rename("pmsw_min")

        ], axis=1, ignore_index=False), on='Patient')

    # we take just the head row for each patient for comparison
    feats = feats.groupby('Patient').head(1).reset_index(drop=True)

    # another measure of similarity
    feats['factor'] = feats['Base_FVC'] / feats['Base_Percent']

    # features to use for similarity clustering
    train_cols = [
        # compulsory features
        'Patient', #'Sex', 'SmokingStatus'

        # optionally added features for clustering
        # 'pms_min', 'pms_max', 'p_std',
        'pms_25', 'pms_mean', 'pms_75',
        'pms_sum', 'pmsw_min', 'pmsw_max',
        'pmb_avg', 'Slope', 'factor'
    ]

    cat_cols = np.intersect1d(['Sex', 'SmokingStatus'], train_cols)

    feats = feats[train_cols]
    feats = pd.get_dummies(feats, columns=cat_cols, drop_first=True, prefix='', prefix_sep='')

    # including that patient, find n_similar more patients
    n_similar += 1
    feature_matrix = feats.drop("Patient", axis=1)
    groups = (pd.DataFrame(
        np.argsort(
            # cosine similarity to get their similarity scores
            cosine_similarity(feature_matrix, feature_matrix))
        [:, -1:-n_similar-1:-1]))

    # cosine similarity is symmetric, so we remove the redundant ones
    groups = groups[~pd.DataFrame(np.sort(groups.values, axis=1)).duplicated()]

    # convert the indices to patient ids (lookup table built once, not per cell)
    idx_to_patient = feats.Patient.to_dict()
    groups = groups.applymap(lambda idx: idx_to_patient[idx]).apply(list, axis=1).to_dict()

    # the bottle neck of this function
    aug_data = []
    for group in tqdm(groups.values(), disable=not display_sample):

        # use the `data` argument (not the notebook-level `train` frame)
        grp = base_shift(data[data.Patient.isin(group)], q=0)

        # align the patients of the group on a common baseline percent / week
        grp['base_per_diff_from_mean'] = grp['Base_Percent'] - grp['Base_Percent'].unique().mean()
        grp['Percent_shifted'] = grp['Percent'] - grp['base_per_diff_from_mean']

        grp['base_week_diff_from_mean'] = grp['Base_Week'] - grp['Base_Week'].unique().mean()
        grp['Week_shifted'] = grp['Weeks'] - grp['base_week_diff_from_mean']

        grp = pd.merge_ordered(
            grp.drop(['Age', 'Sex', 'SmokingStatus', 'Base_Week', 'Week_Offset'], axis=1),

            # we obtain the mean only for those with samples greater than threshold %
            # these mean values are fit as such as the expected Percent
            (grp.groupby("Week_shifted")['Percent_shifted']
             .agg(['mean', 'count']).query(f'count > {n_similar * threshold}')
             .drop("count", axis=1)),

            on='Week_shifted', left_by='Patient'
        )

        aug_data.append(grp)

    aug = pd.concat(aug_data).reset_index(drop=True)

    # recurring features can simply be padded
    aug[['base_per_diff_from_mean', 'base_week_diff_from_mean', 'Base_Percent', 'Base_FVC']] = (
        aug.groupby("Patient")[['base_per_diff_from_mean', 'base_week_diff_from_mean',
                                'Base_Percent', 'Base_FVC']].fillna(method='ffill'))

    # get back the weeks from shifted weeks
    aug['Week_aug'] = aug['Week_shifted'] + aug['base_week_diff_from_mean']

    # For those percent values already present copy them, augment the rest
    aug['Percent_aug'] = np.where(
        aug['Percent'].isna(),
        aug['mean'] + aug['base_per_diff_from_mean'],
        aug['Percent'])

    # drop/clean those which have neither
    aug = aug[~aug['Percent_aug'].isna()]

    # fill the FVC values using percent -> FVC correlation
    test_ids = aug['FVC'].isna()
    aug.loc[test_ids, 'FVC'] = LinearRegression().fit(
        aug.loc[~test_ids, ['Base_Percent', 'Percent_aug', 'Base_FVC']],
        aug.loc[~test_ids, 'FVC']).predict(
        aug.loc[test_ids, ['Base_Percent', 'Percent_aug', 'Base_FVC']])

    # a patient can belong to several groups: average its duplicated weeks
    aug = aug.groupby(["Patient", 'Week_aug']).mean().reset_index()

    # retain only those columns that we need & rename them to match train
    aug = aug[['Patient', 'Week_aug', 'Percent_aug', 'FVC']]
    aug = aug.rename({'Week_aug': 'Weeks', 'Percent_aug': 'Percent'}, axis=1)

    # some weeks may end up slightly shifted to left or right
    aug['Weeks'] = aug['Weeks'].astype(int)

    # add these columns to be able to fit inside multibaseweek pipeline
    aug = aug.merge(
        (data[['Patient', 'Sex', 'Age', 'SmokingStatus']]
         .groupby('Patient').head(1).reset_index(drop=True)),

        on='Patient')

    if display_sample:

        print ("Data augmented by factor: {:.2f}x".format(1 + (
            aug.shape[0] - data.shape[0]) / data.shape[0]))

        # green line: original readings, red dots: augmented readings
        f, ax = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))
        for i, pat in enumerate(aug.Patient.unique()[:4]):

            ax[i][0].plot(*list(zip(*data[data.Patient == pat][['FVC', 'Weeks']].values))
                          [::-1], c='g', alpha=0.7)

            ax[i][1].plot(*list(zip(*data[data.Patient == pat][['Percent', 'Weeks']].values))
                          [::-1], c='g', alpha=0.7)

            ax[i][0].scatter(*list(zip(*aug[aug.Patient == pat][['FVC', 'Weeks']].values))
                          [::-1], c='r')

            ax[i][1].scatter(*list(zip(*aug[aug.Patient == pat][['Percent', 'Weeks']].values))
                          [::-1], c='r')


            ax[i][0].set(xlabel='Weeks', ylabel='FVC')
            ax[i][1].set(xlabel='Weeks', ylabel='Percent')
            f.suptitle("FVC & Percent Augmentation", y=.9)

    return aug

In [None]:
def augment_train_naive(
    data, steps=5, method='index', noise=.25, val_split=0.25, 
    end_pts=[None, None], display_sample=True):

    '''
    Augment each patient's readings by interpolating FVC / Percent on a regular week grid.

    data -> frame with Patient, Weeks, FVC, Percent, Age, Sex and SmokingStatus
    steps -> spacing (in weeks) of the grid added to the patient's own weeks
    method -> interpolation method passed to `DataFrame.interpolate`
    noise -> every row is shifted by the patient's std of each feature times one
             uniform draw from [-noise, noise]
    val_split -> accepted for compatibility; not used by this function
    end_pts -> start and end of augmentation, if None defaults to min/max for that patient
    display_sample -> plot the first four patients and print the augmentation factor

    Returns a frame with Patient, Weeks, FVC, Percent, Age, Sex and SmokingStatus.
    '''

    def _week_grid(weeks):
        # explicit None checks: 0 is a valid week and must not fall back to min/max
        start = weeks.min() if end_pts[0] is None else end_pts[0]
        stop = weeks.max() if end_pts[1] is None else end_pts[1]
        # regular grid plus the weeks that were actually observed
        return pd.Series(np.union1d(np.arange(start, stop, step=steps), weeks))

    week_grid = (data.groupby("Patient")['Weeks']
                 .apply(_week_grid)
                 .reset_index(level=0))

    # right join: observed weeks keep their values, grid-only weeks get NaN
    temp = data[['Patient','Weeks', 'FVC', 'Percent']].merge(
        week_grid, on=['Patient', 'Weeks'], how='right')

    def _interpolate_with_noise(values):
        # noise factor: uniform noise scaled by the standard deviation of each
        # feature; one draw per row, shared by FVC and Percent
        # (we assume std of percent is scaled up version of std of FVC)
        jitter = values.std().values * np.random.uniform(-noise, noise, [len(values), 1])
        return values.interpolate(method=method, limit_direction='both') + jitter

    # group_keys=False keeps the original row index so the assignment aligns
    temp.loc[:, ['FVC', 'Percent']] = (
        temp.groupby("Patient", group_keys=False)[['FVC', 'Percent']]
        .apply(_interpolate_with_noise))

    # bring back the static per-patient columns
    temp = temp.merge(
        data.groupby("Patient")[['Patient', 'Age', 'Sex', 'SmokingStatus']].head(1),
        on='Patient')

    if display_sample:
        # green line: original readings, red dots: augmented readings
        f, ax = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))

        for i, pat in enumerate(temp.Patient.unique()[:4]):

            ax[i][0].plot(*list(zip(*data[data.Patient == pat][['FVC', 'Weeks']].values))
                [::-1], c='g', alpha=0.7)

            ax[i][1].plot(*list(zip(*data[data.Patient == pat][['Percent', 'Weeks']].values))
                          [::-1], c='g', alpha=0.7)

            ax[i][0].scatter(*list(zip(*temp[temp.Patient == pat][['FVC', 'Weeks']].values))[::-1], c='r')

            ax[i][1].scatter(*list(zip(*temp[temp.Patient == pat][['Percent', 'Weeks']].values))[::-1], c='r')

            ax[i][0].set(xlabel='Weeks', ylabel='FVC')
            ax[i][1].set(xlabel='Weeks', ylabel='Percent')
            f.suptitle("FVC & Percent Augmentation", y=.9)

        print ("Data augmented by factor: {:.2f}x".format(1 + (
            temp.shape[0] - data.shape[0]) / data.shape[0]))

    return temp

In [None]:
# Split "<PatientID>_<week>" into its two parts. A raw string is used so that
# \w, \- and \d reach the regex engine instead of being invalid string escapes.
sub = (sample_sub.Patient_Week.str.extract(r"(ID\w+)_(\-?\d+)")
       .rename({0: "Patient", 1: "Weeks"}, axis=1))
sub['Weeks'] = sub['Weeks'].astype(int)

# attach the static patient attributes from the test table
sub = pd.merge(sub, test[['Patient', 'Sex', 'SmokingStatus']], on='Patient')

# rebuild the submission key from its parts
sub["Patient_Week"] = sub.Patient + "_" + sub.Weeks.astype(str)
sub.head()

In [None]:
class GBR(RegressorMixin, BaseEstimator):
    """Three gradient-boosting regressors giving a point forecast and a spread.

    `umodel` fits the `alpha` quantile, `lmodel` the `1 - alpha` quantile and
    `mmodel` uses the 'lad' loss for the point prediction.

    Parameters
    ----------
    alpha : upper quantile level; the lower level is `1 - alpha`.
    **params : extra keyword arguments forwarded to every
        `GradientBoostingRegressor`; they override this wrapper's defaults.
    """

    def __init__(self, alpha=.75, **params):
        self.alpha = alpha
        self.umodel = self._create_model(loss='quantile', q=self.alpha, **params)
        # NOTE(review): newer scikit-learn releases appear to name this loss
        # 'absolute_error' -- confirm against the installed version.
        self.mmodel = self._create_model(loss='lad', **params)
        self.lmodel = self._create_model(loss='quantile', q=1-self.alpha, **params)

    def _create_model(self, loss, q=.75, **params):
        """Build one regressor with the given loss and quantile level `q`."""
        # Defaults live in a dict so caller-supplied params can override them;
        # passing e.g. n_estimators used to raise a duplicate-keyword TypeError.
        settings = dict(
            init=LinearRegression(),
            criterion='friedman_mse',
            n_estimators=50, max_depth=2)
        settings.update(params)

        model = GradientBoostingRegressor(loss=loss, alpha=q, **settings)

        return model

    def fit(self, x, y):
        """Fit the upper, middle and lower models on the same data; returns self."""
        self.umodel.fit(x, y)
        self.mmodel.fit(x, y)
        self.lmodel.fit(x, y)
        return self

    def predict(self, X):
        """Point prediction from the middle model."""
        return self.mmodel.predict(X)

    def predict_forecast(self, X, return_bounds=False):
        """Point prediction together with its uncertainty.

        Returns (preds, upper, lower) when `return_bounds` is true, otherwise
        (preds, upper - lower).
        """
        preds = self.mmodel.predict(X)
        upper = self.umodel.predict(X)
        lower = self.lmodel.predict(X)

        if return_bounds:
            return preds, upper, lower
        else:
            return preds, (upper - lower)

In [None]:
# Summary statistics (describe(), transposed: one row per column) of the
# multi-base-week frame built from train + test; passed to get_model_data as
# `transform_stats` to supply the min/max used for scaling.
stats = multi_baseweek_frame(pd.concat([train, test]), display=False).describe().T

In [None]:
# Training frame: similarity-augmented train data expanded so that every reading
# of a patient is paired with every other reading as its baseline.
op = multi_baseweek_frame(
    augment_train_cosine(train, display_sample=False, n_similar=3, threshold=.25)
)

In [None]:
# Upper quantile level handed to GBR(alpha=...); the lower level is 1 - alpha.
alpha = .85

In [None]:
# preprocessing pipe essentials
cat_cols = ['Sex', 'SmokingStatus']
to_drop = ['FVC', 'Percent', 'Weeks', 'factor', 'Base_Percent']
num_cols = ['Weeks', 'Week_Offset', 'Base_Week', 'Age', 'Base_FVC', 'Percent', 'Base_Percent']
cat_method = 'ord'
math = []
age_bins = 5

# cross validation: patients are shuffled and split into `folds` equal chunks
# (patients beyond folds * val_len are never used for validation)
folds = 7
total_patients = train.Patient.unique()
np.random.shuffle(total_patients)
val_len = len(total_patients) // folds

# per-fold frames are collected in lists and concatenated once after the loop
# (DataFrame.append in a loop is quadratic and no longer exists in pandas 2)
fold_val_frames = []
fold_test_frames = []

# creating data suitable for model fitting and predictions
X, Y = get_model_data(
    op, factor=True, num_cols=num_cols, cat_cols=cat_cols, 
    age_bins=age_bins, to_drop=to_drop, display_stats=False, 
    cat_method=cat_method, transform_stats=stats, math=math)

# validation frame: original train rows with the first reading as baseline
X_VAL = base_shift(train, q=0)
Y_VAL = X_VAL['FVC'].dropna()

# percent value as the base percent
X_VAL['Percent'] = X_VAL['Base_Percent']

X_VAL = get_model_data(
    X_VAL, num_cols=num_cols, cat_cols=cat_cols, to_drop=to_drop, factor=True,
    cat_method=cat_method, train=False, age_bins=age_bins, math=math)

# creating x_test for predictions simultaneously
x_test = sub[['Patient', 'Weeks']].merge(
    test.rename({"Weeks": "Base_Week", 
                 "FVC": "Base_FVC", 
                 "Percent": "Base_Percent"}, axis=1), 
    on='Patient')

# create week offsets
x_test['Week_Offset'] = x_test['Weeks'] - x_test['Base_Week']

# percent value as the base percent
x_test['Percent'] = x_test['Base_Percent']

x_test = get_model_data(
    x_test, cat_cols=cat_cols, num_cols=num_cols, to_drop=to_drop, math=math,
    factor=True, train=False, cat_method=cat_method, 
    age_bins=age_bins).drop("Patient", axis=1)

for i in range(folds):
   
    val_patients = total_patients[(i)*val_len:(i+1)*val_len]
    train_patients = np.setdiff1d(total_patients, val_patients)
    
    # no patient may appear on both sides of the split
    assert len(np.intersect1d(val_patients, train_patients)) == 0
    
    in_train = X.Patient.isin(train_patients)
    in_val = X_VAL.Patient.isin(val_patients)

    x, y = X[in_train].drop("Patient", axis=1), Y[in_train]
    x_val, y_val = X_VAL[in_val].drop("Patient", axis=1), Y_VAL[in_val]
        
    # creating base model, no parameter tweaking
    model = GBR(alpha=alpha)

    model.fit(x, y)
    y_middle, y_upper, y_lower = model.predict_forecast(x_val, return_bounds=True)
    y_middle_pred, y_test_conf = model.predict_forecast(x_test)
    
    print ("For Fold #{} Val Score: {:.2f} @ 70 Confidence | {:.2f} @ Pred Confidence".format(
        i+1, - laplace_log_likelihood(y_middle, y_val), 
        - laplace_log_likelihood(y_middle, y_val, y_upper - y_lower)))
    
    # out-of-fold predictions for this fold's validation patients
    fold_val_frames.append(pd.DataFrame(
        data=np.stack([y_upper, y_lower, y_middle, y_val], axis=1),
        columns=['upper', 'lower', 'pred', 'actual']
    ))
    
    # test predictions pre-divided by `folds`, so summing them yields the mean
    fold_test_frames.append(pd.DataFrame(
        data=np.stack([y_middle_pred, y_test_conf], axis=1) / folds,
        columns=['pred', 'Confidence']
    ))

# data frames holding the predictions on train & test data
temp = pd.concat(fold_val_frames)
preds = pd.concat(fold_test_frames)

# every fold frame shares the same 0..n-1 index: summing by index averages the folds
preds = preds.groupby(preds.index).sum()
temp['Confidence'] = temp['upper'] - temp['lower']

In [None]:
# `temp` holds the out-of-fold rows in fold order (shuffled patients), not in the
# row order of `train`, so grouping by `train.Patient` would pair rows with the
# wrong patients. Rebuild the patient id of every `temp` row instead: each fold
# contributed the train rows of its validation patients, in train order.
val_row_patients = pd.concat([
    train.loc[train.Patient.isin(total_patients[k * val_len:(k + 1) * val_len]), 'Patient']
    for k in range(folds)
]).reset_index(drop=True)
assert len(val_row_patients) == len(temp), "fold rows and patient labels are out of sync"

# mean negative Laplace log-likelihood per patient, worst (largest) first
pat_scores = (
    temp.reset_index(drop=True)
    .groupby(val_row_patients)
    .apply(lambda x: -laplace_log_likelihood(x['actual'], x['pred'], x['Confidence']).numpy())
).rename('scores').reset_index().sort_values("scores", ascending=False)

# keep the eight worst patients (one per subplot in the summary figure)
pat_scores = pat_scores.head(8)
print ("Worst Patient-Mean-Score: {:.2f}".format(pat_scores.scores.mean()))
pat_scores = pat_scores.Patient.values

In [None]:
# overall out-of-fold score at two fixed confidences and at the predicted confidence
print ("\n|================== Summary ==================|\n\
Score on Total Dataset: {:.3f} @   70 Confidence\n\
Score on Total Dataset: {:.3f} @  225 Confidence\n\
Score on Total Dataset: {:.3f} @ Pred Confidence".format(
    -laplace_log_likelihood(temp['actual'], temp['pred'], 70),
    -laplace_log_likelihood(temp['actual'], temp['pred'], 225),
    -laplace_log_likelihood(temp['actual'], temp['pred'], temp['Confidence'])
))

# `temp` rows are in fold order (shuffled patients), not in the row order of
# `train`, so `train.Patient == pat` would select unrelated rows. Rebuild the
# patient id of each `temp` row from the fold definition instead.
val_row_patients = pd.concat([
    train.loc[train.Patient.isin(total_patients[k * val_len:(k + 1) * val_len]), 'Patient']
    for k in range(folds)
]).reset_index(drop=True)
cv_frame = temp.reset_index(drop=True)

f, ax = plt.subplots(figsize=(40, 40), nrows=4, ncols=2)
ax = ax.ravel()
for i, pat in enumerate(pat_scores):
    patient_rows = cv_frame.loc[val_row_patients == pat]
    if patient_rows.empty:
        # patient was not part of any validation fold: nothing to draw
        continue
    patient_rows.drop(["Confidence"], axis=1).plot(ax=ax[i], legend=False)
f.suptitle("Model's Worst Predictions", size=30)
f.tight_layout(rect=[0, 0.03, 1, 0.95]);

In [None]:
# fold-averaged test predictions go straight into the submission frame
sub['FVC'] = preds['pred']
sub['Confidence'] = preds['Confidence']

# final touches before submission: for the (patient, week) pairs present in the
# test table, use the measured FVC with a confidence of 70
for i in range(len(test)):
    known_key = test.Patient[i] + '_' + str(test.Weeks[i])
    is_known = sub['Patient_Week'] == known_key
    sub.loc[is_known, 'FVC'] = test.FVC[i]
    sub.loc[is_known, 'Confidence'] = 70

submission_cols = ['Patient_Week', 'FVC', 'Confidence']
sub[submission_cols].to_csv("quant_submission.csv", index=False)
sub.head()

In [None]:
cat_cols = ['Sex', 'SmokingStatus']
num_cols = ['Weeks', 'Week_Offset', 'Base_Week', 'Age', 'Base_FVC', 'Percent', 'Base_Percent']
to_drop = ["FVC", 'Percent', 'Weeks', 'Base_Week', 'Age', 'Base_Percent', 'factor']
math = []

# per-method CV scores and per-method augmented training frames
multi_method = {}
multi_data = {}
folds = 7

# interpolation methods for augment_train_naive, plus the similarity augmentation
methods = ['cubic', 'quadratic', 'cubicspline', 
           'pchip', 'akima', 'nearest', 'zero', 
           'slinear', 'linear', 'SIMILARITY_AUG']

# train patients for CV
total_patients = train.Patient.unique()
val_len = len(total_patients) // folds

for method in methods:
    
    # create the augmented dataset
    if method == 'SIMILARITY_AUG':
        temp = augment_train_cosine(train, n_similar=3, threshold=0.25, display_sample=False)
    else:
        temp = augment_train_naive(data=train, method=method, steps=15, noise=0., display_sample=False)
        
    # create multi base week data
    temp = multi_baseweek_frame(temp, display=False)
    # save the augmented dataframe
    multi_data[method] = temp
    
    # creating the data (processed for fitting); this also refreshes the fitted
    # encoder / scaling state that the validation frame below relies on
    X, Y = get_model_data(temp, num_cols=num_cols, cat_cols=cat_cols, to_drop=to_drop, 
        display_stats=False, cat_method='1h', math=math, factor=True)
        
    # validation frame: original train rows with the first reading as baseline
    X_VAL = base_shift(train, q=0)
    Y_VAL = X_VAL['FVC'].dropna()
    X_VAL['Percent'] = X_VAL['Base_Percent']
    X_VAL = get_model_data(
        X_VAL, num_cols=num_cols, cat_cols=cat_cols, factor=True,
        to_drop=to_drop, train=False, cat_method='1h', math=math)
    
    # shuffle the patients for cv
    np.random.shuffle(total_patients)
    scores = {}
    
    for i in range(folds):
   
        val_patients = total_patients[(i)*val_len:(i+1)*val_len]
        train_patients = np.setdiff1d(total_patients, val_patients)

        x, y = X[X.Patient.isin(train_patients)], Y[X.Patient.isin(train_patients)]
        x_val, y_val = X_VAL[X_VAL.Patient.isin(val_patients)], Y_VAL[X_VAL.Patient.isin(val_patients)]
        
        # no patient may appear on both sides of the split
        assert len(np.intersect1d(x_val.Patient.unique(), x.Patient.unique())) == 0
        
        # how does it perform on train (grouped CV so a patient stays in one split)
        scores['Train'] = scores.get('Train', []) + [cross_val_score(
            GBR(alpha=alpha), x.drop("Patient", axis=1), y, 
            scoring=l1(70), cv=GroupKFold(5), groups=x.Patient).mean()]

        # fit to measure model's performance
        lr = GBR(alpha=alpha).fit(x.drop("Patient", axis=1), y) 
        temp = lr.predict_forecast(x_val.drop("Patient", axis=1))
        # (pred, upper - lower) -> two-column frame
        temp = pd.DataFrame(np.stack(temp, 1), columns=['pred', 'conf'])
        temp['actual'] = y_val.reset_index(drop=True)
        
        # performance on validation data
        scores['Val'] = scores.get('Val', []) + [-laplace_log_likelihood(
            temp['actual'], temp['pred'], 70
        ).numpy()]
        
        # performance with confidence        
        scores['ValC'] = scores.get('ValC', []) + [-laplace_log_likelihood(
            temp['actual'], temp['pred'], temp['conf']
        ).numpy()]
        
        # Worst Performing mean scores: mean of the 25 largest per-row scores
        scores['ValW'] = scores.get('ValW', []) + [temp.apply(lambda x: -laplace_log_likelihood(
            x['actual'], x['pred'], x['conf']).numpy(), 1).nlargest(25).mean()]
        
    # NOTE: the spread printed after each "@" is np.std over the folds
    print ("Method: {}\nTrain score: {:5.2f} @ {:.2f} Variance \
    \nVal Score: {:7.2f} @ {:.2f} Variance\
    \nValC Score: {:6.2f} @ {:.2f} Variance\
    \nWorst Score: {:5.2f} @ {:.2f} Variance\n{}\n".format(
        method.upper(), np.mean(scores['Train']), np.std(scores['Train']), 
        np.mean(scores['Val']), np.std(scores['Val']), np.mean(scores['ValC']), 
        np.std(scores['ValC']), np.mean(scores['ValW']), np.std(scores['ValW']), "=" * 35
    ))
    
    multi_method[method] = scores

In [None]:
# mean worst-case validation score of every augmentation method
val_scores = [np.mean(multi_method[i]['ValW']) for i in methods]

# methods scoring strictly below the median are kept
cut_off = np.percentile(val_scores, 50)
kept_scores = [score for score in val_scores if score < cut_off]

print("At {:.1f} cutoff, the Worst Score would be {:.3f} @ Pred Conf".format(
    cut_off, np.mean(kept_scores)))

In [None]:
# creating x_test for predictions simultaneously
x_test = sub[['Patient', 'Weeks']].merge(
    test.rename({"Weeks": "Base_Week", 
                 "FVC": "Base_FVC", 
                 "Percent": "Base_Percent"}, axis=1), 
    on='Patient')

# create week offsets
x_test['Week_Offset'] = x_test['Weeks'] - x_test['Base_Week']

# percent for test
x_test['Percent'] = x_test['Base_Percent']

# predictions per kept method: (pred, upper, lower)
preds = {}

# train each model on all the saved augmented frames
for method in methods:
    
    # only methods whose worst-case CV score is below the cutoff contribute
    if np.mean(multi_method[method]['ValW']) >= cut_off:
        continue
    
    # refit the preprocessing state on this method's augmented frame
    x, y = get_model_data(
        multi_data[method], num_cols=num_cols, cat_cols=cat_cols, 
        to_drop=to_drop, train=True, cat_method='1h', 
        display_stats=False, factor=True,
    )

    lr = GBR(alpha=alpha).fit(x.drop("Patient", axis=1), y) 

    preds[method] = lr.predict_forecast(get_model_data(
        x_test, cat_cols=cat_cols, num_cols=num_cols, to_drop=to_drop, train=False, factor=True,
    ).drop("Patient", axis=1), return_bounds=True)

preds.keys()

In [None]:
# collect the mean CV scores of the methods that made it into `preds`
temp = {}
for name, fold_scores in multi_method.items():
    if name not in preds.keys():
        continue
    temp.setdefault('Val', []).append(np.mean(fold_scores['Val']))
    temp.setdefault('ValC', []).append(np.mean(fold_scores['ValC']))

print ("At same cutoff:\n\nBest val score: {:6.3f} @   70 Conf\n\
Best Val score: {:6.3f} @ Pred Conf".format(np.mean(temp['Val']), np.mean(temp['ValC'])))

In [None]:
# One column per kept method; each value in `preds` is the tuple returned by
# predict_forecast(..., return_bounds=True), i.e. (pred, upper, lower).
# NOTE(review): this relies on pandas storing each array as one object cell, so
# that rows 0/1/2 are pred/upper/lower -- confirm on the installed pandas version.
temp = pd.DataFrame(preds)

# average each quantity across methods
mean = np.mean(np.stack(temp.iloc[0].values), 0)
upper = np.mean(np.stack(temp.iloc[1].values), axis=0)
lower = np.mean(np.stack(temp.iloc[2].values), axis=0)

# the confidence is the width between the averaged upper and lower predictions
sub['FVC'] = mean
sub['Confidence'] = upper - lower

# save to csv file
sub[['Patient_Week', 'FVC', 'Confidence']].to_csv("submission.csv", index=False)

sub.head()

In [None]:
# distribution (and hence the min / max range) of the submitted confidence values
sub.Confidence.hist()