In [None]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import KFold

# Load the competition's train and test tables. `train1` is the working frame
# that later statements enrich with fold ids and scoring flags.
train1, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/osic-pulmonary-fibrosis-progression/train.csv",
        "../input/osic-pulmonary-fibrosis-progression/test.csv",
    )
)

################### Team CV scheme ########################

def mask_first(x):
    """Return an all-ones array shaped like ``x`` with its first entry set to 0.

    Intended as a per-group transform: once cast to bool, the result selects
    every row of the group except the first one.
    """
    keep = np.ones_like(x)
    keep[0] = 0
    return keep

class RobustCV(KFold):
    """Patient-level K-fold splitter that yields ready-made train/val frames.

    Folds are drawn over patient ids (not rows), so no patient appears on both
    sides of a split.  Each yielded frame holds the ('Patient', 'Weeks', 'FVC')
    rows of the selected patients joined with that patient's first-visit row,
    whose columns are prefixed with ``base_``.
    """

    def __init__(self, df, *args, **kwargs):
        """Prepare the per-patient lookup tables.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain at least 'Patient', 'Weeks' and 'FVC' columns.
        *args, **kwargs
            Forwarded unchanged to ``KFold.__init__``.
        """
        ordered = df.sort_values(['Patient', 'Weeks'])

        # Earliest-week row of every patient; every column except 'Patient'
        # gets a 'base_' prefix so it can be merged next to follow-up rows.
        self.base_features = ordered.groupby('Patient').head(1).rename(
            columns=lambda c: c if c == 'Patient' else f'base_{c}'
        )

        self.df = ordered[['Patient', 'Weeks', 'FVC']].set_index(['Patient', 'Weeks'])

        super().__init__(*args, **kwargs)

    @staticmethod
    def _drop_first_row_per_patient(frame):
        """Return ``frame`` without the first row of each 'Patient' group."""
        keep = frame.groupby('Patient')['Patient'].transform(mask_first).astype(bool)
        return frame[keep]

    def split(self, patients, val_last_three_only=False, train_drop_first=False, train_last_three_only=False):
        """Yield ``(train, val)`` DataFrames for each fold.

        Parameters
        ----------
        patients : array-like of patient ids
            The ids to split across folds.  Any array-like is accepted (NumPy
            array, list, pandas Series); it is coerced to a NumPy array so the
            positional fold indices can be applied to it.
        val_last_three_only : bool
            Keep only the last three rows per patient in the validation frame.
        train_drop_first : bool
            Drop each patient's first (baseline) row from the training frame.
        train_last_three_only : bool
            Keep only the last three rows per patient in the training frame.

        The validation frame always has each patient's first row removed.
        """
        # KFold returns positional indices; index a plain ndarray so they are
        # never interpreted as labels (Series) or rejected (list).
        patients = np.asarray(patients)

        for train_idx, val_idx in super().split(patients):
            train_patient = patients[train_idx]
            val_patient = patients[val_idx]
            assert set(train_patient).isdisjoint(val_patient)

            # ---- training frame ----
            train = self.df.loc[train_patient].reset_index()
            if train_drop_first:
                train = self._drop_first_row_per_patient(train)
            if train_last_three_only:
                train = train.groupby('Patient').tail(3)
            train = pd.merge(train, self.base_features, left_on='Patient', right_on='Patient')

            # ---- validation frame ----
            val = self.df.loc[val_patient].reset_index()
            # The baseline row is the model input, never a prediction target.
            val = self._drop_first_row_per_patient(val)
            val = pd.merge(val, self.base_features, left_on='Patient', right_on='Patient')
            if val_last_three_only:
                val = val.groupby('Patient').tail(3)

            yield train, val
ROOT = '../input/osic-pulmonary-fibrosis-progression'
train_df = pd.read_csv(f'{ROOT}/train.csv')
N_FOLDS = 5
kf = RobustCV(train_df, N_FOLDS, shuffle=True, random_state=2020)

# Build the Patient -> fold lookup from the validation side of every split.
# The per-fold frames are collected in a list and concatenated once:
# DataFrame.append no longer exists in pandas >= 2.0, and appending inside a
# loop re-copies the accumulated frame on every iteration.
fold_frames = []
for n_fold, (train, val) in enumerate(kf.split(train_df['Patient'].unique(), val_last_three_only=False, train_drop_first=False, train_last_three_only=False)):
    val['fold'] = n_fold
    fold_frames.append(val[['Patient', 'fold']])
folddf = pd.concat(fold_frames, ignore_index=True).drop_duplicates(ignore_index=True)

# Inner join: only patients that received a fold id are kept in train1.
train1 = pd.merge(train1, folddf, on='Patient', how='inner')

########################### Flag last three records per patient for CV results ##################################
def mlthree(s):
    """Flag the last three positions of a series.

    Parameters
    ----------
    s : pandas.Series
        Only its index and length are used; the values are ignored.

    Returns
    -------
    pandas.Series
        Integer series on ``s.index`` holding 1 for the last three entries (or
        for all entries when there are fewer than three) and 0 elsewhere.  An
        empty input yields an empty result instead of raising.
    """
    flags = pd.Series(0, index=s.index)
    # A negative slice is clamped to the series length, so short and empty
    # inputs need no special-casing.
    flags.iloc[-3:] = 1
    return flags

# Flag, per patient, the last three rows (in current row order) with 1 and all
# other rows with 0.  `transform` returns a result aligned to train1's own
# index; `apply` on pandas >= 2.0 prepends the group key to the index, which
# breaks the column assignment.
# NOTE(review): "last three" means the last three rows of each patient as they
# appear in train1 - assumes rows are in chronological order; confirm.
train1["forscore"] = train1.groupby('Patient')['Weeks'].transform(mlthree)
# Test rows are never used for CV scoring.
test['forscore'] = 0
######################################################################


##################### Natural "augmentation": use every week (other than the last 3) as a baseline ###########################

# Earliest observed week of each patient = that patient's real baseline week.
first_week_per_patient = (
    train1[['Patient', 'Weeks']]
    .rename(columns={"Weeks": "realbasewk"})
    .groupby('Patient')
    .min()
)
train1 = pd.merge(train1, first_week_per_patient, how='left', on='Patient')

# Rows that may act as a baseline: everything not flagged for scoring.
candidate_baselines = (
    train1.loc[train1['forscore'] == 0, ['Patient', 'Weeks', 'FVC', 'Percent']]
    .rename(columns={"Weeks": "basewk",
                     "FVC": "base",
                     "Percent": "basepercent"})
)

# Rows to predict: everything except the real baseline visit.
target_rows = train1[train1['realbasewk'] != train1['Weeks']]

# Pair every target row with every candidate baseline of the same patient.
# reset_index() gives the result a fresh 0..n-1 index (the old one is kept in
# an 'index' column).
train2 = pd.merge(target_rows, candidate_baselines, how='outer', on='Patient').reset_index()

# Flag for original records (baseline is the real first visit); only those are
# used for CV scoring.
train2['original'] = (train2['basewk'] == train2['realbasewk'])

######################################################################



################### Old CV scheme ########################
#np.random.seed(42)
#
#folds = pd.DataFrame({'Patient': np.unique(train2['Patient'])}) #train2.loc[train2['train']==1, 'patno']
#folds['fold'] = np.int32(np.random.choice(a=np.repeat([0,1,2,3,4], 
#                                            repeats=np.ceil(len(folds)/5)), 
#                                size=len(folds), 
#                                replace=False))
##train2 = pd.merge(train2, folds, on='Patient', how='left') #.reset_index()
#train2.loc[np.isnan(train2.fold.values), 'fold'] = -1


################### Add some more features ########################
def _add_model_features(frame):
    """Return a copy of ``frame`` with log/change features and week interactions.

    Reads the columns FVC, base, Percent, basepercent, Weeks, basewk, Sex and
    SmokingStatus.  Appends, in this order: logchg, logbase, logbasepercent,
    logpercent, logpercentchg, chg, relweek, logFVC, sexm, smoker, followed by
    the ``rwk_*`` products of ``relweek`` with another column.
    """
    log_fvc = np.log(frame['FVC'])
    log_base = np.log(frame['base'])
    log_percent = np.log(frame['Percent'])
    log_base_percent = np.log(frame['basepercent'])

    out = frame.assign(
        logchg=log_fvc - log_base,
        logbase=log_base,
        logbasepercent=log_base_percent,
        logpercent=log_percent,
        logpercentchg=log_percent - log_base_percent,
        chg=frame['FVC'] - frame['base'],
        relweek=frame['Weeks'] - frame['basewk'],  # weeks elapsed since the chosen baseline
        logFVC=log_fvc,
        # np.select falls back to 0 for values matching none of the conditions.
        sexm=np.select([frame['Sex'].eq('Male'), frame['Sex'].eq('Female')], [1, 0]),
        smoker=np.select([frame['SmokingStatus'].eq("Never smoked"),
                          frame['SmokingStatus'].eq("Ex-smoker"),
                          frame['SmokingStatus'].eq("Currently smokes")],
                         [0, 1, 2]),
    )

    # relweek interaction terms: new column name -> factor multiplied with relweek.
    interaction_factors = {
        'rwk_bwk': out['basewk'],
        'rwk_smk1': out['smoker'] > 0,
        'rwk_smk2': out['smoker'] > 1,
        'rwk_lbp': out['logbasepercent'],
        'rwk_base': out['base'],
        'rwk_logbase': out['logbase'],
        'rwk_bp': out['basepercent'],
    }
    for column_name, factor in interaction_factors.items():
        out[column_name] = out['relweek'] * factor

    return out


train2 = _add_model_features(train2)



In [None]:
# A bunch of random number seeds to ensure repeated model training produces varying, but reproducible results.
# The list is indexed by model number when fitting the RANSAC models, so it must contain at least as many
# entries as the number of models fitted per fold.
myseeds = [538, 42, 23, 666, 2020, 4629, 25479,1337, 13, 649913,
           73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173,
    179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281,
    283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409,
    419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541,
    547, 557, 563, 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659,
    661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, 743, 751, 757, 761, 769, 773, 787, 797, 809,
    811, 821, 823, 827, 829, 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937, 941,
    947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069,
    1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
    1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321, 1327, 1361, 1367, 1373,
    1381, 1399, 1409, 1423, 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
    1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657,
    1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733, 1741, 1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811,
    1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889, 1901, 1907, 1913, 1931, 1933, 1949, 1951, 1973, 1979, 1987,
    1993, 1997, 1999, 2003, 2011, 2017, 2027, 2029, 2039, 2053, 2063, 2069, 2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129,
    2131, 2137, 2141, 2143, 2153, 2161, 2179, 2203, 2207, 2213, 2221, 2237, 2239, 2243, 2251, 2267, 2269, 2273, 2281, 2287,
    2293, 2297, 2309, 2311, 2333, 2339, 2341, 2347, 2351, 2357, 2371, 2377, 2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423,
    2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503, 2521, 2531, 2539, 2543, 2549, 2551, 2557, 2579, 2591, 2593, 2609, 2617,
    2621, 2633, 2647, 2657, 2659, 2663, 2671, 2677, 2683, 2687, 2689, 2693, 2699, 2707, 2711, 2713, 2719, 2729, 2731, 2741,
    2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801, 2803, 2819, 2833, 2837, 2843, 2851, 2857, 2861, 2879, 2887, 2897, 2903,
    2909, 2917, 2927, 2939, 2953, 2957, 2963, 2969, 2971, 2999, 3001, 3011, 3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079,
    3083, 3089, 3109, 3119, 3121, 3137, 3163, 3167, 3169, 3181, 3187, 3191, 3203, 3209, 3217, 3221, 3229, 3251, 3253, 3257,
    3259, 3271, 3299, 3301, 3307, 3313, 3319, 3323, 3329, 3331, 3343, 3347, 3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413,
    3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491, 3499, 3511, 3517, 3527, 3529, 3533, 3539, 3541, 3547, 3557, 3559, 3571,
    3581, 3583, 3593, 3607, 3613, 3617, 3623, 3631, 3637, 3643, 3659, 3671, 3673, 3677, 3691, 3697, 3701, 3709, 3719, 3727,
    3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797, 3803, 3821, 3823, 3833, 3847, 3851, 3853, 3863, 3877, 3881, 3889, 3907,
    3911, 3917, 3919, 3923, 3929, 3931, 3943, 3947, 3967, 3989, 4001, 4003, 4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057,
    4073, 4079, 4091, 4093, 4099, 4111, 4127, 4129, 4133, 4139, 4153, 4157, 4159, 4177, 4201, 4211, 4217, 4219, 4229, 4231,
    4241, 4243, 4253, 4259, 4261, 4271, 4273, 4283, 4289, 4297, 4327, 4337, 4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409,
    4421, 4423, 4441, 4447, 4451, 4457, 4463, 4481, 4483, 4493, 4507, 4513, 4517, 4519, 4523, 4547, 4549, 4561, 4567, 4583,
    4591, 4597, 4603, 4621, 4637, 4639, 4643, 4649, 4651, 4657, 4663, 4673, 4679, 4691, 4703, 4721, 4723, 4729, 4733, 4751]

In [None]:
# Get CV splits and data ready
# Get CV splits and data ready.
# The row labels collected below are used as *positions* into the NumPy arrays
# X and y, which is only valid while train2 carries a plain 0..n-1 index.
assert train2.index.equals(pd.RangeIndex(len(train2))), "train2 must have a default RangeIndex"

# One [training rows, validation rows] pair per fold.  Training uses every
# (target, baseline) pairing of the other folds' patients; validation uses only
# the 'original' pairings (real baseline week) of the held-out fold.
fold_splits = [
    [
        train2[train2['fold'] != fff].index.tolist(),
        train2[(train2['fold'] == fff) & (train2['original'] == True)].index.tolist(),
    ]
    for fff in range(N_FOLDS)
]
# Note: we will model logFVC, because that might be numerically more stable (at least performed a little better)
y = np.array( train2['logFVC'] ).flatten()
X = np.array( train2[ ['relweek', 'logbase', 'rwk_smk2', 'rwk_logbase']] )


# Number of RANSAC models to fit and take the median prediction over
num_ransac = 250
if num_ransac > len(myseeds):
    raise ValueError(f"num_ransac={num_ransac} needs that many seeds, but myseeds has only {len(myseeds)}")

# Dataframe that will contain out-of-fold predictions
oofs0 = train2.copy()

# Loop over folds
for foldid in range(N_FOLDS):
    trainidx, predidx = fold_splits[foldid]

    # One row of predictions per RANSAC model, one column per validation row
    mepreds = np.empty([num_ransac, len(predidx)])

    # Fit RANSAC models for the fold, each with its own fixed seed
    for ransac_model in range(num_ransac):
        ransac = linear_model.RANSACRegressor(random_state=myseeds[ransac_model])
        ransac.fit(X[trainidx], y[trainidx])
        mepreds[ransac_model, :] = ransac.predict(X[predidx]) # write predictions from individual models

    # Point estimate: median log-scale prediction across the models, back-transformed to FVC
    oofs0.loc[predidx, 'pred'] = np.exp( np.median(mepreds, axis=0) )
    # Spread: standard deviation of the back-transformed predictions across the models
    oofs0.loc[predidx, 'predsd'] = np.std( np.exp( mepreds), axis=0)

#print(train2[['Percent', 'ransacloo1']])
#print(train2[['FVC', 'ransacloo2']])


In [None]:
# Looking at how the MAE behaves gives some indication on whether a constant 'Confidence'-value would be okay
import seaborn as sns

# Rows that received an out-of-fold prediction, plus the log absolute error
# ('logMAE') and the log of the across-model prediction spread ('logsd').
investigate = oofs0.loc[
    oofs0['pred'].notnull(),
    ['FVC', 'pred', 'predsd', 'forscore', 'relweek'],
].assign(
    logMAE=lambda d: np.log(np.absolute(d['FVC'] - d['pred'])),
    logsd=lambda d: np.log(d['predsd']),
)

# Only the rows flagged for scoring are plotted.
scored_rows = investigate[investigate['forscore'] == 1]

# Firstly, no hint that the variation in the model predictions is a good indicator
#sns.regplot(data=investigate[investigate['forscore']==1], x="logsd", y="logMAE",  lowess=True)

# It seems like the error is higher the further away we are from the baseline week (makes sense)
sns.regplot(data=scored_rows, x="relweek", y="logMAE", lowess=True)


In [None]:
def _print_cv_scores(frame):
    """Print three CV summaries for ``frame`` and return the per-row confidences.

    Printed, in order:
      1. mean absolute error between 'FVC' and 'pred';
      2. -MAE*sqrt(2)/290 - log(sqrt(2)*290), i.e. the score with a constant
         confidence of 290;
      3. mean of -|error|*sqrt(2)/conf - log(sqrt(2)*conf) with the
         week-dependent confidence conf = 235 + relweek/63 * 80.
    """
    abs_err = np.absolute(frame['FVC'] - frame['pred'])

    print( np.mean(abs_err) )  # value noted by the author: 201.48929130152206

    print( -np.mean(abs_err)*np.sqrt(2)/290 - np.log(np.sqrt(2)*290) )

    week_confs = 235 + frame['relweek']/63 * 80
    # Author's tuning notes:
    #-6.987823939981736, 235,80: -6.985560881063153, 235,85: -6.9856983175228615, 235,75:-6.985543406781326  240: -6.985843635925146
    print( np.mean( -abs_err*np.sqrt(2) / week_confs - np.log(np.sqrt(2)* week_confs )) )

    return week_confs


# All rows first, then only the rows flagged for scoring (last three per patient).
confs = _print_cv_scores(oofs0)
confs = _print_cv_scores(oofs0[oofs0['forscore']==1])

In [None]:
# Out-of-fold table in submission layout: keep rows that have a prediction,
# expose the prediction as 'FVC' (the observed value becomes 'trueFVC'), attach
# the week-dependent Confidence and the Patient_Week key.
oofs1 = (
    oofs0.loc[
        oofs0['pred'].notnull(),
        ['Patient', 'Weeks', 'relweek', 'pred', 'fold', 'forscore', 'FVC'],
    ]
    .rename(columns={'FVC':'trueFVC', "pred": "FVC", 'forscore': 'last3'})
    .reset_index()
    .assign(
        Confidence=lambda d: 235 + d['relweek']/63 * 80,
        Patient_Week=lambda d: d['Patient'].astype(str) + '_' + d['Weeks'].astype(str),
    )
    .loc[:, ['Patient_Week', 'FVC', 'Confidence', 'fold', 'last3', 'trueFVC']]
)

oofs1

In [None]:
# Persist the out-of-fold predictions to the working directory.  No `index`
# argument is passed, so with pandas defaults the row index is written as an
# extra leading column.
oofs1.to_csv('ransacaug_oof.csv')