In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#  OSIC Pulmonary fibrosis progression 


<img src="https://media.giphy.com/media/WtUK5I9TbWiRcGrVZh/giphy.gif">

# 1. Description 

## Suppose you are diagnosed with pulmonary fibrosis (a disorder with no known cause and no known cure, created by scarring of the lungs). Its outcome can range from long-term stability to rapid deterioration, and doctors aren't easily able to tell where an individual may fall on that spectrum. So it is our job, as data scientists, to take on this responsibility and help make sure that no one has to suffer from this disease any more.

# 2 . So what is Pulmonary Fibrosis ?

## Pulmonary fibrosis is a lung disease that occurs when lung tissue becomes damaged and scarred. This thickened, stiff tissue makes it more difficult for your lungs to work properly. As pulmonary fibrosis worsens, you become progressively more short of breath.

## The scarring associated with pulmonary fibrosis can be caused by a multitude of factors. But in most cases, doctors can't pinpoint what's causing the problem. When a cause can't be found, the condition is termed idiopathic pulmonary fibrosis.

<img src = 'https://www.wikidoc.org/images/d/d3/Pulmonary_fibrosis.gif' > 

# 3.  So what do we need to predict ?

## We need to predict a patient’s severity of decline in lung function based on a CT scan of their  lungs ,  we need to determine the  lung function based on output from a spirometer, which measures the volume of air inhaled and exhaled. The challenge is to use machine learning techniques to make a prediction with the image, metadata, and baseline FVC as input.

 # 4 . so what is FVC ?

## It stands for forced vital capacity . It  is the amount of air that can be forcibly exhaled from your lungs after taking the deepest breath possible, as measured by spirometry.

# 5. Spirometry ?? what is that !

## Spirometry is the most common of the pulmonary function tests. It measures lung function, specifically the amount and/or speed of air that can be inhaled and exhaled. Spirometry is helpful in assessing breathing patterns that identify conditions such as asthma, pulmonary fibrosis, cystic fibrosis, and COPD.

# 1 :  Importing the libraries

In [None]:
import tensorflow as tf 
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
import tensorflow.io as tfio
from keras.preprocessing import image
import matplotlib.pyplot as plt 
import glob as glob 
import seaborn as sns 
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd 
import numpy as np
from skimage import morphology , segmentation , measure 
from sklearn.preprocessing import OneHotEncoder , LabelEncoder 
from sklearn.compose import ColumnTransformer
import os
import pydicom
!pip install dicom
import dicom 
import imageio
from IPython.display import Image
from timeit import timeit
import tensorflow.keras.backend as K
import tensorflow.keras.layers as Layers
import tensorflow.keras.models as Models
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error

# 2 :  Now its time for Exploratory Data Analysis

In [None]:
# Load the training table and report its dimensions (rows, columns).
train_x = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
print('the no of rows is {} and the no of columns is {} '.format(*train_x.shape))

In [None]:
# Summary statistics of the training table (rendered as the cell output).
train_x.describe()

In [None]:
# Load the test table and report its dimensions (rows, columns).
test_x = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
print('the no of rows is {} and the no of columns is {} '.format(*test_x.shape))

In [None]:
# Summary statistics of the test table (rendered as the cell output).
test_x.describe()

In [None]:
# lets check the no of males and females 
# One bar per value of the Sex column. The counts are rows of train_x, so a
# patient contributes once for every row they have in the table.
sns.countplot( x = 'Sex' , data = train_x )

## As we can clearly see, the number of males is far higher than the number of females, which is a point to remember.

# Distribution of images for each patient 

In [None]:
# Thanks to PAB97 for the plot, couldn't figure out how to do it with Seaborn .

# Count the rows of train_x per (Patient, Age, Sex, SmokingStatus) group.
# The Patient index level is renamed to 'id' so that it does not collide with
# the counted 'Patient' column when the index is turned back into columns.
new_df = (
    train_x
    .groupby([train_x.Patient, train_x.Age, train_x.Sex, train_x.SmokingStatus])['Patient']
    .count()
    .rename_axis(['id', 'Age', 'Sex', 'SmokingStatus'])
    .reset_index()
    .rename(columns={'Patient': 'freq'})
)

# One bar per patient id, ordered by ascending count; ids are hidden on the axis.
fig = px.bar(new_df, x='id', y='freq', color='freq')
fig.update_layout(
    title='Distribution of images for each patient',
    xaxis={'categoryorder': 'total ascending'},
)
fig.update_xaxes(showticklabels=False)
fig.show()


# distribution of age 

In [None]:
# Histogram of Age over the grouped frame new_df (one row per
# Patient/Age/Sex/SmokingStatus group), using 42 bins.
fig = px.histogram(new_df, x='Age', nbins=42)

# Light-blue bars with a dark-blue outline.
fig.update_traces(
    marker=dict(
        color='rgb(158,202,225)',
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
    opacity=0.6,
)

fig.update_layout(title='Distribution of Age')

fig.show()

In [None]:
# lets check the smoking status 
# One bar per value of the SmokingStatus column, counted over the rows of train_x.
sns.countplot( x = 'SmokingStatus' , data = train_x )

# here we can see that no of ex smoker is way higher 

In [None]:
 fig = px.histogram(
    train_x, 
    x='Age',
    color='SmokingStatus',
    # fixed colour per smoking-status value
    color_discrete_map=
        {
            'Never smoked':'yellow',
            'Currently smokes':'cyan',
            'Ex-smoker': 'green', 
        },
    # expose every column of train_x in the hover tooltip
    hover_data=train_x.columns
)

# NOTE(review): the histogram is built from train_x rather than the grouped
# new_df, so the "unique patients" wording in the title may not hold - confirm.
fig.update_layout(title='Distribution of Age w.r.t. SmokingStatus for unique patients')

# Black bar outlines, slightly transparent bars.
fig.update_traces(
    marker_line_color='black',
    marker_line_width=1.5, 
    opacity=0.85
)

fig.show()

In [None]:
# now analyzing the sex with respect to smoking status 
# Rows of train_x counted per Sex, with one bar colour per SmokingStatus value.
plt.figure(figsize = (5 , 5))
sns.countplot(x = 'Sex' , hue = 'SmokingStatus' , data = train_x)

# as we can clearly see that male is dominating in case of ex smoker 

In [None]:
# Age histogram of train_x split by Sex, with every column available on hover.
sex_palette = {'Male': 'blue', 'Female': 'mediumturquoise'}
fig = px.histogram(
    train_x,
    x='Age',
    color='Sex',
    color_discrete_map=sex_palette,
    hover_data=train_x.columns,
)

fig.update_layout(title='Distribution of Age w.r.t. sex for unique patients')

# Black bar outlines, slightly transparent bars.
fig.update_traces(
    marker=dict(line=dict(color='black', width=1.5)),
    opacity=0.85,
)

fig.show()
50
55
60
65
70
75
80
85
0
20
40
60
80
100
120
140


# as we can clearly see that the no of males are too high in between age ( 64 - 74 )

In [None]:
# now lets see the correlation between features using heatmap 
# train_x also holds string columns (Patient, Sex, SmokingStatus). Newer pandas
# versions no longer drop those silently inside DataFrame.corr(), so the
# numeric columns are selected explicitly before correlating.
sns.heatmap(train_x.select_dtypes(include = 'number').corr() , annot = True , cmap=plt.cm.cool)

# as we can see that the percent and FVC are having a good relationship

In [None]:
# lets check the FVC distribution graph
# The plotted column is FVC; the title previously read "SVC" by mistake.
a = sns.distplot(train_x['FVC'], color = 'r')
a.set_title('Distribution plot of FVC', color = 'g', fontsize = 18)


In [None]:
# now lets check the percent distribution graph 
# Distribution of the Percent column of train_x, drawn in green with a red title.
b = sns.distplot(train_x['Percent'] , color = 'g')
b.set_title('Distribution plot of Percent' , color = 'r' , fontsize = 18)


In [None]:
# weeks: number of rows of train_x recorded for each value of the Weeks column
import plotly.express as px
week_counts = train_x['Weeks'].value_counts()
data = px.bar(x=list(week_counts.keys()), y=list(week_counts.values))
data

In [None]:
# lets see the progression of FVC by sex 
# One line per Patient (line_group), FVC against Weeks, coloured by Sex.
fig = px.line(train_x, 'Weeks', 'FVC', line_group='Patient', color='Sex',
             title='Pulmonary Condition Progression by Sex')
# draw both the connecting lines and the individual measurement points
fig.update_traces(mode='lines + markers')

In [None]:
# now lets check the pulmonary condition progression with respect to smoking status
# One line per Patient (line_group), FVC against Weeks, coloured by SmokingStatus.
fig = px.line(train_x, 'Weeks', 'FVC', line_group='Patient', color='SmokingStatus',
             title='Pulmonary Condition Progression by Smoking Status')
# draw both the connecting lines and the individual measurement points
fig.update_traces(mode='lines+markers')

# **Now friends the wait is over lets pre-process the DICOM files **

In [None]:
# lets see the no of unique patient
n_unique_patients = len(train_x['Patient'].unique())
print('The Number of Unique Patients in training data are : {}'.format(n_unique_patients))


In [None]:
# Root folder of the training scans; every DICOM file sits one directory level below it.
data_path = '../input/osic-pulmonary-fibrosis-progression/train/'

# NOTE(review): output_path is not used in the visible part of the notebook.
output_path = '../input/output/'
# Every .dcm file found exactly one directory level below data_path, sorted by path.
train_image_files = sorted(glob.glob(os.path.join(data_path, '*', '*.dcm')))
# Entries of data_path in sorted order, so that patients[i] is stable between runs.
patients = sorted(os.listdir(data_path))

# The value printed is the number of DICOM files, not a list of patient IDs,
# so the label now says so; the five lines after it are sample file paths.
print('Number of training DICOM files :', len(train_image_files))
print('Some sample DICOM file paths :')
print("\n".join(train_image_files[:5]))

# Now lets create two helper functions 
### 1. load_scan will load all DICOM images from a folder into a list for manipulation.
### 2. The voxel values in the images are raw. get_pixels_hu converts raw values into Hounsfield units (HU).
### 3. The transformation is linear. Therefore, so long as you have a slope and an intercept, you can rescale a voxel value to HU.
### 4. Both the rescale intercept and rescale slope are stored in the DICOM header at the time of image acquisition (these values are scanner-dependent, so you will need external information).

In [None]:
def load_scan(path):
    """
    Loads scans from a folder and into a list.

    Every entry of `path` is read as a DICOM file, the slices are ordered by
    their InstanceNumber, and SliceThickness of every slice is overwritten with
    the distance between the first two slices.

    Parameters: path (Folder path)

    Returns: slices (List of slices, sorted by InstanceNumber). With fewer than
             two slices no spacing can be measured, so the slices are returned
             with their SliceThickness untouched.
    """

    # dcmread is the current pydicom reader; read_file is its deprecated alias.
    slices = [pydicom.dcmread(os.path.join(path, s)) for s in os.listdir(path)]
    slices.sort(key = lambda x: int(x.InstanceNumber))

    # The spacing is derived from slices[0] and slices[1]; without a second
    # slice both the primary and the fallback computation would raise IndexError.
    if len(slices) < 2:
        return slices

    try:
        # preferred: difference of the z-coordinate of the slice positions
        slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except (AttributeError, IndexError, TypeError, ValueError):
        # position tag missing or unusable -> fall back to SliceLocation
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)

    for s in slices:
        s.SliceThickness = slice_thickness
    return slices
def get_pixels_hu(scans):
    """
    Converts raw images to Hounsfield Units (HU).

    The pixel arrays of all scans are stacked into one int16 volume, the
    out-of-scan marker value -2000 is replaced by 0, and the linear rescale
    HU = slope * pixel + intercept is applied using the RescaleSlope and
    RescaleIntercept of the FIRST scan for the whole volume.

    Parameters: scans (Raw images; each needs .pixel_array, and the first one
                .RescaleIntercept and .RescaleSlope)

    Returns: image (NumPy int16 array, one leading axis entry per scan)
    """
    hu_volume = np.stack([scan.pixel_array for scan in scans]).astype(np.int16)

    # The scanner's field of view is round while the image is square;
    # pixels outside the scan carry -2000 and are reset to 0.
    hu_volume[hu_volume == -2000] = 0

    # HU = m*P + b, with m and b taken from the first slice
    first_scan = scans[0]
    intercept = first_scan.RescaleIntercept
    slope = first_scan.RescaleSlope

    if slope != 1:
        # scale in float64, then truncate back to int16
        hu_volume = (slope * hu_volume.astype(np.float64)).astype(np.int16)

    hu_volume += np.int16(intercept)

    return np.array(hu_volume, dtype=np.int16)

# We need to know what exactly a Hounsfield unit is.
## The Hounsfield unit (HU) scale is a linear transformation of the original linear attenuation coefficient measurement into one in which the radiodensity of distilled water at standard pressure and temperature (STP) is defined as zero Hounsfield units (HU), while the radiodensity of air at STP is defined as -1000 HU.
<img src =  'https://pbrainmd.files.wordpress.com/2015/10/hounsfield-2.jpg' >


In [None]:
# Load the scan of the third patient folder and convert it to Hounsfield units.
test_patient_scans = load_scan(data_path + patients[2])
test_patient_images = get_pixels_hu(test_patient_scans)

# Show the first (up to) five slices. Each figure has three panels that all
# display the same unprocessed slice.
for imgs in range(len(test_patient_images[0:5])):
    f, axes = plt.subplots(1, 3, sharey=True, figsize=(15,15))
    ax1, ax2, ax3 = axes
    for ax in (ax1, ax2, ax3):
        ax.imshow(test_patient_images[imgs], cmap=plt.cm.bone)
        ax.set_title("Original Slice")
    plt.show()


# animated scan 

In [None]:
def set_lungwin(img, hu=[-1200., 600.]):
    """
    Map an HU image onto 8-bit grey values using an intensity window.

    Values at or below hu[0] become 0, values at or above hu[1] become 255,
    and values in between are scaled linearly and truncated to uint8.

    Parameters: img (NumPy array of HU values), hu ([lower, upper] window bounds)

    Returns: newimg (uint8 NumPy array with the same shape as img)
    """
    window = np.array(hu)
    lower, upper = window[0], window[1]
    # normalise to [0, 1] relative to the window, then saturate outside of it
    normalised = np.clip((img - lower) / (upper - lower), 0, 1)
    return (normalised * 255).astype('uint8')


# Load one fixed patient's scan, convert it to HU and window it to uint8 frames.
scans = load_scan('../input/osic-pulmonary-fibrosis-progression/train/ID00007637202177411956430/')
scan_array = set_lungwin(get_pixels_hu(scans))

# Write the slices as frames of an animated GIF and display the file in the notebook.
imageio.mimsave("/tmp/gif.gif", scan_array, duration=0.00001)
# NOTE(review): the file written above is a GIF while format='png' is passed here - confirm this is intentional.
Image(filename="/tmp/gif.gif", format='png')

In [None]:

# (rows, columns) of the training table
train_x.shape 

In [None]:
# (rows, columns) of the test table
test_x.shape 

# osic laplace function 

In [None]:
def eval_metric(FVC,FVC_Pred,sigma):
    """
    Per-sample Laplace log-likelihood style metric.

        sigma_clipped = max(sigma, 70)
        delta         = min(|FVC - FVC_Pred|, 1000)
        metric        = -sqrt(2) * delta / sigma_clipped - ln(sqrt(2) * sigma_clipped)

    Parameters: FVC (true values), FVC_Pred (predicted values),
                sigma (predicted uncertainty); scalars or array-likes that
                broadcast against each other

    Returns: NumPy array (or scalar) with one metric value per sample.
             The inputs are not modified.
    """
    FVC = np.asarray(FVC, dtype=float)
    FVC_Pred = np.asarray(FVC_Pred, dtype=float)
    sigma = np.asarray(sigma, dtype=float)

    sigma_clipped = np.maximum(sigma, 70)
    # The error is the absolute DIFFERENCE of truth and prediction.
    # np.abs(FVC, FVC_Pred) would instead treat FVC_Pred as the `out` buffer:
    # it returns |FVC| and overwrites the predictions.
    delta = np.minimum(np.abs(FVC - FVC_Pred), 1000)
    return -np.sqrt(2) * delta / sigma_clipped - np.log(np.sqrt(2) * sigma_clipped)

# data wrangling and processing for tabular data 

In [None]:
## CHECK SUBMISSION FORMAT
# The sample submission is the template for the rows that have to be predicted.
sub_df = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

print(f"The sample submission contains: {sub_df.shape[0]} rows and {sub_df.shape[1]} columns.")

In [None]:
# split Patient_Week Column and re-arrage columns
# Patient_Week is split on "_" into the two new columns Patient and Weeks
# (Weeks is still a string at this point).
sub_df[['Patient','Weeks']] = sub_df.Patient_Week.str.split("_",expand = True)
# Keep only these four columns; any other column of the sample submission is dropped here.
sub_df =  sub_df[['Patient','Weeks','Confidence', 'Patient_Week']]

In [None]:
# Attach the test.csv columns (except its own Weeks) to every submission row of the same Patient.
sub_df = sub_df.merge(test_x.drop('Weeks', axis = 1), on = "Patient")

In [None]:
# introduce a column to indicate the source (train/test) for the data
train_x['Source'] = 'train'
sub_df['Source'] = 'test'

# Stack the train rows on top of the submission rows. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# reset_index() keeps the previous row labels in a new 'index' column.
data_df = pd.concat([train_x, sub_df]).reset_index()
data_df.head()

The first big challenge is data wrangling: we could see that some patients took FVC measurements only after their baseline CT images, and some took measurements before that. So let's first find out what the actual baseline week and baseline FVC for each patient is.
We start with the baseline week:



In [None]:
def get_baseline_week(df):
    """
    Add the first measurement week per patient and the week offset from it.

    Parameters: df (DataFrame with 'Patient' and 'Weeks' columns; Weeks may be
                stored as strings)

    Returns: copy of df with 'Weeks' cast to int and two new columns:
             'min_week'       - smallest Weeks value of the row's Patient
             'baselined_week' - Weeks - min_week (0 on the patient's first week)
             The input frame is not modified.
    """
    # make a copy to not change original df
    _df = df.copy()
    # ensure all Weeks values are INT and not accidentaly saved as string
    _df['Weeks'] = _df['Weeks'].astype(int)
    # Rows are grouped by Patient only, so all rows of a patient (whatever
    # their Source) share one min_week. The former NaN pre-fill of min_week for
    # test rows was overwritten by this assignment and had no effect; it has
    # been removed.
    _df['min_week'] = _df.groupby('Patient')['Weeks'].transform('min')
    _df['baselined_week'] = _df['Weeks'] - _df['min_week']

    return _df

In [None]:
# Adds min_week and baselined_week to data_df (data_df is rebound to the returned copy).
data_df = get_baseline_week(data_df)
data_df.head()

What we can see here is that the patient with the ID ending in "430" had his first FVC measurement 4 weeks before the first (baseline) CT images were taken ("Weeks" column = -4). The patient then took the next FVC measurement 9 weeks later. In the next step we need to baseline the FVC values. Note that the BASELINE-FVC is not the minimum FVC, but the first measurement, meaning the measurement taken in the "min_week", i.e. where baselined_week = 0.

For getting the baselined FVC I first wrote the following straightforward function:

In [None]:
def get_baseline_FVC_old(df):
    """
    Loop-based reference implementation: add a 'base_FVC' column holding, for
    every row, the FVC of the first row of the same Patient whose Weeks equals
    min_week. Kept for the timing comparison with get_baseline_FVC.

    Parameters: df (DataFrame with 'Patient', 'Weeks', 'min_week' and 'FVC')

    Returns: copy of df with the additional 'base_FVC' column
    """
    # copy the DF to not in-place change the original one
    _df = df.copy()
    # get only the rows containing the baseline (= min_weeks) and therefore the baseline FVC
    baseline = _df.loc[_df.Weeks == _df.min_week]
    baseline = baseline[['Patient','FVC']].copy()
    baseline.columns = ['Patient','base_FVC']      
    
    # fill the df with the baseline FVC values
    # (one filtered look-up per row - this is the slow part)
    for idx in _df.index:
        patient_id = _df.at[idx,'Patient']
        _df.at[idx,'base_FVC'] = baseline.loc[baseline.Patient == patient_id, 'base_FVC'].iloc[0]
    # NOTE: the result of drop() is not assigned, so 'min_week' stays in _df
    _df.drop(['min_week'], axis = 1)
    
    return _df

This approach works fine, but as it contains a lot of look-ups, it's slow and didn't feel right.
Btw: there is an even worse approach: using `for row in df.iterrows()` is roughly 8 times slower than using `for idx in df.index`.
So I looked up how other people solved it and found a rough equivalent to the following function:

In [None]:
def get_baseline_FVC(df):
    """
    Vectorised version: add a 'base_FVC' column holding, for every row, the FVC
    of the first row of the same Patient whose Weeks equals min_week.

    Parameters: df (DataFrame with 'Patient', 'Weeks', 'min_week' and 'FVC')

    Returns: copy of df, left-merged with the per-patient 'base_FVC'.
             'min_week' is kept in the result (the earlier un-assigned drop()
             call never removed it, and the notebook lists it among the
             attributes to keep). The input frame is not modified.
    """
    _df = df.copy()

    # rows measured in the patient's first week carry the baseline FVC
    at_baseline = _df.loc[_df.Weeks == _df.min_week, ['Patient', 'FVC']]

    # exactly one baseline row per patient: keep the first one in row order
    base = (
        at_baseline
        .drop_duplicates(subset = 'Patient', keep = 'first')
        .rename(columns = {'FVC': 'base_FVC'})
    )

    # broadcast the baseline back onto every row of the patient
    return _df.merge(base, on = 'Patient', how = 'left')

The second approach is using transform, which is not as well known as apply, but faster for basic operations that do not involve multiple columns of a dataframe. Here is an interesting post about it for those who want to learn more: Apply vs transform.

I wanted to know how much this speeds up the processing, you can find the results in the following:

In [None]:
def old_baseline_FVC():
    """Timing target: run the loop-based baseline look-up on data_df."""
    return get_baseline_FVC_old(data_df)


def new_baseline_FVC():
    """Timing target: run the vectorised baseline look-up on data_df."""
    return get_baseline_FVC(data_df)


# timeit returns the total time of the three calls; the message reports the
# per-call average and the ratio of the two totals.
duration_old = timeit(old_baseline_FVC, number = 3)
duration_new = timeit(new_baseline_FVC, number = 3)

print(f"Taking the old, non-vectorized version took {duration_old / 3:.2f} sec, while the vectorized version only took {duration_new / 3:.3f} sec. That's {duration_old/duration_new:.0f} times faster!" )

In [None]:
# Adds base_FVC to data_df (data_df is rebound to the merged copy).
data_df = get_baseline_FVC(data_df)
data_df.head()

# Preparing the data for the Neural Network

In [None]:
from sklearn.preprocessing import OneHotEncoder , LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer

# Column groups: left untouched / numeric (to be scaled) / categorical (to be one-hot encoded)
no_transform_attribs = [
    'Patient',
    'Weeks',
    'min_week',
]
num_attribs = [
    'FVC',
    'Percent',
    'Age',
    'baselined_week',
    'base_FVC',
]
cat_attribs = [
    'Sex',
    'SmokingStatus',
]

In [None]:
def own_MinMaxColumnScaler(df, columns):
    """Min-max scale the given columns of df in place.

    For every name in `columns` a new column '<name>_scld' is added with
    X_scld = (X - X.min) / (X.max - X.min). The source columns are left
    unchanged and nothing is returned.
    """
    for col in columns:
        values = df[col]
        lowest = values.min()
        value_range = values.max() - lowest
        df[f"{col}_scld"] = (values - lowest) / value_range

In [None]:
def own_OneHotColumnCreator(df, columns):
    """One-hot encode categorical columns of df in place.

    For every column named in `columns`, one new 0/1 integer column is added
    per distinct non-missing value; the new column is named after the value
    itself. Nothing is returned.
    """
    # iterate the columns that were passed in, not a module-level list
    for col in columns:
        # missing values get no indicator column of their own
        for value in df[col].dropna().unique():
            df[value] = (df[col] == value).astype(int)

In [None]:
## APPLY DEFINED TRANSFORMATIONS
# Both helpers add their new columns to data_df in place and return nothing.
own_MinMaxColumnScaler(data_df, num_attribs)
own_OneHotColumnCreator(data_df, cat_attribs)

# peek at the rows that did not come from the training table
data_df[data_df.Source != "train"].head()

In [None]:
# get back original data split
# .copy() makes both frames independent of data_df, so columns assigned to
# them later are written to real frames instead of slices of data_df.
train_df = data_df.loc[data_df.Source == 'train'].copy()
sub = data_df.loc[data_df.Source == 'test'].copy()

Okay, so this approach (using our own implementation) is straightforward and needs little code. Downside: if you want to replace the min-max scaling with another scaling method (e.g. RobustScaler or StandardScaler), you need to implement it first.

# Model & Loss
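In this section we are going to define the loss & a first model. First we are taking care of the loss. The model predicts three quantiles of the FVC; with $\sigma$ defined as the upper minus the lower predicted quantile, we are trying to minimize the following:

$$\sigma_{clip} = \max(\sigma, 70), \qquad \Delta = \min\left(|FVC_{true} - FVC_{pred}|,\ 1000\right)$$

$$\text{score} = \frac{\sqrt{2}\,\Delta}{\sigma_{clip}} + \ln\left(\sqrt{2}\,\sigma_{clip}\right)$$

$$\text{qloss} = \operatorname{mean}\Big(\max\big(q\,e,\ (q-1)\,e\big)\Big), \qquad e = FVC_{true} - \hat{y}_q, \quad q \in \{0.2,\ 0.5,\ 0.8\}$$

$$\text{loss} = \lambda \cdot \text{qloss} + (1-\lambda) \cdot \text{score}$$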



In [None]:
######## CONFIG ########

## Features
# model inputs: the scaled numeric columns plus the one-hot indicator columns
features_list = ['baselined_week_scld', 'Percent_scld', 'Age_scld', 'base_FVC_scld', 'Male', 'Female', 'Ex-smoker', 'Never smoked', 'Currently smokes']

## Basics
EPOCHS = 1000
BATCH_SIZE = 128


## LOSS; set tradeoff btw. Pinball-loss and adding score
# weight of the pinball loss; the score term gets (1 - _lambda)
_lambda = 0.8 # 0.8 default


## Optimizers
# `learning_rate` is the supported keyword name; `lr` is only a deprecated alias.
ADAM = tf.keras.optimizers.Adam(learning_rate = 0.1,
                                beta_1 = 0.9,
                                beta_2 = 0.999,
                                decay = 0.01)
SGD = tf.keras.optimizers.SGD()

# choose ADAM or SGD
optimizer = ADAM


## To-DO: Implement Callbacks for Learning Rate Schedulers
# The values below only feed the schedule preview (test_the_scheduler);
# in the visible notebook they are not attached to training.

lr_start   = 0.0001
lr_max     = 0.0001 * BATCH_SIZE # higher batch size --> higher lr
lr_min     = 0.00001
lr_ramp_ep = EPOCHS * 0.3   # number of ramp-up epochs
lr_sus_ep  = 0              # number of epochs held at lr_max
lr_decay   = 0.992          # per-epoch decay factor after the plateau

def test_the_scheduler(epoch):
    """Learning rate for `epoch` under the ramp / sustain / decay schedule.

    Uses the module-level settings lr_start, lr_max, lr_min, lr_ramp_ep,
    lr_sus_ep and lr_decay:
      - linear ramp from lr_start towards lr_max during the first lr_ramp_ep epochs,
      - lr_max for the following lr_sus_ep epochs,
      - afterwards exponential decay towards lr_min.
    """
    if epoch < lr_ramp_ep:
        return (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start

    if epoch < lr_ramp_ep + lr_sus_ep:
        return lr_max

    return (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min

# Evaluate the schedule for every epoch and plot the resulting curve.
rng = list(range(EPOCHS))
y = [test_the_scheduler(epoch) for epoch in rng]
plt.plot(rng, y)
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(y[0], max(y), y[-1]))

In [None]:
# create constants for the loss function
# C1: lower bound for sigma, C2: upper bound for the absolute error
C1, C2 = tf.constant(70, dtype='float32'), tf.constant(1000, dtype="float32")

# define competition metric
def score(y_true, y_pred):
    """Calculate the competition metric as a quantity to be minimised.

    y_pred holds three columns (lower quantile, median, upper quantile):
    sigma is column 2 minus column 0 and the FVC prediction is column 1.
    y_true is read from its first column. The result is the mean of
    sqrt(2) * delta / sigma_clip + log(sqrt(2) * sigma_clip),
    with sigma clipped to at least C1 and delta to at most C2.
    """
    # tf.cast returns a new tensor, so the result has to be assigned;
    # without the assignment the casts have no effect.
    y_true = tf.dtypes.cast(y_true, tf.float32)
    y_pred = tf.dtypes.cast(y_pred, tf.float32)
    sigma = y_pred[:, 2] - y_pred[:, 0]
    fvc_pred = y_pred[:, 1]

    sigma_clip = tf.maximum(sigma, C1)
    delta = tf.abs(y_true[:, 0] - fvc_pred)
    delta = tf.minimum(delta, C2)
    sq2 = tf.sqrt( tf.dtypes.cast(2, dtype = tf.float32) )
    metric = (delta / sigma_clip) * sq2 + tf.math.log(sigma_clip * sq2)
    return K.mean(metric)

# define pinball loss
def qloss(y_true, y_pred):
    """Calculate the pinball (quantile) loss, averaged over all entries.

    The three columns of y_pred are treated as the 0.2, 0.5 and 0.8 quantile
    predictions; y_true is broadcast against them.
    """
    # IMPORTANT: define quantiles, feel free to change here!
    quantiles = tf.constant(np.array([[0.2, 0.50, 0.8]]), dtype = tf.float32)
    residual = y_true - y_pred
    # under-prediction is weighted with q, over-prediction with (1 - q)
    pinball = tf.maximum(quantiles * residual, (quantiles - 1) * residual)
    return K.mean(pinball)

# combine competition metric and pinball loss to a joint loss function
def mloss(_lambda):
    """Build the joint loss: _lambda * qloss + (1 - _lambda) * score.

    Returns a function loss(y_true, y_pred) that can be passed to model.compile.
    """
    def loss(y_true, y_pred):
        pinball_term = _lambda * qloss(y_true, y_pred)
        metric_term = (1 - _lambda) * score(y_true, y_pred)
        return pinball_term + metric_term
    return loss


## Neural Network Model
In this section we build an initial neural network. The code of this section is derived from Ulrich's great notebook, which also inspired me to change my loss to the version coded above. Please support the original notebook creators! The chosen quantiles (0.2 and 0.8) were simply found by testing; using 0.25 and 0.75 leads to worse results.

For the architecture: It's good practice to use numbers of units following the schema 2^x, with x element of N (= resulting in 1,2,4,8,16,32,64,128,..).
We are going to use dropout for regularization and not a too broad and deep network, as the training data is very limited.

In [None]:
def get_model():
    """Creates, compiles and returns a fresh quantile-regression network.

    Architecture: input of len(features_list) features -> Dense(128, relu) ->
    Dropout(0.25) -> Dense(128, relu) -> Dropout(0.2) -> two Dense(3, relu)
    heads p1 and p2. The output is p1 + cumsum(p2) along the output axis; as
    p2 is non-negative, each added increment is at least as large as the
    previous one.

    The model is compiled with the joint loss mloss(_lambda), the `score`
    metric and its OWN optimizer instance, rebuilt from the configuration of
    the module-level `optimizer`. Sharing one optimizer object between models
    would carry its step counter (and with it the learning-rate decay) from
    one model/fold into the next.
    """
    inp = Layers.Input((len(features_list),), name = "Patient")
    x = Layers.Dense(128, activation = "relu", name = "d1")(inp)
    x = Layers.Dropout(0.25)(x)
    x = Layers.Dense(128, activation = "relu", name = "d2")(x)
    x = Layers.Dropout(0.2)(x)
    # base values for the three outputs
    p1 = Layers.Dense(3, activation = "relu", name = "p1")(x)
    # quantile adjusting p1 predictions (non-negative increments)
    p2 = Layers.Dense(3, activation = "relu", name = "p2")(x)
    preds = Layers.Lambda(lambda x: x[0] + tf.cumsum(x[1], axis = 1),
                     name = "preds")([p1, p2])

    model = Models.Model(inp, preds, name = "NeuralNet")

    # same optimizer class and hyper-parameters, but fresh state for this model
    model_optimizer = type(optimizer).from_config(optimizer.get_config())
    model.compile(loss = mloss(_lambda), optimizer = model_optimizer, metrics = [score])

    return model

In [None]:
# create neural Network
# This instance is only used to print the architecture summary.
neuralNet = get_model()
neuralNet.summary()


In [None]:
## GET TRAINING DATA AND TARGET VALUE

# get target value
# NOTE: rebinds `y`, which the schedule-preview cell used for the learning-rate curve.
y = train_df['FVC'].values.astype(float)


# get training & test data
# feature matrices in the column order given by features_list
X_train = train_df[features_list].values
X_test = sub[features_list].values

# instantiate target arrays
# one row per sample, three columns for the three model outputs
train_preds = np.zeros((X_train.shape[0], 3))
test_preds = np.zeros((X_test.shape[0], 3))

In the following we want to create leak-free folds to get a robust cross-validation strategy for evaluating all our models and our training. The idea is to avoid having the same patient (= PatientID) in both the training and the validation data, as this might lead to an overly optimistic CV score for a model that simply memorizes the data of a particular PatientID which also occurs frequently in the validation data.

The idea on how to do that comes from @PAB97 Pierre's great notebook (CHECK IT OUT!). Please note that we still don't use proper stratification based on 'Age', 'Sex', 'SmokingStatus'.

In [None]:
## Non-Stratified GroupKFold-split (can be further enhanced with stratification!)
# K-fold variant with non-overlapping groups.
# The same group will not appear in two different folds: in this case we dont
# want to have overlapping patientIDs in TRAIN and VAL-Data!
# The folds are approximately balanced in the sense that the number of distinct
# groups is approximately the same in each fold.

NFOLDS = 10
gkf = GroupKFold(n_splits = NFOLDS)
# patient ID of every training row, used as the grouping key of the split
groups = train_df['Patient'].values

for count, (train_idx, val_idx) in enumerate(gkf.split(X_train, y, groups = groups), start = 1):
    print(f"FOLD {count}:")

    X_fold_train, y_fold_train = X_train[train_idx], y[train_idx]
    X_fold_val, y_fold_val = X_train[val_idx], y[val_idx]

    # create and fit a new model for this fold
    net = get_model()
    net.fit(X_fold_train, y_fold_train, batch_size = BATCH_SIZE, epochs = EPOCHS,
            validation_data = (X_fold_val, y_fold_val), verbose = 0)

    # evaluate on the fold's own training and validation part
    print("Train:", net.evaluate(X_fold_train, y_fold_train, verbose = 0, batch_size = BATCH_SIZE))
    print("Val:", net.evaluate(X_fold_val, y_fold_val, verbose = 0, batch_size = BATCH_SIZE))

    # out-of-fold predictions: every training row is predicted by the one
    # model that did not see it during fitting
    train_preds[val_idx] = net.predict(X_fold_val, batch_size = BATCH_SIZE, verbose = 0)

    # test predictions are averaged over the NFOLDS models
    print("Predicting Test...")
    test_preds += net.predict(X_test, batch_size = BATCH_SIZE, verbose = 0) / NFOLDS

In the next section we are going to use the train_preds to calculate the optimized sigma, which is a measure of certainty, or rather uncertainty. We can do that because we have both the model's estimate and the real data. We subtract the lower quantile from the upper quantile (as defined in the loss function) and average the result.

In [None]:
## FIND OPTIMIZED STANDARD-DEVIATION
# mean absolute error of the out-of-fold median prediction (column 1)
sigma_opt = mean_absolute_error(y, train_preds[:,1])
# per-row spread: upper output (column 2) minus lower output (column 0)
sigma_uncertain = train_preds[:,2] - train_preds[:,0]
sigma_mean = np.mean(sigma_uncertain)
# both values are only printed here; the visible cells below do not use them
print(sigma_opt, sigma_mean)

In [None]:
# first rows of the test/submission part of the data
sub.head()

In [None]:
## PREPARE SUBMISSION FILE WITH OUR PREDICTIONS
# assign() returns a new frame, so nothing is written into a slice of data_df
sub = sub.assign(
    FVC1 = test_preds[:, 1],                            # predicted median FVC
    Confidence1 = test_preds[:, 2] - test_preds[:, 0],  # upper minus lower quantile
)

# get rid of unused data and show some non-empty data
submission = sub[['Patient_Week','FVC','Confidence','FVC1','Confidence1']].copy()

# 'FVC' and 'Confidence' still hold the values taken over from test.csv and
# the sample submission. They are the columns that get written to
# submission.csv, so the model output has to be copied into them - otherwise
# the predictions never reach the file.
submission['FVC'] = submission['FVC1']
submission['Confidence'] = submission['Confidence1']

submission.loc[~submission.FVC1.isnull()].head(10)

In [None]:
# first rows of the submission frame
submission.head()

In [None]:
# summary statistics of the submission columns, transposed (one row per column)
submission.describe().T

In [None]:
org_test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

# For every measurement listed in test.csv, overwrite the matching
# Patient_Week row with the measured FVC and a Confidence of 70.
for i in range(len(org_test)):
    known_key = org_test.Patient[i] + '_' + str(org_test.Weeks[i])
    is_known_row = submission['Patient_Week'] == known_key
    submission.loc[is_known_row, 'FVC'] = org_test.FVC[i]
    submission.loc[is_known_row, 'Confidence'] = 70

In [None]:
# Write the three submission columns to submission.csv in the working directory, without the row index.
submission[["Patient_Week","FVC","Confidence"]].to_csv("submission.csv", index = False)
