In [None]:
# Online install, kept for reference only:
#!pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
#!pip install fastai==2.0.9

# Offline install from the wheels stored under /kaggle/input/fast-v2-offline.
# NOTE(review): the commented-out line above pins fastai 2.0.9 while the wheel
# installed below is fastai 2.0.8 — confirm which version is intended.
!pip install /kaggle/input/fast-v2-offline/torch-1.6.0-cp37-cp37m-manylinux1_x86_64.whl -q
!pip install /kaggle/input/fast-v2-offline/torchvision-0.7.0-cp37-cp37m-manylinux1_x86_64.whl -q
!pip install /kaggle/input/fast-v2-offline/dataclasses-0.6-py3-none-any.whl -q
!pip install /kaggle/input/fast-v2-offline/fastprogress-1.0.0-py3-none-any.whl -q
!pip install /kaggle/input/fast-v2-offline/fastcore-1.0.1-py3-none-any.whl -q
!pip install /kaggle/input/fast-v2-offline/fastai-2.0.8-py3-none-any.whl -q

In [None]:
#pip install fastai --upgrade

In [None]:
import os, random
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor

from fastai.tabular.all import *
from fastai.medical.imaging import *

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def seed_everything(seed):
    """Seed Python's `random` and NumPy's global RNG, and export PYTHONHASHSEED.

    seed: value passed to both generators; its string form is written to
    os.environ['PYTHONHASHSEED'].
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(123)

In [None]:
# Competition data folder, given relative to the notebook's working directory.
path = Path("../input/osic-pulmonary-fibrosis-progression")
path.ls()  # show what the folder contains

# What Are We Working To Predict?
We will be predicting, given an initial FVC measurement and the CT scan images for a patient, what the final 3 FVC measurements are for the patient. Since that initial FVC could be any week, we must create predictions for Weeks of value [-12, 133]. Our final score for this model will be the accuracy of the final 3 week predictions given by the Laplace Log Likelihood.

Let's explore our data first.

In [None]:
# Load the three competition CSVs from `path` and preview the training rows.
train_df = pd.read_csv(path/'train.csv')
test_df = pd.read_csv(path/'test.csv')
submission_df = pd.read_csv(path/'sample_submission.csv')
train_df.head()

In [None]:
# Summary of the training frame's columns.
train_df.info()

In [None]:
# Build a pandas_profiling report over the raw training frame (rendered in the next cell).
profile = ProfileReport(train_df, title = "Profiling Report")

In [None]:
# Render the profiling report inline in the notebook.
profile.to_notebook_iframe()

In [None]:
# Summary statistics of the raw training frame.
train_df.describe()

# Data Cleaning
## Duplicates

First, let's identify if we have duplicates in our training set and correct for them, before anything else:

In [None]:
# keep=False flags every row of a repeated (Patient, Weeks) pair, not just the later ones.
is_repeated = train_df.duplicated(subset=['Patient', 'Weeks'], keep=False)
duplicates = train_df.loc[is_repeated]
duplicates

Since we don't have much data to start with, let's average the FVC and Percent values and keep the rest intact.

In [None]:
# Collapse repeated (Patient, Weeks) measurements into a single row each.
# FVC, Percent and Age are averaged; Sex and SmokingStatus take the group's first value.
# 'first' always yields a scalar, whereas a lambda returning pd.unique(...) yields an
# array, which is only usable when the group holds exactly one distinct value.
train_df = train_df.groupby(['Patient', 'Weeks']).agg({
    'FVC': 'mean',
    'Percent': 'mean',
    'Age': 'mean',
    'Sex': 'first',
    'SmokingStatus': 'first'}).reset_index()
train_df

In [None]:
# Re-run the duplicate check on the aggregated frame; the result should be empty.
duplicates = train_df[train_df.duplicated(subset = ['Patient', 'Weeks'], keep = False)]
duplicates #none please!

## Outliers
Through data exploration (and evaluating largest losses in our models), we can identify some data points that don't look particularly correct. By removing these, our hope is our model is more robust.

In [None]:
# Show every row for the patient that is removed in the next cell.
train_df[train_df['Patient'] == 'ID00298637202280361773446']

In [None]:
# Remove all rows for this patient, then re-query to confirm nothing is left.
train_df = train_df[train_df['Patient'] != 'ID00298637202280361773446']
train_df[train_df['Patient'] == 'ID00298637202280361773446']

In [None]:
# Patient ID -> Weeks value of the single measurement to discard for that patient.
outliers = {'ID00093637202205278167493' : 16,
           'ID00207637202252526380974' : 33,
           'ID00061637202188184085559' : 24,
           'ID00319637202283897208687' : 16,
           'ID00355637202295106567614' : 28}
# Preview the first listed patient before its outlier row is removed.
train_df[train_df['Patient'] == 'ID00093637202205278167493']

In [None]:
# Flag every (Patient, Weeks) pair listed in `outliers`, then drop them in one pass.
outlier_rows = pd.Series(False, index=train_df.index)
for patient_id, week in outliers.items():
    outlier_rows |= (train_df['Patient'] == patient_id) & (train_df['Weeks'] == week)
# .copy() makes the filtered result an independent frame, so adding columns to
# train_df later does not trigger pandas' SettingWithCopyWarning.
train_df = train_df[~outlier_rows].copy()
train_df[train_df['Patient'] == 'ID00093637202205278167493'] #Week 16 is gone

# Initial EDA
## Groups
Our initial intuition with this problem is to see if we can identify groups of individuals that may trend differently according to the tabular data provided. Let's do some initial EDA to preliminarily identify any trends for FVC over time with regard to sex, age, and smoking status. We will analyze both FVC and FVC Percent over time.

Questions to answer:

How do FVC and FVC Percent change over time? And how does this differ for each major group?

In [None]:
# FVC against Weeks, coloured by Sex; the figure is resized to 10x7 inches.
sns.lmplot(x="Weeks", y="FVC", hue="Sex", data = train_df).fig.set_size_inches(10,7)

Males have a much higher FVC - however, the general trend actually shows Males decreasing over time while Females increase over time. Note there are many more Males than Females. What if instead of FVC, we looked at Percent?

In [None]:
# Same plot with Percent on the y-axis instead of FVC.
sns.lmplot(x="Weeks", y="Percent", hue="Sex", data = train_df).fig.set_size_inches(10,7)

When we use Percent as opposed to FVC, the male vs female absolute differential goes away. This is expected as FVC Percent is a representation of the % of FVC for a 'typical' patient, given some unknown demographic and health factors (believe it is sex, height, weight?). However, the small trend of males decreasing more than females remains.

It may be useful to look at how a few individual patients trend over time.

In [None]:
# Percent against Weeks, coloured by Patient, for the first 100 rows only.
sns.lmplot(x="Weeks", y="Percent", hue="Patient", data = train_df.head(100)).fig.set_size_inches(15,7)

In [None]:
# Percent against Weeks, coloured by Patient, for the last 95 rows only.
sns.lmplot(x="Weeks", y="Percent", hue="Patient", data = train_df.tail(95)).fig.set_size_inches(15,7)

We're seeing general declines on the whole, but nothing huge. What about Age?

In [None]:
# Percent against Weeks with one hue level per distinct Age value.
sns.lmplot(x="Weeks", y="Percent", hue="Age", data = train_df).fig.set_size_inches(15,12)

Let's inspect a "bottom-level" group, and ascertain if they are similar to one another.

In [None]:
# One narrow demographic slice: 66-year-old male ex-smokers, plotted per patient.
tr_Group = train_df[(train_df['Age'] == 66) & (train_df['Sex'] == "Male") & (train_df['SmokingStatus'] == "Ex-smoker")]
sns.lmplot(x="Weeks", y="Percent", hue="Patient", data = tr_Group).fig.set_size_inches(15,7)

Even within our (small) example group where Age, Sex, and Smoking Status are the same, we have wildly different Percent intercepts for Weeks_Adj between 0 and 10, from 90s to high 40s. Therefore the intercept of the initial FVC will be extremely important. What we really hope to learn is how that may change as the disease progresses given the data we have.

# Data Wrangling and Transformations
## Combining Datasets
First we'll combine the train and test sets such that our transformations are applied the same to both train and test data.

In [None]:
# Recover separate Patient and Weeks columns from the combined 'Patient_Week' key.
# The split produces strings, so Weeks is still text at this point.
submission_df[['Patient','Weeks']] = submission_df['Patient_Week'].str.split("_",expand = True) #split Patient_Week
# Drop the FVC column that came with the sample submission, then attach each
# patient's test.csv columns (all except the test row's own Weeks) to every submission row.
submission_df = submission_df.drop('FVC', axis=1)
submission_df = submission_df.merge(test_df.drop(['Weeks'], axis = 1), on = "Patient")

#introduce a column to indicate the source (train/test) for the data
train_df['source'] = 'train'
test_df['source'] = 'test'
submission_df['source'] = 'submission'

submission_df

In [None]:
# Stack the train, test and submission rows into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append is deprecated and removed in pandas 2.0;
# ignore_index=True replaces the reset_index / drop('index') pair.
data_df = pd.concat([train_df, test_df, submission_df], ignore_index=True)
data_df

And for a single patient from Test:

In [None]:
# All combined rows (whatever their source) for one patient ID.
data_df[data_df['Patient'] == 'ID00419637202311204720264']

## Feature Engineering
### Weeks 
Let's add the minimum week for each Patient as a feature as well as a "Baseline" Week feature, which is Week relative to this first week. We need to do this carefully so it makes sense for both the training data and the submission data, which includes ALL weeks for the test Patients.



In [None]:
# Cast Weeks to int for the whole frame (the submission rows' Weeks came from a string split).
data_df['Weeks'] = data_df['Weeks'].astype(int)
data_df['min_week'] = data_df['Weeks'] #first create column
data_df.loc[data_df['source']=='submission','min_week'] = np.nan #next get rid of submission's "false" weeks
# The blanked submission rows are filled by the per-patient minimum computed here.
data_df['min_week'] = data_df.groupby('Patient')['min_week'].transform('min') #now we can groupby Patient and use min of min_week

data_df['baseline_week'] = data_df['Weeks'] - data_df['min_week'] #add column that represents offset from min Week

data_df

As a dataset with a time component, it is very important to ensure our time measurements are in good shape for prediction. In this case we don't have a proper time feature that represents the progression of the dependent variable (FVC, a proxy for progression of the disease); instead we have Week relative to CT scan. As they are relative to the CT scan, they can be negative, meaning that an FVC test was taken some weeks before the CT scan.

However, week relative to CT scan does not matter. What does matter is progression of the disease relative to each patient (the assumption being made is the initial FVC measurement is somewhere near the initial onset and subsequent diagnosis of the disease, while timing of CT scan dependent on other factors like availability/scheduling). To adjust for this, we will create a new feature called Weeks_Adj to adjust the Weeks field so the progression is more comparable. 

The Weeks_Adj will alter negative weeks so as to have the first FVC measure Weeks_Adj = 0 (equivalent to Week 0), and offset all further Week_Adj data points for the patient. Then, for all other patients, their CT Scan is already at Week 0, so no adjustment is needed, assuming that a CT Scan would signal that the disease is present and a scan is needed in order to diagnose. We assume that if they had the CT scan of their chest, it was related to degradation of lung function, and for whatever reason early FVC measurements were not taken or available.

The hope is that the Week_Adj is now more comparable on a patient-by-patient basis and more predictive of the specific future Weeks we are asked to predict than Weeks relative to CT scan.

In [None]:
# weeks_adj: where the patient's first measurement precedes week 0 (min_week < 0),
# use the offset from that first measurement (baseline_week); otherwise keep Weeks.
# np.where evaluates the rule column-wise instead of looping over rows in Python.
data_df = data_df.assign(weeks_adj=np.where(data_df['min_week'] < 0,
                                            data_df['baseline_week'],
                                            data_df['Weeks']))
data_df

Note how these fields will appear for our submission data:

In [None]:
# Only the rows that originated from the sample submission.
data_df[data_df['source'] == 'submission']

We can quickly confirm that weeks_adj differs from Weeks only when min_week is less than 0:

In [None]:
# Rows whose adjusted week differs from the raw week.
weeksadj_df = data_df[data_df['Weeks'] != data_df['weeks_adj']] #all data points where Weeks is changed
weeksadj_df

In [None]:
# Expected to be empty: no adjusted row should belong to a patient with min_week >= 0.
weeksadj_df[weeksadj_df['min_week'] > -1] #confirming it is only patients whose min_week is negative

### FVC Related Features
Now let's create features for Initial FVC and Initial Percent for each row in our data. We'll use our new Weeks features to make this fairly simple to do!

In [None]:
#get initial % and FVC for each patient as features for every row
for field in ['FVC', 'Percent']:
    baseline_df = data_df[data_df['baseline_week'] == 0].groupby('Patient')[field].mean().to_frame()
    data_df = data_df.merge(baseline_df, on = 'Patient')
    data_df = data_df.rename(columns={field+"_x": field, field+"_y": "initial_"+field.lower()})
data_df

In [None]:
#you can run these lines to check that for Submission rows initial fvc and initial percent are correct (the resulting dfs are empty)
#data_df[(data_df['source'] == 'submission') & (data_df['FVC'] != data_df['initial_fvc'])]
#data_df[(data_df['source'] == 'submission') & (data_df['Percent'] != data_df['initial_percent'])]

Since we are given Percent, we can also assume that there is a "normal" FVC for each patient. Not all factors that determine it are provided in the dataset, but I believe age, sex, and height are the major factors. We can confirm by calculating this feature:

In [None]:
# "Normal" (100 %) FVC implied by each patient's baseline measurement.
# It is derived from initial_fvc / initial_percent rather than the row's own
# FVC / Percent: FVC is the training target and Percent is not available for the
# weeks we must predict, so the per-row ratio cannot be reproduced at prediction time.
data_df['fvc_norm'] = data_df.initial_fvc / data_df.initial_percent * 100
data_df[(data_df['baseline_week'] == 0) & (data_df['source'] == 'train')].tail()

Indeed we can see Patients with similar Sex and Age can have very different normal FVC levels!

In [None]:
# Display the combined frame with the engineered columns.
data_df

### One-Hot Encoding
Next, we'll one-hot encode our categorical variables - there are just a few, each with only a couple of categories.

In [None]:
# Replace Sex and SmokingStatus with indicator columns; drop_first=True omits
# the first level of each variable.
data_df = pd.get_dummies(data_df, columns=['Sex', 'SmokingStatus'], drop_first=True)
data_df

In [None]:
# Summary statistics of the combined, encoded frame.
data_df.describe()

Our data is looking good!

## Split Data into Train and Test
Now that our transformations are done, let's split our data for training. Our Source feature makes this quite simple! We also should reset the index of our train_df so the indices match up, as we'll need to use the indices for modeling.

In [None]:
# Split the combined frame back into its three origins using the 'source' tag.
origin = data_df['source']
test_df = data_df.loc[origin == 'test'].copy()
submission_df = data_df.loc[origin == 'submission'].copy()

# Training rows get a fresh 0..n-1 index so positions and labels coincide.
train_df = data_df.loc[origin == 'train'].copy().reset_index(drop=True)
train_df.describe()

Note min_week - has an average of 14, which means on average the first FVC intake is 14 weeks after the CT scan, with a std of 15. Max is Week 79 after the CT scan. Fairly interesting! Let's look at things a little deeper...

# Data Analysis
## Final 3 FVC
Remember that for scoring, what matters is the final 3 measurements. For the training data, we know what these weeks are, but for the test data we don't, so we can't train specifically to predict that - and thus, must submit predictions for all possible Weeks. Regardless, let's create a feature in our training DataFrame for analysis purposes that marks whether the row is a final 3 FVC measurement or not. We may be able to gain some intuition to see if there is anything we can do to train a model that would predict these better.

In [None]:
# A row is flagged when the row three positions further down belongs to a different
# patient (or does not exist).
# NOTE(review): this assumes each patient's rows are contiguous and ordered by Weeks — confirm.
train_df['final_3'] = train_df['Patient'] != train_df['Patient'].shift(-3)
train_df.head(25)

What do those final 3 look like? First we could look at the distribution of final 3 FVC measurements over FVC itself:

In [None]:
# Overlaid FVC histograms: final-3 rows versus all other rows.
bins = range(800, 6800, 500)
in_final3 = train_df['final_3'] == True
train_df.loc[in_final3, 'FVC'].plot.hist(bins=bins, alpha=0.5, label='Final 3')
train_df.loc[train_df['final_3'] == False, 'FVC'].plot.hist(bins=bins, alpha=0.4, label='not Final 3')
plt.legend(prop={'size': 12})
plt.title('Distribution of Final 3 FVC Measurements Over FVC')
plt.xlabel('FVC')
plt.ylabel('Frequency')

It seems that the Final 3 measurements are more likely to be distributed in FVC < 2000, as we may expect, but not hugely so. The other brackets above 2000 show >75% of the measurements outside of the final 3, but still a good amount in FVCs above 3500+ compared to total measurements.

What about Final 3 in relation to Percent?

In [None]:
# Overlaid Percent histograms: final-3 rows versus all other rows.
bins = range(20, 100, 5)
in_final3 = train_df['final_3'] == True
train_df.loc[in_final3, 'Percent'].plot.hist(bins=bins, alpha=0.5, label='Final 3')
train_df.loc[train_df['final_3'] == False, 'Percent'].plot.hist(bins=bins, alpha=0.4, label='not Final 3')
plt.legend(prop={'size': 12})
plt.title('Distribution of Final 3 FVC Measurements Over Percent')
plt.xlabel('Percent')
plt.ylabel('Frequency')

It seems the Final 3 account for a fair share - roughly 25-40% - of the measurements where FVC Percent is above 50%. Below that, though, the Final 3 make up 75%+ of the FVC measurements where FVC Percent is less than 50%.

Now, what about Weeks?

In [None]:
# Overlaid Weeks histograms: final-3 rows versus all other rows.
bins = range(-10,140,5)
in_final3 = train_df['final_3'] == True
train_df.loc[in_final3, 'Weeks'].plot.hist(bins=bins, alpha=0.5, label='Final 3')
train_df.loc[train_df['final_3'] == False, 'Weeks'].plot.hist(bins=bins, alpha=0.5, label='not Final 3')
plt.legend(prop={'size': 12})
plt.title('Distribution of Final 3 FVC Measurements Over Weeks')
plt.xlabel('Weeks')
plt.ylabel('Frequency')

Somewhat surprisingly, we have most of our final 3 FVC measurements in the 25-70 weeks range. In fact these are not necessarily backloaded in our Weeks distribution, at least not as much as I would have thought, with the largest bins being 30, 40, and 55 Weeks.

This graph illustrates a key point. The majority of our training data for our model is helping it learn about early weeks which may not be very helpful to our model. Instead, we would prefer the model learn accordingly with the blue distribution - we want a model that could predict Weeks 25+ very well, at the expense of not being able to predict Weeks 0-25 well whatsoever, let's say.

With this more stark split between Final 3 and not, we could possibly tune our model to predict certain Weeks better than others as a proxy for Final 3 performance.

In [None]:
#for i in range(40, 58):
  #  cond_train_df = train_df[(train_df['Weeks'] > i)]
   # final_3_rows = cond_train_df[cond_train_df['final_3'] == True].shape[0]
   # total_rows = cond_train_df.shape[0] 
   # print(f'Weeks above {i}, have {round(final_3_rows/total_rows*100,2)}% of their data in Final 3 FVC, or {final_3_rows} \
#out of {total_rows} total rows.')

# Modeling
## Prep Training DataFrame
First we need to do any last adjustments to our training data before we feed into the Dataloaders. We have some NaN columns that we can simply remove from training.

In [None]:
# Remove the two columns that only came from the sample submission file.
train_df = train_df.drop(['Patient_Week', 'Confidence'], axis=1)
train_df


## Validation Set and Features Lists
Now let's decide on our validation set. This is an extremely important part of the modeling process!

We need to select a subset of training data that includes rows for a few specific patients so the model will be validated against patients it has never seen before. This mirrors the test set - new patients the model has never seen before. 

Next - how much of our small dataset should we set aside for validation?

In [None]:
# Hold out whole patients (not individual rows) for validation.
valid_set_patient_num = 30
held_out_patients = np.random.choice(train_df.Patient.unique(), valid_set_patient_num, replace=False)
cond = train_df['Patient'].isin(held_out_patients)

# Positional row numbers of each side of the split.
train_idx = np.where(~cond)[0]
valid_idx = np.where(cond)[0]
splits = (list(train_idx), list(valid_idx))
train_idx.shape, valid_idx.shape

And with our list of validation indices, we prepare the indices in the dataframe for our fastai TabularPandas class, which creates new training and validation objects for us to work with:

In [None]:
# The "categorical" inputs are the indicator columns created by get_dummies.
cat_features = ['Sex_Male', 'SmokingStatus_Ex-smoker', 'SmokingStatus_Never smoked']
cont_features = ['Weeks','Age', 'min_week', 'baseline_week', 'weeks_adj', 'initial_percent', 'initial_fvc', 'fvc_norm']
# NOTE(review): Categorify presumably re-encodes the cat_features columns inside
# tab_obj, so any other frame passed to the model later would need the same
# processing — confirm.
procs = Categorify
tab_obj = TabularPandas(train_df, procs, cat_features, cont_features, y_names='FVC', splits=splits)
len(tab_obj.train), len(tab_obj.valid)

Note these are the only features we'll have at test time. We won't have Percent, or final_3, so we can't include them in the model.



In [None]:
# Preview 10 rows of the tabular object.
tab_obj.show(10)

In [None]:
# Underlying frame held by the tabular object.
tab_obj.items.head(10) #all columns from train_df are still here!

## First Model - Random Forest

Now let's create our Random Forest Function. We'll evaluate using root mean squared error at first, just to get a sense for things.

In [None]:
def create_rf(xs, y, n_estimators=200, max_features=0.7, min_samples_leaf=7, **kwargs):
    """Fit and return a RandomForestRegressor on (xs, y).

    n_estimators, max_features and min_samples_leaf are passed straight to the
    estimator. Any extra keyword arguments are forwarded to the
    RandomForestRegressor constructor as well (previously they were accepted
    but silently ignored) and may override the defaults n_jobs=-1 and
    oob_score=True.
    """
    rf_params = {'n_jobs': -1, 'oob_score': True}
    rf_params.update(kwargs)
    return RandomForestRegressor(n_estimators=n_estimators, max_features=max_features,
                                 min_samples_leaf=min_samples_leaf, **rf_params).fit(xs, y)

def rmse(pred, y):
    """Root mean squared error between pred and y, rounded to 6 decimals."""
    squared_error = (pred - y) ** 2
    return round(math.sqrt(squared_error.mean()), 6)


def model_rmse(m, xs, y):
    """RMSE of m.predict(xs) against y."""
    predictions = m.predict(xs)
    return rmse(predictions, y)

We can reassign the train, valid x and y columns easily accessible via Fastai's tabular object into easy to access variables...

In [None]:
# Feature frames and target series for the training and validation splits.
xs, y = tab_obj.train.xs, tab_obj.train.y
valid_xs, valid_y = tab_obj.valid.xs, tab_obj.valid.y

Create a forest and test it on RMSE!

In [None]:
# Hyper-parameters for the first forest.
n_trees = 300
max_features=.7
min_samples_leaf=7
m = create_rf(xs, y, n_trees, max_features, min_samples_leaf)
# RMSE on the training split, on the validation split, and of the out-of-bag predictions.
model_rmse(m, xs, y), model_rmse(m, valid_xs, valid_y), rmse(m.oob_prediction_, y)

We can use the m.estimators_ attribute of our model to access each individual tree and stack their predictions into a matrix called preds, where each row holds one tree's predictions and each column corresponds to one validation row. Averaging over the trees (the mean of each column) gives the forest's prediction, so scoring that average against the validation targets gives the same RMSE as before!

In [None]:
# One row per tree, one column per validation row.
# The predictions are collected in a list because np.stack expects a real sequence;
# passing a bare generator is deprecated in NumPy and rejected by recent releases.
preds = np.stack([t.predict(valid_xs) for t in m.estimators_])
pd.DataFrame(data=preds)

In [None]:
# RMSE of the across-tree average prediction against the validation targets.
rmse(preds.mean(0), valid_y)

## Evaluation with Laplace Log Likelihood
Now let's define the competition metric to use as a score: Laplace Log Likelihood. This metric will be negative, and a higher score is better. A perfect model would score about -4.6.

In [None]:
def LaplaceLogLikelihood(pred, y, sigma):
    """Mean Laplace log likelihood of predictions `pred` for targets `y`.

    sigma is the confidence (scalar or array-like); values below 70 are raised
    to 70, and absolute errors above 1000 are capped at 1000. Higher (closer
    to zero) is better.
    """
    root2 = math.sqrt(2.)
    abs_error = np.minimum(np.absolute(y - pred), 1000.)
    sigma_floor = np.maximum(sigma, 70)
    per_row = ((abs_error / sigma_floor) * -root2) - np.log(root2 * sigma_floor)
    return np.mean(per_row)

But remember, our metric is only applied to the final 3 FVC measurements. We'll need to add functionality to score ourselves on only these data points. With our final_3 feature we created earlier, we easily access the indices of the final 3 FVC measurements, which we can use for scoring.

In [None]:
# Positional row numbers of the final-3 rows in train_df.
final3_idx = np.where(train_df['final_3'] == True)[0]
# Validation targets whose index label appears among those row numbers.
valid_y_final3 = valid_y.loc[valid_y.index.intersection(final3_idx)]
final3_idx.shape, valid_y_final3.shape

We create a function that we can pass our model, xs, y, and sigma, and will calculate the LLL score for only the xs and ys that appear in the final 3 FVC measurements.

In [None]:
def model_score(m, xs, y, sigma):
    """Laplace log likelihood of model `m`, restricted to final-3 rows.

    Only rows of xs / y whose index label appears in the module-level
    `final3_idx` are scored. sigma may be a scalar or one value per row of xs.
    """
    # Give sigma the same labels as xs so it can be filtered the same way.
    sigma_by_row = pd.Series(data=sigma, index=xs.index)

    kept_rows = xs.index.intersection(final3_idx)  # final-3 rows present in xs
    final3_pred = m.predict(xs.loc[kept_rows])
    final3_true = y.loc[y.index.intersection(final3_idx)]
    final3_sigma = sigma_by_row.loc[kept_rows]

    return LaplaceLogLikelihood(final3_pred, final3_true, final3_sigma)

For this scoring, we also need a confidence measure, which is related to standard deviation. We can use the preds array we used before to calculate the standard deviation of each prediction across all trees, and use this as a measure of confidence.

In [None]:
# Spread of the individual trees' predictions for each validation row (std over axis 0).
preds_std = preds.std(axis=0)
preds_std.shape

In [None]:
# First 25 per-row standard deviations.
preds_std[:25]

Let's try it out!

In [None]:
# Training score with a constant sigma of 231, validation score with the per-row tree spread.
# NOTE(review): the origin of the constant 231 is not shown in this notebook — confirm.
model_score(m, xs, y, 231), model_score(m, valid_xs, valid_y, preds_std)

And we have a LLL Score for our RF Model! Compared to the variance between trees, what's the best a constant Confidence could give us?

In [None]:
def constant_sigma_choice(m, xs, y):
    """Search constant sigmas 71, 76, ..., 496 and return (best score, best sigma).

    Starts from the fallback (-100., 70), which is returned unchanged if no
    candidate scores above -100. Ties keep the earliest candidate.
    """
    best_score, best_sigma = -100., 70
    for candidate in range(71, 500, 5):
        candidate_score = model_score(m, xs, y, candidate)
        if candidate_score > best_score:
            best_score, best_sigma = candidate_score, candidate
    return best_score, best_sigma
# Best constant sigma for the current model, searched on the validation split.
score, sigma = constant_sigma_choice(m, valid_xs, valid_y)
score, sigma

In [None]:
# Validation score using the constant sigma selected above.
model_score(m, valid_xs, valid_y, sigma)

## Final Model

In [None]:
# Hyper-parameters for the final forest.
n_trees = 1000
max_features= .6
min_samples_leaf= 3
m = create_rf(xs, y, n_trees, max_features, min_samples_leaf)
#get new preds matrix for std
# A list is passed because np.stack expects a real sequence; a bare generator is
# deprecated in NumPy and rejected by recent releases.
preds = np.stack([t.predict(valid_xs) for t in m.estimators_])
preds_std = preds.std(axis=0)
# Scores: train with constant sigma 231, validation with per-row tree spread,
# validation with the best constant sigma found on the validation split.
[model_score(m, xs, y, 231), 
 model_score(m, valid_xs, valid_y, preds_std), 
 model_score(m, valid_xs, valid_y, constant_sigma_choice(m, valid_xs, valid_y)[1])]

In [None]:
# Validation features next to predicted and true FVC, all on a fresh 0..n-1 index.
eval_df = valid_xs.copy().reset_index(drop=True)
preds = pd.Series(m.predict(valid_xs))
eval_df['FVC_pred'] = preds
eval_df['FVC_true'] = valid_y.reset_index(drop=True)
eval_df

In [None]:
# Absolute prediction error for each validation row.
eval_df['FVC_error'] = np.absolute(eval_df['FVC_true'] - eval_df['FVC_pred'])
# Average every column within each initial_percent group, drop the repeated rows
# this produces, and show the 25 groups with the largest mean error.
eval_df.groupby('initial_percent').transform('mean').drop_duplicates().sort_values(by=['FVC_error'], ascending=False)[0:25]
#eval_df[eval_df['initial_fvc'] == 1690]

In [None]:
# Training rows whose initial_fvc equals 6399.
train_df[train_df['initial_fvc'] == (6399)]

# Feature Importance

In [None]:
def rf_feat_importance(m, df):
    """Return a frame of df's column names ('cols') and m.feature_importances_ ('imp'),
    sorted by importance, largest first."""
    importance_table = pd.DataFrame({'cols': df.columns, 'imp': m.feature_importances_})
    return importance_table.sort_values('imp', ascending=False)

In [None]:
# Feature importances of the final forest, labelled with the training columns.
fi = rf_feat_importance(m, xs)
fi

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from sklearn.inspection import plot_partial_dependence

fig,ax = plt.subplots(figsize=(12, 4))
# Partial dependence of the forest on Sex_Male and Age, evaluated on the validation features.
plot_partial_dependence(m, valid_xs, ['Sex_Male', 'Age'],
                        grid_resolution=20, ax=ax);

# Submission
Let's use our model to predict the FVC on the submission dataframe and submit!

In [None]:
# Run the submission rows through the same fastai preprocessing as the training data.
# The forest was fitted on tab_obj's processed features, where Categorify has
# re-encoded the categorical columns; the raw indicator values in submission_df use
# a different coding, so feeding them directly would mis-route every categorical split.
# A copy is processed so submission_df itself is not altered by the processing step.
submit_tab = tab_obj.new(submission_df.copy())
submit_tab.process()
xs_submit = submit_tab.xs  # cat_features + cont_features, encoded like the training xs
submission_df['FVC'] = m.predict(xs_submit)
#use standard deviations of each Tree as confidence
# (list, not generator: np.stack expects a real sequence)
preds_submit = np.stack([t.predict(xs_submit) for t in m.estimators_])
submission_df['Confidence'] = preds_submit.std(axis=0)

In [None]:
# Keep only the three columns written to the submission file.
submission_df = submission_df[['Patient_Week', 'FVC', 'Confidence']]
submission_df

In [None]:
# Write the submission file without the index column, then summarise what was written.
submission_df.to_csv('submission.csv', index=False)
submission_df.describe()