In [None]:
import numpy as np
import pandas as pd 
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import tensorflow_probability as tfp
import seaborn as sns
import pydicom
import os
import re
import time
import gzip
from tqdm import tqdm
import seaborn as sns
import sklearn.preprocessing
tfd = tfp.distributions
tfb = tfp. bijectors
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import ct_scan_processing2_module as ct
import osic_utils
from tqdm import tqdm_notebook as tqdm

## Determine patient IDs from the training set

In [None]:
# Load the pre-computed per-patient lung statistics and plot the distribution
# of every feature column (everything except the patient identifier).
lung_stats = pd.read_csv('../usr/lib/ct_scan_processing2/lung_statistics.csv')
cols = lung_stats.columns[lung_stats.columns != 'PatientId']

# Two panels per row; the number of rows follows the number of features so
# that no feature is dropped and no plotted panel is deleted by mistake.
col_ = 2
row = max(1, -(-len(cols) // col_))  # ceil(len(cols) / col_) without importing math
fig, ax = plt.subplots(row, col_, figsize = (15, 3 * row), squeeze = False)
plt.subplots_adjust(bottom = -0.4)

# Row-major flattening reproduces the (i // col_, i % col_) panel layout.
panels = ax.ravel()
for panel, col in zip(panels, cols):
    sns.distplot(lung_stats[col], ax = panel)
    panel.set_xlabel('')
    panel.set_title(col)

# Remove only the panels that received no feature.
for unused_panel in panels[len(cols):]:
    fig.delaxes(unused_panel)


In [None]:
# Summary statistics for the numeric columns of the lung-statistics table.
lung_stats.describe()

In [None]:
root_dir = "/kaggle/input/osic-pulmonary-fibrosis-progression/train/"
remove_ids = []  # ids listed here are left out of the patient dictionary

# First pass: one entry per walked directory whose path contains 'ID',
# keyed by the path relative to root_dir.
patient_dcm_dict = {}
for dirname, _, filenames in os.walk(root_dir):
    if 'ID' not in dirname:
        continue
    patient_key = dirname.replace(root_dir, "")
    if patient_key in remove_ids:
        continue
    patient_dcm_dict[patient_key] = filenames

# Second pass: replace the file lists with a dense integer index assigned in
# sorted-id order, so the dictionary maps patient id -> row index.
ordered_ids = sorted(patient_dcm_dict.keys())
patient_dcm_dict = dict(zip(ordered_ids, range(len(ordered_ids))))
print("A total of", len(patient_dcm_dict.keys()), "patients")

In [None]:
# Preview the first five patient-id -> index pairs.
# (The dictionary was already sorted and indexed in the previous cell, so it
# is not rebuilt here.)
dict(list(patient_dcm_dict.items())[:5])

Reorder the rows of the lung statistics table so that they follow the patient order (and therefore the integer indices) of `patient_dcm_dict`.

In [None]:
# Row k of the result belongs to the patient whose index in patient_dcm_dict
# is k, which is what the positional gather on patient indices relies on.
# lung_vol is scaled down by 10000.
# NOTE: this cell rebinds `lung_stats` from itself, so running it twice
# rescales lung_vol twice.
patient_order = list(patient_dcm_dict.keys())
lung_stats = (
    lung_stats
    .set_index('PatientId')
    .loc[patient_order]
    .reset_index()
    .assign(lung_vol = lambda frame: frame['lung_vol'] / 10000)
)
lung_stats.head()

We now create a tf dataset for easy batching and shuffling.

## Preprocess structured data

The cells below outline the preprocessing pipeline, from the structured (tabular) data through to the CT scan images.

In [None]:
train_data = pd.read_csv("/kaggle/input/osic-pulmonary-fibrosis-progression/train.csv")
test_data = pd.read_csv("/kaggle/input/osic-pulmonary-fibrosis-progression/test.csv")


def _as_column(series):
    """Return the values of ``series`` as an (n, 1) array for fitting."""
    return series.values.reshape(-1, 1)


def _dedup_with_patient(frame, column):
    """Return ``column`` after dropping duplicate (Patient, column) pairs."""
    return frame[['Patient', column]].drop_duplicates()[column]


# All transformers are fitted on the training table before preprocessing.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` - confirm against the installed version.
oh_encoder = OneHotEncoder(sparse = False, dtype = 'int32').fit(
    _as_column(train_data['SmokingStatus']))

# Age and Weeks are de-duplicated per patient before fitting; Percent and FVC
# are fitted on every row.
age_scaler = StandardScaler().fit(_as_column(_dedup_with_patient(train_data, 'Age')))
week_scaler = StandardScaler().fit(_as_column(_dedup_with_patient(train_data, 'Weeks')))
pct_scaler = StandardScaler().fit(_as_column(train_data['Percent']))
baseline_scaler = StandardScaler().fit(_as_column(train_data['FVC']))

# Keys are the feature names handed to osic_utils.preprocess_structured_data.
encoder_dict = dict(
    Age = age_scaler,
    Weeks = week_scaler,
    pct_bsl = pct_scaler,
    baseline = baseline_scaler,
    SmokingStatus = oh_encoder,
)

X, pid, lbl = osic_utils.preprocess_structured_data(
    train_data, patient_dcm_dict, encoder_dict)
for processed in (X, pid, lbl):
    print(processed.shape)

Concatenate the lung statistics with the structured data features

In [None]:
# Pick, for every observation, the lung-statistics row of its patient (all
# columns after the leading id column) and append it to the feature matrix.
# NOTE: `X` is rebound from itself, so re-running this cell appends the lung
# statistics a second time.
lung_feature_table = lung_stats.iloc[:, 1:].values
observation_patient_idx = tf.constant(pid, dtype = tf.int32)
lung_stats_train = tf.gather(lung_feature_table, observation_patient_idx, axis = 0)
X = tf.concat([X, lung_stats_train], axis = 1)
X

In [None]:
holdout = osic_utils.transform_holdout_data(test_data)

# Index holdout patients in order of first appearance.
holdout_patients = holdout['Patient'].unique()
patient_dcm_holdout = dict(zip(holdout_patients, range(len(holdout_patients))))

# The holdout set has no real labels: lbl_holdout is only a placeholder so
# the call mirrors the training-set preprocessing.
X_holdout, pid_holdout, lbl_holdout = osic_utils.preprocess_structured_data(
    holdout, patient_dcm_holdout, encoder_dict)
for processed_holdout in (X_holdout, pid_holdout):
    print(processed_holdout.shape)


In [None]:
# Create the tf dataset for the training scans (split = None, batch_size = 1,
# shuffle = False).
dataset = osic_utils.DatasetGen(patient_dcm_dict, split = None, seed = 300, 
                         batch_size = 1, 
                         root_dir = '/kaggle/input/osic-pulmonary-fibrosis-progression/train/',
                         shuffle = False)

# Disabled: tf dataset for the test scans. Kept as comments instead of a bare
# string literal so the cell does not echo the code as its output.
#
# holdout_ds = osic_utils.DatasetGen(patient_dcm_holdout, 
#                                    split = None,
#                                    seed = 300, batch_size =1,
#                                    root_dir = '/kaggle/input/osic-pulmonary-fibrosis-progression/test/',
#                                    struct_data = X_holdout, label = lbl_holdout, 
#                                    id_tensor = pid_holdout,
#                                    return_stats_only = True,
#                                   shuffle = False)



In [None]:
def make_joint_dist_coroutine(num_ids, pids, X):
    """Build the hierarchical linear model as an auto-batched joint distribution.

    The coroutine yields its components in this order (a surrogate posterior
    or a ``log_prob`` call must supply values in the same order):

      1. patient_scale    ~ HalfCauchy(0, 5)
      2. intercept        ~ Normal(0, 10)
      3. patient_prior    ~ MVNDiag(0, patient_scale * I), length ``num_ids``
      4. beta_week        ~ Normal(0, 10)
      5. beta_week_scale  ~ HalfCauchy(0, 5)
      6. beta_week_prior  ~ MVNDiag(0, beta_week_scale * I), length ``num_ids``
      7. betas            ~ MVNDiag(0, 10 * I), one entry per column of ``X``
                            after the first
      8. resp_scale       ~ HalfCauchy(0, 5)
      9. response         ~ Normal(total_response, resp_scale)

    where ``total_response`` is the sum of the (global + per-patient)
    intercept, the (global + per-patient) slope times column 0 of ``X``, and
    the dot product of ``betas`` with the remaining columns of ``X``.

    Args:
        num_ids: Length of the per-patient random-effect vectors.
        pids: Indices used to gather each observation's entry from the
            per-patient vectors (presumably one index per row of ``X`` -
            confirm against the caller).
        X: Feature matrix; it is cast to float32. Column 0 is multiplied by
            the week slope, the other columns by ``betas``.

    Returns:
        A ``tfd.JointDistributionCoroutineAutoBatched`` wrapping the model.
    """
    # NOTE(review): `scale_identity_multiplier` appears to be deprecated in
    # newer TensorFlow Probability releases - confirm against the installed
    # version before upgrading.
    def model():
        X_cst = tf.cast(X, tf.float32)
        # random intercepts: global intercept plus a per-patient offset
        Root = tfd.JointDistributionCoroutine.Root
        patient_scale = yield Root(tfd.HalfCauchy(loc = 0, scale = 5.))
        intercept = yield Root(tfd.Normal(loc = 0, scale = 10.))
        patient_prior = yield tfd.MultivariateNormalDiag(loc = tf.zeros(num_ids), 
                                                       scale_identity_multiplier = patient_scale)
        # look up each observation's patient offset and add the global intercept
        int_resp = tf.gather(patient_prior, pids, axis = -1) + intercept[...,tf.newaxis]
        
        # random slopes for week var: global slope plus a per-patient offset,
        # applied to the first feature column
        beta_week = yield Root(tfd.Normal(loc = 0, scale = 10.))
        beta_week_scale = yield Root(tfd.HalfCauchy(loc = 0, scale = 5.))
        beta_week_prior = yield tfd.MultivariateNormalDiag(loc = tf.zeros(num_ids),
                                                          scale_identity_multiplier = beta_week_scale)
        bw_resp = (tf.gather(beta_week_prior, pids, axis = -1)  + beta_week[..., tf.newaxis]) * X_cst[:,0]
        
        # other variables: one shared weight per remaining feature column
        betas = yield Root(tfd.MultivariateNormalDiag(loc = tf.zeros(tf.shape(X_cst)[1] - 1), 
                                                scale_identity_multiplier = 10.))
        other_vars_resp = tf.tensordot(betas, tf.transpose(X_cst[:,1:]), axes = 1)    
        total_response = int_resp + bw_resp + other_vars_resp
        
        # response scale (observation noise)
        resp_scale = yield Root(tfd.HalfCauchy(loc =0, scale = 5.))
        
        # likelihood of the observed response
        yield tfd.Normal(loc = total_response,  scale = resp_scale[...,tf.newaxis])
        
    return tfd.JointDistributionCoroutineAutoBatched(model)
    

Try sampling from the joint distribution

In [None]:
# Smoke test: draw two joint samples and list the shape of every component.
# The number of patients comes from the patient dictionary instead of a
# hard-coded count, so the per-patient vectors always cover every index in pid.
dist = make_joint_dist_coroutine(len(patient_dcm_dict), tf.constant(pid, tf.int32), X)
[i.shape for i in dist.sample(2)]

### Stochastic Variational Inference on hierarchical linear model

In [None]:
# Disabled experiment: hand-written mini-batch SVI loop over `dataset.train`.
# It is kept for reference as comments instead of a bare string literal, so
# the cell neither evaluates nor echoes the code as its output. The model is
# fitted with tfp.vi.fit_surrogate_posterior in the next cell.
#
# # define parameters to be optimized
# _init_loc = lambda name, shape = (): tf.Variable(
#     tf.random.uniform(shape, name = name,  minval=-2., maxval=2.))
#
# _init_scale =lambda name, shape = (): tfp.util.TransformedVariable(
#     initial_value=tf.random.uniform(shape, minval=0.01, maxval=1.),
#     bijector=tfb.Softplus(), name = name)
#
# ### SET PARAMETERS HERE ####
# num_epochs = 1
# print_every = 5
# num_ids = len(patient_dcm_dict.keys())
# num_draws = 2
#
# # intercept & random intercepts
# a0 = _init_loc(name = "alpha0")
# a0_sigma = _init_scale(name = "alpha0_sigma")
# patient_scale = _init_loc(name = "patient_scale")
# patient_scale_sigma = _init_scale(name = "patient_scale_sigma")
# a = _init_loc(name = "alphas", shape = [num_ids])
# a_sigma = _init_scale(name = "alphas_sigma", shape = [num_ids])
#
# int_vars = [patient_scale, patient_scale_sigma.trainable_variables[0],
#            a0, a0_sigma.trainable_variables[0],
#            a, a_sigma.trainable_variables[0]]
#
# # random slope (week variable)
# bw = _init_loc(name = "bw")
# bw_sigma = _init_scale(name = "bw_sigma")
# bws_scale = _init_loc(name = "bw_scale")
# bws_scale_sigma = _init_scale(name = "bws_scale_sigma")
# bws = _init_loc(name = "bws", shape = [num_ids])
# bws_sigma = _init_scale(name = "bws_sigma", shape = [num_ids])
#
# rnd_slope_vars  = [bw, bw_sigma.trainable_variables[0], 
#                    bws_scale, bws_scale_sigma.trainable_variables[0],
#                    bws, bws_sigma.trainable_variables[0]]
#
# # other vars
# b = _init_loc(name = "betas", shape = [11])
# b_sigma = _init_scale(name = "betas_sigma", shape = [11])
#
# other_vars = [b, b_sigma.trainable_variables[0]]
#
# # response scale
# resp_scale = _init_loc(name = "resp_scale")
# resp_scale_sigma = _init_scale(name = "resp_scale_sigma")
# response_scale_vars = [resp_scale, resp_scale_sigma.trainable_variables[0]]
#
# trainable_vars = int_vars + rnd_slope_vars + other_vars + response_scale_vars 
#
# optimizer = tf.optimizers.Adam(learning_rate = 0.001)
# epoch_loss = []
# start = time.time()
# for e in range(num_epochs):
#     batch_loss = []
#     i = 0
#     for ids, X, lbl, idx, img_stats in dataset.train:
#         with tf.GradientTape() as tape:      
#             lbl = tf.cast(lbl/1000, tf.float32) #reduce the magnitude of the labels
#             shp = tf.shape(ids)
#             # concatenate structured data and extracted image features
#             X = tf.concat([tf.cast(X, tf.float32), tf.cast(img_stats, tf.float32)], axis = 1)
#             jd = make_joint_dist_coroutine(shp.numpy()[0], idx, X)
#
#             # create surrogate posterior dynamically
#             def variational_model_fn():
#                 return tfd.JointDistributionSequentialAutoBatched([
#                   tfb.Softplus()(tfd.Normal(patient_scale, patient_scale_sigma)),  # scale_prior
#                   tfd.Normal(a0, a0_sigma),                                        # intercept
#                   tfd.Normal(tf.gather(a, ids), tf.gather(a_sigma, ids)),          # patient prior
#                   tfd.Normal(bw, bw_sigma),                                        # week random slope
#                   tfb.Softplus()(tfd.Normal(bws_scale, bws_scale_sigma)),          # bw scale prior
#                   tfd.Normal(tf.gather(bws, ids), tf.gather(bws_sigma, ids)),      # patient by week prior
#                   tfd.Normal(b, b_sigma),                                          # other vars prior
#                   tfb.Softplus()(tfd.Normal(resp_scale, resp_scale_sigma)),        # response scale
#
#                 ])
#
#             # create the surrogate posterior
#             surrogate_pos = variational_model_fn()
#             # calculate losses
#             loss = tfp.vi.monte_carlo_variational_loss(
#                 lambda *args: jd.log_prob(*args, lbl),
#                 surrogate_pos,
#                 sample_size = num_draws,
#                 use_reparameterization = True
#             )
#
#         # compute gradients 
#         grad = tape.gradient(loss, trainable_vars)        
#         optimizer.apply_gradients(zip(grad, trainable_vars))
#         print("epoch {}, batch {} loss: {}".format(e, i, loss))
#         i += 1
#         batch_loss.append(loss)
#
#     epoch_loss.append(np.mean(batch_loss))
#     if e % print_every == 0:
#         print("epoch {} loss: {}".format(e, np.mean(batch_loss)))
# end = time.time()
#
# print("Total processing time:", str((end - start)/60), "mins")

In [None]:
# Joint model over the full training set: one random intercept and one random
# slope per patient in the dictionary.
num_ids = len(patient_dcm_dict)
jd = make_joint_dist_coroutine(num_ids, pid, X)


def target_log_prob_fn(*args):
    """Joint log-density of the latent values ``args`` and the observed labels.

    The labels are divided by 1000 before being passed to the model.
    """
    observed = lbl/1000
    return jd.log_prob(*args, observed)
    
def _init_loc(shape=()):
    """Trainable location, initialised uniformly between -2 and 2."""
    return tf.Variable(tf.random.uniform(shape, minval=-2., maxval=2.))


def _init_scale(shape=()):
    """Trainable scale kept positive by Softplus, initialised between 0.01 and 1."""
    return tfp.util.TransformedVariable(
        initial_value=tf.random.uniform(shape, minval=0.01, maxval=1.),
        bijector=tfb.Softplus())


def _normal_factor(shape=()):
    """Normal factor with its own trainable location and scale."""
    return tfd.Normal(_init_loc(shape), _init_scale(shape))


def _positive_factor():
    """Scalar Normal factor pushed through Softplus, for positive-valued latents."""
    return tfb.Softplus()(_normal_factor())


num_other_vars = X.shape[1] - 1

# One factor per latent variable, in the order the joint model yields them.
surrogate_posterior = tfd.JointDistributionSequentialAutoBatched([
    _positive_factor(),                  # scale_prior
    _normal_factor(),                    # intercept
    _normal_factor([num_ids]),           # patient prior
    _normal_factor(),                    # week random slope
    _positive_factor(),                  # week scale prior
    _normal_factor([num_ids]),           # patient by week prior
    _normal_factor([num_other_vars]),    # other vars prior
    _positive_factor(),                  # response scale
])


# Optimisation settings for the variational fit.
optimizer = tf.optimizers.Adam(learning_rate=1e-4)
fit_settings = dict(
    optimizer=optimizer,
    num_steps=100000,
    seed=42,
    sample_size=500,
)

# Fit the surrogate posterior; `losses` holds the loss trace returned by the fit.
start = time.time()
losses = tfp.vi.fit_surrogate_posterior(
    target_log_prob_fn, surrogate_posterior, **fit_settings)
end = time.time()

elapsed_minutes = (end - start)/60
print("processing time:", str(elapsed_minutes), "min")

# sample_distributions() returns the component distributions together with a
# sample; only the distributions are kept, one name per latent variable in
# model order.
posterior_factors, _ = surrogate_posterior.sample_distributions()
(scale_prior_,
 intercept_,
 patient_weights,
 week_slope,
 week_scale_prior,
 week_patient_weights,
 other_vars_weights,
 response_scale) = posterior_factors


### Model Checking

In [None]:
# Posterior means of the population-level parameters.
for caption, posterior_factor in (
        ('intercept:', intercept_),
        ('week weight:', week_slope),
        ('other var weights', other_vars_weights)):
    print(caption, posterior_factor.mean())

In [None]:
# Loss trace returned by the variational fit, one value per optimisation step.
fig = plt.figure(figsize = (10, 6))
loss_ax = fig.gca()
loss_ax.plot(losses)
loss_ax.set_title('ELBO loss across epochs', size = 15)
plt.show()

### Prediction

In [None]:
def predict_from_params(X):
    """Predict the expected response from the posterior means of the shared parameters.

    Only the global intercept, the global week slope and the weights of the
    remaining features are used; the per-patient random effects are not part
    of this prediction.

    Args:
        X: 2-D feature matrix. Column 0 is multiplied by the week slope and
            the other columns by ``other_vars_weights``.

    Returns:
        Array with one row per row of ``X`` and a single column.
    """
    def _mean_as_column(factor):
        # posterior mean reshaped to a column vector
        return factor.mean().numpy().reshape(-1, 1)

    feature_weights = _mean_as_column(other_vars_weights)
    intercept = _mean_as_column(intercept_)
    week_weight = _mean_as_column(week_slope)

    covariate_term = np.matmul(X[:, 1:], feature_weights)
    week_term = week_weight * X[:, :1]
    return covariate_term + week_term + intercept

### Check train fit

In [None]:
# Predicted versus actual response on the training set (labels divided by
# 1000); the dashed diagonal marks a perfect fit.
lbl_scaled = lbl/1000
min_FVC = np.min(lbl_scaled)
max_FVC = np.max(lbl_scaled)
fit_fig, fit_ax = plt.subplots(1,1, figsize = (8,6))
fit_ax.scatter(np.squeeze(predict_from_params(X)), lbl_scaled, alpha =0.3)
fit_ax.set_xlabel('Predicted', size = 10)
fit_ax.set_ylabel('Actual', size = 10)
fit_ax.plot([min_FVC, max_FVC], [min_FVC, max_FVC], "--")

In [None]:
# Predict every holdout patient: compute that patient's CT statistics, tile
# them across the patient's rows and append them to the structured features.
# NOTE(review): the first statistic is divided by 1000 here, while the
# training table divides `lung_vol` by 10000 - confirm the two scalings are
# meant to differ.
# NOTE(review): the predictions are later concatenated in this loop's order,
# which presumably matches the row order of `holdout` - verify.
preds_holdout = []
for ids, idx in patient_dcm_holdout.items():
    scan_dir = "/kaggle/input/osic-pulmonary-fibrosis-progression/test/" + ids
    stats = ct.image_processing_pipeline(scan_dir, return_stats_only = True)
    stats[0,0] = stats[0,0]/1000

    # rows of the holdout feature matrix that belong to this patient
    patient_rows = np.where(pid_holdout == idx)[0]
    structured_part = np.take(X_holdout, patient_rows, axis = 0)
    image_part = np.repeat(stats, patient_rows.shape[0], axis=0)

    patient_features = np.concatenate([structured_part, image_part], axis=1)
    preds_holdout.append(predict_from_params(patient_features))
    
    

In [None]:
def _patient_week_label(record):
    """Build the '<Patient>_<week>' key, where week is Weeks + Weeks_base."""
    return '{}_{}'.format(record['Patient'], record['Weeks'] + record['Weeks_base'])


# Predictions and the confidence value are multiplied by 1000, undoing the
# /1000 applied to the labels during fitting.
stacked_predictions = np.concatenate(preds_holdout, axis = 0)
holdout['FVC'] = stacked_predictions * 1000

# Confidence: average of 50000 draws from the fitted response-scale factor.
response_scale_draws = response_scale.sample(50000).numpy()
holdout['Confidence'] = np.mean(response_scale_draws) * 1000

holdout['Patient_Week'] = holdout[['Patient', 'Weeks', 'Weeks_base']] \
    .apply(_patient_week_label, axis = 1)
holdout


In [None]:
# Keep only the submission columns and write them to disk without the index.
submission_columns = ['Patient_Week',  'FVC', 'Confidence']
submission = holdout[submission_columns]
submission.to_csv("submission.csv", index = False)

In [None]:
# Preview the first rows of the submission table.
submission.head()