In [None]:
import cv2
import matplotlib
import numpy as np
import os
import pandas as pd
import pickle
import pydicom as dicom
import tensorflow as tf
import time

from keras import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.models import load_model
from keras.optimizers import SGD
from matplotlib import pyplot as plt
from PIL import Image
from scipy import ndimage
from skimage import measure, morphology
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from statsmodels.formula.api import quantreg
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# PART 1: Quantile Regression model
# Model obtained from:
# https://www.kaggle.com/titericz/tabular-simple-eda-linear-model 
os.chdir('/kaggle/input/osic-pulmonary-fibrosis-progression')

# Load the tabular data and tag every row with its origin:
# traintest == 0 -> train.csv, traintest == 1 -> test.csv
train = pd.read_csv('train.csv').assign(traintest=0)
test = pd.read_csv('test.csv').assign(traintest=1)

# Build the submission skeleton: 'Patient_Week' is '<patient>_<week>'
submission = pd.read_csv('sample_submission.csv')
patient_week_parts = [entry.split('_') for entry in submission['Patient_Week']]
submission['Weeks'] = [int(parts[-1]) for parts in patient_week_parts]
submission['Patient'] = [parts[0] for parts in patient_week_parts]

# Fit on train and test rows together, ordered by patient and week
train = pd.concat((train, test)).sort_values(['Patient', 'Weeks'])

# Encode string columns as integer codes
for column in ('Sex', 'SmokingStatus'):
    train[column] = pd.factorize(train[column])[0]

# Standardize every model input except Weeks (zero mean, unit sample std)
for column in ('Percent', 'Age', 'Sex', 'SmokingStatus'):
    train[column] = (train[column] - train[column].mean()) / train[column].std()

# Train one model per quantile: lower (0.15), median (0.50), upper (0.85)
formula = 'FVC ~ Weeks+Percent+Age+Sex+SmokingStatus'
modelL = quantreg(formula, train).fit(q=0.15)
model = quantreg(formula, train).fit(q=0.50)
modelH = quantreg(formula, train).fit(q=0.85)

# In-sample predictions, used to evaluate the model on the train set
train['ypredL'] = modelL.predict(train).values
train['ypred'] = model.predict(train).values
train['ypredH'] = modelH.predict(train).values
train['ypredstd'] = (0.5 * (train['ypredH'] - train['ypred']).abs()
                     + 0.5 * (train['ypred'] - train['ypredL']).abs())

# Attach the standardized covariates of the test.csv rows to every
# (patient, week) pair requested in the submission file
covariate_columns = ['Patient', 'Percent', 'Age', 'Sex', 'SmokingStatus']
dt = train.loc[train['traintest'] == 1, covariate_columns]
test = submission.merge(dt, on='Patient', how='left').sort_values(['Patient', 'Weeks'])

# Predict FVC (median) and a confidence equal to half the 0.15-0.85 band
test['ypredL'] = modelL.predict(test).values
test['FVC'] = model.predict(test).values
test['ypredH'] = modelH.predict(test).values
test['Confidence'] = (test['ypredH'] - test['ypredL']).abs() / 2

# CNN regression target: the median model's FVC prediction at Weeks = 0.
# Take each patient's first row, keep its real week as 'Week_i' and
# substitute Weeks = 0 before predicting.
cnn_train = (
    train.groupby('Patient').first()
    .rename(columns={'Weeks': 'Week_i'})
    .assign(Weeks=0)
)

cnn_train['FVC_0'] = model.predict(cnn_train).values

In [None]:
# Preprocessing functions
def processImage(image, slope, intercept):
    """Turn one raw CT slice into a downsampled lung mask.

    Steps, in order: centre-crop non-square slices to 512x512, convert raw
    values to Hounsfield Units, window to [-1000, 400] HU and scale to
    [0, 1], segment the lungs with 2-cluster K-Means, and zoom the resulting
    mask to 128x128.

    Args:
        image: 2-D array of raw pixel values (the caller passes
            ``dcm.pixel_array``).
        slope: multiplicative rescale factor (the caller passes
            ``dcm.RescaleSlope``).
        intercept: additive rescale offset (the caller passes
            ``dcm.RescaleIntercept``).

    Returns:
        2-D int8 array holding the zoomed lung mask. The mask is 0/1 before
        zooming; ``ndimage.zoom`` uses spline interpolation, so values
        slightly outside {0, 1} are possible afterwards.
    """
    # Crop image if not square
    def crop(image):
        if image.shape[0] == image.shape[1]:
            return image

        target = 512
        height, width = image.shape[0], image.shape[1]

        # Use explicit start/stop offsets: the previous `[c:-c]` slicing
        # returned an empty array when a side was already 512 (c == 0) and
        # left 513 pixels when the excess was odd. Sides shorter than 512
        # are left untouched.
        top = max((height - target) // 2, 0)
        left = max((width - target) // 2, 0)

        return image[top:top + target, left:left + target]

    # Transform image values into Hounsfield Units with formula given in
    # https://www.kaggle.com/avirdee/understanding-dicoms
    def rescale(image):
        return image * slope + intercept

    # Normalize image values to between 0 and 1
    def normalize(image):
        min_value = -1000
        max_value = 400

        clipped = np.clip(image, min_value, max_value)
        scaled = (clipped - min_value) / (max_value - min_value)

        return scaled.astype("float32")

    # Lung masking/segmentation
    # https://www.kaggle.com/andradaolteanu/pulmonary-fibrosis-competition-eda-dicom-prep
    def mask(image):
        rows = image.shape[0]
        cols = image.shape[1]

        # Using K-Means to split the pixels into two intensity clusters;
        # everything below the midpoint of the two centres is a candidate.
        flat_image = np.reshape(image, [np.prod(image.shape), 1])
        kmeans = KMeans(n_clusters=2).fit(flat_image)
        centers = sorted(kmeans.cluster_centers_.flatten())
        threshold = np.mean(centers)
        segmented = np.where(image < threshold, 1.0, 0.0)

        # Access each segmented region individually
        labels = measure.label(segmented)
        final_masks = []
        for region in measure.regionprops(labels):
            # bbox is (min_row, min_col, max_row, max_col)
            min_row, min_col, max_row, max_col = region.bbox
            # Hard-coded to ignore "borders" of the image.
            # max_row is a row coordinate, so it is compared with `rows`
            # (it was previously compared with `cols`, which is only
            # equivalent for square slices).
            if (max_row - min_row < rows / 10 * 9 and
                    max_col - min_col < cols / 10 * 9 and
                    min_row > rows / 10 and
                    max_row < rows / 10 * 9):
                final_masks.append(region.label)

        # Final mask to segment out the lungs: 1 where the pixel belongs to
        # any retained region, 0 elsewhere.
        return np.isin(labels, final_masks).astype(np.int8)

    def resize(image):
        resize_shape = (128, 128)

        # One zoom factor per axis so non-square inputs also end up 128x128
        factors = [target / current
                   for target, current in zip(resize_shape, image.shape)]

        return ndimage.zoom(image, factors)

    cropped = crop(image)
    rescaled = rescale(cropped)
    normalized = normalize(rescaled)
    masked = mask(normalized)
    resized = resize(masked)

    return resized

def processVolume(volume):
    """Resample a 3-D stack to a fixed (128, 128, 64) int8 volume.

    The first axis of ``volume`` is moved to the last position, the result is
    zoomed with ``ndimage.zoom`` so that its shape becomes (128, 128, 64),
    and the zoomed array is cast to int8.

    Args:
        volume: 3-D array; the caller stacks processed slices along axis 0.

    Returns:
        int8 array of shape (128, 128, 64).
    """
    target_height, target_width, target_depth = 128, 128, 64

    # Axis order (0, 1, 2) -> (1, 2, 0)
    reordered = np.moveaxis(volume, 0, -1)
    height, width, depth = reordered.shape

    zoom_factors = (
        target_height / height,
        target_width / width,
        target_depth / depth,
    )

    return ndimage.zoom(reordered, zoom_factors).astype('int8')

In [None]:
# Extract 3D CT-scan from each patient
os.chdir('/kaggle/input/osic-pulmonary-fibrosis-progression')

CT_scans = {}
# Patients whose scan could not be processed, mapped to the error raised
skipped_patients = {}

for patient in os.listdir('train'):
    try:
        patient_path = os.path.join('train', patient)

        # File names are '<number>.dcm'; sort numerically on the stem
        files = sorted(os.listdir(patient_path), key=lambda x: int(x[:-4]))

        # Keep every `ratio`-th slice so long series are thinned out
        # before the volume is resampled to 64 slices
        ratio = len(files) // 64 if len(files) >= 64 else 1

        volume = []

        # Extract CT-scan slices from patient's .dicom files
        for file in files[::ratio]:
            dcm = dicom.dcmread(os.path.join(patient_path, file))
            processed = processImage(dcm.pixel_array,
                                     dcm.RescaleSlope,
                                     dcm.RescaleIntercept)
            volume.append(processed)

        CT_scans[patient] = processVolume(np.array(volume))

    except Exception as error:
        # Best-effort extraction: a patient that fails is left out of
        # CT_scans, but the failure is recorded and reported instead of
        # being swallowed silently. KeyboardInterrupt is no longer caught.
        skipped_patients[patient] = repr(error)
        print('Skipping patient {}: {!r}'.format(patient, error))

In [None]:
# Train / Test sets
x_train = []
y_train = []


def augmentVolume(volume, label):
    """Append a volume and five rotated copies of it to x_train / y_train.

    Every sample gets a trailing channel axis and shares the same label.
    Rotations use ``ndimage.rotate`` with its default rotation plane.
    """
    x_train.append(np.expand_dims(volume, axis=3))
    y_train.append(label)

    # Rotation x5
    for rotation in [30, 60, 90, 120, 150]:
        augmented = ndimage.rotate(volume, rotation, reshape=False, mode='nearest')

        # Interpolation can leave the [0, 1] mask range; clamp it back
        np.clip(augmented, 0, 1, out=augmented)

        x_train.append(np.expand_dims(augmented, axis=3))
        y_train.append(label)


# Augment CT-scans
# NOTE(review): the rotated copies of one patient are appended as independent
# samples, so a later random split can place them on both sides of the split.
for patient, volume in CT_scans.items():
    # Clamp in place on purpose: the arrays stored in CT_scans are updated too
    np.clip(volume, 0, 1, out=volume)

    # Label lookup by index label; the previous `[...]['FVC_0'][0]` relied on
    # positional fallback of an integer key on a string-indexed Series
    label = cnn_train.loc[patient, 'FVC_0']

    augmentVolume(volume, label)

In [None]:
# Prepare train / test sets
x_train = np.array(x_train)
y_train = np.array(y_train)

# Standardize the regression target (fitted on all samples, before the split)
target_column = y_train.reshape(len(y_train), 1)
y_train = StandardScaler().fit_transform(target_column)[:, 0]

# Hold out 25% of the samples for validation, then reshuffle both parts
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.25)
x_train, y_train = shuffle(x_train, y_train)
x_test, y_test = shuffle(x_test, y_test)

train_loader = tf.data.Dataset.from_tensor_slices((x_train, y_train))
validation_loader = tf.data.Dataset.from_tensor_slices((x_test, y_test))

batch_size = 8


def _shuffle_and_batch(loader, sample_count):
    """Shuffle over the full sample count, batch, and prefetch 8 batches."""
    return loader.shuffle(sample_count).batch(batch_size).prefetch(8)


train_dataset = _shuffle_and_batch(train_loader, len(x_train))
validation_dataset = _shuffle_and_batch(validation_loader, len(x_test))

In [None]:
# 3D-CNN model
def build_model():
    """Build and compile the 3D-CNN regressor.

    Architecture: four Conv3D -> MaxPool3D -> BatchNormalization stages
    (64, 64, 128, 256 filters), global average pooling, a 256-unit dense
    layer with dropout 0.3, a 3-unit ReLU layer named 'intermediate', and a
    single linear output unit. Input shape is (128, 128, 64, 1).

    Returns:
        The model compiled with mean-squared-error loss and Adam driven by a
        staircase exponential-decay learning-rate schedule.
    """

    def conv_stage(filters):
        # One convolutional stage; layers are created in network order
        return [
            layers.Conv3D(filters=filters, kernel_size=3, activation="relu"),
            layers.MaxPool3D(pool_size=2),
            layers.BatchNormalization(),
        ]

    network = [keras.Input((128, 128, 64, 1))]
    for filters in (64, 64, 128, 256):
        network.extend(conv_stage(filters))

    network.extend([
        layers.GlobalAveragePooling3D(),
        layers.Dense(units=256, activation="relu"),
        layers.Dropout(0.3),
        # The 3 activations of this layer are read out by name elsewhere
        layers.Dense(units=3, activation="relu", name='intermediate'),
        layers.Dense(units=1, activation="linear"),
    ])

    model = Sequential(network)

    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        0.0001, decay_steps=100000, decay_rate=0.98, staircase=True
    )
    optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)
    model.compile(loss='mean_squared_error',
                  optimizer=optimizer,
                  metrics=['mse'])

    return model

model = build_model()
model.summary()

In [None]:
# Train model
# Stop once val_loss has not improved for 20 epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)

# Keep only the weights with the lowest val_loss seen so far
best_checkpoint = ModelCheckpoint(
    '/kaggle/working/test.hdf5',
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1,
)

callbacks = [early_stopping, best_checkpoint]

model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=300,
    callbacks=callbacks,
)

In [None]:
# NOTE(review): this loads from /kaggle/input/ctscans, not from the
# /kaggle/working/test.hdf5 checkpoint written during training -- presumably
# a previously trained checkpoint attached as a dataset; confirm.
model = load_model('/kaggle/input/ctscans/test.hdf5')

# Obtain feature extractor from trained model: same input, but the output is
# taken at the layer named 'intermediate' instead of the final prediction
intermediate_output = model.get_layer('intermediate').output
extractor = keras.Model(inputs=model.input, outputs=intermediate_output)

In [None]:
# Extract features from each patient's CT-scan
features = {}
f_1 = {}
f_2 = {}
f_3 = {}

for patient, volume in CT_scans.items():
    # Add the batch axis and the trailing channel axis explicitly, so the
    # input has the same per-sample layout as the training samples
    # (volume + channel axis) instead of relying on implicit rank fix-up.
    batch = volume[np.newaxis, ..., np.newaxis]
    feature = extractor.predict(batch)[0]

    features[patient] = feature
    f_1[patient] = feature[0]
    f_2[patient] = feature[1]
    f_3[patient] = feature[2]

In [None]:
# PART 2: Retrain Quantile Regression model using features
os.chdir('/kaggle/input/osic-pulmonary-fibrosis-progression')

train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

# Marker to mark rows as from train or test csv
train['traintest'] = 0
test ['traintest'] = 1

# Generate format for submission.csv ('Patient_Week' is '<patient>_<week>')
submission = pd.read_csv('sample_submission.csv')
submission['Weeks'] = submission['Patient_Week'].apply(lambda x: int(x.split('_')[-1]))
submission['Patient'] = submission['Patient_Week'].apply(lambda x: x.split('_')[0])

train = pd.concat((train, test))
train = train.sort_values(['Patient', 'Weeks'])

# Add feature columns (NaN for patients without extracted CNN features)
feature_columns = ['f1', 'f2', 'f3']
for column, per_patient in zip(feature_columns, (f_1, f_2, f_3)):
    train[column] = train['Patient'].map(per_patient)

# Drop Patient ID00011637202177653955184
# Only rows lacking CNN features are removed; an unrestricted dropna() would
# also discard rows with a missing value in any unrelated column.
# NOTE(review): this also removes test.csv rows of patients without
# features; their submission rows then have no covariates -- confirm intended.
train = train.dropna(subset=feature_columns)

# Encode string value into categorical value
for column in ('Sex', 'SmokingStatus'):
    train[column] = pd.factorize(train[column])[0]

# Standardize every model input except Weeks (zero mean, unit sample std)
for column in ['Percent', 'Age', 'Sex', 'SmokingStatus'] + feature_columns:
    train[column] = (train[column] - train[column].mean()) / train[column].std()

# Train model: lower (0.15), median (0.50) and upper (0.85) quantiles
formula = 'FVC ~ Weeks+Percent+Age+Sex+SmokingStatus+f1+f2+f3'
modelL = quantreg(formula, train).fit(q=0.15)
model  = quantreg(formula, train).fit(q=0.50)
modelH = quantreg(formula, train).fit(q=0.85)

# This part is used to evaluate model on train set
# Can ignore for now
train['ypredL'] = modelL.predict(train).values
train['ypred']  = model.predict(train).values
train['ypredH'] = modelH.predict(train).values
train['ypredstd'] = 0.5 * np.abs(train['ypredH'] - train['ypred']) + \
                    0.5 * np.abs(train['ypred'] - train['ypredL'])

# Prepare test dataframe for predicting FVC values
covariate_columns = ['Patient', 'Percent', 'Age', 'Sex', 'SmokingStatus'] + feature_columns
dt = train.loc[train.traintest == 1, covariate_columns]
test = pd.merge(submission, dt, on='Patient', how='left')
test = test.sort_values(['Patient', 'Weeks'])

# Predict FVC values for test dataframe; Confidence is half the 0.15-0.85 band
test['ypredL'] = modelL.predict(test).values
test['FVC']    = model.predict(test).values
test['ypredH'] = modelH.predict(test).values
test['Confidence'] = np.abs(test['ypredH'] - test['ypredL']) / 2

# Save submission file
submission = test[['Patient_Week','FVC','Confidence']]
submission.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
# Additional function used to make CT-scan gif
def make_gif():
    """Write 'original.gif' and 'masked.gif' for the first listed patient.

    Every 10th slice of the first directory returned by
    ``os.listdir('train')`` is saved as a PNG twice: once as the HU-windowed
    scan (under /kaggle/working/original) and once as the output of
    ``processImage`` (under /kaggle/working/masked). The PNGs written in
    this call are then assembled into two GIFs in /kaggle/working.

    Side effects: changes the working directory (it is left at
    /kaggle/working) and creates the two frame directories if missing.
    """
    def save_gif(frame_paths, gif_name):
        # Assemble the given PNG frames, in order, into one animated GIF
        if not frame_paths:
            return
        frames = [Image.open(path) for path in frame_paths]
        try:
            frames[0].save(gif_name, format='GIF',
                           append_images=frames[1:],
                           save_all=True,
                           duration=150,
                           loop=10)
        finally:
            for frame in frames:
                frame.close()

    os.chdir('/kaggle/input/osic-pulmonary-fibrosis-progression')

    original_dir = '/kaggle/working/original'
    masked_dir = '/kaggle/working/masked'
    # imsave does not create missing directories
    os.makedirs(original_dir, exist_ok=True)
    os.makedirs(masked_dir, exist_ok=True)

    # Make gif of CT scan from patient 0
    patient = os.listdir('train')[0]
    patient_path = os.path.join('train', patient)

    files = os.listdir(patient_path)
    files = sorted(files, key=lambda x: int(x[:-4]))

    original_paths = []
    masked_paths = []

    for index, file in enumerate(files):
        # Select just 1/10 of the total slices
        if index % 10 != 0:
            continue

        dcm = dicom.dcmread(os.path.join(patient_path, file))
        image = dcm.pixel_array

        # "Original" frame: the scan in Hounsfield Units, clipped to the
        # same [-1000, 400] window that processImage normalizes to.
        original = np.clip(image * dcm.RescaleSlope + dcm.RescaleIntercept,
                           -1000, 400)

        # processImage already returns the lung mask. The previous
        # `mask(processed)` call could not work: `mask` is a helper nested
        # inside processImage, and the name was also rebound as a loop
        # variable further down in this function.
        masked = processImage(image, dcm.RescaleSlope, dcm.RescaleIntercept)

        # Save slices as .png files first
        gif_original_path = os.path.join(original_dir, '{}.png'.format(index))
        gif_masked_path = os.path.join(masked_dir, '{}.png'.format(index))

        matplotlib.image.imsave(gif_original_path, original)
        matplotlib.image.imsave(gif_masked_path, masked)

        original_paths.append(gif_original_path)
        masked_paths.append(gif_masked_path)

    os.chdir('/kaggle/working')

    # Use only the frames written above (already in slice order), so PNGs
    # left over from earlier runs are not mixed into the animation.
    save_gif(original_paths, 'original.gif')
    save_gif(masked_paths, 'masked.gif')
    
# NOTE(review): `show_gif` is not defined or imported in the visible part of
# this notebook, and `make_gif()` is never called above, so "masked.gif" may
# not exist when this line runs -- confirm before relying on it.
show_gif(filename="masked.gif", format='png', width=200, height=200)