# Work in Progress
OSIC Pulmonary Fibrosis Progression
Data preprocessing for patients and their images
Model selection and parameter settings

In [None]:
# Install the gdcm package from conda-forge (presumably needed to decode some DICOM pixel data — confirm).
!conda install -c conda-forge gdcm -y

In [None]:
from __future__ import print_function

import os
from os import listdir
import IPython
import IPython.display
import copy
import pandas as pd
import numpy as np
import pydicom as dicom
from pydicom import dcmread

import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import glob
from typing import Dict

from sklearn.preprocessing import RobustScaler
from scipy import ndimage
from scipy.ndimage.interpolation import zoom

from skimage import measure, morphology, segmentation
from skimage.measure import label, regionprops
from skimage.morphology import binary_closing
from skimage.segmentation import clear_border

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models

In [None]:
# Root directory of the competition data; several later cells compare
# against this exact string to pick the folder layout.
file_path = "../input/osic-pulmonary-fibrosis-progression/"

# Show what the data directory contains.
listdir(file_path)

In [None]:
# Load the three CSV tables shipped with the dataset.
train_df = pd.read_csv(file_path + "train.csv")
test_df = pd.read_csv(file_path + "test.csv")
sub_df = pd.read_csv(file_path + "sample_submission.csv")

# Preview the training table.
train_df.head()

In [None]:
# Preview the test table.
test_df.head()

In [None]:
# Preview the sample submission table.
sub_df.head()

In [None]:
# Every training row that shares its (Patient, Weeks) pair with another row.
duplicate_data = train_df[train_df.duplicated(subset=["Patient", "Weeks"], keep=False)]

duplicate_data

In [None]:
# Keep only the last row of each (Patient, Weeks) pair, modifying train_df in place.
train_df.drop_duplicates(subset=["Patient", "Weeks"], keep="last", inplace=True)
train_df.info()

In [None]:
# Split the "Patient_Week" id into its patient and week parts.
sub_df[["Patient", "Weeks"]] = sub_df["Patient_Week"].str.split("_", expand=True)
sub_df = sub_df[["Patient", "Weeks", "Patient_Week"]]
# Attach the per-patient columns of test_df; its own "Weeks" column is
# dropped so that the weeks parsed from the submission ids are kept.
sub_df = sub_df.merge(test_df.drop("Weeks", axis=1), on="Patient")

# Tag the origin of every row so the combined frame can be split again later.
train_df["Source"] = "train"
sub_df["Source"] = "test"

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
# in the same order.
dataset = pd.concat([train_df, sub_df])
dataset.reset_index(drop=True, inplace=True)
dataset.head()

In [None]:
# Derived column: FVC divided by Percent, scaled by 100.
dataset["FVC_ave"] = (dataset["FVC"] ) / dataset["Percent"] * 100
dataset.head(10)

In [None]:
def baseline_week(df):
    """Add each patient's earliest week and the offset of every row from it.

    Args:
        df: DataFrame with at least "Patient" and "Weeks" columns. "Weeks"
            may hold strings; it is cast to int.

    Returns:
        A copy of ``df`` with "Weeks" as int plus two new columns:
        "min_weeks" (smallest week seen for that patient) and
        "baseline_week" ("Weeks" minus "min_weeks"). The input is not modified.
    """
    df = df.copy()
    df["Weeks"] = df["Weeks"].astype(int)
    # The original first filled "min_weeks" with NaN for test rows, but that
    # value was overwritten immediately by the line below, so it is dropped.
    df["min_weeks"] = df.groupby("Patient")["Weeks"].transform("min")
    df["baseline_week"] = df["Weeks"] - df["min_weeks"]

    return df

In [None]:
# Add the "min_weeks" and "baseline_week" columns to the combined table.
dataset = baseline_week(dataset)
dataset.head()

In [None]:
def get_baseline_fvc(df):
    """Attach each patient's FVC at their earliest week as "base_fvc".

    Args:
        df: DataFrame with "Patient", "Weeks", "min_weeks" and "FVC" columns.

    Returns:
        A copy of ``df`` left-merged with a per-patient "base_fvc" column,
        taken from the first row whose "Weeks" equals "min_weeks".
    """
    df = df.copy()
    # Fixed: the original filtered the baseline rows and then overwrote the
    # result with the unfiltered frame, so "base_fvc" was simply the FVC of
    # the first row of each patient regardless of its week.
    base = df.loc[df["Weeks"] == df["min_weeks"], ["Patient", "FVC"]]
    # One baseline row per patient (first occurrence wins).
    base = base.drop_duplicates(subset="Patient", keep="first")
    base = base.rename(columns={"FVC": "base_fvc"})

    return df.merge(base, on="Patient", how="left")

In [None]:
# Add the per-patient "base_fvc" column to the combined table.
dataset = get_baseline_fvc(dataset)
dataset.head()

In [None]:
# Replace the two text columns with their integer category codes.
dataset["Sex"] = pd.Categorical(dataset["Sex"])
dataset["Sex"] = dataset.Sex.cat.codes
dataset["SmokingStatus"] = pd.Categorical(dataset["SmokingStatus"])
dataset["SmokingStatus"] = dataset.SmokingStatus.cat.codes

dataset.tail()

In [None]:
# Sorted unique patient ids that appear in the test table.
true_pat_res = test_df.Patient.unique()
true_pat_res.sort()

# Training rows belonging to those patients, copied so edits do not touch train_df.
true_result = train_df.loc[train_df["Patient"].isin(true_pat_res)].copy()
true_result.info()

In [None]:
# NOTE(review): this de-duplicates across train and test rows together, so a
# train row sharing (Patient, Weeks) with a later test row is dropped — confirm intended.
dataset.drop_duplicates(subset=["Patient", "Weeks"], keep="last", inplace=True)

# Split the combined table back into its train and test parts.
train_df = dataset.loc[dataset["Source"] == "train"].copy()

test_df = dataset.loc[dataset["Source"] == "test"].copy()
# The origin tag is no longer needed after the split.
train_df.drop("Source", axis=1, inplace=True)
test_df.drop("Source", axis=1, inplace=True)


train_df.head()

In [None]:
# Remove helper columns from the training table, in place.
train_df.drop(["Patient_Week", "Percent", "min_weeks"], axis=1, inplace=True)

train_df.head()

In [None]:
# Preview the test rows before dropping helper columns.
test_df.head()

In [None]:
# Remove the same helper columns from the test table, in place.
test_df.drop(["Percent", "Patient_Week", "min_weeks"], axis=1, inplace=True)

test_df

In [None]:
# Column dtypes and non-null counts of the test table.
test_df.info()

In [None]:
# Build the DICOM folder path of every training row. The competition layout
# uses one folder per patient; any other file_path is expected to provide
# StudyInstanceUID / SeriesInstanceUID columns instead.
if file_path == "../input/osic-pulmonary-fibrosis-progression/":
    train_df["dcm_path"] = file_path + "train/" + train_df.Patient + "/"
else:
    train_df["dcm_path"] = file_path + "train/" + train_df.StudyInstanceUID + "/" + train_df.SeriesInstanceUID

In [None]:
# Same path construction for the test rows, under the "test/" folder.
if file_path == "../input/osic-pulmonary-fibrosis-progression/":
    test_df["dcm_path"] = file_path + "test/" + test_df.Patient + "/"
else:
    test_df["dcm_path"] = file_path + "test/" + test_df.StudyInstanceUID + "/" + test_df.SeriesInstanceUID

In [None]:
# Collect one DICOM folder path per unique patient.
patient_id = []
patient_path = []

# The id column depends on which dataset layout file_path points at.
if file_path == "../input/osic-pulmonary-fibrosis-progression/":
    patients = train_df.Patient.unique()
else:
    patients = train_df.StudyInstanceUID.unique()

for patient in patients:
    patient_id.append(patient)
    # All rows of a patient share one folder, so the first row's path is used.
    if file_path == "../input/osic-pulmonary-fibrosis-progression/":
        path = train_df[train_df.Patient == patient].dcm_path.values[0]
    else:
        path = train_df[train_df.StudyInstanceUID == patient].dcm_path.values[0]
    # First file listed in the folder, read below as a sample.
    ex_dcm = listdir(path)[0]
    patient_path.append(path)
    # NOTE(review): ds is not used afterwards in the visible code — confirm this read is still needed.
    ds = dcmread(path + "/" + ex_dcm)



# One row per patient: id and DICOM folder.
patient_df = pd.DataFrame(data=patient_id, columns=["patient"])
patient_df.loc[:, "patient_path"] = patient_path
patient_df.head()

In [None]:
def load_scan(dcm_path):
    """Read all DICOM slices of one scan folder and return them in order.

    When ``file_path`` is the competition input directory, files are expected
    to be named ``<number>.dcm`` and are read in descending numeric filename
    order; their headers are left untouched. For any other ``file_path``,
    slices without a "SliceLocation" element are dropped, the rest are sorted
    by ``InstanceNumber``, and ``SliceThickness`` of every slice is
    overwritten with the distance between the first two slices.

    Args:
        dcm_path: Directory holding the DICOM files of a single scan.

    Returns:
        list of datasets as returned by ``dcmread``, one per slice.

    Raises:
        ValueError: in the competition layout, if a file name does not start
            with an integer.
    """
    # Fixed: the original compared against "/..input/...", which never
    # matched, so this branch (and its `files.split` / `np.int` bugs) was dead.
    if file_path == "../input/osic-pulmonary-fibrosis-progression/":
        numbers = sorted(
            (int(name.split(".")[0]) for name in listdir(dcm_path)),
            reverse=True,
        )
        return [dcmread(os.path.join(dcm_path, f"{number}.dcm")) for number in numbers]

    slices = [dcmread(os.path.join(dcm_path, name)) for name in listdir(dcm_path)]
    slices = [s for s in slices if "SliceLocation" in s]
    slices.sort(key=lambda s: int(s.InstanceNumber))

    # A spacing between neighbours needs at least two slices.
    if len(slices) < 2:
        return slices

    try:
        slice_thickness = np.abs(
            slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except (AttributeError, IndexError, TypeError):
        # Fall back to SliceLocation when the position element is missing or malformed.
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)

    for s in slices:
        s.SliceThickness = slice_thickness

    return slices

In [None]:
def convert_to_hu(slices):
    """Stack the pixel arrays of ``slices`` and apply the DICOM rescale.

    Args:
        slices: Non-empty sequence of objects exposing ``pixel_array``; the
            first one must also expose ``RescaleIntercept`` and ``RescaleSlope``.

    Returns:
        int16 array of shape ``(len(slices), rows, cols)`` holding
        ``pixel * slope + intercept``, using the rescale values of the first slice.
    """
    image = np.stack([s.pixel_array for s in slices]).astype(np.int16)

    # Pixels equal to -2000 are reset to 0 before rescaling
    # (presumably the scanner's out-of-field padding value — confirm).
    image[image == -2000] = 0

    # Fixed: the original read these from the global `scans` rather than from
    # the `slices` argument, so every call used whichever scan was loaded last.
    intercept = slices[0].RescaleIntercept
    slope = slices[0].RescaleSlope

    if slope != 1:
        image = (slope * image.astype(np.float64)).astype(np.int16)

    image += np.int16(intercept)

    return np.array(image, dtype=np.int16)

In [None]:
# Load the scan of the first training row and convert it with convert_to_hu.
ex = train_df.dcm_path.values[0]
scans = load_scan(ex)
hu_scans = convert_to_hu(scans)

# Show one slice (index 13) in grayscale.
plt.figure()
plt.imshow(hu_scans[13], cmap=plt.cm.gray)
plt.show()

In [None]:
def resample(image, scan, new_spacing=(1, 1, 1)):
    """Resample a volume so its voxel spacing approximates ``new_spacing``.

    Args:
        image: 3-D array ordered like ``(slice, row, col)``.
        scan: Sequence whose first item exposes ``SliceThickness`` and a
            two-element ``PixelSpacing``.
        new_spacing: Target spacing per axis, in the same units as the scan
            header values.

    Returns:
        ``(resampled_image, achieved_spacing)``. The achieved spacing can
        differ slightly from ``new_spacing`` because the output shape is
        rounded to whole voxels.
    """
    spacing = np.array(
        [scan[0].SliceThickness] + list(scan[0].PixelSpacing), dtype=np.float32)
    resize_factor = spacing / np.asarray(new_spacing)
    # Fixed: the original divided the shape by the factor, which shrank the
    # volume when it should grow (and vice versa) and returned a spacing far
    # from the requested one. A coarser source spacing needs more voxels.
    new_real_shape = np.array(image.shape) * resize_factor
    new_shape = np.round(new_real_shape)
    real_resize_factor = new_shape / np.array(image.shape)
    achieved_spacing = spacing / real_resize_factor

    # ndimage.zoom is the public name; the ndimage.interpolation namespace is deprecated.
    image = ndimage.zoom(image, real_resize_factor, mode="nearest")

    return image, achieved_spacing

In [None]:
# Resample the example volume and compare the shapes before and after.
im_res, spacing = resample(hu_scans, scans, [1,1,1])
hu_scans.shape, im_res.shape

In [None]:
# Side-by-side view of an original and a resampled slice.
# NOTE(review): the two panels use different slice indices (20 vs 1) — confirm intended.
fig, ax = plt.subplots(1, 2, figsize=(11, 7))
ax[0].imshow(hu_scans[20], cmap=plt.cm.gray)
ax[1].imshow(im_res[1], cmap=plt.cm.gray)
plt.show()

In [None]:
def get_multival_vals(feature):
    """Return ``feature`` as an int, using its first item if it is multi-valued.

    Args:
        feature: Either a ``dicom.multival.MultiValue`` or a single value
            that ``int()`` accepts.

    Returns:
        int: ``int(feature[0])`` for a MultiValue, otherwise ``int(feature)``.
    """
    # np.int was only an alias of the builtin and has been removed from NumPy.
    if isinstance(feature, dicom.multival.MultiValue):
        return int(feature[0])
    return int(feature)

In [None]:
def generate_markers(image):
    """Build internal, external and combined watershed markers for one slice.

    Args:
        image: 2-D array of one slice (assumed to hold Hounsfield units, as
            produced by convert_to_hu — confirm).

    Returns:
        Tuple ``(marker_internal, marker_external, marker_watershed)``:
        two boolean masks and an integer image in which internal pixels are
        255, external pixels are 128 and everything else is 0.
    """
    # Internal marker: pixels below -400 that do not touch the image border.
    marker_internal = segmentation.clear_border(image < -400)
    marker_internal_labels = measure.label(marker_internal)
    regions = measure.regionprops(marker_internal_labels)
    areas = sorted(r.area for r in regions)
    if len(areas) > 2:
        # Keep only regions at least as large as the second largest one.
        for region in regions:
            if region.area < areas[-2]:
                coords = region.coords
                marker_internal_labels[coords[:, 0], coords[:, 1]] = 0
    marker_internal = marker_internal_labels > 0

    # External marker: a ring between two dilations of the internal marker.
    external_a = ndimage.binary_dilation(marker_internal, iterations=10)
    external_b = ndimage.binary_dilation(marker_internal, iterations=55)
    marker_external = external_b ^ external_a

    # np.int has been removed from NumPy; the builtin int is the same dtype.
    marker_watershed = np.zeros(image.shape, dtype=int)
    marker_watershed += marker_internal * 255
    marker_watershed += marker_external * 128

    return marker_internal, marker_external, marker_watershed

In [None]:
# Compute the three marker images for slice 13 of the example volume.
patient_internal, patient_external, patient_watershed = generate_markers(hu_scans[13])

# Show them next to each other.
fig, ax = plt.subplots(1, 3, figsize=(17, 6))
ax[0].set_title("Internel marker")
ax[0].imshow(patient_internal, cmap="gray")
ax[1].set_title("External Marker")
ax[1].imshow(patient_external, cmap="gray")
ax[2].set_title("watershed image")
ax[2].imshow(patient_watershed, cmap="gray")

plt.show()

In [None]:
def separate_lungs(image):
    """Mask one slice so that only the segmented lung area keeps its values.

    The slice is segmented with a marker-based watershed on its Sobel
    gradient; the watershed outline is thickened with a black top-hat,
    merged with the internal marker and closed morphologically.

    Args:
        image: 2-D array of one slice, as accepted by generate_markers.

    Returns:
        Array of the same shape as ``image`` where pixels inside the lung
        filter keep their value and all others are set to -2000.
    """
    marker_internal, marker_external, marker_watershed = generate_markers(image)

    # Gradient magnitude, scaled so that its maximum is 255.
    sobel_filtered_dx = ndimage.sobel(image, 0)
    sobel_filtered_dy = ndimage.sobel(image, 1)
    sobel_gradient = np.hypot(sobel_filtered_dx, sobel_filtered_dy)
    sobel_gradient *= 255.0 / np.max(sobel_gradient)

    # watershed lives in skimage.segmentation; the skimage.morphology alias
    # used originally has been removed from scikit-image.
    watershed = segmentation.watershed(sobel_gradient, marker_watershed)

    # Boundary between the watershed regions.
    outline = ndimage.morphological_gradient(watershed, size=(3, 3))
    outline = outline.astype(bool)

    blackhat_structure = [[0, 0, 1, 1, 1, 0, 0],
                          [0, 1, 1, 1, 1, 1, 0],
                          [1, 1, 1, 1, 1, 1, 1],
                          [1, 1, 1, 1, 1, 1, 1],
                          [1, 1, 1, 1, 1, 1, 1],
                          [0, 1, 1, 1, 1, 1, 0],
                          [0, 0, 1, 1, 1, 0, 0]]

    # Grow the structuring element, then add the top-hat result to the outline.
    blackhat_structure = ndimage.iterate_structure(blackhat_structure, iterations=7)
    outline += ndimage.black_tophat(outline, structure=blackhat_structure)

    lungfilter = np.bitwise_or(marker_internal, outline)
    # ndimage.binary_closing is the public name; ndimage.morphology is deprecated.
    lungfilter = ndimage.binary_closing(lungfilter, structure=np.ones((5, 5)), iterations=3)

    segmented = np.where(lungfilter == 1, image, -2000 * np.ones(image.shape))

    return segmented

In [None]:
# Segment slice 13 of the example volume.
train_segmented = separate_lungs(hu_scans[13])

In [None]:
# Display the segmented slice.
plt.figure(figsize=(7, 7))
plt.title("Segmented Lung")
plt.imshow(train_segmented, cmap=plt.cm.gray)


plt.show()

In [None]:
def img_hu_processing(patient_df, out_path="imgs.npz"):
    """Convert every patient's scan with convert_to_hu and save all volumes.

    Scans with 30 or more slices are thinned first: the outer 10% at each end
    are skipped and every ``2 * int(n / 10)``-th slice in between is kept.

    Args:
        patient_df: DataFrame with "patient" and "patient_path" columns.
        out_path: Target ``.npz`` file; it holds one array per successfully
            processed patient, keyed by the patient id.

    Returns:
        None. Nothing is written if no patient could be processed.
    """
    volumes = {}
    for patient in tqdm.tqdm(patient_df["patient"].values):
        try:
            path = patient_df.loc[patient_df["patient"] == patient].patient_path.values[0]
            scans = load_scan(path)
            n = len(scans)
            if n >= 30:
                step = int(n / 10.0) * 2
                scans = scans[int(n * 0.1):int(n * 0.9):step]
            volumes[str(patient)] = convert_to_hu(scans)
        except Exception as exc:
            # Best effort: a broken scan must not stop the run, but it is
            # reported instead of being skipped silently.
            print(f"Skipping patient {patient}: {exc!r}")

    # Fixed: the original looped over the characters of the path string and
    # rewrote "imgs.npz" each time, so only the last patient's volume survived.
    if volumes:
        np.savez(out_path, **volumes)
            

In [None]:
# Process the scans of all patients listed in patient_df.
img_hu_processing(patient_df)

In [None]:
# Re-open the saved archive and list the array names it holds.
dict_a = np.load("imgs.npz")
print(dict_a.keys())

In [None]:
# Preview the per-patient path table.
patient_df.head()

In [None]:
# Column name -> position lookup for the training table.
column_indices = {name: i for i, name in enumerate(train_df.columns)}

n = len(train_df)

# Split by row order: first 70% train, next 20% validation, last 10% test.
train_ds = train_df[0:int(n*0.7)]
val_ds = train_df[int(n*0.7):int(n*0.9)]
test_ds = train_df[int(n*0.9):]

# Number of columns in the training table.
num_features = train_df.shape[1]

In [None]:
train_df.describe().transpose()

# Statistics are taken from the training split only and reused for all splits.
# NOTE(review): train_df still appears to hold non-numeric columns (e.g.
# Patient, dcm_path); confirm mean()/std() and the arithmetic below behave as
# intended on the installed pandas version.
train_mean = train_ds.mean()
train_std = train_ds.std()

# Standardise every split with the training mean and standard deviation.
train_ds = (train_ds - train_mean) / train_std
val_ds = (val_ds - train_mean) / train_std
test_ds = (test_ds - train_mean) / train_std

In [None]:
class WindowGenerator:
    """Describe a sliding window of inputs followed by labels.

    A window spans ``input_width + shift`` consecutive rows. The first
    ``input_width`` rows are inputs and the last ``label_width`` rows are
    labels. The three data splits default to the module-level
    ``train_ds``, ``val_ds`` and ``test_ds`` as they exist when the class is
    defined.
    """

    def __init__(self, input_width, label_width, shift,
                 train_ds=train_ds, val_ds=val_ds, test_ds=test_ds,
                 label_columns=None):
        # Data splits.
        self.train_ds, self.val_ds, self.test_ds = train_ds, val_ds, test_ds

        # Label lookup is only created when label columns were given.
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {
                column: position for position, column in enumerate(label_columns)}

        self.column_indices = {
            column: position for position, column in enumerate(train_ds.columns)}

        # Window geometry.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        self.total_window_size = input_width + shift

        positions = np.arange(self.total_window_size)

        # Inputs occupy the head of the window.
        self.input_slice = slice(0, input_width)
        self.input_indices = positions[self.input_slice]

        # Labels occupy the tail of the window.
        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = positions[self.labels_slice]

    def __repr__(self):
        lines = [
            f"TotalWindowSpread: {self.total_window_size}",
            f"Total Indices: {self.input_indices}",
            f"Label Indices: {self.label_indices}",
            f"Label Name: {self.label_columns}",
        ]
        return "\n".join(lines)

In [None]:
# 30 input rows, predicting a single FVC value 30 rows after the inputs end.
w1 = WindowGenerator(input_width=30, label_width=1, shift=30, label_columns=["FVC"])
w1

In [None]:
# 5 input rows, predicting the FVC value of the row right after them.
w2 = WindowGenerator(input_width=5, label_width=1, shift=1, label_columns=["FVC"])
w2

In [None]:
def split_window(self, features):
    """Cut a batch of full windows into its input part and its label part.

    Args:
        features: Tensor indexed as ``[batch, time, feature]`` holding
            complete windows.

    Returns:
        ``(inputs, labels)``. When ``self.label_columns`` is set, labels keep
        only those columns, in the given order.
    """
    inputs = features[:, self.input_slice, :]
    labels = features[:, self.labels_slice, :]

    if self.label_columns is not None:
        picked = []
        for name in self.label_columns:
            picked.append(labels[:, :, self.column_indices[name]])
        labels = tf.stack(picked, axis=-1)

    # Slicing loses the static time dimension; restore it explicitly.
    inputs.set_shape([None, self.input_width, None])
    labels.set_shape([None, self.label_width, None])

    return inputs, labels


# Attach the function to the class as a method.
WindowGenerator.split_window = split_window

In [None]:
# Stack three windows of the training split into one example batch.
# NOTE(review): all three slices start at row 0, so the batch holds the same
# window three times — confirm intended.
ex_window = tf.stack([np.array(train_ds[:w2.total_window_size]),
                      np.array(train_ds[:+w2.total_window_size]),
                      np.array(train_ds[:+w2.total_window_size])])


# Split the batch into inputs and labels and report the shapes.
ex_inputs, ex_labels = w2.split_window(ex_window)

print(f"Window shape: {ex_window.shape}")
print(f"Inputs shape: {ex_inputs.shape}")
print(f"Labels shape: {ex_labels.shape}")

In [None]:
# Keep the example batch on the window object so plot() can use it.
w2.example = ex_inputs, ex_labels

In [None]:
def plot(self, model=None, plot_column="FVC", max_subplots=3):
    """Plot inputs, labels and optional predictions of the example batch.

    Args:
        model: Optional callable mapping the example inputs to predictions
            indexed as ``[batch, time, feature]``.
        plot_column: Name of the column to draw.
        max_subplots: Upper bound on the number of batch items drawn, one
            subplot each.
    """
    inputs, labels = self.example
    plt.figure(figsize=(11, 7))
    plot_column_index = self.column_indices[plot_column]
    max_n = min(max_subplots, len(inputs))

    # Position of the plotted column inside the label tensor, if present.
    if self.label_columns:
        label_column_index = self.label_columns_indices.get(plot_column, None)
    else:
        label_column_index = plot_column_index

    # Run the model once for the whole batch instead of once per subplot.
    predictions = None
    if model is not None and label_column_index is not None and max_n > 0:
        predictions = model(inputs)

    for n in range(max_n):
        # Fixed: the grid was hard-coded to 3 rows, which failed for
        # max_subplots > 3 and left empty rows for smaller batches.
        plt.subplot(max_n, 1, n + 1)
        plt.ylabel(f"{plot_column} [normed]")
        plt.plot(self.input_indices, inputs[n, :, plot_column_index],
                 label="Inputs", marker=".", zorder=-10)

        # Nothing more to draw when the column is not among the labels.
        if label_column_index is None:
            continue

        plt.scatter(self.label_indices, labels[n, :, label_column_index],
                    edgecolors="k", label="Labels", c="#2ca02c", s=64)
        if predictions is not None:
            plt.scatter(self.label_indices, predictions[n, :, label_column_index],
                        marker="X", edgecolors="k", label="Predictions",
                        c="#ff7f0e", s=64)

        if n == 0:
            plt.legend()

    plt.xlabel("Weeks")


# Attach the function to the class as a method.
WindowGenerator.plot = plot

In [None]:
# Draw the example batch of w2 without predictions.
w2.plot()

In [None]:
def create_dataset(self, data):
    """Turn a table into a shuffled dataset of ``(inputs, labels)`` batches.

    Args:
        data: Array-like convertible to a float32 array, one row per time step.

    Returns:
        Dataset of windows of length ``self.total_window_size`` (stride 1,
        batch size 32), each batch split by ``self.split_window``.
    """
    values = np.array(data, dtype=np.float32)
    windows = tf.keras.preprocessing.timeseries_dataset_from_array(
        data=values,
        targets=None,
        sequence_length=self.total_window_size,
        sequence_stride=1,
        shuffle=True,
        batch_size=32)

    return windows.map(self.split_window)


# Attach the function to the class as a method.
WindowGenerator.create_dataset = create_dataset

In [None]:
@property
def train(self):
    """Windowed dataset built from the training split."""
    return self.create_dataset(self.train_ds)


@property
def val(self):
    """Windowed dataset built from the validation split."""
    return self.create_dataset(self.val_ds)


@property
def test(self):
    """Windowed dataset built from the test split."""
    return self.create_dataset(self.test_ds)


@property
def example(self):
    """One ``(inputs, labels)`` batch, fetched once and then cached."""
    cached = getattr(self, "_example", None)
    if cached is not None:
        return cached
    # Nothing cached yet: take the first training batch and remember it.
    cached = next(iter(self.train))
    self._example = cached
    return cached


# Expose the four properties on the class.
WindowGenerator.train = train
WindowGenerator.val = val
WindowGenerator.test = test
WindowGenerator.example = example

In [None]:
# Structure (shapes and dtypes) of the elements of the training dataset.
w2.train.element_spec

In [None]:
# Pull one batch from the training windows and report the tensor shapes.
for example_inputs, example_labels in w2.train.take(1):
    print(f"exInput shape: {example_inputs.shape}")
    print(f"exLabel shape: {example_labels.shape}")
    

In [None]:
# One input row predicting the FVC value of the next row.
singlestep_wind = WindowGenerator(input_width=1, label_width=1, shift=1, label_columns=["FVC"])
singlestep_wind

In [None]:
# Report the batch shapes produced by the single-step window.
for example_inputs, example_labels in singlestep_wind.train.take(1):
    print(f"Inputs: {example_inputs.shape}")
    print(f"ouputs: {example_labels.shape}")

In [None]:
class Baseline(tf.keras.Model):
    """Model without weights that returns its input as the prediction.

    With ``label_index`` set, only that feature is returned, keeping a
    trailing feature axis of size 1; otherwise the input is returned as is.
    """

    def __init__(self, label_index=None):
        super().__init__()
        self.label_index = label_index

    def call(self, inputs):
        if self.label_index is not None:
            selected = inputs[:, :, self.label_index]
            # Restore the feature axis dropped by the integer index.
            return selected[:, :, tf.newaxis]
        return inputs

In [None]:
# Upper bound on training epochs; early stopping usually ends training sooner.
max_epochs = 100
# Stop once the validation loss has not improved for two consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                                                  patience=2,
                                                  mode="min")


def compile_and_fit(model, window):
    """Compile ``model`` with MSE loss and Adam, then train it on ``window``.

    Args:
        model: Keras model to train (modified in place).
        window: Object exposing ``train`` and ``val`` datasets.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    model.compile(loss=tf.keras.losses.MeanSquaredError(),
                  # `lr` is a deprecated alias of `learning_rate`.
                  optimizer=tf.keras.optimizers.Adam(
                      learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8),
                  metrics=["mae", "mse"])
    # Fixed: the original discarded the result of fit() and returned the
    # model itself under the name `history`.
    history = model.fit(window.train, epochs=max_epochs,
                        validation_data=window.val,
                        callbacks=[early_stopping])
    return history

In [None]:
# Baseline that returns the current FVC value as the prediction.
baseline = Baseline(label_index=column_indices["FVC"])

baseline.compile(loss=tf.keras.losses.MeanSquaredError(),
                 metrics=["mse"])

# Evaluation results per model name, for the validation and test splits.
val_performance = {}
performance = {}

val_performance["Baseline"] = baseline.evaluate(singlestep_wind.val)
performance["Baseline"] = baseline.evaluate(singlestep_wind.test, verbose=0)

In [None]:
# 30 input rows with 30 FVC labels, each label one row ahead of its input.
wide_wind = WindowGenerator(input_width=30, label_width=30, shift=1, label_columns=["FVC"])
wide_wind

In [None]:
# Compare the shape going into and coming out of the baseline model.
print("Input shape:", singlestep_wind.example[0].shape)
print("output shape:", baseline(singlestep_wind.example[0]).shape)

In [None]:
# Plot the baseline predictions on the wide window.
wide_wind.plot(baseline)

In [None]:
# LSTM that emits one output per time step, followed by a single-unit Dense layer.
lstm = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.Dense(units=1)])

In [None]:
# Compare the shape going into and coming out of the LSTM model.
print("Lstm Input shape:", wide_wind.example[0].shape)
print("Lstm Output shape:", lstm(wide_wind.example[0]).shape)

In [None]:
# Train the LSTM on the wide window.
history = compile_and_fit(lstm, wide_wind)

# Clear the training log from the cell output before evaluating.
IPython.display.clear_output()
val_performance["LSTM"] = lstm.evaluate(wide_wind.val)
performance["LSTM"] = lstm.evaluate(wide_wind.test, verbose=0)

In [None]:
# Plot the LSTM predictions on the wide window.
wide_wind.plot(lstm)

In [None]:
# Print the second entry of each evaluation result.
# NOTE(review): the baseline was compiled with metrics=["mse"] and the LSTM with
# ["mae", "mse"], so value[1] may not be the same metric for both — confirm.
for name, value in performance.items():
    print(f"{name:12s}: {value[1]:0.4f}")
    

In [None]:
# Number of future rows predicted by the multi-step models.
output_steps = 140
# 140 input rows followed by 140 label rows; no label_columns, so labels keep every column.
multi_window = WindowGenerator(input_width=140,
                               label_width=output_steps,
                               shift=output_steps)

multi_window.plot()
multi_window

In [None]:
class MultiStepLastBaseline(tf.keras.Model):
    """Baseline that repeats the last input time step ``output_steps`` times."""

    def call(self, inputs):
        return tf.tile(inputs[:, -1:, :], [1, output_steps, 1])
    

last_baseline = MultiStepLastBaseline()
last_baseline.compile(loss=tf.keras.losses.MeanSquaredError(),
                      metrics=["mse", "mae"])


# Evaluation results of the multi-step models, per model name.
multi_val_performance = {}
multi_performance = {}

multi_val_performance["Last"] = last_baseline.evaluate(multi_window.val)
multi_performance["Last"] = last_baseline.evaluate(multi_window.test, verbose=0)
multi_window.plot(last_baseline)

In [None]:
class RepeatBaseline(tf.keras.Model):
    """Baseline that returns the input window unchanged as its prediction."""

    def call(self, inputs):
        return inputs

    
repeat_baseline = RepeatBaseline()
# NOTE(review): Accuracy is tracked next to MSE although the loss is a regression loss — confirm it is wanted.
repeat_baseline.compile(loss=tf.keras.losses.MeanSquaredError(),
                        metrics=["mse", tf.keras.metrics.Accuracy()])

multi_val_performance["Repeat"] = repeat_baseline.evaluate(multi_window.val)
multi_performance["Repeat"] = repeat_baseline.evaluate(multi_window.test, verbose=0)
multi_window.plot(repeat_baseline)

In [None]:
# Single-shot model: summarise the input window with an LSTM, then predict
# all output steps and features at once and reshape them into a sequence.
multi_lstm = tf.keras.models.Sequential([
    # Fixed: only the final LSTM state is needed. With return_sequences=True
    # the Dense layer produced output_steps * num_features values for every
    # input step, which the Reshape below cannot fold into
    # [output_steps, num_features].
    tf.keras.layers.LSTM(32, return_sequences=False),
    tf.keras.layers.Dense(output_steps*num_features, kernel_initializer=tf.initializers.zeros),
    tf.keras.layers.Reshape([output_steps, num_features])])

history = compile_and_fit(multi_lstm, multi_window)

# Clear the training log from the cell output before evaluating.
IPython.display.clear_output()


multi_val_performance["LSTM"] = multi_lstm.evaluate(multi_window.val)
multi_performance["LSTM"] = multi_lstm.evaluate(multi_window.test, verbose=0)
multi_window.plot(multi_lstm)

In [None]:
class FeedBack(tf.keras.Model):
    """Model holding an LSTM cell, an RNN wrapper around it and a Dense head.

    Only the layers are created here; ``warmup`` and ``call`` are attached
    to the class elsewhere in the file.
    """

    def __init__(self, units, output_steps):
        super().__init__()
        self.output_steps = output_steps
        self.units = units

        # The bare cell is kept so it can be stepped manually.
        cell = tf.keras.layers.LSTMCell(units)
        self.lstm_cell = cell
        # The wrapper runs the same cell over a sequence and also returns its state.
        self.lstm_rnn = tf.keras.layers.RNN(cell, return_state=True)
        # Projects the LSTM output back to one value per feature.
        self.dense = tf.keras.layers.Dense(num_features)

In [None]:
# Feedback model with 32 LSTM units, predicting output_steps steps.
feedback_lstm = FeedBack(units=32, output_steps=output_steps)

In [None]:
def warmup(self, inputs):
    """Run the RNN over ``inputs`` and produce the first prediction.

    Args:
        inputs: Batch of input windows accepted by ``self.lstm_rnn``.

    Returns:
        ``(prediction, state)``: the Dense projection of the RNN output and
        the list of RNN states that came with it.
    """
    rnn_result = self.lstm_rnn(inputs)
    # First element is the output; everything after it is state.
    output = rnn_result[0]
    state = list(rnn_result[1:])

    return self.dense(output), state


# Attach the function to the class as a method.
FeedBack.warmup = warmup

In [None]:
# Run the warm-up step on the example inputs and inspect the prediction shape.
prediction, state = feedback_lstm.warmup(multi_window.example[0])
prediction.shape

In [None]:
def call(self, inputs, training=None):
    """Predict ``self.output_steps`` steps, feeding each prediction back in.

    Args:
        inputs: Batch of input windows passed to ``self.warmup``.
        training: Forwarded to the LSTM cell.

    Returns:
        Tensor of predictions ordered as ``[batch, time, feature]``.
    """
    # The warm-up supplies the first prediction and the initial state.
    prediction, state = self.warmup(inputs)
    collected = [prediction]

    # Each remaining step uses the previous prediction as its input.
    for _ in range(self.output_steps - 1):
        cell_out, state = self.lstm_cell(prediction, states=state, training=training)
        prediction = self.dense(cell_out)
        collected.append(prediction)

    # Stacking gives [time, batch, feature]; swap the first two axes.
    stacked = tf.stack(collected)
    return tf.transpose(stacked, [1, 0, 2])


# Attach the function to the class as a method.
FeedBack.call = call

In [None]:
# Shape of the feedback model's output for the example inputs.
print("Batch, Time, Features:", feedback_lstm(multi_window.example[0]).shape)

In [None]:
# Train the feedback model on the multi-step window.
history = compile_and_fit(feedback_lstm, multi_window)

# Clear the training log from the cell output before evaluating.
IPython.display.clear_output()

multi_val_performance["AR LSTM"] = feedback_lstm.evaluate(multi_window.val)
multi_performance["AR LSTM"] = feedback_lstm.evaluate(multi_window.test, verbose=0)
multi_window.plot(feedback_lstm)

In [None]:
# Print the second entry of each multi-step evaluation result.
# NOTE(review): the models were compiled with different metric lists, so
# value[1] may not be the same metric for every row — confirm.
for name, value in multi_performance.items():
    print(f"{name:12s}: {value[1]:0.4f}")

In [None]:
# Preview the prepared test table.
test_df.head()

In [None]:
# Preview the submission table.
sub_df.head()

In [None]:
def score(FVC_true, FVC_pred, sigma):
    """Mean of ``sqrt(2) * delta / sigma + log(sqrt(2) * sigma)`` with clipping.

    ``sigma`` is clipped from below at 70 and the absolute error ``delta`` is
    clipped from above at 1000 before the formula is applied. The value is
    returned with a positive sign, so lower is better.

    Args:
        FVC_true: True values (scalar or array-like).
        FVC_pred: Predicted values, broadcastable against ``FVC_true``.
        sigma: Predicted uncertainty, broadcastable against the others.

    Returns:
        float: mean of the per-element metric.
    """
    # Fixed: np.max / np.min treat their second positional argument as an
    # axis; element-wise clipping needs np.maximum / np.minimum.
    sigma_clipped = np.maximum(sigma, 70)
    delta = np.abs(np.asarray(FVC_true) - np.asarray(FVC_pred))
    delta = np.minimum(delta, 1000)
    sq_2 = np.sqrt(2)
    metric = (delta / sigma_clipped) * sq_2 + np.log(sigma_clipped * sq_2)

    return np.mean(metric)