In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt


# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Root folder of the competition data: the CSV tables plus the train/ and test/ scan folders.
IMAGE_PATH = "../input/osic-pulmonary-fibrosis-progression"
X_full = pd.read_csv(IMAGE_PATH + "/train.csv")
X_test_full = pd.read_csv(IMAGE_PATH + "/test.csv")

# Drop every row that occurs more than once in train + test combined, so that
# measurements present in both CSVs are not used for training.
# NOTE(review): keep=False also drops exact duplicates inside train.csv itself and
# keeps any row that exists only in test.csv -- confirm that this is the intent.
X_concat = pd.concat([X_full, X_test_full])
X_dropDups = X_concat.drop_duplicates(keep=False)

# concat keeps each frame's own index labels and dropping rows leaves gaps, so the
# labels no longer match row positions. Renumber them so that label-based
# (.loc / .idxmax) and positional (.iloc) lookups on train_df agree.
train_df = X_dropDups.reset_index(drop=True)
test_df = X_test_full

#add pixel_array to dataset
import cv2
import math
import pydicom

def chunks(l,n):
    """Lazily yield consecutive slices of ``l`` that are ``n`` items long.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    # Credit: Ned Batchelder
    # Link: http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

#Credit: Sentdex
#Link: https://www.kaggle.com/sentdex/first-pass-through-data-w-3d-convnet
def mean(l):
    """Return the arithmetic mean of ``l``: the sum of its items divided by their count."""
    total = sum(l)
    return total / len(l)

# Side length, in pixels, that create_df resizes every CT slice to (IMG_PX_SIZE x IMG_PX_SIZE).
IMG_PX_SIZE = 50
# Number of averaged slices create_df keeps per patient scan.
HM_SLICES = 20
# Folder whose entries create_df treats as one scan folder per patient (training scans).
data_dir = '../input/osic-pulmonary-fibrosis-progression/train/'


#create new dataframe for storing patient IDs and CT pixel array

def create_df(data_dir,IMG_PX_SIZE,HM_SLICES):
    """Build a table with one fixed-size CT volume per patient folder.

    Every entry of ``data_dir`` is treated as a patient folder of DICOM files.
    The slices are ordered by ``InstanceNumber``, resized to
    ``IMG_PX_SIZE`` x ``IMG_PX_SIZE``, split into consecutive groups with
    ``chunks`` and each group is averaged into a single slice. Scans that
    yield fewer than ``HM_SLICES`` groups are padded by repeating the last
    averaged slice.

    Args:
        data_dir: Folder containing one sub-folder per patient.
        IMG_PX_SIZE: Target width and height of every slice, in pixels.
        HM_SLICES: Number of averaged slices to keep per patient
            (expected to be a positive integer).

    Returns:
        pandas.DataFrame with the columns ``'Patient'`` (folder name) and
        ``'PixelArray'`` (array of the ``HM_SLICES`` averaged slices).
        Patients whose scan cannot be read or processed are reported on
        stdout and left out of the table.
    """
    patient_IDs = []
    CT_scans = []
    for patient in os.listdir(data_dir):
        # os.path.join works whether or not data_dir ends with a path separator.
        path = os.path.join(data_dir, patient)
        try:
            slices = [pydicom.dcmread(os.path.join(path, s)) for s in os.listdir(path)]
            slices.sort(key=lambda x: int(x.InstanceNumber))
            slices = [cv2.resize(np.array(each_slice.pixel_array), (IMG_PX_SIZE, IMG_PX_SIZE))
                      for each_slice in slices]
            if not slices:
                raise ValueError("no DICOM slices found")

            # ceil(len / HM_SLICES) items per group can never produce more than
            # HM_SLICES groups, so only the "too few" case needs handling below.
            chunk_size = math.ceil(len(slices) / HM_SLICES)

            # np.mean accumulates integer input in float64. Adding the slices up in
            # their own dtype could wrap around if the pixel data is a narrow integer
            # type (assumed to be common for DICOM pixel data -- TODO confirm).
            new_slices = [np.mean(np.stack(slice_chunk), axis=0)
                          for slice_chunk in chunks(slices, chunk_size)]

            # Pad short scans by repeating the last averaged slice.
            new_slices.extend([new_slices[-1]] * (HM_SLICES - len(new_slices)))
        except Exception as e:
            # Some scans cannot be decoded; skip them, but say which ones so that
            # missing patients are not a silent surprise later on.
            print("[create_df] skipping {}: {}: {}".format(patient, type(e).__name__, e))
            continue

        CT_scans.append(np.array(new_slices))
        patient_IDs.append(patient)

    return pd.DataFrame({'Patient': patient_IDs,
                         'PixelArray': CT_scans})

In [None]:
def getBaselineFVC(data_dir,df):
    """Look up the first listed FVC of every patient folder found in ``data_dir``.

    Args:
        data_dir: Folder whose entry names are patient IDs.
        df: Table with at least the columns ``'Patient'`` and ``'FVC'``. The
            first row of each patient (in the order of ``df``) is taken as
            that patient's baseline measurement.

    Returns:
        pandas.DataFrame with one row per entry of ``data_dir`` and the columns
        ``'Patient'`` and ``'BaselineFVC'``. Patients that have no row in
        ``df`` get NaN instead of some other patient's value.
    """
    patients = os.listdir(data_dir)
    # Select by patient ID rather than by row position: the index labels of df
    # are not guaranteed to equal row positions (e.g. after rows were dropped).
    first_fvc = (
        df.drop_duplicates(subset='Patient', keep='first')
          .set_index('Patient')['FVC']
    )
    # reindex aligns on the folder names and yields NaN for unknown patients.
    baseline = first_fvc.reindex(patients)
    return pd.DataFrame({'Patient': patients,
                         'BaselineFVC': baseline.to_numpy()})

# Per-patient baseline FVC table for the folders under train/, looked up in train_df.
baselineFVC_train_df = getBaselineFVC('../input/osic-pulmonary-fibrosis-progression/train/',train_df)

        

In [None]:

#X_trainingData = pd.merge(X_trainingData, pixel_df, on='Patient')
# Build the per-patient CT volume tables (columns 'Patient' and 'PixelArray')
# for the training scans and for the test scans.
train_df_pix = create_df(data_dir,IMG_PX_SIZE,HM_SLICES)
data_dir_test = '../input/osic-pulmonary-fibrosis-progression/test/'
test_df_pix = create_df(data_dir_test,IMG_PX_SIZE,HM_SLICES)

In [None]:
# Preview the training table before the CT volumes and baseline FVC are merged in.
train_df.head()

In [None]:
# Attach each patient's CT volume and baseline FVC to the training rows, and the
# CT volume to the test rows. These are inner joins on the patient ID, so rows of
# patients without a volume (or baseline) are dropped.
train_df = (
    train_df
    .merge(train_df_pix, on='Patient')
    .merge(baselineFVC_train_df, on='Patient')
)
test_df = test_df.merge(test_df_pix, on='Patient')

In [None]:
# Preview the training table; once the merge cell has run it also carries the
# 'PixelArray' and 'BaselineFVC' columns.
train_df.head()

In [None]:
# Preview the test table; once the merge cell has run it also carries the 'PixelArray' column.
test_df.head()

In [None]:
#source: https://www.pyimagesearch.com/2019/02/04/keras-multiple-inputs-and-mixed-data/
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy import sparse

def process_patient_attributes(df,train,test,
                               continuous=('Weeks','BaselineFVC','Age'),
                               categorical=('Sex','SmokingStatus')):
    """Turn the tabular patient attributes into numeric feature matrices.

    Continuous columns are min-max scaled to [0, 1] with a scaler fitted on
    ``train`` only. Categorical columns are one-hot encoded with an encoder
    fitted on ``df`` so that every category present in ``df`` has a column.

    ``'FVC'`` is deliberately not part of the default ``continuous`` columns:
    the notebook uses the FVC column of the very same rows as the regression
    target, so feeding it in as a feature would leak the answer into the
    model. Pass ``continuous`` explicitly to use a different column set.

    Args:
        df: Table used to fit the one-hot encoder (all rows).
        train: Rows used to fit the scaler and to build the first matrix.
        test: Rows transformed with the fitted scaler/encoder.
        continuous: Names of the numeric columns to scale.
        categorical: Names of the categorical columns to one-hot encode.

    Returns:
        Tuple ``(trainX, testX)`` of dense 2-D arrays; in each, the one-hot
        columns come first, followed by the scaled continuous columns.
    """
    # A tuple used as a DataFrame key would be read as a single column label.
    continuous = list(continuous)
    categorical = list(categorical)

    # Scale each continuous column to [0, 1]; the ranges come from `train` only.
    cs = MinMaxScaler()
    trainContinuous = cs.fit_transform(train[continuous])
    testContinuous = cs.transform(test[continuous])

    # One-hot encode the categorical columns (all outputs are 0 or 1).
    binarizer = OneHotEncoder().fit(df[categorical])
    trainCategorical = binarizer.transform(train[categorical])
    testCategorical = binarizer.transform(test[categorical])

    # Concatenate categorical and continuous features column-wise.
    trainX = sparse.hstack([trainCategorical,trainContinuous]).toarray()
    testX = sparse.hstack([testCategorical,testContinuous]).toarray()
    return (trainX,testX)

In [None]:
# import the necessary packages
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv3D
from tensorflow.keras.layers import MaxPooling3D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

def create_mlp(dim, regress=False):
    """Build the small dense network for the tabular patient attributes.

    Args:
        dim: Number of input features.
        regress: When true, a single linear output unit is appended.

    Returns:
        A ``Sequential`` model: Dense(8, relu) -> Dense(4, relu), followed by
        Dense(1, linear) only when ``regress`` is true.
    """
    stack = [
        Dense(8, input_dim=dim, activation="relu"),
        Dense(4, activation="relu"),
    ]
    # The regression head is optional so the network can be used as a branch.
    if regress:
        stack.append(Dense(1, activation="linear"))
    return Sequential(stack)

def create_cnn(x, filters=(32, 64), regress=False):
    """Build the 3-D convolutional branch for the CT volumes as a Keras ``Model``.

    The network is (Conv3D 3x3x3 -> ReLU -> BatchNorm -> MaxPool 2x2x2) per
    entry of ``filters``, then Flatten -> Dense(16) -> ReLU -> BatchNorm ->
    Dropout(0.5) -> Dense(4) -> ReLU. It returns a ``Model`` so that its
    ``.input`` and ``.output`` can be wired into a larger functional model.

    Args:
        x: Sample volumes. Only ``x[0].shape`` is read, to size the input
            layer. When it cannot be read, the shape falls back to
            ``(HM_SLICES, IMG_PX_SIZE, IMG_PX_SIZE)``. A trailing channel
            axis of size 1 is added when the volume shape has three axes.
        filters: Number of filters of each convolutional stage.
        regress: When true, a single linear output unit is appended.

    Returns:
        ``tensorflow.keras.models.Model`` mapping one volume to 4 features
        (or to 1 value when ``regress`` is true).

    Raises:
        ValueError: If the volume shape does not have 3 or 4 axes.
    """
    try:
        volume_shape = tuple(int(d) for d in x[0].shape)
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # No usable sample: fall back to the notebook-level volume size.
        volume_shape = (HM_SLICES, IMG_PX_SIZE, IMG_PX_SIZE)
    if len(volume_shape) == 3:
        # Conv3D expects an explicit channel axis (channels-last).
        volume_shape = volume_shape + (1,)
    if len(volume_shape) != 4:
        raise ValueError(
            "expected volumes with 3 axes (or 4 including channels), got shape {}".format(volume_shape))

    inputs = Input(shape=volume_shape)
    features = inputs
    for n_filters in filters:
        # CONV => RELU => BN => POOL
        features = Conv3D(n_filters, (3, 3, 3), padding="same")(features)
        features = Activation("relu")(features)
        features = BatchNormalization(axis=-1)(features)
        # "same" pooling rounds odd sizes up instead of dropping the last voxel.
        features = MaxPooling3D(pool_size=(2, 2, 2), padding="same")(features)

    # Flatten the volume, then FC => RELU => BN => DROPOUT.
    features = Flatten()(features)
    features = Dense(16)(features)
    features = Activation("relu")(features)
    features = BatchNormalization(axis=-1)(features)
    features = Dropout(0.5)(features)

    # Second FC layer sized like the 4-unit output of create_mlp.
    features = Dense(4)(features)
    features = Activation("relu")(features)

    if regress:
        features = Dense(1, activation="linear")(features)

    return Model(inputs, features)

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import concatenate



In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Hold out 25% of the *patients* rather than 25% of the rows. train_df has several
# rows per patient that all carry the same CT volume, so a row-level split would put
# the same patient (and the same image) on both sides and inflate validation scores.
attribute_columns = ['Weeks','FVC','BaselineFVC','Age','Sex','SmokingStatus']
patient_splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_rows, valid_rows = next(patient_splitter.split(train_df, groups=train_df['Patient']))

# Same four pieces, in the same order, that train_test_split used to return.
split = [
    train_df.iloc[train_rows][attribute_columns],
    train_df.iloc[valid_rows][attribute_columns],
    train_df.iloc[train_rows]['PixelArray'],
    train_df.iloc[valid_rows]['PixelArray'],
]
trainAttrX, validAttrX, trainImagesX, validImagesX = split

In [None]:
# Regression targets: FVC divided by the largest FVC of the training split. The
# validation targets use the same training-derived divisor.
maxFVC = trainAttrX['FVC'].max()
trainY, validY = (np.asarray(part['FVC'] / maxFVC) for part in (trainAttrX, validAttrX))


In [None]:
# Swap the pandas objects for arrays. This cell overwrites its own inputs
# (trainAttrX / validAttrX stop being DataFrames), so run it once per split.
trainImagesX, validImagesX = trainImagesX.to_numpy(), validImagesX.to_numpy()
trainAttrX, validAttrX = process_patient_attributes(train_df, trainAttrX, validAttrX)

In [None]:
def _stack_volumes(volumes):
    """Stack equally-shaped volumes into one float32 array with a trailing channel axis.

    Raises ValueError (from ``np.stack``) when the volumes do not all share one shape.
    """
    stacked = np.stack([np.asarray(volume, dtype=np.float32) for volume in volumes])
    if stacked.ndim == 4:
        # (samples, d1, d2, d3) -> (samples, d1, d2, d3, 1): 3-D conv layers expect
        # an explicit channel axis. Already-stacked input is left as it is.
        stacked = stacked[..., np.newaxis]
    return stacked


# An object array holding one separate tensor per row is not a dense batch that
# model.fit / model.predict can consume, so build one contiguous array per split.
trainImagesX = _stack_volumes(trainImagesX)
validImagesX = _stack_volumes(validImagesX)
    

In [None]:
# Build the two branches: an MLP over the tabular attribute matrix and a CNN
# over the CT volumes. Neither gets its own regression head here.
mlp = create_mlp(dim=trainAttrX.shape[1], regress=False)
cnn = create_cnn(x=trainImagesX)

In [None]:
# Join the outputs of the MLP and CNN branches into one feature vector.
combinedInput = concatenate([mlp.output, cnn.output])
# Final head: one small ReLU layer followed by a single linear unit that
# produces the regression output.
fusion_hidden = Dense(4, activation="relu")(combinedInput)
x = Dense(1, activation="linear")(fusion_hidden)
# The combined model takes the attribute matrix on the MLP input and the CT
# volumes on the CNN input, and outputs one value per row (the scaled FVC).
model = Model(inputs=[mlp.input, cnn.input], outputs=x)

In [None]:
# Compile the model with mean absolute percentage error as the loss, i.e. minimise
# the absolute percentage difference between predicted and actual (scaled) FVC.
#
# Adam's `lr` and `decay` arguments are deprecated or removed in current Keras
# releases. The schedule below reproduces the old time-based decay:
#     lr(step) = 1e-3 / (1 + (1e-3 / 200) * step)
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=1e-3, decay_steps=1, decay_rate=1e-3 / 200)
opt = Adam(learning_rate=lr_schedule)
model.compile(loss="mean_absolute_percentage_error", optimizer=opt)
# train the model
print("[INFO] training model...")

# Keep the History object so the loss curves can be inspected after training.
history = model.fit(
    x=[trainAttrX, trainImagesX], y=trainY,
    validation_data=([validAttrX, validImagesX], validY),
    epochs=200, batch_size=8)



In [None]:
# Make predictions on the validation split (validAttrX / validImagesX).
# NOTE: the model is fit on FVC / maxFVC, so preds are on that scaled range;
# multiply by maxFVC to express them as FVC values.
print("[INFO] predicting FVC...")
preds = model.predict([validAttrX, validImagesX])