In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the preprocessed CSV from the NIH chest X-ray TFRecords input directory.
df= pd.read_csv('/kaggle/input/nih-chest-xrays-tfrecords/preprocessed_data.csv')
# Preview the first rows as the cell output.
df.head()

In [None]:
# Work on an independent copy of every column except the first, so the in-place
# encoding applied to dfk later operates on its own data instead of a slice of
# `df` (which would trigger pandas' SettingWithCopyWarning).
dfk = df.iloc[:, 1:].copy()

In [None]:
# Display the frame as the cell output.
dfk

In [None]:
# Split the column names of dfk into numeric and non-numeric ("char") groups.
cols = dfk.columns
# Public-API replacement for the private _get_numeric_data(): numeric and bool dtypes.
num_cols = dfk.select_dtypes(include=['number', 'bool']).columns
# Keep the frame's own column order; a set difference would list the names in
# an arbitrary order that can change from run to run.
char_cols = [name for name in cols if name not in num_cols]
char_cols

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single module-level encoder instance used by encode() below.
le= LabelEncoder()
def encode(df, columns=None):
    """Label-encode columns of ``df`` in place and return the same frame.

    Args:
        df: DataFrame whose columns are overwritten with the output of the
            shared encoder's ``fit_transform``.
        columns: Optional iterable of column names to encode. Defaults to
            every column of ``df``.

    Returns:
        The same ``df`` object, mutated in place.
    """
    # Read the columns from the frame itself rather than from a notebook-level
    # global, so the function works on any frame and keeps working if that
    # global is rebound to something else.
    targets = list(df.columns) if columns is None else list(columns)
    for name in targets:
        # The shared encoder is refitted for every column.
        df[name] = le.fit_transform(df[name])
    return df

In [None]:
# Encode dfk in place; the returned frame is shown as the cell output.
encode(dfk)

In [None]:
# Rebuild df from its original first column plus the encoded columns in dfk.
df=pd.concat([df.iloc[:,0],dfk], axis=1)

In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import IPython.display as display
import matplotlib.pyplot as plt
import seaborn as sns
import random
from functools import partial
import sys
from numpy import load
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Dense
from keras.layers import Flatten
import time as timer


In [None]:
# Record the wall-clock start; the final cell subtracts it to report total run time.
start_time = timer.time()

In [None]:
# Root of the dataset and the sub-directory that holds the TFRecord files.
data_dir = '/kaggle/input/nih-chest-xrays-tfrecords/'
image_dir = f'{data_dir}data/'

# Bare file names (no directory part) found in the image directory.
tfrlist_suffix = os.listdir(image_dir)

print(f'TFRecord file count: {len(tfrlist_suffix)}')

In [None]:
# Plot the value counts of every column after the first, two plots per row.
heads = list(df.columns)[1:]
# Named n_rows (it is the number of grid rows) so the column Index stored in
# `cols` by an earlier cell is not overwritten with an integer.
n_rows = max(1, int(np.ceil(len(heads) / 2)))

# squeeze=False keeps `axs` two-dimensional even when there is a single row,
# so the [row, col] indexing below always works.
_, axs = plt.subplots(n_rows, 2, figsize=(15, 30), squeeze=False)

for i, head in enumerate(heads):
    row, col = divmod(i, 2)
    sns.countplot(x=head, data=df, ax=axs[row, col])

# With an odd number of columns the last grid cell has no plot; hide it.
for unused_ax in axs.flat[len(heads):]:
    unused_ax.set_visible(False)

In [None]:
# Every column of df from the third onward; used below as the label names.
heads = list(df.columns)[2:]

In [None]:
# Prefix every listed file name with the image directory to get full paths.
tfrlist = [f'{image_dir}{suffix}' for suffix in tfrlist_suffix]

# Resolve the paths through TensorFlow's file API.
FILENAMES = tf.io.gfile.glob(tfrlist)

In [None]:
# Split the TFRecord file indices into train / validation / test subsets:
# 80% train+validation vs the remaining test files, then 90% / 10% within
# train+validation.
# A dedicated seeded generator makes the index split repeatable (for a given
# FILENAMES order) without touching the global `random` state.
SPLIT_SEED = 42
_split_rng = random.Random(SPLIT_SEED)

ALL = list(range(len(FILENAMES)))

TRAIN_AND_VALID_INDEX = _split_rng.sample(ALL, int(len(ALL) * 0.8))
# sorted() gives the complementary subsets a well-defined order.
TEST_INDEX = sorted(set(ALL) - set(TRAIN_AND_VALID_INDEX))

TRAIN_INDEX = _split_rng.sample(TRAIN_AND_VALID_INDEX, int(len(TRAIN_AND_VALID_INDEX) * 0.9))
VALID_INDEX = sorted(set(TRAIN_AND_VALID_INDEX) - set(TRAIN_INDEX))

In [None]:
# Map each index subset back to the corresponding TFRecord paths.
TRAINING_FILENAMES = [FILENAMES[i] for i in TRAIN_INDEX]
VALID_FILENAMES = [FILENAMES[i] for i in VALID_INDEX]
TEST_FILENAMES = [FILENAMES[i] for i in TEST_INDEX]

In [None]:
# Report how many TFRecord files ended up in each split.
for split_label, split_files in (
    ("Train TFRecord Files:", TRAINING_FILENAMES),
    ("Validation TFRecord Files:", VALID_FILENAMES),
    ("Test TFRecord Files:", TEST_FILENAMES),
):
    print(split_label, len(split_files))

In [None]:
# Parsing schema for one serialized example: a scalar int64 feature for every
# column of df from the third onward, plus the encoded image as a scalar string.
feature_description = {
    column: tf.io.FixedLenFeature([], tf.int64) for column in list(df.columns)[2:]
}
feature_description['image'] = tf.io.FixedLenFeature([], tf.string)

In [None]:
# Number of examples per batch.
BATCH_SIZE = 32
# Images are resized to IMAGE_ONE_AXIS x IMAGE_ONE_AXIS when parsed.
IMAGE_ONE_AXIS = 100
IMAGE_SIZE = [IMAGE_ONE_AXIS, IMAGE_ONE_AXIS]
# Sentinel passed to tf.data (as the prefetch buffer size below) to let it tune the value itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
def read_tfrecord(example):
    """Parse one serialized example into an ``(image, label)`` pair.

    The image is decoded as a 3-channel JPEG, resized to ``IMAGE_SIZE``, cast
    to float32 and divided by 255. The label is a list holding the parsed
    feature for each name in ``heads``, in that order.
    """
    parsed = tf.io.parse_single_example(example, feature_description)

    pixels = tf.io.decode_jpeg(parsed["image"], channels=3)
    pixels = tf.cast(tf.image.resize(pixels, IMAGE_SIZE), tf.float32) / 255.0

    return pixels, [parsed[name] for name in heads]


In [None]:
def load_dataset(filenames):
    """Build a dataset of parsed ``(image, label)`` pairs from TFRecord files.

    Args:
        filenames: TFRecord path(s) passed to ``tf.data.TFRecordDataset``.

    Returns:
        A ``tf.data.Dataset`` whose elements are produced by ``read_tfrecord``.
    """
    # Allow elements to be produced out of order in exchange for throughput.
    ignore_order = tf.data.Options()
    ignore_order.experimental_deterministic = False
    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.with_options(ignore_order)
    # Parse examples in parallel. With a purely sequential map there is
    # nothing to reorder, so the relaxed-ordering option above had no effect.
    dataset = dataset.map(read_tfrecord, num_parallel_calls=AUTOTUNE)

    return dataset

In [None]:
def get_dataset(filenames, shuffle=True):
    """Return a batched, prefetched dataset for the given TFRecord files.

    Args:
        filenames: TFRecord path(s) forwarded to ``load_dataset``.
        shuffle: When true (the default), shuffle examples with a buffer of
            2048 elements before batching. Pass ``False`` to skip the shuffle
            step, e.g. when positions in the output need to be matched up
            across separate passes.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``BATCH_SIZE`` examples.
    """
    dataset = load_dataset(filenames)
    if shuffle:
        dataset = dataset.shuffle(2048)
    dataset = dataset.batch(BATCH_SIZE)
    # Prefetch as the final step so complete batches are prepared ahead of the
    # consumer; prefetching before batch() only buffers single examples.
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)

    return dataset

In [None]:
# Build the batched input pipelines for the three splits.
train_dataset, valid_dataset, test_dataset = (
    get_dataset(split_files)
    for split_files in (TRAINING_FILENAMES, VALID_FILENAMES, TEST_FILENAMES)
)

In [None]:
# Take the first batch yielded by the training dataset for a visual check.
image_viz, label_viz = next(iter(train_dataset))

def show_batch(X, Y):
    """Plot up to 25 images of a batch in a 5x5 grid, titled with their labels.

    Args:
        X: Indexable batch of images accepted by ``plt.imshow``.
        Y: Indexable batch of label rows; entry ``Y[n][i]`` is truthy when
            ``heads[i]`` applies to image ``n``.
    """
    plt.figure(figsize=(20, 20))
    # A batch may hold fewer than 25 images; never index past its end.
    for n in range(min(25, len(X))):
        plt.subplot(5, 5, n + 1)
        plt.imshow(X[n])

        findings = [name for name, flag in zip(heads, Y[n]) if flag]
        # An image with no positive label is titled "No Finding".
        title = "+".join(findings) if findings else "No Finding"

        plt.title(title)
        plt.axis("off")

# Convert the sampled tensors to NumPy arrays and plot them.
show_batch(image_viz.numpy(), label_viz.numpy())

In [None]:
# Learning-rate schedule: start at 0.01 and multiply by 0.96 every 5 steps
# (staircase=True applies the decay in discrete jumps).
initial_learning_rate = 0.01
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=5,
    decay_rate=0.96,
    staircase=True,
)

In [None]:
def define_model(in_shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3), out_shape=len(heads)):
    """Build and compile the convolutional classifier.

    Architecture: three stages (32, 64, 128 filters), each made of two 3x3
    ReLU convolutions with 'same' padding followed by 2x2 max pooling; then
    Flatten, a 128-unit ReLU Dense layer and a sigmoid Dense output of
    ``out_shape`` units.

    The model is compiled with Adadelta driven by ``lr_schedule``, binary
    cross-entropy loss and an AUC metric named "auc".

    Args:
        in_shape: Shape of one input image.
        out_shape: Number of output units.

    Returns:
        The compiled Keras model.
    """
    def conv(filters, **extra):
        # All convolutions share everything except the filter count.
        return Conv2D(filters, (3, 3), activation='relu',
                      kernel_initializer='he_uniform', padding='same', **extra)

    model = Sequential()
    for stage, filters in enumerate((32, 64, 128)):
        # Only the very first layer declares the input shape.
        first_layer_kwargs = {'input_shape': in_shape} if stage == 0 else {}
        model.add(conv(filters, **first_layer_kwargs))
        model.add(conv(filters))
        model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(out_shape, activation='sigmoid'))

    model.compile(
        optimizer=tf.keras.optimizers.Adadelta(learning_rate=lr_schedule),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(name="auc")],
    )
    return model

In [None]:
# Count the records in the training and validation files by iterating each once.
train_size, validation_size = (
    sum(1 for _record in tf.data.TFRecordDataset(split_files))
    for split_files in (TRAINING_FILENAMES, VALID_FILENAMES)
)

# Number of batches needed to cover each split (last batch may be partial).
epoch_steps, validation_steps = (
    int(np.ceil(size / BATCH_SIZE)) for size in (train_size, validation_size)
)

epochs = 10

print(f"steps_per_epoch: {epoch_steps}")
print(f"validation_steps: {validation_steps}")

In [None]:
# Build a freshly compiled model.
model = define_model()

# Train on the training dataset, validating on the validation dataset;
# `history` keeps the loss and metric values that are plotted further down.
history = model.fit(
    train_dataset,
    epochs=epochs,
    validation_data=valid_dataset,
    validation_steps = validation_steps
)

In [None]:
# evaluate() returns the loss followed by the compiled metric; only the AUC is reported.
_, test_auc = model.evaluate(test_dataset, verbose=0)

print(f'Test auc: {test_auc}')

In [None]:
# Training curves: loss on top, AUC below.
fig, (loss_ax, auc_ax) = plt.subplots(2, 1)

# plot loss
loss_ax.set_title('Cross Entropy Loss')
loss_ax.plot(history.history['loss'], color='blue', label='train')
loss_ax.plot(history.history['val_loss'], color='orange', label='validation')
# The x axis is shown on the lower panel only.
loss_ax.xaxis.set_visible(False)
# Without legend() the `label=` values above are never displayed.
loss_ax.legend()

# plot AUC
auc_ax.set_title('AUC')
auc_ax.plot(history.history['auc'], color='blue', label='train')
auc_ax.plot(history.history['val_auc'], color='orange', label='validation')
auc_ax.set_xlabel('epoch')
auc_ax.legend();

In [None]:
# Predicted scores for the whole test dataset (despite the name, these are predictions, not a model).
# NOTE(review): get_dataset() shuffles, so the row order here is presumably not
# the order of any later pass over test_dataset — confirm before pairing rows by position.
fitted_model = model.predict(test_dataset)

In [None]:
# Take the first batch yielded by the test dataset for visualisation.
image_viz, label_viz = next(iter(test_dataset))

def show_batch(X, Y_act, Y_pred=None, threshold=0.5):
    """Plot up to 9 images with their actual and predicted findings.

    Args:
        X: Indexable batch of images accepted by ``plt.imshow`` (and by
            ``model.predict`` when ``Y_pred`` is omitted).
        Y_act: Batch of ground-truth label rows aligned with ``X``; entry
            ``Y_act[n][i]`` is truthy when ``heads[i]`` applies to image ``n``.
        Y_pred: Optional batch of predicted scores aligned with ``X``. When
            omitted, the scores are computed from ``X`` with the notebook's
            ``model``.
        threshold: A finding is reported as predicted when its score is
            strictly greater than this value.
    """
    # Predict on the very images being shown. Indexing a prediction array that
    # was produced in a separate pass over the shuffled test dataset would
    # pair each image with the scores of an unrelated example.
    if Y_pred is None:
        Y_pred = model.predict(X, verbose=0)

    plt.figure(figsize=(25, 30))
    # A batch may hold fewer than 9 images; never index past its end.
    for n in range(min(9, len(X))):
        plt.subplot(3, 3, n + 1)
        plt.imshow(X[n])

        actual = [name for name, flag in zip(heads, Y_act[n]) if flag]
        predicted = [name for name, score in zip(heads, Y_pred[n]) if score > threshold]

        title = (
            "Actual:\n" + ("+".join(actual) if actual else "No Finding")
            + "\n\n Prediction:\n" + str(Y_pred[n])
            + "\n\n My interpretation:\n"
            + ("+".join(predicted) if predicted else "No Finding")
        )

        plt.title(title)
        plt.axis("off")

# Convert the sampled test tensors to NumPy arrays and plot them with their labels and predictions.
show_batch(image_viz.numpy(), label_viz.numpy())

In [None]:
end_time = timer.time()

# Break the elapsed seconds down into days, hours, minutes and seconds;
# `time` holds the remainder still to be broken down after each step.
time = end_time - start_time
day, time = divmod(time, 24 * 3600)
hour, time = divmod(time, 3600)
minutes, time = divmod(time, 60)
seconds = np.round(time, 0)
print(f"Total code execution time: {day} days, {hour} hours, {minutes} minutes, {seconds} seconds")