In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
import matplotlib.pyplot as plt
import tensorflow as tf

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Index every PNG found under the NIH `images*` folders by its bare file name,
# then load the metadata table and attach each scan's on-disk path to its row.
all_image_paths = {
    os.path.basename(png_path): png_path
    for png_path in glob(os.path.join('..', 'input', 'data', 'images*', '*', '*.png'))
}
all_xray_df = pd.read_csv('../input/data/Data_Entry_2017.csv')
print('Scans found:', len(all_image_paths), ', Total Headers', len(all_xray_df))

# Rows whose 'Image Index' has no matching file on disk get a missing `path`.
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)
# Age parsing is currently disabled:
# all_xray_df['Patient Age'] = all_xray_df['Patient Age'].map(lambda x: int(x[:-1]))
all_xray_df.sample(3)

In [None]:
from itertools import chain

# 'No Finding' becomes an empty label string so it contributes no positive class.
all_xray_df['Finding Labels'] = all_xray_df['Finding Labels'].map(lambda x: x.replace('No Finding', ''))

# A row may carry several findings joined by '|'. Collect the distinct names as
# plain Python strings, sorted, with the empty "no finding" token removed.
all_labels = sorted(
    set(chain.from_iterable(all_xray_df['Finding Labels'].map(lambda x: x.split('|')))) - {''}
)
print('All Labels ({}): {}'.format(len(all_labels), all_labels))

# One 0/1 indicator column per label. Membership is tested against the exact
# '|'-separated tokens rather than by substring search, so a label that happens
# to be contained in another label's name can never produce a false positive.
label_tokens = all_xray_df['Finding Labels'].map(lambda finding: set(finding.split('|')))
for c_label in all_labels:
    all_xray_df[c_label] = label_tokens.map(lambda tokens: 1.0 if c_label in tokens else 0.0)
all_xray_df.sample(3)

In [None]:
# Turn the '|'-joined findings string into one list of label names per image;
# this list column is what the data generators read as `y_col`.
all_xray_df["labels"] = all_xray_df["Finding Labels"].map(lambda findings: findings.split("|"))

In [None]:
from sklearn.model_selection import train_test_split

# 75/25 split with a fixed seed. Stratification uses only the first four
# characters of the findings string (presumably a coarse proxy for the leading
# label, so that strata do not become too small).
train_df, valid_df = train_test_split(
    all_xray_df,
    test_size=0.25,
    random_state=2018,
    stratify=all_xray_df['Finding Labels'].str[:4],
)
print('train', len(train_df), 'validation', len(valid_df))

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Spatial size every image is resized to before entering the network.
IMG_SIZE = (256, 256)

core_idg = ImageDataGenerator(
    # per-image standardisation
    samplewise_center=True,
    samplewise_std_normalization=True,
    # flips: left/right only
    horizontal_flip=True,
    vertical_flip=False,
    # small geometric jitter
    width_shift_range=0.1,
    height_shift_range=0.05,
    rotation_range=5,
    shear_range=0.1,
    zoom_range=0.15,
    # pixels exposed by the transforms are filled by reflection
    fill_mode='reflect',
)

In [None]:
# Arguments shared by every generator built in this cell.
# NOTE(review): with `classes` given, flow_from_dataframe appears to drop rows
# whose label list contains none of those classes (e.g. [''] for "no finding")
# — confirm against the Keras documentation if normal scans must be kept.
common_flow_args = dict(directory=None,
                        x_col='path',
                        y_col='labels',
                        class_mode='categorical',
                        classes=all_labels,
                        color_mode='grayscale',
                        target_size=IMG_SIZE)

# Evaluation must see undistorted images: apply the same per-image
# normalisation as `core_idg`, but none of its random augmentations.
eval_idg = ImageDataGenerator(samplewise_center=True,
                              samplewise_std_normalization=True)

# Training stream: augmented and shuffled.
train_gen = core_idg.flow_from_dataframe(dataframe=train_df,
                                         batch_size=32,
                                         shuffle=True,
                                         **common_flow_args)

# Validation stream: deterministic order, no augmentation.
valid_gen = eval_idg.flow_from_dataframe(dataframe=valid_df,
                                         batch_size=256,
                                         shuffle=False,
                                         **common_flow_args) # we can use much larger batches for evaluation

# Fixed evaluation batch. `valid_df` was already shuffled by train_test_split,
# so taking its first 1024 usable rows in order gives a random but reproducible
# sample.
test_X, test_Y = next(eval_idg.flow_from_dataframe(dataframe=valid_df,
                                                   batch_size=1024,
                                                   shuffle=False,
                                                   **common_flow_args))

In [None]:
from tensorflow.keras.applications.mobilenet import MobileNet
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout, Flatten
from tensorflow.keras.models import Sequential

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau

# File that receives the best weights of the NIH pre-training run.
weight_path = "{}_nih_pretrained.h5".format('xray_class')

# Save only when validation loss improves.
checkpoint = ModelCheckpoint(
    filepath=weight_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Stop after 3 epochs without a validation-loss improvement.
early = EarlyStopping(monitor="val_loss", mode="min", patience=3)

callbacks_list = [checkpoint, early]

In [None]:
with tf.device("/gpu:0"):

    # Randomly initialised MobileNet backbone on single-channel 256x256 input.
    base_mobilenet_model = MobileNet(weights=None,
                                     include_top=False,
                                     input_shape=(256, 256, 1))

    # Backbone -> pooled features -> dropout-regularised head with one sigmoid
    # output per label (multi-label classification).
    # NOTE(review): Dense(512) has no activation, so it is a purely linear
    # layer — confirm this is intended.
    multi_disease_model = Sequential([
        base_mobilenet_model,
        GlobalAveragePooling2D(),
        Dropout(0.5),
        Dense(512),
        Dropout(0.5),
        Dense(len(all_labels), activation='sigmoid'),
    ])
    multi_disease_model.compile(loss='binary_crossentropy',
                                optimizer='adam',
                                metrics=['binary_accuracy', 'mae'])
    multi_disease_model.summary()

    # 15 epochs of 100 batches each, validated on the fixed held-out batch.
    multi_disease_model.fit(train_gen,
                            epochs=15,
                            steps_per_epoch=100,
                            validation_data=(test_X, test_Y),
                            callbacks=callbacks_list)

> * Train: 0.8854
> * Val: 0.8720
> * loss_train: 0.3102
> * loss_val: 0.4242

In [None]:
# Per-label sigmoid scores of the pre-trained model on the fixed evaluation batch.
y_preds = multi_disease_model.predict(x=test_X)

In [None]:
# Compare the rounded predictions for evaluation sample 4 with its true labels.
print(np.around(y_preds[4], decimals=2), test_Y[4], sep="\n")

# Fine-tune on CheXpert

In [None]:
# Root folder that the relative image paths in the CSVs are resolved against.
path = "../input/chexpert-dataset/"

# Load the (modified) CheXpert train / validation metadata.
train_df = pd.read_csv('../input/modified-dataset2/modifiedv2_train.csv')
valid_df = pd.read_csv('../input/modified-dataset2/modifiedv2_valid.csv')

# Build the full image location from the relative 'Path' column.
train_df["path"] = path + train_df["Path"]
valid_df["path"] = path + valid_df["Path"]

# Merge both tables into one frame (it is re-split later). ignore_index=True
# gives the result a fresh 0..n-1 index; without it the row labels of the two
# CSVs would overlap and label-based operations could hit duplicate indices.
dfs = [train_df, valid_df]
all_xray_df = pd.concat(dfs, ignore_index=True)
all_xray_df.sample(3)

In [None]:
# Show which columns the combined CheXpert frame provides.
# (Dropping the "No Finding" column is currently disabled.)
# all_xray_df.drop("No Finding", axis=1, inplace=True)
all_xray_df.columns

In [None]:
# Keep only rows that have a findings string. The explicit .copy() makes the
# result an independent frame, so the column assignments in the following cells
# do not write into a slice of the original (SettingWithCopyWarning).
all_xray_df = all_xray_df[all_xray_df["Finding Labels"].notnull()].copy()

In [None]:
# Blank out the literal 'No Finding' marker so such rows carry no label name.
all_xray_df['Finding Labels'] = all_xray_df['Finding Labels'].str.replace('No Finding', '', regex=False)

In [None]:
# Fixed label vocabulary for the fine-tuning stage; the order defines the order
# of the model's output units.
# NOTE(review): spellings such as 'Pleural_thickening', 'Nodule Mass' and
# 'Enlarged Cardiom' must match the strings in 'Finding Labels' exactly —
# verify against the CSV.
all_labels = [
    'Atelectasis',
    'Consolidation',
    'Infiltration',
    'Pneumothorax',
    'Edema',
    'Emphysema',
    'Fibrosis',
    'Pleural Effusion',
    'Mass',
    'Pneumonia',
    'Pleural_thickening',
    'Cardiomegaly',
    'Nodule Mass',
    'Hernia',
    'Enlarged Cardiom',
    'Lung Lesion',
    'Lung Opacity',
    'Pleural Other',
    'Fracture',
]

print('All Labels ({}): {}'.format(len(all_labels), all_labels))

In [None]:
# Add one indicator column per label: 1.0 where the label name occurs in the
# row's findings string, 0 otherwise.
# NOTE(review): this is a substring test, so a row tagged 'Nodule Mass' is also
# flagged as 'Mass' — confirm whether that is intended.
for c_label in all_labels:
    if len(c_label) <= 1:
        continue  # skip empty / single-character names
    all_xray_df[c_label] = all_xray_df['Finding Labels'].map(
        lambda findings: 1.0 if c_label in findings else 0
    )
all_xray_df.sample(3)

In [None]:
# Re-inspect the columns after the per-label indicator columns were added.
all_xray_df.columns

In [None]:
# all_xray_df.drop("Enlarged Cardiomediastinum", axis=1, inplace=True)

In [None]:
# Turn the '|'-joined findings string into one list of label names per image;
# this list column is what the data generators read as `y_col`.
all_xray_df["labels"] = all_xray_df["Finding Labels"].map(lambda findings: findings.split("|"))

In [None]:
# Display the per-row label lists to check the result of the split.
all_xray_df["labels"]

In [None]:
from sklearn.model_selection import train_test_split

# Re-split the merged CheXpert frame 75/25 with a fixed seed. Stratification
# uses only the first four characters of the findings string (presumably a
# coarse proxy for the leading label).
train_df, valid_df = train_test_split(
    all_xray_df,
    test_size=0.25,
    random_state=2018,
    stratify=all_xray_df['Finding Labels'].str[:4],
)
print('train', len(train_df), 'validation', len(valid_df))

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Spatial size every image is resized to before entering the network.
IMG_SIZE = (256, 256)

core_idg = ImageDataGenerator(
    # per-image standardisation
    samplewise_center=True,
    samplewise_std_normalization=True,
    # flips: left/right only
    horizontal_flip=True,
    vertical_flip=False,
    # small geometric jitter
    width_shift_range=0.1,
    height_shift_range=0.05,
    rotation_range=5,
    shear_range=0.1,
    zoom_range=0.15,
    # pixels exposed by the transforms are filled by reflection
    fill_mode='reflect',
)

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Arguments shared by every generator built in this cell.
# NOTE(review): with `classes` given, flow_from_dataframe appears to drop rows
# whose label list contains none of those classes (e.g. [''] for "no finding")
# — confirm against the Keras documentation if normal scans must be kept.
common_flow_args = dict(directory=None,
                        x_col='path',
                        y_col='labels',
                        class_mode='categorical',
                        classes=all_labels,
                        color_mode='grayscale',
                        target_size=IMG_SIZE)

# Evaluation must see undistorted images: apply the same per-image
# normalisation as `core_idg`, but none of its random augmentations.
eval_idg = ImageDataGenerator(samplewise_center=True,
                              samplewise_std_normalization=True)

# Training stream: augmented and shuffled.
train_gen = core_idg.flow_from_dataframe(dataframe=train_df,
                                         batch_size=32,
                                         shuffle=True,
                                         **common_flow_args)

# Validation stream: deterministic order, no augmentation.
valid_gen = eval_idg.flow_from_dataframe(dataframe=valid_df,
                                         batch_size=256,
                                         shuffle=False,
                                         **common_flow_args) # we can use much larger batches for evaluation

# Fixed evaluation batch. `valid_df` was already shuffled by train_test_split,
# so taking its first 1024 usable rows in order gives a random but reproducible
# sample.
test_X, test_Y = next(eval_idg.flow_from_dataframe(dataframe=valid_df,
                                                   batch_size=1024,
                                                   shuffle=False,
                                                   **common_flow_args))

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau

# File that receives the best weights of the fine-tuning run.
weight_path = "{}MobileNetModel.hdf5".format('xray_class')

# Save only when validation loss improves.
checkpoint = ModelCheckpoint(
    filepath=weight_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Stop after 3 epochs without a validation-loss improvement.
early = EarlyStopping(monitor="val_loss", mode="min", patience=3)

callbacks_list = [checkpoint, early]

In [None]:
# Pull a single batch from the training generator to check the image tensor shape.
x, y = next(iter(train_gen))
print(x.shape)

In [None]:
# Start the fine-tuning model from the pre-trained network minus its last two
# layers, with every reused layer frozen. The layer objects are shared with
# `multi_disease_model`, so freezing them here marks them non-trainable there
# as well.
new_model = tf.keras.Sequential()
for pretrained_layer in multi_disease_model.layers[:-2]:
    pretrained_layer.trainable = False
    new_model.add(pretrained_layer)
    

In [None]:
# New trainable head on top of the frozen layers: a small ReLU layer followed
# by one sigmoid unit per label. The output width is taken from `all_labels`
# (the same list passed as `classes` to the data generators) instead of a
# hard-coded 19, so the model and the targets cannot drift apart.
new_model.add(tf.keras.layers.Dense(200, activation="relu"))
new_model.add(tf.keras.layers.Dense(len(all_labels), activation="sigmoid"))

In [None]:
# Multi-label setup: independent binary cross-entropy per output unit,
# optimised with Adam at learning rate 0.001.
new_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["binary_accuracy", "mae"],
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

In [None]:
# Print the layer-by-layer overview of the fine-tuning model.
new_model.summary()

In [None]:
# Fine-tune for up to 15 epochs of 100 batches each, validating on the fixed
# held-out batch and using the checkpoint / early-stopping callbacks.
new_model.fit(
    train_gen,
    epochs=15,
    steps_per_epoch=100,
    validation_data=(test_X, test_Y),
    callbacks=callbacks_list,
)

In [None]:
# Per-label sigmoid scores of the fine-tuned model on the fixed evaluation batch.
# (Single-image variant, currently disabled:)
# new_model.predict(test_X[0])
y_preds = new_model.predict(x=test_X)

In [None]:
# Predicted scores for the first evaluation sample, rounded to two decimals.
np.around(y_preds[0], 2)

In [None]:
# Target label vector of the first evaluation sample, for comparison with the prediction.
test_Y[0]