# Import required packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # for pattern matching, like grep in r
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

pal = sns.color_palette()

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# Read data
## Data Entry

The first file is the data entry table, which contains patient IDs, findings, and demographic information.
We then create a subset containing only the columns of interest.

In [None]:
# Read the metadata table (one row per image) and preview its first rows.
data_entry_path = '../input/nihdata/Data_Entry_2017.csv'
data_entry = pd.read_csv(data_entry_path)
data_entry.head()

In [None]:
# Keep only the columns from 'Image Index' through 'Finding Labels' (label-based
# slices are inclusive on both ends).
# .copy() makes the subset an independent frame: a later cell assigns a new
# 'Finding Labels' column, which on a bare slice raises SettingWithCopyWarning
# and leaves it ambiguous whether `data_entry` is modified as well.
data_entry_subset = data_entry.loc[:, 'Image Index':'Finding Labels'].copy()
data_entry_subset.head()

**Remark:** From `Finding Labels`, we can tell that this is a multi-label classification problem rather than a binary one. We need to split the column and collect its unique values.

In [None]:
# One column per label position: row i holds the '|'-separated tags of image i,
# padded with missing values where an image has fewer tags than the widest row.
split = pd.DataFrame(data_entry_subset['Finding Labels'].str.split('|').tolist())

# Collect the distinct tags, keeping first-seen order while walking the columns
# left to right (all tags seen in position 0 first, then position 1, ...).
# The order matters: `labels` is passed as `classes=` to the Keras generators,
# so the position of a tag in this list becomes its class index.
# Only non-empty strings are kept, which drops the padding values regardless of
# whether pandas represents them as None or NaN.
labels = list(dict.fromkeys(
    tag
    for position in split.columns
    for tag in split[position].unique()
    if isinstance(tag, str) and tag
))
labels

There are 15 labels in total, including *No Finding*. Note that no single condition (e.g. *Cardiomegaly*) is our sole prediction target. The *Finding Labels* column will therefore be transformed by splitting it into its individual disease tags.

In [None]:
# Turn each '|'-separated label string into a list of tags.
# The isinstance guard keeps the cell idempotent: once the column already holds
# lists, re-running the cell leaves them untouched instead of failing with
# "'list' object has no attribute 'split'".
data_entry_subset["Finding Labels"] = data_entry_subset["Finding Labels"].apply(
    lambda x: x.split("|") if isinstance(x, str) else x
)
data_entry_subset.head()

Now we have a look at the class distribution.

In [None]:
from collections import Counter

# Tally how many images carry each tag; every image contributes once per tag
# in its 'Finding Labels' list.
labels_count = Counter()
for image_tags in data_entry_subset["Finding Labels"]:
    labels_count.update(image_tags)

labels_count

From the frequency table, we found out that there are two types of imbalances:
1. Imbalance across different classes
2. Imbalance between positive and negative in some classes

We will have to derive `class_weights` to compensate for the imbalanced distribution; they will later be passed to the `fit` function.

In [None]:
# Weight of a class = (total number of tag occurrences) / (occurrences of that
# class), so rarer classes receive proportionally larger weights.
total_count = sum(labels_count[name] for name in labels_count)
class_weights = dict(
    (name, total_count / occurrences)
    for name, occurrences in labels_count.items()
)

class_weights

## Images

`f` and `l` are loop variables holding the values of the two columns of *data_entry_subset* (image file name and finding labels).

In [None]:
import cv2

# Show the first nine images in a 3x3 grid, each titled with its file name and labels.
new_style = {'grid': False}
plt.rc('axes', **new_style)
_, ax = plt.subplots(3, 3, sharex='col', sharey='row', figsize=(20, 20))

# ax.flat walks the grid row by row, so the n-th image lands in cell (n // 3, n % 3).
for panel, (f, l) in zip(ax.flat, data_entry_subset[:9].values):
    image_path = '../input/nihdata/images_001/images/{}'.format(f)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals a missing/unreadable file by returning None; fail with
        # the offending path instead of an obscure cvtColor assertion error.
        raise FileNotFoundError('Could not read image: {}'.format(image_path))
    # OpenCV loads pixels in BGR order; matplotlib expects RGB.
    panel.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    panel.set_title('{} - {}'.format(f, l))

plt.show()

# Data frame pre-processing
This section converts the images (unstructured data) into a machine-learnable format, i.e. a numeric representation that the computer can work with.

We also split the entire dataset into a training set and a validation set, for model development and model validation respectively.

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Pixel rescaling, random augmentation, and an 80/20 train/validation split,
# all configured on one shared generator.
# NOTE(review): the same generator (and therefore the same random shear/zoom/flip)
# is also used for the 'validation' subset below — confirm that is intended.
augmentation_options = dict(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    validation_split=0.2,
)
datagen = ImageDataGenerator(**augmentation_options)

def get_flow_from_dataframe(generator,
                            dataframe,
                            subset,
                            image_shape=(150, 150),
                            batch_size=32,
                            directories=('../input/nihdata/images_001/images',
                                         '../input/nihdata/images_002/images',
                                         '../input/nihdata/images_003/images')):
    """Yield endless (images, targets) batches merged across several image folders.

    One ``flow_from_dataframe`` iterator is created per entry in ``directories``.
    On every step one batch is drawn from each iterator (in directory order) and
    the batches are concatenated along axis 0, so a step yields up to
    ``len(directories) * batch_size`` samples.

    Parameters
    ----------
    generator : object exposing ``flow_from_dataframe`` (e.g. an ImageDataGenerator)
    dataframe : frame with 'Image Index' (file name) and 'Finding Labels' columns
    subset : value forwarded as ``subset=`` (e.g. 'training' or 'validation')
    image_shape : ``target_size`` the images are resized to
    batch_size : batch size requested from each per-directory iterator
    directories : image folders to read from; defaults to the three folders
        this notebook originally hard-coded

    Yields
    ------
    tuple of (images, targets) numpy arrays

    Raises
    ------
    ValueError
        If ``directories`` is empty (raised on the first ``next()`` call).

    Notes
    -----
    Uses the notebook-level ``labels`` list as ``classes=``.
    """
    if not directories:
        raise ValueError('directories must contain at least one image folder')

    streams = [
        generator.flow_from_dataframe(dataframe, target_size=image_shape,
                                      x_col='Image Index',
                                      y_col='Finding Labels',
                                      class_mode='categorical',
                                      directory=directory,
                                      batch_size=batch_size,
                                      classes=labels,
                                      subset=subset)
        for directory in directories
    ]

    while True:
        # Builtin next() works on every iterator; the .next() method is not
        # available on the iterators of newer Keras releases.
        batches = [next(stream) for stream in streams]
        images = np.concatenate([batch[0] for batch in batches], axis=0)
        targets = np.concatenate([batch[1] for batch in batches], axis=0)
        yield images, targets

In [None]:
# Both streams share every setting except the subset they draw from.
shared_flow_args = dict(
    generator=datagen,
    dataframe=data_entry_subset,
    image_shape=(150, 150),
    batch_size=32,
)

train_gen = get_flow_from_dataframe(subset='training', **shared_flow_args)
val_gen = get_flow_from_dataframe(subset='validation', **shared_flow_args)

Define `class_weights_index` based on `class_weights` derived. 

In [None]:
# This iterator serves two purposes: it exposes the label -> index mapping
# (class_indices) and it is reused in the Demo section for prediction.
# For prediction the outputs are matched against `generator.filenames`, so:
#   * shuffle=False keeps the batch order aligned with `filenames`
#     (the default shuffle=True would pair predictions with the wrong files);
#   * a rescale-only ImageDataGenerator is used so images are not randomly
#     sheared/zoomed/flipped at inference time (rescale matches training).
inference_datagen = ImageDataGenerator(rescale=1./255)
generator = inference_datagen.flow_from_dataframe(data_entry_subset, target_size=(150,150),
                                                  x_col='Image Index',
                                                  y_col='Finding Labels',
                                                  class_mode='categorical',
                                                  directory = '../input/nihdata/images_001/images',
                                                  batch_size=32,
                                                  classes = labels,
                                                  shuffle=False)

generator.class_indices

In [None]:
# Re-key the per-label weights by the integer class index Keras assigned to each
# label. Deriving the mapping (instead of pasting numbers from an earlier run)
# keeps it consistent with `class_weights` and `generator.class_indices`
# whenever the data or the label order changes.
class_weights_index = {
    class_index: class_weights[class_name]
    for class_name, class_index in generator.class_indices.items()
}

# Model Development
This section builds a deep learning model that will perform the classification in the later application.

In [None]:
from keras import layers
from keras import models

# Five Conv2D + MaxPooling2D stages followed by a dense head.
# The final layer has 15 sigmoid units, i.e. one independent score per output.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), padding = 'same',
                  activation='relu', input_shape=(150, 150, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dense(15, activation='sigmoid'),
])

## Alternative: Transfer learning

In [None]:
from keras.models import Sequential
from keras.models import Model
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau, TensorBoard
from keras import optimizers, losses, activations, models
from keras.layers import Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D, Concatenate
from keras import applications

# Input geometry shared by the pretrained base and the classification head.
rows, cols = 150, 150
input_shape = (rows, cols, 3)

# VGG19 convolutional base loaded from a local weights file, without its
# fully connected top; frozen so only the new head would be trained.
weights_path = '../input/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5'
base_model = applications.VGG19(weights=weights_path,
                                include_top=False,
                                input_shape=input_shape)
base_model.trainable = False

# Head: global average pooling -> dropout -> 15 sigmoid outputs.
add_model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dropout(0.5),
    Dense(15, activation='sigmoid'),
])
model_vgg15 = add_model

# Model Optimization
This step configures how the model improves its performance (accuracy) by learning from its own mistakes.

In [None]:
from keras import optimizers

# Binary cross-entropy scores each sigmoid output independently; accuracy is
# tracked under the key 'acc'.
rmsprop = optimizers.RMSprop(lr=1e-4)
model.compile(optimizer=rmsprop,
              loss='binary_crossentropy',
              metrics=['acc'])

# Model Training

In [None]:
# Number of generator steps per epoch for training and validation.
# Presumably 20000 / 4999 are the approximate training / validation image
# counts — TODO confirm against the data.
# NOTE(review): get_flow_from_dataframe concatenates one batch per image
# directory, so a single step can yield more than 32 images; confirm these
# step counts are what is intended.
train_sample_count = 20000
val_sample_count = 4999
flow_batch_size = 32

step_per_train = train_sample_count // flow_batch_size
step_per_val = val_sample_count // flow_batch_size

In [None]:
# Train from the endless generators; class_weight re-weights the loss using
# the per-class-index weights defined above.
fit_options = dict(
    steps_per_epoch=step_per_train,
    validation_data=val_gen,
    validation_steps=step_per_val,
    class_weight=class_weights_index,
    use_multiprocessing=True,
    epochs=5,
)
history = model.fit_generator(train_gen, **fit_options)

# Performance Visualization

In [None]:
# Per-epoch curves recorded by Keras during training.
acc, val_acc = history.history['acc'], history.history['val_acc']
loss, val_loss = history.history['loss'], history.history['val_loss']

epochs = range(1, len(acc) + 1)

# Left panel: loss; right panel: accuracy. Dots = training, line = validation.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(16,9))

loss_ax.plot(epochs, loss, 'bo', label='Training loss')
loss_ax.plot(epochs, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.legend()

acc_ax.plot(epochs, acc, 'bo', label='Training acc')
acc_ax.plot(epochs, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.legend()

In [None]:
# Save the trained model to 'multi_label.h5' so the Demo section can reload it.
model.save('multi_label.h5')

# Demo
Users can change the index in `data_entry_subset.iloc[[]]` to predict a different image; the same goes for the second cell below. Future work is needed to automate this manual entry process.

In [None]:
from keras.models import load_model
from keras.preprocessing import image

# Rewind the iterator so prediction starts from its first batch.
generator.reset()

# Reload the model saved after training and score one full pass over the
# iterator (len(generator) batches, which is also what steps=None resolves to).
model = load_model('multi_label.h5')
pred = model.predict_generator(generator, steps=len(generator), verbose=1)

In [None]:
# Show the raw prediction scores before thresholding.
pred

In [None]:
# A class counts as predicted when its score exceeds 0.15.
pred_threshold = (pred > 0.15)

# Invert label -> index into index -> label so column positions can be named.
class_indices = generator.class_indices
class_indices = {position: name for name, position in class_indices.items()}

# One comma-separated string of predicted label names per prediction row.
predictions = [
    ",".join(class_indices[position]
             for position, flagged in enumerate(row) if flagged)
    for row in pred_threshold
]

patient_id = generator.filenames
results = pd.DataFrame({"Filename": patient_id,
                        "Classifications": predictions})

results.head()

In [None]:
import cv2

new_style = {'grid': False}
plt.rc('axes', **new_style)

# Display the selected row's image with its file name and labels as the title.
# Change the index inside iloc[[...]] to look at a different image.
for f, l in data_entry_subset.iloc[[0]].values:
    image_path = '../input/nihdata/images_001/images/{}'.format(f)
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread returns None when the file is missing or unreadable; report
        # the path instead of failing later inside cvtColor.
        raise FileNotFoundError('Could not read image: {}'.format(image_path))
    # OpenCV loads pixels in BGR order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.title(('{} - {}'.format(f, l)))

plt.show()