In [None]:
import os
import json
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pydicom
from keras import layers
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.initializers import Constant
from keras.models import Sequential
from keras.optimizers import Adam
from tensorflow.python.ops import array_ops
from tqdm import tqdm
from keras import backend as K
import tensorflow as tf
import keras
from keras.applications import Xception
from keras.models import Model, load_model
from math import ceil, floor
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import log_loss
from keras.layers import Dense, Flatten, Dropout, GlobalAveragePooling2D

**Things to look at for optimization:**
* Different file sizes
* Different batch sizes and more/fewer epochs
* Using a different learning rate or different metrics (AUC)
* Sensitivity and specificity
* Only take 'any' or the most frequent subtype -> make tables
* Get balanced data
* Batch normalization
* Adaptive loss function
* Data augmentation? GANs?

**Testing/prediction**
* Fill the sample documents provided in the dataset + test on validation data
* Try to make a script that takes an image and returns the probability for each label
* Make a labeled validation set?
* Other ways to validate

In [None]:
os.listdir('/kaggle/input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection')

In [None]:
BASE_PATH = '/kaggle/input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/'
TRAIN_DIR = 'stage_2_train/'
TEST_DIR = 'stage_2_test/'
train_df = pd.read_csv(BASE_PATH + 'stage_2_train.csv')

In [None]:
sub_df = pd.read_csv(BASE_PATH + 'stage_2_sample_submission.csv')

# Each ID is split on '_': the second field names the image file and the
# third field is the label type. Both frames get the same two derived columns.
for frame in (train_df, sub_df):
    id_parts = frame['ID'].apply(lambda st: st.split('_'))
    frame['filename'] = id_parts.apply(lambda parts: "ID_" + parts[1] + ".png")
    frame['type'] = id_parts.apply(lambda parts: parts[2])

print(train_df.shape)
train_df.head()

In [None]:
test_df = pd.DataFrame(sub_df.filename.unique(), columns=['filename'])
print(test_df.shape)
test_df.head()

In [None]:
png_test_df = pd.DataFrame(sub_df.filename.unique(), columns=['filename'])
print(png_test_df.shape)
png_test_df.head()


In [None]:
# Copy before rewriting the extension: a plain `dcm_df = test_df` binds both
# names to the same DataFrame, so the '.dcm' rename would silently change
# test_df.filename as well.
dcm_df = test_df.copy()
dcm_df['filename'] = dcm_df['filename'].str.replace('.png', '.dcm', regex=False)
dcm_df

In [None]:
png_test_df

In [None]:
subtypes = train_df.groupby('type').sum()
subtypes

In [None]:
sns.barplot(y=subtypes.index, x=subtypes.Label, palette="deep")

**From this we can note a few things:**
* There are 107933 images with "any" hemorrhage. This is quite low compared to the 720000 images we have in the dataset.
* Thus, we could create a generator with all the images containing hemorrhages and the same number of images without a hemorrhage.
* Additionally, all types of hemorrhage are relatively equally represented, except the 'epidural' type, which has only 3145 cases in the whole dataset. We could try running with this type discarded.

In [None]:
np.random.seed(2019)
# os.listdir returns entries in arbitrary order, so sort first; otherwise the
# seeded draw is not reproducible across machines/sessions.
train_files = sorted(os.listdir(BASE_PATH + TRAIN_DIR))
# replace=False: np.random.choice samples WITH replacement by default, which
# yields duplicate filenames (wasted conversions) and makes
# sample_files.shape[0] overstate the number of distinct images.
sample_files = np.random.choice(
    train_files, min(400000, len(train_files)), replace=False
)  # take the rest for testing
sample_df = train_df[
    train_df.filename.str.replace('.png', '.dcm', regex=False).isin(sample_files)
]

In [None]:
pivot_df = sample_df[['Label', 'filename', 'type']].drop_duplicates().pivot(
    index='filename', columns='type', values='Label').reset_index()
print(pivot_df.shape)
pivot_df

In [None]:
#one_df = pivot_df.drop(pivot_df.loc[pivot_df['subdural']==0].index)
#one_df

In [None]:
#zero_df = pivot_df.drop(pivot_df.loc[pivot_df['any']==1].index)
#zero_df

In [None]:
#zero_df = zero_df.sample(47166)
#zero_df

In [None]:
#sample_df = pd.concat([zero_df, one_df])
#sample_df

In [None]:
#zero_df = pivot_df.drop(pivot_df.loc[pivot_df['any']==1].index)
#zero_df

In [None]:
#zero_df = zero_df.sample(47166)

In [None]:
#sample_df = pd.concat([zero_df, one_df])
#sample_df

In [None]:
#from sklearn.utils import shuffle
#sample_df = shuffle(sample_df)

In [None]:
# Hold out 15% of the pivoted rows for validation. random_state is pinned so
# the split does not depend on how much of NumPy's global RNG stream earlier
# cells happened to consume (e.g. when cells are re-run out of order).
validation_df = pivot_df.sample(int(len(pivot_df) * 0.15), random_state=2019)
validation_df

In [None]:
# Column 1 of validation_df, read positionally, one entry per validation row.
# Extracted in one vectorized step instead of a per-row .iloc lookup; the list
# still holds NumPy scalars in row order.
y_true = list(validation_df.iloc[:, 1].to_numpy())
        


In [None]:
len(y_true)

In [None]:
# Columns 1..6 of every validation row, flattened row by row (row 0's six
# values, then row 1's, ...), matching the order of the nested-loop form.
full_true = list(validation_df.iloc[:, 1:7].to_numpy().ravel())
        


In [None]:
#len(full_true)

In [None]:
training_df = pivot_df[~(pivot_df.filename.isin(validation_df.filename))]
training_df


In [None]:
print(training_df.head())
print(validation_df.head())


In [None]:
def get_pixels_hu(scan):
    """Apply a scan's rescale slope and intercept to its pixel data.

    Parameters
    ----------
    scan : object exposing ``pixel_array``, ``RescaleIntercept`` and
        ``RescaleSlope`` (e.g. a dataset returned by ``pydicom.dcmread``).

    Returns
    -------
    np.ndarray of dtype int16 with a leading axis of length 1 stacked in
    front of the pixel array. ``scan.pixel_array`` itself is not modified.
    """
    hu = np.stack([scan.pixel_array]).astype(np.int16)

    # Pixels stored as exactly -2000 are zeroed before rescaling
    # (presumably an out-of-scan padding value -- TODO confirm).
    hu[hu == -2000] = 0

    rescale_intercept = scan.RescaleIntercept
    rescale_slope = scan.RescaleSlope

    # Only go through float64 when the slope actually changes the values.
    if rescale_slope != 1:
        hu = (rescale_slope * hu.astype(np.float64)).astype(np.int16)

    hu += np.int16(rescale_intercept)

    return np.array(hu, dtype=np.int16)

In [None]:
def apply_window(image, center, width):
    """Return a copy of ``image`` with values limited to a window.

    The window spans ``center - width // 2`` to ``center + width // 2``;
    values below/above it are set to the respective bound. The input array
    is left untouched.
    """
    half_width = width // 2
    lower = center - half_width
    upper = center + half_width

    windowed = image.copy()
    windowed[windowed < lower] = lower
    windowed[windowed > upper] = upper
    return windowed


def apply_window_policy(image):
    """Build a 3-channel image from three intensity windows of ``image``.

    For each (center, width) window the image is clipped with
    ``apply_window``, shifted by the window's lower bound, divided by the
    window width, and then mean-centred. The three results are stacked and
    transposed so the window index becomes the last axis.
    """
    windows = (
        (40, 80),    # brain
        (80, 200),   # subdural
        (40, 380),   # bone
    )

    channels = []
    for center, width in windows:
        lower = center - width // 2
        scaled = (apply_window(image, center, width) - lower) / width
        channels.append(scaled - scaled.mean())

    return np.array(channels).transpose(1, 2, 0)
#maybe try a new function 

In [None]:
def save_and_resize(filenames, load_dir, save_dir='/kaggle/tmp/', size=(299, 299)):
    """Convert DICOM files to windowed, resized PNG images.

    Parameters
    ----------
    filenames : iterable of str
        DICOM file names located in ``load_dir``.
    load_dir : str
        Directory containing the DICOM files.
    save_dir : str, optional
        Output directory (created if missing). Each PNG keeps the source
        name with '.dcm' replaced by '.png'.
    size : tuple of int, optional
        Target size passed to ``cv2.resize``.

    Files whose conversion raises ``ValueError`` or whose PNG cannot be
    written are skipped (best effort); a summary of skipped files is printed
    so that missing images are visible instead of silently absent.
    """
    os.makedirs(save_dir, exist_ok=True)

    skipped = []
    for filename in tqdm(filenames):
        path = os.path.join(load_dir, filename)
        new_path = os.path.join(save_dir, filename.replace('.dcm', '.png'))
        try:
            dcm = pydicom.dcmread(path)
            image = get_pixels_hu(dcm)
            image = apply_window_policy(image[0])
            # Shift each channel so its minimum is 0 before scaling to 8 bits.
            image -= image.min((0, 1))
            image = (255 * image).astype(np.uint8)
            image = cv2.resize(image, size)
        except ValueError:
            # Kept as a skip rather than a crash: some files are known to fail
            # here, and the pipeline is expected to continue without them.
            skipped.append(filename)
            continue

        # cv2.imwrite reports failure through its return value, not an exception.
        if not cv2.imwrite(new_path, image):
            skipped.append(filename)

    if skipped:
        print("save_and_resize: skipped %d file(s), e.g. %s" % (len(skipped), skipped[:5]))

In [None]:
save_and_resize(filenames=sample_files, load_dir=BASE_PATH + TRAIN_DIR)
save_and_resize(filenames=dcm_df.filename, load_dir=BASE_PATH + TEST_DIR)

In [None]:
def create_model():
    """Build the classifier: an Xception backbone with a 6-unit sigmoid head.

    The backbone is created with ImageNet weights, no top, and a
    (299, 299, 3) input. Its output goes through global average pooling and
    dropout (rate 0.15) before the final Dense layer.
    """
    backbone = Xception(weights='imagenet', include_top=False, input_shape=(299, 299, 3))
    pooled = GlobalAveragePooling2D()(backbone.output)
    regularized = Dropout(0.15)(pooled)
    outputs = Dense(6, activation='sigmoid')(regularized)
    return Model(inputs=backbone.input, outputs=outputs)

In [None]:
LR = 0.00005
model = create_model()

In [None]:
#from keras.utils.vis_utils import plot_model
#plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
model.compile(optimizer = Adam(learning_rate = LR), 
              loss = 'binary_crossentropy', # <- requires balance/ Binary for unbalanced
              metrics = [tf.keras.metrics.SensitivityAtSpecificity(0.5)]) #run both 

In [None]:
from keras_preprocessing.image import ImageDataGenerator

In [None]:
test_df

In [None]:
BATCH_SIZE = 16 # had to revert back to 16 to have a comparaison point with the large model I ran locally 

def create_datagen():
    """Return an ImageDataGenerator constructed with its default arguments."""
    datagen = ImageDataGenerator()
    return datagen

def create_test_gen():
    """Return an unshuffled, label-free iterator over the PNGs in png_test_df."""
    flow_options = dict(
        directory='/kaggle/tmp/',
        x_col='filename',
        class_mode=None,          # no labels are yielded
        target_size=(299, 299),
        batch_size=BATCH_SIZE,
        shuffle=False,            # keep the row order of png_test_df
    )
    return ImageDataGenerator().flow_from_dataframe(png_test_df, **flow_options)

def create_train_gen(datagen):
    """Return an iterator over training_df yielding images and six raw label columns."""
    label_columns = ['any', 'epidural', 'intraparenchymal',
                     'intraventricular', 'subarachnoid', 'subdural']
    flow_options = dict(
        directory='/kaggle/tmp/',
        x_col='filename',
        y_col=label_columns,
        class_mode='raw',
        target_size=(299, 299),
        batch_size=BATCH_SIZE,
    )
    return datagen.flow_from_dataframe(training_df, **flow_options)
def create_val_gen(datagen):
    """Return an unshuffled iterator over validation_df with six raw label columns."""
    label_columns = ['any', 'epidural', 'intraparenchymal',
                     'intraventricular', 'subarachnoid', 'subdural']
    flow_options = dict(
        directory='/kaggle/tmp/',
        x_col='filename',
        y_col=label_columns,
        class_mode='raw',
        target_size=(299, 299),
        batch_size=BATCH_SIZE,
        shuffle=False,   # keep the row order of validation_df
    )
    return datagen.flow_from_dataframe(validation_df, **flow_options)

# Using original generator
data_generator = create_datagen()
train_gen = create_train_gen(data_generator)
val_gen = create_val_gen(data_generator)
test_gen = create_test_gen()

In [None]:
model.summary()

In [None]:
# Keep the best model (lowest validation loss) on disk.
# NOTE(review): the file name says "effnetb4" although create_model builds an
# Xception network; the name is left unchanged so existing paths still resolve.
checkpoint = ModelCheckpoint(
    'effnetb4.h5', 
    monitor='val_loss', 
    verbose=0, 
    save_best_only=True, 
    save_weights_only=False,
    mode='auto'
)
# patience=0: training stops at the first epoch whose val_loss does not improve.
Early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=1, 
                                              mode='auto', baseline=None, restore_best_weights=False)
#train_length = len(train_df)
# One epoch covers a quarter of the sampled files.
total_steps = sample_files.shape[0] // BATCH_SIZE
total_steps = total_steps // 4
# Step counts must be whole numbers; `total_steps * 0.15` is a float
# (e.g. 937.5), so truncate it and keep at least one validation step.
validation_steps = max(1, int(total_steps * 0.15))
history = model.fit_generator(
    train_gen,
    steps_per_epoch = total_steps,
    validation_data=val_gen,
    validation_steps=validation_steps,
    callbacks=[checkpoint, Early_stop],
    epochs=10
)

In [None]:
acc = history.history['sensitivity_at_specificity']
val_acc = history.history['val_sensitivity_at_specificity']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

# Create the figure first and save that same figure. Calling plt.figure()
# after plotting opens a new, empty figure, so 'acc.png' used to be blank and
# the loss curves were drawn onto that empty figure.
fig, ax_sens = plt.subplots()
ax_sens.plot(epochs, acc, 'b', label='Training Sens')
ax_sens.plot(epochs, val_acc, 'g', label='Validation Sens')
ax_sens.set_xlabel('Epochs')
# The tracked metric is sensitivity-at-specificity, not accuracy.
ax_sens.set_ylabel('Sensitivity')
ax_sens.set_title('Training and validation sensitivity')
ax_sens.legend()
fig.savefig('acc.png')

fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'b', label='Training loss')
ax_loss.plot(epochs, val_loss, 'g', label='Validation loss')
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.set_title('Training and validation loss')
ax_loss.legend()

plt.show()

Idea here: make predictions on the validation set, then for each image load the image, the prediction, and the labels in validation_df.

In [None]:
test_preds = model.predict_generator(test_gen, verbose = 1)

In [None]:
test_preds

In [None]:
test_df = test_df.join(pd.DataFrame(test_preds, columns=[
    'any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural'
]))


In [None]:
test_df = test_df.melt(id_vars=['filename'])

# Combine the filename column with the variable column.
# Strip either extension: the filenames here may end in '.dcm' rather than
# '.png' (dcm_df is assigned from test_df without a copy and then renamed),
# and removing only '.png' would produce IDs such as 'ID_xxx.dcm_any'.
image_ids = test_df.filename.str.replace(r'\.(png|dcm)$', '', regex=True)
test_df['ID'] = image_ids + '_' + test_df.variable
test_df['Label'] = test_df['value']

In [None]:
test_df[['ID', 'Label']].to_csv('submission.csv', index=False)

In [None]:
val_preds = model.predict_generator(val_gen, verbose = 1)


In [None]:
val_preds

In [None]:
#y_preds = []
#for i in range(len(val_preds)):
#    y_preds.append(0)
#    for value in val_preds[i]: 
#        if value > 0.5: 
#            y_preds[i] = 1
#            break
            
        
#len(y_preds)


In [None]:
#from sklearn.metrics import roc_curve
#fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_true, y_preds)

In [None]:
#from sklearn.metrics import auc
#auc_keras = auc(fpr_keras, tpr_keras)

In [None]:
#plt.figure(1)
#plt.plot([0, 1], [0, 1], 'k--')
#plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))

#plt.xlabel('False positive rate')
#plt.ylabel('True positive rate')
#plt.title('ROC curve')
#plt.legend(loc='best')
#plt.show()

In [None]:
#from sklearn.metrics import confusion_matrix
#print('2*2 Confusion Matrix')
#print(confusion_matrix(y_true, y_preds))
#cm = confusion_matrix(y_true, y_preds)

In [None]:
import itertools   
def plot_confusion_matrix(cm, classes,
                        normalize=False,
                        title='Confusion matrix',
                        cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Parameters
    ----------
    cm : 2-D array of counts (rows are true labels, columns predicted labels,
        as used for the axis labels below).
    classes : sequence of tick labels, one per class.
    normalize : if True, each row is divided by its row sum before printing
        and plotting.
    title : plot title.
    cmap : colormap passed to ``plt.imshow``.
    """
    # Normalize BEFORE drawing so the image, the cell text and the printed
    # matrix all show the same values.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Normalized values are fractions; two decimals keep cells readable.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
cm_labels = ['no hemorrhage', 'has hemorrhage']

In [None]:
#plot_confusion_matrix(cm=cm, classes=cm_labels, title='Confusion Matrix')

In [None]:
# One prediction row per validation image, in val_gen order (val_gen is
# created with shuffle=False). This has to be defined here: the cells below
# read predictions_list and would otherwise raise NameError.
predictions_list = list(val_preds)

len(predictions_list)

In [None]:
validation_frame = validation_df.drop(['filename'], axis=1)
validation_frame 

In [None]:
len(validation_frame) 

In [None]:
if len(predictions_list) == len(validation_frame): 
    validation_frame.iloc[:,:] = predictions_list
else: 
    print("fix this issue")
        

In [None]:
validation_frame.insert(0, "filename", validation_df.filename)
validation_frame.insert(7, "true_any" ,validation_df.iloc[:,1])
validation_frame.insert(8, "true_epidural", validation_df.epidural)
validation_frame.insert(9, "true_intraparenchymal", validation_df.intraparenchymal)
validation_frame.insert(10, "true_intraventricular", validation_df.intraventricular)
validation_frame.insert(11, "true_subarachnoid", validation_df.subarachnoid)
validation_frame.insert(12, "true_subdural", validation_df.subdural)

In [None]:
validation_frame

In [None]:
# Inspect at most the first 100 validation rows and print those whose first
# prediction column (position 1) exceeds 0.8. Bounded by the frame length so
# a short validation set does not raise IndexError.
for i in range(min(100, len(validation_frame))): 
    if validation_frame.iloc[i,1] > 0.8: 
        print("ID is : " + str(validation_frame.iloc[i,0]))
        # Columns 1-6 hold the predictions, columns 7-12 the true labels.
        for j in range(1,7): 
            print("predicition = " +  str(validation_frame.iloc[i,j]) )
        for k in range(7,13): 
            print("true predicition = " +  str(validation_frame.iloc[i,k]))
# activation map
# activation map

In [None]:
any_preds = validation_frame['any']
max_index = any_preds.idxmax()
max_index

In [None]:
def img_to_heatmap(frame=None):
    """Return the filename of the most confident true-positive 'any' prediction.

    Parameters
    ----------
    frame : DataFrame with 'filename', 'any' (predicted score) and 'true_any'
        columns. Defaults to the notebook-level ``validation_frame``.

    Returns
    -------
    The 'filename' of the row with the highest 'any' score among rows whose
    'true_any' equals 1, or None when there is no such row.

    Restricting the search to true positives means a false positive at the
    overall maximum no longer makes the function return None (which later
    turned into the non-existent path '/kaggle/tmp/None').
    """
    if frame is None:
        frame = validation_frame

    true_positives = frame[frame['true_any'] == 1]
    if true_positives.empty:
        return None

    best_index = true_positives['any'].idxmax()
    return true_positives.loc[best_index, 'filename']

In [None]:
highest_predicted_img =  img_to_heatmap()
highest_predicted_img

In [None]:
test_df

In [None]:
test_df

In [None]:
# One entry per row of test_preds, in the same order.
predictions_list_test = list(test_preds)



In [None]:
test_frame =  test_sample_df.drop(['filename'], axis=1)
test_frame

In [None]:
test_frame.iloc[:,:] = predictions_list_test
test_frame

In [None]:
test_sample_df = test_sample_df.stack().reset_index()
test_sample_df

In [None]:
test_frame.insert(0, "filename", test_df.filename)
test_frame

In [None]:
# Aliased so this import cannot rebind the name `Image`, which the Grad-CAM
# cells of this notebook use for IPython.display.Image.
from PIL import Image as PILImage

# Look at (at most) the first 20 test images and show each one for every
# hemorrhage type whose predicted probability exceeds 0.8.
for i in range(min(20, len(test_frame))): 
  
    for j in range(1,7): 
        if test_frame.iloc[i,j] > 0.8: 
            path = '/kaggle/tmp/' + str(test_frame.iloc[i,0])
            img = PILImage.open(path)
            plt.imshow(img)
            print(str(test_frame.iloc[i,0]) + " has a probability: "  + str(test_frame.iloc[i,j]) + " for a '" + str(test_frame.columns[j]) + "' type of hemorrhage")
            plt.show()

In [None]:
#heatmap 
#The code used to show the heatmake was taken from: https://keras.io/examples/vision/grad_cam/
#Only slightly modified to fit this workflow and return the image with the highest predicition from the validtion set 
from IPython.display import Image, display

preprocess_input = keras.applications.xception.preprocess_input
decode_predictions = keras.applications.xception.decode_predictions

last_conv_layer_name = "block14_sepconv2_act"

img_path = '/kaggle/tmp/' + str(highest_predicted_img)
display(Image(img_path))

In [None]:
def get_img_array(img_path, size):
    """Load the image at ``img_path`` as a one-image batch array.

    The image is loaded with ``target_size=size``, converted to an array, and
    given a leading batch axis of length 1.
    """
    pil_image = keras.preprocessing.image.load_img(img_path, target_size=size)
    pixels = keras.preprocessing.image.img_to_array(pil_image)
    # Prepend the batch dimension expected by the model.
    return np.expand_dims(pixels, axis=0)

def make_gradcam_heatmap(img_array, model, last_conv_layer_name, pred_index=None):
    """Compute a Grad-CAM heatmap for one input batch.

    Parameters
    ----------
    img_array : model input batch (the first element is the one visualized).
    model : Keras model containing a layer named ``last_conv_layer_name``.
    last_conv_layer_name : name of the layer whose activations are weighted.
    pred_index : output index to explain; defaults to the arg-max of the
        first prediction.

    Returns
    -------
    NumPy array: the heatmap with negative values clipped to 0, divided by
    the maximum of the unclipped heatmap.
    """
    # Model mapping the input to (conv-layer activations, predictions).
    conv_layer = model.get_layer(last_conv_layer_name)
    grad_model = tf.keras.models.Model(
        [model.inputs], [conv_layer.output, model.output]
    )

    # Record the forward pass so the chosen score can be differentiated
    # with respect to the conv-layer activations.
    with tf.GradientTape() as tape:
        conv_maps, preds = grad_model(img_array)
        if pred_index is None:
            pred_index = tf.argmax(preds[0])
        class_score = preds[:, pred_index]

    grads = tape.gradient(class_score, conv_maps)

    # One weight per feature-map channel: the gradient averaged over the
    # batch and both spatial axes.
    channel_weights = tf.reduce_mean(grads, axis=(0, 1, 2))

    # Weighted sum of the first sample's feature maps across channels.
    weighted_maps = conv_maps[0] @ channel_weights[..., tf.newaxis]
    heatmap = tf.squeeze(weighted_maps)

    # Clip negatives and scale by the maximum for visualization.
    heatmap = tf.maximum(heatmap, 0) / tf.math.reduce_max(heatmap)
    return heatmap.numpy()

In [None]:
img_size = (299, 299)

# Feed the image exactly as the model saw it during training: the data
# generators are plain ImageDataGenerator() instances with no rescaling or
# preprocessing function, so applying Xception's preprocess_input here would
# put the pixels on a different scale from the one the weights were fitted to.
img_array = get_img_array(img_path, size=img_size)

# Grad-CAM is computed on the pre-activation scores, so the output activation
# is removed only for the duration of the heatmap computation and then
# restored; otherwise `model` would keep returning raw scores afterwards.
output_layer = model.layers[-1]
trained_activation = output_layer.activation
output_layer.activation = None
try:
    heatmap = make_gradcam_heatmap(img_array, model, last_conv_layer_name)
finally:
    output_layer.activation = trained_activation

# Display heatmap
plt.matshow(heatmap)
plt.show()


In [None]:
import matplotlib.cm as cm
def save_and_display_gradcam(img_path, heatmap, cam_path="cam.jpg", alpha=0.4):
    """Overlay a Grad-CAM heatmap on an image, save it, and display it.

    Parameters
    ----------
    img_path : path of the original image.
    heatmap : 2-D array with values in [0, 1] (as returned by
        ``make_gradcam_heatmap``).
    cam_path : where the superimposed image is written.
    alpha : weight of the colorized heatmap when added to the image.
    """
    # Imported locally under an alias: the notebook also imports PIL's Image,
    # and whichever import ran last would otherwise decide what `Image` means.
    from IPython.display import Image as IPyImage, display

    # Load the original image
    img = keras.preprocessing.image.load_img(img_path)
    img = keras.preprocessing.image.img_to_array(img)

    # Rescale heatmap to a range 0-255
    heatmap = np.uint8(255 * heatmap)

    # Use jet colormap to colorize heatmap. plt.get_cmap is used because
    # matplotlib.cm.get_cmap is deprecated and absent from recent releases.
    jet = plt.get_cmap("jet")

    # Use RGB values of the colormap
    jet_colors = jet(np.arange(256))[:, :3]
    jet_heatmap = jet_colors[heatmap]

    # Create an image with RGB colorized heatmap, resized to the original image
    jet_heatmap = keras.preprocessing.image.array_to_img(jet_heatmap)
    jet_heatmap = jet_heatmap.resize((img.shape[1], img.shape[0]))
    jet_heatmap = keras.preprocessing.image.img_to_array(jet_heatmap)

    # Superimpose the heatmap on original image
    superimposed_img = jet_heatmap * alpha + img
    superimposed_img = keras.preprocessing.image.array_to_img(superimposed_img)

    # Save the superimposed image
    superimposed_img.save(cam_path)

    # Display Grad CAM
    display(IPyImage(cam_path))


In [None]:
save_and_display_gradcam(img_path, heatmap)