# **Exploratory Data Analysis Notebook**

This notebook will look at the histopathologic cancer detection images and how they are represented in our final model.

# Import namespaces

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import matplotlib.image as mpimg

from sklearn.model_selection import train_test_split

import pickle

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import os

# Load dataset

In [None]:
# Read the label table into the DataFrame 'train' (every column as str),
# report its shape, and preview the first rows.
LABELS_CSV = '../input/histopathologic-cancer-detection/train_labels.csv'

train = pd.read_csv(LABELS_CSV, dtype=str)

print('Training Set Size:', train.shape)

train.head()

Let's update the dataset to include filename extensions.

In [None]:
# Append the '.tif' extension so each id matches its image filename.
# The endswith guard makes the cell idempotent: re-running it no longer
# turns 'name.tif' into 'name.tif.tif'.
train['id'] = train['id'].apply(
    lambda name: name if str(name).endswith('.tif') else f'{name}.tif'
)
train.head()

# Label Distribution

In [None]:
# Fraction of rows carrying each label, displayed as a one-row table.
label_counts = train.label.value_counts()
(label_counts / len(train)).to_frame().sort_index().T

# View Sample of Images

In [None]:
# Show a 4x4 grid of randomly drawn training images with their class labels.

train_path = "../input/histopathologic-cancer-detection/train"
print('Training Images:', len(os.listdir(train_path)))

# Fixed seed so the same 16 images are drawn on every run; the index is
# renumbered 0..15 so it can be used directly as the subplot position.
sample = train.sample(n=16, random_state=1).reset_index(drop=True)

plt.figure(figsize=(8,8))

for i, row in sample.iterrows():

    # Reuse train_path instead of repeating the directory literal.
    img = mpimg.imread(os.path.join(train_path, row['id']))
    label = row['label']

    plt.subplot(4,4,i+1)
    plt.imshow(img)
    plt.text(0, -5, f'Class {label}', color='k')

    plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
import cv2

In [None]:
# Example of images for Class = 1
img_names = train[train['label']=='1']['id'][:16]

plt.figure(figsize=[8,8])
for i, img_name in enumerate(img_names, start=1):
    # mpimg.imread already returns the channels in RGB order, so the array is
    # displayed as-is. The previous [..., [2, 1, 0]] reversal (a BGR->RGB fix
    # that only applies to cv2.imread) swapped red and blue in the figure.
    img = mpimg.imread("../input/histopathologic-cancer-detection/train/%s" % img_name)
    plt.subplot(4, 4, i)
    plt.imshow(img)
    plt.text(0, -5, 'Class 1', color='k')
    plt.axis('off')
plt.show()

In [None]:
# Example of images for Class = 0
img_names0 = train[train['label']=="0"]['id'][:16]

plt.figure(figsize=[8,8])
for i, img_name0 in enumerate(img_names0, start=1):
    # mpimg.imread already returns the channels in RGB order, so the array is
    # displayed as-is. The previous [..., [2, 1, 0]] reversal (a BGR->RGB fix
    # that only applies to cv2.imread) swapped red and blue in the figure.
    img = mpimg.imread("../input/histopathologic-cancer-detection/train/%s" % img_name0)
    plt.subplot(4, 4, i)
    plt.imshow(img)
    plt.text(0, -5, 'Class 0', color='k')
    plt.axis('off')
plt.show()

# Data Generators

In [None]:
# Split the labelled ids into training and validation frames (20% held out),
# stratified on the label and seeded for reproducibility.
RANDOM_SEED = 1982
train_df, valid_df = train_test_split(
    train,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=train.label,
)

for split_frame in (train_df, valid_df):
    print(split_frame.shape)

In [None]:
# Image data generators for the training and validation sets.
# Both scale pixel values by a factor of 1/255; only the training generator
# additionally applies random augmentation.

augmentation_options = dict(
    vertical_flip=True,
    horizontal_flip=True,
    rotation_range=90,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    channel_shift_range=0.1,
    fill_mode='nearest',
)

train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)

valid_datagen = ImageDataGenerator(rescale=1/255)

In [None]:
# Batch loaders that read the images listed in each frame from train_path.

BATCH_SIZE = 64

# Settings shared by both loaders; only the generator and the frame differ.
loader_options = dict(
    directory=train_path,
    x_col='id',
    y_col='label',
    batch_size=BATCH_SIZE,
    seed=1,
    shuffle=True,
    class_mode='categorical',
    target_size=(96,96),
)

train_loader = train_datagen.flow_from_dataframe(dataframe=train_df, **loader_options)

valid_loader = valid_datagen.flow_from_dataframe(dataframe=valid_df, **loader_options)

In [None]:
#Look at some augmented images
def plotImages(images_arr):
    """Draw images on a 3x5 grid of axes and show the figure.

    images_arr: iterable of images accepted by ``Axes.imshow``. Images and
    axes are paired in order; pairing stops at whichever runs out first, so
    at most 15 images are drawn.
    """
    fig, grid = plt.subplots(3, 5, figsize=(10,10))
    flat_axes = grid.flatten()
    for picture, ax in zip(images_arr, flat_axes):
        ax.imshow(picture)
    plt.tight_layout()
    plt.show()
    
    
# Read batch 0 of the training loader 15 times and keep its first image each
# time (presumably a differently augmented draw on each read - confirm).
augmented_images = []
for draw_ix in range(15):
    batch_images = train_loader[0][0]
    augmented_images.append(batch_images[0])
plotImages(augmented_images)

In [None]:
# Number of batches in each loader.
TR_STEPS, VA_STEPS = len(train_loader), len(valid_loader)

print(TR_STEPS)
print(VA_STEPS)

# Load Model

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.image as mpimg

In [None]:
from keras.applications.vgg16 import VGG16

In [None]:
# Load the saved final model from its .h5 file and print its architecture.
FINAL_MODEL_PATH = '../input/finalmodel/LP_HCD_VGG16_Model.h5'
cnn = keras.models.load_model(FINAL_MODEL_PATH)
cnn.summary()

In [None]:
# VGG16 with ImageNet weights and without its top (classifier) layers,
# configured for 96x96 inputs with 3 channels.
vgg16_options = dict(
    weights='imagenet',
    include_top=False,
    input_shape=(96, 96, 3),
)
model = VGG16(**vgg16_options)

In [None]:
# Print the layer-by-layer summary of the VGG16 base held in `model`.
model.summary()

# **Visualize Filters**

**Visualizing 6 filters out of 64 from the first layer of the VGG16 Model**

The dark squares indicate small or inhibitory weights and the light squares represent large or excitatory weights. 

In [None]:
# Extract the filters of the FIRST convolutional layer of the model.
# Without the `break` the loop kept overwriting `filters`, so the plot that
# follows showed the weights of the last convolutional layer instead.
for layer in model.layers:
    if 'conv' in layer.name:
        weights, bias = layer.get_weights()

        # Normalize filter values between 0 and 1 for visualization.
        f_min, f_max = weights.min(), weights.max()
        filters = (weights - f_min) / (f_max - f_min)
        break

In [None]:
from matplotlib import pyplot

# One row per filter, one column per input channel (first three channels).
n_filters = 6
fig = pyplot.figure(figsize=(20,15))
for i in range(n_filters):
    # i-th filter of the layer held in `filters`
    f = filters[:,:,:,i]
    for j in range(3):
        # Subplot positions run row by row, starting at 1.
        panel = 3 * i + j + 1
        pyplot.subplot(n_filters, 3, panel)
        pyplot.imshow(f[:,:,j])
# plot the filters
pyplot.show()

# **Visualize Feature Maps**

The activation maps, called feature maps, capture the result of applying the filters to input, such as the input image or another feature map.

The idea of visualizing a feature map for a specific input image would be to understand what features of the input are detected or preserved in the feature maps. The expectation would be that the feature maps close to the input detect small or fine-grained detail, whereas feature maps close to the output of the model capture more general features.

In [None]:
# Split the label table by class; these frames are reused by later cells.
cancer_data = train[train.label == "1"]
non_cancer_data = train[train.label == "0"]

# One example image per class.
row0 = non_cancer_data.iloc[6]
img0 = mpimg.imread(f'../input/histopathologic-cancer-detection/train/{row0.id}')

row1 = cancer_data.iloc[4]
img1 = mpimg.imread(f'../input/histopathologic-cancer-detection/train/{row1.id}')

# Show the two examples side by side, each with its caption.
captioned = [(img0, 'No Cancer'), (img1, 'Cancer')]
for position, (picture, caption) in enumerate(captioned, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(picture)
    plt.text(0, -2, caption, color='k')
    plt.axis('off')

plt.show()

In [None]:
# Give each example image a leading batch axis of size 1 and divide by 255,
# matching the 1/255 rescaling used by the data generators.
tensor0, tensor1 = (picture.reshape(1,96,96,3)/255 for picture in (img0, img1))

In [None]:
def display_layer(layer_index, activations, cmap):
    """Plot every channel of one layer's activations as a grid of images.

    layer_index: position of the layer in ``model.layers`` and in
        ``activations``.
    activations: sequence of per-layer activation arrays; the selected entry
        is indexed as ``[0, :, :, channel]``, i.e. only the first item of the
        batch is shown.
    cmap: matplotlib colormap passed to ``plt.imshow``.

    Reads the global ``model`` for the layer name that is printed above the
    figure.
    """
    layer_activations = activations[layer_index]
    n_filters = layer_activations.shape[-1]

    n_cols = 8
    # Ceiling division (at least one row). Floor division produced 0 rows for
    # layers with fewer than 8 channels and too few rows whenever the channel
    # count was not a multiple of 8, which made plt.subplot raise.
    n_rows = max(1, -(-n_filters // n_cols))

    print(f'{model.layers[layer_index].name} - {n_filters} Filters')
    plt.figure(figsize=[2*n_cols, 2*n_rows])

    for i in range(n_filters):
        img = layer_activations[0,:,:,i]
        plt.subplot(n_rows, n_cols, i+1)
        plt.imshow(img, cmap=cmap)
        plt.axis('off')
    plt.show()


def display_activations(img_tensor, layer_indices=(), cmap='viridis'):
    """Plot the activations of selected layers of the global ``model``.

    img_tensor: input passed to the activation model.
    layer_indices: iterable of positions in ``model.layers`` to display, in
        the order given. The default is an immutable empty tuple (nothing is
        plotted) rather than a shared mutable list.
    cmap: matplotlib colormap forwarded to ``display_layer``.
    """
    # A model that returns the output of every layer for the given input.
    layer_outputs = [layer.output for layer in model.layers]
    activation_model = tf.keras.models.Model(inputs=model.inputs, outputs=layer_outputs)
    activations = activation_model(img_tensor)

    for i in layer_indices:
        display_layer(i, activations, cmap)

# Cancerous Feature Map

Feature map for the block1_conv1 and block1_conv2 filters.

In [None]:
# Activations of layers 1 and 2 of `model` for the cancerous example image.
display_activations(tensor1, [1,2], cmap='viridis')

# Non Cancerous Feature Map

In [None]:
# Activations of layers 1 and 2 of `model` for the non-cancerous example image.
display_activations(tensor0, [1,2], cmap='viridis')

# **Class Activation Map**

This technique involves creating heatmaps that show you which part of an image the network is most interested in when determining its classification. For any image, you can create one heatmap for each class. The heatmap will tell you which part of the image most strongly indicates the presence of the class in question.


In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
import pickle
import cv2
from tqdm import tqdm 
import matplotlib as mpl

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import zipfile 

# Heatmap Function

In [None]:
def create_grad_model(model):
    """Build a model that returns feature maps alongside the model output.

    The layers of ``model`` are scanned from last to first and the first one
    whose ``output_shape`` has four dimensions is selected. The returned Keras
    model takes the same inputs as ``model`` and outputs
    ``[selected layer output, model output]``.

    Raises:
        ValueError: if no layer of ``model`` has a 4-D output. Previously this
            case surfaced as an UnboundLocalError on ``last_conv_layer``.
    """
    last_conv_layer = None
    for layer in reversed(model.layers):
        if len(layer.output_shape) == 4:
            last_conv_layer = layer.name
            break

    if last_conv_layer is None:
        raise ValueError('model has no layer with a 4-D output to build a heatmap from')

    grad_model = tf.keras.models.Model(
        inputs=[model.inputs],
        outputs=[model.get_layer(last_conv_layer).output, model.output])

    return grad_model

def compute_heatmap(image, class_ix, grad_model):
    """Compute a gradient-weighted activation heatmap for one image.

    image: batch passed to ``grad_model`` after casting to float32; axes 1 and
        2 are used as the height and width of the returned heatmap.
    class_ix: index into the second axis of the model's predictions whose
        value is differentiated.
    grad_model: model returning ``(feature maps, predictions)``, as built by
        ``create_grad_model``.

    Returns the heatmap of the first item in the batch as a NumPy array
    resized with ``cv2.resize`` to the spatial size of ``image``.
    """
    with tf.GradientTape() as tape:
        batch = tf.cast(image, tf.float32)
        feature_maps, predictions = grad_model(batch)
        class_score = predictions[:, class_ix]
    gradients = tape.gradient(class_score, feature_maps)

    # Keep a gradient only where both the activation and the gradient are
    # positive; everything else is zeroed.
    positive_activations = tf.cast(feature_maps > 0, "float32")
    positive_gradients = tf.cast(gradients > 0, "float32")
    guided = (positive_activations * positive_gradients * gradients)[0]

    # One weight per channel: the spatial mean of its guided gradients.
    channel_weights = tf.reduce_mean(guided, axis=(0, 1))

    # Weighted sum of the first item's feature maps over the channel axis.
    cam = tf.reduce_sum(tf.multiply(channel_weights, feature_maps[0]), axis=-1)

    height, width = image.shape[1], image.shape[2]
    return cv2.resize(cam.numpy(), (width, height))

# Example: First Heatmap

In [None]:
# Create Gradient Model
# NOTE(review): `model` is the VGG16 base built with include_top=False, not the
# loaded classifier `cnn`, yet class index 1 is passed to compute_heatmap
# below - confirm which network the heatmaps are meant to explain.
gm = create_grad_model(model)

# Select the first training image, add a batch axis, divide by 255, and
# compute its heatmap.
filename = train.id[0]
img = mpimg.imread(f'../input/histopathologic-cancer-detection/train/{filename}')
tensor = img.reshape((1,) + img.shape) / 255
heatmap = compute_heatmap(tensor, 1, gm)

plt.figure(figsize=[9,3])

# Each panel is a list of (array, imshow options) layers drawn in order:
# the image, the heatmap, and the heatmap overlaid on the image.
panels = [
    [(img, {})],
    [(heatmap, {'cmap': 'coolwarm'})],
    [(img, {'alpha': 0.8, 'cmap': 'binary_r'}),
     (heatmap, {'alpha': 0.6, 'cmap': 'coolwarm'})],
]
for position, layers in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    for picture, options in layers:
        plt.imshow(picture, **options)
    plt.axis('off')

plt.show()

# Displaying Multiple Heatmaps


In [None]:
def get_heatmap_dist(df, class_ix, gm):
    """Collect the heatmap values of every image listed in ``df``.

    df: DataFrame with an ``id`` column holding image filenames inside the
        training image folder.
    class_ix: class index forwarded to ``compute_heatmap``.
    gm: gradient model forwarded to ``compute_heatmap``.

    Returns a 1-D NumPy array with the flattened heatmaps of all images
    concatenated in row order, or ``None`` when ``df`` has no rows.
    """
    # Collect the pieces and concatenate once at the end; stacking inside the
    # loop re-copied the growing array for every image (quadratic work).
    flattened = []
    # `total` lets the progress bar show how many rows remain.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        img = mpimg.imread(f'../input/histopathologic-cancer-detection/train/{row.id}')
        tensor = img.reshape((1,) + img.shape) / 255
        hm = compute_heatmap(tensor, class_ix, gm)
        flattened.append(hm.flatten())

    if not flattened:
        return None
    return np.concatenate(flattened)

In [None]:
# Pool the heatmap values (class index 1) of a seeded random sample of 1000
# training images; used by the next cell to pick a shared colour scale.
values = get_heatmap_dist(train.sample(1000, random_state=1), 1, gm)

In [None]:
# Shared colour scale for the heatmaps: span the 0.10 to 0.96 quantile range
# of the pooled heatmap values.
low, high = (np.quantile(values, q) for q in (0.10, 0.96))

norm = mpl.colors.Normalize(vmin=low, vmax=high)

for bound in (low, high):
    print(bound)

In [None]:
# Select which images to display
indices = range(12)

for i in indices:
    row = train.iloc[i]
    img = mpimg.imread(f'../input/histopathologic-cancer-detection/train/{row.id}')
    label = row.label

    tensor = img.reshape((1,) + img.shape) / 255
    heatmap = compute_heatmap(tensor, 1, gm)

    # Print the true label above the figure.
    print('Cancer Present' if label == '1' else 'No Cancer')

    plt.figure(figsize=[9,3])

    # Panels: image, heatmap on the shared colour scale, heatmap over image.
    shared_scale = {'cmap': 'coolwarm', 'norm': norm}
    panels = [
        [(img, {})],
        [(heatmap, dict(shared_scale))],
        [(img, {'alpha': 0.6, 'cmap': 'binary_r'}),
         (heatmap, dict(shared_scale, alpha=0.6))],
    ]
    for position, layers in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        for picture, options in layers:
            plt.imshow(picture, **options)
        plt.axis('off')

    plt.show()

# **Distribution of Pixel Channels**

# All Training Images

This shows how the pixel values in each channel are distributed for images with each label (cancerous vs. non-cancerous).

In [None]:
# Root folder of the competition data; later cells append "train/" and an
# image filename to it.
path = "../input/histopathologic-cancer-detection/"

In [None]:
# Lists of per-image channel averages, filled by the two loops that follow.
# Name suffix: first digit is the label (0 = non-cancerous, 1 = cancerous),
# second digit is the channel (0 = red, 1 = green, 2 = blue).

# Non-cancerous: red, green, blue
avg_list_00, avg_list_01, avg_list_02 = [], [], []

# Cancerous: red, green, blue
avg_list_10, avg_list_11, avg_list_12 = [], [], []

In [None]:
#Loop over the cancerous entries

# Start from empty lists so that re-running this cell does not append every
# image a second time and double the histogram counts.
avg_list_10, avg_list_11, avg_list_12 = [], [], []

for i, row in cancer_data.iterrows():
    img = mpimg.imread(f'../input/histopathologic-cancer-detection/train/{row.id}')
    # Average of each colour channel (0, 1, 2) over all pixels of the image.
    redavg, greenavg, blueavg = (np.average(img[:,:,channel]) for channel in range(3))
    avg_list_10.append(redavg)
    avg_list_11.append(greenavg)
    avg_list_12.append(blueavg)
     

In [None]:
#Loop over the non-cancerous entries

# Start from empty lists so that re-running this cell does not append every
# image a second time and double the histogram counts.
avg_list_00, avg_list_01, avg_list_02 = [], [], []

for i, row in non_cancer_data.iterrows():
    img = mpimg.imread(f'../input/histopathologic-cancer-detection/train/{row.id}')
    # Average of each colour channel (0, 1, 2) over all pixels of the image.
    redavg0, greenavg0, blueavg0 = (np.average(img[:,:,channel]) for channel in range(3))
    avg_list_00.append(redavg0)
    avg_list_01.append(greenavg0)
    avg_list_02.append(blueavg0)

In [None]:
# Histograms of the per-image channel averages on a 3x2 grid:
# one row per channel, cancerous on the left and non-cancerous on the right.
fig = plt.figure(figsize=[10,10])

# (values, bar colour, legend entry, title) for subplot positions 1..6.
panels = [
    (avg_list_10, 'red', 'Red_Channel', "Cancerous Red"),
    (avg_list_00, 'red', 'Red_Channel', "Non-cancerous Red"),
    (avg_list_11, 'green', 'Green_Channel', "Cancerous Green"),
    (avg_list_01, 'green', 'Green_Channel', "Non-cancerous Green"),
    (avg_list_12, 'blue', 'Blue_Channel', "Cancerous Blue"),
    (avg_list_02, 'blue', 'Blue_Channel', "Non-cancerous Blue"),
]

for position, (averages, colour, legend_entry, title) in enumerate(panels, start=1):
    plt.subplot(3, 2, position)
    plt.hist(averages, bins = 256, color = colour)
    # Same y-range on every panel so the panels can be compared directly.
    plt.ylim(0, 2000)
    plt.legend([legend_entry])
    plt.title(title)

plt.show()


# One Cancerous Image

In [None]:
# With cv2
cancer_image = cancer_data.iloc[900]['id']
img = cv2.imread(path + "train/" + cancer_image)
# cv2.imread returns None instead of raising when the file cannot be read.
if img is None:
    raise FileNotFoundError(path + "train/" + cancer_image)
# cv2 loads the channels in BGR order; convert to RGB so matplotlib shows the
# true colours and channel 0 really is red in the histogram that follows.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img)
plt.title("Cancer Cell")
plt.show()

In [None]:
# Pixel-intensity histogram of each channel of `img`.
# Assumes img is in RGB channel order (channel 0 = red) - confirm against the
# cell that loads it.
channel_colours = [(0, 'red'), (1, 'Green'), (2, 'Blue')]
for channel, colour in channel_colours:
    plt.hist(img[:, :, channel].ravel(), bins = 256, color = colour)
plt.xlabel('Intensity')
plt.ylabel('Quantity')
plt.legend(['Red_Channel', 'Green_Channel', 'Blue_Channel'])
plt.title("The frequency of the color channels of the cancer cells")
plt.show()

# One Non-Cancerous Image

In [None]:
# Load one non-cancerous example with cv2.
non_cancer_image = non_cancer_data.iloc[500]['id']
img = cv2.imread(path + "train/" + non_cancer_image)
# cv2.imread returns None instead of raising when the file cannot be read.
if img is None:
    raise FileNotFoundError(path + "train/" + non_cancer_image)
# cv2 loads the channels in BGR order; convert to RGB so matplotlib shows the
# true colours and channel 0 really is red in the histogram that follows.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img)
plt.title("Non-Cancerous Cell")
plt.show()

In [None]:
# Pixel-intensity histogram of each channel of `img`.
# Assumes img is in RGB channel order (channel 0 = red) - confirm against the
# cell that loads it.
channel_colours = [(0, 'red'), (1, 'Green'), (2, 'Blue')]
for channel, colour in channel_colours:
    plt.hist(img[:, :, channel].ravel(), bins = 256, color = colour)
plt.xlabel('Intensity')
plt.ylabel('Quantity')
plt.legend(['Red_Channel', 'Green_Channel', 'Blue_Channel'])
plt.title("The frequency of the color channels in the absence of cancer cells")
plt.show()