In [None]:
import os
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2 as cv
from numpy.random import seed
seed(45)
import pickle

from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from skimage import io
from glob import glob 
%matplotlib inline

In [None]:
# Directory holding the training images; later cells read individual images
# from it as `<id>.tif`, using the ids found in the labels CSV.
train_dirname = '/kaggle/input/histopathologic-cancer-detection/train'

# Dataset exploration

In [None]:
# Load the training labels. Later cells rely on two columns: `id` (the image
# file name without the `.tif` extension) and `label` (compared against 0 and 1).
train_labels = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')
train_labels.head()

# Label distribution

In [None]:
# Absolute number of rows for each distinct value of `label`.
train_labels['label'].value_counts()

In [None]:
# Show, as a one-column DataFrame, the fraction of all rows that carry each
# possible value of the target variable (`label`).
train_labels['label'].value_counts().div(len(train_labels)).to_frame()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the labels table.
train_labels.info()

The data is not entirely balanced: there are more negative samples than positive ones, by about 30 percent.

# View Sample Images

In [None]:
def load_rgb_images(image_ids, image_dir=train_dirname):
    """Read `<image_dir>/<id>.tif` for every id and return the images in RGB order.

    OpenCV decodes colour images in BGR channel order, whereas matplotlib's
    `imshow` interprets 3-channel arrays as RGB. Each image is therefore
    converted before being returned, otherwise red and blue would be swapped
    in the plot.

    Args:
        image_ids: iterable of image ids (file names without the extension).
        image_dir: directory that contains the `.tif` files.

    Returns:
        A list with one RGB image array per id, in the order given.

    Raises:
        FileNotFoundError: if an image cannot be read (`cv.imread` signals
            failure by returning `None` instead of raising).
    """
    images = []
    for image_id in image_ids:
        path = os.path.join(image_dir, image_id + '.tif')
        img = cv.imread(path)
        if img is None:
            raise FileNotFoundError(f'Could not read image: {path}')
        images.append(cv.cvtColor(img, cv.COLOR_BGR2RGB))
    return images


def show_sample_row(axes_row, images, edgecolor, row_label):
    """Draw `images` on one row of axes and outline a 32x32 px square on each.

    The square has its corner at pixel (32, 32); this is presumably the centre
    region of the patch (true for 96x96 px images - TODO confirm image size).
    """
    for ax, img in zip(axes_row, images):
        ax.imshow(img)
        rect = patches.Rectangle((32,32),32,32,linewidth=4,edgecolor=edgecolor,facecolor='none', linestyle=':', capstyle='round')
        ax.add_patch(rect)
    axes_row[0].set_ylabel(row_label, size='large')


# Draw four random examples of each class.
positive_samples = train_labels.loc[train_labels['label'] == 1].sample(4)
negative_samples = train_labels.loc[train_labels['label'] == 0].sample(4)
positive_images = load_rgb_images(positive_samples['id'])
negative_images = load_rgb_images(negative_samples['id'])

# Top row: positive samples (green outline); bottom row: negative samples (red outline).
fig,axis = plt.subplots(2,4,figsize=(20,8))
fig.suptitle('Dataset samples presentation plot',fontsize=20)
show_sample_row(axis[0], positive_images, 'g', 'Positive samples')
show_sample_row(axis[1], negative_images, 'r', 'Negative samples')
    

## EDA analysis for each channel

## Visualization of distributions for AVERAGE pixel values

In [None]:
## Visualization of distributions for AVERAGE pixel values

import matplotlib.image as mpimg


def channel_means(image_ids, image_dir=train_dirname):
    """Compute, for every image, the mean value of each of its first three channels.

    Args:
        image_ids: iterable of image ids; each image is read from
            `<image_dir>/<id>.tif`.
        image_dir: directory that contains the `.tif` files.

    Returns:
        A tuple of three lists (channel 0, 1 and 2 - used below as red, green
        and blue). Each list holds one mean per image, rounded to 2 decimals,
        in the order of `image_ids`.
    """
    means = ([], [], [])
    for image_id in image_ids:
        img = mpimg.imread(f'{image_dir}/{image_id}.tif')
        for channel, values in enumerate(means):
            values.append(round(img[:, :, channel].flatten().mean(), 2))
    return means


# Work on a random subset of 10000 labelled images to keep the cell fast.
df = train_labels.sample(n=10000).reset_index()

positive_samples = df.loc[df['label'] == 1]
negative_samples = df.loc[df['label'] == 0]

## per-image channel means for positive samples
pos_r_pixels_mean, pos_g_pixels_mean, pos_b_pixels_mean = channel_means(positive_samples['id'])

## per-image channel means for negative samples
neg_r_pixels_mean, neg_g_pixels_mean, neg_b_pixels_mean = channel_means(negative_samples['id'])

# Number of negative images that were processed.
len(neg_r_pixels_mean)

In [None]:
# Class counts inside the sampled subset `df` (not the full training set).
df['label'].value_counts()

In [None]:
# Histogram of the per-image channel means: positive samples in the left
# column, negative samples in the right column, one row per colour channel.
nr_of_bins = 256 #each possible pixel value will get a bin in the following histograms
# Pin the bin edges to [0, 256) so that every bin is exactly one pixel value
# wide and identical across all panels. Without an explicit range the bins
# span only min..max of each dataset, and `density=True` would no longer equal
# the relative frequency shown on the y axis.
# NOTE(review): assumes 8-bit images (means within 0..255) - confirm.
pixel_range = (0, 256)
fig,axs = plt.subplots(3,2,sharey=True,figsize=(8,8),dpi=150)
fig.suptitle('Histogram of average channel pixels', fontsize= 24)

#RGB channels: (row label, positive means, negative means)
channel_rows = [
    ("Red", pos_r_pixels_mean, neg_r_pixels_mean),
    ("Green", pos_g_pixels_mean, neg_g_pixels_mean),
    ("Blue", pos_b_pixels_mean, neg_b_pixels_mean),
]
for row, (channel_name, pos_means, neg_means) in enumerate(channel_rows):
    axs[row,0].hist(pos_means,bins=nr_of_bins,range=pixel_range,density=True)
    axs[row,1].hist(neg_means,bins=nr_of_bins,range=pixel_range,density=True)
    axs[row,0].set_ylabel("Relative frequency")
    axs[row,1].set_ylabel(channel_name,rotation='horizontal',labelpad=35,fontsize=12)

#Set image labels
axs[0,0].set_title("Positive samples (N =" + str(positive_samples.shape[0]) + ")")
axs[0,1].set_title("Negative samples (N =" + str(negative_samples.shape[0]) + ")");


## Visualization of distributions for ALL pixel values

In [None]:
### A smaller sample is used here because every pixel value of every image is kept
import matplotlib.image as mpimg


def channel_pixels(image_ids, image_dir=train_dirname):
    """Collect the flattened pixel values of the first three channels of every image.

    Args:
        image_ids: iterable of image ids; each image is read from
            `<image_dir>/<id>.tif`.
        image_dir: directory that contains the `.tif` files.

    Returns:
        A tuple of three lists (channel 0, 1 and 2 - used below as red, green
        and blue). Each list holds one 1-D array per image, in the order of
        `image_ids`.
    """
    pixels = ([], [], [])
    for image_id in image_ids:
        img = mpimg.imread(f'{image_dir}/{image_id}.tif')
        for channel, values in enumerate(pixels):
            values.append(img[:, :, channel].flatten())
    return pixels


# NOTE: this rebinds `df`, `positive_samples` and `negative_samples` from the
# previous section to a new, smaller random subset of 1000 images.
df = train_labels.sample(n=1000).reset_index()

positive_samples = df.loc[df['label'] == 1]
negative_samples = df.loc[df['label'] == 0]

## channel pixels for positive samples (one array per image)
pos_r_pixels_all, pos_g_pixels_all, pos_b_pixels_all = channel_pixels(positive_samples['id'])

## channel pixels for negative samples (one array per image)
neg_r_pixels_all, neg_g_pixels_all, neg_b_pixels_all = channel_pixels(negative_samples['id'])

# Number of negative images that were processed.
len(neg_r_pixels_all)

In [None]:
# Histogram of the raw pixel values: positive samples in the left column,
# negative samples in the right column, one row per colour channel.
nr_of_bins = 256 #each possible pixel value will get a bin in the following histograms
custom_xlim = (0, 256)
fig,axs = plt.subplots(3,2,sharey=True,figsize=(8,8))
fig.suptitle('Histogram of actual channel pixels', fontsize= 24)

#RGB channels: (row label, positive pixels, negative pixels)
channel_rows = [
    ("Red", pos_r_pixels_all, neg_r_pixels_all),
    ("Green", pos_g_pixels_all, neg_g_pixels_all),
    ("Blue", pos_b_pixels_all, neg_b_pixels_all),
]
for row, (channel_name, pos_pixels, neg_pixels) in enumerate(channel_rows):
    # Each *_pixels_all variable is a list holding one array per image. Handing
    # that list to hist() directly makes matplotlib draw a separate data series
    # for every image; concatenating first yields a single histogram of all
    # pixel values, with one unit-wide bin per value in [0, 256).
    axs[row,0].hist(np.concatenate(pos_pixels),bins=nr_of_bins,range=custom_xlim)
    axs[row,1].hist(np.concatenate(neg_pixels),bins=nr_of_bins,range=custom_xlim)
    axs[row,0].set_ylabel("Pixels Counts")
    axs[row,1].set_ylabel(channel_name,rotation='horizontal',labelpad=35,fontsize=12)

# Same x range on every panel; the y range is left to matplotlib because the
# pooled counts are far larger than any fixed per-image cap.
plt.setp(axs, xlim=custom_xlim)

#Set image labels
axs[0,0].set_title("Positive samples (N =" + str(positive_samples.shape[0]) + ")")
axs[0,1].set_title("Negative samples (N =" + str(negative_samples.shape[0]) + ")");