In [None]:

import pandas as pd
import numpy as np
import matplotlib.image as mpimg

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

import cv2
import os
import pickle
from IPython.lib.display import Audio
import zipfile 

from sklearn.utils import shuffle   
from sklearn.model_selection import train_test_split 
import shutil   
import matplotlib.pyplot as plt

import plotly.graph_objects as go

import plotly.figure_factory as ff
%matplotlib inline

# Seed every RNG this notebook draws from so that "Restart Kernel -> Run All"
# gives the same results each time. TensorFlow alone is not enough: the image
# preview cell picks rows with np.random.randint.
SEED = 101
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Data root; the trailing slash is kept because file names are concatenated
# directly onto this string.
path = "../input/histopathologic-cancer-detection/"

# Read the training labels and the sample-submission file (used as the test listing).
train, test = (
    pd.read_csv(f"{path}{csv_name}")
    for csv_name in ('train_labels.csv', 'sample_submission.csv')
)

In [None]:
# Row counts of the training and test listings, one per line.
print(len(train), len(test), sep="\n")

In [None]:
# Peek at the first five rows of the training labels.
train.head(n=5)

In [None]:
# `df_data` is an alias of `train` (same object, not a copy), so the id change
# below is visible through both names.
df_data = train
print(df_data.shape)

# Append the ".tif" extension so each id can be used directly as an image file name.
# The endswith guard keeps this cell idempotent: re-running it must not turn
# "abc.tif" into "abc.tif.tif".
train['id'] = df_data['id'].apply(lambda x: x if x.endswith('.tif') else x + '.tif')
df_data.head()

The dataset contains 220,025 training images, which are labeled 0 or 1.
0 is negative, i.e. no cancer, and 1 is positive, i.e. the image contains (cancer) metastases.

The dataset also includes 57,458 test images. The test samples are unlabeled, so they will not be used to build or evaluate our model.

# Label Distribution

In [None]:

# Number of samples per class label.
train['label'].value_counts()

In [None]:
# Share of each label as a percentage of all training rows, rounded to two decimals.
train['label'].value_counts().div(len(train)).to_frame().mul(100).round(2)

The training dataset consists of 130,908 negatives and 89,117 positives, which is approximately 59.5% and 40.5%, respectively, as shown in the table above.

# Visualization of Images

In [None]:
# Show a 3x3 grid of randomly chosen training images, each titled with its label.
fig, axs = plt.subplots(3,3,figsize=(8, 5), dpi=150)

# Rows are drawn from the first 1000 entries only; clamp to the frame length so
# a shorter frame cannot be indexed past its end.
pool_size = min(1000, len(df_data))

for i in range(3):
    for j in range(3):

        tran = np.random.randint(0, pool_size)
        row = df_data.iloc[tran]

        image_path = path + "train/" + row['id']
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread does not raise on a missing/unreadable file; it returns None.
            raise FileNotFoundError(f"Could not read image: {image_path}")

        # OpenCV decodes to BGR channel order, while matplotlib expects RGB.
        # Without this conversion red and blue are swapped in the display.
        axs[i, j].imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        if row['label'] == 1:
            axs[i,j].set_title('Tumor')
        else:
            axs[i,j].set_title('No Cancer')

        axs[i,j].set_xticks([])
        axs[i,j].set_yticks([])

plt.show()

It is difficult to determine the features that distinguish cancer cells from normal cells from the presented pictures. We can see that there are images of cancerous and non-cancerous cells with similar colors and with a large and a small number of round nodes. Let's plot the intensity distribution of each color channel for one example image from each of the two categories.

In [None]:
# with mpimg
# Keep only the positive-label rows, then take the id at position 900 of that subset.
cancer_data = df_data.loc[df_data['label'] == 1]
cancer_image = cancer_data['id'].iloc[900]

# Load the file with matplotlib's reader and display it.
img = mpimg.imread(f"{path}train/{cancer_image}")
plt.imshow(img)
plt.title("Cancer Cell")
plt.show()

In [None]:
# With cv2
cancer_data = df_data[(df_data.label==1)]
cancer_image = cancer_data.iloc[900]['id']
img = cv2.imread(path + "train/" + cancer_image)

# cv2.imread returns channels in BGR order, but plt.imshow interprets a 3-channel
# array as RGB. Convert for display only; `img` itself keeps the raw cv2 layout
# so later cells that read `img` see exactly what cv2 loaded.
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.title("Cancer Cell")
plt.show()

In [None]:
# `img` was loaded with cv2.imread, which stores channels as B, G, R:
# index 0 is blue and index 2 is red. Pick the index that matches each colour
# so the red and blue histograms are not swapped.
# alpha lets all three overlapping histograms remain visible.
plt.hist(img[:, :, 2].ravel(), bins = 256, color = 'red', alpha = 0.5)
plt.hist(img[:, :, 1].ravel(), bins = 256, color = 'green', alpha = 0.5)
plt.hist(img[:, :, 0].ravel(), bins = 256, color = 'blue', alpha = 0.5)
plt.xlabel('Intensity')
plt.ylabel('Quantity')
# Legend entries are in the same order as the hist calls above.
plt.legend(['Red_Channel', 'Green_Channel', 'Blue_Channel'])
plt.title("The frequency of the color channels of the cancer cells")
plt.show()

In [None]:
# Keep only the negative-label rows, then take the id at position 500 of that subset.
non_cancer_data = df_data[(df_data.label==0)]
non_cancer_image = non_cancer_data.iloc[500]['id']

img = cv2.imread(path + "train/" + non_cancer_image)

# cv2.imread returns channels in BGR order, but plt.imshow interprets a 3-channel
# array as RGB. Convert for display only; `img` itself keeps the raw cv2 layout
# so later cells that read `img` see exactly what cv2 loaded.
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.title("No Cancer Cell")
plt.show()

In [None]:
# `img` was loaded with cv2.imread, which stores channels as B, G, R:
# index 0 is blue and index 2 is red. Pick the index that matches each colour
# so the red and blue histograms are not swapped.
# alpha lets all three overlapping histograms remain visible.
plt.hist(img[:, :, 2].ravel(), bins = 256, color = 'red', alpha = 0.5)
plt.hist(img[:, :, 1].ravel(), bins = 256, color = 'green', alpha = 0.5)
plt.hist(img[:, :, 0].ravel(), bins = 256, color = 'blue', alpha = 0.5)
plt.xlabel('Intensity')
plt.ylabel('Quantity')
# Legend entries are in the same order as the hist calls above.
plt.legend(['Red_Channel', 'Green_Channel', 'Blue_Channel'])
plt.title("The frequency of the color channels in the absence of cancer cells")
plt.show()

# Release the example image and the per-class subsets; they are not needed past this point.
del img, non_cancer_data, cancer_data