<a href="https://colab.research.google.com/github/siddheshcn/BinaryImageClassification-CancerDetection/blob/main/ColabOnly_CancerDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Environment Setup

```
# This is formatted as code
```



In [1]:
# Import libraries
import numpy as np
import pandas as pd
np.random.seed(101)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Flatten, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

import os
import cv2
import json

from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import itertools
import shutil
import matplotlib.pyplot as plt
%matplotlib inline
tf.random.set_seed(101)

In [2]:
# Notebook-wide settings
IMAGE_SIZE = 96          # generators resize every image to IMAGE_SIZE x IMAGE_SIZE
IMAGE_CHANNELS = 3
SAMPLE_SIZE = 80_000     # number of images sampled per label

## Set up Google Drive for Kaggle API, Dataset and Checkpoints

In [3]:
# Mount Google Drive at /content/drive (Colab only). Later cells read the
# Kaggle credentials and the dataset archive from paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Drag and drop the kaggle.json file into the Colab file browser first.

# Copy the Kaggle credentials to Google Drive so later sessions can restore them.
# NOTE(review): the source is ~/.kaggle/kaggle.json; the recorded output of this
# cell shows cp failing because that file did not exist - confirm where the
# uploaded file actually lands before relying on this cell.
!cp ~/.kaggle/kaggle.json /content/drive/My\ Drive/UTD/Projects/CancerDetection/kaggle.json

cp: cannot stat '/root/.kaggle/kaggle.json': No such file or directory


In [7]:
# Restore the Kaggle credentials from Google Drive into ~/.kaggle
!mkdir -p ~/.kaggle
!cp /content/drive/My\ Drive/UTD/Projects/CancerDetection/kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only
!chmod 600 ~/.kaggle/kaggle.json

Download Dataset on Google Drive and unzip

In [None]:
# prompt: copy a zip file from google drive 'histopathologic-cancer-detection' to colab files. Then unzip the files here.
src_path = '/content/drive/My\ Drive/UTD/Projects/CancerDetection/histopathologic-cancer-detection.zip'
dest_path = '/content/CancerDetection/'
os.mkdir('/content/CancerDetection/')

!cp -r $src_path $dest_path
!unzip -j /content/CancerDetection/histopathologic-cancer-detection.zip


In [6]:
# Define where the extracted dataset lives. The download and unzip commands
# below are commented out, so running this cell only sets dataset_path.

# set path
dataset_path = '/content/CancerDetection/input/'

# download dataset (disabled)
#kaggle competitions download -c histopathologic-cancer-detection -p $dataset_path

# unzip dataset (disabled)
#!unzip /content/CancerDetection/histopathologic-cancer-detection.zip -d $dataset_path

print("Done")

Archive:  /content/CancerDetection/histopathologic-cancer-detection.zip
replace /content/CancerDetection/input/sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: Done


In [7]:
# Verify that the dataset directory exists by listing it from the shell
!ls $dataset_path

ls: cannot access '/content/CancerDetection/input/': No such file or directory


In [8]:
# Same check from Python; raises FileNotFoundError when dataset_path does not exist
os.listdir(dataset_path)

FileNotFoundError: [Errno 2] No such file or directory: '/content/CancerDetection/input/'

Data Exploration

## Data Exploration

In [None]:
# Root folder of the extracted dataset (train/, test/ and train_labels.csv are read from here)
dataset_path = '/content/CancerDetection/input/'

In [None]:
# Number of files in the train folder, then in the test folder
for split_name in ('train', 'test'):
  split_dir = '/content/CancerDetection/input/' + split_name
  print(len(os.listdir(split_dir)))

### Creating Dataframe of all training images

In [None]:
# Load the training labels: one row per image with its 'id' and 'label'
df_data = pd.read_csv("/content/CancerDetection/input/train_labels.csv")

# Images excluded from the dataset:
#   dd6dfed3... caused a training error previously
#   9369c727... is a black image
EXCLUDED_IDS = [
    'dd6dfed324f9fcb6f93f46f32fc800f2ec196be2',
    '9369c7278ec8bcc6c880d99194de09fc2bd4efbe',
]

# The filtered frame must be assigned back. The previous bare
# `df_data[df_data['id'] != ...]` expressions were evaluated and discarded,
# so both images stayed in df_data.
df_data = df_data[~df_data['id'].isin(EXCLUDED_IDS)]

print(df_data.shape)

In [None]:
#check data distribution
# Number of rows per label, to see how balanced the classes are
df_data['label'].value_counts()

### Check Images
Draw Category Images

    Given a column in a dataframe,
    this function takes a random sample of each class and displays that
    sample on one row. The sample size is the same as figure_cols, which
    is the number of columns in the figure.
    Because the sample is random, each run of the function
    displays different images.

In [None]:
def draw_category_images(col_name, figure_cols, df, IMAGE_PATH):
  """Show a random sample of images for every category in ``df[col_name]``.

  One figure row is drawn per category, with up to ``figure_cols`` images in
  it. The sample is random, so each call displays different images.

  Args:
    col_name: name of the column in ``df`` that holds the category.
    figure_cols: number of images (figure columns) drawn per category.
    df: dataframe with an ``id`` column (image file name without the ``.tif``
      extension) and the ``col_name`` column.
    IMAGE_PATH: directory that contains the ``<id>.tif`` files.

  Raises:
    FileNotFoundError: if a sampled image cannot be read from ``IMAGE_PATH``.
  """
  # Sorted unique categories: same row order as the groupby index used before
  categories = sorted(df[col_name].dropna().unique())
  if not categories:
    return  # nothing to draw; plt.subplots cannot build a zero-row grid

  # squeeze=False always returns a 2-D axes array, so ax[i, j] also works
  # when there is a single category or a single column
  f, ax = plt.subplots(nrows=len(categories),
                       ncols=figure_cols,
                       figsize=(4*figure_cols, 4*len(categories)), #size can be adjusted here
                       squeeze=False)

  # draw a number of images for each category
  for i, cat in enumerate(categories):
    cat_rows = df[df[col_name] == cat]
    # never ask for more rows than the category holds (sample() would raise)
    sample = cat_rows.sample(min(figure_cols, len(cat_rows)))
    for j in range(figure_cols):
      if j >= len(sample):
        ax[i, j].axis('off')  # leave unused cells blank
        continue
      file = os.path.join(IMAGE_PATH, sample.iloc[j]['id'] + '.tif')
      im = cv2.imread(file)
      if im is None:
        # cv2.imread signals an unreadable file by returning None, not by raising
        raise FileNotFoundError(f"Could not read image: {file}")
      # OpenCV loads channels as BGR while matplotlib expects RGB
      ax[i, j].imshow(cv2.cvtColor(im, cv2.COLOR_BGR2RGB), resample=True)
      ax[i, j].set_title(cat, fontsize=16)
  plt.tight_layout()
  plt.show()

In [None]:
# Preview four random training images for each label
IMAGE_PATH = "/content/CancerDetection/input/train/"
draw_category_images(col_name='label', figure_cols=4, df=df_data, IMAGE_PATH=IMAGE_PATH)

### Create Train and Validation Datasets

Get same number of samples of both labels

In [None]:
# Draw the same number of rows from each label so the classes are balanced.
# Capping at the smaller class keeps the cell working (and still balanced)
# when SAMPLE_SIZE is larger than one of the classes.
label_counts = df_data['label'].value_counts()
n_per_label = min(SAMPLE_SIZE, label_counts.get(0, 0), label_counts.get(1, 0))

df_0 = df_data[df_data['label']==0].sample(n_per_label, random_state=101)
df_1 = df_data[df_data['label']==1].sample(n_per_label, random_state=101)

#concat the dataframes
df_data = pd.concat([df_0,df_1], axis=0).reset_index(drop=True)
#shuffle with an explicit seed: without random_state the row order depended on
#how much of the global NumPy random state earlier cells had already consumed
df_data = shuffle(df_data, random_state=101)

df_data['label'].value_counts()

Train and Validation Split:
- Stratify creates a balanced validation set.

In [None]:
# Hold out 10% of the balanced data for validation, stratified on the label
y = df_data['label']
df_train, df_val = train_test_split(
    df_data,
    stratify=y,
    test_size=0.10,
    random_state=101,
)

for split_frame in (df_train, df_val):
  print(split_frame.shape)


Create Directories
Base Directory
* Train Directory
 *   a (no tumor tissue)
 *   b (has tumor tissue)

- Validation Directory
 *   a (no tumor tissue)
 *   b (has tumor tissue)


In [None]:
# Create the directory tree used for training and validation images:
#   base_dir/train_dir/{a_no_tumor, b_has_tumor}
#   base_dir/val_dir/{a_no_tumor, b_has_tumor}

base_dir = 'base_dir'
train_dir = os.path.join(base_dir, 'train_dir')
val_dir = os.path.join(base_dir, 'val_dir')

# os.makedirs(..., exist_ok=True) creates missing parents and does not fail
# when a folder already exists, so this cell can be re-run safely
# (os.mkdir raised FileExistsError on every run after the first).
for split_dir in (train_dir, val_dir):
  no_tumor = os.path.join(split_dir, 'a_no_tumor')
  has_tumor = os.path.join(split_dir, 'b_has_tumor')
  os.makedirs(no_tumor, exist_ok=True)
  os.makedirs(has_tumor, exist_ok=True)

In [None]:
#create directories in colab


In [None]:
#test if directories are created
# List the train directory to confirm the class folders were created
os.listdir('base_dir/train_dir')

In [None]:
#set id as index in df_data
# Use the image id as the index so a label can be looked up with df_data.loc[id, 'label'].
# Guarded and not in-place: once 'id' is the index it is no longer a column,
# so re-running an unguarded set_index('id') raises KeyError.
if 'id' in df_data.columns:
  df_data = df_data.set_index('id')

Testing the next code snippet

In [None]:
# Preview the source path prefix built for one training image.
# The id is taken from df_train directly: train_list is only created in a later
# cell, so referencing it here raised NameError on a fresh top-to-bottom run.
src = '/content/CancerDetection/input/train/' + df_train['id'].iloc[3]
src


OG

In [25]:
!rm -rf base_dir/*

In [None]:
#Get a list of train and val images
train_list = list(df_train['id'])
val_list = list(df_val['id'])

# label value -> class folder name
CLASS_FOLDERS = {0: 'a_no_tumor', 1: 'b_has_tumor'}


def _copy_images(df, dest_root):
  """Copy every image listed in ``df`` into its class folder under ``dest_root``.

  Args:
    df: dataframe with an ``id`` column (file name without ``.tif``) and a
      ``label`` column (0 = no tumor, 1 = has tumor).
    dest_root: directory that holds the ``a_no_tumor`` / ``b_has_tumor`` folders.

  Returns:
    The number of images that could not be copied.

  Raises:
    KeyError: if a label is neither 0 nor 1.
  """
  # Make sure the destination folders exist so a missing folder cannot turn
  # every copy into a (previously silent) failure.
  for folder in CLASS_FOLDERS.values():
    os.makedirs(os.path.join(dest_root, folder), exist_ok=True)

  failed = 0
  # The label is read from the same row as the id, so this no longer depends
  # on df_data having been re-indexed by 'id' in an earlier cell.
  for image, target in zip(df['id'], df['label']):
    fname = image + '.tif'
    src = '/content/CancerDetection/input/train/' + fname
    dst = os.path.join(dest_root, CLASS_FOLDERS[target], fname)
    try:
      shutil.copyfile(src, dst)
    except OSError as err:
      # Only file-system errors are counted; anything else is a real bug and
      # should surface instead of being swallowed by a bare except.
      failed += 1
      if failed <= 5:
        print(f"Could not copy {src}: {err}")
  return failed


#transfer the train images
error_counter_train = _copy_images(df_train, train_dir)

#Transfer the val images
error_counter_val = _copy_images(df_val, val_dir)

In [None]:
# Number of images that failed to copy: train first, then validation
for failed_copies in (error_counter_train, error_counter_val):
  print(failed_copies)

In [None]:
# check how many images are present in each new folder
print("Train Folder     : A(neg), B(pos)")
print(len(os.listdir('base_dir/train_dir/a_no_tumor')))
print(len(os.listdir('base_dir/train_dir/b_has_tumor')))
print("Validation Folder: A(neg), B(pos)")
print(len(os.listdir('base_dir/val_dir/a_no_tumor')))
print(len(os.listdir('base_dir/val_dir/b_has_tumor')))

## SET UP GENERATORS

In [None]:
# Folders read by the image generators
train_path = 'base_dir/train_dir'
val_path = 'base_dir/val_dir'
test_path = '/content/CancerDetection/input/test'

num_train_samples = len(df_train)
num_val_samples = len(df_val)
num_test_samples = len(os.listdir(test_path))

train_batch_size = 10
val_batch_size = 10

# Number of batches needed to go through every sample once.
# Cast to int: np.ceil returns a float, while Keras documents
# steps_per_epoch / validation_steps as integers.
train_steps = int(np.ceil(num_train_samples / train_batch_size))
val_steps = int(np.ceil(num_val_samples / val_batch_size))

In [None]:
# A single ImageDataGenerator is shared by all three iterators; it multiplies
# every pixel value by 1/255.
datagen = ImageDataGenerator(rescale=1.0 / 255)

# Training batches
train_gen = datagen.flow_from_directory(
    directory=train_path,
    class_mode='categorical',
    batch_size=train_batch_size,
    target_size=(IMAGE_SIZE, IMAGE_SIZE))

# Validation batches
val_gen = datagen.flow_from_directory(
    directory=val_path,
    class_mode='categorical',
    batch_size=val_batch_size,
    target_size=(IMAGE_SIZE, IMAGE_SIZE))

# One image per batch, no labels, fixed file order (shuffle=False).
# NOTE(review): this iterates val_path, not test_path - presumably it is meant
# for predicting on the validation images; confirm before using it as a test set.
test_gen = datagen.flow_from_directory(
    directory=val_path,
    class_mode=None,
    shuffle=False,
    batch_size=1,
    target_size=(IMAGE_SIZE, IMAGE_SIZE))

# Convolutional NN Models

In [None]:
# Architecture hyper-parameters
kernel_size = (3,3)
pool_size = (2,2)
first_filters = 32
second_filters = 64
third_filters = 128

dropout_conv = 0.3
dropout_dense = 0.3

# Three blocks of (3 x Conv2D -> MaxPooling2D -> Dropout), followed by a dense head.
model = Sequential()
# The input shape is built from the notebook constants instead of a hard-coded
# (96, 96, 3), so the model always matches the generators' target_size.
model.add(Conv2D(first_filters, kernel_size, activation = 'relu',
                 input_shape = (IMAGE_SIZE, IMAGE_SIZE, IMAGE_CHANNELS)))
model.add(Conv2D(first_filters, kernel_size, activation = 'relu'))
model.add(Conv2D(first_filters, kernel_size, activation = 'relu'))
model.add(MaxPooling2D(pool_size = pool_size))
model.add(Dropout(dropout_conv))

model.add(Conv2D(second_filters, kernel_size, activation ='relu'))
model.add(Conv2D(second_filters, kernel_size, activation ='relu'))
model.add(Conv2D(second_filters, kernel_size, activation ='relu'))
model.add(MaxPooling2D(pool_size = pool_size))
model.add(Dropout(dropout_conv))

model.add(Conv2D(third_filters, kernel_size, activation ='relu'))
model.add(Conv2D(third_filters, kernel_size, activation ='relu'))
model.add(Conv2D(third_filters, kernel_size, activation ='relu'))
model.add(MaxPooling2D(pool_size = pool_size))
model.add(Dropout(dropout_conv))

# Classification head: two softmax units, one per class folder
model.add(Flatten())
model.add(Dense(256, activation = "relu"))
model.add(Dropout(dropout_dense))
model.add(Dense(2, activation = "softmax"))

model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
#get the labels associated with each index:
print(val_gen.class_indices)

In [None]:
# Directory where model checkpoints are saved.
# os.makedirs(..., exist_ok=True) creates missing parent folders and does not
# fail when the directory already exists, so the cell can be re-run.
model_dir = '/content/CancerDetection/Model/'
os.makedirs(model_dir, exist_ok=True)

In [None]:
filepath = "/content/CancerDetection/Model/model.h5"

# Save the model only when validation accuracy improves.
# The model is compiled with metrics=['accuracy'], which tf.keras (TF 2.x) logs
# as 'val_accuracy'. With the old monitor name 'val_acc' the callback never
# found its metric and therefore never wrote a checkpoint.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1,
                             save_best_only=True, mode='max')

# Multiply the learning rate by 0.1 after 2 epochs without val_loss improvement,
# never going below 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_lr=0.00001)

callbacks_list = [checkpoint, reduce_lr]

# Step counts are passed as ints; they may have been computed with np.ceil (float).
history = model.fit(train_gen,
                    steps_per_epoch=int(train_steps),
                    epochs=3,
                    validation_data=val_gen,
                    validation_steps=int(val_steps),
                    callbacks=callbacks_list)