<a href="https://colab.research.google.com/github/srilamaiti/ml_works/blob/main/w207_hector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 207 - Histopathological cancer detection

Hector Rincon

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import os
import random
import glob
import random
import gc
import skimage.io as skio
import subprocess
import shutil

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import matplotlib.patches as patches
from matplotlib.patches import Polygon
from mlxtend.plotting import plot_confusion_matrix

import tensorflow as tf

# Keras
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import array_to_img, img_to_array, load_img
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger

# SKLearn
from skimage.color import gray2rgb
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix

# Configuration
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

In [None]:
%cd /datadrive/data/

/datadrive/data


In [None]:
# Image geometry and batching shared by the generators and models below
IMG_SIZE, BATCH_SIZE = 96, 32

# Paths are relative to the working directory selected by the %cd cell above
image_dir = 'raw/train'  # source folder of the original .tif files
train_dir, validation_dir, test_dir = (
    'processed/train',
    'processed/validation',
    'processed/test',
)

## Create source of truth

In [None]:
# Load the label table from the raw data folder (its `id` and `label` columns are used below)
labels_csv = 'raw/train_labels.csv'
train_label = pd.read_csv(labels_csv)

In [None]:
# Balance the two classes by undersampling: keep every positive example and draw
# the same number of negatives at random.
# NOTE(review): this assumes label 1 is the smaller class; `sample` raises if there
# are fewer negatives than positives.
positive_examples = train_label[train_label.label == 1]
positive_example_count = len(positive_examples)

# A fixed seed keeps the balanced set identical across kernel restarts. Without it,
# re-running this cell picks different negatives, and prep_directories() copies into
# the existing processed/ tree without clearing it, so one image could end up in
# more than one partition.
negative_examples = train_label[train_label.label == 0].sample(
    n=positive_example_count,
    random_state=0,
)

# New source of truth: positives first, then the sampled negatives, re-indexed 0..N-1
df = pd.concat([positive_examples, negative_examples], ignore_index=True)

In [None]:
# Sanity check: both classes should now have the same number of rows
df['label'].value_counts()

1    89117
0    89117
Name: label, dtype: int64

## Test/train split

In [None]:
# Stratified three-way split of the balanced frame `df`: 60% train / 20% validation / 20% test
from sklearn.model_selection import StratifiedShuffleSplit

# First cut: hold out 20% of all rows as the test partition
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
xtrain, xtest = next(sss.split(df.id, df.label))
df_train, df_test = df.iloc[xtrain], df.iloc[xtest]

# Second cut: 25% of the remaining 80% (20% of the total) becomes the validation partition
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=0)
xtrain_train, xtrain_val = next(sss2.split(df_train.id, df_train.label))

# The right-hand side is evaluated before assignment, so both slices use the 80% frame
df_val, df_train = df_train.iloc[xtrain_val], df_train.iloc[xtrain_train]

In [None]:
# Class balance of each partition; the final expression is rendered as the cell output
for partition in (df_train, df_test):
    print(partition.label.value_counts())
df_val.label.value_counts()

0    53470
1    53470
Name: label, dtype: int64
0    17824
1    17823
Name: label, dtype: int64


1    17824
0    17823
Name: label, dtype: int64

## Set up directory structure

This step copies every selected image into `processed/<partition>/<label>/`, so it only needs to be run once. Make sure you have enough disk space for the copies.

In [None]:
def prep_directories(train_df, val_df, test_df, source_dir=None, root_dir='processed'):
    """Copy each image into a ``<root_dir>/<partition>/<label>/`` directory tree.

    Creates ``train``, ``validation`` and ``test`` under ``root_dir``, each with a
    ``0`` and a ``1`` sub-directory, then copies ``<source_dir>/<id>.tif`` to
    ``<root_dir>/<partition>/<label>/<id>.tif`` for every row of the matching frame.

    Args:
        train_df: Frame with ``id`` and ``label`` columns for the train partition.
        val_df: Same, for the validation partition.
        test_df: Same, for the test partition.
        source_dir: Folder holding the original ``.tif`` files. Defaults to the
            notebook-level ``image_dir``.
        root_dir: Folder under which the partition tree is created.

    Raises:
        KeyError: If a frame lacks an ``id`` or ``label`` column.
        FileNotFoundError: If a source image is missing, or a label has no
            pre-created directory (only ``0`` and ``1`` are created).
    """
    # Resolve the global lazily so callers that pass source_dir do not depend on it
    src_root = image_dir if source_dir is None else source_dir
    partitions = {'train': train_df, 'validation': val_df, 'test': test_df}

    # Create the full tree up front; exist_ok makes a repeat call harmless here
    for partition in partitions:
        for c in ('0', '1'):
            os.makedirs(f'{root_dir}/{partition}/{c}', exist_ok=True)

    # A plain loop replaces DataFrame.apply(axis=1), which was used only for its side effects
    for partition, frame in partitions.items():
        for label, imgid in zip(frame['label'], frame['id']):
            fullpath = f'{src_root}/{imgid}.tif'
            newpath = f'{root_dir}/{partition}/{label}/{imgid}.tif'
            shutil.copyfile(fullpath, newpath)

In [None]:
# Copies the images of all three partitions into the processed/<partition>/<label>/ tree
prep_directories(df_train, df_val, df_test)

In [None]:
# Reset Keras' global state before building the models below
tf.keras.backend.clear_session()
# NOTE(review): this makes tf.function-decorated code run eagerly from here on. It is
# normally a debugging aid and presumably slows the fits below — confirm it is still needed.
tf.config.run_functions_eagerly(True)

# Test image data generator

In [None]:
def rotate_img(image):
    """Rotate `image` by a randomly chosen multiple of 90 degrees.

    The number of quarter turns is drawn from {-1, 0, 1, 2} using NumPy's global
    random state, and the rotation is applied over the first two axes.
    """
    quarter_turns = np.random.choice([-1, 0, 1, 2])
    return np.rot90(image, k=quarter_turns)

In [None]:
# Training generator: rescale pixel values by 1/255 and apply random augmentation
# (zoom, flips, brightness shifts, and quarter-turn rotation through rotate_img)
augmentation_options = dict(
    rescale=1./255,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    preprocessing_function=rotate_img,
    brightness_range=[0.4, 1.2],
)
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**augmentation_options)

# Validation generator: rescaling only, no augmentation
val_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

Found 20 images belonging to 2 classes.


# CNN model v2

## Build the model

In [None]:
# CNN for single-channel images: three Conv2D/MaxPooling2D/Dropout stages, three
# ReLU dense layers, and one output unit with no activation (a raw logit).
input_shape = (IMG_SIZE, IMG_SIZE, 1)

def _conv_stage(filters, **conv_options):
    """Return one Conv2D -> MaxPooling2D -> Dropout stage as a list of layers."""
    return [
        tf.keras.layers.Conv2D(filters, (3,3), padding='same', activation=tf.nn.relu, **conv_options),
        tf.keras.layers.MaxPooling2D((2, 2), strides=2),
        tf.keras.layers.Dropout(.1),
    ]

cnn_2_layers = [
    # Only the first layer declares the input shape
    *_conv_stage(16, input_shape=input_shape),
    *_conv_stage(32),
    *_conv_stage(64),

    tf.keras.layers.Flatten(),

    *[tf.keras.layers.Dense(units, activation=tf.nn.relu) for units in (512, 256, 128)],

    tf.keras.layers.Dense(1, activation=None),
]
model_cnn_2 = tf.keras.Sequential(cnn_2_layers)

2022-11-13 22:20:40.873185: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-11-13 22:20:40.873220: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-11-13 22:20:40.873239: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (w207): /proc/driver/nvidia/version does not exist
2022-11-13 22:20:40.873468: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# The final Dense layer has no activation, so the model outputs logits and the loss
# uses from_logits=True. The metric must follow suit: the string 'accuracy' resolves
# to binary accuracy with a 0.5 cut-off, which on a logit means p > ~0.62. A
# threshold of 0.0 on the logit is the p = 0.5 decision boundary. The metric keeps
# the name 'accuracy' so history keys and CSV log columns are unchanged.
model_cnn_2.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')],
)

In [None]:
# Directory iterators for the grayscale CNN; both partitions share the same settings
grayscale_flow_options = dict(
    target_size=(IMG_SIZE, IMG_SIZE),
    class_mode='binary',
    batch_size=BATCH_SIZE,
    color_mode='grayscale',
)
train_datagen_flow = train_datagen.flow_from_directory(train_dir, **grayscale_flow_options)
val_datagen_flow = val_datagen.flow_from_directory(validation_dir, **grayscale_flow_options)

Found 106940 images belonging to 2 classes.
Found 35647 images belonging to 2 classes.


In [None]:
# Train for 10 epochs over the full iterators, logging per-epoch metrics to a ';'-separated CSV
csv_logger_cnn = CSVLogger('training_logs.csv', append=False, separator=';')
history_model_cnn_2 = model_cnn_2.fit(
    train_datagen_flow,
    validation_data=val_datagen_flow,
    epochs=10,
    steps_per_epoch=len(train_datagen_flow),
    validation_steps=len(val_datagen_flow),
    callbacks=[csv_logger_cnn],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# DenseNet201

In [None]:
# Directory iterators that yield 3-channel images, as required by the pretrained backbone
rgb_flow_options = dict(
    target_size=(IMG_SIZE, IMG_SIZE),
    class_mode='binary',
    batch_size=BATCH_SIZE,
    color_mode='rgb',
)
train_datagen_flow_color = train_datagen.flow_from_directory(train_dir, **rgb_flow_options)
val_datagen_flow_color = val_datagen.flow_from_directory(validation_dir, **rgb_flow_options)

Found 106940 images belonging to 2 classes.
Found 35647 images belonging to 2 classes.


In [None]:
# ImageNet-pretrained DenseNet201 without its classification top, wrapped in a
# Sequential so the whole backbone can be frozen with a single flag
densenet_backbone = tf.keras.applications.densenet.DenseNet201(
    include_top=False,
    weights='imagenet',
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
)
base_densenet_model = Sequential([densenet_backbone])
base_densenet_model.trainable = False

# Trainable head: pool the feature map, regularise, and emit one logit
classifier_head = [
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=1),
]
densenet_model = Sequential([base_densenet_model, *classifier_head])

densenet_model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet201_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential_1 (Sequential)   (None, 3, 3, 1920)        18321984  
                                                                 
 global_average_pooling2d (G  (None, 1920)             0         
 lobalAveragePooling2D)                                          
                                                                 
 dropout_3 (Dropout)         (None, 1920)              0         
                                                                 
 dense_4 (Dense)             (None, 1)                 1921      
                                                                 
Total params: 18,323,905
Trainable params: 1,921
Non-trainable params: 18,321,984
______________________

In [None]:
# The head is Dense(units=1) with no activation, so the model outputs logits and the
# loss uses from_logits=True. The string 'accuracy' resolves to binary accuracy with
# a 0.5 cut-off, which is wrong on a logit; a threshold of 0.0 is the p = 0.5
# decision boundary. The metric keeps the name 'accuracy' so history keys and CSV
# log columns are unchanged.
densenet_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')],
)

In [None]:
# Adapted from
# https://stackoverflow.com/questions/54527760/using-tensorflow-how-do-i-find-the-time-taken-for-an-epoch-during-fitting
import time
class timecallback(tf.keras.callbacks.Callback):
    """Record cumulative wall-clock training time per epoch and plot it when training ends.

    Attributes:
        times: List of ``(epoch, seconds_since_training_started)`` tuples, one per
            finished epoch of the most recent training run.
        timetaken: Clock reading used as the zero point for ``times``.
    """
    def __init__(self):
        # Let the Keras base class initialise its own attributes
        super().__init__()
        self.times = []
        # perf_counter() measures elapsed wall-clock time. process_time() measures
        # CPU time of the process, which is not what the plot's axis label describes.
        self.timetaken = time.perf_counter()
    def on_train_begin(self, logs=None):
        # Restart the clock when training starts, so time spent between creating the
        # callback and calling fit() is not counted
        self.times = []
        self.timetaken = time.perf_counter()
    def on_epoch_end(self, epoch, logs=None):
        self.times.append((epoch, time.perf_counter() - self.timetaken))
    def on_train_end(self, logs=None):
        plt.xlabel('Epoch')
        plt.ylabel('Total time taken until an epoch in seconds')
        # Unzip [(epoch, seconds), ...] into an x-series and a y-series
        plt.plot(*zip(*self.times))
        plt.show()

In [None]:
# Train the DenseNet head for 10 epochs, timing each epoch and logging metrics to CSV
time_callback = timecallback()
densenet_callbacks = [
    time_callback,
    CSVLogger('training_logs_densenet.csv', append=False, separator=','),
]
history_densetmodel = densenet_model.fit(
    train_datagen_flow_color,
    validation_data=val_datagen_flow_color,
    epochs=10,
    steps_per_epoch=len(train_datagen_flow_color),
    validation_steps=len(val_datagen_flow_color),
    callbacks=densenet_callbacks,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 5/10

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 9/10
 543/3342 [===>..........................] - ETA: 19:33 - loss: 0.4040 - accuracy: 0.8155

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



# ResNet50

## Define the model

In [None]:
# Start from a clean Keras state and fix the seeds used for weight initialisation
tf.keras.backend.clear_session()
tf.random.set_seed(1234)
np.random.seed(1234)

# The pretrained backbone needs 3-channel input. A dedicated shape is defined here
# because the `input_shape` left over from the CNN section is (IMG_SIZE, IMG_SIZE, 1),
# which does not match the backbone.
resnet_input_shape = (IMG_SIZE, IMG_SIZE, 3)

rnet50 = tf.keras.applications.resnet50.ResNet50(
    include_top=False,
    weights='imagenet',
    input_tensor=None,
    input_shape=resnet_input_shape
)

inputs = tf.keras.Input(resnet_input_shape)
# GlobalAveragePooling2D is referenced through tf.keras.layers; the bare name is
# never imported in this notebook and would raise NameError
m2 = tf.keras.layers.GlobalAveragePooling2D()(rnet50(inputs))
m2 = tf.keras.layers.Dropout(.1)(m2)
# Kept from the original; the pooled tensor is already flat at this point
m2 = tf.keras.layers.Flatten()(m2)
# Single output unit with no activation: the model outputs a logit
m2 = tf.keras.layers.Dense(1, activation= None)(m2)

resnetmodel = tf.keras.Model(inputs=inputs, outputs=m2)

## Compilation

In [None]:
# The output layer is Dense(1, activation=None), so the model outputs logits and the
# loss uses from_logits=True. The string 'accuracy' resolves to binary accuracy with
# a 0.5 cut-off, which is wrong on a logit; a threshold of 0.0 is the p = 0.5
# decision boundary. The metric keeps the name 'accuracy' so history keys are unchanged.
resnetmodel.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')],
)

## Fit the model

In [None]:
# Train on the RGB directory iterators used for the DenseNet model. The original call
# used X_train / y_train / X_val / y_val, which no earlier cell of this notebook
# defines, so it fails on a fresh kernel.
# NOTE(review): the original repeated single-channel arrays to 3 channels. If those
# in-memory arrays are the intended input, restore the cell that builds them instead.
history_resnetmodel = resnetmodel.fit(
    train_datagen_flow_color,
    epochs=10,
    validation_data=val_datagen_flow_color,
)

## Save model

In [None]:
# Write the trained model, including optimizer state, replacing any earlier export at this path
resnetmodel.save(
    '/home/hector/resnest_model.tf',
    save_format='tf',
    include_optimizer=True,
    overwrite=True,
)