<a href="https://colab.research.google.com/github/srilamaiti/SM_MIDS_W207_HW/blob/main/w207_cancer_detection_sm_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [87]:
import pandas as pd
import numpy as np
import os
import random
import joblib
import glob
import random
from imutils import rotate as rotate
from itertools import product
import gc
from sklearn.utils import shuffle
import cv2 as cv
import skimage.io as skio

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import matplotlib.patches as patches
from matplotlib.patches import Polygon

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import RandomFlip
from tensorflow.keras.layers import RandomZoom
from tensorflow.keras.layers import RandomRotation
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import AveragePooling2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam
from keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing.image import array_to_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.python.ops.numpy_ops import np_config
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.callbacks import ReduceLROnPlateau
np_config.enable_numpy_behavior()
from fastai.vision import *
from fastai.metrics import error_rate, accuracy

# Required to read the data from Kaggle
from google.colab import drive
drive.mount('/content/gdrive')
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/MyDrive/Kaggle"

import warnings
warnings.simplefilter("ignore", category=DeprecationWarning)

Mounted at /content/gdrive


In [142]:
!kaggle competitions download -c histopathologic-cancer-detection

Downloading histopathologic-cancer-detection.zip to /content
100% 6.30G/6.31G [00:37<00:00, 254MB/s]
100% 6.31G/6.31G [00:37<00:00, 179MB/s]


In [143]:
!unzip -o -qq \*.zip  && rm *.zip

In [90]:
def generate_fully_qualified_file_name_list(file_list):
    """
    This function generates a list of fully qualified file names.
    """
    qualified_file_name_list = [os.path.join(current_working_dir, train_path) + 
                                img + 
                                '.tif' 
                                for img in file_list
                               ]
    return qualified_file_name_list

In [91]:
def get_id_and_label_list(file_path, file_extension):
    """
    This function gets the imgae id and corresponding label.
    """
    file_list = []
    for file_name in glob.glob(file_path + '*' + file_extension):
        file_list.append(file_name)
    return file_list

In [92]:
def split_data(split_indices, df):
    """
    This function splits the input dataframe in train, validation and test set.
    """
    X_train = df[: split_indices[0]]
    X_val = df[split_indices[0] : split_indices[1]]
    X_test = df[split_indices[1]:]
    return X_train, X_val, X_test

In [93]:
CONTRAST_FACTOR = 3
DELTA = 0.3

train_path = 'train/'
test_path = 'test/'

current_working_dir = os.getcwd()

train_label_file = 'train_labels.csv'
test_label_file = 'sample_submission.csv'

image_file_extension = '.tif'

train_files_path = os.path.join(current_working_dir, train_path)
test_files_path = os.path.join(current_working_dir, test_path)

# declare constants for reproduciblity
RANDOM_STATE = 20220922

# Finding split positions
split = (0.30, 0.10, 0.10)

process_chunk_size = 200
zoom_range = (0.4, 0.5)

tf.random.set_seed(1234)
np.random.seed(1234)

In [94]:
#get file names for train and test
train_file_list = []
test_file_list = []

train_file_list = get_id_and_label_list(train_files_path, image_file_extension)
test_file_list = get_id_and_label_list(test_files_path, image_file_extension)

train_label = pd.read_csv(train_label_file)
test_label = pd.read_csv(test_label_file)

print(f"Number of train files : {len(train_file_list)}")
print(f"Number of test files : {len(test_file_list)}")

Number of train files : 220025
Number of test files : 57458


In [95]:
train_positive_image_id_list = list(train_label[train_label.label==1].id)
train_negative_image_id_list = list(train_label[train_label.label==0].id)

train_positive_images_list = generate_fully_qualified_file_name_list(train_positive_image_id_list)
train_negative_images_list = generate_fully_qualified_file_name_list(train_negative_image_id_list)

print(f"Number of positive images in train set: {len(train_positive_images_list)}")
print(f"Number of negative images in train set: {len(train_negative_images_list)}")

Number of positive images in train set: 89117
Number of negative images in train set: 130908


In [96]:
# Shuffling data
shuffled_train_label = shuffle(train_label)
len_shuffled_train_label = len(shuffled_train_label)

# Selecting positive and negative images from the shuffled list
shuffled_train_positive_label_df = shuffled_train_label[shuffled_train_label.label == 1]
shuffled_train_negative_label_df = shuffled_train_label[shuffled_train_label.label == 0]

print(f"Length of shuffled_train_positive_label_df : {len(shuffled_train_positive_label_df)}")
print(f"Length of shuffled_train_negative_label_df : {len(shuffled_train_negative_label_df)}")

Length of shuffled_train_positive_label_df : 89117
Length of shuffled_train_negative_label_df : 130908


In [98]:
split_indices = np.multiply(len(shuffled_train_positive_label_df), split).astype(int)
for idx, val in enumerate(split_indices):
    #print(idx, val)
    if idx > 0:
        split_indices[idx] = sum(split_indices[idx - 1 : idx]) + split_indices[idx]
print("Split indecs : ", split_indices)

Split indecs :  [26735 35646 44557]


In [99]:
# Splitting data at train, validation and test positions 
# for both positive and negative imgaes
# This process will select the list of ids
(X_train_id_positive, 
 X_val_id_positive, 
 X_test_id_positive) = split_data(split_indices = split_indices, 
                                  df = shuffled_train_positive_label_df[:split_indices[-1]]
                                 )
(X_train_id_negative, 
 X_val_id_negative, 
 X_test_id_negative) = split_data(split_indices = split_indices, 
                                  df = shuffled_train_negative_label_df[:split_indices[-1]]
                                 )
 
print(f"Length of X_train_id_positive : {len(X_train_id_positive)}")
print(f"Length of X_train_id_negative : {len(X_train_id_negative)}")

print(f"Length of X_val_id_positive : {len(X_val_id_positive)}")
print(f"Length of X_val_id_negative : {len(X_val_id_negative)}")

print(f"Length of X_test_id_positive : {len(X_test_id_positive)}")
print(f"Length of X_test_id_negative : {len(X_test_id_negative)}")

Length of X_train_id_positive : 26735
Length of X_train_id_negative : 26735
Length of X_val_id_positive : 8911
Length of X_val_id_negative : 8911
Length of X_test_id_positive : 8911
Length of X_test_id_negative : 8911


In [100]:
X_train_img_file_positive = generate_fully_qualified_file_name_list(list(np.concatenate(X_train_id_positive[['id']].values.tolist()).flat))
X_val_img_file_positive = generate_fully_qualified_file_name_list(list(np.concatenate(X_val_id_positive[['id']].values.tolist()).flat))
X_test_img_file_positive = generate_fully_qualified_file_name_list(list(np.concatenate(X_test_id_positive[['id']].values.tolist()).flat))

X_train_img_file_negative = generate_fully_qualified_file_name_list(list(np.concatenate(X_train_id_negative[['id']].values.tolist()).flat))
X_val_img_file_negative = generate_fully_qualified_file_name_list(list(np.concatenate(X_val_id_negative[['id']].values.tolist()).flat))
X_test_img_file_negative = generate_fully_qualified_file_name_list(list(np.concatenate(X_test_id_negative[['id']].values.tolist()).flat))

print(f"Length of X_train_img_file_positive : {len(X_train_img_file_positive)}")
print(f"Length of X_train_img_file_negative : {len(X_train_img_file_negative)}")

print(f"Length of X_val_img_file_positive : {len(X_val_img_file_positive)}")
print(f"Length of X_val_img_file_negative : {len(X_val_img_file_negative)}")

print(f"Length of X_test_img_file_positive : {len(X_test_img_file_positive)}")
print(f"Length of X_test_img_file_negative : {len(X_test_img_file_negative)}")

Length of X_train_img_file_positive : 26735
Length of X_train_img_file_negative : 26735
Length of X_val_img_file_positive : 8911
Length of X_val_img_file_negative : 8911
Length of X_test_img_file_positive : 8911
Length of X_test_img_file_negative : 8911


In [144]:
!rm -rf /content/image_processing 
!mkdir /content/image_processing
!mkdir /content/image_processing/train /content/image_processing/test /content/image_processing/validation
!mkdir /content/image_processing/train/positive /content/image_processing/train/negative
!mkdir /content/image_processing/test/positive /content/image_processing/test/negative
!mkdir /content/image_processing/validation/positive /content/image_processing/validation/negative

In [102]:
image_processing_train_positive_path = '/content/image_processing/train/positive'
image_processing_train_negative_path = '/content/image_processing/train/negative'

image_processing_validation_positive_path = '/content/image_processing/validation/positive'
image_processing_validation_negative_path = '/content/image_processing/validation/negative'

image_processing_test_positive_path = '/content/image_processing/test/positive'
image_processing_test_negative_path = '/content/image_processing/test/negative'

In [145]:
def move_file_from_one_to_other(file_names, dest_path):
    command = 'cp -r ' + file_names + ' ' + dest_path
    #print(command)
    os.system(command)

In [135]:
def process_move_files(file_name_list, dest_path):
    """"
    This function moves the files to a destination directory
    """
    process_chunk_size = 10
    for idx in range(0, len(file_name_list), process_chunk_size):
        #print("a")
        if idx % 1000 == 0:
            print("Processing index: ", idx)
        #print('b')
        file_names = ' '.join(file_name_list[idx : idx + process_chunk_size])
        #print('c')
        move_file_from_one_to_other(file_names, dest_path)

In [146]:
process_move_files(X_train_img_file_positive, image_processing_train_positive_path)
process_move_files(X_train_img_file_negative, image_processing_train_negative_path)

process_move_files(X_val_img_file_positive, image_processing_validation_positive_path)
process_move_files(X_val_img_file_negative, image_processing_validation_negative_path)

process_move_files(X_test_img_file_positive, image_processing_test_positive_path)
process_move_files(X_test_img_file_negative, image_processing_test_negative_path)


Processing index:  0
cp -r /content/train/ebb0606f1a1e0f1a07572b0a521d10a191fcc0ef.tif /content/train/b8a58bc2960ba0929ab001e0ba07d733497b5dfd.tif /content/train/b10599509743e1424dccbc41ab94a4fe504c0c3f.tif /content/train/5018b86a4ebe1d89f9f3fcb62fc8b29f3202e332.tif /content/train/13488702aa082e364fb262f02887748ab68d0663.tif /content/train/e6abdc19d44a62e0e38e52032cfe862c3ce12ee9.tif /content/train/42d7341a99cd7b9d9926c83f113c7cce47198de6.tif /content/train/88c39dcb9f846dbeca9e0f181e60e5b3022f9fef.tif /content/train/92c232250de0f7da088672d480099a6ae8767b6b.tif /content/train/3b67b39dd9010dd429119b8b9524d0cc1c3d076e.tif /content/image_processing/validation/positive
cp -r /content/train/5f6fe139f97cedae98c1058da4884ab81606c4b6.tif /content/train/c8d83008de6125073e122955a9370af711929d88.tif /content/train/b5cd3d69ba341a4da7e220313a0c9100f8ecbc8e.tif /content/train/33db59cb5812f62db0a9270aba77ec6405a4f45c.tif /content/train/cc2814abed3208a7f820303b8f572db967752a75.tif /content/train/2dadb9

In [137]:
!ls -ltr /content/image_processing/train/positive/


total 0


In [108]:
!ls -ltr /content/image_processing/train/negative

total 0


In [109]:
!ls -ltr /content/image_processing/test/positive

total 0


In [149]:
!ls -ltr /content/image_processing/validation/positive|wc -l

8912


In [111]:
!ls -ltr /content/image_processing/test/negative

total 0


In [112]:
!ls -ltr /content/image_processing/validation/negative

total 0
