In [None]:
!pip -q install tensorflow==2.3.0

In [None]:
# Basics / Data manipulation
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import zipfile
import os
import glob
import shutil

# Visualization
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import skimage.io

# ML
from sklearn import model_selection
from sklearn.model_selection import train_test_split


import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import models
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import NASNetMobile
from tensorflow.keras.applications.imagenet_utils import preprocess_input

#Use this to check if the GPU is configured correctly
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

%matplotlib inline

# Data
10k+ `.tiff` images, split as follows:
*    **80%** for training
*    **20%** for internal testing
     *  10% validation
     *  10% testing

# Checking if a TPU is available (falling back to CPU/GPU)

In [None]:
# Choose a tf.distribute strategy: try to attach to a TPU first and fall back
# to MirroredStrategy when the TPU resolver raises ValueError.
# NOTE(review): `strategy` is not referenced again in the cells visible here
# (the model is built outside of `strategy.scope()`) — confirm whether that is intended.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print("Running on TPU ", tpu.cluster_spec().as_dict()["worker"])
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # A ValueError from the block above is treated as "no TPU runtime attached".
    print("Not connected to a TPU runtime. Using CPU/GPU strategy")
    strategy = tf.distribute.MirroredStrategy()

# Model 
The model will have the following configuration:
______________
1st layer: NASNetMobile (224, 224, 3) input images
______________
2nd layer: GlobalMaxPooling2D
______________
3rd layer: Dropout with rate = 0.5 (the learning rate of 2e-5 belongs to the RMSprop optimizer, not to this layer)
______________
4th layer: Dense layer with 2 softmax units that will classify the image (Positive / Negative)

In [None]:
# NASNetMobile backbone: ImageNet weights, no classification top,
# fed with 224x224 three-channel images.
base_model = NASNetMobile(
    weights="imagenet",
    include_top=False,
    input_shape=(224, 224, 3),
)
# Uncomment to inspect the backbone architecture:
# base_model.summary()

In [None]:
# Classification head stacked on top of the NASNetMobile backbone.
model = models.Sequential()
model.add(base_model)
# Fine-tune the whole backbone instead of freezing it.
base_model.trainable = True
# NOTE: the layer is named "gap" although it performs global *max* pooling;
# the name is left as-is so layer names in saved model files do not change.
model.add(layers.GlobalMaxPooling2D(name="gap"))
# Avoid overfitting
model.add(layers.Dropout(rate=0.5))
# Two softmax outputs, one per class folder found by flow_from_directory.
model.add(layers.Dense(2, activation="softmax", name="fc_out"))

model.compile(
    loss="categorical_crossentropy",
    # `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
    optimizer=optimizers.RMSprop(learning_rate=2e-5),
    metrics=["acc"])

model.summary()

### Unzipping The Files
The original images have been transformed into tiled mosaics. Each image_id has 8 mosaic variations; the variations have been grouped into their own separate zips
to work within the Kaggle restrictions of max 5 GB in ./kaggle/working and max 20 GB in ./kaggle/tmp, while training.csv, validation.csv and testing.csv are separate and shared by all variations.

The dataset has been split 90% Training, 7.5% Validation, and 2.5% Internal Testing. If you want to use all 10% of the data for Validation, just merge the appropriate dataframes & zipfile contents. 

Only unzip a single variation at a time! The notebook will fail if you use up all the space in ./kaggle/working and your model will not be saved! If you need to move on to a different variation, delete the old files! You can unzip them again later, no problem.


In [None]:
# Split sizes = number of rows in each of the shared split CSVs.
# The training/validation counts are used later to derive the number of steps per epoch.
NUMBER_OF_TRAINING_IMAGES = pd.read_csv('../input/8-fold-pc-dataset-gen-0-8/training.csv').shape[0]
NUMBER_OF_VALIDATION_IMAGES = pd.read_csv('../input/8-fold-pc-dataset-gen-0-8/validation.csv').shape[0]
NUMBER_OF_TESTING_IMAGES = pd.read_csv('../input/8-fold-pc-dataset-gen-0-8/testing.csv').shape[0]

In [None]:
# Split folders that get re-organised into binary class folders.
#TVT = ["./train", "./validation", "./test"]
TVT = ["./train", "./validation"]
# Index into `Out` / `GLE`: 0 -> positive, 1 -> negative.
OUT = [0, 1]
# Target class sub-folders (same order as `GLE`).
# NOTE(review): `Out` shadows IPython's built-in output-history name; kept so
# that any other cell relying on this list keeps working.
Out = ["/Positive","/Negative"]
# Glob patterns of the source folders: any score without a zero -> positive,
# exactly "0+0" -> negative.
GLE = ["/GLEASON_SCORE_[!0]+[!0]", "/GLEASON_SCORE_0+0"]

def binarize():
    """Collapse the per-Gleason-score folders into Positive/Negative folders.

    For every split folder in ``TVT`` the files found under the folders
    matching ``GLE[i]`` are moved into ``<split>/Positive`` (i = 0) or
    ``<split>/Negative`` (i = 1), and the emptied score folders are removed.
    The class folders are created even when nothing matches.

    Raises:
        OSError: if a matched score folder is not empty after its visible
            files were moved (e.g. it contains hidden files or sub-folders
            with content), because it is removed with ``os.rmdir``.
    """
    for grouping in TVT:
        for outcome in OUT:
            target_dir = grouping + Out[outcome]
            # exist_ok avoids the check-then-create race and makes re-runs safe.
            os.makedirs(target_dir, exist_ok=True)

            pattern = grouping + GLE[outcome]
            for file in glob.glob(pattern + "/*"):
                # Use the real file name instead of a hard-coded path depth.
                os.replace(file, os.path.join(target_dir, os.path.basename(file)))
            for folder in glob.glob(pattern):
                os.rmdir(folder)


In [None]:
### CAUTION ###
# Extract only as many variations as fit into the working directory at once.

# Every published variation, in dataset order: "A" lives in dataset
# "...-gen-1-8-a", "B" in "...-gen-2-8-b", ..., "H" in "...-gen-8-8-h".
ALL_VARIATIONS = ["A", "B", "C", "D", "E", "F", "G", "H"]
# Variations processed in this run (any subset of ALL_VARIATIONS).
variations = ["A"]

def zippity(variant):
    """Extract the train/validation archives of one variation and binarize them.

    The archives are unpacked into the current directory, ``binarize()`` then
    sorts the images into Positive/Negative folders, and finally any remaining
    ``GLEASON_SCORE_?+?`` folders are removed.

    Args:
        variant: variation letter, one of ``ALL_VARIATIONS``.

    Raises:
        ValueError: if ``variant`` is not a known variation letter.
        OSError: if a left-over ``GLEASON_SCORE_?+?`` folder is not empty.
    """
    print(f'Variation {variant}')

    # The dataset number depends on the letter itself, not on its position in
    # the (possibly shortened) `variations` list of the current run.
    dataset_number = ALL_VARIATIONS.index(variant) + 1
    dataset_dir = f'../input/8-fold-pc-dataset-gen-{dataset_number}-8-{variant.lower()}'

    # The test split is not extracted here.
    for split in ("train", "validation"):
        with zipfile.ZipFile(f'{dataset_dir}/{split}{variant}.zip', 'r') as z:
            z.extractall(".")

    binarize()

    # Remove score folders that binarize() left behind.
    for path in glob.glob("./*/GLEASON_SCORE_?+?/"):
        os.rmdir(path)

In [None]:
def zappity():
    """Delete the extracted ``train`` and ``validation`` folders.

    Frees the working directory so that the next variation can be extracted.
    Folders that do not exist are skipped silently, and deletion errors do not
    raise (the shell `rm -r` this replaces did not raise either).
    """
    # Deleting image folders to avoid over-saturating the output.
    # The "test" folder is never extracted, so it is not removed here.
    for folder in ("train", "validation"):
        shutil.rmtree(folder, ignore_errors=True)

### Additional Data Augmentation

In [None]:
# Augmenting image generator: random shifts, shear and zoom, with nearest-pixel fill.
# Pixel normalisation is done only by NASNet's `preprocess_input`.
# Do not add `rescale=1/255` as well: ImageDataGenerator applies
# `preprocessing_function` first and `rescale` afterwards, so the two together
# would squash the already-normalised inputs into a tiny range around zero.
image_gen = ImageDataGenerator(
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    fill_mode="nearest",
    preprocessing_function=tf.keras.applications.nasnet.preprocess_input)

In [None]:
#sample = plt.imread("../input/panda2/train_images/0005f7aaab2800f6170c399693a96917.png")

#plt.imshow(image_gen.random_transform(sample))

In [None]:
batch_size = 32

def which_image_gen(which):
    """Build the directory iterator for one data split.

    Args:
        which: ``"train"`` (reads ``./train`` through the augmenting
            ``image_gen``) or ``"valid"`` (reads ``./validation``).

    Returns:
        The iterator returned by ``flow_from_directory``: 224x224 images,
        ``batch_size`` images per batch, categorical labels.

    Raises:
        ValueError: if ``which`` is not a supported split name.
    """
    directories = {"train": "./train", "valid": "./validation"}
    if which not in directories:
        # Previously an unknown name fell through to an UnboundLocalError.
        raise ValueError(
            f"Unknown split {which!r}; expected one of {sorted(directories)}")

    if which == "train":
        generator = image_gen
    else:
        # Validation images must not be randomly shifted/sheared/zoomed, but
        # they must be normalised exactly like the training images.
        generator = ImageDataGenerator(
            rescale=image_gen.rescale,
            preprocessing_function=image_gen.preprocessing_function)

    return generator.flow_from_directory(directories[which],
                                         target_size=(224, 224),
                                         batch_size=batch_size,
                                         class_mode="categorical")


In [None]:
# Train on each selected variation in turn: extract -> fit -> save -> clean up.
# NOTE: `results` is overwritten on every iteration, so after the loop it only
# holds the training history of the last variation.
for variety in variations:
    zippity(variety)

    train_image_gen = which_image_gen("train")
    validation_image_gen = which_image_gen("valid")

    results = model.fit(
        train_image_gen,
        steps_per_epoch=NUMBER_OF_TRAINING_IMAGES // batch_size,
        epochs=50,
        validation_data=validation_image_gen,
        validation_steps=NUMBER_OF_VALIDATION_IMAGES // batch_size,
        # Documented values are 0, 1 and 2; 2 prints one summary line per epoch.
        verbose=2,
        use_multiprocessing=True,
        workers=4)

    # Save the model after every variation (the same file is overwritten each time).
    model.save("./NASNetMobile-model.h5")

    # Free the working directory before the next variation is extracted.
    zappity()


In [None]:
def plot_hist_acc(hist):
    """Plot training and validation accuracy per epoch.

    Args:
        hist: object whose ``history`` mapping holds the ``"acc"`` and
            ``"val_acc"`` series (e.g. what ``model.fit`` returned).
    """
    curves = hist.history
    for key in ("acc", "val_acc"):
        plt.plot(curves[key])
    plt.title("Model Accuracy")
    plt.ylabel("Accuracy")
    plt.xlabel("Epoch")
    plt.legend(["Accuracy", "Validation Accuracy"], loc="upper left")
    plt.show()

In [None]:
def plot_hist_loss(hist):
    """Plot training and validation loss per epoch.

    Args:
        hist: object whose ``history`` mapping holds the ``"loss"`` and
            ``"val_loss"`` series (e.g. what ``model.fit`` returned).
    """
    curves = hist.history
    for key in ("loss", "val_loss"):
        plt.plot(curves[key])
    plt.title("Model Loss")
    plt.ylabel("Errors")
    plt.xlabel("Epoch")
    plt.legend(["Loss", "Validation Loss"], loc="upper left")
    plt.show()

In [None]:
# Save the trained model to an HDF5 file.
# NOTE(review): `model.save` stores the model itself, not only its weights
# (see the Keras docs), and the training loop already writes this same file
# after every variation — confirm this extra save is still wanted.
model.save("./NASNetMobile-model.h5")

In [None]:
# Per-epoch metrics of the last `model.fit` call as a table, with 1-based epoch numbers.
results_df = pd.DataFrame({
    "epoch": list(range(1, len(results.history["acc"]) + 1)),
    **{metric: results.history[metric]
       for metric in ("acc", "val_acc", "loss", "val_loss")},
})
results_df

In [None]:
# Accuracy curves (training vs. validation) from the history held in `results`.
plot_hist_acc(results)

In [None]:
# Loss curves (training vs. validation) from the history held in `results`.
plot_hist_loss(results)