In [None]:
!pip -q install tensorflow==2.3.0

In [None]:
# Basics / Data manipulation
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import zipfile
import os

# Visualization
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import skimage.io

# ML
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

%matplotlib inline

# Data
The dataset contains more than 10,000 `.tiff` images, split as follows:
*    **80%** for training
*    **20%** held out for internal evaluation
     *  10% validation
     *  10% testing

In [None]:
# Folder paths
TRAIN = '../input/prostate-cancer-grade-assessment/train_images'
MASKS = '../input/prostate-cancer-grade-assessment/train_label_masks'
OUT_TRAIN = './train.zip'
OUT_VALIDATION = './validation.zip'
OUT_TEST = './test.zip'
OUT_MASKS_TRAIN = './masks_train.zip'
OUT_MASKS_VALIDATION = './masks_validation.zip'
OUT_MASKS_TEST = './masks_test.zip'

BASE_FOLDER = "/kaggle/input/prostate-cancer-grade-assessment/"
!ls {BASE_FOLDER}
BASE_FOLDER2 ="/kaggle/input/panda-tiles/"
!ls {BASE_FOLDER2}

In [None]:
# Load the tables shipped in BASE_FOLDER (BASE_FOLDER already ends with "/")
train = pd.read_csv(f"{BASE_FOLDER}train.csv")
test = pd.read_csv(f"{BASE_FOLDER}test.csv")
sub = pd.read_csv(f"{BASE_FOLDER}sample_submission.csv")
train.head()

In [None]:
# Rows whose gleason_score is the literal string "negative"
train.loc[train['gleason_score'] == 'negative']

In [None]:
# Remove one mislabeled row and rewrite the "negative" label as "0+0" so that
# every gleason_score follows the same "a+b" format.
# errors="ignore" plus plain reassignment keep this cell safe to re-run: the
# original in-place drop raised KeyError once the row was already gone.
MISLABELED_ROW = 7273  # index label of the mislabeled row in train.csv
train = train.drop(index=[MISLABELED_ROW], errors="ignore")
train['gleason_score'] = train['gleason_score'].apply(
    lambda score: "0+0" if score == "negative" else score
)

In [None]:
# Work on a plain array copy of the table; column 3 holds the target (gleason_score)
GLEASON_COLUMN = 3
data = np.array(train)
labels = data[:, GLEASON_COLUMN]
labels

In [None]:
# The first three columns are the features of interest (ID, provider, ISUP grade)
N_FEATURE_COLUMNS = 3
features = data[:, 0:N_FEATURE_COLUMNS]
features

In [None]:
# Conventional names for the split below: X = features, y = labels
X, y = features, labels

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Keep 80% for training; the remaining 20% ("check") is split again below
X_train, X_check, y_train, y_check = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Halve the held-out part into validation and test
X_validation, X_test, y_validation, y_test = train_test_split(
    X_check,
    y_check,
    test_size=0.5,
    random_state=84,
)

## Train-Validation-Test

In [None]:
def _labelled_frame(split_features, split_labels):
    """Return a DataFrame with the three feature columns plus the gleason_score target."""
    frame = pd.DataFrame(split_features, columns=["image_id", "data_provider", "isup_grade"])
    frame["gleason_score"] = split_labels
    return frame


X_train = _labelled_frame(X_train, y_train)

X_validation = _labelled_frame(X_validation, y_validation)

X_test = _labelled_frame(X_test, y_test)

In [None]:
# Number of training images per Gleason score, most frequent first
train_eda = (
    X_train
    .groupby("gleason_score")["image_id"]
    .count()
    .reset_index()
    .sort_values(by="image_id", ascending=False)
)
train_eda.style.background_gradient(cmap="Greens")

In [None]:
# Number of validation images per Gleason score, most frequent first
validation_eda = (
    X_validation
    .groupby("gleason_score")["image_id"]
    .count()
    .reset_index()
    .sort_values(by="image_id", ascending=False)
)
validation_eda.style.background_gradient(cmap="Reds")

In [None]:
# Number of test images per Gleason score, most frequent first
test_eda = (
    X_test
    .groupby("gleason_score")["image_id"]
    .count()
    .reset_index()
    .sort_values(by="image_id", ascending=False)
)
test_eda.style.background_gradient(cmap="Blues")

In [None]:
import plotly.graph_objects as go

# Grouped bars: images per Gleason score in each split.
# Each trace name now matches the frame it plots (the original labelled the
# training counts "Test" and the test counts "Train").
fig = go.Figure(data=[
    go.Bar(name="Train", x=train_eda["gleason_score"], y=train_eda["image_id"]),
    go.Bar(name="Validation", x=validation_eda["gleason_score"], y=validation_eda["image_id"]),
    go.Bar(name="Test", x=test_eda["gleason_score"], y=test_eda["image_id"]),
])

# Change the bar mode
fig.update_layout(barmode='group')
fig.show()

In [None]:
import plotly.express as px

In [None]:
# Class balance of the training split
df = train_eda
fig = px.pie(data_frame=df, names='gleason_score', values='image_id', title='Training Images')

fig.show()

In [None]:
# Class balance of the validation split
df = validation_eda
fig = px.pie(data_frame=df, names='gleason_score', values='image_id', title='Validation Images')
fig.show()

In [None]:
# Class balance of the test split
df = test_eda
fig = px.pie(data_frame=df, names='gleason_score', values='image_id', title='Testing Images')
fig.show()

In [None]:
# Share of each split in the whole dataset.
# `split_names` is used instead of `labels` so the Gleason label array built
# earlier is not overwritten (re-running `y = labels` would otherwise pick up
# this tuple).
split_names = 'Training Images', 'Validation Images', 'Testing Images'
split_sizes = [len(X_train), len(X_validation), len(X_test)]
total = len(X)

fig, ax = plt.subplots(figsize=(30, 7))

ax.pie(split_sizes, labels=split_names, autopct='%1.1f%%',
       shadow=True, startangle=60)
ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
# Percentages are rounded to one decimal so the title stays readable.
ax.set_title(
    "Distribution of the dataset\n"
    f" Total Images - {100.0:.1f}%: {total}\n"
    f" Training images - {len(X_train) / total * 100:.1f}%: {len(X_train)}\n"
    f" Validation images - {len(X_validation) / total * 100:.1f}%: {len(X_validation)}\n"
    f" Testing images - {len(X_test) / total * 100:.1f}%: {len(X_test)} \n ",
    weight="bold",
)
plt.show()

In [None]:
# Persist the three splits as CSV files (same four columns for every split)
SPLIT_COLUMNS = ["image_id", "data_provider", "isup_grade", "gleason_score"]
X_train = pd.DataFrame(X_train, columns=SPLIT_COLUMNS)
X_validation = pd.DataFrame(X_validation, columns=SPLIT_COLUMNS)
X_test = pd.DataFrame(X_test, columns=SPLIT_COLUMNS)

for split_frame, csv_path in (
    (X_train, "./training.csv"),
    (X_validation, "./validation.csv"),
    (X_test, "./testing.csv"),
):
    split_frame.to_csv(csv_path)

In [None]:
SIZE_IMG = 112
N = 16


def tile(img, mask):
    """Cut an image and its mask into square tiles and keep the N darkest ones.

    Both arrays are padded (image with 255, mask with 0) so that their first
    two dimensions become multiples of SIZE_IMG, then split into
    SIZE_IMG x SIZE_IMG x 3 tiles. If fewer than N tiles exist, blank tiles
    are appended. Tiles are ranked by the sum of their image values, lowest
    first, and the first N are kept; the mask tiles follow the same selection.

    Args:
        img: array of shape (height, width, 3).
        mask: array reshaped the same way as ``img``; presumably it has the
            same shape as ``img`` - TODO confirm against the mask files.

    Returns:
        A list of dicts with keys 'img', 'mask' and 'idx' (rank of the tile).
    """
    height, width = img.shape[0], img.shape[1]
    extra_rows = (SIZE_IMG - height % SIZE_IMG) % SIZE_IMG
    extra_cols = (SIZE_IMG - width % SIZE_IMG) % SIZE_IMG
    # Split the padding evenly between both sides of each axis
    border = [
        [extra_rows // 2, extra_rows - extra_rows // 2],
        [extra_cols // 2, extra_cols - extra_cols // 2],
        [0, 0],
    ]
    img = np.pad(img, border, constant_values=255)
    mask = np.pad(mask, border, constant_values=0)

    def _to_tiles(arr):
        # (rows, SIZE, cols, SIZE, 3) -> (rows * cols, SIZE, SIZE, 3)
        grid = arr.reshape(arr.shape[0] // SIZE_IMG, SIZE_IMG,
                           arr.shape[1] // SIZE_IMG, SIZE_IMG, 3)
        return grid.transpose(0, 2, 1, 3, 4).reshape(-1, SIZE_IMG, SIZE_IMG, 3)

    img = _to_tiles(img)
    mask = _to_tiles(mask)

    missing = N - len(img)
    if missing > 0:
        filler = [[0, missing], [0, 0], [0, 0], [0, 0]]
        mask = np.pad(mask, filler, constant_values=0)
        img = np.pad(img, filler, constant_values=255)

    # Lowest pixel sum first
    keep = np.argsort(img.reshape(img.shape[0], -1).sum(-1))[:N]
    img = img[keep]
    mask = mask[keep]
    return [{'img': img[i], 'mask': mask[i], 'idx': i} for i in range(len(img))]

In [None]:
import openslide
# Quick look at one slide. The context manager closes the OpenSlide handle as
# soon as the thumbnail has been rendered; the original never closed it.
with openslide.OpenSlide('/kaggle/input/prostate-cancer-grade-assessment/train_images/2fd1c7dc4a0f3a546a59717d8e9d28c3.tiff') as slide:
    display(slide.get_thumbnail(size=(512,512)))


## Image Preview


In [None]:
# Reload the saved splits. All three are loaded here because later cells read
# validation_dataset and test_dataset; the original only loaded the training
# split, which left those two names undefined.
split_columns = ["image_id", "data_provider", "isup_grade", "gleason_score"]
train_dataset = pd.read_csv("./training.csv", usecols=split_columns)
validation_dataset = pd.read_csv("./validation.csv", usecols=split_columns)
test_dataset = pd.read_csv("./testing.csv", usecols=split_columns)

f, ax = plt.subplots(4,4, figsize=(10, 10))

# Mapping to the original dataset
# ../input/prostate-cancer-grade-assessment/train_images/0005f7aaab2800f6170c399693a96917.tiff
# ../input/prostate-cancer-grade-assessment/train_label_masks/0005f7aaab2800f6170c399693a96917_mask.tiff
preview_id = "0005f7aaab2800f6170c399693a96917"
# Index 1 selects the second entry of each multi-image TIFF
img = skimage.io.MultiImage(os.path.join(TRAIN, preview_id + '.tiff'))[1]
mask = skimage.io.MultiImage(os.path.join(MASKS, preview_id + '_mask.tiff'))[1]
tiles = tile(img, mask)
# Show the selected tiles on the 4x4 grid, row by row
for position, entry in enumerate(tiles):
    cell = ax[position // 4, position % 4]
    cell.imshow(entry["img"])
    cell.axis('off')

## Concatenate Images - 16:1
The function ```mosaic()``` arranges the 16 tiles of a slide on a 4x4 grid and ```concat_tile()``` joins them into one single image, which will be saved later on.

In [None]:
# One example ID per split (row 163 of each saved split).
# The validation and test IDs are read from their own CSV files: the original
# took both from `validation_dataset`, so `id_test` never pointed at a test image.
id_train = train_dataset["image_id"][163]
id_validation = pd.read_csv("./validation.csv", usecols=["image_id"])["image_id"][163]
id_test = pd.read_csv("./testing.csv", usecols=["image_id"])["image_id"][163]
# Testing the function
def concat_tile(im_list_2d):
    """Join a 2-D grid of images into a single image.

    Each inner list of ``im_list_2d`` is concatenated horizontally with
    ``cv2.hconcat``; the resulting rows are then stacked with ``cv2.vconcat``.
    """
    rows = []
    for im_list_h in im_list_2d:
        rows.append(cv2.hconcat(im_list_h))
    return cv2.vconcat(rows)

def mosaic(tiles):
    """Merge the first 16 tiles into one image laid out as a 4x4 grid.

    Args:
        tiles: sequence of dicts with an "img" entry, as returned by ``tile``;
            entries 0-15 are used, filling the grid row by row.

    Returns:
        The image produced by ``concat_tile`` for the 4x4 grid.
    """
    side = 4
    grid = [
        [tiles[row * side + col]["img"] for col in range(side)]
        for row in range(side)
    ]
    return concat_tile(grid)

# Build and display the mosaic of the example training slide
slide_path = os.path.join(TRAIN, f"{id_train}.tiff")
mask_path = os.path.join(MASKS, f"{id_train}_mask.tiff")
img = skimage.io.MultiImage(slide_path)[1]
mask = skimage.io.MultiImage(mask_path)[1]
tiles = tile(img, mask)

mosaic_img = mosaic(tiles)
plt.title(f"ID: {id_train}")
plt.imshow(mosaic_img)

# Generating the Dataset

* Iterate through the train, validation and test datasets
    * Map each image ID of every dataset to its file in the base folder
    * Concatenate the 16 sub-images of each slide into a single mosaic
    * Save each mosaic in the zip archive, under its corresponding GLEASON_SCORE folder

In [None]:
# Image IDs of each split
train_IDs = train_dataset.loc[:, "image_id"]
validation_IDs = validation_dataset.loc[:, "image_id"]
test_IDs = test_dataset.loc[:, "image_id"]

# IDs that could not be processed; generate_dataset() appends to these lists
not_found_train, not_found_validation, not_found_test = [], [], []

def _encode_mosaic_png(image_id):
    """Return the PNG-encoded 4x4 tile mosaic of one slide.

    Reads entry 1 of the slide TIFF in TRAIN and of its mask TIFF in MASKS,
    tiles them with ``tile`` and merges the tiles with ``mosaic``.
    """
    img = skimage.io.MultiImage(os.path.join(TRAIN, image_id + '.tiff'))[1]
    mask = skimage.io.MultiImage(os.path.join(MASKS, image_id + '_mask.tiff'))[1]
    mosaic_img = mosaic(tile(img, mask))
    # The mosaic is treated as RGB here, while cv2 encoders expect BGR order.
    return cv2.imencode('.png', cv2.cvtColor(mosaic_img, cv2.COLOR_RGB2BGR))[1]


def generate_dataset(ids, dataset_type):
    """Write one mosaic PNG per image ID into the zip archive of a split.

    Every image is stored as ``<split>/GLEASON_SCORE_<score>/<image_id>.png``.
    The score is looked up by image ID in the split's DataFrame, so the result
    no longer depends on ``ids`` being in the same order as that frame.

    Images that cannot be loaded, tiled or encoded are skipped and their IDs
    are recorded in the module-level ``not_found_*`` list of the split. The
    list is cleared first, so re-running a split does not accumulate stale
    IDs. A per-exception-type summary is printed so that a systematic error is
    not mistaken for missing files.

    Args:
        ids: iterable of image IDs belonging to the chosen split.
        dataset_type: "train", "valid" or "test".

    Raises:
        ValueError: if ``dataset_type`` is not one of the three known values.
        KeyError: if an ID is not present in the split's DataFrame.
    """
    if dataset_type == "train":
        out_zip, out_masks_zip = OUT_TRAIN, OUT_MASKS_TRAIN
        folder, dataset, not_found = "train", train_dataset, not_found_train
    elif dataset_type == "valid":
        out_zip, out_masks_zip = OUT_VALIDATION, OUT_MASKS_VALIDATION
        folder, dataset, not_found = "validation", validation_dataset, not_found_validation
    elif dataset_type == "test":
        out_zip, out_masks_zip = OUT_TEST, OUT_MASKS_TEST
        folder, dataset, not_found = "test", test_dataset, not_found_test
    else:
        raise ValueError(
            f"Unknown dataset_type {dataset_type!r}; expected 'train', 'valid' or 'test'"
        )

    score_by_id = dict(zip(dataset["image_id"], dataset["gleason_score"]))
    not_found.clear()
    failure_reasons = {}

    # NOTE(review): the mask archive is created but nothing is written to it,
    # exactly as before; it is kept so the set of output files is unchanged.
    with zipfile.ZipFile(out_zip, 'w') as img_out, \
            zipfile.ZipFile(out_masks_zip, 'w'):
        for image_id in tqdm(ids):
            # Looked up outside the try block: a missing label is a caller
            # error and must not be reported as a missing image.
            gleason_score = score_by_id[image_id]
            try:
                png = _encode_mosaic_png(image_id)
            except Exception as exc:
                # Best effort: skip the image, but remember why it failed.
                not_found.append(image_id)
                reason = type(exc).__name__
                failure_reasons[reason] = failure_reasons.get(reason, 0) + 1
                continue
            img_out.writestr(f'{folder}/GLEASON_SCORE_{gleason_score}/{image_id}.png', png)

    print(f"INFO: Not images found in {folder}: {len(not_found)}")
    if failure_reasons:
        print(f"INFO: Failure reasons in {folder}: {failure_reasons}")

In [None]:
# Build one archive per split, each from its own ID list.
# The keyword is `dataset_type` (the original `datset_type` raised TypeError),
# and the validation/test archives now receive their own IDs instead of the
# training/validation ones.
generate_dataset(train_IDs, dataset_type='train')
generate_dataset(validation_IDs, dataset_type='valid')
generate_dataset(test_IDs, dataset_type='test')

## Lost/Corrupted Data
Some images were not successfully processed for some reason: 80 in the training set, 20 in the testing set.

In [None]:
# Share of each split that was processed, plus the images that were lost.
slice_names = "Training Images", "Validation Images", "Testing Images", "Loss"
lost_total = len(not_found_train) + len(not_found_validation) + len(not_found_test)
# Lost images are still counted in the length of their split, so they are
# subtracted there to keep the four slices disjoint.
slice_sizes = [
    len(X_train) - len(not_found_train),
    len(X_validation) - len(not_found_validation),
    len(X_test) - len(not_found_test),
    lost_total,
]
total = len(X)

fig, ax = plt.subplots(figsize=(30,7))

ax.pie(slice_sizes, labels=slice_names, autopct='%1.1f%%',
       shadow=True, startangle=60)
ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
# Adjacent string literals are concatenated implicitly; the original joined
# them with `/`, which raises TypeError for str operands.
ax.set_title(
    "Distribution of the dataset\n"
    f"Total Images - {100.0:.1f}%: {total} \n"
    f"Training images - {len(X_train) / total * 100:.1f}%: {len(X_train)} \n"
    f"Validation images - {len(X_validation) / total * 100:.1f}%: {len(X_validation)} \n"
    f"Testing images - {len(X_test) / total * 100:.1f}%: {len(X_test)} \n "
    f"Loss - : {lost_total} / {total} images",
    weight="bold",
)

plt.show()

## Removing Lost/Corrupted Data

Since 100 images were not found (80 from the training set and 20 from the testing set), we want to make sure that this loss does not affect the final result: the missing images may belong to the classes that already have the fewest images in the original dataset.

In [None]:
# Metadata rows of the training images that could not be processed.
# isin() always yields a DataFrame (empty when nothing was lost); the original
# left a plain list in that case, which has no .groupby for the cells below.
not_found_train_eda = train_dataset[train_dataset["image_id"].isin(not_found_train)]

In [None]:
# Metadata rows of the validation images that could not be processed.
# isin() always yields a DataFrame (empty when nothing was lost); the original
# left a plain list in that case, which has no .groupby for the cells below.
not_found_validation_eda = validation_dataset[validation_dataset["image_id"].isin(not_found_validation)]

In [None]:
# Metadata rows of the test images that could not be processed.
# isin() always yields a DataFrame (empty when nothing was lost); the original
# left a plain list in that case, which has no .groupby for the cells below.
not_found_test_eda = test_dataset[test_dataset["image_id"].isin(not_found_test)]

In [None]:
# Lost training images per Gleason score, most frequent first
not_found_train_eda = (
    not_found_train_eda
    .groupby('gleason_score')['image_id']
    .count()
    .reset_index()
    .sort_values(by='image_id', ascending=False)
)
not_found_train_eda.style.background_gradient(cmap='Greens')

In [None]:
# Lost validation images per Gleason score, most frequent first
not_found_validation_eda = (
    not_found_validation_eda
    .groupby('gleason_score')['image_id']
    .count()
    .reset_index()
    .sort_values(by='image_id', ascending=False)
)
not_found_validation_eda.style.background_gradient(cmap='Reds')

In [None]:
# Lost test images per Gleason score, most frequent first
not_found_test_eda = (
    not_found_test_eda
    .groupby('gleason_score')['image_id']
    .count()
    .reset_index()
    .sort_values(by='image_id', ascending=False)
)
not_found_test_eda.style.background_gradient(cmap='Blues')

In [None]:
# Grouped bars: lost images per Gleason score in each split.
# Each trace name now matches the frame it plots (the original labelled the
# training counts "Not found test" and the test counts "Not found train").
fig = go.Figure(data=[
    go.Bar(name="Not found train", x=not_found_train_eda["gleason_score"], y=not_found_train_eda["image_id"]),
    go.Bar(name="Not found validation", x=not_found_validation_eda["gleason_score"], y=not_found_validation_eda["image_id"]),
    go.Bar(name="Not found test", x=not_found_test_eda["gleason_score"], y=not_found_test_eda["image_id"])
])

# Change the bar mode
fig.update_layout(barmode='group')
fig.show()

In [None]:
# Grouped bars: lost images next to the per-split class counts.
# The "Found" trace names now match the frames they plot (the original
# labelled the training counts "Found test" and the test counts "Found train").
fig = go.Figure(data=[
    go.Bar(name="Not found train", x=not_found_train_eda["gleason_score"], y=not_found_train_eda["image_id"]),
    go.Bar(name="Not found validation", x=not_found_validation_eda["gleason_score"], y=not_found_validation_eda["image_id"]),
    go.Bar(name="Not found test", x=not_found_test_eda["gleason_score"], y=not_found_test_eda["image_id"]),
    go.Bar(name="Found train", x=train_eda["gleason_score"], y=train_eda["image_id"]),
    go.Bar(name="Found validation", x=validation_eda["gleason_score"], y=validation_eda["image_id"]),
    go.Bar(name="Found test", x=test_eda["gleason_score"], y=test_eda["image_id"])
])

# Change the bar mode
fig.update_layout(barmode='group')
fig.show()

## Distribution of the loss images
The next pie charts show how the lost images are distributed across Gleason scores for the training, validation and testing datasets.

In [None]:
# Gleason-score distribution of the lost training images
df = not_found_train_eda
fig = px.pie(data_frame=df, names='gleason_score', values='image_id')
fig.show()

In [None]:
# Gleason-score distribution of the lost validation images
df = not_found_validation_eda
fig = px.pie(data_frame=df, names='gleason_score', values='image_id')
fig.show()

In [None]:
# Gleason-score distribution of the lost test images
df = not_found_test_eda
fig = px.pie(data_frame=df, names='gleason_score', values='image_id')
fig.show()