In [None]:
!pip -q install tensorflow==2.3.0

In [None]:
# Basics / Data manipulation
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import zipfile
import os

# Visualization
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import skimage.io
from IPython.display import display, HTML

# ML
from keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

%matplotlib inline

# Data
More than 10,000 `.tiff` images, split as follows:

* **90%** for training
* **10%** held out for internal evaluation, which is split again into:
    * 75% validation
    * 25% testing

In [None]:
# Folder paths
# Directories with the slide images and their label masks (relative to the working directory).
TRAIN = '../input/prostate-cancer-grade-assessment/train_images'
MASKS = '../input/prostate-cancer-grade-assessment/train_label_masks'

# Disabled archive targets, kept for reference only.
# OUT_TRAIN = './train.zip'
# OUT_VALIDATION = './validation.zip'
# OUT_TEST = './test.zip'
# OUT_MASKS_TRAIN = './masks_train.zip'
# OUT_MASKS_VALIDATION = './masks_validation.zip'
# OUT_MASKS_TEST = './masks_test.zip'

# Root folder of the competition dataset; its contents are listed right below.
BASE_FOLDER = "/kaggle/input/prostate-cancer-grade-assessment/"
!ls {BASE_FOLDER}

In [None]:
# Read the three competition tables in one pass, then tag each frame's
# column index with the name of the file it came from.
train, test, sub = (
    pd.read_csv(BASE_FOLDER + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)
train.columns.name = "train.csv"
test.columns.name = "test.csv"
sub.columns.name = "sample_submission.csv"

print(f'Number of images: {len(train)}')
display(train.head())

In [None]:
# Show every row whose gleason_score is the literal string "negative"
is_negative = train['gleason_score'] == 'negative'
display(train[is_negative])

In [None]:
# Remove a mislabeled row (index label 7273) and convert the "negative" labels to "0+0"
# so that every gleason_score follows the same notation.
# The drop is not done in place and tolerates a missing label, so re-running this cell
# no longer raises KeyError once the row is already gone.
train = train.drop(index=[7273], errors="ignore")
train['gleason_score'] = train['gleason_score'].replace("negative", "0+0")
print(f'Number of images: {len(train)}')

In [None]:
# Load the table of suspicious slides and keep only its image_id column.
sus = pd.read_csv("../input/collection-of-600-suspicious-slides-data-loader/PANDA_Suspicious_Slides.csv")
susIDs = sus['image_id']
print(f'Number of suspicious images: {len(susIDs)}')

In [None]:
# Keep only the slides whose image_id is not listed as suspicious.
# The bare `train.reset_index()` / `susIDs.reset_index()` calls that used to precede this
# line were removed: their return values were discarded, so they changed nothing.
train = train[~train.image_id.isin(susIDs)]
print(f'Number of reliable images: {len(train)}')

In [None]:
# Positional access from here on: column names are dropped, so the slices below
# depend on the column order of `train`.
data = np.array(train) # Converting the DataFrame to an array to take the column
labels = data[:, 3] # Labels of interest (GLEASON SCORE)
#labels

In [None]:
# First three columns of the array built from `train`.
features = data[:, :3] # Features of interest (ID, PROVIDER, ISUP GRADE)
#features

In [None]:
# Conventional names for the split below: X holds the feature columns, y the labels.
X = features
y = labels

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows (X_check / y_check); the remaining 90% form the training split.
X_train, X_check, y_train, y_check = train_test_split(X, y, test_size=0.1, random_state = 42) 

In [None]:
# Split the held-out rows again: 25% of them become the test split, the other 75% validation.
X_validation, X_test, y_validation, y_test = train_test_split(X_check, y_check, test_size=0.25, random_state = 84)

## Train-Validation-Test

In [None]:
# Turn each split back into a labelled DataFrame: the three feature columns
# plus the matching gleason_score labels.
feature_columns = ["image_id", "data_provider", "isup_grade"]

X_train = pd.DataFrame(X_train, columns=feature_columns).assign(gleason_score=y_train)

X_validation = pd.DataFrame(X_validation, columns=feature_columns).assign(gleason_score=y_validation)

X_test = pd.DataFrame(X_test, columns=feature_columns).assign(gleason_score=y_test)

In [None]:
# Number of images per gleason_score in the training split, most frequent first.
train_eda = X_train.groupby("gleason_score").count()["image_id"].reset_index().sort_values(by="image_id", ascending=False)
# Every access to `.style` builds a new Styler, so the gradient and the caption must be
# chained on the same object; set on separate statements, the gradient was thrown away.
train_eda.style.background_gradient(cmap="Greens", subset=["image_id"]).set_caption("Train")

In [None]:
# Number of images per gleason_score in the validation split, most frequent first.
validation_eda = X_validation.groupby("gleason_score").count()["image_id"].reset_index().sort_values(by="image_id", ascending=False)
# Style and caption the validation table itself (the caption used to be attached to
# `train_eda`), chaining both calls on one Styler so the gradient is not discarded.
validation_eda.style.background_gradient(cmap="Reds", subset=["image_id"]).set_caption("Validation")

In [None]:
# Number of images per gleason_score in the test split, most frequent first.
test_eda = X_test.groupby("gleason_score").count()["image_id"].reset_index().sort_values(by="image_id", ascending=False)
# Style and caption the test table itself (the caption used to be attached to
# `train_eda`), chaining both calls on one Styler so the gradient is not discarded.
test_eda.style.background_gradient(cmap="Blues", subset=["image_id"]).set_caption("Test")

In [None]:
import plotly.graph_objects as go

# Grouped bars of the per-score image counts, one series per split.
# Each series name now matches the table it plots ("Test" and "Train" were swapped).
fig = go.Figure(data=[
    go.Bar(name="Train", x=train_eda["gleason_score"], y=train_eda["image_id"]),
    go.Bar(name="Validation", x=validation_eda["gleason_score"], y=validation_eda["image_id"]),
    go.Bar(name="Test", x=test_eda["gleason_score"], y=test_eda["image_id"]),
])

# Change the bar mode
fig.update_layout(barmode='group')
fig.show()

In [None]:
import plotly.express as px

In [None]:
# Pie chart of the image count per gleason_score in the training split.
df = train_eda
fig = px.pie(df, values='image_id', names='gleason_score', title = 'Training Images')

fig.show()

In [None]:
# Pie chart of the image count per gleason_score in the validation split.
df = validation_eda
fig = px.pie(df, values='image_id', names='gleason_score', title = 'Validation Images')
fig.show()

In [None]:
# Pie chart of the image count per gleason_score in the test split.
df = test_eda
fig = px.pie(df, values='image_id', names='gleason_score', title = 'Testing Images')
fig.show()

In [None]:
# Share of the whole dataset that ended up in each split.
# The wedge labels get their own name so the `labels` array holding the gleason scores
# is not overwritten (re-running the earlier `y = labels` cell would otherwise break).
split_labels = 'Training Images','Validation Images', 'Testing Images'
sizes_features = [len(X_train), len(X_validation), len(X_test)]
total_images = len(X)
# sizes_labels = [len(y_train), len(y_validation)]

fig, ax = plt.subplots(figsize=(30,7))

ax.pie(sizes_features, labels=split_labels, autopct='%1.1f%%',
          shadow=True, startangle=60)
ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
# Percentages are rendered with one decimal instead of the full float repr.
ax.set_title(
    "Distribution of the dataset\n"
    f" Total Images - {total_images / total_images:.1%}: {total_images}\n"
    f" Training images - {len(X_train) / total_images:.1%}: {len(X_train)}\n"
    f" Validation images - {len(X_validation) / total_images:.1%}: {len(X_validation)}\n"
    f" Testing images - {len(X_test) / total_images:.1%}: {len(X_test)} \n ",
    weight="bold",
)
plt.show()

In [None]:
# First rows of the training split.
display(X_train.head())

In [None]:
# First rows of the validation split.
display(X_validation.head())

In [None]:
# First rows of the test split.
display(X_test.head())

In [None]:
# Saving the datasets
# NOTE(review): "variant" is not among the columns the split frames were built with, so
# selecting it here presumably yields an empty column - confirm this is intended.
export_columns = ["image_id", "variant", "data_provider", "isup_grade", "gleason_score"]
X_train = pd.DataFrame(X_train, columns=export_columns)
X_validation = pd.DataFrame(X_validation, columns=export_columns)
X_test = pd.DataFrame(X_test, columns=export_columns)

# One CSV per split, written to the working directory.
for split_frame, csv_path in (
    (X_train, "./training.csv"),
    (X_validation, "./validation.csv"),
    (X_test, "./testing.csv"),
):
    split_frame.to_csv(csv_path)

In [None]:
SIZE_IMG = 112
N = 16
def tile(img, mask):
    """Cut an image and its mask into square tiles and keep the N darkest ones.

    Both arrays are padded (image with 255, mask with 0) so that height and width become
    multiples of SIZE_IMG, then reshaped into SIZE_IMG x SIZE_IMG x 3 tiles. If fewer than
    N tiles result, blank tiles (255 for the image, 0 for the mask) are appended. Tiles
    are ranked by the sum of their image pixel values, and the N with the lowest sum are
    returned in ascending order.

    Args:
        img: array indexed as (rows, cols, 3).
        mask: array with the same rows/cols as `img` and 3 channels.

    Returns:
        A list of dicts with keys 'img', 'mask' and 'idx' (position in the ranking).
    """
    rows, cols = img.shape[0], img.shape[1]
    extra_rows = (SIZE_IMG - rows % SIZE_IMG) % SIZE_IMG
    extra_cols = (SIZE_IMG - cols % SIZE_IMG) % SIZE_IMG
    # Padding is split as evenly as possible between both sides of each spatial axis.
    spatial_pad = [
        [extra_rows // 2, extra_rows - extra_rows // 2],
        [extra_cols // 2, extra_cols - extra_cols // 2],
        [0, 0],
    ]
    img = np.pad(img, spatial_pad, constant_values=255)
    mask = np.pad(mask, spatial_pad, constant_values=0)

    def _split(arr):
        # (H, W, 3) -> (tiles, SIZE_IMG, SIZE_IMG, 3), tiles enumerated row by row
        grid = arr.reshape(arr.shape[0] // SIZE_IMG, SIZE_IMG,
                           arr.shape[1] // SIZE_IMG, SIZE_IMG, 3)
        return grid.transpose(0, 2, 1, 3, 4).reshape(-1, SIZE_IMG, SIZE_IMG, 3)

    img = _split(img)
    mask = _split(mask)

    shortfall = N - len(img)
    if shortfall > 0:
        filler = [[0, shortfall], [0, 0], [0, 0], [0, 0]]
        mask = np.pad(mask, filler, constant_values=0)
        img = np.pad(img, filler, constant_values=255)

    # Lowest pixel sum first, i.e. the darkest tiles win.
    keep = np.argsort(img.reshape(img.shape[0], -1).sum(-1))[:N]
    img, mask = img[keep], mask[keep]

    return [{'img': img[k], 'mask': mask[k], 'idx': k} for k in range(len(img))]

In [None]:
def multiplyTiles(tiles):
    """Build the eight rotated/mirrored variations of every tile.

    For each tile dict (keys "img", "mask", "idx") the image and mask are transformed
    together, so the mask always stays aligned with its image:

        A: original                   E: A flipped left-right
        B: A rotated with np.rot90    F: B flipped left-right
        C: B rotated with np.rot90    G: C flipped left-right
        D: C rotated with np.rot90    H: D flipped left-right

    Every variation keeps the "idx" of the tile it was derived from.

    Args:
        tiles: sequence of tile dicts as produced by `tile`.

    Returns:
        A list of eight lists [A, B, C, D, E, F, G, H], each holding one dict per input
        tile in input order. For empty input, eight empty lists are returned (the result
        used to be assigned only inside the loop, so empty input raised UnboundLocalError).
    """
    variations = [[] for _ in range(8)]

    for entry in tiles:
        idx = entry['idx']

        # A, B, C, D: each rotation is derived from the previous one.
        rotations = [(entry['img'], entry['mask'])]
        for _ in range(3):
            prev_img, prev_mask = rotations[-1]
            rotations.append((np.rot90(prev_img), np.rot90(prev_mask)))

        # E, F, G, H: mirrored counterparts of A, B, C, D.
        mirrors = [(np.fliplr(v_img), np.fliplr(v_mask)) for v_img, v_mask in rotations]

        for bucket, (v_img, v_mask) in zip(variations, rotations + mirrors):
            bucket.append({"img": v_img, "mask": v_mask, "idx": idx})

    return variations

In [None]:
import openslide
# Preview one slide as a thumbnail. The slide is opened in a `with` block so that its
# file handle is closed once the thumbnail has been produced (it was never closed before).
with openslide.OpenSlide('/kaggle/input/prostate-cancer-grade-assessment/train_images/0005f7aaab2800f6170c399693a96917.tiff') as slide:
    display(slide.get_thumbnail(size=(512,512)))


## Image Preview


In [None]:
# Re-load the saved splits, keeping only the four metadata columns.
split_columns = ["image_id", "data_provider", "isup_grade", "gleason_score"]
train_dataset = pd.read_csv("./training.csv", usecols=split_columns)
validation_dataset = pd.read_csv("./validation.csv", usecols=split_columns)
test_dataset = pd.read_csv("./testing.csv", usecols=split_columns)

# Mapping to the original dataset
# ../input/prostate-cancer-grade-assessment/train_images/0005f7aaab2800f6170c399693a96917.tiff
# ../input/prostate-cancer-grade-assessment/train_label_masks/0005f7aaab2800f6170c399693a96917_mask.tiff
preview_id = "0005f7aaab2800f6170c399693a96917"
img = skimage.io.MultiImage(os.path.join(TRAIN, preview_id + '.tiff'))[1]
mask = skimage.io.MultiImage(os.path.join(MASKS, preview_id + '_mask.tiff'))[1]

# Tile the slide, then derive the eight rotated/mirrored variations of its tiles.
tiles = tile(img, mask)
tiles_A, tiles_B, tiles_C, tiles_D, tiles_E, tiles_F, tiles_G, tiles_H = multiplyTiles(tiles)

In [None]:
# Display the variations: one 4x4 figure per variation, titled "Variation A" .. "Variation H".
# A single loop replaces eight copy-pasted stanzas; it also no longer ends the cell on a
# `suptitle(...)` expression, so no stray Text(...) repr is printed under the figures.
variation_tiles = {
    "A": tiles_A, "B": tiles_B, "C": tiles_C, "D": tiles_D,
    "E": tiles_E, "F": tiles_F, "G": tiles_G, "H": tiles_H,
}

for variation_name, variant in variation_tiles.items():
    variation_fig, variation_axes = plt.subplots(4, 4, figsize=(10, 10))
    # Fill the grid row by row, one tile image per cell.
    for cell, entry in zip(variation_axes.flat, variant):
        cell.imshow(entry["img"])
        cell.axis('off')
    variation_fig.suptitle(f'Variation {variation_name}')

## Concatenate Images - 16:1
The function ```concat_tile()``` concatenates 16 tiles in one single image, which will be saved later on.

In [None]:
# Pick the image id stored under index label 163 of each split.
id_train = train_dataset.loc[163, "image_id"]
id_validation = validation_dataset.loc[163, "image_id"]
id_test = test_dataset.loc[163, "image_id"]

# Testing the function
def concat_tile(im_list_2d):
    """Join a 2-D list of images into one image.

    Each inner list is concatenated horizontally with cv2.hconcat; the resulting rows
    are then stacked vertically with cv2.vconcat.
    """
    rows = []
    for im_list_h in im_list_2d:
        rows.append(cv2.hconcat(im_list_h))
    return cv2.vconcat(rows)

def mosaic(tiles):
    """Assemble the first 16 tile images into a single 4x4 mosaic.

    Tiles are placed row by row: tiles[0..3] form the top row and tiles[12..15] the
    bottom row. Only the "img" entry of each tile dict is used, so `tiles` must provide
    at least 16 entries.
    """
    grid = [
        [tiles[row * 4 + col]["img"] for col in range(4)]
        for row in range(4)
    ]
    return concat_tile(grid)

# Load the sampled training slide and its mask (entry 1 of each multi-image TIFF).
img = skimage.io.MultiImage(os.path.join(TRAIN, f"{id_train}.tiff"))[1]
mask = skimage.io.MultiImage(os.path.join(MASKS, f"{id_train}_mask.tiff"))[1]

# Re-tile the freshly loaded slide. Without this step `tiles` still held the tiles of the
# slide processed earlier, so the mosaic did not show the image named in the title.
tiles = tile(img, mask)
mosaic_img = mosaic(tiles)

plt.title(f"ID: {id_train}")
plt.imshow(mosaic_img)