In [None]:
import os
from os import walk

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from PIL import Image
import cv2
from tqdm.notebook import tqdm
import skimage.io
from skimage.transform import resize, rescale
import time
import seaborn as sns

%matplotlib inline
import mpld3
mpld3.enable_notebook()

In [None]:
# Values of the `data_provider` column used to select rows further down
radboud, karolinska = 'radboud', 'karolinska'

In [None]:
# Label table read from the output of the "panda-eda-and-filtering-data" notebook
train_labels = pd.read_csv('../input/panda-eda-and-filtering-data/proper_test_cases.csv')
# Keep only the rows whose gleason_score is one of '0+0', '3+3', '4+4', '5+5'.
# .copy() makes the subset an independent frame: a later cell adds a 'score'
# column to it, which on a plain boolean-indexed slice raises
# SettingWithCopyWarning and may not write to the intended object.
train_labels_red = train_labels[train_labels['gleason_score'].isin(['0+0','3+3', '4+4', '5+5'])].copy()
# train_labels = pd.read_csv('/kaggle/input/prostate-cancer-grade-assessment/train.csv')
# train_label_red['score']
train_labels_red.head()

In [None]:
# print((train_labels_red['image_id']=='000920ad0b612851f8e01bcc880d9b3d').sum())

In [None]:
# Maps the gleason_score string of a slide to the integer class label used below
gleason_score_mapper = {'0+0': 0, # Healthy cells and tissues
                        '3+3': 1, # Gleason grade 3
                        '4+4': 2, # Gleason grade 4
                        '5+5': 3, # Gleason grade 5
                       }
# assign() returns a new frame instead of writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning (chained assignment).
train_labels_red = train_labels_red.assign(
    score=train_labels_red['gleason_score'].map(gleason_score_mapper)
)
# map() yields NaN for unknown keys; fail loudly instead, as the dict lookup did
assert train_labels_red['score'].notna().all(), "unmapped gleason_score value"
train_labels_red.head()

In [None]:
# Number of retained rows per data provider
is_karolinska = train_labels_red['data_provider'] == karolinska
is_radboud = train_labels_red['data_provider'] == radboud
print(f"Karolinska: {int(is_karolinska.sum())}")
print(f"\nRadboud: {int(is_radboud.sum())}")

We start with the Radboud data because it has the smaller number of samples.

In [None]:
# Provider processed in the rest of the notebook (read as a global by later cells)
data_provider = radboud
provider_rows = train_labels_red['data_provider'] == data_provider
train_labels_to_crop = train_labels_red[provider_rows]
train_labels_to_crop.head()

In [None]:
# Directory containing the training slide images (.tiff)
img_dir = '/kaggle/input/prostate-cancer-grade-assessment/train_images/'
n_image_files = len(os.listdir(img_dir))
print(f"Number of images in directory: {n_image_files}")

In [None]:
# Directory containing the label masks (<image_id>_mask.tiff)
mask_dir = '/kaggle/input/prostate-cancer-grade-assessment/train_label_masks/'
n_mask_files = len(os.listdir(mask_dir))
print(f"Number of images in directory: {n_mask_files}")

# working with one image

In [None]:
# img_id = train_labels.image_id[2]
# mask_path = mask_dir + img_id + '_mask.tiff'
# img_path = img_dir + img_id + '.tiff'
# print(f"img_id: {img_id}, path: {img_path}")
# print(f"mask_path: {mask_path}")

In [None]:
# image = skimage.io.MultiImage(img_path)
# mask = skimage.io.MultiImage(mask_path)

In [None]:
# print("Image sizes: ")
# for frame in image:
#     print(f"frame.shape: {frame.shape}")
    
# print("Mask sizes: ")
# for frame in mask:
#     print(f"mask.shape: {frame.shape}")

In [None]:
# Consider the middle image, which is a medium size one
# image = image[1]
# mask = mask[1]

In [None]:
# # Display image
# plt.imshow(image[1])

# # Display mask
# plt.figure()
# plt.imshow(mask[1])

In [None]:
# Create one output folder per class label, named "<data_provider>_scores_<label>",
# relative to the current working directory.
for score in gleason_score_mapper.values():
    path = f"{data_provider}_scores_{score}"
    print(path)
    try:
        os.mkdir(path)
    except OSError as error:
        # Best effort: report the problem (e.g. folder already exists) and carry on
        print(error)

In [None]:
def get_slice(image, mask, x_index, y_index, target_size, display = 0):
    """Cut one square tile out of an image and the matching tile out of its mask.

    The arrays are treated as a grid of ``target_size`` x ``target_size`` tiles.
    ``x_index`` selects the tile along axis 0 and ``y_index`` along axis 1.

    Parameters
    ----------
    image : array indexed as ``image[rows, cols, :]`` (at least 3 dimensions)
    mask : array indexed as ``mask[rows, cols]``
    x_index, y_index : int
        Tile position in the grid.
    target_size : int
        Side length of a tile in pixels.
    display : truthy/falsy, default 0
        When truthy, plot the mask tile (values multiplied by 30, gray colormap)
        next to the image tile.

    Returns
    -------
    (image_slice, mask_slice)
        The two tiles. Indices beyond the array bounds yield empty or
        truncated tiles, following normal slicing rules.
    """
    rows = slice(x_index * target_size, (x_index + 1) * target_size)
    cols = slice(y_index * target_size, (y_index + 1) * target_size)

    image_slice = image[rows, cols, :]
    mask_slice = mask[rows, cols]

    if not display:
        return image_slice, mask_slice

    # Mask values are small integers; scale them so they are visible in gray
    scaled_mask = mask_slice * 30
    plt.figure()
    plt.subplot(1, 2, 1)
    plt.imshow(scaled_mask, cmap = 'gray')  # single-channel 2D mask
    plt.title("Mask")
    plt.subplot(1, 2, 2)
    plt.imshow(image_slice)
    plt.title("Biopsy")
    return image_slice, mask_slice

In [None]:
# image_class:
# 0: background
# 1: healthy cells
# 3: gleason_grade 3
# 4: gleason_grade 4
# 5: gleason_grade 5


In [None]:
# print(image.mean())

In [None]:
# Create a DataFrame object for storage: one row per kept patch, holding the
# pixel count of every mask value found in that patch.
# Patch grid indices get no columns here. Attribute-style assignment
# (`df.i = 0`) would only set a Python attribute on this object, not create a
# column, so it is not used.
df = pd.DataFrame()
## This grading is valid only for radboud
df[0] = 0 # White background
df[1] = 0 # Connecting tissue
df[2] = 0 # Healthy tissue
df[3] = 0 # Gleason grade 3
df[4] = 0 # Gleason grade 4
df[5] = 0 # Gleason grade 5
df['data_provider'] = ""

# Storing mean values: one row per analysed patch (mean pixel value + class label)
modified_train_info = pd.DataFrame(columns = ['data_provider', 'mean', 'image_class'])
modified_train_info.head()

# This won't save any masks; it only analyses a background threshold based on the patch mean

In [None]:
def _append_row(frame, row):
    """Return a new frame made of ``frame`` plus ``row`` (dict or Series) as its last row."""
    # DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
    new_row = pd.DataFrame([row]).infer_objects()
    return pd.concat([frame, new_row], ignore_index=True)


def analyse_image_mask_radboud(image_slice, mask_slice, label, i, j, df, modified_train_info, reference):
    """Label one patch from its mask and record its statistics.

    Mask values are interpreted with the Radboud convention used in this
    notebook: 0 background, 1 connecting tissue, 2 healthy tissue,
    3/4/5 Gleason grade 3/4/5.

    Parameters
    ----------
    image_slice : array
        Image patch; only its mean is recorded.
    mask_slice : array of integer mask values in 0..5
        Mask patch matching ``image_slice``.
    label : int
        Slide-level class label, used for the patch unless it is overridden to
        0 (healthy) by the rule below.
    i, j : int
        Grid position of the patch (currently not recorded).
    df : DataFrame
        Per-patch pixel-count table; a row is added for every kept patch.
    modified_train_info : DataFrame
        Table of (data_provider, mean, image_class); a row is added for every
        patch, with ``image_class`` -1 for patches rejected as background.
    reference : indexable counter
        ``reference[label]`` is incremented in place for every kept patch.

    Returns
    -------
    (df, modified_train_info) : the two updated tables (new objects).

    Notes
    -----
    Reads the notebook-level global ``data_provider``.
    """
    # Pixel count per mask value present in the patch
    counts = pd.Series(mask_slice.reshape(-1)).value_counts()
    counts_np = np.zeros(6).astype('uint64')
    # The loop variable is deliberately not called `i`, so that the patch index
    # parameter is not overwritten.
    for mask_value, n_pixels in counts.items():
        counts_np[mask_value] = n_pixels
    counts['saved'] = 0

    # Keep the patch only if more than 20000 of its pixels are non-background
    # (a 224x224 patch has 50176 pixels).
    tissue_cells = int(counts_np[1:].sum())
    if tissue_cells > 20000:
        healthy_cells = int(counts_np[1:3].sum())   # connecting + healthy tissue (1, 2)
        cancerous_cells = int(counts_np[3:].sum())  # Gleason 3, 4, 5
        if healthy_cells > 2*cancerous_cells:
            # Healthy pixels outnumber cancerous ones more than 2:1 -> healthy patch
            label = 0
        counts['saved'] = 1
        # Record the patch mean, later used to separate tissue from background
        modified_train_info = _append_row(modified_train_info,
                                          {'data_provider': data_provider,
                                           'mean': image_slice.mean(),
                                           'image_class': label})
        counts['data_provider'] = data_provider
        reference[label] += 1
        df = _append_row(df, counts)
    else:
        # Mostly background: log the mean with the sentinel class -1
        modified_train_info = _append_row(modified_train_info,
                                          {'data_provider': data_provider,
                                           'mean': image_slice.mean(),
                                           'image_class': -1})

    return df, modified_train_info
            

In [None]:
target_size = 224                     # side length of a square patch, in pixels
img_size = target_size * target_size  # number of pixels in one patch
def gen_crops(row, df, modified_train_info, reference):
    """Tile one slide into ``target_size`` patches and analyse each patch.

    Parameters
    ----------
    row : mapping with keys 'image_id' and 'score'
        One row of the label table.
    df, modified_train_info : DataFrame
        Accumulator tables passed to and returned by the patch analysis.
    reference : indexable counter
        Passed to the patch analysis, which updates it in place.

    Returns
    -------
    (df, modified_train_info) : the updated accumulator tables.

    Notes
    -----
    Reads the notebook-level globals ``img_dir``, ``mask_dir``,
    ``data_provider``, ``radboud`` and ``target_size``.
    Only whole patches are visited; the remainder at the bottom and right
    edges is ignored (integer division).
    """
    image_id = row['image_id']
    label = row['score']

    # Level 1 of each multi-resolution TIFF is the medium-sized version
    image = skimage.io.MultiImage(img_dir + image_id + '.tiff')[1]
    mask = skimage.io.MultiImage(mask_dir + image_id + '_mask.tiff')[1]
    assert image.shape == mask.shape

    # The class information lives in the first channel of the mask
    mask = mask[:, :, 0]

    n_rows = mask.shape[0] // target_size
    n_cols = mask.shape[1] // target_size
    for i in range(n_rows):
        for j in range(n_cols):
            image_slice, mask_slice = get_slice(image, mask, i, j, target_size)
            # Only non-empty patches are analysed
            if image_slice.size != 0 and mask_slice.size != 0:
                if data_provider == radboud:
                    df, modified_train_info = analyse_image_mask_radboud(
                        image_slice, mask_slice, label, i, j,
                        df, modified_train_info, reference)
                else:
                    # To do for karolinska
                    print("NOT YET IMPLEMENTED FOR KAROLINSKA")
    return df, modified_train_info
#             mask_crop = mask
#     counts = pd.Series(temp.reshape(-1)).value_counts()
#     row.height = temp.shape[0]
#     row.width = temp.shape[1]
#     row.update(counts)

In [None]:
# %time df, modified_train_info = gen_crops(train_labels.loc[2], df, modified_train_info)

In [None]:
# Class distribution of the slides selected for cropping
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=train_labels_to_crop, x="score", hue="data_provider", ax=ax)
ax.set_title("Score by Data Provider", fontsize=14)
ax.set_xlabel("Score", fontsize=14)
ax.set_ylabel("Count", fontsize=14)
plt.show()

In [None]:

# Class distribution of the first 100 rows only (the subset processed below)
first_100_rows = train_labels_to_crop.iloc[:100]
ax = sns.countplot(data=first_100_rows, x="score", hue="data_provider")

# Test: 0-100

In [None]:
# Per-class counter of kept patches; handed down to the patch analysis,
# which increments it in place.
reference = np.zeros(len(gleason_score_mapper))
temp_df = train_labels_to_crop.iloc[:100]
for _, row in tqdm(temp_df.iterrows(), total=len(temp_df)):
    df, modified_train_info = gen_crops(row, df, modified_train_info, reference)

# PDF of healthy vs cancerous

In [None]:
# Split the logged patches: class labels 0-3 are tissue, -1 marks background
patch_class = modified_train_info['image_class']
cells = modified_train_info[patch_class.isin([0, 1, 2, 3])]
background = modified_train_info[patch_class.isin([-1])]

In [None]:
# Distribution of the patch mean: tissue patches vs. background patches
for frame, colour, name in ((cells, "skyblue", "Cells"),
                            (background, "red", "Background")):
    sns.distplot(frame["mean"], color=colour, label=name)
plt.legend()

In [None]:
# Same comparison as cumulative distributions (histogram and KDE both cumulative)
kwargs = dict(cumulative=True)
cumulative_opts = dict(hist_kws=kwargs, kde_kws=kwargs)
sns.distplot(cells['mean'], **cumulative_opts)
sns.distplot(background['mean'], **cumulative_opts)

# Comparing distributions of each cell category

In [None]:
# One frame of logged patches per class label 0..3
class0, class1, class2, class3 = (
    modified_train_info[modified_train_info['image_class'].isin([class_label])]
    for class_label in range(4)
)

In [None]:
# Overlay the patch-mean distribution of every class and of the background
labelled_groups = (
    (class0, "Class-0"),
    (class1, "Class-1"),
    (class2, "Class-2"),
    (class3, "Class-3"),
    (background, 'Background'),
)
for frame, name in labelled_groups:
    sns.distplot(frame["mean"], label=name)
plt.legend()

# Observation:
Approximate maximum of the patch mean for tissue patches:
* Class 0: 250
* Class 1, 2, 3: 230

> So we can choose a mean in the range *[140, 240]* as our region of interest, and treat patches whose mean lies outside this range as background.
The higher classes (1, 2, 3) stay well below 240, so we do not lose information about the most important classes.

In [None]:
# Patch counts per class (hard-coded), converted to fractions of the total
di = {
    'Class 0': 39343,
    'Class 1': 3116,
    'Class 2': 3712,
    'Class 3': 585,
}

total = sum(di.values())
di = {name: count / total for name, count in di.items()}
print(di)