## **Removing white space from images**

### **Import Useful Libraries.**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import openslide
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import cv2
import PIL
import random
import openslide
import skimage.io
import matplotlib
from IPython.display import Image, display

### **Base Folder Path of Dataset.**

In [None]:
# Root directory of the dataset; every other path in this notebook is built from it
BASE_FOLDER = "/kaggle/input/prostate-cancer-grade-assessment/"
# IPython shell escape: list the contents of the dataset folder
!ls {BASE_FOLDER}

### **Read all CSV files & set the train image folders of the Dataset.**

In [None]:
# Folders holding the training slides and their label masks
IMG_FOLDER, MASK_FOLDER = (
    BASE_FOLDER + folder_name
    for folder_name in ('train_images/', 'train_label_masks/')
)

# Tabular files of the dataset: training set, test set and sample submission
train, test, sub = (
    pd.read_csv(BASE_FOLDER + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

### **Drop the Mislabelled row from training dataset.**

In [None]:
# Remove the row with index label 7273 (the mislabelled sample) in place
train.drop(index=[7273], inplace=True)

### **Compute Statistics Function**

This function helps us find the ratio of white pixels, the green concentration & the blue concentration in the image.

In [None]:
def compute_statistics(image):
    """
    Compute the white-pixel ratio and the mean green/blue intensity of an image.

    Args:
        image                  numpy.array   multi-dimensional array of the form HxWxC
                                             (rows, columns, channels); channel 1 is read
                                             as green and channel 2 as blue (RGB order assumed)

    Returns:
        ratio_white_pixels     float         ratio of white pixels over total pixels in the image
        green_concentration    float         mean value of channel 1 over all pixels
        blue_concentration     float         mean value of channel 2 over all pixels
    """
    height, width = image.shape[0], image.shape[1]
    num_pixels = height * width

    # Note: A 3-channel white pixel has RGB (255, 255, 255), i.e. a channel sum of 765.
    # Any pixel whose channel sum exceeds 620 is counted as white.
    summed_matrix = np.sum(image, axis=-1)
    num_white_pixels = np.count_nonzero(summed_matrix > 620)
    ratio_white_pixels = num_white_pixels / num_pixels

    # Index the last (channel) axis: image[1] / image[2] would pick pixel rows 1 and 2
    # across all channels instead of the green and blue channels.
    green_concentration = np.mean(image[..., 1])
    blue_concentration = np.mean(image[..., 2])

    return ratio_white_pixels, green_concentration, blue_concentration

### **Select K best regions Function**

In [None]:
def select_k_best_regions(regions, k=20):
    """
    Filter regions by colour concentration and keep the k with the least white.

    Args:
        regions = list of tuples indexed as
                  (x, y, ratio_white_pixels, green_concentration, blue_concentration)
        k = number of regions to select

    Returns:
        list with the first k surviving tuples, ordered by ascending
        ratio of white pixels
    """
    def passes_colour_filter(region):
        # region[3] = green_concentration, region[4] = blue_concentration
        return region[3] > 180 and region[4] > 180

    def white_ratio(region):
        return region[2]

    candidates = list(filter(passes_colour_filter, regions))

    # Least-white regions come first; ties keep their input order (stable sort)
    candidates.sort(key=white_ratio)
    return candidates[:k]

### **Function to get K best regions.**

In [None]:
def get_k_best_regions(coordinates, image, window_size=512):
    """
    Cut one square window out of the image for every coordinate tuple.

    Args:
        coordinates = sequence of tuples whose first two items are the
                      top-left offsets of a window along axis 0 and axis 1
        image = array indexed as image[axis 0, axis 1, channel]
        window_size = side length of each square window

    Returns:
        dict mapping the position of each tuple in `coordinates` to the
        corresponding window of the image
    """
    return {
        index: image[row : row + window_size, col : col + window_size, :]
        for index, (row, col, *_) in enumerate(coordinates)
    }

### **Generate patches from Images**

The main function: two nested loops slide a window over the image (one along the rows, the other along the columns; the order does not actually matter). For each window position we select the region and compute its statistics; afterwards we sort the regions and select the k best ones.

In [None]:
def generate_patches(slide_path, window_size=200, stride=128, k=20):
    """
    Slide a square window over a slide image and select the k best regions.

    Args:
        slide_path = path of the image file, opened with skimage.io.MultiImage
                     (entry 0 of the MultiImage is used)
        window_size = side length in pixels of the square window
        stride = step in pixels between consecutive window positions;
                 must be positive
        k = number of regions to select

    Returns:
        image = the loaded image as a numpy array
        k_best_region_coordinates = list of tuples
            (x, y, ratio_white_pixels, green_concentration, blue_concentration)
            as returned by select_k_best_regions
        k_best_regions = dict of image windows as returned by get_k_best_regions

    Raises:
        ValueError if stride is not positive
    """
    if stride <= 0:
        # A non-positive stride never advances the window, so the scan would never end
        raise ValueError(f"stride must be positive, got {stride}")

    image = skimage.io.MultiImage(slide_path)[0]
    image = np.array(image)

    # x offsets run along axis 0 of the array, y offsets along axis 1
    x_limit, y_limit = image.shape[0], image.shape[1]
    regions_container = []

    # Visit only the windows that fit entirely inside the image
    for y_top_left_pixel in range(0, y_limit - window_size + 1, stride):
        for x_top_left_pixel in range(0, x_limit - window_size + 1, stride):
            patch = image[
                x_top_left_pixel : x_top_left_pixel + window_size,
                y_top_left_pixel : y_top_left_pixel + window_size,
                :
            ]

            ratio_white_pixels, green_concentration, blue_concentration = compute_statistics(patch)

            regions_container.append(
                (
                    x_top_left_pixel,
                    y_top_left_pixel,
                    ratio_white_pixels,
                    green_concentration,
                    blue_concentration,
                )
            )

    k_best_region_coordinates = select_k_best_regions(regions_container, k=k)
    k_best_regions = get_k_best_regions(k_best_region_coordinates, image, window_size)

    return image, k_best_region_coordinates, k_best_regions

### **Function to display regions of Images**

In [None]:
def display_images(regions, title):
    """
    Show the given regions on a 5x4 grid of subplots under a common title.

    Args:
        regions = dict mapping an integer grid position to an image region;
                  positions fill the grid row by row
        title = text used as the figure title
    """
    n_rows, n_cols = 5, 4
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, 15))

    for position, region in regions.items():
        row, col = divmod(position, n_cols)
        ax[row, col].imshow(region)

    fig.suptitle(title)

**List of all Image IDs & Labels.**

In [None]:
# Parallel lists: images[n] is the slide id whose ISUP grade is labels[n]
images = train['image_id'].tolist()
labels = train['isup_grade'].tolist()

In [None]:
%%time

# Path of the first training slide
ex_url = IMG_FOLDER + images[0] + '.tiff'
# Extract patches with the default settings (window_size=200, stride=128, k=20);
# the full image returned first is not needed here
_, best_coordinates, best_regions = generate_patches(ex_url)

In [None]:
# Plot the regions produced by the generate_patches call with default window_size and stride
display_images(best_regions, 'Window size: 200, stride: 128')

In [None]:
%%time

# Same slide as before, now with a smaller window and stride for comparison
ex_url = IMG_FOLDER + images[0] + '.tiff'
_, best_coordinates, best_regions = generate_patches(ex_url, window_size=128, stride=64)
# Plot the selected regions on the 5x4 grid
display_images(best_regions, 'Window size: 128, stride: 64')

### **Function to Glue all patches into one Image.**

In [None]:
def glue_to_one_picture(image_patches, window_size=200, k=16):
    """
    Assemble square patches into one square image, filled row by row.

    Args:
        image_patches = dict mapping a grid position (0, 1, 2, ...) to a patch
                        of shape (window_size, window_size, 3)
        window_size = side length in pixels of each patch
        k = number of patches the grid must be able to hold

    Returns:
        int16 array of shape (side*window_size, side*window_size, 3), where
        side is the smallest integer with side*side >= k; grid cells without
        a patch stay zero (black)
    """
    # Round up so that a k which is not a perfect square still fits on the grid;
    # rounding down would leave no room for the last patches
    side = int(np.ceil(np.sqrt(k)))
    image = np.zeros((side*window_size, side*window_size, 3), dtype=np.int16)

    for i, patch in image_patches.items():
        # Position i sits in grid row i // side and grid column i % side
        x, y = divmod(i, side)
        image[
            x * window_size : (x+1) * window_size,
            y * window_size : (y+1) * window_size,
            :
        ] = patch

    return image

### **Define Parameters**

In [None]:
# Patch-extraction settings passed to generate_patches and glue_to_one_picture
WINDOW_SIZE = 128    # side length in pixels of each square patch
STRIDE = 64          # step in pixels between consecutive window positions
K = 16               # number of best regions to select (a perfect square: 4x4 glued grid)

In [None]:
# One row per slide: the original image on the left, its glued patches on the right
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(20, 25))

for i, img in enumerate(images[:2]):
    url = IMG_FOLDER + img + '.tiff'
    image, best_coordinates, best_regions = generate_patches(
        url, window_size=WINDOW_SIZE, stride=STRIDE, k=K
    )
    glued_image = glue_to_one_picture(best_regions, window_size=WINDOW_SIZE, k=K)

    original_ax, glued_ax = ax[i]

    original_ax.imshow(image)
    original_ax.set_title(f'{img} - Original - Label: {labels[i]}')

    glued_ax.imshow(glued_image)
    glued_ax.set_title(f'{img} - Glued - Label: {labels[i]}')

fig.suptitle('From biopsy to glued patches')

Conclusion for Image Tiles:

Too small a window size can cause a loss of information, which will affect the model performance, so a window size of around 200 is a good choice: it captures enough biopsy detail & structure.

However, we can tune this further in the modelling part after seeing the performance of the model.
