# HuBMAP - Exploratory Data Analysis (EDA)
This notebook provides a brief exploratory data analysis of the new HuBMAP dataset. The full kidney images in the training dataset are visualized with the glomeruli FTUs highlighted. A brief analysis of the shape of the glomeruli follows.

# References
The following references were used in this notebook.
- Reading images: https://www.kaggle.com/ihelon/hubmap-exploratory-data-analysis
- RLE encoding: https://www.kaggle.com/paulorzp/rle-functions-run-lenght-encode-decode

# Imports

In [None]:
import cv2
import datetime
import gc
import glob
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import skimage.morphology
import sys
import tensorflow as tf
import tifffile

# Parameters

In [None]:
# Root directory of the dataset; the train/ and test/ images and the csv files
# are read relative to this path.
base_path = '../input/hubmap-kidney-segmentation'

# If True, analyze_image() also plots a downscaled overview of the whole image
# with the mask overlaid.
plot_full_image = True

# Number of glomeruli to display for each image
num_glom_display = 5

# Number of glomeruli tiles to save to disk for each image
# (analyze_image() writes them as PNG files).
num_glom_save = 5

# Scale factor applied to images and masks when they are read for plotting
glob_scale = 0.25

# Utility Functions

In [None]:
def rle_to_image(rle_mask, image_shape):
    """
    Decodes a run-length encoded mask into a 2D numpy array.
    Reference: https://www.kaggle.com/paulorzp/rle-functions-run-lenght-encode-decode

    The rle string is a whitespace separated sequence of "start length" pairs
    with 1-based start positions into the flattened mask.

    :param rle_mask: string with rle mask.
    :param image_shape: (width, height) of the mask to decode.
    :return: Mask as a uint8 numpy array, transposed to (height, width).
             1 = mask, 0 = background.
    """
    tokens = rle_mask.split()

    # Even positions hold the 1-based run starts, odd positions the run lengths
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_ends = run_starts + np.asarray(tokens[1::2], dtype=int)

    flat_mask = np.zeros(image_shape[0] * image_shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat_mask[begin:end] = 1

    return flat_mask.reshape(image_shape).T


# File Structure
The files in the root of the dataset are shown below. The dataset consists of 2 directories that contain training and test images and 3 csv-files with additional information about the images.
## Directory Contents

In [None]:
# One directory entry per line
print(*os.listdir(base_path), sep='\n')

## Training Images
The train directory contains 15 images for training.

In [None]:
# Collect the training images in a stable (sorted) order
train_files = glob.glob(os.path.join(base_path, 'train/*.tiff'))
train_files.sort()
print(f'Number of training images: {len(train_files)}')
print(*train_files, sep='\n')

## Test Images
The test directory contains 5 images for testing.

In [None]:
# Collect the test images in a stable (sorted) order
test_files = glob.glob(os.path.join(base_path, 'test/*.tiff'))
test_files.sort()
print(f'Number of test images: {len(test_files)}')
print(*test_files, sep='\n')

## Train.csv
The masks marking the glomeruli FTUs are stored in RLE format in train.csv, one entry per training image id.

In [None]:
# RLE encoded glomeruli masks, one row per training image id
train_csv_path = os.path.join(base_path, 'train.csv')
df_train = pd.read_csv(train_csv_path)
display(df_train)

## Sample_Submission.csv
The sample_submission.csv file shows the format of a submission file, which consists of the test image id and an RLE-encoded mask for each test image.

In [None]:
# Template of the submission file
submission_csv_path = os.path.join(base_path, 'sample_submission.csv')
df_submission = pd.read_csv(submission_csv_path)
display(df_submission)

## Patient Data
HuBMAP-20-dataset_information.csv contains additional information about each image such as image size and anonymized patient data.

In [None]:
# Image sizes and anonymized patient data, one row per image file
info_csv_path = os.path.join(base_path, 'HuBMAP-20-dataset_information.csv')
df_info = pd.read_csv(info_csv_path)
display(df_info)

In [None]:
# Show floats with one decimal and thousands separators from here on
pd.set_option('display.float_format', '{:,.1f}'.format)
df_info.describe()

# Training Image Analysis

## Width and Height Distribution
The training images do not have consistent dimensions. This has to be corrected when loading the images. They have one of the following shapes:
- [height, width, channel]
- [channel, height, width]
- [1, 1, channel, height, width]

In [None]:
# Report the raw array shape of every image. Each image is released again
# right away so that only one of them is held in memory at a time.
for image_path in train_files + test_files:
    pixels = tifffile.imread(image_path)
    print(f'Image {image_path} shape: {pixels.shape}', flush=True)
    del pixels
    gc.collect()

The size of the images varies greatly as well. 

In [None]:
# Scatter plot of the image sizes, both axes starting at the origin with
# 10 % headroom above the largest value
image_widths = df_info['width_pixels']
image_heights = df_info['height_pixels']

fig, ax = plt.subplots()
ax.scatter(image_widths, image_heights)
ax.set_title('Image Height and Width')
ax.set_xlabel('Width')
ax.set_ylabel('Height')
ax.set_xlim(0, image_widths.max() * 1.1)
ax.set_ylim(0, image_heights.max() * 1.1)
ax.grid()

## Image Utility Functions

In [None]:
def overlay_image_mask(image, mask, mask_color=(0,255,0), alpha=1.0):
    """
    Highlights the masked region of an image with a color overlay.

    Inside the mask every pixel is blended (weighted by alpha) towards a
    brightened gray value of the pixel (80 % channel mean + 20 % white) tinted
    with mask_color. Pixels outside the mask are returned unchanged.

    :param image: image as numpy array of shape (height, width, channels).
    :param mask: mask with 1 = mask, 0 = background, either of shape
                 (height, width) or (height, width, 1).
    :param mask_color: overlay color, one value in 0..255 per image channel.
    :param alpha: blending weight of the overlay, 1.0 replaces the masked
                  pixels completely.
    :return: image with overlay as uint8 numpy array of the same shape as image.
    """
    im_f = image.astype(np.float32)

    # A 2D mask needs a trailing channel axis to broadcast against the image
    mask = np.asarray(mask)
    if mask.ndim == 2:
        mask = mask[..., np.newaxis]

    mask_col = np.asarray(mask_color, dtype=np.float64).reshape(1, 1, -1) / 255.0
    gray = np.mean(0.8 * im_f + 0.2 * 255, axis=2, keepdims=True)
    return (im_f + alpha * mask * (gray * mask_col - im_f)).astype(np.uint8)


def overlay_image_mask_original(image, mask, mask_color=(0,255,0), alpha=1.0):
    """
    Returns the image and the image with mask overlay side by side.

    :param image: image as numpy array of shape (height, width, channels).
    :param mask: mask with 1 = mask, 0 = background.
    :param mask_color: overlay color passed on to overlay_image_mask().
    :param alpha: overlay blending weight passed on to overlay_image_mask().
    :return: original image (left) and overlay image (right) concatenated
             along the width axis.
    """
    # mask_color and alpha must be forwarded, otherwise the overlay is always
    # drawn with the defaults of overlay_image_mask().
    overlay = overlay_image_mask(image, mask, mask_color=mask_color, alpha=alpha)
    return np.concatenate((image, overlay), axis=1)

def get_image_id(image_file):
    """
    Derives the image id from the path of an image file.

    :param image_file: path of the image file.
    :return: file name without directory and without its (last) extension.
    """
    file_name = os.path.basename(image_file)
    image_id, _extension = os.path.splitext(file_name)
    return image_id


def read_image(image_file, scale=1.0):
    """
    Reads a tiff image and brings it into (height, width, channel) layout.

    Singleton dimensions are dropped and an image whose first dimension has
    size 3 is treated as channel-first and transposed.

    :param image_file: path of the tiff file.
    :param scale: resize factor applied to width and height, 1.0 keeps the
                  original size.
    :return: tuple (image, shape), where shape is the shape of the image
             before resizing.
    """
    pixels = np.squeeze(tifffile.imread(image_file))

    channels_first = pixels.shape[0] == 3
    if channels_first:
        pixels = np.transpose(pixels, (1, 2, 0))

    full_shape = pixels.shape
    if scale == 1.0:
        return pixels, full_shape
    return cv2.resize(pixels, (0, 0), fx=scale, fy=scale), full_shape


def read_mask(image_file, image_shape, scale=1.0):
    """
    Decodes the mask of an image from the rle encoding stored in df_train.

    :param image_file: path of the image file, its id selects the df_train row.
    :param image_shape: shape of the unscaled image, (height, width, ...).
    :param scale: resize factor applied to the decoded mask, 1.0 keeps the
                  original size.
    :return: mask of shape (height, width, 1) or None if df_train has no
             encoding for the image.
    """
    image_id = get_image_id(image_file)
    matches = df_train.loc[df_train['id'] == image_id]
    if len(matches) == 0:
        return None

    rle = matches['encoding'].values[0]
    if rle is None:
        return None

    # rle_to_image expects (width, height)
    mask = rle_to_image(rle, (image_shape[1], image_shape[0]))
    if scale != 1.0:
        mask = cv2.resize(mask, (0, 0), fx=scale, fy=scale)
    return np.expand_dims(mask, -1)

    
def read_image_mask(image_file, scale=1.0):
    """
    Reads an image together with its mask, both resized by the same factor.

    :param image_file: path of the image file.
    :param scale: resize factor for image and mask.
    :return: tuple (image, mask) as returned by read_image() and read_mask().
    """
    image, full_shape = read_image(image_file, scale)
    return image, read_mask(image_file, full_shape, scale)


def get_tile(image, mask, x, y, tile_size, scale=1.0):
    """
    Cuts a square tile centered at (x, y) out of an image and its mask.

    x, y and tile_size are multiplied by scale to obtain the pixel coordinates
    in the given arrays. A tile that extends beyond the image border is
    clipped to the image, so it can be smaller than requested.

    :param image: image as numpy array of shape (height, width, channels).
    :param mask: mask as numpy array of shape (height, width, channels).
    :param x: horizontal center of the tile.
    :param y: vertical center of the tile.
    :param tile_size: edge length of the tile.
    :param scale: factor that maps x, y and tile_size to array coordinates.
    :return: tuple (image tile, mask tile).
    """
    # int() guarantees valid slice indices even for numpy float coordinates
    center_x = int(round(x * scale))
    center_y = int(round(y * scale))
    half = int(round(tile_size / 2 * scale))

    # A negative slice start would count from the end of the axis and yield
    # an empty (or wrong) tile, so the window is clamped at 0. Stops beyond
    # the end of an axis are clipped by numpy itself.
    top, bottom = max(center_y - half, 0), max(center_y + half, 0)
    left, right = max(center_x - half, 0), max(center_x + half, 0)

    image_s = image[top:bottom, left:right, :]
    mask_s = mask[top:bottom, left:right, :]
    return image_s, mask_s


def get_particles(mask, scale=1.0):
    """
    Finds the connected components of a mask and returns their statistics.

    All positions and sizes are divided by scale (areas by scale squared).

    :param mask: mask image, passed to cv2.connectedComponentsWithStats().
    :param scale: scale of the mask relative to the coordinates to report.
    :return: DataFrame with one row per component and the columns x, y
             (centroid), left, top, width, height, area and no (running
             number after sorting by x and y).
    """
    _, _, stats, centroids = cv2.connectedComponentsWithStats(mask)

    # Drop the first row, OpenCV reports the background as label 0
    stats = stats[1:]
    centroids = centroids[1:]

    columns = {
        'x': centroids[:, 0] / scale,
        'y': centroids[:, 1] / scale,
        'left': stats[:, cv2.CC_STAT_LEFT] / scale,
        'top': stats[:, cv2.CC_STAT_TOP] / scale,
        'width': stats[:, cv2.CC_STAT_WIDTH] / scale,
        'height': stats[:, cv2.CC_STAT_HEIGHT] / scale,
        'area': stats[:, cv2.CC_STAT_AREA] / (scale * scale),
    }
    df_particles = pd.DataFrame(columns).sort_values(['x', 'y'], ignore_index=True)
    df_particles['no'] = range(len(df_particles))
    return df_particles


def analyze_image(image_file):
    """
    Prints the metadata of an image, plots it with its glomeruli highlighted
    and returns the glomeruli found in its mask.

    Image and mask are read at glob_scale for plotting, while the glomeruli
    statistics are computed from the mask at full resolution. If
    plot_full_image is set, a downscaled overview of the whole image is
    plotted. Tiles of the first num_glom_display glomeruli are plotted and
    tiles of the first num_glom_save glomeruli are written as PNG files into a
    directory named after the image id.

    :param image_file: path of the image file. The image needs an encoding in
                       df_train and a row in df_info.
    :return: DataFrame as returned by get_particles() for the full resolution
             mask with an additional 'id' column holding the image id.
    """
    image_id = get_image_id(image_file)
    image, image_shape = read_image(image_file, glob_scale)
    mask = read_mask(image_file, image_shape, glob_scale)

    # The glomeruli statistics are taken from the unscaled mask, which is
    # released again as soon as the statistics are available.
    mask_full = read_mask(image_file, image_shape, scale=1.0)
    df_glom = get_particles(mask_full, scale=1.0)
    df_glom['id'] = image_id
    del mask_full
    gc.collect()

    info = df_info[df_info['image_file'] == f'{image_id}.tiff']
    print(f'Image ID:        {image_id:}')
    print(f'Image Size:      {info["width_pixels"].values[0]} x {info["height_pixels"].values[0]}')
    print(f'Patient No:      {info["patient_number"].values[0]}')
    print(f'Sex:             {info["sex"].values[0]}')
    print(f'Age:             {info["age"].values[0]}')
    print(f'Race:            {info["race"].values[0]}')
    print(f'Height:          {info["height_centimeters"].values[0]} cm')
    print(f'Weight:          {info["weight_kilograms"].values[0]} kg')
    print(f'BMI:             {info["bmi_kg/m^2"].values[0]} kg/m^2')
    print(f'Laterality:      {info["laterality"].values[0]}')
    print(f'Percent Cortex:  {info["percent_cortex"].values[0]} %')
    print(f'Percent Medulla: {info["percent_medulla"].values[0]} %')

    # Plot full image
    if plot_full_image:
        overview_scale = 0.1
        image_small = cv2.resize(image, (0,0), fx=overview_scale, fy=overview_scale)
        mask_small = cv2.resize(mask, (0,0), fx=overview_scale, fy=overview_scale)
        # cv2.resize returns the mask without channel axis
        mask_small = np.expand_dims(mask_small,-1)

        plt.figure(figsize=(16, 16))
        plt.imshow(overlay_image_mask(image_small, mask_small))
        plt.axis('off')

    # Plot glomeruli images
    fig_cols = 5
    if num_glom_display > 0:
        # Only open the tile figure if tiles are displayed at all, a figure
        # with zero rows has no valid size.
        fig_rows = int(math.ceil(num_glom_display/fig_cols))
        plt.figure(figsize=(4 * fig_cols, 4 * fig_rows))
    if num_glom_save > 0:
        os.makedirs(image_id, exist_ok=True)
    for i in range(min(max(num_glom_display, num_glom_save), len(df_glom))):
        image_s, mask_s = get_tile(image, mask, df_glom['x'][i], df_glom['y'][i], 1000, scale=glob_scale)
        if image_s.size == 0:
            # Skip tiles that come back empty (e.g. when the window does not
            # fit inside the image) instead of failing in imshow/cvtColor.
            continue
        ovl = overlay_image_mask(image_s, mask_s)
        if i < num_glom_display:
            plt.subplot(fig_rows, fig_cols, i+1)
            plt.imshow(ovl)
            plt.axis('off')
        if i < num_glom_save:
            # Save into the directory created for this image. The overlay is
            # converted with COLOR_RGB2BGR before being handed to cv2.imwrite.
            tile_path = os.path.join(image_id, f'{image_id}_{i:03}.png')
            cv2.imwrite(tile_path, cv2.cvtColor(ovl, cv2.COLOR_RGB2BGR))

    del image, mask
    gc.collect()
    return df_glom


def plot_glom(df, image_id, glom_no):
    """
    Plots a single glomerulus, original tile and mask overlay side by side.

    :param df: DataFrame with the glomeruli, needs the columns id, no, x, y
               and area.
    :param image_id: id of the training image containing the glomerulus.
    :param glom_no: number of the glomerulus within the image (column 'no').
    """
    image_path = os.path.join(base_path, f'train/{image_id}.tiff')
    image, mask = read_image_mask(image_path, scale=glob_scale)

    is_selected = (df['id'] == image_id) & (df['no'] == glom_no)
    glom = df.loc[is_selected]
    center_x = glom['x'].iloc[0]
    center_y = glom['y'].iloc[0]
    im, ma = get_tile(image, mask, center_x, center_y, 1000, scale=glob_scale)

    # Only the tile is needed from here on, release the full image
    del image, mask
    gc.collect()

    plt.figure(figsize=(16,8))
    plt.imshow(overlay_image_mask_original(im, ma))
    plt.title(f'Image: {image_id}, Glomeruli No: {glom_no}, Area: {glom["area"].iloc[0]}')

## Training Images With Glomeruli

In [None]:
# Analyze every training image and collect the glomeruli of all images in a
# single DataFrame. The per-image frames are gathered in a list and
# concatenated once (DataFrame.append is no longer available in pandas 2.x).
glom_frames = []
for train_file in train_files:
    glom_frames.append(analyze_image(train_file))
    # Render the figures of this image before the output of the next image
    plt.show()

df_glom = pd.concat(glom_frames, ignore_index=True) if glom_frames else pd.DataFrame()

# Glomeruli
## Basic Statistics

In [None]:
# Keep the glomeruli table as a notebook output file and show it
glom_csv_file = 'glomeruli.csv'
df_glom.to_csv(glom_csv_file)
display(df_glom)

In [None]:
# Summary statistics of the numeric glomeruli columns
df_glom.describe()

## Glomeruli Per Image

In [None]:
# Count the glomeruli of each image once and plot one bar per image id
g = df_glom.groupby('id')
glom_counts = g.size()
plt.bar(glom_counts.index, glom_counts.values)
plt.title('Number of Glomerulis in Image')
plt.xticks(rotation=90)
plt.grid()

## Glomeruli Width, Height and Area Distribution

In [None]:
# One normalized histogram per glomeruli measure, side by side
plt.figure(figsize=(20,5))
histogram_panels = [
    ('width', 'Width Distribution'),
    ('height', 'Height Distribution'),
    ('area', 'Area Distribution'),
]
for panel_no, (column, panel_title) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 3, panel_no)
    plt.hist(df_glom[column], bins=40, density=True)
    plt.title(panel_title)
    plt.grid()

## Glomeruli by Size

In [None]:
# Order the glomeruli from smallest to largest area; the following cells rely
# on this order when they pick rows by position.
df_glom = df_glom.sort_values('area')
df_glom

## 5 Smallest Glomeruli

In [None]:
# df_glom is sorted by area, so the first rows are the smallest glomeruli.
# head() also copes with fewer than 5 glomeruli instead of raising an
# IndexError.
smallest_gloms = df_glom.head(5)
for image_id, glom_no in zip(smallest_gloms['id'], smallest_gloms['no']):
    plot_glom(df_glom, image_id, glom_no)

## 5 Largest Glomeruli

In [None]:
# df_glom is sorted by area, so the last rows are the largest glomeruli.
# tail() also copes with fewer than 5 glomeruli, where negative positions
# would wrap around and plot glomeruli twice.
largest_gloms = df_glom.tail(5)
for image_id, glom_no in zip(largest_gloms['id'], largest_gloms['no']):
    plot_glom(df_glom, image_id, glom_no)