In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import gc
import cv2
import pdb
import glob
import pytz
import warnings
import pickle
import random
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau, ExponentialLR
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader, Dataset, sampler
from albumentations.pytorch import ToTensorV2
import segmentation_models_pytorch as smp

import tifffile as tiff
import rasterio
from rasterio.windows import Window

**EDA (Exploratory Data Analysis):**

In [None]:
# Per-image metadata table of the competition dataset.
# NOTE(review): the same CSV is loaded again further down as `df_info`.
dset_info = pd.read_csv('/kaggle/input/hubmap-kidney-segmentation/HuBMAP-20-dataset_information.csv')
dset_info.head()

In [None]:
# Training annotations; `read_mask` looks up the `id` and `encoding` (RLE) columns.
dset_train = pd.read_csv(
    '../input/hubmap-kidney-segmentation/train.csv'
)
dset_train.head(3)

In [None]:
# Sample submission file shipped with the competition data.
dset_submission = pd.read_csv(
    '../input/hubmap-kidney-segmentation/sample_submission.csv'
)
dset_submission.head(3)

In [None]:
# Training images: collect every TIFF in the train folder, in sorted order.
train_files = sorted(
    glob.glob(
        os.path.join('../input/hubmap-kidney-segmentation', 'train/*.tiff')
    )
)
print(f'Number of training images: {len(train_files)}')
print('\n'.join(train_files))


In [None]:
# Test images: collect every TIFF in the test folder, in sorted order.
test_files = sorted(
    glob.glob(
        os.path.join('../input/hubmap-kidney-segmentation', 'test/*.tiff')
    )
)
print(f'Number of test images: {len(test_files)}')
print('\n'.join(test_files))

In [None]:
import tifffile
import gc
# Inspect the raw array shape of the first two training images. The layout is
# not uniform across files: (H, W, C), (C, H, W) or with extra leading
# singleton dimensions.

for f in train_files[:2]:
    image = tifffile.imread(f)
    print(f'Image {f} shape: {image.shape}', flush=True)
    # Release the array before reading the next file.
    del image
    gc.collect()

In [None]:

# Same shape inspection for the first three test images.
for f in test_files[:3]:
    image = tifffile.imread(f)
    print(f'Image {f} shape: {image.shape}', flush=True)
    # Release the array before reading the next file.
    del image
    gc.collect()

In [None]:
# Both train and test images vary in size; the per-image metadata table lists
# the pixel dimensions together with the patient information.
df_info = pd.read_csv(
    os.path.join(
        '../input/hubmap-kidney-segmentation',
        'HuBMAP-20-dataset_information.csv',
    )
)
df_info.head(4)

In [None]:
# The image sizes vary greatly as well: scatter of width against height.
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.scatter(df_info['width_pixels'], df_info['height_pixels'])
ax.set_title('Image Height and Width')
ax.set_xlabel('Width')
ax.set_ylabel('Height')
# Start both axes at the origin and leave 10% headroom above the largest image.
ax.set_xlim(0, df_info['width_pixels'].max() * 1.1)
ax.set_ylim(0, df_info['height_pixels'].max() * 1.1)
ax.grid()

In [None]:
# EDA on glomeruli in the test images: `read_mask` takes the test masks from
# the `predicted` (RLE) column of this submission file.

df_submit = pd.read_csv(
    '../input/hubmap-submission-file/sample_submission.csv'
)

**Image Utility Functions**

In [None]:
# Whether analyze_image also draws a downscaled overlay of the whole image.
plot_full_image = True

# Number of glomeruli to display for each image
num_glom_display = 5

# Number of glomeruli tiles to save to disk (analyze_image writes them as PNG files).
num_glom_save = 5

# Scale factor applied when images and masks are read for display.
glob_scale = 0.25

In [None]:
def rle_to_image(rle_mask, image_shape):
    """
    Decode a run-length-encoded mask string into a binary numpy image.
    Reference: https://www.kaggle.com/paulorzp/rle-functions-run-lenght-encode-decode

    The string holds whitespace-separated ``start length`` pairs; starts are
    1-based positions into the flattened image.

    :param rle_mask: string with rle mask.
    :param image_shape: (width, height) of the encoded image.
    :return: uint8 numpy array of shape (height, width). 1 = mask, 0 = background.
    """
    tokens = rle_mask.split()
    # Even tokens are run starts (converted to 0-based), odd tokens run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)

    flat = np.zeros(image_shape[0] * image_shape[1], dtype=np.uint8)
    for begin, length in zip(run_starts, run_lengths):
        flat[begin:begin + length] = 1

    # The flat buffer is laid out as (width, height); transpose to (height, width).
    return flat.reshape(image_shape).T

def overlay_image_mask(image, mask, mask_color=(0,255,0), alpha=1.0):
    """
    Tint the masked pixels of an image with a colour.

    Masked pixels are replaced (blended by ``alpha``) with the per-pixel
    channel mean of a lightened copy of the image, multiplied by the mask colour.

    :param image: array with the colour channels on axis 2.
    :param mask: array that broadcasts against ``image`` (e.g. shape (H, W, 1));
                 non-zero where the overlay is applied.
    :param mask_color: overlay colour as a 0-255 tuple.
    :param alpha: blend factor; 0 leaves the image unchanged, 1 fully replaces masked pixels.
    :return: uint8 array with the same shape as ``image``.
    """
    pixels = image.astype(np.float32)
    # Colour as 0..1 factors, shaped (1, 1, C) so it broadcasts over the image.
    tint = (np.array(mask_color) / 255.0)[np.newaxis, np.newaxis, :]
    # Lightened intensity of each pixel (80% image + 20% white), averaged over channels.
    brightness = np.mean(0.8 * pixels + 0.2 * 255, axis=2, keepdims=True)
    blended = pixels + alpha * mask * (brightness * tint - pixels)
    return blended.astype(np.uint8)

def overlay_image_mask_original(image, mask, mask_color=(0,255,0), alpha=1.0):
    """
    Return the original image and its mask overlay side by side.

    :param image: array with the colour channels on axis 2.
    :param mask: mask array accepted by ``overlay_image_mask``.
    :param mask_color: overlay colour as a 0-255 tuple.
    :param alpha: blend factor of the overlay.
    :return: array with the original on the left and the overlay on the right
             (concatenated along axis 1).
    """
    # Forward the colour and alpha; previously they were accepted but ignored,
    # so the overlay always used the defaults.
    overlay = overlay_image_mask(image, mask, mask_color=mask_color, alpha=alpha)
    return np.concatenate((image, overlay), axis=1)

def get_image_id(image_file):
    """
    Derive the image id from a file path: the file name without its directory
    and without its last extension.

    :param image_file: path to an image file.
    :return: the image id as a string.
    """
    file_name = os.path.basename(image_file)
    image_id, _extension = os.path.splitext(file_name)
    return image_id


def read_image(image_file, scale=1.0):
    """
    Read a TIFF image and optionally rescale it.

    Singleton dimensions are squeezed away, and an array whose first axis has
    length 3 is treated as channels-first and transposed to channels-last.

    :param image_file: path to the TIFF file.
    :param scale: resize factor applied to both axes; 1.0 returns the image unscaled.
    :return: tuple (image, orig_shape) where orig_shape is the shape before rescaling.
    """
    pixels = tifffile.imread(image_file).squeeze()
    channels_first = pixels.shape[0] == 3
    if channels_first:
        pixels = pixels.transpose(1, 2, 0)

    full_shape = pixels.shape
    if scale == 1.0:
        return pixels, full_shape
    return cv2.resize(pixels, (0,0), fx=scale, fy=scale), full_shape

def read_mask(image_file, image_shape, scale=1.0):
    """
    Build the mask that belongs to an image file.

    The RLE string is looked up by image id, first in the training annotations
    (``dset_train['encoding']``) and, if the id is not found there, in the
    submission file (``df_submit['predicted']``).

    :param image_file: path of the image; only its id is used.
    :param image_shape: shape of the unscaled image, (height, width, ...).
    :param scale: resize factor applied to the decoded mask.
    :return: uint8 mask with a trailing channel axis (H, W, 1), or None when
             the image id is in neither table.
    """
    image_id = get_image_id(image_file)

    rle = None
    found = False
    # Ground-truth annotations take precedence over submitted predictions.
    for table, column in ((dset_train, 'encoding'), (df_submit, 'predicted')):
        rows = table.loc[table['id'] == image_id]
        if len(rows) > 0:
            rle = rows[column].values[0]
            found = True
            break

    if not found or rle is None:
        return None

    if not isinstance(rle, str):
        # pandas reads an empty CSV field as NaN; decode it as an empty
        # (all-background) mask instead of crashing on ``NaN.split()``.
        rle = ''

    # rle_to_image expects (width, height).
    mask = rle_to_image(rle, (image_shape[1], image_shape[0]))
    if scale != 1.0:
        mask = cv2.resize(mask, (0,0), fx=scale, fy=scale)
    return np.expand_dims(mask,-1)

    
def read_image_mask(image_file, scale=1.0):
    """
    Read an image together with its mask, both at the requested scale.

    :param image_file: path to the TIFF file.
    :param scale: resize factor passed to ``read_image`` and ``read_mask``.
    :return: tuple (image, mask); mask is whatever ``read_mask`` returns for this image.
    """
    image, full_shape = read_image(image_file, scale)
    return image, read_mask(image_file, full_shape, scale)


def get_tile(image, mask, x, y, tile_size, scale=1.0):
    """
    Cut a square tile centred on (x, y) out of an image and its mask.

    :param image: image array indexed as [row, column, channel].
    :param mask: mask array indexed as [row, column, channel].
    :param x: tile centre column, in unscaled coordinates.
    :param y: tile centre row, in unscaled coordinates.
    :param tile_size: tile edge length, in unscaled pixels.
    :param scale: factor converting unscaled coordinates to the arrays' coordinates.
    :return: tuple (image_tile, mask_tile). Tiles that overlap the array border
             are cropped to the part that lies inside the array.
    """
    # Force plain ints so the values are always valid slice indices.
    cx = int(round(x * scale))
    cy = int(round(y * scale))
    half = int(round(tile_size / 2 * scale))

    # Clamp at 0: a negative slice bound would be interpreted as "from the
    # end" and produce an empty or wrong tile for glomeruli near the top/left edge.
    top, bottom = max(cy - half, 0), max(cy + half, 0)
    left, right = max(cx - half, 0), max(cx + half, 0)

    image_s = image[top:bottom, left:right, :]
    mask_s = mask[top:bottom, left:right, :]
    return image_s, mask_s
def get_particles(mask, scale=1.0):
    """
    Describe every connected component of a mask as one DataFrame row.

    :param mask: mask image passed to ``cv2.connectedComponentsWithStats``.
    :param scale: scale of ``mask`` relative to the original image; positions and
                  sizes are divided by it (areas by its square).
    :return: DataFrame with columns x, y (centroid), left, top, width, height,
             area and no (running number after sorting by x, then y).
    """
    _, _, stats, centroids = cv2.connectedComponentsWithStats(mask)
    # Row 0 is skipped (OpenCV's label 0 is the background).
    fg_stats = stats[1:]
    fg_centroids = centroids[1:]

    df_particles = pd.DataFrame({
        'x': fg_centroids[:, 0] / scale,
        'y': fg_centroids[:, 1] / scale,
        'left': fg_stats[:, cv2.CC_STAT_LEFT] / scale,
        'top': fg_stats[:, cv2.CC_STAT_TOP] / scale,
        'width': fg_stats[:, cv2.CC_STAT_WIDTH] / scale,
        'height': fg_stats[:, cv2.CC_STAT_HEIGHT] / scale,
        'area': fg_stats[:, cv2.CC_STAT_AREA] / (scale * scale),
    })
    df_particles = df_particles.sort_values(['x', 'y'], ignore_index=True)
    df_particles['no'] = range(len(df_particles))
    return df_particles

def analyze_image(image_file):
    """
    Print the metadata of one image, plot it with its mask and extract its glomeruli.

    Reads the image and mask at ``glob_scale`` for plotting, and the mask at
    full scale to locate the glomeruli. Depending on the notebook settings it
    plots the whole image (``plot_full_image``), shows the first
    ``num_glom_display`` glomeruli tiles and saves the first ``num_glom_save``
    tiles as PNG files inside a directory named after the image id.

    :param image_file: path to the TIFF image.
    :return: DataFrame from ``get_particles`` (one row per glomerulus) with an
             additional ``id`` column holding the image id.
    :raises ValueError: if no mask is available for the image.
    """
    # Local import so the function does not depend on a later notebook cell
    # having imported ``math`` first.
    import math

    image_id = get_image_id(image_file)
    image, image_shape = read_image(image_file, glob_scale)
    mask = read_mask(image_file, image_shape, glob_scale)
    if mask is None:
        # Fail with a readable message instead of an OpenCV error further down.
        raise ValueError(f'No mask available for image {image_id}')
    mask_full = read_mask(image_file, image_shape, scale=1.0)
    df_glom = get_particles(mask_full, scale=1.0)
    df_glom['id'] = image_id
    # The full-resolution mask is only needed for the particle analysis.
    del mask_full
    gc.collect()
    
    info = df_info[df_info['image_file'] == f'{image_id}.tiff']
    print(f'Image ID:        {image_id:}')
    print(f'Image Size:      {info["width_pixels"].values[0]} x {info["height_pixels"].values[0]}')
    print(f'Patient No:      {info["patient_number"].values[0]}')
    print(f'Sex:             {info["sex"].values[0]}')
    print(f'Age:             {info["age"].values[0]}')
    print(f'Race:            {info["race"].values[0]}')
    print(f'Height:          {info["height_centimeters"].values[0]} cm')
    print(f'Weight:          {info["weight_kilograms"].values[0]} kg')
    print(f'BMI:             {info["bmi_kg/m^2"].values[0]} kg/m^2')
    print(f'Laterality:      {info["laterality"].values[0]}')
    print(f'Percent Cortex:  {info["percent_cortex"].values[0]} %')
    print(f'Percent Medulla: {info["percent_medulla"].values[0]} %')
    
    # Plot full image
    if plot_full_image:
        scale = 0.1
        image_small = cv2.resize(image, (0,0), fx=scale, fy=scale)
        mask_small = cv2.resize(mask, (0,0), fx=scale, fy=scale)
        # Restore the trailing channel axis expected by overlay_image_mask.
        mask_small = np.expand_dims(mask_small,-1) 
    
        plt.figure(figsize=(16, 16))
        plt.imshow(overlay_image_mask(image_small, mask_small))
        plt.axis('off')

    # Plot glomeruli images
    fig_cols = 5
    fig_rows = int(math.ceil(num_glom_display/fig_cols))
    plt.figure(figsize=(4 * fig_cols, 4 * fig_rows))
    if num_glom_save > 0:
        os.makedirs(image_id, exist_ok=True)
    num_tiles = min(max(num_glom_display, num_glom_save), len(df_glom))
    for i in range(num_tiles):
        # Positional access: independent of the DataFrame's index labels.
        image_s, mask_s = get_tile(image, mask, df_glom['x'].iloc[i], df_glom['y'].iloc[i], 1000, scale=glob_scale)
        if image_s.size == 0:
            # Nothing to draw or save for an empty tile.
            continue
        
        ovl = overlay_image_mask(image_s, mask_s)
        if i < num_glom_display:
            plt.subplot(fig_rows, fig_cols, i+1)
            plt.imshow(ovl)
            plt.axis('off')
        if i < num_glom_save:
            # Write the tile into the per-image directory created above
            # (it used to be created but left empty).
            tile_path = os.path.join(image_id, f'{image_id}_{i:03}.png')
            cv2.imwrite(tile_path, cv2.cvtColor(ovl, cv2.COLOR_RGB2BGR))
    
    del image, mask
    gc.collect()
    return df_glom

In [None]:
import math
# Training images with glomeruli: start the table with the first training image.
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
df_glom = pd.concat([analyze_image(train_files[0])], ignore_index=True)

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_glom = pd.concat([df_glom, analyze_image(train_files[1])], ignore_index=True)


In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_glom = pd.concat([df_glom, analyze_image(train_files[3])], ignore_index=True)

**Basic statistics of the glomeruli in the training images**

In [None]:
# Persist the per-glomerulus table of the analysed training images
# (to_csv also writes the row index as the first column by default).
df_glom.to_csv('./glomeruli_train.csv')


In [None]:
# The CSV was written with the row index as its first column; read that column
# back as the index so it does not appear as a spurious `Unnamed: 0` feature
# in the statistics below.
df_glom = pd.read_csv('./glomeruli_train.csv', index_col=0)
df_glom.describe()

In [None]:
# Bar chart of the number of glomeruli found in each analysed training image.
g = df_glom.groupby('id')
glom_counts = g.size()
plt.bar(glom_counts.index, glom_counts.values)
plt.title('Number of Glomerulis in Image')
plt.xticks(rotation=90)
plt.grid()

**EDA on Test Images**

In [None]:
# Start the test-image glomeruli table with the first test image.
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
df_glom_test = pd.concat([analyze_image(test_files[0])], ignore_index=True)

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_glom_test = pd.concat([df_glom_test, analyze_image(test_files[1])], ignore_index=True)

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_glom_test = pd.concat([df_glom_test, analyze_image(test_files[2])], ignore_index=True)

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_glom_test = pd.concat([df_glom_test, analyze_image(test_files[3])], ignore_index=True)

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_glom_test = pd.concat([df_glom_test, analyze_image(test_files[4])], ignore_index=True)

In [None]:
# Persist the per-glomerulus table of the analysed test images
# (to_csv also writes the row index as the first column by default).
df_glom_test.to_csv('./glomeruli_test.csv')

In [None]:
# Bar chart of the number of glomeruli found in each analysed test image.
g = df_glom_test.groupby('id')
glom_counts = g.size()
plt.bar(glom_counts.index, glom_counts.values)
plt.title('Number of Glomerulis in Image')
plt.xticks(rotation=90)
plt.grid()

In [None]:
# Order the glomeruli from smallest to largest area; the cells below rely on
# this order when they pick rows by position.
df_glom_test = df_glom_test.sort_values('area')
df_glom_test

In [None]:

def plot_glom(df, image_id, glom_no, subset='test'):
    """
    Plot one glomerulus: the original tile next to the tile with its mask overlay.

    :param df: glomeruli table with at least the columns id, no, x, y and area.
    :param image_id: id of the image that contains the glomerulus.
    :param glom_no: running number (column ``no``) of the glomerulus within the image.
    :param subset: dataset folder holding the image, 'test' (default) or 'train'.
    :raises ValueError: if the glomerulus is not in ``df`` or the image has no mask.
    """
    # Look the glomerulus up first so a wrong id fails before the large image is read.
    glom = df.loc[(df['id'] == image_id) & (df['no'] == glom_no)]
    if glom.empty:
        raise ValueError(f'Glomerulus {glom_no} of image {image_id} not found')

    image_path = os.path.join('../input/hubmap-kidney-segmentation', f'{subset}/{image_id}.tiff')
    image, mask = read_image_mask(image_path, scale=glob_scale)
    if mask is None:
        raise ValueError(f'No mask available for image {image_id}')

    im, ma = get_tile(image, mask, glom['x'].iloc[0], glom['y'].iloc[0], 1000, scale=glob_scale)
    # Only the tile is needed from here on; release the full image and mask.
    del image, mask
    gc.collect()
    plt.figure(figsize=(16,8))
    plt.imshow(overlay_image_mask_original(im, ma))
    plt.title(f'Image: {image_id}, Glomeruli No: {glom_no}, Area: {glom["area"].iloc[0]}')


In [None]:
# Display the 10 smallest glomeruli of the test images (the table is sorted by
# area); fewer are shown when the table holds fewer than 10 rows.
for i in range(min(10, len(df_glom_test))):
    plot_glom(df_glom_test, df_glom_test['id'].iloc[i], df_glom_test['no'].iloc[i])

In [None]:
# Display the 5 largest glomeruli of the test images (the table is sorted by
# area). The start is clamped at 0 so a table with fewer than 5 rows does not
# wrap around to negative positions.
for i in range(max(len(df_glom_test) - 5, 0), len(df_glom_test)):
    plot_glom(df_glom_test, df_glom_test['id'].iloc[i], df_glom_test['no'].iloc[i])