In [None]:
import os
import gc
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import cv2 as cv
import tifffile as tiff
from tqdm.notebook import tqdm
from PIL import Image

# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
        
%matplotlib inline

In [None]:
# Input locations (Kaggle dataset mount) and output locations (Kaggle working dir).
DATA_DIR = '/kaggle/input/hubmap-kidney-segmentation/'
TRAIN_DATA_DIR = '/kaggle/input/hubmap-kidney-segmentation/train/'
TEST_DATA_DIR = '/kaggle/input/hubmap-kidney-segmentation/test/'
TRAIN_SAVE_DIR = "/kaggle/working/train_tiles/"
TEST_SAVE_DIR = "/kaggle/working/test_tiles/"
MODEL_SAVE_DIR = "/kaggle/working/"

# Side length (pixels) of the square tiles written to disk.
TILE_SIZE = 256
# Integer factor by which each slide is downscaled before tiling.
REDUCE_RATE = 4

# makedirs(exist_ok=True) is idempotent on re-run and also creates missing
# parent directories, unlike the exists()+mkdir() pair.
for save_dir in (TRAIN_SAVE_DIR, TEST_SAVE_DIR):
    os.makedirs(save_dir, exist_ok=True)

# Helpers

In [None]:
def display_pil_images(
    images, 
    masks=None,
    labels=None,
    columns=5, width=20, height=8, max_images=15, 
    label_wrap_length=50, label_font_size=9):
    """Show images in a grid, optionally overlaying a mask on each one.

    images: sequence of images accepted by ``plt.imshow``.
    masks: optional sequence aligned with ``images``; each mask is drawn on
        top of its image with the 'coolwarm' colormap at 50% opacity.
    labels: optional sequence aligned with ``images``; used as subplot titles.
    columns: number of subplots per row.
    width, height: figure size; the height grows with the number of rows.
    max_images: at most this many images are shown (a notice is printed
        when the input is truncated).
    label_wrap_length: titles longer than this many characters are wrapped
        onto several lines.
    label_font_size: font size of the titles.
    """
    import math
    import textwrap

    if len(images) > max_images:
        print(f"Showing {max_images} images of {len(images)}:")
        images = images[0:max_images]
        if masks is not None:
            masks = masks[0:max_images]

    # plt.subplot requires integer grid dimensions; a float row count
    # (len(images) / columns + 1) is rejected by current matplotlib.
    rows = max(1, math.ceil(len(images) / columns))

    height = max(height, (len(images) // columns) * height)
    plt.figure(figsize=(width, height))

    # Pair every image with its mask, or with None when no masks were given,
    # so a single loop serves both cases.
    overlays = masks if masks is not None else [None] * len(images)

    for i, (image, mask) in enumerate(zip(images, overlays)):
        plt.subplot(rows, columns, i + 1)
        plt.imshow(image)
        if mask is not None:
            plt.imshow(mask, cmap='coolwarm', alpha=0.5)

        if labels is not None:
            title = textwrap.fill(str(labels[i]), label_wrap_length)
            plt.title(title, fontsize=label_font_size)

        
def mask2rle(img):
    '''
    Encode a binary mask as a run-length string.

    img: numpy array, 1 - mask, 0 - background
    Returns "start length start length ..." with 1-based starts, where the
    pixels are numbered after transposing and flattening the array.
    '''
    # Zero sentinels on both ends guarantee every run has a start and an end.
    padded = np.pad(img.T.ravel(), 1)
    # 1-based positions where the value flips: run starts alternate with run ends.
    flips = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    run_starts = flips[::2]
    run_lengths = flips[1::2] - run_starts
    return ' '.join(f"{start} {length}" for start, length in zip(run_starts, run_lengths))
 
def rle2mask(mask_rle, shape=(1600,256)):
    '''
    Decode a run-length string into a binary mask.

    mask_rle: run-length as formatted string ("start length start length ...",
        1-based starts)
    shape: (width, height) of the encoded image; the returned array is the
        transpose, i.e. it has shape (height, width)
    Returns numpy uint8 array, 1 - mask, 0 - background
    '''
    tokens = np.asarray(mask_rle.split(), dtype=int)
    # Even positions hold 1-based starts, odd positions hold run lengths.
    begins = tokens[0::2] - 1
    stops = begins + tokens[1::2]

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, stop in zip(begins, stops):
        flat[begin:stop] = 1
    return flat.reshape(shape).T


def is_tile_contains_info(img, pixel_limits, content_threshold, expected_shape):
    """
    Decide whether a tile holds enough in-range pixels to be worth keeping.

    img: np.array with the channels on the last axis
    pixel_limits: tuple (low, high); a pixel counts only if every channel is
        strictly between the two limits
    content_threshold: float, fraction (0..1) of counted pixels that must be
        exceeded for the tile to be accepted
    expected_shape: tuple; tiles with any other shape are rejected

    Returns (accepted, fraction_of_counted_pixels).
    """
    # Reject wrongly-shaped tiles before doing any per-pixel work.
    if img.shape != expected_shape:
        return False, 0.

    low, high = pixel_limits[0], pixel_limits[1]
    # True where all channels of the pixel lie strictly inside (low, high).
    in_range = np.all((img > low) & (img < high), axis=-1)

    percent_of_pixels = in_range.sum() / (img.shape[0] * img.shape[1])
    return percent_of_pixels > content_threshold, percent_of_pixels

def _read_tiff_hwc(img_path, idx, channel_first_ids):
    """Read a TIFF and, for the ids listed, move the channel axis to the end."""
    image = tiff.imread(img_path)
    if idx in channel_first_ids:
        # These slides are treated as channel-first with singleton axes:
        # drop the singleton axes, then reorder (C, H, W) -> (H, W, C).
        image = np.transpose(image.squeeze(), (1, 2, 0))
    return image


def _pad_and_downscale(array):
    """Zero-pad the first two axes to a multiple of REDUCE_RATE*TILE_SIZE, then shrink by REDUCE_RATE."""
    step = REDUCE_RATE * TILE_SIZE
    pad_width = []
    for size in array.shape[:2]:
        pad = (-size) % step
        # Split the padding as evenly as possible between both sides.
        pad_width.append((pad // 2, pad - pad // 2))
    # Any further axes (e.g. channels) are left unpadded.
    pad_width.extend([(0, 0)] * (array.ndim - 2))

    array = np.pad(array, pad_width, constant_values=0)
    # cv.resize takes the target size as (width, height).
    return cv.resize(array,
                     (array.shape[1] // REDUCE_RATE, array.shape[0] // REDUCE_RATE),
                     interpolation=cv.INTER_AREA)


def _tile_origins(array):
    """Yield the (row, column) origin of every TILE_SIZE x TILE_SIZE tile, row-major."""
    for x in range(0, array.shape[0], TILE_SIZE):
        for y in range(0, array.shape[1], TILE_SIZE):
            yield x, y


def _write_png(path, tile, rgb=False):
    """Write a tile with cv.imwrite, raising if OpenCV reports a failure."""
    if rgb:
        # cv.imwrite interprets 3-channel data as BGR. The tiles are assumed to
        # be RGB as read from the TIFF (TODO confirm), so swap the channels;
        # otherwise the PNG shows red/blue swapped when reopened with PIL.
        tile = cv.cvtColor(tile, cv.COLOR_RGB2BGR)
    if not cv.imwrite(path, tile):
        raise IOError(f"Failed to write tile to {path}")


def extract_train_tiles(sample_img_path, rle_mask_sample, idx):
    """
    Cut one training slide and its mask into tiles and save them as PNG files.

    sample_img_path: path of the slide TIFF
    rle_mask_sample: run-length encoded mask of the slide (see rle2mask)
    idx: slide id; used for the output sub-directory of TRAIN_SAVE_DIR

    Only tiles accepted by is_tile_contains_info (more than 70% of pixels with
    every channel strictly between 50 and 220) are written, as
    img_<n>.png / mask_<n>.png.

    Returns a list of (image_path, mask_path) tuples, one per written tile.
    Raises IOError if a tile cannot be written.
    """
    print(idx)
    sample_image = _read_tiff_hwc(
        sample_img_path, idx,
        ('e79de561c', '095bf7a1f', '54f2eec69', '1e2425f28'))

    # The mask is decoded at full resolution: rle2mask takes (width, height).
    sample_mask = rle2mask(rle_mask_sample, (sample_image.shape[1], sample_image.shape[0]))
    print(f"Original Tiff image shape: {sample_image.shape}")

    # Image and mask go through the same padding/downscaling so they stay aligned.
    sample_image = _pad_and_downscale(sample_image)
    sample_mask = _pad_and_downscale(sample_mask)

    print(f"Reduced Tiff image shape: {sample_image.shape}")

    out_dir = os.path.join(TRAIN_SAVE_DIR, idx)
    os.makedirs(out_dir, exist_ok=True)

    # Tiles are written as soon as they are accepted instead of being buffered,
    # which keeps at most one float32 tile in memory at a time.
    paths = []
    for x, y in _tile_origins(sample_image):
        sub_image = np.float32(sample_image[x:x+TILE_SIZE, y:y+TILE_SIZE])
        if not is_tile_contains_info(sub_image, (50, 220), 0.7, (TILE_SIZE, TILE_SIZE, 3))[0]:
            continue

        count = len(paths)
        img_path = os.path.join(TRAIN_SAVE_DIR, idx, f"img_{count}.png")
        mask_path = os.path.join(TRAIN_SAVE_DIR, idx, f"mask_{count}.png")
        _write_png(img_path, sub_image, rgb=True)
        _write_png(mask_path, sample_mask[x:x+TILE_SIZE, y:y+TILE_SIZE])
        paths.append((img_path, mask_path))

    print("Length tiles", len(paths))
    gc.collect()

    return paths


def extract_test_tiles(sample_img_path, idx):
    """
    Cut one test slide into tiles and save every tile as a PNG file.

    sample_img_path: path of the slide TIFF
    idx: slide id; used for the output sub-directory of TEST_SAVE_DIR

    Unlike extract_train_tiles, no tile is filtered out; tiles are written in
    row-major order as img_<n>.png.

    Returns the list of written image paths.
    Raises IOError if a tile cannot be written.
    """
    print(idx)
    sample_image = _read_tiff_hwc(sample_img_path, idx, ('26dc41664', 'c68fe75ea'))

    print(f"Original Tiff image shape: {sample_image.shape}")

    sample_image = _pad_and_downscale(sample_image)

    print(f"Reduced Tiff image shape: {sample_image.shape}")

    out_dir = os.path.join(TEST_SAVE_DIR, idx)
    os.makedirs(out_dir, exist_ok=True)

    paths = []
    for x, y in _tile_origins(sample_image):
        sub_image = np.float32(sample_image[x:x+TILE_SIZE, y:y+TILE_SIZE])
        img_path = os.path.join(TEST_SAVE_DIR, idx, f"img_{len(paths)}.png")
        _write_png(img_path, sub_image, rgb=True)
        paths.append(img_path)

    print("Length tiles", len(paths))
    gc.collect()

    return paths
    

# Data exploration

In [None]:
# Training table; the 'id' and 'encoding' columns are used by the cells below.
train_csv_path = os.path.join(DATA_DIR, 'train.csv')
train_df = pd.read_csv(train_csv_path, encoding='utf-8')
train_df

In [None]:
# Resolve the file through DATA_DIR like the other inputs, instead of a path
# relative to the current working directory.
sample_df = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
sample_df

In [None]:
# Peek at the anatomical-structure annotation file of one training slide.
anatomy_json_path = os.path.join(DATA_DIR, 'train/2f6ecfcdf-anatomical-structure.json')
with open(anatomy_json_path, mode='r', encoding='utf-8') as anatomy_file:
    sample_anatomy = json.loads(anatomy_file.read())

sample_anatomy

In [None]:
# One TIFF per id: training ids come from train.csv, test ids from the sample submission.
train_img_paths = [
    os.path.join(TRAIN_DATA_DIR, image_id + '.tiff')
    for image_id in train_df['id']
]
test_img_paths = [
    os.path.join(TEST_DATA_DIR, image_id + '.tiff')
    for image_id in sample_df['id']
]

print(train_img_paths)
print(test_img_paths)

In [None]:
%%time

IMAGE_IDX = 4

sample_image = tiff.imread(train_img_paths[IMAGE_IDX])

sample_image = np.transpose(sample_image.squeeze(), (1,2,0))
img_id = train_df['id'].values[0]
print("This image's id:", img_id)
print(f"Sample image shape: {sample_image.shape}")
      
sample_mask = rle2mask(train_df['encoding'].values[IMAGE_IDX], (sample_image.shape[1], sample_image.shape[0]))
print(f"Sample image shape: {sample_mask.shape}")

In [None]:
# Draw the slide with its mask overlaid at 50% opacity.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(sample_image)
ax.imshow(sample_mask, cmap='coolwarm', alpha=0.5);

# Make patches

In [None]:
%%time

all_train_paths = []
for idx, train_img_path in enumerate(train_img_paths):
    paths = extract_train_tiles(train_img_path, 
                                train_df['encoding'].values[idx], 
                                train_df['id'].values[idx])
    all_train_paths.extend(paths)
    
print("Length of all samples:", len(all_train_paths))

In [None]:
%%time

all_test_paths = []
for idx, test_img_path in enumerate(test_img_paths):
    paths = extract_test_tiles(test_img_path, 
                               sample_df['id'].values[idx])
    all_test_paths.extend(paths)
    
print("Length of all samples:", len(all_test_paths))

# Result

In [None]:
# Only the first 15 tiles are displayed, so only those files are opened
# (opening every tile would hold thousands of file handles for no benefit).
preview_pairs = all_train_paths[:15]
imgs = [Image.open(img_path) for img_path, _ in preview_pairs]
masks = [Image.open(mask_path) for _, mask_path in preview_pairs]
display_pil_images(imgs, masks)

In [None]:
# Slice the paths first so that only the tiles in the previewed range are opened.
preview_paths = all_test_paths[450:500]
imgs = [Image.open(img_path) for img_path in preview_paths]
display_pil_images(imgs, labels=preview_paths)

In [None]:
# Archive both tile directories, then delete the unpacked copies.
# The paths are relative, so this relies on the notebook's working directory
# being the parent of train_tiles/ and test_tiles/ (presumably /kaggle/working,
# matching TRAIN_SAVE_DIR / TEST_SAVE_DIR - confirm before running elsewhere).
!zip -r train_tiles_256.zip train_tiles
!zip -r test_tiles_256.zip test_tiles
# After this point the paths in all_train_paths / all_test_paths no longer exist.
!rm -r train_tiles
!rm -r test_tiles