## References

* This [notebook](https://www.kaggle.com/pestipeti/decoding-rle-masks) by Peter shows how to load the images using `skimage.io`

In [None]:
import os

import cv2
import numpy as np
import pandas as pd
import skimage.io
from tqdm.notebook import tqdm

## Variables

In [None]:
# Folder the CSV files and the per-split TIFF images are read from
data_dir = '/kaggle/input/hubmap-kidney-segmentation'
# Split to tile; change this to use test
split = 'train'
# Step between tiles and largest edge length of a tile, in pixels
tile_size = 512
# Extension of the image tiles (masks are always png); change to jpg for smaller files
ext = 'png'

## Helper functions

In [None]:
# https://www.kaggle.com/paulorzp/rle-functions-run-lenght-encode-decode
def mask2rle(img):
    '''
    Run-length encode a binary mask.

    img: numpy array, 1 - mask, 0 - background
    Returns the encoding as a space-separated string of
    "start length" pairs; starts are 1-based positions in the
    column-major (transposed, then flattened) pixel order.
    An all-background mask gives an empty string.
    '''
    flat = img.flatten(order='F')
    # Pad with background on both ends so runs touching a border
    # still produce both an opening and a closing transition.
    padded = np.concatenate(([0], flat, [0]))
    transitions = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Even slots hold run starts, odd slots hold run ends:
    # turn each end into a length in place.
    transitions[1::2] -= transitions[::2]
    return ' '.join(map(str, transitions))
 
def rle2mask(mask_rle, shape=(1600,256)):
    '''
    Decode a run-length string into a binary mask.

    mask_rle: space-separated "start length" pairs, starts are 1-based
    shape: (width, height) of the mask; the flat buffer is reshaped to
           this shape and then transposed, so the result is
           (height, width)
    Returns a uint8 numpy array, 1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    # Convert 1-based starts to 0-based offsets into the flat buffer
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat[begin:end] = 1
    return flat.reshape(shape).T

## Load the CSVs

In [None]:
# Read the training table and the sample submission from the data folder
train_df, sub_df = (
    pd.read_csv(f'{data_dir}/{csv_name}.csv')
    for csv_name in ('train', 'sample_submission')
)

## Break down all images

In [None]:
# Output folders for the image tiles and their matching mask tiles
os.makedirs(f'{split}_tiles/images', exist_ok=True)
os.makedirs(f'{split}_tiles/masks', exist_ok=True)

# One row per tile; turned into the metadata dataframe afterwards
meta_ls = []

# Choose a dataframe based on the split
if split == 'train':
    df = train_df
else:
    df = sub_df

# Masks can only be decoded when the chosen dataframe has an `encoding`
# column. Without it, image tiles are still written and the mask fields
# of the metadata are left empty instead of crashing on `df.encoding`.
# NOTE(review): presumably the sample submission has no `encoding`
# column -- confirm against the CSV.
has_masks = 'encoding' in df.columns


def _write_tile(tile_path, tile):
    """Write `tile` to `tile_path`, raising OSError if OpenCV reports failure."""
    # cv2.imwrite signals failure through its return value, not an exception
    if not cv2.imwrite(tile_path, tile):
        raise OSError(f"Could not write tile to {tile_path}")


# The break down starts here
for ix in range(df.shape[0]):
    img_id = df.id[ix]
    path = f"{data_dir}/{split}/{img_id}.tiff"
    # NOTE(review): assumes squeeze() leaves a (rows, cols, channels)
    # array -- TODO confirm this holds for every TIFF in the dataset.
    img = skimage.io.imread(path).squeeze()

    if has_masks:
        # rle2mask takes (width, height), i.e. the first two image axes reversed
        mask = rle2mask(df.encoding[ix], shape=img.shape[1::-1])
    else:
        mask = None

    # x runs along axis 0 and y along axis 1 of the image array
    x_max, y_max = img.shape[:2]

    for x0 in tqdm(range(0, x_max, tile_size)):
        # Tiles on the far edges are clipped to the image bounds
        x1 = min(x_max, x0 + tile_size)
        for y0 in range(0, y_max, tile_size):
            y1 = min(y_max, y0 + tile_size)

            img_tile = img[x0:x1, y0:y1]
            img_tile_path = f"{split}_tiles/images/{img_id}_{x0}-{x1}x_{y0}-{y1}y.{ext}"
            # OpenCV writes channels in BGR order, so convert from RGB first
            _write_tile(img_tile_path, cv2.cvtColor(img_tile, cv2.COLOR_RGB2BGR))

            if has_masks:
                mask_tile = mask[x0:x1, y0:y1]
                # Masks are always stored as png, independent of `ext`
                mask_tile_path = f"{split}_tiles/masks/{img_id}_{x0}-{x1}x_{y0}-{y1}y.png"
                _write_tile(mask_tile_path, mask_tile)
                max_mask_value = mask_tile.max()
            else:
                mask_tile_path = None
                max_mask_value = None

            meta_ls.append([
                img_id, x0, x1, y0, y1, img_tile.min(), img_tile.max(),
                max_mask_value, img_tile_path, mask_tile_path
            ])

In [None]:
# Column names, in the same order as the values appended to meta_ls
meta_columns = [
    'image_id',
    'x0',
    'x1',
    'y0',
    'y1',
    'min_pixel_value',
    'max_pixel_value',
    'max_mask_value',
    'image_tile_path',
    'mask_tile_path',
]
meta_df = pd.DataFrame(data=meta_ls, columns=meta_columns)
# Save the per-tile metadata next to the tiles, without the row index
meta_df.to_csv(f'{split}_metadata.csv', index=False)
meta_df.head()

## Convert to tar

In [None]:
%%time
# c: create, q: quiet, f: file
!tar -cf train_tiles.tar train_tiles --remove-files