In [None]:
import os
from tqdm.notebook import tqdm

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import cv2
import skimage.io

### Variables

In [None]:
# Root folder of the competition data; the CSVs and the per-split image folders are read from here
data_dir = "/kaggle/input/hubmap-kidney-segmentation"

# Which split to break into tiles -- change this to use test
split = "train"

# Edge length, in pixels, of the square tiles (tiles on the image border may be smaller)
tile_size = 256

# File extension of the saved image tiles -- change to jpg for smaller files
ext = "png"

### Helper functions

In [None]:
# https://www.kaggle.com/paulorzp/rle-functions-run-lenght-encode-decode
def mask2rle(img):
    '''
    Run-length encode a binary mask.

    img: numpy array, 1 - mask, 0 - background
    Returns the encoding as a space-separated string of "start length"
    pairs; starts are 1-based positions in the transposed, flattened mask.
    '''
    flat = img.T.flatten()
    # Zero sentinels on both sides so runs touching either end still produce
    # both an opening and a closing transition.
    padded = np.concatenate([[0], flat, [0]])
    transitions = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Odd slots hold run ends; subtracting the matching start yields lengths.
    transitions[1::2] -= transitions[::2]
    return ' '.join(map(str, transitions))
 
def rle2mask(mask_rle, shape=(1600,256)):
    '''
    Decode a run-length string into a binary mask.

    mask_rle: run-length as a string of "start length" pairs (1-based starts)
    shape: (width,height) of array to return
    Returns numpy uint8 array, 1 - mask, 0 - background; the flat buffer is
    reshaped to `shape` and transposed before being returned.
    '''
    tokens = mask_rle.split()
    # Even positions are starts (shifted to 0-based), odd positions are lengths.
    begins = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    stops = begins + run_lengths

    flat = np.zeros(shape[0]*shape[1], dtype=np.uint8)
    for begin, stop in zip(begins.tolist(), stops.tolist()):
        flat[begin:stop] = 1
    return flat.reshape(shape).T

### Load the CSVs

In [None]:
# Both CSVs live directly under the competition data folder
train_csv_path = f'{data_dir}/train.csv'
submission_csv_path = f'{data_dir}/sample_submission.csv'

# `enc_dec_enc` starts empty; it is filled later with the RLE recomputed from the saved tiles
train_df = pd.read_csv(train_csv_path).assign(enc_dec_enc='')
sub_df = pd.read_csv(submission_csv_path)

### Creating a dataset - https://www.kaggle.com/xhlulu/hubmap-break-down-images-into-512x512-tiles

In [None]:
# Those folders will store our images
os.makedirs(f'{split}_tiles/images', exist_ok=True)
os.makedirs(f'{split}_tiles/masks', exist_ok=True)

# This list will contain information about all our images
meta_ls = []

# Choose a dataframe based on the split
if split == 'train':
    df = train_df
else:
    df = sub_df

# Only a frame with an `encoding` column carries RLE masks. When it is absent
# (presumably the case for sample_submission.csv -- confirm), blank masks are
# written so the tile layout and metadata columns stay the same for both splits.
has_masks = 'encoding' in df.columns

# The break down starts here
for ix, img_id in enumerate(df['id']):
    path = f"{data_dir}/{split}/{img_id}.tiff"
    img = skimage.io.imread(path).squeeze()

    # squeeze() only drops singleton axes, so a channels-first (3, H, W) array
    # would be tiled along the channel axis. Move channels last in that case.
    # NOTE(review): assumes 3-channel images -- confirm against the data.
    if img.ndim == 3 and img.shape[0] == 3 and img.shape[-1] != 3:
        img = np.moveaxis(img, 0, -1)

    # x runs along the first array axis (rows), y along the second (columns)
    x_max, y_max = img.shape[:2]

    if has_masks:
        # rle2mask takes (second axis, first axis) and returns an array laid out like img
        mask = rle2mask(df['encoding'].iloc[ix], shape=(y_max, x_max))
    else:
        mask = np.zeros((x_max, y_max), dtype=np.uint8)

    for x0 in tqdm(range(0, x_max, tile_size), desc=str(img_id)):
        x1 = min(x_max, x0 + tile_size)
        for y0 in range(0, y_max, tile_size):
            y1 = min(y_max, y0 + tile_size)

            # Contiguous copies keep OpenCV happy regardless of how img is strided
            img_tile = np.ascontiguousarray(img[x0:x1, y0:y1])
            mask_tile = np.ascontiguousarray(mask[x0:x1, y0:y1])

            img_tile_path = f"{split}_tiles/images/{img_id}_{x0}-{x1}x_{y0}-{y1}y.{ext}"
            mask_tile_path = f"{split}_tiles/masks/{img_id}_{x0}-{x1}x_{y0}-{y1}y.png"

            # cv2.imwrite reports failure through its return value instead of raising
            if not cv2.imwrite(img_tile_path, cv2.cvtColor(img_tile, cv2.COLOR_RGB2BGR)):
                raise IOError(f"Failed to write image tile: {img_tile_path}")
            if not cv2.imwrite(mask_tile_path, mask_tile):
                raise IOError(f"Failed to write mask tile: {mask_tile_path}")

            meta_ls.append([
                img_id, x0, x1, y0, y1, img_tile.min(), img_tile.max(),
                mask_tile.max(), img_tile_path, mask_tile_path, x_max, y_max
            ])

In [None]:
# One entry per value appended to each `meta_ls` row, in the same order.
# Note: 'w' receives x_max (img.shape[0]) and 'h' receives y_max (img.shape[1]).
meta_columns = [
    'image_id',
    'x0', 'x1', 'y0', 'y1',
    'min_pixel_value', 'max_pixel_value', 'max_mask_value',
    'image_tile_path', 'mask_tile_path',
    'w', 'h',
]

meta_df = pd.DataFrame(meta_ls, columns=meta_columns)
meta_df.to_csv(f'{split}_metadata.csv', index=False)
meta_df.head()

### Reconstruction of the full image and coordinate-wise RLE encoding

In [None]:
# adapted from https://stackoverflow.com/questions/53327999/creating-a-run-length-for-a-number-matrix-in-python
def run_length_encoding(matrix):
    """
    Run-length encode the cells equal to 1 in a 2-D mask.

    The matrix is read row by row (row-major). Every maximal run of 1s
    contributes a "start length" pair, where start is the 1-based position of
    the run's first cell in that flattened order.

    matrix: 2-D array-like, 1 - mask, anything else - background
    Returns the pairs joined by single spaces; '' when there is no cell equal
    to 1 (including an empty matrix).

    Unlike a per-pixel state machine that only emits on value changes, this
    also handles a run that starts at the very first cell (start 1 is emitted)
    and a run that reaches the very last cell (its length is emitted).
    """
    flat = np.asarray(matrix).ravel()
    if flat.size == 0:
        return ''

    # Boolean view of the foreground keeps the temporaries one byte per cell.
    foreground = flat == 1

    # 0-based index of the first cell of every run after the first one.
    boundaries = np.flatnonzero(foreground[1:] != foreground[:-1]) + 1

    # Close the open ends so boundaries alternate strictly: run start, run stop.
    if foreground[0]:
        boundaries = np.concatenate([[0], boundaries])
    if foreground[-1]:
        boundaries = np.concatenate([boundaries, [flat.size]])

    starts = boundaries[0::2] + 1  # RLE positions are 1-based
    lengths = boundaries[1::2] - boundaries[0::2]

    encoding = np.column_stack([starts, lengths]).ravel()
    return ' '.join(str(x) for x in encoding)

In [None]:
%%time
for id_ in meta_df['image_id'].value_counts().index.tolist():
    w = meta_df.loc[meta_df['image_id'] ==  id_, 'w'].values[0]
    h = meta_df.loc[meta_df['image_id'] == id_, 'h'].values[0]
    full_mask = np.zeros((w, h), dtype=np.int8)
    
    # full image reconstruction
    for _, row in meta_df.loc[meta_df['image_id'] == id_].iterrows():
        tile_mask = cv2.imread(row['mask_tile_path'],0)
        cmin, cmax, rmin, rmax = row['x0'], row['x1'], row['y0'], row['y1']
        full_mask[cmin:cmax, rmin:rmax] = tile_mask[:cmax, :rmax].astype(np.int8)
        
    # run-length encoding
    rle = run_length_encoding(full_mask.T)
    df.loc[df['id'] == id_, 'enc_dec_enc'] = rle

In [None]:
# Display the frame, now including the `enc_dec_enc` column filled in by the reconstruction loop
df

In [None]:
# Side-by-side check of the RLE round trip for the last image processed above
# (`id_` and `full_mask` are the values left over from the reconstruction loop).
fig, ax = plt.subplots(1,2, figsize=(30, 15))

# Left: mask decoded from the RLE recomputed out of the saved tiles.
# Plot the decoded mask itself (not `full_mask`), so an encoding error is visible.
mask = rle2mask(df.loc[df['id'] == id_, 'enc_dec_enc'].values[0], full_mask.shape[::-1])
ax[0].imshow(mask)
ax[0].set_title("Enocoded-Decoded-Encoded", fontsize=30);

# Right: mask decoded from the original RLE shipped with the dataset
mask = rle2mask(df.loc[df['id'] == id_, 'encoding'].values[0], full_mask.shape[::-1])
ax[1].imshow(mask)
ax[1].set_title("Enocoded", fontsize=30);

plt.tight_layout()
plt.show()

### Reconstruction of the full image directly into a flat array (not implemented)

```
%%time

for id_ in meta_df['image_id'].value_counts().index.tolist():
    w = meta_df.loc[meta_df['image_id'] == id_, 'w'].values[0]
    h = meta_df.loc[meta_df['image_id'] == id_, 'h'].values[0]
    
    full_mask = np.zeros((w, h), dtype=np.int8).T.flatten()

    start = 0

    # full image reconstruction
    for _, row in meta_df.loc[meta_df['image_id'] == id_].iterrows():

        tile_mask = cv2.imread(row['mask_tile_path'], 0).astype("int8")
        cmin, cmax, rmin, rmax = row['x0'], row['x1'], row['y0'], row['y1']
        
        #y * width + x
        tile_mask = tile_mask[:cmax-cmin, :rmax-rmin].T.flatten()
        
        # Convert 2d index to 1d 
        index = cmin * w + rmin 

        full_mask[index:index+len(tile_mask)] = tile_mask
    
    # rle from here - https://www.kaggle.com/bguberfain/memory-aware-rle-encoding
    full_mask[0] = 0
    full_mask[-1] = 0
    full_mask = np.where(full_mask[1:] != full_mask[:-1])[0] + 2

    full_mask[1::2] -= full_mask[::2]

    df.loc[df['id'] == id_, 'enc_dec_enc'] = ' '.join(str(x) for x in full_mask)
```