In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the read-only input
# directory and print them one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import os
from tifffile import imread
import numpy as np
import cv2
import gc
import random
import matplotlib.pyplot as plt

# Preprocess Data

## Create training set

Create the training set by reading, padding, downsizing and tiling the images and their corresponding masks. To limit the size of the data set, 30 % of the tiles of each image are dropped at random.

In [None]:
# thanks to: https://www.kaggle.com/iafoss/256x256-images

# specify parameters
TILE_SIZE = 512        # edge length of a tile in px (measured after downsizing)
DOWNSIZE = 4           # integer factor by which images and masks are shrunk
DROP_FRACTION = 0.3    # share of tiles per image that is discarded at random
SEED = 42              # seed of the tile-dropping RNG, keeps re-runs reproducible
TRAIN_DIR = '/kaggle/input/hubmap-kidney-segmentation/train/'
X_TRAIN = []
Y_TRAIN = []


def _pad_widths(length, multiple):
    """Return the (before, after) zero padding that makes `length` a multiple of `multiple`.

    A length that already is a multiple gets (0, 0) instead of a full extra
    block of padding, which would only produce additional all-zero tiles.
    """
    missing = (-length) % multiple
    return (missing // 2, missing - missing // 2)


def _split_into_tiles(arr, tile_size):
    """Cut `arr` into non-overlapping square tiles with the 'reshape-transpose trick'.

    `arr` has shape (rows, cols) or (rows, cols, channels); rows and cols must
    be multiples of `tile_size`. The result has shape
    (n_tiles, tile_size, tile_size[, channels]), tiles ordered row by row.
    """
    channels = arr.shape[2:]
    tiles = arr.reshape(arr.shape[0] // tile_size, tile_size,
                        arr.shape[1] // tile_size, tile_size,
                        *channels)
    # bring the two tile-grid axes to the front, keep trailing channel axes in place
    tiles = tiles.transpose(0, 2, 1, 3, *range(4, tiles.ndim))
    return tiles.reshape(-1, tile_size, tile_size, *channels)


def _decode_rle(encoding, height, width):
    """Decode a run-length encoded mask string into a (height, width) uint8 array.

    `encoding` is a whitespace separated sequence of 'start length' pairs that
    index the column-major flattened mask. Pixels inside a run are set to 1.
    """
    flat = np.zeros(height * width, dtype=np.uint8)
    tokens = encoding.split()
    for start, length in zip(tokens[0::2], tokens[1::2]):
        # run starts in the competition's RLE are 1-based -> shift to 0-based
        begin = int(start) - 1
        flat[begin:begin + int(length)] = 1
    # runs are numbered top-to-bottom, then left-to-right (column-major)
    return flat.reshape(width, height).T


print('Starting to create training set ...')
print(f'Tile size is: {TILE_SIZE} px')
print(f'Downsizing images and masks by a factor of {DOWNSIZE}.')

# read mask encodings
MASKS = pd.read_csv('/kaggle/input/hubmap-kidney-segmentation/train.csv').set_index('id')

# dedicated RNG: re-running the cell drops the same tiles and does not
# disturb the global `random` state
rng = random.Random(SEED)

# sorted, because os.listdir returns the entries in arbitrary order
tiff_files = sorted(name for name in os.listdir(TRAIN_DIR)
                    if os.path.splitext(name)[1] == '.tiff')

for step, file in enumerate(tiff_files, start=1):
    # load image
    print(f'--- STEP {step}/{len(tiff_files)} ---')
    print(f'Reading image {file} ...')
    img = imread(TRAIN_DIR + file)

    # check if image is stored 3- or 5-dimensional and transform if necessary
    if len(img.shape) == 5:
        img = np.squeeze(img)
        img = np.transpose(img, (1, 2, 0))

    # define tiling parameters: after downsizing both axes must be divisible
    # by TILE_SIZE, so pad the full-resolution image to a multiple of
    # TILE_SIZE*DOWNSIZE
    img_shape = img.shape
    pad_rows = _pad_widths(img_shape[0], TILE_SIZE * DOWNSIZE)
    pad_cols = _pad_widths(img_shape[1], TILE_SIZE * DOWNSIZE)

    # pad image with zeros, such that image is divisible by tile size
    print('Padding ...')
    img_pad = np.pad(img, (pad_rows, pad_cols, (0, 0)), 'constant', constant_values=0)

    # downsizing to fit into RAM (cv2 expects the target size as (width, height))
    img_pad = cv2.resize(img_pad,
                         (img_pad.shape[1] // DOWNSIZE, img_pad.shape[0] // DOWNSIZE),
                         interpolation=cv2.INTER_AREA)

    # divide image into tiles
    print('Tiling ...')
    img_split = _split_into_tiles(img_pad, TILE_SIZE)

    # free memory
    del img, img_pad
    gc.collect()

    # decode the mask that belongs to this image (row of MASKS indexed by file stem)
    print('Reading according mask ...')
    mask = _decode_rle(MASKS.loc[os.path.splitext(file)[0]].encoding,
                       img_shape[0], img_shape[1])

    # pad mask exactly like the image, so tiles and masks stay aligned
    mask_pad = np.pad(mask, (pad_rows, pad_cols), 'constant', constant_values=0)

    # downsizing to fit into RAM; nearest neighbour keeps the mask binary
    mask_pad = cv2.resize(mask_pad,
                          (mask_pad.shape[1] // DOWNSIZE, mask_pad.shape[0] // DOWNSIZE),
                          interpolation=cv2.INTER_NEAREST)

    # divide mask into tiles
    mask_split = _split_into_tiles(mask_pad, TILE_SIZE)

    del mask, mask_pad
    gc.collect()

    # randomly drop images/masks to reduce training dataset size;
    # the same indices are removed from tiles and masks
    n_tiles = len(img_split)
    rand = rng.sample(range(n_tiles), int(n_tiles * DROP_FRACTION))
    img_tiles = np.delete(img_split, rand, axis=0)
    mask_tiles = np.delete(mask_split, rand, axis=0)
    assert len(img_tiles) == len(mask_tiles), 'Number of image and mask tiles does not match!'

    # append tiles and masks to final training set
    # (extend in place instead of re-building the whole list on every image)
    print('No. of created images/masks:', len(img_tiles))
    X_TRAIN.extend(img_tiles)
    Y_TRAIN.extend(mask_tiles)

    del img_split, img_tiles, mask_split, mask_tiles
    gc.collect()

print(f'Training set created. Total number of sampels is {len(X_TRAIN)}.')

In [None]:
# plot tile and mask (with glomerulus) for sanity check
for tile, tile_mask in zip(X_TRAIN, Y_TRAIN):
    # the glomerulus label lives in the mask, so the mask (not the image
    # tile) has to be tested for non-zero pixels
    if np.any(tile_mask):
        fig = plt.figure()
        plt.imshow(tile)
        # semi-transparent overlay of the mask on top of the tile
        plt.imshow(tile_mask, alpha=0.3)
        plt.title('Training tile with glomerulus mask overlay')
        plt.show()
        break

## Write to txt

Flatten the arrays so that each tile corresponds to one row. After import, reshape each image row to (512, 512, 3) and each mask row to (512, 512).

In [None]:
# Unroll every tile / mask into a 1-D vector, so that one array becomes
# one row of the text file.
img_512 = []
for tile in X_TRAIN:
    img_512.append(tile.flatten())

mask_512 = []
for tile_mask in Y_TRAIN:
    mask_512.append(tile_mask.flatten())

# Write images first, then masks: comma separated, values without decimals.
for out_name, rows in (('img_512.txt', img_512), ('mask_512.txt', mask_512)):
    np.savetxt(out_name, np.asarray(rows), fmt='%1.0f', delimiter=',')