## References

- [HuBMAP: Break down images into 512x512 tiles](https://www.kaggle.com/xhlulu/hubmap-break-down-images-into-512x512-tiles)
- [How To Create TFRecords](https://www.kaggle.com/cdeotte/how-to-create-tfrecords)

### Other notebooks

Make TFRecords of 512x512 (or other size) tiles (this notebook)

[HUBMAP TPU Train Phase](https://www.kaggle.com/itsuki9180/hubmap-tpu-train-phase)

[HUBMAP GPU Inference Phase](https://www.kaggle.com/itsuki9180/hubmap-gpu-inference-phase)

In [None]:
import os
import glob
import cv2
import numpy as np
import pandas as pd
import skimage.io
from tqdm.notebook import tqdm
import tensorflow as tf
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

## Variables

In [None]:
# Root folder of the competition data (CSV files and per-split TIFF folders).
data_dir = '/kaggle/input/hubmap-kidney-segmentation'
split = 'train' # Change this to use test
# Side length, in pixels, of the square tiles cut from every image.
tile_size = 512
ext = 'jpg' # File extension of the saved image tiles; jpg keeps the files small
# Number of folds; one TFRecord file is written per fold.
FOLDS = 8
SEED = 32
# Seed NumPy's global RNG from SEED (rather than a duplicated literal) so the
# random tile sub-sampling and fold shuffling are reproducible and the
# constant cannot drift out of sync with the seed actually used.
np.random.seed(seed=SEED)

## Helper functions

In [None]:
# https://www.kaggle.com/paulorzp/rle-functions-run-lenght-encode-decode
def mask2rle(img):
    """Run-length encode a mask.

    img: numpy array, 1 - mask, 0 - background.
    Returns a space-separated string of alternating 1-based start
    positions and run lengths. The mask is scanned column by column
    (it is transposed before being flattened).
    """
    column_major = img.T.flatten()
    # Zero sentinels on both ends give every run a detectable start and
    # end, even when it touches the first or last pixel.
    padded = np.concatenate([[0], column_major, [0]])
    edges = np.where(padded[1:] != padded[:-1])[0] + 1
    # Even slots hold run starts and odd slots hold run ends; subtracting
    # the start from its matching end turns the end into a length.
    edges[1::2] -= edges[::2]
    return ' '.join(map(str, edges))
 
def rle2mask(mask_rle, shape=(1600,256)):
    '''
    Decode a run-length string into a binary mask.

    mask_rle: run-length as string formatted "start length start length ..."
        with 1-based starts. A missing value (None or NaN, e.g. an empty
        CSV cell as read by pandas) or an empty string decodes to an
        all-background mask instead of raising.
    shape: (width,height) of array to return
    Returns numpy uint8 array of shape (height, width), 1 - mask, 0 - background
    '''
    # pandas represents an empty CSV cell as float NaN, which has no .split().
    missing = mask_rle is None or (
        isinstance(mask_rle, float) and np.isnan(mask_rle)
    )
    tokens = [] if missing else mask_rle.split()

    # Even positions are starts, odd positions are lengths.
    starts = np.asarray(tokens[0::2], dtype=int) - 1  # to 0-based offsets
    lengths = np.asarray(tokens[1::2], dtype=int)
    ends = starts + lengths

    img = np.zeros(shape[0]*shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, ends):
        img[lo:hi] = 1
    # The flat buffer is laid out as (width, height); transposing yields
    # the (height, width) orientation.
    return img.reshape(shape).T

## Load the CSVs

In [None]:
# Training table: one row per image; the tiling loop below reads its `id`
# and `encoding` columns.
train_df = pd.read_csv(f'{data_dir}/train.csv')
# Sample submission: used as the image list when `split` is not 'train'.
sub_df = pd.read_csv(f'{data_dir}/sample_submission.csv')

## Break down all images

In [None]:
# Those folders will store our images
os.makedirs(f'{split}_tiles/images', exist_ok=True)
os.makedirs(f'{split}_tiles/masks', exist_ok=True)

# This list will contain information about all our images
meta_ls = []

# Choose a dataframe based on the split
if split == 'train':
    df = train_df
else:
    df = sub_df

# RLE labels are only available when the dataframe has an `encoding` column.
# Without it, masks are all background and label-based under-sampling is
# skipped, instead of crashing on the missing column.
has_labels = 'encoding' in df.columns

# The break down starts here
for ix in range(df.shape[0]):
    img_id = df.id[ix]
    path = f"{data_dir}/{split}/{img_id}.tiff"
    print(path)
    img = skimage.io.imread(path).squeeze()
    # NOTE(review): some TIFFs appear to load channel-first, i.e. (3, H, W)
    # after squeeze(). Left as-is, such an image has a "height" of 3, so every
    # tile is rejected as partial and the image silently yields no tiles.
    # Move the channel axis last so all images are (H, W, 3).
    if img.ndim == 3 and img.shape[0] == 3 and img.shape[-1] != 3:
        img = img.transpose(1, 2, 0)

    if has_labels:
        # rle2mask takes (width, height) and returns a (height, width) mask.
        mask = rle2mask(df.encoding[ix], shape=img.shape[1::-1])
    else:
        mask = np.zeros(img.shape[:2], dtype=np.uint8)

    # "x" runs along axis 0 (rows) and "y" along axis 1 (columns); the same
    # convention is baked into the tile file names and the metadata columns.
    x_max, y_max = img.shape[:2]

    for x0 in tqdm(range(0, x_max, tile_size)):
        x1 = min(x_max, x0 + tile_size)
        for y0 in range(0, y_max, tile_size):
            y1 = min(y_max, y0 + tile_size)

            # Skip partial tiles along the far edges of the image
            if x1-x0!=tile_size or y1-y0!=tile_size:
                continue

            # A contiguous copy keeps OpenCV happy whatever the memory layout
            # of the (possibly transposed) source image is.
            img_tile = np.ascontiguousarray(img[x0:x1, y0:y1])
            mask_tile = mask[x0:x1, y0:y1]

            # Under-sampling: drop about half of the tiles whose mask is
            # entirely background (only meaningful when labels exist)
            if has_labels and not mask_tile.any() and np.random.rand()>0.5:
                continue

            img_tile_path = f"{split}_tiles/images/{img_id}_{x0}-{x1}x_{y0}-{y1}y.{ext}"
            mask_tile_path = f"{split}_tiles/masks/{img_id}_{x0}-{x1}x_{y0}-{y1}y.png"

            # The image was read as RGB; OpenCV writes BGR
            cv2.imwrite(img_tile_path, cv2.cvtColor(img_tile, cv2.COLOR_RGB2BGR))
            cv2.imwrite(mask_tile_path, mask_tile*255)

            # Remove tiles that contain too much black or white area: a small
            # encoded file is used as the indicator, and 99% of such tiles
            # are deleted again
            if os.path.getsize(img_tile_path)<50000:
                if np.random.rand()<0.99:
                    os.remove(img_tile_path)
                    os.remove(mask_tile_path)
                    continue

            meta_ls.append([
                img_id, x0, x1, y0, y1, img_tile.min(), img_tile.max(),
                mask_tile.max(), img_tile_path, mask_tile_path
            ])

In [None]:
# Column names for the per-tile records collected in meta_ls, in the same
# order in which each record's fields were appended.
meta_columns = [
    'image_id', 'x0', 'x1', 'y0', 'y1',
    'min_pixel_value', 'max_pixel_value', 'max_mask_value',
    'image_tile_path', 'mask_tile_path',
]
meta_df = pd.DataFrame(meta_ls, columns=meta_columns)
# Persist the tile metadata next to the tiles, without the index column.
meta_df.to_csv(f'{split}_metadata.csv', index=False)
meta_df.head()

In [None]:
# Create TFRecords

In [None]:
# Folders holding the image and mask tiles. They are derived from `split` so
# they always match the `{split}_tiles/...` directories the tiles were
# written to (identical to the previous hard-coded paths for split='train').
folder_img = f"./{split}_tiles/images"
folder_mask = f"./{split}_tiles/masks"

In [None]:
# Collect every tile path together with an integer id for its source image
# and the fold that image is assigned to.
img_set = []
mask_set = []
ids = []
folds_id = []

# One entry per source image: x is the image's integer id, y its fold.
# Folds are dealt out round-robin and then shuffled, so all tiles of an image
# share one fold and no image is dropped when there are more than FOLDS
# images. With exactly FOLDS images this is the same shuffled 0..FOLDS-1
# assignment as before.
x = np.arange(len(df))
y = x % FOLDS
np.random.shuffle(y)
print(x,y)
for i, j, ix in zip(x, y, df["id"]):
    img_id = ix
    # Tile files are named "<img_id>_<x0>-<x1>x_<y0>-<y1>y.<ext>"; anchoring
    # the pattern on the "_" separator avoids matching another id that merely
    # starts with the same characters. Sorting pairs each image tile with its
    # mask, since both share the same file stem.
    imgs = sorted(glob.glob(folder_img + f"/{img_id}_*"))
    masks = sorted(glob.glob(folder_mask + f"/{img_id}_*"))

    # The previous `assert "invalid"` could never fail (a non-empty string is
    # truthy); fail loudly instead of silently mis-pairing tiles and masks.
    if len(imgs) != len(masks):
        raise ValueError(
            f"invalid: {len(imgs)} image tiles but {len(masks)} mask tiles "
            f"for image {img_id}"
        )

    img_set.extend(imgs)
    mask_set.extend(masks)
    ids.extend([i] * len(imgs))
    folds_id.extend([j] * len(imgs))

# Build the arrays from flat lists with explicit integer dtypes so an image
# without any tiles cannot turn the id/fold columns into floats.
img_set = np.array(img_set)
mask_set = np.array(mask_set)
ids = np.array(ids, dtype=int)
folds_id = np.array(folds_id, dtype=int)

In [None]:
# One row per tile: image tile path, mask tile path, integer id of the source
# image and the fold assigned to that image. Note that this rebinds `df`.
tile_columns = {
    'image_name': img_set,
    'mask_name': mask_set,
    "patient_id": ids,
    "fold": folds_id,
}
df = pd.DataFrame(tile_columns)

In [None]:
# Preview the first rows of the per-tile table.
df.head()

In [None]:
# `folds` is an independent copy of the per-tile table; the TFRecord writer
# reads the tile paths and the 'fold' column from it.
folds = df.copy()
# Disabled alternative: overwrite 'fold' with a shuffled 8-way KFold split
# over the tile rows instead of keeping the per-image fold already stored.
#Fold = KFold(n_splits=8, shuffle=True, random_state=42)
#for n, (train_index, val_index) in enumerate(Fold.split(folds, folds['mask_name'])):
#    folds.loc[val_index, 'fold'] = int(n)
#folds['fold'] = folds['fold'].astype(int)
#print(folds.groupby(['fold', 'mask_name']).size())

In [None]:
def _bytes_feature(value):
    """Wrap a string / byte value in a bytes_list tf.train.Feature."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList won't unpack a string from an EagerTensor, so pull out the
    # underlying value first.
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a float / double value in a float_list tf.train.Feature."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a bool / enum / int / uint value in an int64_list tf.train.Feature."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

In [None]:
def serialize_example(feature0, feature1):
    """Serialize one (image, mask) pair as a tf.train.Example.

    feature0: encoded image bytes, stored under the 'image' key.
    feature1: encoded mask bytes, stored under the 'mask' key.
    Returns the serialized Example as a byte string.
    """
    features = tf.train.Features(feature={
        'image': _bytes_feature(feature0),
        'mask': _bytes_feature(feature1),
    })
    return tf.train.Example(features=features).SerializeToString()

In [None]:
# Write one TFRecord file per fold, named 'train<fold>-<tile count>.tfrec'.
# Every record holds a JPEG-encoded tile ('image') and its PNG-encoded mask
# ('mask').
for f in range(FOLDS):
    fold_rows = folds[folds['fold'] == f]
    ct = len(fold_rows)
    idx = fold_rows.index
    print(idx)
    print(ct)
    # The old message printed "<fold> of <tile count>", which read like a
    # progress counter; report the fold, the fold total and the tile count.
    print('Writing TFRecord for fold %i of %i (%i tiles)...'%(f,FOLDS,ct))
    with tf.io.TFRecordWriter('train%.2i-%i.tfrec'%(f,ct)) as writer:
        for k in range(ct):
            path = folds['image_name'][idx[k]]
            path2 = folds['mask_name'][idx[k]]
            #print(path,path2)

            # Tile exactly as OpenCV loads it; also reused for the preview.
            tile = cv2.resize(cv2.imread(path), (512,512))
            # NOTE(review): cv2.imread returns BGR and cv2.imencode expects
            # BGR, so this swap appears to store the JPEG with red and blue
            # exchanged. It is kept unchanged so existing consumers of these
            # TFRecords see the same data -- confirm against the training
            # and inference notebooks before altering it.
            img = cv2.cvtColor(tile, cv2.COLOR_RGB2BGR) # Fix incorrect colors
            # ndarray.tostring() is a deprecated alias of tobytes() and is
            # gone from recent NumPy releases; the bytes are identical.
            img = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 98))[1].tobytes()

            mask_tile = cv2.imread(path2, cv2.IMREAD_GRAYSCALE)
            # Nearest-neighbour keeps the mask strictly two-valued if the
            # tiles are not already 512x512 (a no-op when they are).
            mask_tile = cv2.resize(mask_tile, (512,512), interpolation=cv2.INTER_NEAREST)
            mask = cv2.imencode('.png', mask_tile)[1].tobytes()

            example = serialize_example(
                img,
                mask,
                )
            writer.write(example)
            if k%100==0: print(k,', ',end='')
            # Show the first few tiles and masks of every fold as a sanity
            # check, reusing the arrays already in memory.
            if k<5:
                plt.imshow(tile)
                plt.show()
                plt.imshow(mask_tile)
                plt.show()

## Convert to tar

In [None]:
#%%time
# c: create, q: quiet, f: file
#!tar -cf train_tiles.tar train_tiles --remove-files
#!zip -r train_tiles.zip train_tiles 

In [None]:
import shutil

# Delete the image-tile and mask-tile folders, including everything in them.
for tile_folder in (folder_img, folder_mask):
    shutil.rmtree(tile_folder)