Hello fellow Kagglers,

This notebook demonstrates the preprocessing steps that generate the training and validation data used for training in [this](https://www.kaggle.com/markwijkhuizen/sartorius-training-upsampling-tf-public) notebook.

The training data is split into 4 folds and saved as compressed numpy files.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image, ImageEnhance
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold

import glob
import sys
import cv2
import imageio
import joblib
import math
import warnings
import os

tqdm.pandas()

In [None]:
# Every training image shares the same dimensions (height x width)
HEIGHT, WIDTH = 520, 704

# One row per annotation; loaded from the competition's training CSV
train = pd.read_csv('/kaggle/input/sartorius-cell-instance-segmentation/train.csv')

In [None]:
# Add image file path
def get_file_path(image_id):
    """Return the path of the training PNG that belongs to ``image_id``."""
    file_path = f'/kaggle/input/sartorius-cell-instance-segmentation/train/{image_id}.png'
    return file_path

# Element-wise lookup of the PNG path for every annotation row
train['file_path'] = train['id'].map(get_file_path)

In [None]:
# Add image shape as one (height, width) tuple per row
train['shape'] = train.loc[:, ['height', 'width']].apply(tuple, axis='columns')

In [None]:
# Preview the first five annotation rows
display(train.head(n=5))

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() would only add a stray "None" to the cell output
train.info()

# Analysis

In [None]:
# Number of annotation rows per cell type
cell_type_counts = train['cell_type'].value_counts()

plt.figure(figsize=(8, 8))
cell_type_counts.plot.pie(autopct='%1.1f%%', title='Cell Type Distribution')
plt.show()

# RLE Decode

In [None]:
# source: https://www.kaggle.com/stainsby/fast-tested-rle
def rle_encode(img):
    """Run-length encode a binary mask.

    img: numpy array, 1 - mask, 0 - background
    Returns a space separated string of alternating run starts (1-indexed
    positions in the row-major flattened array) and run lengths.
    """
    # Pad with a zero on both sides so runs touching either end are delimited too
    padded = np.concatenate([[0], img.flatten(), [0]])
    # 1-indexed positions where the value changes: run starts and run ends alternate
    boundaries = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    # Turn every run end into the length of its run
    boundaries[1::2] = boundaries[1::2] - boundaries[::2]
    return ' '.join(map(str, boundaries))

In [None]:
# inspiration: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
# Modified to retrieve the full mask for a given image id
def rle_decode(image_id, df=None):
    """Decode every annotation of one image into a single binary mask.

    Args:
        image_id: Value of the ``id`` column identifying the image.
        df: DataFrame with ``id``, ``annotation`` and ``shape`` columns.
            Defaults to the notebook-level ``train`` frame.

    Returns:
        ``np.uint8`` array with the image's own ``shape``; 1 where any
        annotation covers the pixel, 0 elsewhere.

    Raises:
        KeyError: if ``df`` has no rows for ``image_id``.
    """
    if df is None:
        df = train
    rows = df.loc[df['id'] == image_id]
    if rows.empty:
        raise KeyError(f'No annotations found for image id {image_id!r}')

    # Use the shape stored with this image's rows, not the shape of the
    # frame's first row, so images of different sizes decode correctly
    height, width = rows['shape'].iloc[0]
    # Flattened mask; annotations index into the flattened image
    mask = np.zeros(shape=height * width, dtype=np.uint8)

    # Add all image masks
    for annotation in rows['annotation']:
        # Tokens alternate between 1-indexed run start and run length
        tokens = np.asarray(annotation.split(), dtype=int)
        starts = tokens[0::2] - 1
        lengths = tokens[1::2]
        for start, length in zip(starts, lengths):
            mask[start:start + length] = 1

    return mask.reshape((height, width))

# Image Examples

In [None]:
# Shows a batch of images
def show_image_and_masks(rows=4, cols=4):
    """Plot training images next to their decoded masks, one image per row.

    Every row holds four panels: the contrast-enhanced image, the image
    rescaled with ``abs(pixel - 127) * 2`` (capped at 255), the decoded mask,
    and the contrast-enhanced image with the mask overlaid in red.

    Args:
        rows: Number of images to show, taken in order of first appearance in
            ``train``; capped at the number of unique image ids.
        cols: Number of subplot columns. At least four are always created
            because every row draws four panels; extra columns stay blank.
    """
    # Number of panels drawn for every image
    n_panels = 4
    # Unique Image Ids
    image_ids = train['id'].unique()
    # Never request more rows than there are images to show
    rows = min(rows, len(image_ids))
    if rows < 1:
        return
    # Fewer than four columns would make the fixed column indices below fail
    cols = max(cols, n_panels)
    # Figure; squeeze=False keeps `axes` two-dimensional even for a single row
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(cols*8, rows*6), squeeze=False)

    for r in range(rows):
        image_id = image_ids[r]
        file_path = train.loc[train['id'] == image_id, 'file_path'].iloc[0]
        # Read the image once and reuse it for both image panels
        raw_image = imageio.imread(file_path)
        # Enhance Contrast
        # inspiration from: https://www.kaggle.com/dschettler8845/sartorius-segmentation-eda-efficientdet-tf
        image = np.array(ImageEnhance.Contrast(Image.fromarray(raw_image)).enhance(16))
        # Plot Image
        axes[r, 0].imshow(image)
        axes[r, 0].set_title(f'Image {image_id}', size=18)
        axes[r, 0].axis(False)

        # Image rescaled by its distance from mid-gray, capped at 255
        img_norm = np.minimum(np.abs(raw_image.astype(np.float32) - 127) * 2, 255)
        axes[r, 1].imshow(img_norm)
        axes[r, 1].set_title(f'Image {image_id} 0-255', size=18)
        axes[r, 1].axis(False)

        # Mask
        mask = rle_decode(image_id)
        axes[r, 2].imshow(mask)
        axes[r, 2].set_title('Mask', size=18)
        axes[r, 2].axis(False)

        # Image with Mask (mask drawn in the red channel, half transparent)
        axes[r, 3].imshow(cv2.cvtColor(image, cv2.COLOR_GRAY2RGB))
        axes[r, 3].imshow((np.expand_dims(mask, axis=2) * np.array([255, 0, 0])), alpha=0.50)
        axes[r, 3].set_title('Image and Mask', size=18)
        axes[r, 3].axis(False)

        # Blank out any columns beyond the four panels
        for extra_ax in axes[r, n_panels:]:
            extra_ax.axis(False)

    # Adjust Horizontal Space Between Subplot Columns
    fig.subplots_adjust(wspace=0.10)

In [None]:
# Eight example images, four panels each
show_image_and_masks(rows=8, cols=4)

# Train Test KFolds

In [None]:
def create_fold(idxs, fold, subset):
    """Build and save the image and mask arrays for one subset of one fold.

    Writes two compressed files to the working directory, each storing a
    uint8 array of shape ``[len(idxs), HEIGHT, WIDTH]`` under the key ``v``:
    ``X_fold_{fold}_{subset}.npz`` (contrast-enhanced images) and
    ``y_fold_{fold}_{subset}.npz`` (decoded masks).

    Args:
        idxs: Positions into ``id_unique`` selecting the images of this subset.
        fold: Fold number used in the file names.
        subset: Subset label used in the file names (e.g. 'train' or 'val').
    """
    array_shape = [len(idxs), HEIGHT, WIDTH]

    # Images
    X = np.empty(shape=array_shape, dtype=np.uint8)
    for position, unique_idx in enumerate(tqdm(idxs)):
        file_path = image_id2file_path[id_unique[unique_idx]]
        pil_image = Image.fromarray(imageio.imread(file_path))
        # Same contrast enhancement factor as used for the example plots
        X[position] = np.array(ImageEnhance.Contrast(pil_image).enhance(16))
    # Save X as compressed Numpy Array
    np.savez_compressed(f'X_fold_{fold}_{subset}.npz', v=X)

    # Labels
    y = np.empty(shape=array_shape, dtype=np.uint8)
    for position, unique_idx in enumerate(tqdm(idxs)):
        y[position] = rle_decode(id_unique[unique_idx])
    # Save y as compressed Numpy Array
    np.savez_compressed(f'y_fold_{fold}_{subset}.npz', v=y)

In [None]:
# Unique image ids, in order of first appearance in the annotation table
id_unique = train.loc[:, 'id'].unique()
print(f'There are {len(id_unique)} unique image ids')

In [None]:
# Maps a given image id to the corresponding image file path.
# Keeps the first annotation row of every image; indexing the 'file_path'
# column directly always yields a Series, so this also works when the frame
# holds a single image (where .squeeze() would collapse to a scalar).
image_id2file_path = (
    train
    .drop_duplicates(subset='id')
    .set_index('id')['file_path']
    .to_dict()
)

In [None]:
# Make 4 folds; the fixed random_state keeps the shuffled split reproducible
N_FOLDS = 4
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
# Write the compressed .npz files for the train and val subset of every fold
for fold, (train_idxs, val_idxs) in enumerate(kf.split(id_unique)):
    train_len = len(train_idxs)
    val_len = len(val_idxs)
    print(f'FOLD {fold} | train samples: {train_len}, val samples: {val_len}')

    create_fold(train_idxs, fold, 'train')
    create_fold(val_idxs, fold, 'val')

# Check Numpy Files

In [None]:
# Shows a batch of images
def show_batch(images, rows=4, cols=2):
    """Display the first ``rows * cols`` entries of ``images`` in a grid.

    Images are laid out row by row: the cell in row ``r`` and column ``c``
    shows ``images[r * cols + c]``. Cells without a matching image stay blank.

    Args:
        images: Indexable sequence of images accepted by ``imshow``.
        rows: Number of grid rows.
        cols: Number of grid columns.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single row or column
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(cols*8, rows*6), squeeze=False)
    n_images = len(images)
    for r in range(rows):
        for c in range(cols):
            # Row-major position: each row holds `cols` images, so the stride is cols
            image_idx = r * cols + c
            if image_idx < n_images:
                axes[r, c].imshow(images[image_idx])
            axes[r, c].axis(False)

In [None]:
# np.load returns an NpzFile that keeps the archive open; read the array
# inside a context manager so the file handle is closed afterwards
with np.load('X_fold_0_train.npz') as npz_file:
    X_fold_0 = npz_file['v']
show_batch(X_fold_0)

In [None]:
# np.load returns an NpzFile that keeps the archive open; read the array
# inside a context manager so the file handle is closed afterwards
with np.load('y_fold_0_train.npz') as npz_file:
    y_fold_0 = npz_file['v']
show_batch(y_fold_0)