# Acknowledgment

Copied from https://www.kaggle.com/wrrosa/hubmap-tf-with-tpu-efficientunet-512x512-subm by Wojtek Rosa.

The main differences from Wojtek's work:
1. Omit case where pre-tiled Test dataset is used; instead, read data directly from .tiff images
2. Instead of constructing entire mask image before run length encoding (RLE), do prediction and encoding one "tile column" at a time
3. Add reflected border to image before prediction, trim border off results before encoding

In [None]:
# NOTE(review): LAVIN is not referenced by any other cell visible in this
# part of the notebook -- confirm whether this flag is still needed.
LAVIN = True

# Imports

In [None]:
# ! pip install ../input/kerasapplications/keras-team-keras-applications-3b180cb -f ./ --no-index -q
# ! pip install ../input/efficientnet/efficientnet-1.1.0/ -f ./ --no-index -q
import numpy as np
import pandas as pd
import os
import glob
import gc
import sys
import re

import matplotlib.pyplot as plt
%matplotlib inline

# import tifffile
import rasterio
from rasterio.windows import Window

import pathlib
# from tqdm.notebook import tqdm
import cv2

# import tensorflow as tf
# import efficientnet as efn
# import efficientnet.tfkeras

import yaml
import pprint
import json

# Run parameters

In [None]:
# Directory prefix of the trained-model artifacts; 'params.yaml' and
# 'metrics.json' are appended to it by plain string concatenation, so the
# trailing slash is required.
trained_model_dirname = "/kaggle/input/hubmap-models/"
# Directory prefix of the test .tiff images; "*.tiff" is appended to it by
# plain string concatenation, so the trailing slash is required.
tiff_image_dirname = "/kaggle/input/hubmap-kidney-segmentation/test/"
# Path of the submission CSV (presumably the file this notebook writes --
# the writing code is not in this section).  NOTE: both tiff_image_dirname
# and CSV_filename are reassigned further down, before the analysis runs.
CSV_filename = "/kaggle/working/submission.csv"

# Read and Print Parameters
Read the parameters and metrics saved by the Training notebook:

In [None]:
# Load the parameter mapping stored next to the trained model and show it.
params_path = trained_model_dirname + 'params.yaml'
with open(params_path) as params_file:
    P = yaml.load(params_file, Loader=yaml.FullLoader)
pprint.pprint(P)

# Set the binarization threshold used at inference time, replacing any
# value that came from params.yaml.
P['THRESHOLD'] = 0.4  # preds > THRESHOLD

# Load the metrics stored next to the trained model and report the run
# timestamp and the out-of-fold Dice score.
metrics_path = trained_model_dirname + 'metrics.json'
with open(metrics_path) as metrics_file:
    M = json.load(metrics_file)

print('Model run datetime: ' + M['datetime'])
print('OOF val_dice_coe: ' + str(M['oof_dice_coe']))


# Run length encoding (RLE) Functions
Based on https://www.kaggle.com/friedchips/fully-correct-hubmap-rle-encoding-and-decoding:


In [None]:
def encode_RLE( mask, column_pixel_offset = 0 ):
    '''
    Run-length encode one predicted binary "tile column" of an image.

    mask: 2D array-like (rows x cols) of the predicted mask.  Any dtype is
        accepted; every non-zero element counts as foreground.
    column_pixel_offset: number of pixels, in column-major order, that
        precede this tile column in the full image.  It is added to every
        run start so the result can be concatenated with the RLE strings
        of the other tile columns.

    Returns a space-separated string "start length start length ...", with
    0-based starts in column-major order; the empty string if the mask
    contains no foreground pixel.
    '''
    # Coerce to bool first: "~" and "&" below are bitwise operators, so on
    # an integer mask holding values other than 0/1 they would report
    # spurious run boundaries, and on a float mask they would raise.
    flat = np.asarray( mask ).astype( bool ).T.reshape(-1) # make 1D, column-first
    flat = np.pad( flat, 1 ) # make sure that the 1d mask starts and ends with a 0
    starts = np.nonzero( ~flat[:-1] & flat[1:] )[0] # 0 -> 1 transitions
    ends = np.nonzero( flat[:-1] & ~flat[1:] )[0] # 1 -> 0 transitions
    # Explicit 64-bit integers: offsets into a whole-slide image can exceed
    # the 32-bit range, and the platform default "int" may be 32 bits.
    rle = np.empty( 2 * starts.size, dtype=np.int64 ) # interlacing...
    rle[0::2] = starts + column_pixel_offset # ...starts...
    rle[1::2] = ends - starts # ...and lengths
    return ' '.join( str( elem ) for elem in rle ) # space-separated string

def rle2mask(rle, mask_shape):
    ''' Decode a space-delimited, column-first RLE string
    ("start length start length ...", 0-based starts) into a 2d boolean
    numpy array of shape mask_shape. '''

    numbers = np.asarray(rle.split()).astype(int) # rle tokens as ints
    flat_mask = np.zeros(np.prod(mask_shape), dtype=bool) # 1d, column-first
    # Even positions hold the run starts, odd positions the run lengths.
    for begin, run_length in zip(numbers[0::2], numbers[1::2]):
        end = begin + run_length
        flat_mask[begin:end] = True
    # The flat mask is column-first, so reshape to (cols, rows) and transpose.
    transposed_shape = np.flip(mask_shape)
    return flat_mask.reshape(transposed_shape).T


# Visualizing results

In [None]:
# From https://www.kaggle.com/friedchips/fully-correct-hubmap-rle-encoding-and-decoding

def visualize_mask_and_image( tiff_image_dirname, CSV_filename, filename_filter = "26dc" ):
    '''
    Plot the mask decoded from the predicted RLE of each selected .tiff image.

    tiff_image_dirname: directory prefix of the .tiff images; "*.tiff" is
        appended by plain concatenation, so include the trailing separator.
    CSV_filename: CSV file with an "id" and a "predicted" (RLE) column.
    filename_filter: only image paths containing this substring are
        processed; pass None or "" to process every image.  The default is
        the value that used to be hard-coded as a debugging filter.

    Progress and diagnostic messages go to stderr; nothing is returned.
    '''
    image_RLEs = pd.read_csv( CSV_filename )

    for tiff_image_filename in glob.glob( tiff_image_dirname + "*.tiff" ):

        if filename_filter and filename_filter not in tiff_image_filename:
            continue

        image_id = pathlib.Path( tiff_image_filename ).stem

        RLE = image_RLEs.predicted[ image_RLEs.id == image_id ]

        # A blank "predicted" cell is read by pandas as NaN (a float), which
        # has no len(); treat it the same as a missing row.
        if len( RLE ) == 0 or pd.isna( RLE.values[ 0 ] ):
            print( "For image_id", image_id, "no prediction", file = sys.stderr )
            continue

        RLE = RLE.values[ 0 ]   # Extract from 1-long Pandas Sequence
        print( "image_id", image_id, "len(RLE)", len( RLE ), file = sys.stderr )

        # Only the image dimensions are needed, so the file is closed as
        # soon as they have been read.
        with rasterio.open( tiff_image_filename, transform = None ) as dataset:
            image_pixel_rows, image_pixel_cols = dataset.shape
            print( "tiff image dataset.shape", dataset.shape, file = sys.stderr )

        mask = rle2mask( RLE, ( image_pixel_rows, image_pixel_cols ) )
        plt.title( image_id )
        plt.imshow( mask )
        # Drop the references to the large objects before rendering and
        # collect garbage afterwards, to keep peak memory down.
        del mask
        del RLE
        plt.show()
        gc.collect()
        # NOTE: plotting the RGB image itself (dataset.read( [1, 2, 3] ) moved
        # to channels-last) used to follow here, but was disabled because it
        # ran out of memory.

# Paths used by the visualization / analysis cells.  These assignments
# replace the values given to the same names in the "Run parameters" cell.
tiff_image_dirname = "/kaggle/input/hubmap-kidney-segmentation/test/"
# Here the CSV is read from an input dataset rather than /kaggle/working/.
CSV_filename = "/kaggle/input/version-17-columnwise-submission/submission.csv"
# The visualization call is left disabled; uncomment it to plot the masks.
# visualize_mask_and_image( tiff_image_dirname, CSV_filename )



# analyze_RLE
Checks for inconsistencies in the run-length encoding (RLE) of glom masks from "test" set.  Looks for "start" 
positions out of order, or two run-length segments overlapping or abutting.

Also does morphological operations:  openings (which tend to remove small isolated 1's) of various sizes and 
closings (which tend to fill in small gaps or concavities) and measures how many 1-pixels are removed or added.
The numbers listed (e.g., "3-opening") refer to the "radius" of a structuring element; thus, a 3-opening uses
a 7 x 7 array of 1's for its structuring element.

In [None]:
def _report_RLE_run_problems( tokens, image_pixel_count ):
    ''' Print an ERROR/WARNING line for every run-ordering problem found in
    the RLE tokens: starts out of order, overlapping runs, abutting runs,
    and a final run that extends past image_pixel_count. '''
    last_start, last_length = -1, 0
    for start_str, length_str in zip( tokens[ 0 : : 2 ], tokens[ 1 : : 2 ] ):
        start = int( start_str )
        length = int( length_str )
        if last_start >= start:
            print( f"  ERROR:  start {start} <= last_start {last_start }" )
        elif last_start + last_length > start:
            print( f"  ERROR:  last run {(last_start,last_length)} overlaps current run {(start,length)}" )
        elif last_start + last_length == start:
            print( f"  WARNING:  last run {(last_start,last_length)} abuts current run {(start,length)}" )
        last_start, last_length = start, length
    # Use last_length, not the loop variable: the loop variable is unbound
    # when the RLE contains no complete (start, length) pair.
    if last_start + last_length > image_pixel_count:
        print( "ERROR:  Overall last run goes past image size" )


def analyze_RLE( tiff_image_dirname, CSV_filename ):
    '''
    Check the predicted RLE of every .tiff image for inconsistencies and
    report how morphological openings / closings change the decoded mask.

    tiff_image_dirname: directory prefix of the .tiff images; "*.tiff" is
        appended by plain concatenation, so include the trailing separator.
    CSV_filename: CSV file with an "id" and a "predicted" (RLE) column.

    For each image this prints the number of RLE pairs, any run-ordering
    problems, the number of 1-pixels in the decoded mask, and the number of
    1-pixels removed by openings / added by closings with square
    structuring elements of radius 1, 3, 5, 7 and 9 (side 2 * radius + 1).
    Nothing is returned.
    '''
    image_RLEs = pd.read_csv( CSV_filename )

    for tiff_image_filename in glob.glob( tiff_image_dirname + "*.tiff" ):

        image_id = pathlib.Path( tiff_image_filename ).stem

        RLE = image_RLEs.predicted[ image_RLEs.id == image_id ]

        # A blank "predicted" cell is read by pandas as NaN (a float), which
        # cannot be split; report it as an empty RLE.
        if len( RLE ) == 0 or pd.isna( RLE.values[ 0 ] ):
            print( "for image_id", image_id, "RLE is empty" )
            continue

        RLE = RLE.values[ 0 ]   # Extract from 1-long Pandas Sequence
        print( "\nimage_id", image_id, "len(RLE)", len( RLE ) )
        tokens = RLE.split()
        ntokens = len( tokens )
        npairs = ntokens // 2
        if ntokens % 2 != 0:
            print( f"  ERROR:  Odd number of tokens {ntokens} in RLE string" )
        else:
            print( f"  Number of RLE pairs is {npairs}" )

        # Only the image dimensions are needed, so the file is closed as
        # soon as they have been read.
        with rasterio.open( tiff_image_filename, transform = None ) as dataset:
            image_pixel_rows, image_pixel_cols = dataset.shape
            print( "  tiff image dataset.shape", dataset.shape )

        _report_RLE_run_problems( tokens, image_pixel_rows * image_pixel_cols )

        mask = rle2mask( RLE, ( image_pixel_rows, image_pixel_cols ) )
        original_count = np.sum( mask )
        print( f"  mask 1's count {original_count}" )

        # Convert once for OpenCV instead of once per morphological operation.
        mask_u8 = mask.astype( np.uint8 )
        del mask

        for radius in range( 1, 11, 2 ):
            kernel = np.ones( ( 2 * radius + 1, 2 * radius + 1 ), np.uint8 )
            opened_mask = cv2.morphologyEx( mask_u8, cv2.MORPH_OPEN, kernel )
            count = np.sum( opened_mask )
            # int(): the two counts have different integer dtypes, so their
            # difference would otherwise be printed as a float.
            print( f"  mask 1's with {radius}-opening is {count} ({int(original_count-count)} 1's removed)" )
        for radius in range( 1, 11, 2 ):
            kernel = np.ones( ( 2 * radius + 1, 2 * radius + 1 ), np.uint8 )
            closed_mask = cv2.morphologyEx( mask_u8, cv2.MORPH_CLOSE, kernel )
            count = np.sum( closed_mask )
            print( f"  mask 1's with {radius}-closing is {count} ({int(count-original_count)} 0's removed)" )
                
# Run the RLE analysis with the tiff_image_dirname / CSV_filename values
# assigned just before the "analyze_RLE" section.
analyze_RLE( tiff_image_dirname, CSV_filename )
   
