# Introduction

Notebook to run Prediction, Encoding and Saving of Training Set TIFF images and DICE comparison with
ground truth masks

# Imports

In [None]:
! pip install ../input/kerasapplications/keras-team-keras-applications-3b180cb -f ./ --no-index -q
! pip install ../input/efficientnet/efficientnet-1.1.0/ -f ./ --no-index -q
import numpy as np
import pandas as pd
import os
import glob
import gc
import sys
import re

import matplotlib.pyplot as plt
%matplotlib inline

import tifffile
import rasterio
from rasterio.windows import Window

import pathlib
from tqdm.notebook import tqdm
import cv2

import tensorflow as tf
import efficientnet as efn
import efficientnet.tfkeras

import yaml
import pprint
import json

# Run parameters

In [None]:
# All directory names end in "/" because the code below builds paths by plain string concatenation.
# Directory that load_models() searches for *.h5 files and that also holds params.yaml / metrics.json
trained_model_dirname = "/kaggle/input/hubmap-models/"
# Directory searched for "*.tiff" images and the matching "<image_id>.json" ground-truth annotations
tiff_image_dirname = "/kaggle/input/hubmap-kidney-segmentation/train/"
# Destination CSV handed to predict_encode_save_testcases() (that call is currently disabled further down)
CSV_filename = "/kaggle/working/submission.csv"
# Previously saved predictions that the DICE / XOR comparison cells read
predicted_csv_filename = "/kaggle/input/predict-encode-and-save-for-train-images/submission.csv"


In [None]:
# Sanity check: list what is available under /kaggle/input, where the paths configured above point
! ls -al /kaggle/input

# Read and Print Parameters
Read the parameters and metrics saved by the Training notebook:

In [None]:
# Load the run parameters stored next to the trained models into the notebook-level dict P.
# NOTE(review): yaml.FullLoader is more permissive than yaml.safe_load; if params.yaml only
# holds plain scalars/lists/dicts, safe_load would be the safer choice -- confirm before switching.
with open(trained_model_dirname+'params.yaml') as file:
    P = yaml.load(file, Loader=yaml.FullLoader)
P[ 'THRESHOLD' ] = 0.3 # set here, not taken from params.yaml: predict_tile marks a pixel as 1 when preds > THRESHOLD
P[ 'DIM' ] = 1024  # ### REMOVE ### -- temporary override of whatever DIM params.yaml provided
pprint.pprint(P)
    
# Load the metrics file and print the model run datetime and out-of-fold DICE it records.
with open(trained_model_dirname + 'metrics.json') as json_file:
    M = json.load(json_file)
print('Model run datetime: '+M['datetime'])
print('OOF val_dice_coe: ' + str(M['oof_dice_coe']))


# Miscellaneous Functions

In [None]:
def running_on_TPU():
    '''
    Return True when a TPU can be resolved, connected to and initialised,
    otherwise False.

    Detection is best-effort: any ordinary error raised along the way is
    interpreted as "no TPU available".
    '''
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        # Building the strategy is the last step of the check; the object itself is not needed here
        tf.distribute.experimental.TPUStrategy(tpu)
        return True
    except Exception:
        # "except Exception" rather than a bare "except" so that KeyboardInterrupt /
        # SystemExit (e.g. interrupting the kernel) still propagate
        return False
    
print( "running_on_TPU", running_on_TPU(), file = sys.stderr )

def load_models( mod_path ):
    '''
    Load every saved model matching mod_path + "*.h5" (one per training "fold").

    The models are loaded with compile = False and returned as a list, in the
    order glob reports the files.  The list is empty when nothing matches.
    '''
    return [ tf.keras.models.load_model( model_file, compile = False )
             for model_file in glob.glob( mod_path + '*.h5' ) ]

def tiff_image_shape( image_pathname ):
    '''
    Open the image at "image_pathname" with rasterio and return its "shape"
    attribute; the file is closed again before returning.
    '''
    with rasterio.open( image_pathname ) as dataset:
        shape = dataset.shape
    return shape

# Run length encoding (RLE) Functions
Based on https://www.kaggle.com/friedchips/fully-correct-hubmap-rle-encoding-and-decoding:


In [None]:
def encode_RLE( mask, column_pixel_offset = 0 ):
    '''
    Run-length encode a 2D mask in column-major (column-first) pixel order.

    "mask" is a predicted image tile column: any nonzero / True entry counts as
    foreground.  "column_pixel_offset" is added to every run start so that the RLE
    of one tile column can be concatenated with the RLEs of the other columns to
    form the RLE of the entire image.

    Returns a space-separated string "start length start length ..." where starts
    are 0-based positions in the flattened mask (plus the offset), matching what
    rle2mask() decodes.  An all-zero mask yields the empty string.
    '''
    # Cast to bool first: the bitwise ~ / & below are only valid run detectors on
    # boolean data (on uint8, neighbouring values such as 1 and 2 would register as
    # a spurious boundary, and float masks would raise).
    mask = np.asarray( mask ).astype( bool )
    mask = mask.T.reshape(-1) # make 1D, column-first
    mask = np.pad(mask, 1) # make sure that the 1d mask starts and ends with a 0
    starts = np.nonzero((~mask[:-1]) & mask[1:])[0] # 0 -> 1 transitions
    ends = np.nonzero(mask[:-1] & (~mask[1:]))[0] # 1 -> 0 transitions
    # int64 explicitly: offsets of whole-slide images can exceed the 32-bit range
    rle = np.empty(2 * starts.size, dtype=np.int64) # interlacing...
    rle[0::2] = starts + column_pixel_offset # ...starts...
    rle[1::2] = ends - starts # ...and lengths
    return ' '.join( str( elem ) for elem in rle ) # turn into space-separated string

def rle2mask(rle, mask_shape):
    '''
    Decode a space-delimited, column-first RLE string ("start length start length ...",
    0-based starts) into a 2D boolean numpy array of shape "mask_shape".
    '''
    flat_mask = np.zeros( np.prod( mask_shape ), dtype = bool )   # 1D, all background
    numbers = np.array( rle.split() ).astype( int )
    for run_start, run_length in zip( numbers[0::2], numbers[1::2] ):
        flat_mask[ run_start : run_start + run_length ] = True
    # The runs are column-first, so fill a (cols, rows) array and transpose it
    return flat_mask.reshape( np.flip( mask_shape ) ).T


# Low-memory glom Prediction and Encoding
This version of the code reduces memory use by performing model prediction and Run Length Encoding (RLE)
one tile column at a time rather than for the entire image at once.

In [None]:
def predict_encode_save_testcases( image_dirname, model_dirname, CSV_filename, P, noheader = False ):
    '''
    Predict a mask for every "*.tiff" image in "image_dirname", run-length encode
    it, and write one row per image ("id", "predicted") to "CSV_filename".

    image_dirname : directory prefix of the TIFF images (must end with "/")
    model_dirname : directory prefix passed to load_models()
    CSV_filename  : path of the CSV file to write
    P             : parameter dict forwarded to predict_encode_image()
    noheader      : when True the CSV is written without the header row
                    (default False keeps the header)

    Prediction is best-effort per image: if it fails, the error is reported on
    stderr and a fallback RLE is stored so that the CSV still has a row for
    every image.
    '''
    models = load_models( model_dirname )
    records = []
    for image_index, image_pathname in enumerate( glob.glob( image_dirname + "*.tiff" ) ):
        print( "predict_encode_save_testcases for image", image_pathname )
        image_id = pathlib.Path( image_pathname ).stem
        if image_index in range( 100 ):   # All TIFF images (only the first 100 are predicted)
            try:
                RLE = predict_encode_image( image_pathname, models, P )
            except Exception as error:
                # Keep the fallback, but never hide why the prediction failed
                print( "prediction failed for image", image_pathname, ":", repr( error ), file = sys.stderr )
                image_rows, image_cols = tiff_image_shape( image_pathname )
                # NOTE(review): labelled "all 1's image", yet the run length is one short
                # of rows * cols -- confirm whether that is intended
                RLE = "0 " + str( image_rows * image_cols - 1 )   # all 1's image
        else:
            RLE = "0 1"  # single 1 image
        records.append( { "id": image_id, "predicted" : RLE } )
        print( "image_id", image_id, "len(RLE)", len( RLE ) )

    # Write results to CSV_filename; naming the columns keeps the file well-formed
    # even when no image was found
    results_df = pd.DataFrame( records, columns = [ "id", "predicted" ] )
    results_df.to_csv( CSV_filename, index=False, header = not noheader )

    
        
def predict_encode_image( image_pathname, models, P ):
    '''
    Predict the mask of the image stored at "image_pathname", one DIM-wide tile
    column at a time, and return the Run Length Encoding (RLE) of the whole mask.

    Only tile columns that fit completely inside the image width are processed.
    '''
    with rasterio.open( image_pathname ) as image:
        _, n_cols = image.shape
        tile_size = P['DIM']
        column_starts = range( 0, n_cols // tile_size * tile_size - tile_size + 1, tile_size )
        column_rles = [ predict_encode_column( image, models, column_start, P )
                        for column_start in column_starts ]
        # Joining with single blanks and stripping gives the same text as appending
        # "<rle> " piece by piece and stripping at the end
        return " ".join( column_rles ).strip()

def predict_encode_column( image, models, start_image_col, P ):
    '''
    Build the predicted mask for the DIM-wide column of "image" whose left edge is
    "start_image_col" and return its run-length encoding.

    The column is filled tile by tile from the top; rows below the last complete
    tile stay 0.  Run starts are offset so that they index into the full image.
    '''
    n_rows, _ = image.shape
    tile_size = P['DIM']
    overlap = P['PIXEL_OVERLAP']
    bordered_shape = ( tile_size + 2 * overlap, tile_size + 2 * overlap, 3 )

    strip_mask = np.zeros( ( n_rows, tile_size ), dtype = np.uint8 )
    for tile_row in range( 0, n_rows // tile_size * tile_size - tile_size + 1, tile_size ):
        bordered_tile = get_bordered_tile( image, tile_row, start_image_col, tile_size, overlap )
        assert bordered_tile.shape == bordered_shape
        tile_mask = predict_tile( bordered_tile, models, P )
        assert tile_mask.shape == ( tile_size, tile_size )
        strip_mask[ tile_row : tile_row + tile_size, : ] = tile_mask

    # In column-first order, every image column left of this strip holds n_rows pixels
    return encode_RLE( strip_mask, start_image_col * n_rows )
    
    
def predict_tile( image_tile, models, P ):
    '''
    Perform prediction for "image_tile", averaging over "models", and return the
    thresholded mask as a DIM x DIM uint8 array of 0's and 1's.

    image_tile : array of shape (DIM + 2*OVL, DIM + 2*OVL, 3), i.e. the DIM x DIM
                 tile PLUS a PIXEL_OVERLAP border on all four sides
    models     : non-empty sequence of objects with a predict( batch ) method
    P          : parameter dict providing 'THRESHOLD', 'DIM' and 'PIXEL_OVERLAP'

    Raises ValueError when "models" is empty.
    '''
    THRESHOLD = P['THRESHOLD']
    DIM = P['DIM']
    OVL = P['PIXEL_OVERLAP']
    assert image_tile.shape == ( DIM + 2 * OVL, DIM + 2 * OVL, 3 )
    if len( models ) == 0:
        raise ValueError( "predict_tile needs at least one model" )

    image_tile_batch = np.expand_dims( image_tile, axis = 0 )   # models expect a batch axis
    prediction_sum = None
    for model in models:
        model_prediction = model.predict( image_tile_batch )
        prediction_sum = model_prediction if prediction_sum is None else prediction_sum + model_prediction
    mean_prediction = prediction_sum / len( models )

    predicted_mask_tile = ( np.squeeze( mean_prediction ) > THRESHOLD ).astype( np.uint8 )
    # Trim the border with explicit end indices: "OVL : -OVL" would select nothing
    # when PIXEL_OVERLAP is 0
    predicted_mask_tile = predicted_mask_tile[ OVL : OVL + DIM, OVL : OVL + DIM ]
    assert predicted_mask_tile.shape == ( DIM, DIM )

    return predicted_mask_tile

def get_bordered_tile( image, pixel_row, pixel_col, DIM, OVL ):
    '''
    Read the DIM x DIM tile of "image" whose upper left corner is (pixel_row, pixel_col)
    together with an OVL-pixel border on every side.  On each side the border is either
    read from the neighbouring image pixels or, for tiles at the edge of the tiled area,
    produced by reflecting the tile about that edge.  Returns the tile with the band
    axis last, i.e. (DIM + 2*OVL) x (DIM + 2*OVL) x 3.
    '''
    n_rows, n_cols = image.shape
    # Bottom / right borders are read from the image only while they stay inside the
    # area covered by complete DIM x DIM tiles
    row_limit = n_rows // DIM * DIM - OVL - DIM + 1
    col_limit = n_cols // DIM * DIM - OVL - DIM + 1

    # Per side (top, bottom, left, right): +OVL = read from the image, -OVL = reflect
    shifts = (
        OVL if pixel_row > OVL else -OVL,
        OVL if pixel_row < row_limit else -OVL,
        OVL if pixel_col > OVL else -OVL,
        OVL if pixel_col < col_limit else -OVL,
    )
    read_top, read_bot, read_lft, read_rgt = [ max( 0, shift ) for shift in shifts ]
    pad_top, pad_bot, pad_lft, pad_rgt = [ max( 0, -shift ) for shift in shifts ]

    row_span = ( pixel_row - read_top, pixel_row + DIM + read_bot )
    col_span = ( pixel_col - read_lft, pixel_col + DIM + read_rgt )
    bands = image.read( [1, 2, 3 ], window = Window.from_slices( row_span, col_span ) )
    tile = np.moveaxis( bands, 0, -1 )   # bands-first -> bands-last
    if pad_top or pad_bot or pad_lft or pad_rgt:
        tile = cv2.copyMakeBorder( tile, pad_top, pad_bot, pad_lft, pad_rgt, cv2.BORDER_REFLECT )
    return tile

# Compare predicted "train" glom mask against ground truth

In [None]:
def read_mask_image_from_json( json_filename, mask_shape ):
    '''
    Load the polygon annotations stored in "json_filename", rasterise them into an
    image of shape "mask_shape" and return that image as a boolean mask (True inside
    the polygons).
    '''
    with open( json_filename, "r") as json_file:
        annotations = json.load( json_file )

    # One coordinate array per annotation entry
    polygons = [ np.array( annotations[ entry ][ 'geometry' ][ 'coordinates' ] )
                 for entry in range( len( annotations ) ) ]

    canvas = np.zeros( mask_shape, dtype = np.uint8 )
    cv2.fillPoly( canvas, polygons, 1 )
    return canvas.astype( bool )

def read_mask_from_csv( csv_filename, image_id, mask_shape ):
    '''
    Read the Run Length Encoding (RLE) stored for "image_id" in "csv_filename"
    (columns "id" and "predicted") and return it decoded as a boolean image of
    shape "mask_shape".

    An empty "predicted" field (an image without any predicted pixel) decodes to
    an all-False mask.  Raises IndexError when the CSV has no row for "image_id".
    '''
    # Read ids as text so that ids consisting only of digits are not turned into numbers
    CSV = pd.read_csv( csv_filename, dtype = { "id": str } )
    matches = CSV.loc[ CSV[ "id" ] == str( image_id ), "predicted" ]
    if matches.empty:
        raise IndexError( "no row for image id " + repr( image_id ) + " in " + str( csv_filename ) )
    RLE = matches.values[ 0 ]
    # to_csv writes an empty RLE as an empty field, which read_csv returns as NaN
    if pd.isna( RLE ):
        RLE = ""
    return rle2mask( RLE, mask_shape )
    
def calculate_DICE_for_two_masks( predicted_mask, ground_truth_mask ):
    '''
    Return the pair ( numerator, denominator ) of the DICE coefficient of two binary
    glom mask images: twice the size of their intersection, and the sum of their
    individual sizes.  The division is left to the caller.
    '''
    overlap = predicted_mask & ground_truth_mask
    numerator = 2 * np.sum( overlap )
    denominator = np.sum( predicted_mask ) + np.sum( ground_truth_mask )
    return numerator, denominator

def DICE_compare_two_masks( image_id, tiff_image_dirname, predicted_csv_filename ):
    '''
    Compare the predicted mask of "image_id" (decoded from "predicted_csv_filename")
    with its ground-truth mask (from "<image_id>.json" in "tiff_image_dirname") and
    return the ( numerator, denominator ) pair of their DICE coefficient.
    '''
    path_stem = tiff_image_dirname + image_id
    mask_shape = tiff_image_shape( path_stem + ".tiff" )
    truth_mask = read_mask_image_from_json( path_stem + ".json", mask_shape )
    predicted_mask = read_mask_from_csv( predicted_csv_filename, image_id, mask_shape )
    # Post-processing experiments, left disabled:
    #   clean_binary_image( predicted_mask, 0 )  -- cleaning doesn't help
    #   expand( predicted_mask, -1 )             -- only improves score from 0.94938 to .94985
    return calculate_DICE_for_two_masks( predicted_mask, truth_mask )

def DICE_compare_predicted_ground_truth( tiff_image_dirname, predicted_csv_filename ):
    '''
    Compute the DICE score of predicted vs. ground-truth mask for every "*.tiff"
    image in "tiff_image_dirname" and print the overall score, i.e. the sum of all
    numerators divided by the sum of all denominators.

    Returns { image_id : DICE_score, ... }.  A score is NaN when both masks of an
    image are empty (0 / 0); the overall score is NaN when that holds for every
    image or when no image was found.
    '''
    tiff_image_filenames = glob.glob( tiff_image_dirname + "*.tiff" )
    DICE_scores = {}
    sum_num = 0.0
    sum_den = 0.0
    for tiff_image_filename in tiff_image_filenames:
        image_id = pathlib.Path( tiff_image_filename ).stem
        print( "predicting mask for image", image_id )
        num, den = DICE_compare_two_masks( image_id, tiff_image_dirname, predicted_csv_filename )
        # Guard the division: two empty masks give 0 / 0
        DICE_scores[ image_id ] = num / den if den > 0 else float( "nan" )
        sum_num += num
        sum_den += den
    # Without this guard an empty directory ends in ZeroDivisionError (0.0 / 0.0)
    overall_score = sum_num / sum_den if sum_den > 0 else float( "nan" )
    print( "weighted average DICE score", overall_score )
    return DICE_scores

def clean_binary_image( image, radius ):
    '''
    Apply a morphological opening followed by a closing to the binary "image", both
    with a square ( 2*radius + 1 ) kernel, and return the result as a boolean array.
    For radius <= 0 the image is returned unchanged.
    '''
    if radius > 0 :
        kernel = np.ones( ( 2 * radius + 1, 2 * radius + 1 ) )
        image = cv2.morphologyEx( image.astype( np.uint8 ), cv2.MORPH_OPEN, kernel )
        # Builtin bool: the np.bool alias no longer exists in current NumPy releases
        image = cv2.morphologyEx( image, cv2.MORPH_CLOSE, kernel ).astype( bool )
    return image

def expand( image, radius ):
    '''
    Grow (radius > 0, dilation) or shrink (radius < 0, erosion) the binary "image"
    with a square kernel of side 2*|radius| + 1.  Otherwise "image" is returned
    unchanged.
    '''
    if radius > 0:
        morph = cv2.dilate
    elif radius < 0:
        morph = cv2.erode
    else:
        return image
    side = 2 * abs( radius ) + 1
    return morph( image.astype( np.uint8 ), np.ones( ( side, side ) ) )

# Do predictions for all "train" images

This step generates glom mask predictions for all eight "train" images.
This is, of course, a cheat because we're using the models that were trained on these same "train" images.  This will take several hours.  Put resulting RLEs in "predict-encode-and-save-for-train-images/train_submission.csv"

This step was executed once, and the results were saved in the dataset https://www.kaggle.com/markalavin/predict-encode-and-save-for-train-images.  After that,
this step was commented out, and comparisons (DICE, XOR) were made between predicted::ground truth pairs of glom masks.

In [None]:
# Disabled: the call is wrapped in a string literal, so running this cell does NOT start
# the prediction step.  Remove the triple quotes to regenerate CSV_filename.
'''
predict_encode_save_testcases( tiff_image_dirname, trained_model_dirname, CSV_filename, P )
'''

# Do the DICE comparison of predicted and ground truth
This step is intended to calculate the DICE coefficient (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient), which measures the similarity of two binary images and ranges from 0 (all pixels differ between predicted and ground truth images) to 1 (predicted and ground truth images are identical).

The overall DICE coefficient is calculated between the composite of all eight train image predictions and ground truth images; that is, it's the average of the DICE scores for each train image, weighted by the number of 1's in the predicted + ground truth masks.

In [None]:
# Per-image DICE scores for the saved predictions; the call itself prints the overall weighted score
DICE_scores = DICE_compare_predicted_ground_truth( tiff_image_dirname, predicted_csv_filename )
print( "DICE_scores", DICE_scores )

# Windowed DICE Comparison
Break predicted and ground truth mask images into DIM x DIM windows, calculate a DICE score for each window, show histogram of results.  NOTE that all-0's windows are ignored, rather than treating as if they had DICE scores of 1.0.

In [None]:
def DICE_window_compare_two_masks( image_id, tiff_image_dirname, predicted_csv_filename,
                                 threshold = None ):
    '''
    Cut the predicted and ground-truth masks of "image_id" into DIM x DIM windows
    (DIM is read from the notebook-level parameter dict P) and return
    { ( start_row, start_col ) : DICE_score, ... }.  Windows in which both masks are
    all 0's are left out.  "threshold" is accepted but not used.
    '''
    window = P[ 'DIM' ]
    scores = {}
    image_shape = tiff_image_shape( tiff_image_dirname + image_id + ".tiff" )
    ground_truth_mask = read_mask_image_from_json( tiff_image_dirname + image_id + ".json", image_shape )
    predicted_mask = read_mask_from_csv( predicted_csv_filename, image_id, image_shape )
    # Only windows lying completely inside the image are visited
    for top in range( 0, image_shape[ 0 ] // window * window - window + 1, window ):
        rows = slice( top, top + window )
        for left in range( 0, image_shape[ 1 ] // window * window - window + 1, window ):
            cols = slice( left, left + window )
            numerator, denominator = calculate_DICE_for_two_masks( predicted_mask[ rows, cols ],
                                                                   ground_truth_mask[ rows, cols ] )
            if not ( ( numerator > 0 ) or ( denominator > 0 ) ):
                continue   # both windows empty
            scores[ ( top, left ) ] = numerator / denominator
    return scores

def DICE_window_compare_predicted_ground_truth( tiff_image_dirname, predicted_csv_filename ):
    '''
    For every "*.tiff" image in "tiff_image_dirname", compute the windowed DICE scores
    of predicted vs. ground-truth mask and show them as a 32-bin histogram over [0, 1].
    Returns None.
    '''
    for image_path in glob.glob( tiff_image_dirname + "*.tiff" ):
        image_id = pathlib.Path( image_path ).stem
        print( "for image", image_id )
        window_scores = DICE_window_compare_two_masks( image_id, tiff_image_dirname, predicted_csv_filename )
        plt.hist( window_scores.values(), bins = 32, range = ( 0, 1.0 ) )
        plt.show()
    return None


In [None]:
# Show one histogram of per-window DICE scores for each train image
DICE_window_compare_predicted_ground_truth( tiff_image_dirname, predicted_csv_filename )

# Windowed XOR comparison
Similar to Windowed DICE comparison, but just counts the number of discrepant pixels between each of the predicted/ground truth image pairs.  Thus, while higher DICE score
(max. 1.0) is desirable, we would like the windowed XOR score to be minimal, and 0.0
means the predicted and ground truth windowed images are identical.

For purposes of comparison, we define a threshold number of discrepant pixels, that is, we only consider windows where the predicted and ground truth images have at least the threshold number of discrepancies.

Result is reported in two ways:   First, we plot a histogram of XOR scores, which ranges from the threshold to the window size (DIM x DIM).  Second, for each window with at least the threshold number of discrepancies, we plot four binary images:

   predicted_image,  ground_truth_image, pNOTg, and gNOTp
   
where the latter two show the two 1-sided differences, e.g., all the pixels that are 1 in the predicted image and 0 in the ground-truth image.

In [None]:
def calculate_XOR_for_two_masks( predicted_mask, ground_truth_mask ):
    '''
    Count the one-sided discrepancies between two binary glom mask images and return
    them as ( pNOTg, gNOTp ): pixels set only in the predicted mask, and pixels set
    only in the ground-truth mask.
    '''
    not_ground_truth = np.logical_not( ground_truth_mask )
    not_predicted = np.logical_not( predicted_mask )
    only_predicted = np.logical_and( predicted_mask, not_ground_truth )
    only_ground_truth = np.logical_and( ground_truth_mask, not_predicted )
    return np.sum( only_predicted ), np.sum( only_ground_truth )

def XOR_window_compare_two_masks( image_id, tiff_image_dirname, predicted_csv_filename,
                                 threshold = None ):
    '''
    Cut the predicted and ground-truth masks of "image_id" into DIM x DIM windows
    (DIM is read from the notebook-level parameter dict P) and count the discrepant
    pixels in each window.

    Returns { ( start_row, start_col ) : ( pNOTg, gNOTp ), ... }, but omits windows
    where both predicted and ground-truth masks are all 0's.

    When "threshold" is given, every window with at least that many discrepant pixels
    ( pNOTg + gNOTp ) is also displayed as four images: predicted, ground truth,
    predicted-but-not-ground-truth and ground-truth-but-not-predicted.  With the
    default threshold = None nothing is plotted.
    '''
    DIM = P[ 'DIM' ]
    result = {}
    image_shape = tiff_image_shape( tiff_image_dirname + image_id + ".tiff" )
    ground_truth_mask = read_mask_image_from_json( tiff_image_dirname + image_id + ".json", image_shape )
    predicted_mask = read_mask_from_csv( predicted_csv_filename, image_id, image_shape )
    for start_row in range( 0, image_shape[ 0 ] // DIM * DIM - DIM + 1, DIM ):
        for start_col in range( 0, image_shape[ 1 ] // DIM * DIM - DIM + 1, DIM ):
            predicted_mask_window = predicted_mask[ start_row : start_row + DIM, start_col : start_col + DIM ]
            ground_truth_mask_window = ground_truth_mask[ start_row : start_row + DIM, start_col : start_col + DIM ]
            # The DICE terms are only used to recognise windows that are empty in both masks
            DICE_num, DICE_denom = calculate_DICE_for_two_masks( predicted_mask_window, ground_truth_mask_window )
            if ( DICE_num > 0 ) or ( DICE_denom > 0 ):
                pNOTg, gNOTp = calculate_XOR_for_two_masks( predicted_mask_window, ground_truth_mask_window )
                result[ ( start_row, start_col ) ] = pNOTg, gNOTp
                # "and" short-circuits, so the count is never compared with None
                if ( threshold is not None ) and ( ( pNOTg + gNOTp ) >= threshold ):
                    fig, axes = plt.subplots( 1, 4, figsize = ( 12.0, 6.0 ) )
                    fig.suptitle( "Image " + image_id + " at " + str( start_row ) + ", " + str( start_col ) )
                    axes[ 0 ].imshow( predicted_mask_window )
                    axes[ 1 ].imshow( ground_truth_mask_window )
                    axes[ 2 ].imshow( np.logical_and( predicted_mask_window, np.logical_not( ground_truth_mask_window ) ) )
                    axes[ 3 ].imshow( np.logical_and( ground_truth_mask_window, np.logical_not( predicted_mask_window ) ) )
                    plt.show()
                    plt.close( fig )   # release the figure; many windows may be plotted
    return result

def XOR_window_compare_predicted_ground_truth( tiff_image_dirname, predicted_csv_filename ):
    '''
    For every "*.tiff" image in "tiff_image_dirname", compute the windowed XOR
    comparison of predicted vs. ground-truth mask (which also displays the windows
    with at least "threshold" discrepant pixels) and show a 32-bin histogram of the
    number of discrepant pixels per window, restricted to those same windows.
    Returns None.
    '''
    threshold = 5000   # minimum number of discrepant pixels for a window to be reported
    tiff_image_filenames = glob.glob( tiff_image_dirname + "*.tiff" )
    for tiff_image_filename in tiff_image_filenames:
        image_id = pathlib.Path( tiff_image_filename ).stem
        print( "for image", image_id )
        XOR_window_scores = XOR_window_compare_two_masks( image_id, tiff_image_dirname, predicted_csv_filename, threshold = threshold )
        # Reduce each ( pNOTg, gNOTp ) pair to one number per window; handing the pairs
        # to hist directly would make it treat every window as its own data set
        discrepancies = [ pNOTg + gNOTp for pNOTg, gNOTp in XOR_window_scores.values() ]
        discrepancies = [ count for count in discrepancies if count >= threshold ]
        plt.hist( discrepancies, bins = 32 )
        plt.show()
    return


In [None]:
# For each train image: display the windows with many discrepant pixels, then a histogram
XOR_window_compare_predicted_ground_truth( tiff_image_dirname, predicted_csv_filename )