# Contents
[1. Info](#1.-Info)
 - 1-1. Pre-processed data
 - 1-2. Trained Models
 - 1-3. Datasets
    
    
[2. Used Packages](#2.-Used-Packages) 
 - 2-1. Install & Import Packages
 - 2-2. Define Functions
    
    
[3. Models](#3.-Models)
 - Set up 2 kinds of fold model groups
 - 3-1. fold_models_1 : 'hubmap-tf-with-tpu-efficientunet-512x512-train'
 - 3-2. fold_models_2 : 'hubmap-models-cv-08848-pl-0847' 


4. Inference (fold model predict)


- Save Submission File

-----
# 1. Info
* 1-1. Pre-processed data : This notebook is a summary that combines several other trials
* 1-2. Image Data Pre-Processing & Basic Model, based on [Wojtek's work](https://www.kaggle.com/wrrosa/hubmap-tf-with-tpu-efficientunet-512x512-train) 
    * Model Prediction, based on [Ashish Gupta's work](https://www.kaggle.com/roydatascience/hubmap-sub-effunet5-tpu-efficientunet-512x512)
* 1-3. Before you start, make sure the following datasets are added to your notebook
    * for package installation : 'kerasapplications' , 'efficientnet'
    * for trained models       : 'hubmap-models-cv-0.8848-pl=0.847' , 'hubmap-tf-with-tpu-efficientunet-512x512' 
* (Optional) If you want to know how to handle large TIFF images, check [Nihad TP's work](https://www.kaggle.com/nihadtp/kidney-hacking-exploration-stage)

-----
# 2. Used Packages

* 2-1. Install & Import Packages
* 2-2. Define Functions

> ---
> ### 2-1. Install & Import Packages

In [None]:
# Install 'Keras Applications' & 'Efficientnet' Packages
! pip install ../input/kerasapplications/keras-team-keras-applications-3b180cb -f ./ --no-index -q
! pip install ../input/efficientnet/efficientnet-1.1.0/ -f ./ --no-index -q

# Import Installed Packages

import numpy as np
import pandas as pd
import os
import glob
import gc

import rasterio
from rasterio.windows import Window

import pathlib
from tqdm.notebook import tqdm
import cv2

import tensorflow as tf
import efficientnet as efn
import efficientnet.tfkeras

import os, glob, gc
import json

osj = os.path.join # function to merge dir names

> ---
> ### 2-2. Define Functions

In [None]:
# Functions from Wojtek's kernel

def rle_encode_less_memory(img):
    """
        Run-length encode a mask into a space-separated string.

        The mask is traversed column by column (transpose, then flatten).
        The result alternates the 1-based start of each run with its length.
        The first and last entries of the flattened copy are zeroed so that
        value changes always come in start/end pairs; `img` itself is left
        untouched because `flatten()` returns a copy.
    """
    flat = img.T.flatten()
    flat[0] = 0
    flat[-1] = 0

    # 1-based positions at which the value differs from the previous pixel
    changes = np.flatnonzero(flat[1:] != flat[:-1]) + 2
    # every second entry is a run end: turn it into a length (end - start)
    changes[1::2] -= changes[::2]
    return ' '.join(map(str, changes))

def make_grid(shape, window=256, min_overlap=32):
    """
        Return Array of size (N,4), where N - number of tiles,
        2nd axis represents slices: x1,x2,y1,y2

        shape       : (x, y) extent of the area to tile
        window      : edge length of one square tile
        min_overlap : minimum overlap between neighbouring tiles

        The last tile along each axis is aligned to the far edge. When an
        axis is shorter than `window` its start is clamped to 0, so no tile
        ever starts at a negative coordinate (a negative start would wrap
        around when used as an array slice).
    """
    x, y = shape
    step = window - min_overlap

    nx = x // step + 1
    x1 = np.linspace(0, x, num=nx, endpoint=False, dtype=np.int64)
    x1[-1] = max(x - window, 0)   # align the last tile to the edge, never below 0
    x2 = (x1 + window).clip(0, x)

    ny = y // step + 1
    y1 = np.linspace(0, y, num=ny, endpoint=False, dtype=np.int64)
    y1[-1] = max(y - window, 0)
    y2 = (y1 + window).clip(0, y)

    # Fill the (nx, ny, 4) grid by broadcasting instead of a double Python loop
    slices = np.zeros((nx, ny, 4), dtype=np.int64)
    slices[:, :, 0] = x1[:, None]
    slices[:, :, 1] = x2[:, None]
    slices[:, :, 2] = y1[None, :]
    slices[:, :, 3] = y2[None, :]
    return slices.reshape(nx * ny, 4)

-----

# 3. Models

* 3-1. fold_models_1 : 'hubmap-tf-with-tpu-efficientunet-512x512-train'
* 3-2. fold_models_2 : 'hubmap-models-cv-08848-pl-0847'


> ---
> ### 3-1. fold_models_1 : 'hubmap-tf-with-tpu-efficientunet-512x512-train'

In [None]:
import yaml    # reads the YAML parameter file
import pprint  # pretty-prints the loaded parameters

# Directory that holds the fold models together with 'metrics' and 'params'
mod_path = '/kaggle/input/hubmap-tf-with-tpu-efficientunet-512x512-train/'

# Read the parameters stored next to the models, then show them
with open(f'{mod_path}params.yaml') as file:
    P = yaml.load(file, Loader=yaml.FullLoader)
pprint.pprint(P)

# Inference settings for this model group
THRESHOLD, WINDOW, MIN_OVERLAP = 0.4, 1024, 300
NEW_SIZE = P['DIM']   # tile edge length taken from the 'DIM' parameter

In [None]:
import pandas as pd
import json

# Load METRICS, information of Models.
# The with-block closes the file on exit, so no explicit close() is needed.
with open(mod_path + 'metrics.json') as json_file:
    M = json.load(json_file)

# Validation metrics as a table, restricted to the three columns of interest
M_info = pd.DataFrame(M)[['val_loss', 'val_dice_coe', 'val_accuracy']]

print('[ Info of Models ]')
print('%-18s : %s' % ('Model Updated Date', str(M['datetime'])))
# 'oof_dice_coe' is a dice coefficient, not an accuracy, so label it as such
print('%-18s : %s' % ('OOF dice coeff.', str(M['oof_dice_coe'])))
M_info.head()

In [None]:
# Identity affine transform, i.e. pixel coordinates are used unchanged:
# | x' |   | 1 0 0 | | x |
# | y' | = | 0 1 0 | | y |
# | 1  |   | 0 0 1 | | 1 |
identity = rasterio.Affine(1, 0, 0, 0, 1, 0)

# Directory that holds the fold models together with 'metrics' and 'params'
mod_path = '/kaggle/input/hubmap-tf-with-tpu-efficientunet-512x512-train/'

# Load every '.h5' file found in the directory as one fold model of group 1
# (compile=False: the models are loaded without compiling them)
fold_models_1 = [
    tf.keras.models.load_model(fold_model_path, compile=False)
    for fold_model_path in glob.glob(mod_path + '*.h5')
]
print('Target Fold Models Group 1 : %d EA' % len(fold_models_1))

> ---
> ### 3-2. fold_models_2 : 'hubmap-models-cv-08848-pl-0847'

In [None]:
# Parameters from ISA's Kernel

debug = True  # set to True or False
# Limits used in debug mode; effectively unlimited otherwise
n_debug_images = 1 if debug else 1000000000
n_debug_slices = 20 if debug else 1000000000

# whether to run prediction when committing. WILL RUN predictions during submission in any case
do_predict = bool(debug)

# One model file per fold, all stored in the same dataset directory
models_dir = '../input/hubmap-models-cv-08848-pl-0847'
model_filepaths = [osj(models_dir, f"model-fold-{i}.h5") for i in range(4)]

# Every model path must be listed exactly once
assert len(set(model_filepaths)) == len(model_filepaths)
model_dirnames = list(map(os.path.dirname, model_filepaths))

import yaml
import pprint

# Parameters are read from the directory of the first model
with open(osj(model_dirnames[0], 'params.yaml')) as file:
    P = yaml.load(file, Loader=yaml.FullLoader)
pprint.pprint(P)

# Inference settings (these replace the values set for model group 1)
THRESHOLD, WINDOW, MIN_OVERLAP = 0.30, 1024, 32
NEW_SIZE = P['DIM']

# All model files have to exist before going on
assert all(os.path.isfile(path_) for path_ in model_filepaths)
print("\n Number of models : {}".format(len(model_filepaths)))

In [None]:
# About the Models

ave_score = 0
for i, m_path in enumerate(model_filepaths):
    # The fold index is the number after the last '-' in the file name
    stem = m_path.split('.')[-2]
    fold_ = int(stem.split('-')[-1])
    group_dir = model_dirnames[i]

    with open(osj(group_dir, 'metrics.json')) as json_file:
        M = json.load(json_file)

    # Metrics of this fold, rounded for display
    dice, loss, acc = (round(M[key][fold_], 5)
                       for key in ('val_dice_coe', 'val_loss', 'val_accuracy'))
    print(f"\n ----------- \nModel {group_dir.split('/')[-1]}"
          f"\nval_dice_coe: {dice}\tval_loss: {loss}\tval_accuracy: {acc}")


# Averages over all folds, once per distinct model directory
for model_group in np.unique(model_dirnames):
    with open(osj(model_group, 'metrics.json')) as json_file:
        M = json.load(json_file)

    ave_dice = np.mean(M['val_dice_coe'])
    ave_loss = np.mean(M['val_loss'])
    ave_accuracy = np.mean(M['val_accuracy'])

    print(f"\n ============ MODEL GROUP {model_group} ==============")
    for line, value in (
        (" ------------ \nAVERAGE DICE SCORE = {}", ave_dice),
        (" ------------ \nAVERAGE VALIDATION LOSS = {}", ave_loss),
        (" ------------ \nAVERAGE VALIDATION ACCURACY = {}", ave_accuracy),
    ):
        print(line.format(round(value, 5)))

In [None]:
%%time
# Build the list of fold models for group 2

if do_predict:
    # Identity affine transform (pixel coordinates used unchanged)
    identity = rasterio.Affine(1, 0, 0, 0, 1, 0)

    # One loaded model per entry in model_filepaths, without compiling
    fold_models_2 = [
        tf.keras.models.load_model(fold_model_path, compile=False)
        for fold_model_path in model_filepaths
    ]

-----
# 4. Inference (fold model predict)

In [None]:
# Report how many fold models each group contains
print(f'Target Fold Model Group 1 : {len(fold_models_1):d} EA')
print(f'Target Fold Model Group 2 : {len(fold_models_2):d} EA')

In [None]:
# Root of the competition data; the test images are under 'test/'
p = pathlib.Path('../input/hubmap-kidney-segmentation')
subm = {}


def _mean_prediction(models, batch):
    """Return the average of the squeezed predictions of all `models` on `batch`."""
    if not models:
        raise ValueError('no fold models loaded for this group')
    total = None
    for model in models:
        out = np.squeeze(model.predict(batch))
        total = out if total is None else total + out
    return total / len(models)


###########################################
###### Set to True to skip prediction #####
skip_pred = False
###########################################
if not skip_pred:
    # List the test images once so the progress-bar total matches the loop
    test_files = list(p.glob('test/*.tiff'))

    for i, filename in tqdm(enumerate(test_files), total=len(test_files)):

        print(f'{i+1} Predicting {filename.stem}')

        # The dataset is intentionally left open: the inspection cell that
        # follows reads from the last opened `dataset`.
        dataset = rasterio.open(filename.as_posix(), transform=identity)

        # Tile coordinates covering the whole image, and the per-pixel vote map
        slices = make_grid(dataset.shape, WINDOW, min_overlap=MIN_OVERLAP)
        preds = np.zeros(dataset.shape, dtype=np.uint8)

        for (x1, x2, y1, y2) in slices:
            # Read all three bands. Reading a single band yields a 2-D array,
            # which the RGB->BGR conversion below cannot handle.
            # NOTE(review): assumes the file exposes 3 bands — images that
            # store channels as subdatasets would need separate handling.
            image = dataset.read([1, 2, 3],
                                 window=Window.from_slices((x1, x2), (y1, y2), boundless=True))
            image = np.moveaxis(image, 0, -1)   # (bands, h, w) -> (h, w, bands)
            image = cv2.resize(image, (NEW_SIZE, NEW_SIZE), interpolation=cv2.INTER_AREA)
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            image = np.expand_dims(image, 0)    # add the batch axis

            # Average prediction of each fold model group
            pred_1 = _mean_prediction(fold_models_1, image)
            pred_2 = _mean_prediction(fold_models_2, image)

            # Both groups contribute equally to the final prediction
            pred = 0.5 * pred_1 + 0.5 * pred_2

            # Back to tile resolution, then add this tile's thresholded vote
            pred = cv2.resize(pred, (WINDOW, WINDOW))
            preds[x1:x2, y1:y2] += (pred > THRESHOLD).astype(np.uint8)

        # A pixel is positive when at least one overlapping tile voted for it
        preds = (preds > 0.5).astype(np.uint8)

        subm[i] = {'id': filename.stem, 'predicted': rle_encode_less_memory(preds)}
        print(np.sum(preds))
        del preds
        gc.collect()

In [None]:
# Visual check of the last tile read by the inference loop, one panel per band.
# matplotlib is imported here because no other cell imports it.
import matplotlib.pyplot as plt

print(slices.shape)
print(dataset.shape)

# Same window for all three bands: the last (x1, x2, y1, y2) of the loop
tile_window = Window.from_slices((x1, x2), (y1, y2))
fig, ax = plt.subplots(1, 3, figsize=(10, 5))
for band, axis in zip((1, 2, 3), ax):
    axis.imshow(dataset.read(band, window=tile_window))

-----
# Save Submission File

In [None]:
# Look at the sample submission to check the expected format

import pandas as pd

sample_sub_path = '../input/hubmap-kidney-segmentation/sample_submission.csv'
sample_sub = pd.read_csv(sample_sub_path)
print(sample_sub.shape)
sample_sub

In [None]:
# Load the dataset information table shipped with the competition data
data_info_path = '../input/hubmap-kidney-segmentation/HuBMAP-20-dataset_information.csv'
data_info = pd.read_csv(data_info_path)
data_info

In [None]:
# run after prediction

# One row per predicted image, in the order the predictions were stored.
# The columns are named explicitly so the CSV still gets its 'id,predicted'
# header when `subm` is empty (prediction skipped or no test files found).
submission = pd.DataFrame(list(subm.values()), columns=['id', 'predicted'])
submission.to_csv('submission.csv', index=False)
submission