# ***Disclaimer:*** 
Hello Kagglers! I am a Solution Architect with the Google Cloud Platform. I am a coach for this competition; the focus of my contributions is on helping users leverage GCP components (GCS, TPUs, BigQuery, etc.) in order to solve large problems. My ideas and contributions represent my own opinion and are not representative of an official recommendation by Google. Also, I try to develop notebooks quickly in order to help users early in competitions. There may be better ways of solving particular problems, so I welcome comments and suggestions. Use my contributions at your own risk: I don't guarantee that they will help in winning any competition, but I am hoping to learn by collaborating with everyone.

# Objective:
The objective of this notebook is to demonstrate how to build a TFRecord Dataset designed to be used with a TPU Accelerator. In previous notebooks (see list below) I built a TFRecord dataset using one 512x512 tile per file. This resulted in thousands of files, which severely hurt TPU performance. It turns out that the recommended TFRecord file size for feeding TPUs is about 100 MB. So, I have repackaged the previously built TFRecord Dataset, packing 256 tiles of 512x512 into each TFRecord file, which amounts to about 170 MB per file. In this notebook I group tiles by type into 3 datasets:
1) Tiles with Gloms
2) Tiles with NoGloms
3) Tiles with both Gloms and NoGloms

The datasets are stored in the following public dataset (hubmap-large-records):
[https://www.kaggle.com/marcosnovaes/hubmap-large-records](https://www.kaggle.com/marcosnovaes/hubmap-large-records)

The 3 datasets consist of several file parts. I illustrate how to use that dataset with a TPU enabled Keras model in this Notebook:
[https://www.kaggle.com/marcosnovaes/hubmap-unet-keras-model-fit-with-tpu/](https://www.kaggle.com/marcosnovaes/hubmap-unet-keras-model-fit-with-tpu/)

Previous Notebooks in this competition:

https://www.kaggle.com/marcosnovaes/hubmap-3-unet-models-with-keras-cpu-gpu/: Investigates three implementations of the Unet model

https://www.kaggle.com/marcosnovaes/hubmap-read-data-and-build-tfrecords/: Demonstrates how the TFRecord Dataset was built

https://www.kaggle.com/marcosnovaes/hubmap-looking-at-tfrecords/: Explains how to read the data using the TFRecord Dataset


In [None]:
import os
import sys
import random
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from tqdm import tqdm

import tensorflow as tf

In [None]:
# Utilities to serialize data into a TFRecord
def _bytes_feature(value):
  """Wrap a single string / bytes value in a tf.train.Feature holding a BytesList."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList won't unpack a string from an EagerTensor, so extract the raw value first.
  raw = value.numpy() if isinstance(value, eager_tensor_type) else value
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _float_feature(value):
  """Wrap a single float / double in a tf.train.Feature holding a FloatList."""
  floats = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=floats)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a tf.train.Feature holding an Int64List."""
  ints = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=ints)

In [None]:
def image_example(image, mask):
    """Build a tf.train.Example holding one image tile and its mask.

    Args:
        image: array-like exposing ``.shape`` (indexed at 0, 1 and 2 as
            height, width and channel count) and ``.tobytes()``.
        mask: array-like exposing ``.tobytes()``.

    Returns:
        A tf.train.Example with the features ``height``, ``width``,
        ``num_channels``, ``img_bytes`` and ``mask``.
    """
    # NOTE(review): image_feature_description in this file also lists
    # img_index / tile_id / tile_col_pos / tile_row_pos, which are not written
    # here -- confirm that readers of these records do not require them.
    image_shape = image.shape

    # tobytes() replaces tostring(), which is only a deprecated alias of it
    # and is no longer available in recent NumPy releases.
    img_bytes = image.tobytes()
    mask_bytes = mask.tobytes()

    feature = {
        'height': _int64_feature(image_shape[0]),
        'width': _int64_feature(image_shape[1]),
        'num_channels': _int64_feature(image_shape[2]),
        'img_bytes': _bytes_feature(img_bytes),
        'mask' : _bytes_feature(mask_bytes),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [None]:
def create_tfrecord(image, mask, output_path):
    """Write one image/mask pair as a single-record, GZIP-compressed TFRecord file.

    Args:
        image: image array passed through to ``image_example``.
        mask: mask array passed through to ``image_example``.
        output_path: path of the TFRecord file to create.
    """
    opts = tf.io.TFRecordOptions(compression_type="GZIP")
    # The context manager closes the writer on exit, so no explicit close()
    # call is needed afterwards.
    with tf.io.TFRecordWriter(output_path, opts) as writer:
        tf_example = image_example(image, mask)
        writer.write(tf_example.SerializeToString())

In [None]:
# Feature schema used by tf.io.parse_single_example when reading records:
# every entry is a scalar, either an int64 or a byte string.
_INT64_SCALAR = tf.io.FixedLenFeature([], tf.int64)
_STRING_SCALAR = tf.io.FixedLenFeature([], tf.string)

image_feature_description = {
    'img_index': _INT64_SCALAR,
    'height': _INT64_SCALAR,
    'width': _INT64_SCALAR,
    'num_channels': _INT64_SCALAR,
    'img_bytes': _STRING_SCALAR,
    'mask': _STRING_SCALAR,
    'tile_id': _INT64_SCALAR,
    'tile_col_pos': _INT64_SCALAR,
    'tile_row_pos': _INT64_SCALAR,
}

def _parse_image_and_masks_function(example_proto):
    """Decode one serialized example into an ``(image, mask)`` tensor pair.

    Args:
        example_proto: serialized tf.train.Example, parsed against
            ``image_feature_description``.

    Returns:
        Tuple ``(img_array, mask)``: ``img_array`` is the uint8 image reshaped
        to (512, 512, 3) and ``mask`` is the bool mask reshaped to (512, 512).
        No normalization or float cast is applied here.
    """
    single_example = tf.io.parse_single_example(example_proto, image_feature_description)

    img_bytes = tf.io.decode_raw(single_example['img_bytes'], out_type='uint8')

    # The shape is fixed rather than taken from the stored height / width /
    # num_channels features, so those features are not read at all.
    img_array = tf.reshape(img_bytes, (512, 512, 3))

    mask_bytes = tf.io.decode_raw(single_example['mask'], out_type='bool')
    mask = tf.reshape(mask_bytes, (512, 512))

    return img_array, mask

def read_dataset(storage_file_path):
    """Return a dataset of parsed ``(image, mask)`` pairs read from GZIP-compressed TFRecord file(s)."""
    return tf.data.TFRecordDataset(
        storage_file_path, compression_type="GZIP"
    ).map(_parse_image_and_masks_function)

In [None]:
# List the contents of the /kaggle/input directory.
!ls /kaggle/input

In [None]:
# List the contents of the hubmap-tfrecord-512 input directory, where the tile CSV files used below live.
!ls /kaggle/input/hubmap-tfrecord-512/

In [None]:
# Paths of the tile CSV files in the hubmap-tfrecord-512 input directory.
# The train CSV is loaded with pandas further down; its mask_density,
# lowband_density and local_path columns drive the tile selection.
train_tiles_csv = '/kaggle/input/hubmap-tfrecord-512/train_all_tiles.csv'
test_tiles_csv = '/kaggle/input/hubmap-tfrecord-512/test_all_tiles.csv'

In [None]:
# Load the train tile table and keep only the tiles that have gloms in them
# (rows whose mask_density is greater than zero).
train_tiles_df = pd.read_csv(train_tiles_csv)
has_gloms = train_tiles_df["mask_density"] > 0
train_gloms_df = train_tiles_df.loc[has_gloms]
train_gloms_df.head()

In [None]:
# Number of tiles that have gloms; len() is the idiomatic spelling of __len__().
len(train_gloms_df)

In [None]:
# Keep only the tiles whose lowband_density exceeds 1000.
above_lowband_threshold = train_tiles_df["lowband_density"] > 1000
train_cropped_df = train_tiles_df.loc[above_lowband_threshold]
train_cropped_df.head()

In [None]:
# Number of tiles that passed the lowband_density filter.
len(train_cropped_df)

In [None]:
# Tiles that passed the lowband_density filter and whose mask_density is 0.
no_gloms_df = train_cropped_df.loc[train_cropped_df["mask_density"] == 0]
len(no_gloms_df)

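Shuffle all the dataframes. This helps for training later. (Note: the shuffle calls in the next cell are currently commented out, so the dataframes keep their original row order.)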

In [None]:
# Shuffle the data frames.
# NOTE: the three sample(frac=1) calls below are commented out, so no shuffling
# actually happens and the frames keep the row order they were filtered in.
#train_gloms_df = train_gloms_df.sample(frac=1)
#train_cropped_df = train_cropped_df.sample(frac=1)
#no_gloms_df = no_gloms_df.sample(frac=1)
train_gloms_df.head()

In [None]:
# Read selected ranges of the dataframes into train and validation datasets.
# Use about 10% for validation.

# Each validation slice starts exactly where the matching train slice ends, so
# no row is dropped (the glom split used to start at 3001 and skipped row 3000).
train_glom_files = train_gloms_df[0:3000]['local_path']
validation_glom_files = train_gloms_df[3000:]['local_path']

train_cropped_files = train_cropped_df[0:13500]['local_path']
validation_cropped_files = train_cropped_df[13500:]['local_path']

train_no_glom_files = no_gloms_df[0:11000]['local_path']
validation_no_glom_files = no_gloms_df[11000:]['local_path']

# Turn each list of per-tile TFRecord paths into a parsed (image, mask) dataset.
train_glom_dataset = read_dataset(train_glom_files)
validation_glom_dataset = read_dataset(validation_glom_files)

train_cropped_dataset = read_dataset(train_cropped_files)
validation_cropped_dataset = read_dataset(validation_cropped_files)

train_no_glom_dataset = read_dataset(train_no_glom_files)
validation_no_glom_dataset = read_dataset(validation_no_glom_files)


In [None]:
def write_dataset(dataset, records_per_part, prefix):
    """Re-pack a dataset of (image, mask) pairs into multi-record TFRecord parts.

    Records are written GZIP-compressed to ``<prefix>_part<N>.tfrecords``,
    with N starting at 0. Part 0 is always created, even for an empty dataset.

    Args:
        dataset: object whose ``as_numpy_iterator()`` yields ``(image, mask)``
            pairs accepted by ``image_example``.
        records_per_part: maximum number of records per part file; must be >= 1.
        prefix: output path prefix, without the ``_part<N>.tfrecords`` suffix.

    Raises:
        ValueError: if ``records_per_part`` is smaller than 1.
    """
    if records_per_part < 1:
        raise ValueError("records_per_part must be at least 1")

    opts = tf.io.TFRecordOptions(compression_type="GZIP")
    part_num = 0
    num_records = 0
    output_path = prefix + '_part{}.tfrecords'.format(part_num)
    writer = tf.io.TFRecordWriter(output_path, opts)

    try:
        for image, mask in dataset.as_numpy_iterator():
            # Roll over only when another record arrives for a full part. This
            # puts exactly records_per_part records in every full part (the old
            # check fired one record early) and never leaves an empty trailing
            # part when the total is an exact multiple.
            if num_records == records_per_part:
                print("wrote part #{}".format(part_num))
                writer.close()
                part_num += 1
                output_path = prefix + '_part{}.tfrecords'.format(part_num)
                writer = tf.io.TFRecordWriter(output_path, opts)
                num_records = 0

            tf_example = image_example(image, mask)
            writer.write(tf_example.SerializeToString())
            num_records += 1
    finally:
        # Close the current part even if reading or writing raised.
        writer.close()

In [None]:
# Export every dataset as 256-record part files under /kaggle/working.
# Each entry is (progress message, dataset, output path prefix).
_export_jobs = [
    ("writing train_glom_dataset", train_glom_dataset, '/kaggle/working/train_gloms'),
    ("writing validation_glom_dataset", validation_glom_dataset, '/kaggle/working/validation_gloms'),
    ("writing train_cropped_dataset", train_cropped_dataset, '/kaggle/working/train_cropped'),
    ("writing validation_cropped_dataset", validation_cropped_dataset, '/kaggle/working/validation_cropped'),
    ("writing train_no_glom_dataset", train_no_glom_dataset, '/kaggle/working/train_no_gloms'),
    ("writing validation_no_glom_dataset", validation_no_glom_dataset, '/kaggle/working/validation_no_gloms'),
]
for _message, _dataset, _prefix in _export_jobs:
    print(_message)
    write_dataset(_dataset, 256, _prefix)

In [None]:
# Long listing of /kaggle/working, the directory the part files above are written to.
!ls -l /kaggle/working