# This kernel is a training baseline using the dataset provided by @ayuraj. 
# I have made the dataset into TFRecords which I shall be using for this kernel.

## Dependencies

In [1]:
import os

import numpy as np
import pandas as pd
from glob import glob
import tensorflow as tf
from PIL import ImageFont
from typing import List, Tuple
from collections import Counter
import plotly.graph_objects as go
from matplotlib import pyplot as plt
from plotly.subplots import make_subplots
from kaggle_datasets import KaggleDatasets
import seaborn as sns
import plotly.express as px
import tensorflow_addons as tfa
from glob import glob

In [1]:
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split

We won't need `train_df` here, as the labels we need are already present in the TFRecords themselves.

In [1]:
# Load the single-label cell-level CSV and preview its first rows (inspection only).
train_df = pd.read_csv("../input/hpasinglelabelcellcsv/singlelabelcellonly.csv")
train_df.head()

In [1]:
# Display the DataFrame loaded from the CSV.
train_df

In [1]:
# Look up the GCS path of the Kaggle dataset that holds the TFRecords.
BASE_DIR = KaggleDatasets().get_gcs_path('hpa-single-label-cell-level-tfrecords')

In [1]:
# Sub-directory of the dataset that is globbed for the *.tfrec files.
IMG_DIR = os.path.join(BASE_DIR , 'tfrecords/')



In [1]:
# Show the resolved TFRecord directory.
IMG_DIR

In [1]:
# Collect every *.tfrec file in IMG_DIR (tf.io.gfile is used for the glob).
TRAIN_TFRECORDS = tf.io.gfile.glob(os.path.join(IMG_DIR, '*.tfrec'))

In [1]:
# Show the list of TFRecord files that were found.
TRAIN_TFRECORDS

I have taken the following classes and some other functions from this really neat [kernel](https://www.kaggle.com/soumikrakshit/hpa-baseline-on-tpu). Thanks @soumikrakshit.

In [1]:
class TFRecordLoader:
    """Builds a ``tf.data`` pipeline of ``(image, one-hot label)`` pairs from TFRecord files.

    Every serialized example is parsed with three scalar string features:
    ``image`` (decoded as PNG), ``image_name`` and ``target`` (a number stored
    as text, used as the class index).
    """

    def __init__(self, image_size: List[int], n_classes: int):
        """Store the decoding configuration.

        Args:
            image_size: target size handed to ``tf.image.resize``.
            n_classes: depth of the one-hot label vector.
        """
        self.image_size = image_size
        self.n_classes = n_classes

    def _parse_image(self, image):
        """Decode PNG bytes into a 3-channel float32 image scaled by 1/255 and resized."""
        image = tf.image.decode_png(image, channels=3)
        image = tf.cast(image, dtype=tf.float32) / 255.0
        return tf.image.resize(image, self.image_size)

    def _parse_label(self, label):
        """Turn the textual class index into a one-hot vector of length ``n_classes``."""
        indices = tf.strings.to_number(label)
        # The parsed number is cast to an integer type so it can index one_hot.
        indices = tf.cast(indices, dtype=tf.uint8)
        return tf.one_hot(indices, depth=self.n_classes)

    def _make_example(self, example):
        """Parse one serialized example and return ``(image, label)``."""
        feature_format = {
            'image': tf.io.FixedLenFeature([], dtype=tf.string),
            'image_name': tf.io.FixedLenFeature([], dtype=tf.string),
            'target': tf.io.FixedLenFeature([], dtype=tf.string)
        }
        features = tf.io.parse_single_example(example, features=feature_format)
        # 'image_name' is still parsed (it stays part of the expected schema)
        # but is not returned, so it is no longer bound to an unused local.
        image = self._parse_image(features['image'])
        label = self._parse_label(features['target'])
        return image, label

    def get_dataset(self, train_tfrecord_files: List[str], ignore_order: bool = False):
        """Create the parsed dataset.

        Args:
            train_tfrecord_files: TFRecord file paths to read.
            ignore_order: when True, the dataset is marked non-deterministic
                so elements may be produced in any order.

        Returns:
            A ``tf.data.Dataset`` yielding ``(image, label)`` tuples.
        """
        dataset = tf.data.TFRecordDataset(
            train_tfrecord_files, num_parallel_reads=tf.data.AUTOTUNE)
        if ignore_order:
            # Only build the Options object when it is actually applied.
            options = tf.data.Options()
            options.experimental_deterministic = False
            dataset = dataset.with_options(options)
        return dataset.map(
            map_func=self._make_example, num_parallel_calls=tf.data.AUTOTUNE)

In [1]:
class AugmentationFactory:
    """Applies optional random flips, 90-degree rotations and colour jitter to a dataset.

    The flips and the jitter use TensorFlow's *stateless* random image ops,
    which return the same result whenever they receive the same seed. Each op
    is therefore given its own derived seed so that the individual random
    decisions are not tied to each other.
    """

    def __init__(self, include_flips: bool, include_rotation: bool, include_jitter: bool):
        """Select which augmentations ``_map_augmentations`` applies."""
        self.include_flips = include_flips
        self.include_rotation = include_rotation
        self.include_jitter = include_jitter

    @staticmethod
    def _flip_horizontal(image, seed):
        """Randomly flip the image left/right using the given stateless seed."""
        return tf.image.stateless_random_flip_left_right(image, seed)

    @staticmethod
    def _flip_vertical(image, seed):
        """Randomly flip the image up/down using the given stateless seed."""
        return tf.image.stateless_random_flip_up_down(image, seed)

    @staticmethod
    def _rotate(image):
        """Rotate the image by a random multiple (0-3) of 90 degrees."""
        rotation_k = tf.random.uniform((1,), minval=0, maxval=4, dtype=tf.int32)[0]
        return tf.image.rot90(image, k=rotation_k)

    @staticmethod
    def _random_jitter(image, seed):
        """Randomly perturb saturation, brightness and contrast.

        A different seed (``seed``, ``seed + 1``, ``seed + 2``) is used for
        each of the three ops; reusing one seed would couple their draws.
        """
        # Accept plain sequences as well as tensors before doing seed arithmetic.
        seed = tf.convert_to_tensor(seed)
        image = tf.image.stateless_random_saturation(image, 0.9, 1.1, seed)
        image = tf.image.stateless_random_brightness(image, 0.075, seed + 1)
        image = tf.image.stateless_random_contrast(image, 0.9, 1.1, seed + 2)
        return image

    def _map_augmentations(self, image, label):
        """Augment one ``(image, label)`` element; the label is passed through unchanged."""
        # Base seed drawn from a wide range; the original 0..99 range allowed
        # only a small number of distinct seed pairs.
        seed = tf.random.uniform((2,), minval=0, maxval=2 ** 30, dtype=tf.int32)
        if self.include_flips:
            image = self._flip_horizontal(image=image, seed=seed)
            # Different seed than the horizontal flip: with a shared seed both
            # flips would always be taken (or skipped) together.
            image = self._flip_vertical(image=image, seed=seed + 1)
        if self.include_rotation:
            image = self._rotate(image=image)
        if self.include_jitter:
            # Offset 2 keeps the jitter seeds (2, 3, 4) clear of the flip seeds (0, 1).
            image = self._random_jitter(image=image, seed=seed + 2)
        return image, label

    def augment_dataset(self, dataset):
        """Return ``dataset`` with ``_map_augmentations`` mapped over every element."""
        return dataset.map(
            map_func=self._map_augmentations,
            num_parallel_calls=tf.data.AUTOTUNE
        )

In [1]:
# Preview pipeline: decode the TFRecords at 224x224 with 19 classes and apply
# every available augmentation so the result can be inspected visually.
loader = TFRecordLoader(image_size=[224, 224], n_classes=19)
dataset = loader.get_dataset(TRAIN_TFRECORDS)

augmentation_factory = AugmentationFactory(
    include_flips=True,
    include_rotation=True,
    include_jitter=True,
)
dataset = augmentation_factory.augment_dataset(dataset)

In [1]:
# Show the image (element 0 of the (image, label) tuple) of the first augmented example.
for x in dataset.take(1):
    plt.imshow(x[0])

In [1]:
def get_strategy():
    """Return a distribution strategy for the available accelerator.

    Tries to resolve, connect to and initialise a TPU and returns a
    ``tf.distribute.TPUStrategy`` for it. If resolving the TPU raises
    ``ValueError``, falls back to ``tf.distribute.MirroredStrategy``.
    The number of replicas is printed before returning.
    """
    try:  # detect TPUs
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        # Non-experimental alias; the `experimental.TPUStrategy` name is deprecated.
        strategy = tf.distribute.TPUStrategy(tpu)
    except ValueError:  # detect GPUs
        strategy = tf.distribute.MirroredStrategy()  # for GPU or multi-GPU machines
    print("Number of accelerators: ", strategy.num_replicas_in_sync)
    return strategy

In [1]:
# train_labels= tf.dtypes.cast(train_labels ,  dtype = tf.float32)
# valid_labels = tf.dtypes.cast(valid_labels ,  dtype = tf.float32)

In [1]:
# Other candidate sizes (these match the default input resolutions of EfficientNet B0..B7):
#IMSIZE = (224, 240, 260, 300, 380, 456, 528, 600)
# Side length, in pixels, of the square images produced by the loader for training.
IMSIZE = 224


In [1]:
def configure_train_dataset(augmented_dataset, shuffle_buffer: int = 128, batch_size: int = 16):
    """Shuffle, batch and prefetch the training dataset.

    The dataset is intentionally NOT repeated: ``model.fit`` is called without
    ``steps_per_epoch``, so each epoch must end when the data is exhausted.
    (The previous ``.repeat()`` call had its result overwritten on the next
    line and never took effect; it has been removed.)

    Args:
        augmented_dataset: dataset of ``(image, label)`` elements.
        shuffle_buffer: size of the shuffle buffer.
        batch_size: number of elements per batch.

    Returns:
        The shuffled, batched and prefetched dataset.
    """
    dataset = augmented_dataset.shuffle(shuffle_buffer)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset

In [1]:
def configure_val_dataset(augmented_dataset, shuffle_buffer: int = 128, batch_size: int = 16):
    """Shuffle, batch and prefetch the validation dataset.

    The dataset is intentionally NOT repeated so that a validation pass ends
    when the data is exhausted. (The previous ``.repeat()`` call had its
    result overwritten on the next line and never took effect; it has been
    removed.)

    Args:
        augmented_dataset: dataset of ``(image, label)`` elements.
        shuffle_buffer: size of the shuffle buffer.
        batch_size: number of elements per batch.

    Returns:
        The shuffled, batched and prefetched dataset.
    """
    # NOTE(review): shuffling is kept for backward compatibility; it should
    # not be needed for metrics aggregated over the whole validation set.
    dataset = augmented_dataset.shuffle(shuffle_buffer)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset

Code for the model was taken from [this notebook](https://www.kaggle.com/dschettler8845/hpa-cellwise-classification-training/data).
Thanks @dschettler8845 for all the amazing work in this competition! :)

In [1]:
def get_backbone(efficientnet_name="efficientnet_b0", input_shape=(224,224,3), include_top=False, weights="imagenet", pooling="avg"):
    """Instantiate the Keras EfficientNet variant named in ``efficientnet_name``.

    The name is scanned for the substrings ``"b0"`` ... ``"b7"`` in that order
    and the first one found selects ``tf.keras.applications.EfficientNetB<k>``.

    Args:
        efficientnet_name: string containing the variant tag, e.g. ``"efficientnet_b3"``.
        input_shape, include_top, weights, pooling: forwarded unchanged to the
            selected EfficientNet constructor.

    Returns:
        The constructed backbone model.

    Raises:
        ValueError: if none of ``"b0"`` ... ``"b7"`` occurs in the name.
    """
    for variant in range(8):
        if f"b{variant}" not in efficientnet_name:
            continue
        # Resolve the constructor lazily so only the matched variant is touched.
        build = getattr(tf.keras.applications, f"EfficientNetB{variant}")
        return build(
            include_top=include_top,
            weights=weights,
            pooling=pooling,
            input_shape=input_shape,
        )
    raise ValueError("Invalid EfficientNet Name!!!")


def add_head_to_bb(bb, n_classes=19, dropout=0.05, head_layer_nodes=(512,)):
    """Attach a dense classification head to a backbone model.

    The head is: BatchNorm -> Dropout(dropout), then for every entry of
    ``head_layer_nodes`` a Dense(relu) -> BatchNorm -> Dropout(dropout / 2)
    group, and finally a Dense layer with ``n_classes`` sigmoid outputs.

    Args:
        bb: backbone model; its ``output`` feeds the head and its ``inputs``
            become the inputs of the returned model.
        n_classes: number of output units.
        dropout: dropout rate after the first normalisation; hidden groups use half of it.
        head_layer_nodes: widths of the hidden Dense layers.

    Returns:
        A ``tf.keras.Model`` mapping ``bb.inputs`` to the sigmoid outputs.
    """
    keras_layers = tf.keras.layers

    hidden = keras_layers.BatchNormalization()(bb.output)
    hidden = keras_layers.Dropout(dropout)(hidden)

    for width in head_layer_nodes:
        hidden = keras_layers.Dense(width, activation="relu")(hidden)
        hidden = keras_layers.BatchNormalization()(hidden)
        hidden = keras_layers.Dropout(dropout/2)(hidden)

    predictions = keras_layers.Dense(n_classes, activation="sigmoid")(hidden)
    return tf.keras.Model(inputs=bb.inputs, outputs=predictions)


#eb.compile(optimizer=OPTIMIZER, loss=LOSS_FN, metrics=["acc", tf.keras.metrics.AUC(name="auc", multi_label=True)])

Since we do not know the length of our TFRecords dataset (it is a prefetch dataset), I use the following functions to make a train/test split of our dataset. `is_test` keeps 1 out of every 5 examples, and `is_train` keeps the remaining 4 out of 5. This results in an 80-20 train-test split of our dataset.

In [1]:

def is_test(x, y):
    """Filter predicate: keep elements whose enumeration index ``x`` is a multiple of 5.

    ``y`` (the enumerated element) is ignored.
    """
    return x % 5 == 0

def is_train(x, y):
    """Filter predicate: keep elements whose enumeration index ``x`` is not a multiple of 5.

    Written as a direct comparison (the exact complement of ``is_test``)
    rather than ``not is_test(...)``, so it does not depend on Python's
    ``not`` being applied to a tensor.
    """
    return x % 5 != 0

def recover(x, y):
    """Drop the enumeration index ``x`` and return the original element ``y``."""
    return y



In [1]:
# Pick the distribution strategy; its replica count scales the batch size below.
strategy = get_strategy()

In [1]:
loader = TFRecordLoader(image_size=[IMSIZE, IMSIZE], n_classes=19)

# Read in deterministic order (ignore_order=False). The split below enumerates
# the data in two independent passes and selects elements by index, so index i
# must refer to the same record in both passes. With a non-deterministic read
# order the two subsets could overlap, leaking training data into validation.
dataset = loader.get_dataset(TRAIN_TFRECORDS, ignore_order=False)

# Every 5th element goes to validation, the remaining four to training.
val_dataset = dataset.enumerate() \
                    .filter(is_test) \
                    .map(recover)

train_dataset = dataset.enumerate() \
                    .filter(is_train) \
                    .map(recover)

# Augment the training split only; validation images are left untouched.
augmentation_factory = AugmentationFactory(
    include_flips=True, include_rotation=False, include_jitter=True)

train_dataset = augmentation_factory.augment_dataset(train_dataset)

# Global batch size: 64 per replica.
BATCH_SIZE = 64 * strategy.num_replicas_in_sync
train_dataset = configure_train_dataset(
    train_dataset, batch_size=BATCH_SIZE
)

val_dataset = configure_val_dataset(val_dataset, batch_size = BATCH_SIZE )


In [1]:
    
# Build and compile the model inside the strategy scope so its variables are
# created under the chosen distribution strategy.
with strategy.scope():
    # Tie the backbone input shape to IMSIZE (the size the loader resizes to)
    # instead of relying on get_backbone's default of 224.
    # NOTE(review): the Keras EfficientNet applications document raw [0, 255]
    # pixel values as their expected input, whereas TFRecordLoader._parse_image
    # divides by 255 - confirm that feeding [0, 1] images is intended.
    eff = get_backbone("b0", input_shape=(IMSIZE, IMSIZE, 3))
    model = add_head_to_bb(eff, n_classes=19, dropout=0.5)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.AUC(multi_label=True)])

    model.summary()

In [1]:
#steps_per_epoch = train_paths.shape[0] // BATCH_SIZE

# Keep only the weights with the lowest validation loss seen so far.
# NOTE(review): the file is called 'effb7model.h5' although the model built in
# this notebook uses the "b0" backbone - confirm before renaming the artifact.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'effb7model.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Reduce the learning rate after 3 epochs without val_loss improvement, never below 1e-6.
lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    mode='min',
    patience=3,
    min_lr=1e-6,
)


### Training the Model

The number of steps per epoch is unknown to us because the length of the dataset is unknown. However, it is calculated automatically by TF after the first epoch, so only the first epoch will show x/unknown for the number of steps during the run.

In [1]:
# Train for 10 epochs; no steps_per_epoch is given, so an epoch ends when
# train_dataset is exhausted.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=[checkpoint, lr_reducer],
    verbose=1,
)

In [1]:
# Persist the per-epoch values recorded by model.fit (history.history) as a CSV file.
hist_df = pd.DataFrame(history.history)
hist_df.to_csv('history.csv')

In [1]:

# Route the save I/O through the local host ('/job:localhost').
# NOTE(review): presumably needed so that a model trained on a remote TPU can
# be written to the notebook's local disk - confirm.
save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
model.save('./model', options=save_locally) # saving in Tensorflow's "SavedModel" format