[Modelling Notebook](https://www.kaggle.com/venkat555/ranzcr-clip-tpu-densenet-with-kfold/)

**Credits** 
* Flowers TPU Notebook 
* Fellow Kagglers - All the amazing posts and kernels to learn from

## Dependencies

In [None]:
import math, os, re, warnings, random, glob
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K
from tensorflow.keras import Sequential
from kaggle_datasets import KaggleDatasets

### Hardware configuration

In [None]:
# TPU or GPU detection
# Detect hardware, return appropriate distribution strategy
try:
    # The except branch below treats a ValueError from the resolver as
    # "no TPU available" and falls back to the non-TPU path.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f'Running on TPU {tpu.master()}')
except ValueError:
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): newer TensorFlow releases expose this as
    # tf.distribute.TPUStrategy — confirm against the installed version.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # No TPU: use whatever distribution strategy is currently active.
    strategy = tf.distribute.get_strategy()

# Sentinel passed to tf.data calls (num_parallel_calls, prefetch, ...) below.
AUTO = tf.data.experimental.AUTOTUNE
# Number of replicas; the batch size further down is scaled by this value.
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

# Model parameters

In [None]:
# Global batch size: 16 examples per replica.
BATCH_SIZE = 16 * REPLICAS
HEIGHT = 512
WIDTH = 512
CHANNELS = 3
# NOTE(review): the submission cell writes 11 target columns, and N_CLASSES is
# not read anywhere in the visible code; the value is left untouched — confirm.
N_CLASSES = 5
TTA_STEPS = 3  # Do TTA if > 0
# At this size, a GPU will run out of memory. Use the TPU.
# For GPU training, please select 224 x 224 px image size.
# Derived from HEIGHT / WIDTH so the two spellings of the size cannot drift apart.
IMAGE_SIZE = [HEIGHT, WIDTH]
SEED = 555
# Batch size used for the intermediate batch/unbatch stage of the training pipeline.
AUG_BATCH = BATCH_SIZE

## Augmentation

In [None]:
def data_augment(image, label):
    """Apply random rotation, flips, pad-and-crop jitter and colour changes.

    NOTE: a later definition of ``data_augment`` in this file replaces this
    one when the notebook is executed top to bottom.

    Args:
        image: image tensor. The random crop below requests a
            ``[IMG_SIZE, IMG_SIZE, 3]`` slice, so a single 3-channel image is
            assumed — TODO confirm against callers.
        label: returned unchanged.

    Returns:
        Tuple ``(augmented_image, label)``.
    """
    # data augmentation. Thanks to the dataset.prefetch(AUTO) statement in the next function (below),
    # this happens essentially for free on TPU. Data pipeline code is executed on the "CPU" part
    # of the TPU while the TPU itself is computing gradients.
    # RandomCrop, VFlip, HFlip, RandomRotate

    # Draw the number of quarter turns inside the graph. A NumPy draw would be
    # evaluated only once, when tf.data traces this function, so every element
    # would receive the same rotation.
    quarter_turns = tf.random.uniform([], minval=0, maxval=4, dtype=tf.int32)
    image = tf.image.rot90(image, k=quarter_turns)
    image = tf.image.random_flip_left_right(image, seed=SEED)
    image = tf.image.random_flip_up_down(image, seed=SEED)
    IMG_SIZE = IMAGE_SIZE[0]
    # Crop or pad to 6 pixels more than the target size
    image = tf.image.resize_with_crop_or_pad(image, IMG_SIZE + 6, IMG_SIZE + 6)
    # Random crop back to the target size (this also fixes the static shape)
    image = tf.image.random_crop(image, size=[IMG_SIZE, IMG_SIZE, 3])
    image = tf.image.random_brightness(image, max_delta=0.5)  # Random brightness
    image = tf.image.random_saturation(image, 0, 2, seed=SEED)
    image = tf.image.adjust_saturation(image, 3)

    #image = tf.image.central_crop(image, central_fraction=0.5)
    return image, label

## Auxiliary Functions

In [None]:
def to_float32_2(image, label):
    """Cast the image to float32 and harden the label along its last axis.

    Every label entry equal to the maximum of its last axis becomes 1, every
    other entry becomes 0; the result is returned as int32.
    """
    peak = tf.reduce_max(label, axis=-1, keepdims=True)
    hard_label = tf.cast(tf.equal(label, peak), tf.int32)
    return tf.cast(image, tf.float32), hard_label

def to_float32(image, label):
    """Return the image cast to float32 together with the untouched label."""
    image_f32 = tf.cast(image, tf.float32)
    return image_f32, label

def decode_image(image_data):
    """Decode JPEG bytes into a float32 RGB tensor of shape [1024, 1024, 3].

    Pixel values are divided by 255 so they land in the [0, 1] range.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    # explicit size needed for TPU
    return tf.reshape(scaled, [1024, 1024, 3])



# Create a dictionary describing the features.


def read_labeled_tfrecord(example):
    """Parse one serialized labeled example into an ``(image, label)`` pair.

    The label is a scalar int32: the position, within ``label_columns`` below,
    of the last column whose stored value equals 1, or 0 when no column is set.
    """
    # Order matters: a column's position here is the class index it produces.
    label_columns = (
        "ETT - Abnormal",
        "ETT - Borderline",
        "ETT - Normal",
        "NGT - Abnormal",
        "NGT - Borderline",
        "NGT - Incompletely Imaged",
        "NGT - Normal",
        "CVC - Abnormal",
        "CVC - Borderline",
        "CVC - Normal",
        "Swan Ganz Catheter Present",
    )

    # Feature schema: one int64 flag per label column plus the two string fields.
    schema = {column: tf.io.FixedLenFeature([], tf.int64) for column in label_columns}
    schema["StudyInstanceUID"] = tf.io.FixedLenFeature([], tf.string)
    schema["image"] = tf.io.FixedLenFeature([], tf.string)

    parsed = tf.io.parse_single_example(example, schema)
    image = decode_image(parsed['image'])

    flags = tf.stack([parsed[column] for column in label_columns])
    positions = tf.range(len(label_columns), dtype=tf.int32)
    # Highest position whose flag is 1; positions are non-negative, so the
    # maximum falls back to 0 when nothing is flagged.
    label = tf.reduce_max(
        tf.where(tf.equal(flags, 1), positions, tf.zeros_like(positions))
    )
    return image, label

def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled example into ``(image, StudyInstanceUID)``.

    The decoded image is resized to a square whose side is ``IMAGE_SIZE[0]``.
    """
    schema = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "StudyInstanceUID": tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example, schema)
    side = IMAGE_SIZE[0]
    resized = tf.image.resize(decode_image(parsed['image']), [side, side])
    return resized, parsed['StudyInstanceUID']

def load_dataset(filenames, labeled=True, ordered=False):
    """Build a parsed dataset from TFRecord files.

    Args:
        filenames: TFRecord file paths to read.
        labeled: parse with ``read_labeled_tfrecord`` when True, otherwise
            with ``read_unlabeled_tfrecord``.
        ordered: when False, deterministic ordering is switched off so that
            records are used as soon as they stream in.

    Returns:
        A dataset of (image, label) pairs if ``labeled`` is True, or
        (image, id) pairs otherwise.
    """
    options = tf.data.Options()
    if not ordered:
        # disable order, increase speed
        options.experimental_deterministic = False

    parser = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord
    return (
        # num_parallel_reads interleaves reads from multiple files
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
        .with_options(options)
        .map(parser, num_parallel_calls=AUTO)
    )

def data_augment(image, label):
    """Randomly rotate, flip and colour-jitter an image or a batch of images.

    Args:
        image: image tensor. This file maps the function over single images
            (training) and over batches (test-time augmentation).
        label: second element of the dataset tuple, returned unchanged.

    Returns:
        Tuple ``(augmented_image, label)``.
    """
    # data augmentation. Thanks to the dataset.prefetch(AUTO) statement in the next function (below),
    # this happens essentially for free on TPU. Data pipeline code is executed on the "CPU" part
    # of the TPU while the TPU itself is computing gradients.
    # RandomCrop, VFlip, HFlip, RandomRotate
    input_shape = image.shape

    # Draw the number of quarter turns inside the graph. A NumPy draw would be
    # evaluated only once, when tf.data traces this function, so every element
    # would receive the same rotation.
    quarter_turns = tf.random.uniform([], minval=0, maxval=4, dtype=tf.int32)
    image = tf.image.rot90(image, k=quarter_turns)
    # rot90 does not guarantee a static height/width on its output. A square
    # input has the same shape after any number of quarter turns, so pin the
    # shape back for downstream consumers that need static shapes.
    if input_shape.rank is not None and input_shape[-3] == input_shape[-2]:
        image.set_shape(input_shape)

    image = tf.image.random_flip_left_right(image, seed=SEED)
    image = tf.image.random_flip_up_down(image, seed=SEED)
    IMG_SIZE = IMAGE_SIZE[0]
    # Add 6 pixels of padding
    #image = tf.image.resize_with_crop_or_pad(image, IMG_SIZE + 6, IMG_SIZE + 6)
    # Random crop back to the original size
    #image = tf.image.random_crop(image, size=[IMG_SIZE, IMG_SIZE, 3])
    image = tf.image.random_brightness(image, max_delta=0.5)  # Random brightness
    image = tf.image.random_saturation(image, 0, 2, seed=SEED)
    image = tf.image.adjust_saturation(image, 3)

    #image = tf.image.central_crop(image, central_fraction=0.5)
    return image, label

def get_training_dataset():
    """Build the training pipeline from ``TRAINING_FILENAMES``.

    Steps, in order: parse labeled records, augment, repeat endlessly,
    shuffle with a 2048-element buffer, batch, prefetch.

    NOTE: a second ``get_training_dataset`` defined later in this file
    replaces this one when the notebook is executed top to bottom.
    """
    labeled_examples = load_dataset(TRAINING_FILENAMES, labeled=True)
    return (
        labeled_examples
        .map(data_augment, num_parallel_calls=AUTO)
        .repeat()  # the training dataset must repeat for several epochs
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)  # prefetch next batch while training
    )

def get_training_dataset(dataset, do_aug=True, do_onehot=False):
    """Turn an already-parsed dataset into the training pipeline.

    Args:
        dataset: dataset of (image, label) pairs.
        do_aug: currently has no effect; the batch-level ``transform`` step it
            used to control is disabled.
        do_onehot: when True, ``onehot`` is mapped over the intermediate
            batches (``onehot`` is not defined in the visible part of this
            file — presumably provided elsewhere).

    Returns:
        The augmented, repeated, shuffled, batched and prefetched dataset.
    """
    staged = (
        dataset
        .map(data_augment, num_parallel_calls=AUTO)
        .repeat()  # the training dataset must repeat for several epochs
        .batch(AUG_BATCH)
    )
    # Batch-level augmentation (`transform`) would be applied here, AFTER batching.
    if do_onehot:
        staged = staged.map(onehot, num_parallel_calls=AUTO)

    return (
        staged
        .unbatch()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)  # prefetch next batch while training
    )

def get_validation_dataset(ordered=False):
    """Build the batched, cached and prefetched validation pipeline.

    Reads labeled records from ``VALIDATION_FILENAMES``; ``ordered`` is
    forwarded to ``load_dataset``.
    """
    return (
        load_dataset(VALIDATION_FILENAMES, labeled=True, ordered=ordered)
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTO)  # prefetch next batch while training
    )

def get_test_dataset(ordered=False, tta=False):
    """Build the batched test pipeline of (image, id) pairs.

    Args:
        ordered: forwarded to ``load_dataset``.
        tta: when True, ``data_augment`` is mapped over each batch
            (test-time augmentation).
    """
    batches = load_dataset(TEST_FILENAMES, labeled=False, ordered=ordered).batch(BATCH_SIZE)
    prepared = batches.map(data_augment, num_parallel_calls=AUTO) if tta else batches
    # prefetch next batch while the current one is consumed
    return prepared.prefetch(AUTO)

def count_data_items(filenames):
    """Return the total number of records encoded in TFRecord file names.

    The number of data items is written in the name of each .tfrec file,
    i.e. flowers00-230.tfrec = 230 data items.

    Args:
        filenames: iterable of file names or paths containing a
            ``-<count>.`` marker.

    Returns:
        The sum of all counts as a NumPy integer (0 for an empty input).

    Raises:
        ValueError: if a file name carries no ``-<count>.`` marker.
    """
    # Compiled once instead of once per file name.
    count_pattern = re.compile(r"-([0-9]+)\.")
    counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            raise ValueError(
                f"no '-<count>.' marker in TFRecord file name: {filename!r}"
            )
        counts.append(int(match.group(1)))
    # An explicit dtype keeps the result an integer even for an empty list.
    return np.sum(counts, dtype=np.int64)


## Load Test Data

In [None]:
# Root folder of the competition data (presumably the Kaggle input mount — confirm).
database_base_path = '/kaggle/input/ranzcr-clip-catheter-line-classification/'
# Sample submission, loaded and shown for reference.
submission = pd.read_csv(f'{database_base_path}sample_submission.csv')
display(submission.head())  # `display` is not imported in this file; presumably supplied by the notebook runtime
TEST_FILENAMES = tf.io.gfile.glob(f'{database_base_path}test_tfrecords/*.tfrec') # TFRecord files to predict on
# The record count is parsed from the file names, not by reading the files.
NUM_TEST_IMAGES = count_data_items(TEST_FILENAMES)
print(f'GCS: test: {NUM_TEST_IMAGES}')

## List loaded models

In [None]:
# Collect the saved model files in sorted order.
model_path_list = sorted(glob.glob('/kaggle/input/ranzcr-clip/model*.h5'))

print('Models to predict:')
print('\n'.join(model_path_list))

## Test set predictions

In [None]:
from tensorflow import keras

# Load every model listed in `model_path_list`; the prediction cell averages
# over all of them.
models = []
for model_path in model_path_list:
    print(model_path)
    K.clear_session()
    # Load under the distribution strategy selected in the hardware cell.
    # Variables created outside the scope are not distributed, so the detected
    # strategy would go unused at prediction time. For the default strategy
    # the scope is a no-op.
    with strategy.scope():
        models.append(keras.models.load_model(model_path))

## Generate Predictions

In [None]:
print(" TTA_STEPS = {} ".format(TTA_STEPS))


def _ensemble_predict(images_ds):
    """Return the element-wise mean of every loaded model's predictions."""
    return np.average([model.predict(images_ds) for model in models], axis=0)


if TTA_STEPS > 0:
    # Keep the ensemble prediction of every TTA step and average them at the
    # end, so that every step contributes to the result instead of only the
    # last one.
    tta_probabilities = []
    for step in range(TTA_STEPS):
        # ordered=True keeps rows aligned across steps and with the ids read
        # from `test_ds` in the submission cell.
        test_ds = get_test_dataset(ordered=True, tta=True)
        print(f'TTA step {step+1}/{TTA_STEPS}')
        test_images_ds = test_ds.map(lambda image, image_name: image)
        tta_probabilities.append(_ensemble_predict(test_images_ds))
    probabilities = np.average(tta_probabilities, axis=0)
else:
    # TTA disabled: predict once on the un-augmented images with all models.
    test_ds = get_test_dataset(ordered=True, tta=False)
    test_images_ds = test_ds.map(lambda image, image_name: image)
    probabilities = _ensemble_predict(test_images_ds)



## Generate submission file

In [None]:
print('Generating submission.csv file...')
# Pull every id out of the test dataset as one batch of unicode strings.
test_ids_ds = test_ds.map(lambda image, idnum: idnum).unbatch()
test_ids = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U')

# One record per test image: the id followed by one probability per column.
probability_columns = [probabilities[:, col] for col in range(probabilities.shape[1])]
submission_records = np.rec.fromarrays([test_ids] + probability_columns)
# One string column for the id, then eleven float columns.
column_formats = ['%s'] + ['%f'] * 11
np.savetxt(
    'submission.csv',
    submission_records,
    fmt=column_formats,
    delimiter=',',
    header='StudyInstanceUID,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,Swan Ganz Catheter Present',
    comments='',
)


In [None]:
# Show the first lines of the generated file as a quick sanity check.
!head submission.csv