In [None]:
!pip install -q efficientnet
!pip install tensorflow_addons
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import efficientnet.tfkeras as efn
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
from tensorflow.keras import backend as K
import tensorflow_addons as tfa
from kaggle_datasets import KaggleDatasets

In [None]:
# Look for a TPU. The resolver takes no parameters when the TPU_NAME environment
# variable is set, which is always the case on Kaggle.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The code treats a ValueError here as "no TPU available"
    tpu = None

if not tpu:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster, initialise it, and build a TPU strategy on top
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Let tf.data choose parallelism and prefetch buffer sizes automatically
AUTO = tf.data.experimental.AUTOTUNE

# Data access: GCS path of the Kaggle dataset that holds the training TFRecords
GCS_PATH = KaggleDatasets().get_gcs_path('ranzcr-tf-records-896-stratified')

# Configuration
# Number of epochs each fold is trained for
EPOCHS = 14
# Global batch size: 10 examples per replica of the distribution strategy
BATCH_SIZE = 10 * strategy.num_replicas_in_sync
# Size every decoded image is resized to
IMAGE_SIZE = [896, 896]
# Seed used for the random generators and the KFold split
SEED = 123
# Learning rate given to the Adam optimizer
LR = 0.001
# Test time augmentation rounds (augmented prediction passes per validation fold)
TTA = 5
# Verbosity passed to Keras fit and callbacks
VERBOSE = 2
# Number of classes (target labels / output units of the model)
N_CLASSES = 11

# Every .tfrec file found directly under GCS_PATH
TRAINING_FILENAMES = tf.io.gfile.glob(GCS_PATH + '/*.tfrec')

In [None]:
# Function to seed everything
def seed_everything(seed):
    """Seed the Python, NumPy and TensorFlow random generators with ``seed``.

    The seed is also written to the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The three generators are independent, so one loop seeds them all
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

# Data augmentation function
def data_augment(image, StudyInstanceUID, targets):
    """Randomly flip and colour-jitter one image.

    Only ``image`` is altered. ``StudyInstanceUID`` and ``targets`` are returned
    untouched so the function can be mapped over (image, id, targets) datasets.
    """
    # Random left/right mirroring followed by random up/down mirroring
    flipped = tf.image.random_flip_up_down(tf.image.random_flip_left_right(image))
    # Photometric jitter in a fixed order: hue, saturation, contrast, brightness
    jittered = tf.image.random_hue(flipped, max_delta = 0.01)
    jittered = tf.image.random_saturation(jittered, lower = 0.70, upper = 1.30)
    jittered = tf.image.random_contrast(jittered, lower = 0.80, upper = 1.20)
    jittered = tf.image.random_brightness(jittered, max_delta = 0.10)
    return jittered, StudyInstanceUID, targets

# Function to decode our images
def decode_image(image_data):
    """Turn encoded JPEG bytes into a float32 image divided by 255.

    The JPEG is decoded with 3 channels and resized to IMAGE_SIZE before scaling.
    """
    pixels = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(pixels, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# This function parses one labeled example: image, id and the target vector
def read_labeled_tfrecord(example):
    """Parse a serialized example into ``(image, StudyInstanceUID, targets)``.

    ``image`` is produced by ``decode_image``; ``StudyInstanceUID`` is the raw
    string feature; ``targets`` stacks the label features, cast to float32, in
    the order listed in ``label_names`` below.
    """
    # The order of this tuple defines the column order of the target vector
    label_names = (
        "ETT - Abnormal",
        "ETT - Borderline",
        "ETT - Normal",
        "NGT - Abnormal",
        "NGT - Borderline",
        "NGT - Incompletely Imaged",
        "NGT - Normal",
        "CVC - Abnormal",
        "CVC - Borderline",
        "CVC - Normal",
        "Swan Ganz Catheter Present",
    )

    # Two scalar string features plus one scalar int64 feature per label
    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "StudyInstanceUID": tf.io.FixedLenFeature([], tf.string),
    }
    feature_spec.update({label: tf.io.FixedLenFeature([], tf.int64) for label in label_names})

    parsed = tf.io.parse_single_example(example, feature_spec)
    image = decode_image(parsed['image'])
    targets = tf.stack([tf.cast(parsed[label], tf.float32) for label in label_names])
    return image, parsed['StudyInstanceUID'], targets

# This function loads TF Records and parses them into tensors
def load_dataset(filenames, ordered = False):
    """Build a dataset of parsed ``(image, StudyInstanceUID, targets)`` examples.

    When ``ordered`` is false the dataset options disable deterministic
    ordering; when it is true the default options are left untouched.
    """
    read_options = tf.data.Options()
    if not ordered:
        read_options.experimental_deterministic = False

    return (
        tf.data.TFRecordDataset(filenames, num_parallel_reads = AUTO)
        .with_options(read_options)
        .map(read_labeled_tfrecord, num_parallel_calls = AUTO)
    )

# This function is to get our training tensors
def get_training_dataset(filenames, ordered = False):
    """Return the endless, augmented, shuffled and batched training dataset.

    Pipeline order: parse -> augment -> repeat -> shuffle(2048) -> batch -> prefetch.
    """
    return (
        load_dataset(filenames, ordered = ordered)
        .map(data_augment, num_parallel_calls = AUTO)
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

# This function is to get our validation tensors
def get_validation_dataset(filenames, ordered = True):
    """Return a single-pass, un-augmented, batched validation dataset."""
    parsed = load_dataset(filenames, ordered = ordered)
    return parsed.batch(BATCH_SIZE).prefetch(AUTO)

# This function is to get our validation time augmentation tensors
def get_vta(filenames, ordered = True):
    """Return an endless, augmented, batched dataset for validation-time augmentation.

    Unlike the training dataset there is no shuffling: examples are augmented,
    repeated indefinitely, batched and prefetched.
    """
    return (
        load_dataset(filenames, ordered = ordered)
        .map(data_augment, num_parallel_calls = AUTO)
        .repeat()
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

# Function to count how many examples a list of .tfrec files holds
def count_data_items(filenames):
    """Return the total number of data items encoded in the given file names.

    The number of data items is written in the name of the .tfrec files,
    i.e. flowers00-230.tfrec = 230 data items.

    Args:
        filenames: Iterable of file names or paths that contain a
            ``-<count>.`` segment.

    Returns:
        The sum of the per-file counts as an int (0 when ``filenames`` is empty).

    Raises:
        ValueError: If a file name has no ``-<digits>.`` segment to read the
            count from.
    """
    # Compiled once; requires at least one digit so an empty count never matches
    count_pattern = re.compile(r"-([0-9]+)\.")
    total = 0
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None:
            raise ValueError(f"Cannot read the item count from file name: {filename!r}")
        total += int(match.group(1))
    return total


# Total number of examples across all training files, read from the file names
NUM_TRAINING_IMAGES = count_data_items(TRAINING_FILENAMES)
print(f'Dataset: {NUM_TRAINING_IMAGES} training images')

In [None]:
# Function to get the mean roc auc from a multi label matrix
def mean_roc_auc(targets, probabilities):
    """Average the per-column ROC AUC over the first N_CLASSES columns.

    ``targets`` and ``probabilities`` are indexed as 2-D arrays; column ``k`` of
    each is scored with ``roc_auc_score`` and the scores are averaged unweighted.
    """
    per_class_auc = [
        metrics.roc_auc_score(targets[:, class_idx], probabilities[:, class_idx])
        for class_idx in range(N_CLASSES)
    ]
    return np.average(per_class_auc)
        
# Function to create our EfficientNetB6 model
def get_model():
    """Build and compile the EfficientNetB6 multi-label classifier.

    The model is created inside ``strategy.scope()``. It feeds images of shape
    ``(*IMAGE_SIZE, 3)`` through an ImageNet-initialised EfficientNetB6 without
    its top, applies global average pooling and ends in a sigmoid Dense layer
    with N_CLASSES units. It is compiled with Adam (learning rate LR), sigmoid
    focal cross-entropy loss and a multi-label AUC metric.

    Returns:
        The compiled ``tf.keras`` model.
    """
    with strategy.scope():

        inp = tf.keras.layers.Input(shape = (*IMAGE_SIZE, 3))
        x = efn.EfficientNetB6(include_top = False, weights = 'imagenet')(inp)
        x = tf.keras.layers.GlobalAveragePooling2D()(x)
        output = tf.keras.layers.Dense(N_CLASSES, activation = 'sigmoid')(x)

        model = tf.keras.models.Model(inputs = [inp], outputs = [output])

        opt = tf.keras.optimizers.Adam(learning_rate = LR)

        model.compile(
            optimizer = opt,
            loss = [tfa.losses.SigmoidFocalCrossEntropy(alpha = 0.50, gamma = 2.0)],
            # Name the metric explicitly: the training callbacks monitor 'val_auc',
            # and Keras auto-naming would produce 'auc_1', 'auc_2', ... whenever
            # another AUC metric already exists in the session.
            metrics = [tf.keras.metrics.AUC(name = 'auc', multi_label = True)]
        )

        return model

# Function to train and evaluate our model
def train_and_evaluate(folds = 5):
    """Cross-validate the model over the TFRecord files and save out-of-fold results.

    TRAINING_FILENAMES is split with a shuffled KFold (seeded with SEED). For
    every fold a fresh model is trained for EPOCHS epochs, the weights of the
    epoch with the best ``val_auc`` are reloaded, and the held-out files are
    predicted over TTA augmented passes whose results are averaged per image.
    The per-fold and overall mean ROC AUC are printed, and ids, targets and
    predictions of all folds are written to
    ``EfficientNetB6_{IMAGE_SIZE[0]}_{SEED}.csv``.

    Args:
        folds: Number of KFold splits made over the TFRecord files.

    Returns:
        None. Results are printed and written to disk (weights per fold, one CSV).
    """
    oof_StudyInstanceUID = []
    oof_targets = np.zeros((NUM_TRAINING_IMAGES, N_CLASSES))
    oof_predictions = np.zeros((NUM_TRAINING_IMAGES, N_CLASSES))
    # Row offset of the current fold inside the out-of-fold arrays
    oof_start = 0

    # Seed everything
    seed_everything(SEED)
    kfold = KFold(folds, shuffle = True, random_state = SEED)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(TRAINING_FILENAMES)):
        if tpu:
            # Re-initialise the TPU system at the start of every fold
            tf.tpu.experimental.initialize_tpu_system(tpu)
        print('\n')
        print('-'*50)
        print(f'Training fold {fold + 1}')
        train_files = [TRAINING_FILENAMES[x] for x in trn_ind]
        val_files = [TRAINING_FILENAMES[x] for x in val_ind]

        # Keras only needs (image, targets); the id is dropped for fit
        train_dataset = get_training_dataset(train_files, ordered = False)
        train_dataset = train_dataset.map(lambda image, StudyInstanceUID, targets: (image, targets))
        val_dataset = get_validation_dataset(val_files, ordered = True)
        val_dataset_ = val_dataset.map(lambda image, StudyInstanceUID, targets: (image, targets))
        # The training dataset repeats forever, so an epoch length must be given
        steps_per_epoch = count_data_items(train_files) // BATCH_SIZE
        K.clear_session()
        model = get_model()
        weights_path = f'EfficientNetB6_{fold}_{IMAGE_SIZE[0]}_{SEED}.h5'
        # Model checkpoint: keep only the weights of the best val_auc epoch
        checkpoint = tf.keras.callbacks.ModelCheckpoint(weights_path,
                                                        monitor = 'val_auc',
                                                        verbose = VERBOSE,
                                                        save_best_only = True,
                                                        save_weights_only = True,
                                                        mode = 'max')
        # Cut the learning rate by 10x after 2 epochs without val_auc improvement
        lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_auc',
                                                            factor = 0.1,
                                                            patience = 2,
                                                            verbose = VERBOSE,
                                                            mode = 'max')
        model.fit(train_dataset,
                  steps_per_epoch = steps_per_epoch,
                  epochs = EPOCHS,
                  callbacks = [checkpoint, lr_scheduler],
                  validation_data = val_dataset_,
                  verbose = VERBOSE)

        # Load weights from the best epoch
        model.load_weights(weights_path)

        # Get ids and targets: one batch the size of the whole fold yields them all
        number_of_files = count_data_items(val_files)
        StudyInstanceUID = val_dataset.map(lambda image, StudyInstanceUID, targets: StudyInstanceUID).unbatch()
        targets = val_dataset.map(lambda image, StudyInstanceUID, targets: targets).unbatch()
        StudyInstanceUID = next(iter(StudyInstanceUID.batch(number_of_files))).numpy().astype('U')
        targets = next(iter(targets.batch(number_of_files))).numpy()
        oof_StudyInstanceUID.extend(list(StudyInstanceUID))
        oof_end = oof_start + number_of_files
        oof_targets[oof_start:oof_end] = targets

        # Use validation time augmentation for predictions.
        # steps must be a whole number of batches that covers all TTA passes;
        # rounding up guarantees enough rows, and the surplus is sliced off below.
        steps = math.ceil(TTA * number_of_files / BATCH_SIZE)
        dataset = get_vta(val_files, ordered = True)
        image = dataset.map(lambda image, StudyInstanceUID, targets: image)
        probabilities = model.predict(image, steps = steps)[: TTA * number_of_files]
        # Rows are TTA consecutive passes over the fold (ordered, repeated dataset;
        # assumes predict keeps dataset order), so a column-major reshape puts the
        # pass index on axis 1, which is then averaged away.
        probabilities = np.mean(probabilities.reshape((number_of_files, TTA, N_CLASSES), order = 'F'), axis = 1)
        oof_predictions[oof_start:oof_end] = probabilities
        oof_start = oof_end

        print('\n')
        print('-'*50)
        fold_roc_auc_score = mean_roc_auc(targets, probabilities)
        print(f'Our fold {fold + 1} roc auc score validation with {TTA} TTA is {fold_roc_auc_score}')

    print('\n')
    print('-'*50)
    oof_roc_auc_score = mean_roc_auc(oof_targets, oof_predictions)
    print(f'Our out of folds roc auc score is {oof_roc_auc_score}')

    # Save the out of folds predictions
    print('Saving out of folds to disk...')
    target_columns = ["ETT - Abnormal", "ETT - Borderline", "ETT - Normal", "NGT - Abnormal", "NGT - Borderline", "NGT - Incompletely Imaged", "NGT - Normal", "CVC - Abnormal", "CVC - Borderline",
                      "CVC - Normal", "Swan Ganz Catheter Present"]
    prediction_columns = [col + ' Prob' for col in target_columns]
    oof_targets_df = pd.DataFrame(oof_targets, columns = target_columns)
    oof_predictions_df = pd.DataFrame(oof_predictions, columns = prediction_columns)
    oof_dataset = pd.DataFrame({'oof_StudyInstanceUID': oof_StudyInstanceUID})
    # Side-by-side layout: id column, then target columns, then probability columns
    oof_dataset = pd.concat([oof_dataset, oof_targets_df, oof_predictions_df], axis = 1)
    oof_dataset.to_csv(f'EfficientNetB6_{IMAGE_SIZE[0]}_{SEED}.csv', index = False)
    
# Run the 5-fold training and write the out-of-fold predictions to disk
train_and_evaluate(folds = 5)