In [None]:
import os
import re
import math

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import efficientnet.tfkeras as efn
from kaggle_datasets import KaggleDatasets

In [None]:
# Detect an attached TPU. TPUClusterResolver() raises ValueError when no TPU
# can be located, so catch that to make the CPU / single-GPU fallback below
# reachable instead of crashing the cell.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

In [None]:
# For tf.data: let the runtime choose parallelism and prefetch buffer sizes.
AUTO = tf.data.experimental.AUTOTUNE

# Data access: resolve the GCS location of the 'melanoma-512x512' dataset once
# and reuse it for both the train and the test file listings.
GCS_PATH = KaggleDatasets().get_gcs_path('melanoma-512x512')

# Configuration
EPOCHS = 13
BATCH_SIZE = 8 * strategy.num_replicas_in_sync  # 8 examples per replica
IMAGE_SIZE = [512, 512]

# Sorted so that the seeded KFold split over FILENAMES yields the same folds on
# every run, independent of the order in which the files happen to be listed.
FILENAMES = sorted(tf.io.gfile.glob(GCS_PATH + '/train*.tfrec'))
TEST_FILENAMES = sorted(tf.io.gfile.glob(GCS_PATH + '/test*.tfrec'))
FILENAMES_df = pd.DataFrame({"FILENAMES": FILENAMES})

In [None]:
def decode_image(image_data):
    """Decode a JPEG byte string into a float32 RGB tensor scaled to [0, 1].

    The result is reshaped to ``[*IMAGE_SIZE, 3]`` so that its static shape is
    fully defined, which TPU execution requires.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    scaled = tf.cast(pixels, tf.float32) / 255.0
    return tf.reshape(scaled, [*IMAGE_SIZE, 3])


def read_labeled_tfrecord(example):
    """Parse one serialized training example into ``(image, label, data)``.

    ``image`` is the decoded picture, ``label`` is the ``target`` field cast to
    int32, and ``data`` is a dict of float32 tabular values with keys
    ``patient_id``, ``age`` (``age_approx`` divided by 30), ``sex`` and
    ``anatom_site``.
    """
    bytes_scalar = tf.io.FixedLenFeature([], tf.string)  # single byte string
    int_scalar = tf.io.FixedLenFeature([], tf.int64)     # single integer
    feature_spec = {
        "image": bytes_scalar,
        "image_name": bytes_scalar,
        "patient_id": int_scalar,
        "age_approx": int_scalar,
        "sex": int_scalar,
        "anatom_site_general_challenge": int_scalar,
        "target": int_scalar,
    }
    parsed = tf.io.parse_single_example(example, feature_spec)

    def as_float(key):
        return tf.cast(parsed[key], tf.float32)

    # Tabular metadata, all as float32.
    data = {
        'patient_id': as_float('patient_id'),
        'age': as_float('age_approx') / 30.,
        'sex': as_float('sex'),
        'anatom_site': as_float('anatom_site_general_challenge'),
    }

    image = decode_image(parsed['image'])
    label = tf.cast(parsed['target'], tf.int32)
    return image, label, data

def read_unlabeled_tfrecord(example):
    """Parse one serialized test example into ``(image, data)``.

    There is no ``target`` field here. ``data`` holds the float32 tabular
    values ``patient_id``, ``age`` (``age_approx`` divided by 30), ``sex`` and
    ``anatom_site``, plus the ``image_name`` byte string identifying the image.
    """
    bytes_scalar = tf.io.FixedLenFeature([], tf.string)  # single byte string
    int_scalar = tf.io.FixedLenFeature([], tf.int64)     # single integer
    feature_spec = {
        "image": bytes_scalar,
        "image_name": bytes_scalar,
        "patient_id": int_scalar,
        "age_approx": int_scalar,
        "sex": int_scalar,
        "anatom_site_general_challenge": int_scalar,
    }
    parsed = tf.io.parse_single_example(example, feature_spec)

    def as_float(key):
        return tf.cast(parsed[key], tf.float32)

    data = {
        'patient_id': as_float('patient_id'),
        # Already a string tensor, so it is passed through unchanged.
        'image_name': parsed['image_name'],
        'age': as_float('age_approx') / 30.,
        'sex': as_float('sex'),
        'anatom_site': as_float('anatom_site_general_challenge'),
    }

    image = decode_image(parsed['image'])
    return image, data


def load_dataset(filenames, labeled=True, ordered=False):
    """Build a parsed ``tf.data`` dataset from TFRecord files.

    Args:
        filenames: TFRecord paths to read, interleaved with parallel reads.
        labeled: parse with ``read_labeled_tfrecord`` (image, label, data) when
            True, otherwise with ``read_unlabeled_tfrecord`` (image, data).
        ordered: when False, deterministic ordering is switched off so records
            are consumed as soon as they stream in.
    """
    options = tf.data.Options()
    if not ordered:
        # Trade ordering for throughput.
        options.experimental_deterministic = False

    parse_fn = read_labeled_tfrecord if labeled else read_unlabeled_tfrecord

    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    return records.with_options(options).map(parse_fn, num_parallel_calls=AUTO)

In [None]:
# Tabular metadata fields fed to the model next to the image, in input order.
tab_features = ["age", "sex", "anatom_site"]
Num_features = len(tab_features)


def setup_input(image, label, data):
    """Arrange a parsed training example as ``(inputs, label)`` for Keras.

    ``inputs`` is a dict keyed by the model's input layer names: ``inp1`` is
    the image and ``inp2`` is the ``tab_features`` values stacked into one
    float32 vector.
    """
    tabular = tf.stack(
        [tf.cast(data[name], dtype=tf.float32) for name in tab_features]
    )
    return {'inp1': image, 'inp2': tabular}, label

In [None]:
def data_augment(data, label):
    """Randomly flip the image left/right and then up/down.

    Only ``inp1`` (the image) is augmented; the tabular input ``inp2`` and the
    label are returned untouched. This runs inside the tf.data input pipeline,
    so with prefetching it overlaps with the training step.
    """
    flipped = tf.image.random_flip_up_down(
        tf.image.random_flip_left_right(data['inp1'])
    )
    return {'inp1': flipped, 'inp2': data['inp2']}, label

In [None]:
def get_training_dataset(TRAIN_FILENAMES):
    """Build the endless, shuffled, augmented and batched training pipeline.

    The dataset repeats forever, so the caller must bound each epoch with
    ``steps_per_epoch``.
    """
    return (
        load_dataset(TRAIN_FILENAMES, labeled=True)
        .map(setup_input, num_parallel_calls=AUTO)
        .map(data_augment, num_parallel_calls=AUTO)
        # repeat() comes before shuffle(), so the 2048-element shuffle buffer
        # may mix examples across epoch boundaries.
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        # Prepare the next batch while the current one is being trained on.
        .prefetch(AUTO)
    )

def get_validation_dataset(VALIDATION_FILENAMES, ordered=False):
    """Build the batched validation pipeline (no augmentation, no repeat).

    ``ordered`` is forwarded to ``load_dataset`` to keep or relax record order.
    """
    examples = load_dataset(VALIDATION_FILENAMES, labeled=True, ordered=ordered)
    batches = examples.map(setup_input, num_parallel_calls=AUTO).batch(BATCH_SIZE)
    # Cache the batched examples so later passes over the dataset reuse them,
    # and prefetch so the next batch is ready while the current one is in use.
    return batches.cache().prefetch(AUTO)

def get_test_dataset(dataset, ordered=False):
    """Batch and prefetch an already-parsed, already-mapped test dataset.

    ``ordered`` is accepted for signature compatibility but is not used here;
    ordering is decided when the dataset is loaded.
    """
    return dataset.batch(BATCH_SIZE).prefetch(AUTO)

def count_data_items(filenames):
    """Sum the record counts embedded in TFRecord file names.

    Each file name is expected to carry its number of records as
    ``-<digits>.``, e.g. ``flowers00-230.tfrec`` holds 230 items.

    Args:
        filenames: iterable of file names or paths.

    Returns:
        The total count as a NumPy integer (0 for an empty iterable).

    Raises:
        ValueError: if a file name contains no ``-<digits>.`` count.
    """
    # Compiled once; '+' (not '*') so an empty count is not accepted.
    count_pattern = re.compile(r"-([0-9]+)\.")

    counts = []
    for filename in filenames:
        # Search only the last path component so that digits in a bucket or
        # directory name cannot be mistaken for the record count.
        match = count_pattern.search(filename.rsplit("/", 1)[-1])
        if match is None:
            raise ValueError(
                f"Cannot read the item count from file name: {filename!r}"
            )
        counts.append(int(match.group(1)))

    # Explicit integer dtype: np.sum([]) would otherwise return a float 0.0.
    return np.sum(counts, dtype=np.int64)

In [None]:
# Dataset sizes, read from the counts embedded in the TFRecord file names.
NUM_TEST_IMAGES = count_data_items(TEST_FILENAMES)
NUM_TRAINING_IMAGES = count_data_items(FILENAMES)

# Number of full batches in one pass over all training files.
STEPS_PER_EPOCH = NUM_TRAINING_IMAGES // BATCH_SIZE

# model architecture

In [None]:
######## model with metadata and image  #############

def get_model(model, IMAGE_SIZE, Num_features):
    """Build and compile the image + tabular-metadata binary classifier.

    An ImageNet-pretrained backbone processes the image (``inp1``); its pooled
    features pass through a stack of Dense/Dropout layers and are concatenated
    with a Dense embedding of the tabular vector (``inp2``) before a single
    sigmoid output.

    Args:
        model: backbone name, one of "EfficientNetB5", "InceptionV3",
            "ResNet50", "ResNet50V2", "NASNetLarge", "MobileNetV2",
            "DenseNet201".
        IMAGE_SIZE: ``[height, width]`` of the input images.
        Num_features: length of the tabular input vector.

    Returns:
        A compiled ``tf.keras`` model whose metric is named ``auc``.

    Raises:
        ValueError: if ``model`` is not a supported backbone name.
    """
    backbones = {
        "EfficientNetB5": efn.EfficientNetB5,
        "InceptionV3": tf.keras.applications.InceptionV3,
        "ResNet50": tf.keras.applications.ResNet50,
        "ResNet50V2": tf.keras.applications.ResNet50V2,
        "NASNetLarge": tf.keras.applications.NASNetLarge,
        "MobileNetV2": tf.keras.applications.MobileNetV2,
        "DenseNet201": tf.keras.applications.DenseNet201,
    }
    if model not in backbones:
        # Fail early with a clear message instead of hitting an unbound
        # `pretrained_model` further down.
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(backbones)}"
        )

    backbone_kwargs = {
        'input_shape': (*IMAGE_SIZE, 3),
        'weights': 'imagenet',
        'include_top': False,
    }
    if model == "MobileNetV2":
        backbone_kwargs['alpha'] = 1.0

    # Everything that creates variables (layers, optimizer, metrics) is built
    # inside the distribution strategy scope, including compile().
    with strategy.scope():
        pretrained_model = backbones[model](**backbone_kwargs)
        pretrained_model.trainable = True

        inp1 = tf.keras.layers.Input(shape=(*IMAGE_SIZE, 3), name='inp1')
        # shape must be a tuple; `(Num_features)` without a comma is an int.
        inp2 = tf.keras.layers.Input(shape=(Num_features,), name='inp2')

        # Image branch.
        x = pretrained_model(inp1)
        x = L.GlobalAveragePooling2D()(x)
        x = L.Dense(1024, activation='relu')(x)
        x = L.Dropout(0.3)(x)
        x = L.Dense(512, activation='relu')(x)
        x = L.Dropout(0.2)(x)
        x = L.Dense(256, activation='relu')(x)
        x = L.Dropout(0.2)(x)
        x = L.Dense(128, activation='relu')(x)
        x = L.Dropout(0.1)(x)

        # Tabular branch.
        y = L.Dense(64, activation='relu')(inp2)

        # Merge both branches and predict a single probability.
        concat = L.concatenate([x, y])
        output = L.Dense(1, activation='sigmoid', name='output')(concat)

        # Separate local name so the `model` argument is not shadowed.
        net = tf.keras.models.Model(inputs=[inp1, inp2], outputs=[output])

        net.compile(
            optimizer='adam',
            loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.2),
            # Name the metric explicitly: when several models are built in one
            # session Keras auto-numbers default metric names (auc, auc_1, ...),
            # which would break callbacks that monitor 'val_auc'.
            metrics=[tf.keras.metrics.AUC(name='auc')],
        )

    return net

# KFold Training

In [None]:
def get_test_dataset(dataset, ordered=False):
    """Batch and prefetch an already-parsed, already-mapped test dataset.

    NOTE(review): this repeats the definition of the same name given earlier
    in the notebook. ``ordered`` is accepted but not used here.
    """
    return dataset.batch(BATCH_SIZE).prefetch(AUTO)

def setup_test_name(image, data):
    """Keep only the ``image_name`` entry of a parsed test example."""
    image_name = data['image_name']
    return image_name

def setup_test_image(image, data):
    """Arrange a parsed test example as the model's input dict (no label).

    ``inp1`` is the image and ``inp2`` is the tabular vector. The vector is
    built from the shared ``tab_features`` list, the same one used for the
    training inputs, so train and test features cannot drift apart.
    """
    tabular = tf.stack(
        [tf.cast(data[name], dtype=tf.float32) for name in tab_features]
    )
    return {'inp1': image, 'inp2': tabular}

In [None]:
# Stop a fold once validation AUC has not improved for 4 consecutive epochs,
# restoring the weights of the best epoch when that happens.
# (KFold, EarlyStopping and ModelCheckpoint are imported in the notebook's
# top import cell.)
early_stopping = EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=4,
    restore_best_weights=True,
    verbose=1,
)



def trained_models(folds = 6, EPOCHS = 8):
    """Train one EfficientNetB5 model per K-fold split of the training files.

    The split is made over TFRecord *files* (not individual examples) with a
    fixed random seed. For each fold the best weights by ``val_auc`` are
    checkpointed to ``model_fold_<n>.h5``.

    Args:
        folds: number of K-fold splits.
        EPOCHS: maximum number of epochs per fold (early stopping may end a
            fold sooner).

    Returns:
        ``(models, histories)``: the trained model and the Keras ``History``
        object of every fold, in fold order.
    """
    kfold = KFold(folds, shuffle=True, random_state=1002)

    # NOTE(review): `lr_ramp_up` is not defined in any cell visible before this
    # one, so referencing it directly raises NameError on a fresh kernel. It is
    # included only when an earlier cell has actually defined it.
    lr_schedule = globals().get('lr_ramp_up')

    models = []
    histories = []

    for fold, (trn_index, val_index) in enumerate(kfold.split(FILENAMES)):
        print(f"Fold {fold+1}")

        train_files = [FILENAMES[i] for i in trn_index]
        validation_files = [FILENAMES[i] for i in val_index]
        # Local name, so the notebook-level NUM_TRAINING_IMAGES is not shadowed.
        num_train_images = count_data_items(train_files)

        checkpoint_name = f'model_fold_{fold+1}' + '.h5'
        model_checkpoint = ModelCheckpoint(
            checkpoint_name,
            monitor='val_auc',
            mode='max',
            verbose=1,
            save_best_only=True,
            save_weights_only=True,
            save_freq='epoch',
        )

        callbacks = [early_stopping]
        if lr_schedule is not None:
            callbacks.append(lr_schedule)
        callbacks.append(model_checkpoint)

        model = get_model("EfficientNetB5", IMAGE_SIZE, Num_features)

        # Model.fit accepts tf.data datasets directly; fit_generator is
        # deprecated in TF 2.x.
        history = model.fit(
            get_training_dataset(train_files),
            # The training dataset repeats forever, so bound each epoch; keep
            # at least one step even for a very small fold.
            steps_per_epoch=max(1, num_train_images // BATCH_SIZE),
            epochs=EPOCHS,
            validation_data=get_validation_dataset(validation_files),
            callbacks=callbacks,
            # Up-weight the positive class.
            class_weight={0: 1, 1: 20},
        )

        models.append(model)
        histories.append(history)

    return models, histories

In [None]:
# Run the K-fold training with the function's default arguments
# (folds=6, EPOCHS=8) and keep every fold's model and training history.
models, histories = trained_models()

In [None]:
def make_preds(models):
    """Predict the test set with every fold model and average the results.

    Args:
        models: iterable of trained Keras models taking the ``inp1``/``inp2``
            input dict.

    Returns:
        A DataFrame with one row per test image: ``image_name``, one
        ``target<k>`` column per model, and ``target`` holding the row-wise
        mean of the per-model columns.
    """
    # ordered=True so the image names and the predictions, which come from two
    # separate passes over the same files, line up row by row.
    test_ds = load_dataset(TEST_FILENAMES, labeled=False, ordered=True)

    test_images_ds = get_test_dataset(
        test_ds.map(setup_test_image, num_parallel_calls=AUTO)
    )

    # Collect all image names in a single batch and decode them to str.
    test_ids_ds = test_ds.map(setup_test_name, num_parallel_calls=AUTO)
    image_names = next(iter(test_ids_ds.batch(NUM_TEST_IMAGES))).numpy().astype('U')

    preds = pd.DataFrame({'image_name': image_names})

    fold_columns = []
    for i, model in enumerate(models):
        column = 'target' + str(i + 1)
        # The model has a single sigmoid unit, so predict() returns an (n, 1)
        # array; flatten it to fit a DataFrame column.
        preds[column] = model.predict(test_images_ds).ravel()
        fold_columns.append(column)
        print("*"*50,'\n', str(i+1)+'th prediction done!','\n')

    # Average only the per-model prediction columns: including the string
    # `image_name` column in mean(axis=1) raises TypeError on pandas >= 2.0.
    preds['target'] = preds[fold_columns].mean(axis=1)

    return preds
    

In [None]:
# Ensemble prediction on the test set: one column per fold model plus their mean.
pred = make_preds(models)