In [None]:
!pip install efficientnet

In [None]:
import tensorflow as tf
import random
import numpy as np
import pandas as pd
from scipy.stats import rankdata
import efficientnet.tfkeras as efn
from matplotlib import pyplot as plt
from sklearn.utils import class_weight
from sklearn.metrics import roc_auc_score
from kaggle_datasets import KaggleDatasets
import efficientnet.tfkeras as effn
from tqdm import tqdm
import PIL
import re

In [None]:
# Pick the distribution strategy: a TPU strategy when a TPU cluster can be
# resolved, otherwise whatever strategy TensorFlow currently reports as default.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print("running on tpu", tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the cluster and initialise the TPU system before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("Replicas", strategy.num_replicas_in_sync)

                                         

In [None]:
# Names of the entries directly under ../input (third path component of each
# glob hit), in sorted order.
gcs_paths = sorted(x.split('/')[2] for x in tf.io.gfile.glob('../input/*'))
gcs_paths

In [None]:
# Map every attached dataset name to its GCS path.
# The path is resolved once per dataset and reused for both the mapping and the
# log line, instead of constructing a client and querying twice per dataset.
path_dict = {}
kaggle_datasets_client = KaggleDatasets()

for path in gcs_paths:
    resolved_gcs_path = kaggle_datasets_client.get_gcs_path(path)
    path_dict[path] = resolved_gcs_path
    print(f'{path}\t| {resolved_gcs_path}')

In [None]:
# Global configuration: batch size scales with the number of replicas.
batch_size = 8*strategy.num_replicas_in_sync
image_size=[384,384]
dim = image_size[0]
autotune = tf.data.experimental.AUTOTUNE
k = image_size[0]

# TFRecord file lists. They are sorted explicitly because the fold split later
# slices these lists by position; relying on the order returned by glob would
# make fold membership depend on the filesystem listing order.
train_file_path_2019 = sorted(tf.io.gfile.glob(path_dict["isic2019-1024x1024"]+"/train*.tfrec"))
train_file_path_2020 = sorted(tf.io.gfile.glob(path_dict["melanoma-1024x1024"]+"/train*.tfrec"))
test_file_path = sorted(tf.io.gfile.glob(path_dict["siim-isic-melanoma-classification"]+"/tfrecords/test*.tfrec"))



In [None]:
# Report how many training TFRecord files each source contributes.
for _year, _paths in (("2020", train_file_path_2020), ("2019", train_file_path_2019)):
    print(f"number of train files in {_year} data :", len(_paths))

In [None]:
# Seed the Python, NumPy and TensorFlow random number generators with the same value.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(42)

# Let tf.data choose parallelism / prefetch sizes.
autotune = tf.data.experimental.AUTOTUNE

In [None]:

# Training-time image configuration (overrides the values set in the earlier config cell).
image_size = [1024, 1024]
batch_size = 8 * strategy.num_replicas_in_sync
dim = image_size[0]

# Validation folds: five consecutive, equally sized slices of the 2019 file list.
# Files left over after 5 * val_skip are never used for validation.
val_skip = len(train_file_path_2019) // 5  # number of files per fold
val_fname = {
    f"fold_{n}": train_file_path_2019[(n - 1) * val_skip : n * val_skip]
    for n in range(1, 6)
}

# Per-fold training lists: every file that is not in that fold's validation slice.
# Built with an order-preserving filter rather than `list(set(a) - set(b))`,
# whose ordering changes between interpreter runs because string hashing is
# randomised, which would defeat the fixed seeds set above.
total_data = train_file_path_2020 + train_file_path_2019
train_fname = []
for n in range(1, 6):
    held_out = set(val_fname[f"fold_{n}"])
    train_fname.append([fname for fname in total_data if fname not in held_out])

# Concatenation of all five per-fold training lists.
# NOTE: each file therefore appears several times here, and every fold's
# validation files are included (they belong to the other folds' training
# lists); use train_fname[i] when a leakage-free list for one fold is needed.
train_data_fname = [i for j in train_fname for i in j]

In [None]:
# Peek at the first ten entries of the flattened training file list.
train_data_fname[:10]

In [None]:
def read_labeled_tfrecord(example):
    """Parse one serialized labeled example.

    Args:
        example: a scalar string tensor holding a serialized tf.train.Example.

    Returns:
        A tuple ``(image_bytes, target, tabular_data)`` where ``image_bytes`` is
        the still-encoded image string, ``target`` is the int64 label and
        ``tabular_data`` is a float32 vector ``[sex, age_approx,
        anatom_site_general_challenge]``.
    """
    tfrec_format = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "image_name": tf.io.FixedLenFeature([], tf.string),
        "patient_id": tf.io.FixedLenFeature([], tf.int64),
        "sex": tf.io.FixedLenFeature([], tf.int64),
        "age_approx": tf.io.FixedLenFeature([], tf.int64),
        "anatom_site_general_challenge": tf.io.FixedLenFeature([], tf.int64),
        "diagnosis": tf.io.FixedLenFeature([], tf.int64),
        "target": tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(example, tfrec_format)

    # Cast to float32 and stack into a single vector, matching what
    # read_unlabeled_tfrecord produces, so the tabular branch of the model
    # receives floating-point input rather than raw int64 values.
    tabular_data = tf.stack([
        tf.cast(example[feature_name], dtype=tf.float32)
        for feature_name in ("sex", "age_approx", "anatom_site_general_challenge")
    ])

    return example["image"], example["target"], tabular_data
def read_unlabeled_tfrecord(example):
    """Parse one serialized unlabeled (test) example.

    Args:
        example: a scalar string tensor holding a serialized tf.train.Example.

    Returns:
        A tuple ``(image_bytes, image_name, tabular_data)`` where
        ``tabular_data`` is a float32 vector ``[sex, age_approx,
        anatom_site_general_challenge]``.
    """
    tfrec_format = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "image_name": tf.io.FixedLenFeature([], tf.string),
        "patient_id": tf.io.FixedLenFeature([], tf.int64),
        # Parsed as int64 like in read_labeled_tfrecord: a string feature
        # cannot be cast to float32 below.
        # NOTE(review): assumes the test records encode sex as an integer - confirm.
        "sex": tf.io.FixedLenFeature([], tf.int64),
        "age_approx": tf.io.FixedLenFeature([], tf.int64),
        "anatom_site_general_challenge": tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(example, tfrec_format)

    # Iterate over the feature tensors themselves; the previous code used each
    # tensor as an index into the list, which is not a valid list index.
    tabular_data = tf.stack([
        tf.cast(example[feature_name], dtype=tf.float32)
        for feature_name in ("sex", "age_approx", "anatom_site_general_challenge")
    ])
    return example["image"], example["image_name"], tabular_data

def data_augment(img,label,tabular,train=True):
    """Apply random photometric / flip augmentation to an image.

    Args:
        img: decoded image tensor.
        label: passed through unchanged.
        tabular: passed through unchanged.
        train: when False the image is returned untouched, so the same
            function can be mapped over evaluation data. Defaults to True,
            which keeps the previous always-augment behaviour.

    Returns:
        The tuple ``(img, label, tabular)``.
    """
    if not train:
        return img,label,tabular

    img = tf.image.random_flip_left_right(img)
    img = tf.image.random_saturation(img,0.7,1.3)
    img = tf.image.random_contrast(img,0.8,1.2)
    img = tf.image.random_brightness(img,0.1)
    return img,label,tabular

def prepare_image(img,label,tabular):
    """Decode a JPEG byte string to a 3-channel float32 image scaled by 1/255.

    ``label`` and ``tabular`` are returned unchanged alongside the image.
    """
    decoded = tf.image.decode_jpeg(img, channels=3)
    scaled = tf.cast(decoded, tf.float32) / 255.0
    return scaled, label, tabular

def get_dataset(files,labeled=True):
    """Build a batched, prefetched tf.data pipeline from TFRecord files.

    Args:
        files: TFRecord file name(s) to read.
        labeled: when True, records are parsed with read_labeled_tfrecord,
            shuffled, decoded and augmented; when False, records are parsed
            with read_unlabeled_tfrecord and only decoded.

    Returns:
        A dataset of batches of 64 elements. Each element is
        ``(image, target, tabular)`` for labeled data or
        ``(image, image_name, tabular)`` for unlabeled data.
    """
    ds = tf.data.TFRecordDataset(files,num_parallel_reads=autotune)
    # Cache the raw serialized records so later epochs do not re-read the files.
    ds = ds.cache()
    if labeled:
        ds = ds.map(read_labeled_tfrecord,num_parallel_calls=autotune)
        ds = ds.shuffle(1024*8)
        # Element order does not matter here, so allow non-deterministic execution.
        opt = tf.data.Options()
        opt.experimental_deterministic = False
        ds = ds.with_options(opt)
        ds = ds.map(lambda img,label,tabular : prepare_image(img,label,tabular),num_parallel_calls=autotune)
        ds = ds.map(lambda img,label,tabular : data_augment(img,label,tabular,train=True),num_parallel_calls=autotune)
    else:
        ds = ds.map(read_unlabeled_tfrecord,num_parallel_calls=autotune)
        # Pass the lambda's own arguments through; the previous version
        # referenced the undefined names `image` and `label`.
        ds = ds.map(lambda img,img_name,tabular : prepare_image(img,img_name,tabular),num_parallel_calls=autotune)

    # NOTE: the batch size is fixed at 64 here and does not follow the global batch_size.
    ds = ds.batch(64)
    ds = ds.prefetch(autotune)
    return ds




In [None]:
# Load the CSV that ships with the ISIC 2019 1024x1024 dataset into a DataFrame.
train_df=pd.read_csv("../input/isic2019-1024x1024/train.csv")

In [None]:
def show_dataset(thumb_size, cols, rows, ds):
    """Render the elements of ``ds`` as a grid of thumbnails and display it.

    Args:
        thumb_size: edge length in pixels of each square thumbnail.
        cols: number of thumbnails per row.
        rows: number of rows the canvas is sized for.
        ds: iterable of ``(img, target, tabular)`` elements; ``img`` must
            expose ``.numpy()``. Values are multiplied by 255 and clipped to
            0..255 before conversion to uint8.
    """
    # One pixel of spacing is left between neighbouring thumbnails.
    canvas_width = thumb_size*cols + (cols-1)
    canvas_height = thumb_size*rows + (rows-1)
    mosaic = PIL.Image.new(mode='RGB', size=(canvas_width, canvas_height))

    cell = thumb_size + 1  # thumbnail plus its 1px gap
    for idx, (img, target, tabular) in enumerate(ds):
        iy, ix = divmod(idx, cols)
        pixels = np.clip(img.numpy() * 255, 0, 255).astype(np.uint8)
        thumb = PIL.Image.fromarray(pixels)
        thumb = thumb.resize((thumb_size, thumb_size), resample=PIL.Image.BILINEAR)
        mosaic.paste(thumb, (ix*cell, iy*cell))

    display(mosaic)
    
# Preview 12 x 5 individual training images as 64px thumbnails.
ds = get_dataset(train_data_fname).unbatch().take(12*5)
show_dataset(thumb_size=64, cols=12, rows=5, ds=ds)



## Callbacks


In [None]:
def get_lr_callback():
    """Return a LearningRateScheduler with linear ramp-up then exponential decay.

    The rate rises linearly from ``lr_start`` to ``lr_max`` over the ramp
    epochs, is held at ``lr_max`` for the sustain epochs (currently zero), and
    then decays geometrically towards ``lr_min``.
    """
    lr_start, lr_max, lr_min = 0.000005, 0.000020 * 8, 0.000001
    ramp_epochs, sustain_epochs, decay_rate = 5, 0, 0.8

    def lrfn(epoch):
        # Linear warm-up phase.
        if epoch < ramp_epochs:
            return (lr_max - lr_start) / ramp_epochs * epoch + lr_start
        # Plateau phase.
        if epoch < ramp_epochs + sustain_epochs:
            return lr_max
        # Exponential decay phase.
        return (lr_max - lr_min) * decay_rate**(epoch - ramp_epochs - sustain_epochs) + lr_min

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)
class save_best_n(tf.keras.callbacks.Callback):
    """Keras callback that checkpoints the model when ``val_auc`` improves.

    Scores and epochs are tracked in the module-level ``best_score`` and
    ``best_epoch`` dictionaries, keyed by fold number. Whenever the current
    epoch beats the lowest stored score for this fold, that slot is
    overwritten and the model is saved under /kaggle/working.

    Args:
        fn: fold number; used as the key into ``best_score`` / ``best_epoch``
            and in the checkpoint file name.
        model: the model to save.
    """

    def __init__(self, fn, model):
        # Initialise the base callback first, then attach our own state.
        super().__init__()
        self.fn = fn
        self.model = model

    def on_epoch_end(self, epoch, logs=None):
        # The first epoch is given a sentinel score of -1.
        if epoch > 0:
            score = (logs or {}).get("val_auc")
        else:
            score = -1

        # Without a validation AUC in the logs there is nothing to compare.
        if score is None:
            return

        # Use the fold stored on the callback rather than the loop's global
        # `fold_num`, so the callback does not depend on the caller's variable.
        fold_scores = best_score[self.fn]
        if score > fold_scores.min():
            idx_min = np.argmin(fold_scores)

            fold_scores[idx_min] = score
            best_epoch[self.fn][idx_min] = epoch+1

            path_best_model = f'best_model_fold_{self.fn}_{idx_min}.hdf5'
            # Build the path as a string; `"dir" / name` on two str objects raises TypeError.
            self.model.save(f"/kaggle/working/{path_best_model}")


In [None]:
def get_cnn():
    """Build the image branch: EfficientNetB7 backbone plus a dense head.

    Returns:
        A Keras model whose input layer is named ``inp1`` (the key used by
        setup_input for images) and whose output is a 32-unit feature vector.
    """
    with strategy.scope():
        # Name the image input explicitly so that dict-style inputs keyed
        # "inp1" can be matched to it, mirroring the "inp2" input of get_mlp.
        image_input = tf.keras.layers.Input(shape=(1024,1024,3), name="inp1")
        backbone = effn.EfficientNetB7(input_tensor=image_input,
                                       weights="imagenet",
                                       include_top=False)
        backbone.trainable = True

        # Global average pooling already yields a flat (batch, channels)
        # tensor, so no separate Flatten layer is needed.
        x = tf.keras.layers.GlobalAveragePooling2D()(backbone.output)
        x = tf.keras.layers.Dense(512,kernel_regularizer=tf.keras.regularizers.l2(l=0.01),activation="relu")(x)
        x = tf.keras.layers.Dropout(0.2)(x)
        x = tf.keras.layers.Dense(128,kernel_regularizer=tf.keras.regularizers.l2(l=0.01),activation="relu")(x)
        x = tf.keras.layers.Dropout(0.2)(x)
        x = tf.keras.layers.Dense(64,kernel_regularizer=tf.keras.regularizers.l2(l=0.01),activation="relu")(x)
        x = tf.keras.layers.Dense(32,kernel_regularizer=tf.keras.regularizers.l2(l=0.01),activation="relu")(x)
        return tf.keras.Model(backbone.input,x)
def get_mlp():
    """Build the tabular branch: a 16 -> 32 -> 64 unit MLP over 3 features.

    Returns:
        A Keras model whose input layer is named ``inp2`` and whose output is
        the 64-unit layer.
    """
    with strategy.scope():
        inp = tf.keras.layers.Input(shape=(3,),name='inp2')
        x = tf.keras.layers.Dense(16,activation="relu",kernel_regularizer=tf.keras.regularizers.l2(l=0.01))(inp)
        x = tf.keras.layers.Dense(32,activation="relu",kernel_regularizer=tf.keras.regularizers.l2(l=0.01))(x)
        # This layer was previously assigned to a misspelled name and so was
        # never part of the returned model.
        x = tf.keras.layers.Dense(64,activation="relu",kernel_regularizer=tf.keras.regularizers.l2(l=0.01))(x)
        return tf.keras.Model(inp,x)
def get_model():
    """Build and compile the two-branch (image + tabular) binary classifier.

    The outputs of get_cnn and get_mlp are concatenated and fed to a single
    sigmoid unit. The model is compiled with Adam, binary cross-entropy and
    the AUC / accuracy metrics.

    Returns:
        The compiled Keras model taking ``[cnn.input, mlp.input]``.
    """
    # Everything that creates variables (the head layer, the optimizer and the
    # metrics created by compile) must live inside the strategy scope, not only
    # the two sub-models.
    with strategy.scope():
        cnn = get_cnn()
        mlp = get_mlp()
        concat = tf.keras.layers.concatenate([cnn.output,mlp.output])
        output = tf.keras.layers.Dense(1,activation="sigmoid")(concat)
        model = tf.keras.Model([cnn.input,mlp.input],output)
        model.compile(optimizer="adam",loss="binary_crossentropy",metrics=["AUC","accuracy"])
    return model
    
    

In [None]:
# Build the compiled two-input model once and print its layer summary.
model = get_model()
model.summary()

In [None]:
# Write a top-to-bottom diagram of the model graph to model.png.
tf.keras.utils.plot_model(
    model,
    to_file='model.png',
    rankdir='TB',
    dpi=96,
    show_layer_names=True,
    show_shapes=False,
    expand_nested=False,
)

In [None]:
def count_data_items(filenames):
    """Sum the item counts encoded in TFRecord file names.

    Each name is expected to contain ``-<digits>.`` (for example
    ``train00-2071.tfrec``); the digits of the first such match are taken as
    that file's item count.

    Args:
        filenames: iterable of file name strings.

    Returns:
        The NumPy sum of all per-file counts.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for filename in filenames:
        counts.append(int(count_pattern.search(filename).group(1)))
    return np.sum(counts)
    

In [None]:
# Item counts (taken from the file names) for the flattened training list and
# for the first validation fold. These are counts of records, not of batches.
step_size_train, step_size_validation = (
    count_data_items(names) for names in (train_data_fname, val_fname["fold_1"])
)
print(step_size_train, step_size_validation)

In [None]:
def setup_input(image,label,tabular_data):
    """Repack a ``(image, label, tabular)`` element as ``(features, label)``.

    ``features`` is a dict with the image under ``"inp1"`` and the tabular
    data under ``"inp2"``.
    """
    features = dict(inp1=image, inp2=tabular_data)
    return features, label

In [None]:
# Five-fold training loop. For each fold a fresh model is trained on that
# fold's training files and validated on its held-out files; save_best_n
# records the best validation AUC / epoch in best_score / best_epoch.
histories = []
best_epoch = {f:np.zeros(1) for f in range(1,6)}
best_score = {f:np.zeros(1) for f in range(1,6)}
for fold_num in range(1,6):
    tf.keras.backend.clear_session()
    # Only re-initialise the TPU when one was actually resolved.
    if tpu:
        tf.tpu.experimental.initialize_tpu_system(tpu)
    print("-"*50)
    print(f"starting fold {fold_num} out of 5")

    # Use this fold's own training list (which excludes its validation files)
    # instead of the flattened all-folds list, which contains them.
    files_train = train_fname[fold_num - 1]
    files_val = val_fname[f"fold_{fold_num}"]

    # Train on the training files (previously the validation files were used here).
    train_dataset = get_dataset(files_train)
    train_dataset = train_dataset.map(setup_input,num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # NOTE: get_dataset(labeled=True) also shuffles and augments, so the
    # validation images are augmented as well.
    val_dataset = get_dataset(files_val)
    val_dataset = val_dataset.map(setup_input,num_parallel_calls=tf.data.experimental.AUTOTUNE)

    model = get_model()
    # Pass the dataset (not the list of file names) to fit. The dataset is
    # finite and not repeated, so each epoch is one full pass and no
    # steps_per_epoch is given.
    history = model.fit(train_dataset,
                        epochs=15,
                        validation_data=val_dataset,
                        verbose=2,
                        callbacks=[get_lr_callback(),save_best_n(fold_num,model)])

    # Keep the recorded scores (and their epochs) in ascending score order.
    idx_sorted = np.argsort(best_score[fold_num])
    best_score[fold_num] = np.array(best_score[fold_num])[idx_sorted]
    best_epoch[fold_num] = np.array(best_epoch[fold_num])[idx_sorted]

    print(f"\nFold {fold_num} is finished. The best epochs: {[int(best_epoch[fold_num][i]) for i in range(len(best_epoch[fold_num]))]}")
    print(f"The corresponding scores: {[round(best_score[fold_num][i], 5) for i in range(len(best_epoch[fold_num]))]}")

    histories.append(history)
                          
