### References
* Ensembling 
[MinMax highest public LB=.9619](https://www.kaggle.com/paklau9/minmax-highest-public-lb-9619/data)
([Pak Lau](https://www.kaggle.com/paklau9))
* High-performing baseline model 
[Triple Stratified KFold with TFRecords](https://www.kaggle.com/cdeotte/triple-stratified-kfold-with-tfrecords)
([Chris Deotte](https://www.kaggle.com/cdeotte))

### Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re, math
import datetime

from IPython.core.display import display, HTML
import matplotlib.pyplot as plt

!pip install -q efficientnet
import efficientnet.tfkeras as efn

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Conv2D, Dense, Input, Flatten, AveragePooling2D, GlobalAveragePooling2D, BatchNormalization, Dropout, Activation

from kaggle_datasets import KaggleDatasets

from sklearn.model_selection import train_test_split
import pickle


### Detect hardware, return appropriate distribution strategy

In [None]:
def get_strategy():
    """Detect the available accelerator and build a matching distribution strategy.

    A TPU is tried first. When the TPU resolver raises ``ValueError`` the
    function falls back to a single visible GPU, and otherwise to
    TensorFlow's default strategy.

    Returns:
        tuple: ``(strategy, GCS_PATH, base_dir)`` where ``strategy`` is the
        ``tf.distribute`` strategy, ``GCS_PATH`` is the competition data
        location (a GCS path on TPU, the local input directory otherwise) and
        ``base_dir`` is always the local competition input directory.
    """
    base_dir = "/kaggle/input/siim-isic-melanoma-classification/"
    gpu = []  # stays empty when a TPU is found
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        tpu = None
        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
        gpu = tf.config.list_physical_devices("GPU")
        if len(gpu) == 1:
            print('Running on GPU ', gpu)

    if tpu:
        # Connect once, initialise the TPU system, then build the strategy.
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
        GCS_PATH = KaggleDatasets().get_gcs_path('siim-isic-melanoma-classification')
    elif len(gpu) == 1:
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
        GCS_PATH = base_dir
    else:
        strategy = tf.distribute.get_strategy()
        GCS_PATH = base_dir

    print("REPLICAS: ", strategy.num_replicas_in_sync)
    return strategy, GCS_PATH, base_dir


strategy, GCS_PATH, base_dir = get_strategy()

### Settings

In [None]:
# Run identification: both values end up in the experiment log.
VERSION = 24
KEY_CHANGES = "Code cleanup, "

AUTO = tf.data.experimental.AUTOTUNE

# Augmentation strengths consumed by aug().
ROT_, SHR_ = 45.0, 2.0
HZOOM_, WZOOM_ = 10.0, 10.0
HSHIFT_, WSHIFT_ = 10.0, 10.0

# DIM is the side length images are resized to; TFREC_DIM is the side length
# the decoded TFRecord images are asserted to have.
DIM = 384
TFREC_DIM = 384
SEED = 42

_tfrec_size = "{0}x{0}".format(TFREC_DIM)
TFREC_GCS_DIR = KaggleDatasets().get_gcs_path("melanoma-" + _tfrec_size)
PRV_TFREC_GCS_DIR = KaggleDatasets().get_gcs_path("isic2019-" + _tfrec_size)


# False: load previously pickled test predictions instead of training.
BLN_TRAIN_MODEL = False

REPLICAS = strategy.num_replicas_in_sync
BATCH_SIZE = REPLICAS * 16
EPOCHS = 12
KFOLD_SPLITS = 5

# Number of augmented prediction passes over the test set per fold.
TTA = 25

### Write log
It is important to document the results of the various experiments as we go along.

In [None]:
log_sno = 1
log_start = datetime.datetime.now()


def write_log(data_type, data):
    """Append one comma-separated record to the experiment log.

    If no log exists in the working directory but a previous run's log is
    available under ``../input/amazingly-fast-kernel/``, that log is copied
    first so the history carries over. Each record holds VERSION, the running
    serial number, the notebook start time, ``data_type``, ``data`` and the
    seconds elapsed since the notebook started.

    Args:
        data_type: Short label describing what ``data`` is.
        data: Value to record; formatted with ``str.format``.
    """
    global log_sno
    import shutil  # local import: keeps this cell self-contained

    filename = "log_melanoma_classification.txt"
    prv_filename = "../input/amazingly-fast-kernel/" + filename
    if (not os.path.exists(filename)) and os.path.exists(prv_filename):
        # Synchronous copy: the previous `os.popen('cp ...')` returned before
        # the copy finished and raced with the append below.
        shutil.copyfile(prv_filename, filename)

    delta = datetime.datetime.now() - log_start
    with open(filename, "a") as f:
        # One record per line so consecutive entries do not run together.
        f.write("{},{},{},{},{},{}\n".format(
            VERSION, log_sno, log_start, data_type, data, delta.total_seconds()))
    log_sno = log_sno + 1


write_log("KEY_CHANGES", KEY_CHANGES)

### Load and perform basic EDA on training data

In [None]:
# Load the training metadata and show the first row plus the class balance.
train_data = pd.read_csv(base_dir + "train.csv")
# display() renders the table itself and returns None, so it must not be
# wrapped in print() (that only added a stray "None" to the output).
display(HTML(train_data.head(1).to_html()))
print(train_data["target"].value_counts())

In [None]:
# Replace missing anatomy sites with an explicit "Unknown" category.
# Assigning the result back (instead of inplace=True on the selected column)
# keeps working when pandas copy-on-write is enabled.
train_data["anatom_site_general_challenge"] = (
    train_data["anatom_site_general_challenge"].fillna("Unknown")
)

# Benign/malignant counts per anatomy site, plus the malignant percentage.
group_data = (
    train_data.groupby(["anatom_site_general_challenge"])["benign_malignant"]
    .value_counts()
    .unstack(-1)
)
group_data["perc_malignant"] = np.round(
    (group_data["malignant"] * 100) / (group_data["benign"] + group_data["malignant"]), 2
)
group_data.sort_values("perc_malignant", inplace=True)

In [None]:
import seaborn as sns

# Four panels: age histogram per class, sample counts per sex and class, and
# malignant percentage per sex.
fig, ax = plt.subplots(1, 4, figsize=(13, 3))

# Each histogram is drawn once (the benign one used to be plotted twice).
ax[0].hist("age_approx", data=train_data[train_data["target"] == 0], bins=100)
ax[0].set_title("Age histogram-Benign")
ax[1].hist("age_approx", data=train_data[train_data["target"] == 1], bins=100)
ax[1].set_title("Age histogram-Malignant")

# x/y are passed by keyword: newer seaborn releases no longer accept them
# positionally.
sex_data = train_data[["sex", "target", "image_name"]].groupby(["sex", "target"]).count().reset_index()
sns.barplot(x="sex", y="image_name", data=sex_data, hue="target", ax=ax[2])
ax[2].set_title("Sex bar graph")

# Turn the per-sex malignant count (second column) into a percentage of the row total.
group_data = train_data[["sex", "target", "image_name"]].groupby(["sex", "target"]).count().unstack(-1)
sum_data = group_data.sum(axis=1).values
group_data["total"] = list(sum_data)
group_data.iloc[:, 1] = list(np.round(group_data.iloc[:, 1].values * 100 / sum_data, 2))
group_data.columns = ["benign_count", "malignant_perc", "total_count"]
group_data.reset_index(inplace=True)
sns.barplot(x="sex", y="malignant_perc", data=group_data, ax=ax[3])

plt.tight_layout()

In [None]:
# Two panels: sample counts per anatomy site and class, and malignant
# percentage per anatomy site.
fig, ax = plt.subplots(1, 2, figsize=(13, 4))

# x/y are passed by keyword: newer seaborn releases no longer accept them
# positionally.
sex_data = train_data[["anatom_site_general_challenge", "target", "image_name"]].groupby(["anatom_site_general_challenge", "target"]).count().reset_index()
sns.barplot(x="anatom_site_general_challenge", y="image_name", data=sex_data, hue="target", ax=ax[0])
ax[0].set_title("Anatomy bar graph")

# Turn the per-site malignant count (second column) into a percentage of the row total.
group_data = train_data[["anatom_site_general_challenge", "target", "image_name"]].groupby(["anatom_site_general_challenge", "target"]).count().unstack(-1)
sum_data = group_data.sum(axis=1).values
group_data["total"] = list(sum_data)
group_data.iloc[:, 1] = list(np.round(group_data.iloc[:, 1].values * 100 / sum_data, 2))
group_data.columns = ["benign_count", "malignant_perc", "total_count"]
group_data.reset_index(inplace=True)
sns.barplot(x="anatom_site_general_challenge", y="malignant_perc", data=group_data, ax=ax[1])

plt.setp(ax[0].xaxis.get_majorticklabels(), rotation=45)
plt.setp(ax[1].xaxis.get_majorticklabels(), rotation=45)
plt.tight_layout()

### Data Augmentation
Data augmentation plays a major role in image classification. I think this is how we can help our algorithm generalize better than humans do.
The code below is taken directly from the [Chris Deotte Notebook](https://www.kaggle.com/cdeotte/triple-stratified-kfold-with-tfrecords).


In [None]:

def get_mat(rotation, shear, height_zoom, width_zoom, height_shift, width_shift):
    """Build the 3x3 matrix that maps pixel indices for one augmentation.

    The result is ``(rotation . shear) . (zoom . shift)``.

    Args:
        rotation: Rotation angle in degrees.
        shear: Shear angle in degrees.
        height_zoom: Zoom factor; the matrix uses its reciprocal.
        width_zoom: Zoom factor; the matrix uses its reciprocal.
        height_shift: Translation placed in the first row of the shift matrix.
        width_shift: Translation placed in the second row of the shift matrix.

    Returns:
        A 3x3 tensor. aug() passes shape-[1] float32 tensors for every argument.
    """
    def _as_3x3(entries):
        # Nine single-element entries, row-major -> one 3x3 tensor.
        return tf.reshape(tf.concat([entries], axis=0), [3, 3])

    one = tf.constant([1], dtype='float32')
    zero = tf.constant([0], dtype='float32')

    # Degrees -> radians.
    rotation_rad = math.pi * rotation / 180.
    shear_rad = math.pi * shear / 180.

    cos_rot = tf.math.cos(rotation_rad)
    sin_rot = tf.math.sin(rotation_rad)
    rotation_matrix = _as_3x3([
        cos_rot,  sin_rot, zero,
        -sin_rot, cos_rot, zero,
        zero,     zero,    one,
    ])

    cos_shear = tf.math.cos(shear_rad)
    sin_shear = tf.math.sin(shear_rad)
    shear_matrix = _as_3x3([
        one,  sin_shear, zero,
        zero, cos_shear, zero,
        zero, zero,      one,
    ])

    zoom_matrix = _as_3x3([
        one / height_zoom, zero,             zero,
        zero,              one / width_zoom, zero,
        zero,              zero,             one,
    ])

    shift_matrix = _as_3x3([
        one,  zero, height_shift,
        zero, one,  width_shift,
        zero, zero, one,
    ])

    rotate_then_shear = K.dot(rotation_matrix, shear_matrix)
    zoom_then_shift = K.dot(zoom_matrix, shift_matrix)
    return K.dot(rotate_then_shear, zoom_then_shift)


def aug(image, DIM=DIM):    
    """Apply a random affine transform and random colour jitter to one image.

    Args:
        image: A single image of size [DIM, DIM, 3] (not a batch).
        DIM: Side length of the image. The default is the value the
            module-level DIM had when this function was defined.

    Returns:
        A [DIM, DIM, 3] tensor: the image randomly rotated, sheared, zoomed
        and shifted, then randomly flipped left/right and jittered in
        saturation, contrast and brightness.
    """
    # input image - is one image of size [dim,dim,3] not a batch of [b,dim,dim,3]
    # output - image randomly rotated, sheared, zoomed, and shifted
    XDIM = DIM%2 #fix for size 331
    
    # Every parameter is a shape-[1] draw from a standard normal, scaled by the
    # matching module-level setting (zooms are centred on 1.0).
    rot = ROT_ * tf.random.normal([1], dtype='float32')
    shr = SHR_ * tf.random.normal([1], dtype='float32') 
    h_zoom = 1.0 + tf.random.normal([1], dtype='float32') / HZOOM_
    w_zoom = 1.0 + tf.random.normal([1], dtype='float32') / WZOOM_
    h_shift = HSHIFT_ * tf.random.normal([1], dtype='float32') 
    w_shift = WSHIFT_ * tf.random.normal([1], dtype='float32') 

    # GET TRANSFORMATION MATRIX
    m = get_mat(rot,shr,h_zoom,w_zoom,h_shift,w_shift) 

    # LIST DESTINATION PIXEL INDICES
    # Coordinates are centred on the image; note that -DIM//2 parses as (-DIM)//2.
    # The third row of ones makes the coordinates homogeneous for the 3x3 matrix.
    x   = tf.repeat(tf.range(DIM//2, -DIM//2,-1), DIM)
    y   = tf.tile(tf.range(-DIM//2, DIM//2), [DIM])
    z   = tf.ones([DIM*DIM], dtype='int32')
    idx = tf.stack( [x,y,z] )
    
    # ROTATE DESTINATION PIXELS ONTO ORIGIN PIXELS
    # The transformed coordinates are cast to int32 and clipped so the lookup
    # below stays inside the image.
    idx2 = K.dot(m, tf.cast(idx, dtype='float32'))
    idx2 = K.cast(idx2, dtype='int32')
    idx2 = K.clip(idx2, -DIM//2+XDIM+1, DIM//2)
    
    # FIND ORIGIN PIXEL VALUES           
    # Convert the centred coordinates back to array indices and gather the pixels.
    idx3 = tf.stack([DIM//2-idx2[0,], DIM//2-1+idx2[1,]])
    d    = tf.gather_nd(image, tf.transpose(idx3))
        
    img = tf.reshape(d,[DIM, DIM,3])
    # Colour and flip jitter applied after the geometric transform.
    img = tf.image.random_flip_left_right(img)
    #img = tf.image.random_hue(img, 0.01)
    img = tf.image.random_saturation(img, 0.7, 1.3)
    img = tf.image.random_contrast(img, 0.8, 1.2)
    img = tf.image.random_brightness(img, 0.1)
    return img

def aug_img_label(img, label):
    """Augment the image of an ``(img, label)`` pair; the label passes through unchanged."""
    return aug(img), label

def aug_img(img):
    """Return a randomly augmented copy of ``img`` (label-free variant used for test data)."""
    return aug(img)

### Methods to parse tfrecords

In [None]:

def parse_rec_train(data):
    """Parse one serialized example into a dict with ``image`` and ``target``."""
    schema = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'target': tf.io.FixedLenFeature([], tf.int64),
    }
    return tf.io.parse_single_example(data, features=schema)

def parse_rec_validate(data):
    """Parse one serialized example into a dict with ``image``, ``target`` and ``image_name``."""
    schema = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'target': tf.io.FixedLenFeature([], tf.int64),
        'image_name': tf.io.FixedLenFeature([], tf.string),
    }
    return tf.io.parse_single_example(data, features=schema)

def parse_rec_test(data):
    """Parse one serialized test example into a dict with ``image`` and ``image_name``."""
    schema = {
        'image': tf.io.FixedLenFeature([], tf.string),
        'image_name': tf.io.FixedLenFeature([], tf.string),
    }
    return tf.io.parse_single_example(data, features=schema)


def process_img(img):
    """Decode an encoded image and return it as a float32 tensor divided by 255.

    Args:
        img: Scalar string tensor holding the encoded image bytes.

    Returns:
        A float32 tensor of shape ``(DIM, DIM, 3)``.
    """
    img = tf.image.decode_image(img)
    # decode_image leaves the static shape unknown; pin it so resize and the
    # downstream model see a fixed shape.
    img = tf.ensure_shape(img, (TFREC_DIM, TFREC_DIM, 3))
    img = tf.image.resize(img, [DIM, DIM])
    # Explicit cast instead of the builtin float(): the builtin only works on
    # tensors when AutoGraph rewrites it (inside Dataset.map) and fails when
    # this function is called eagerly.
    img = tf.cast(img, tf.float32) / 255.00
    return img

def get_img_label(features):
    """Turn a parsed example into ``(image, target)``.

    The ``target`` entry is removed from ``features`` as a side effect.
    """
    label = features.pop("target")
    return process_img(features["image"]), label


def get_img(features, label=None):
    """Return only the processed image of a parsed example; ``label`` is ignored."""
    return process_img(features["image"])

def get_img_and_name(features, label=None):
    """Return ``(processed image, image_name)`` for a parsed example; ``label`` is ignored."""
    return process_img(features["image"]), features["image_name"]

def get_img_name(features):
    """Return the ``image_name`` entry of a parsed example."""
    return features["image_name"]

### Familiarise with TFRec

In [None]:
# Current-competition train/test shards, sorted by file name.
tfrec_files_train_all = np.sort(np.array(
    tf.io.gfile.glob(TFREC_GCS_DIR + '/train*.tfrec')
))

# External-data shards: odd-numbered files go to the "2019" list and
# even-numbered files to the "2018" list.
tfrec_files_train_2019 = np.sort(np.array(tf.io.gfile.glob(
    [PRV_TFREC_GCS_DIR + '/train%.2i*.tfrec' % x for x in range(1, 31, 2)]
)))
tfrec_files_train_2018 = np.sort(np.array(tf.io.gfile.glob(
    [PRV_TFREC_GCS_DIR + '/train%.2i*.tfrec' % x for x in range(0, 30, 2)]
)))

tfrec_files_test = np.sort(np.array(
    tf.io.gfile.glob(TFREC_GCS_DIR + '/test*.tfrec')
))

# Print the tail of the first raw record to see which features it carries.
dataset_peek = tf.data.TFRecordDataset(tfrec_files_train_all)
for data in dataset_peek.take(1):
    example = tf.train.Example()
    example.ParseFromString(data.numpy())
    example_text = str(example)
    print(example_text[-600:])

### Peek a few images in the TFRec dataset

In [None]:
def get_img_list(dataset):
    """Collect the images of the first 8 ``(img, label)`` elements of ``dataset``."""
    return [img for img, label in dataset.take(8)]
    
def show_img(img_list):
    """Plot the given images in a 3x8 grid.

    The first row shows the images as given; the second and third rows each
    show a fresh ``aug()`` result for the same images.
    """
    row, col = 3, 8
    plt.figure(figsize=(20, row*12/col))
    position = 1
    for k in range(3):
        for img in img_list:
            shown = img if k == 0 else aug(img)
            plt.subplot(row, col, position)
            plt.imshow(shown)
            position += 1

# Decode the peeked records into (image, target) pairs and preview a few of
# them together with augmented variants.
dataset_train = dataset_peek.map(parse_rec_train)
dataset_train = dataset_train.map(get_img_label)
show_img(get_img_list(dataset_train))

### Define Model

In [None]:
# Backbone used for each fold index.
EFNS = [efn.EfficientNetB0, efn.EfficientNetB4, efn.EfficientNetB6, efn.EfficientNetB6, efn.EfficientNetB6]


def build_model(fold):
    """Return a compiled binary classifier for the given fold.

    When BLN_TRAIN_MODEL is true, a fresh model is built: the ImageNet
    pre-trained backbone ``EFNS[fold]`` followed by global average pooling
    and a single sigmoid unit. Otherwise a previously saved model is loaded
    from disk and re-compiled.

    Args:
        fold: Fold index; selects the backbone from EFNS when training.

    Returns:
        A compiled ``tf.keras.Model``.
    """
    if BLN_TRAIN_MODEL:
        inp = tf.keras.layers.Input(shape=(DIM, DIM, 3))
        base = EFNS[fold](input_shape=(DIM, DIM, 3), weights='imagenet', include_top=False)
        x = base(inp)
        x = tf.keras.layers.GlobalAveragePooling2D()(x)
        x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
        model = tf.keras.Model(inputs=inp, outputs=x)
        opt = tf.keras.optimizers.Adam(learning_rate=0.001)
        loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=0.05)
        model.compile(optimizer=opt, loss=loss, metrics=['AUC'])
        return model

    model_path = "/kaggle/input/model-data-pipeline-v18/model.h5"
    model = tf.keras.models.load_model(model_path)

    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # optimizers reject; it also matches the training branch above.
    opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=0.05, name='binary_crossentropy')
    model.compile(optimizer=opt, loss=loss, metrics=['accuracy', tf.keras.metrics.AUC()])
    return model


### Callbacks: Learning Rate scheduler, Early Stopping Callback and Best Model Save

In [None]:
def get_lr_callback(batch_size):
    """Create the per-epoch learning-rate schedule callback.

    The rate ramps linearly from ``lr_start`` to ``lr_max`` over the first
    ``lr_ramp_ep`` epochs, is held at ``lr_max`` for ``lr_sus_ep`` epochs, and
    then decays exponentially towards ``lr_min``. ``lr_max`` scales with the
    number of replicas of the global strategy.

    Args:
        batch_size: Accepted but not used by the schedule.

    Returns:
        A ``tf.keras.callbacks.LearningRateScheduler``.
    """
    lr_start = 0.000005
    lr_max = 0.000020 * strategy.num_replicas_in_sync
    lr_min = 0.000001
    lr_ramp_ep = 5
    lr_sus_ep = 0
    lr_decay = 0.8

    def lrfn(epoch):
        # Warm-up phase.
        if epoch < lr_ramp_ep:
            return (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        # Plateau phase (empty while lr_sus_ep is 0).
        if epoch < lr_ramp_ep + lr_sus_ep:
            return lr_max
        # Exponential decay phase.
        return (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)


# Shared callbacks: the learning-rate schedule, and early stopping after
# 3 epochs without improvement in validation AUC.
lr = get_lr_callback(BATCH_SIZE)

es = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=3,
    verbose=1,
)


In [None]:
# Parsed test records, the list of their image names, and the un-augmented
# test images.
dataset_test = tf.data.TFRecordDataset(tfrec_files_test)
dataset_test = dataset_test.map(parse_rec_test, num_parallel_calls=AUTO)
img_name_list_test = [
    name
    for name in dataset_test.map(get_img_name, num_parallel_calls=AUTO).as_numpy_iterator()
]
dataset_test_raw = dataset_test.map(get_img, num_parallel_calls=AUTO)

In [None]:
def count_data_items(filenames):
    """Sum the record counts encoded in TFRecord file names.

    Each name is expected to contain ``-<count>.`` (for example
    ``train00-2071.tfrec``); the digits between the dash and the dot are
    read as that file's record count.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for filename in filenames:
        counts.append(int(count_pattern.search(filename).group(1)))
    return np.sum(counts)

In [None]:
# Per-fold test predictions aggregated over the TTA passes: mean, min and max.
arr_test_pred = []
arr_test_pred_min = []
arr_test_pred_max = []

if BLN_TRAIN_MODEL:
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(kf.split(range(len(tfrec_files_train_all)))):
        print(train_idx, val_idx)

        # Training files: this fold's competition shards plus external shards
        # picked with the same indices (the "2019" list from fold 3 onwards,
        # the "2018" list before that).
        tfrec_files_train = np.array(tfrec_files_train_all)[list(train_idx)]
        if fold >= 3:
            additional_files = np.array(tfrec_files_train_2019)[list(train_idx)]
        else:
            additional_files = np.array(tfrec_files_train_2018)[list(train_idx)]
        tfrec_files_train = np.concatenate([tfrec_files_train, additional_files])
        np.random.shuffle(tfrec_files_train)
        tfrec_files_valid = np.array(tfrec_files_train_all)[list(val_idx)]
        print("num_steps:", count_data_items(tfrec_files_train)/BATCH_SIZE)

        temp_dataset = (
            tf.data.TFRecordDataset(tfrec_files_train)
            .shuffle(1024*8)
            .map(parse_rec_train, num_parallel_calls=AUTO)
            .map(get_img_label, num_parallel_calls=AUTO)
        )
        temp_dataset = temp_dataset.map(aug_img_label, num_parallel_calls=AUTO)
        dataset_train = temp_dataset.batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)
        dataset_valid = (
            tf.data.TFRecordDataset(tfrec_files_valid)
            .map(parse_rec_train, num_parallel_calls=AUTO)
            .map(get_img_label, num_parallel_calls=AUTO)
            .batch(BATCH_SIZE, drop_remainder=True)
            .prefetch(tf.data.experimental.AUTOTUNE)
        )

        K.clear_session()
        with strategy.scope():
            model = build_model(fold)

        # Keep only the weights of the epoch with the best validation AUC.
        sv = tf.keras.callbacks.ModelCheckpoint('fold-%i.h5'%fold, monitor='val_auc', verbose=0, save_best_only=True,
            save_weights_only=True, mode='max', save_freq='epoch')
        model.fit(dataset_train, epochs=EPOCHS, verbose=1, callbacks=[sv, lr, es], validation_data=dataset_valid)
        model.load_weights('fold-%i.h5'%fold)

        # Test-time augmentation: TTA independent augmented passes over the test set.
        arr_pred = []
        for i in range(TTA):
            dataset_test = dataset_test_raw.map(aug_img, num_parallel_calls=AUTO).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
            arr_pred.append(model.predict(dataset_test, verbose=1))

        # Stack once and reuse; min/max are now recorded here too, so both
        # branches fill the same three lists.
        stacked_pred = np.stack(arr_pred, axis=1)
        arr_test_pred.append(stacked_pred.mean(axis=1))
        arr_test_pred_min.append(stacked_pred.min(axis=1))
        arr_test_pred_max.append(stacked_pred.max(axis=1))

        filename = 'test_pred-%i.pkl'%fold
        # Context manager closes the file even if pickling fails.
        with open(filename, 'wb') as outfile:
            pickle.dump(arr_pred, outfile)
else:
    for fold in range(KFOLD_SPLITS):
        filename = '../input/dsamazinglyfastkernel09402/test_pred-%i.pkl'%fold
        # NOTE(review): pickle.load executes code embedded in the file; only
        # load prediction files that come from a trusted source.
        with open(filename, 'rb') as infile:
            arr_pred = pickle.load(infile)
        stacked_pred = np.stack(arr_pred, axis=1)
        arr_test_pred.append(stacked_pred.mean(axis=1))
        arr_test_pred_min.append(stacked_pred.min(axis=1))
        arr_test_pred_max.append(stacked_pred.max(axis=1))
            

### Make predictions for test data and submit

In [None]:


# Submissions from earlier kernels that take part in the ensemble.
file_1, file_2, file_3, file_4, file_5, file_6 = (
    pd.read_csv(path)
    for path in (
        "../input/melanoma-submission-9402/submission.csv",  # this is the model with eb6 and data from 2019 in kfold on 384 size
        "../input/amazingly-fast-kernel/submission.csv",  # this is combined model on 384 b0,b4, b6, data just current year
        "../input/amazingly-fast-kernel-256/submission.csv",
        "../input/amazingly-fast-kernel-512/submission.csv",
        "../input/amazingly-fast-kernel-192/submission.csv",
        "../input/amazingly-fast-kernel-128/submission.csv",
    )
)

# Image names in test-dataset order; every submission is aligned to this order.
df = pd.DataFrame({"image_name": img_name_list_test})
df["image_name"] = df["image_name"].map(lambda x: x.decode("utf-8"))

file_1, file_2, file_3, file_4, file_5, file_6 = (
    pd.merge(df, submission, on="image_name")
    for submission in (file_1, file_2, file_3, file_4, file_5, file_6)
)

# Column order matters: later cells select the ensemble members by position.
for member_no, submission in enumerate((file_1, file_2, file_3, file_4), start=1):
    df["target_%i" % member_no] = list(submission["target"].values)
for fold_no in range(5):
    df["target_%i" % (fold_no + 5)] = list(arr_test_pred[fold_no][:, 0])
df["target_10"] = list(file_5["target"].values)
df["target_11"] = list(file_6["target"].values)



In [None]:
# Submission of a separate model (presumably metadata-based, judging by the
# dataset name); only read by the disabled blending cell further down.
meta = pd.read_csv("../input/meta-data-model/submission.csv")

In [None]:
# Columns 1..11 are the eleven ensemble members (target_1 .. target_11).
member_preds = df.iloc[:, 1:12]
df["target"] = member_preds.mean(axis=1)
df["target_min"] = member_preds.min(axis=1)
df["target_max"] = member_preds.max(axis=1)

In [None]:
# Show the first row to sanity-check the assembled ensemble columns.
df.head(1)

In [None]:
# Disabled experiment: where columns 1..9 all exceed cutoff_lo use the row
# maximum, where they all fall below cutoff_hi use the row minimum, otherwise
# keep the mean.
if False:
    cutoff_lo = 0.88
    cutoff_hi = 0.11
    votes = df.iloc[:, 1:10]
    all_above = np.all(votes > cutoff_lo, axis=1)
    all_below = np.all(votes < cutoff_hi, axis=1)
    otherwise = np.where(all_below, df['target_min'], df['target'])
    df['target'] = np.where(all_above, df['target_max'], otherwise)

In [None]:
# Disabled experiment: blend the ensemble with the `meta` submission (90% / 10%).
if False:
    def _blend(row):
        return 0.9*row["target_model"] + 0.1*row["target_meta"]

    meta = meta[["image_name", "target"]].rename(columns={"target": "target_meta"})
    df = df[["image_name", "target"]].rename(columns={"target": "target_model"})
    df = pd.merge(
        df[["image_name", "target_model"]],
        meta[["image_name", "target_meta"]],
        on="image_name",
    )
    df["target"] = df.apply(_blend, axis=1)

In [None]:
#df["target"] = df.apply(lambda row: row["target_model"] if (row["target"]<0.11 and row["target_model"] < row["target"]) else row["target"], axis=1)
#df["target"] = df.apply(lambda row: row["target_model"] if (row["target"]>0.66 and row["target_model"] > row["target"]) else row["target"], axis=1)

In [None]:

# Write the final submission: one row per test image with its ensembled target.
df[["image_name","target"]].to_csv("submission.csv", index=False)

In [None]:
# Show the five highest-scored test images and the distribution of the final target.
print(df.sort_values("target").tail(5))
# distplot is deprecated and scheduled for removal; use its documented
# replacement when this seaborn version has it, and fall back otherwise.
if hasattr(sns, "histplot"):
    sns.histplot(df["target"], kde=True, stat="density")
else:
    sns.distplot(df["target"])