TODOS:
* Kfold
* Different loss functions
* Different encoding for metadata


Based on: https://www.kaggle.com/agentauers/incredible-tpus-finetune-effnetb0-b6-at-once

In [None]:
# Preferred accelerator; the strategy-setup cell falls back to "GPU" when no
# TPU can be resolved.
DEVICE = "TPU"

# Central experiment configuration, passed around as `cfg`.
CFG = dict(
    # Number of EfficientNet variants built by get_model (B0 .. B{net_count-1}).
    net_count         =   7,
    # Per-replica batch size; get_dataset multiplies it by REPLICAS.
    batch_size        =  16,
    
    # Images are resized to read_size after decoding, then cropped to crop_size.
    read_size         = 224, 
    crop_size         = 224, 
    # Index 0 sets the image size produced by prepare_data and the model's
    # image input; get_model also reads entry i for backbone i.
    net_size          = [224, 240, 260, 300, 380, 456, 528, 600],
    
    # Learning-rate schedule parameters consumed by get_lr_callback.
    LR_START          =   0.000005,
    LR_MAX            =   0.000020,
    LR_MIN            =   0.000001,
    LR_RAMPUP_EPOCHS  =   5,
    LR_SUSTAIN_EPOCHS =   0,
    LR_EXP_DECAY      =   0.8,
    epochs            =  20,
    
    # Augmentation strengths consumed by transform(): rot/shr/hshift/wshift
    # scale a standard-normal draw, hzoom/wzoom divide one.
    rot               = 180.0,
    shr               =   2.0,
    hzoom             =   8.0,
    wzoom             =   8.0,
    hshift            =   8.0,
    wshift            =   8.0,

    # NOTE(review): neither key is read by the code visible in this notebook
    # (compile_new_model hard-codes Adam and a focal loss) - confirm before
    # relying on them.
    optimizer         = 'adam',
    label_smooth_fac  =   0.05,
    
    # Number of augmented passes over the test set averaged at prediction time.
    tta_steps         =  25    
)

In [None]:
!pip install -q efficientnet

In [None]:
!pip install missingpy

In [None]:
import os, random, re, math, time
random.seed(a=42)

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow.keras.backend as K
import efficientnet.tfkeras as efn

import PIL

from kaggle_datasets import KaggleDatasets

from tqdm import tqdm
from tensorflow.keras.callbacks import ModelCheckpoint

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.utils import class_weight

from missingpy import MissForest
import tensorflow_addons as tfa

In [None]:
# Competition CSVs: training metadata, test metadata and the submission template.
BASEPATH = "../input/siim-isic-melanoma-classification"
df_train, df_test, df_sub = (
    pd.read_csv(os.path.join(BASEPATH, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# TFRecord shards of the 'melanoma-256x256' dataset, listed in sorted order.
GCS_PATH = KaggleDatasets().get_gcs_path('melanoma-256x256')
files_train, files_test = (
    np.sort(np.array(tf.io.gfile.glob(GCS_PATH + shard_pattern)))
    for shard_pattern in ('/train*.tfrec', '/test*.tfrec')
)

# train on images

In [None]:
# Select the tf.distribute strategy. The TPU is tried first; whenever it cannot
# be resolved or initialized we fall back to the default strategy so that
# `strategy` is always defined by the end of this cell.
if DEVICE == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        print("Could not connect to TPU")
        tpu = None

    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.experimental.TPUStrategy(tpu)
            print("TPU initialized")
        except Exception as err:
            # The original `except _:` referenced an undefined name, so any
            # initialization failure surfaced as a NameError instead of being
            # handled. Report the real error and fall back, otherwise
            # `strategy` would stay undefined below.
            print("failed to initialize TPU")
            print(repr(err))
            DEVICE = "GPU"
    else:
        DEVICE = "GPU"

if DEVICE != "TPU":
    print("Using default strategy for CPU and single GPU")
    strategy = tf.distribute.get_strategy()

if DEVICE == "GPU":
    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


AUTO     = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

In [None]:
def get_mat(rotation, shear, height_zoom, width_zoom, height_shift, width_shift):
    """Compose a 3x3 homogeneous matrix that transforms pixel indices.

    `rotation` and `shear` are given in degrees and converted to radians
    here. The result is (rotation @ shear) @ (zoom @ shift).
    The caller in this file (`transform`) passes shape-[1] float32 tensors
    for every argument.
    """
    def as_3x3(entries):
        # Nine shape-[1] tensors, row-major -> one [3, 3] tensor.
        return tf.reshape(tf.concat([entries], axis=0), [3, 3])

    one = tf.constant([1], dtype='float32')
    zero = tf.constant([0], dtype='float32')

    # Degrees -> radians.
    rotation_rad = math.pi * rotation / 180.
    shear_rad = math.pi * shear / 180.

    cos_rot = tf.math.cos(rotation_rad)
    sin_rot = tf.math.sin(rotation_rad)
    rotate = as_3x3([
        cos_rot,  sin_rot, zero,
        -sin_rot, cos_rot, zero,
        zero,     zero,    one,
    ])

    cos_shear = tf.math.cos(shear_rad)
    sin_shear = tf.math.sin(shear_rad)
    shear_mat = as_3x3([
        one,  sin_shear, zero,
        zero, cos_shear, zero,
        zero, zero,      one,
    ])

    zoom = as_3x3([
        one / height_zoom, zero,             zero,
        zero,              one / width_zoom, zero,
        zero,              zero,             one,
    ])

    shift = as_3x3([
        one,  zero, height_shift,
        zero, one,  width_shift,
        zero, zero, one,
    ])

    rotate_then_shear = K.dot(rotate, shear_mat)
    zoom_then_shift = K.dot(zoom, shift)
    return K.dot(rotate_then_shear, zoom_then_shift)


def transform(image, cfg):    
    """Return `image` after one random rotation, shear, zoom and shift.

    `image` is a single [DIM, DIM, 3] image with DIM = cfg["read_size"].
    Every transform parameter is a standard-normal draw: rot/shr/hshift/wshift
    are scaled by their cfg entry, and the zoom factors are
    1 + N(0, 1) / cfg['hzoom' | 'wzoom'].
    The output is built by mapping each destination pixel index through the
    matrix from get_mat and gathering the source pixel found there; source
    indices are cast to int32, so no interpolation is performed.
    """
    # input image - is one image of size [dim,dim,3] not a batch of [b,dim,dim,3]
    # output - image randomly rotated, sheared, zoomed, and shifted
    DIM = cfg["read_size"]
    XDIM = DIM%2 #fix for size 331
    
    # One independent random draw per transform parameter.
    rot = cfg['rot'] * tf.random.normal([1], dtype='float32')
    shr = cfg['shr'] * tf.random.normal([1], dtype='float32') 
    h_zoom = 1.0 + tf.random.normal([1], dtype='float32') / cfg['hzoom']
    w_zoom = 1.0 + tf.random.normal([1], dtype='float32') / cfg['wzoom']
    h_shift = cfg['hshift'] * tf.random.normal([1], dtype='float32') 
    w_shift = cfg['wshift'] * tf.random.normal([1], dtype='float32') 

    # GET TRANSFORMATION MATRIX
    m = get_mat(rot,shr,h_zoom,w_zoom,h_shift,w_shift) 

    # LIST DESTINATION PIXEL INDICES
    # Coordinates are centred on the image; z = 1 makes them homogeneous so the
    # shift column of the matrix takes effect.
    x   = tf.repeat(tf.range(DIM//2, -DIM//2,-1), DIM)
    y   = tf.tile(tf.range(-DIM//2, DIM//2), [DIM])
    z   = tf.ones([DIM*DIM], dtype='int32')
    idx = tf.stack( [x,y,z] )
    
    # ROTATE DESTINATION PIXELS ONTO ORIGIN PIXELS
    idx2 = K.dot(m, tf.cast(idx, dtype='float32'))
    idx2 = K.cast(idx2, dtype='int32')
    # Clip so the gather indices computed below stay inside the image; XDIM
    # shifts the lower bound for odd sizes.
    idx2 = K.clip(idx2, -DIM//2+XDIM+1, DIM//2)
    
    # FIND ORIGIN PIXEL VALUES           
    # Convert centred coordinates back to array (row, column) indices.
    idx3 = tf.stack([DIM//2-idx2[0,], DIM//2-1+idx2[1,]])
    d    = tf.gather_nd(image, tf.transpose(idx3))
        
    return tf.reshape(d,[DIM, DIM,3])

In [None]:
def read_labeled_tfrecord(example):
    """Parse one serialized training record.

    Returns `(image_bytes, (sex, age_approx, anatom_site), target)`; the
    remaining parsed fields (image_name, patient_id, diagnosis) are dropped.
    """
    scalar_bytes = tf.io.FixedLenFeature([], tf.string)
    scalar_int64 = tf.io.FixedLenFeature([], tf.int64)
    schema = {
        'image': scalar_bytes,
        'image_name': scalar_bytes,
        'patient_id': scalar_int64,
        'sex': scalar_int64,
        'age_approx': scalar_int64,
        'anatom_site_general_challenge': scalar_int64,
        'diagnosis': scalar_int64,
        'target': scalar_int64,
    }
    parsed = tf.io.parse_single_example(example, schema)
    metadata = (
        parsed['sex'],
        parsed['age_approx'],
        parsed['anatom_site_general_challenge'],
    )
    return (parsed['image'], metadata, parsed['target'])

def read_unlabeled_tfrecord(example, return_image_name):
    """Parse one serialized test record (no target / diagnosis fields).

    Returns `(image_bytes, (sex, age_approx, anatom_site), tag)` where `tag`
    is the record's image_name when `return_image_name` is truthy and the
    constant 0 otherwise.
    """
    scalar_bytes = tf.io.FixedLenFeature([], tf.string)
    scalar_int64 = tf.io.FixedLenFeature([], tf.int64)
    schema = {
        'image': scalar_bytes,
        'image_name': scalar_bytes,
        'patient_id': scalar_int64,
        'sex': scalar_int64,
        'age_approx': scalar_int64,
        'anatom_site_general_challenge': scalar_int64,
    }
    parsed = tf.io.parse_single_example(example, schema)
    metadata = (
        parsed['sex'],
        parsed['age_approx'],
        parsed['anatom_site_general_challenge'],
    )
    if return_image_name:
        tag = parsed['image_name']
    else:
        tag = 0
    return (parsed['image'], metadata, tag)

 
def prepare_image(img, cfg=None, augment=True):
    """Decode a JPEG string and return a float32 image scaled by 1/255.

    The image is resized to cfg['read_size']. With `augment` it then goes
    through the random geometric transform, a random crop to
    cfg['crop_size'], a random horizontal flip and random hue / saturation /
    contrast / brightness jitter; without it, only a central crop is taken.
    `cfg` must be a mapping despite the `None` default.
    """
    read_size = cfg['read_size']
    crop_size = cfg['crop_size']

    decoded = tf.image.decode_jpeg(img, channels=3)
    resized = tf.image.resize(decoded, [read_size, read_size])
    scaled = tf.cast(resized, tf.float32) / 255.0

    if not augment:
        return tf.image.central_crop(scaled, crop_size / read_size)

    augmented = transform(scaled, cfg)
    augmented = tf.image.random_crop(augmented, [crop_size, crop_size, 3])
    augmented = tf.image.random_flip_left_right(augmented)
    augmented = tf.image.random_hue(augmented, 0.01)
    augmented = tf.image.random_saturation(augmented, 0.7, 1.3)
    augmented = tf.image.random_contrast(augmented, 0.8, 1.2)
    augmented = tf.image.random_brightness(augmented, 0.1)
    return augmented

def prepare_data(data, cfg=None, augment=True):
    """Turn one parsed record into the `(image, dense)` model inputs.

    Args:
        data: `(image_bytes, (sex, age_approx, anatom_site))` as produced by
            the TFRecord readers.
        cfg: configuration mapping; must not be None despite the default.
        augment: forwarded to `prepare_image`.

    Returns:
        `(img, dense)`: `img` is the prepared image resized to
        cfg['net_size'][0]; `dense` concatenates a 2-way one-hot of sex, the
        raw (un-normalised) age and a 7-way one-hot of the anatomical site,
        i.e. 2 + 1 + 7 = 10 values.
    """
    # Decoding / augmentation is shared with prepare_image instead of being
    # duplicated here line by line.
    img = prepare_image(data[0], cfg=cfg, augment=augment)

    net_size = cfg['net_size'][0]
    img = tf.image.resize(img, [net_size, net_size])
    # Pin the static shape so downstream layers see fixed dimensions.
    img = tf.reshape(img, [net_size, net_size, 3])

    sex, age_approx, anatom_site = data[1][0], data[1][1], data[1][2]
    sex_oh = tf.one_hot(sex, 2)
    age_aprox = tf.dtypes.cast(tf.reshape(age_approx, [1]), tf.float32)
    anatom_site_oh = tf.one_hot(anatom_site, 7)
    dense = tf.concat([sex_oh, age_aprox, anatom_site_oh], axis=0)
    return (img, dense)

def count_data_items(filenames):
    """Return the total record count encoded in TFRecord shard names.

    Each name must contain a ``-<count>.`` fragment (for example
    ``train00-2071.tfrec``); the counts of all shards are summed.

    Raises:
        ValueError: if a name does not contain a readable count.
    """
    # Compiled once rather than once per filename.
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for filename in filenames:
        match = count_pattern.search(filename)
        if match is None or not match.group(1):
            raise ValueError(
                f"cannot read the record count from shard name {filename!r}")
        counts.append(int(match.group(1)))
    return np.sum(counts)

In [None]:
def get_dataset(files, cfg, augment = False, shuffle = False, repeat = False, 
                labeled=True, return_image_names=True):
    """Build the batched tf.data pipeline for a list of TFRecord files.

    Elements are `((image, dense), label_or_name)`: the second item is the
    target for labeled data, otherwise the image name or the constant 0
    depending on `return_image_names`. Batches hold
    cfg['batch_size'] * REPLICAS elements.
    """
    # Cache the raw serialized records before any repeat / shuffle / parsing.
    records = tf.data.TFRecordDataset(files, num_parallel_reads=AUTO).cache()

    if repeat:
        records = records.repeat()

    if shuffle:
        options = tf.data.Options()
        options.experimental_deterministic = False
        records = records.shuffle(1024*8).with_options(options)

    def parse(example):
        if labeled:
            return read_labeled_tfrecord(example)
        return read_unlabeled_tfrecord(example, return_image_names)

    def preprocess(img, dense, imgname_or_label):
        inputs = prepare_data((img, dense), augment=augment, cfg=cfg)
        return (inputs, imgname_or_label)

    prepared = records.map(parse, num_parallel_calls=AUTO)
    prepared = prepared.map(preprocess, num_parallel_calls=AUTO)

    batched = prepared.batch(cfg['batch_size'] * REPLICAS)
    return batched.prefetch(AUTO)

## Test the input pipeline
Before calling any neural net I always test the input pipeline.
Here are images from the train data.

In [None]:
def show_dataset(thumb_size, cols, rows, ds):
    """Display the images of `ds` as a `cols` x `rows` mosaic of thumbnails.

    `ds` must yield `((image, dense), label_or_name)` elements with image
    values in [0, 1]; thumbnails are separated by a one-pixel gap.
    """
    canvas_size = (thumb_size*cols + (cols-1), thumb_size*rows + (rows-1))
    mosaic = PIL.Image.new(mode='RGB', size=canvas_size)

    for position, sample in enumerate(ds):
        row, col = divmod(position, cols)
        pixels = np.clip(sample[0][0].numpy() * 255, 0, 255).astype(np.uint8)
        thumb = PIL.Image.fromarray(pixels).resize(
            (thumb_size, thumb_size), resample=PIL.Image.BILINEAR)
        # The extra `col` / `row` offset leaves the one-pixel separator.
        mosaic.paste(thumb, (col*thumb_size + col, row*thumb_size + row))

    display(mosaic)
    
# Preview the first 60 un-augmented training images as a 12 x 5 mosaic.
ds = get_dataset(files_train, CFG).unbatch().take(12*5)   
show_dataset(64, 12, 5, ds)

## Test of image augmentation

In [None]:
# Take a single training record and repeat it, so every thumbnail below shows
# the same source image under a different random augmentation.
ds = tf.data.TFRecordDataset(files_train, num_parallel_reads=AUTO)
ds = ds.take(1).cache().repeat()
ds = ds.map(read_labeled_tfrecord, num_parallel_calls=AUTO)
# ds = ds.map(lambda img, target: (prepare_image(img, cfg=CFG, augment=True), target), 
#             num_parallel_calls=AUTO)
ds = ds.map(lambda img, dense, target: (prepare_data((img, dense), cfg=CFG, augment=True), target), 
            num_parallel_calls=AUTO)
ds = ds.take(12*5)
ds = ds.prefetch(AUTO)

show_dataset(64, 12, 5, ds)

## Images from the test data

In [None]:
# Preview 60 augmented test images; with return_image_names=False the second
# element of each sample is the constant 0 and is ignored by show_dataset.
ds = get_dataset(files_test, CFG, augment=True, repeat=True, 
                         labeled=False, return_image_names=False).unbatch().take(12*5)   
show_dataset(64, 12, 5, ds)

In [None]:
def get_lr_callback(cfg):
    """Return a LearningRateScheduler with ramp-up, plateau and exponential decay.

    The rate rises linearly from LR_START to LR_MAX (scaled by the replica
    count) over LR_RAMPUP_EPOCHS, stays there for LR_SUSTAIN_EPOCHS, then
    decays towards LR_MIN by a factor of LR_EXP_DECAY per epoch.
    """
    start_lr = cfg['LR_START']
    peak_lr = cfg['LR_MAX'] * strategy.num_replicas_in_sync
    floor_lr = cfg['LR_MIN']
    ramp_epochs = cfg['LR_RAMPUP_EPOCHS']
    sustain_epochs = cfg['LR_SUSTAIN_EPOCHS']
    decay = cfg['LR_EXP_DECAY']

    def lrfn(epoch):
        if epoch < ramp_epochs:
            return (peak_lr - start_lr) / ramp_epochs * epoch + start_lr
        if epoch < ramp_epochs + sustain_epochs:
            return peak_lr
        return (peak_lr - floor_lr) * decay**(epoch - ramp_epochs - sustain_epochs) + floor_lr

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

In [None]:
def get_model(cfg):
    """Build the multi-head network: EfficientNet backbones plus a metadata MLP.

    Inputs are an image of side cfg['net_size'][0] ('imgIn') and a
    10-dimensional metadata vector ('dataIn'). The metadata goes through a
    shared two-layer MLP. For each of the cfg['net_count'] EfficientNet
    variants (B0, B1, ...), the pooled image features are concatenated with
    the MLP output and fed to a small head ending in one sigmoid unit.

    Returns:
        A Keras model with cfg['net_count'] outputs, one per backbone. The
        model summary is printed as a side effect.
    """
    # Every backbone receives the same image tensor, so each one is declared
    # with that tensor's size. The original declared
    # input_shape=cfg['net_size'][i], which disagrees with the tensor actually
    # passed in and is reported as an incompatible-shape warning or error,
    # depending on the Keras version.
    img_size = cfg['net_size'][0]
    model_input = tf.keras.Input(shape=(img_size, img_size, 3), name='imgIn')
    model_input_data = tf.keras.Input(shape=(10, ), name='dataIn')

    # Identity layer shared as the common entry point of all backbones.
    dummy = tf.keras.layers.Lambda(lambda x:x)(model_input)

    # Shared metadata branch.
    dense = tf.keras.layers.Dense(1024, activation='relu')(model_input_data)
    dense = tf.keras.layers.BatchNormalization()(dense)
    dense = tf.keras.layers.Dropout(0.5)(dense)

    dense = tf.keras.layers.Dense(256, activation='relu')(dense)
    dense = tf.keras.layers.BatchNormalization()(dense)
    dense = tf.keras.layers.Dropout(0.5)(dense)

    outputs = []
    for i in range(cfg['net_count']):
        constructor = getattr(efn, f'EfficientNetB{i}')

        x = constructor(include_top=False, weights='imagenet',
                        input_shape=(img_size, img_size, 3),
                        pooling='avg')(dummy)

        x = tf.keras.layers.Concatenate()([x, dense])

        x = tf.keras.layers.Dense(256, activation='relu')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Dropout(0.5)(x)

        x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
        outputs.append(x)

    model = tf.keras.Model([model_input, model_input_data], outputs, name='aNetwork')
    model.summary()
    return model

In [None]:
def binary_focal_loss(gamma=2., alpha=.25):
    """Create a binary focal loss function.

    FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t), where p_t is the predicted
    probability for positive labels and one minus it for negative labels;
    the negative term is weighted by (1 - alpha).

    Reference: https://arxiv.org/pdf/1708.02002.pdf

    Usage:
        model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)],
                      metrics=["accuracy"], optimizer=adam)
    """
    def binary_focal_loss_fixed(y_true, y_pred):
        """Return the summed focal loss.

        :param y_true: tensor of 0/1 labels with the same shape as `y_pred`
        :param y_pred: tensor of probabilities (sigmoid outputs)
        """
        eps = K.epsilon()
        is_positive = tf.equal(y_true, 1)
        is_negative = tf.equal(y_true, 0)

        # Entries of the other class get a neutral value (1 or 0) so that,
        # up to clipping, they contribute nothing to their term.
        p_positive = tf.where(is_positive, y_pred, tf.ones_like(y_pred))
        p_negative = tf.where(is_negative, y_pred, tf.zeros_like(y_pred))

        # Clip to keep the logarithms finite.
        p_positive = K.clip(p_positive, eps, 1. - eps)
        p_negative = K.clip(p_negative, eps, 1. - eps)

        positive_term = K.sum(alpha * K.pow(1. - p_positive, gamma) * K.log(p_positive))
        negative_term = K.sum((1 - alpha) * K.pow(p_negative, gamma) * K.log(1. - p_negative))
        return -positive_term - negative_term

    return binary_focal_loss_fixed

In [None]:
def compile_new_model(cfg, old_model=None):
    """Build (or reuse) the model and compile it inside the strategy scope.

    Args:
        cfg: configuration mapping; `net_count` sets the number of losses.
        old_model: an existing model to recompile, e.g. after changing
            `layer.trainable`; a fresh model is built when it is None.

    Returns:
        The compiled model, using one sigmoid focal loss per output, Adam and
        an AUC metric.
    """
    with strategy.scope():
        model = get_model(cfg) if old_model is None else old_model

        # One focal loss per network head.
        losses = [tfa.losses.SigmoidFocalCrossEntropy(gamma=2.0, alpha=0.80)
                  for _ in range(cfg['net_count'])]

        # The ExponentialDecay schedule that used to be built here was never
        # handed to the optimizer, so it has been removed; the effective rate
        # is unchanged.
        # NOTE(review): 0.1 is what the optimizer uses unless a
        # LearningRateScheduler callback overrides it during fit - confirm
        # that this is intended for the runs without the callback.
        model.compile(
            optimizer = tf.keras.optimizers.Adam(learning_rate=0.1),
            loss      = losses,
            metrics   = [tf.keras.metrics.AUC(name='auc')])

    return model

In [None]:
from sklearn.utils import class_weight

# Balanced per-class weights for the image model, keyed by class index.
train = pd.read_csv('../input/siim-isic-melanoma-classification/train.csv')
# `classes` and `y` are passed by keyword: newer scikit-learn releases reject
# them as positional arguments, and older releases accept the keywords too.
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(train.target),
                                                  y=train.target)
class_weights = dict(enumerate(class_weights))

In [None]:
class_weights

In [None]:
# Destination for the best-weights checkpoint. In the code visible here it is
# only referenced by the disabled ModelCheckpoint below.
file_path="../input/output/effnet_comb_weights.best.hdf5"

#checkpoint = ModelCheckpoint(file_path, monitor='auc', verbose=1, save_best_only=True, mode='max')

In [None]:
# Endless, shuffled, augmented training stream; an epoch is therefore defined
# by steps_train rather than by the dataset running out.
ds_train     = get_dataset(files_train, CFG, augment=True, shuffle=True, repeat=True)
#ds_train     = ds_train.map(lambda img, dense, label: (img, dense, tuple([label] * CFG['net_count'])))

# Number of global batches per pass over the training records (may be fractional).
steps_train  = count_data_items(files_train) / (CFG['batch_size'] * REPLICAS)

## train dense first

In [None]:
# Build and compile a fresh model inside the distribution strategy scope.
print("Buidling model...")
model = compile_new_model(CFG)

In [None]:
# Stage 1: freeze every EfficientNet backbone and train only the dense layers.
print("Disabling effnets for training...")
for backbone_name in [f'efficientnet-b{i}' for i in range(CFG['net_count'])]:
    backbone = next(
        (layer for layer in model.layers if layer.name == backbone_name), None)
    if backbone is not None:
        backbone.trainable = False

# Changing `trainable` only takes effect after recompiling.
print("Rebuilding model")
model = compile_new_model(CFG, model)

history = model.fit(
    ds_train,
    verbose=1,
    steps_per_epoch=steps_train,
    epochs=5,
    class_weight=class_weights,
)

In [None]:
# Stage 2: unfreeze the EfficientNet backbones and fine-tune the whole model.
# The message previously said "Disabling", which contradicted the code below.
print("Enabling effnets for training...")
for i in range(CFG['net_count']):
    for layer in model.layers:
        if layer.name == f'efficientnet-b{i}':
            layer.trainable = True
            break

# Changing `trainable` only takes effect after recompiling.
model = compile_new_model(CFG, model)

history_complete      = model.fit(ds_train,
                         verbose          = 1,
                         steps_per_epoch  = steps_train,
                         epochs           = CFG['epochs'],
                         callbacks        = [get_lr_callback(CFG)],
                         class_weight     = class_weights)

In [None]:
# model        = compile_new_model(CFG, recompile=False, old_model=model)
# history      = model.fit(ds_train, 
#                          verbose          = 1,
#                          steps_per_epoch  = steps_train, 
#                          epochs           = CFG['epochs'],
#                          callbacks        = [get_lr_callback(CFG), checkpoint],
#                         class_weight=class_weights)

# Dense data

In [None]:
df_train.head()

In [None]:
df_train.sex.isna().sum()

In [None]:
df_train.anatom_site_general_challenge.isna().sum()

In [None]:
df_train.age_approx.isna().sum()

In [None]:
# Convert the categorical metadata columns and store their integer codes in
# matching `<column>_cat` columns.
for col in ('sex', 'anatom_site_general_challenge'):
    df_train[col] = pd.Categorical(df_train[col])
    df_train[f'{col}_cat'] = df_train[col].cat.codes

In [None]:
# Fill missing values in the three metadata features with MissForest.
# NOTE(review): pandas encodes a missing category as code -1 rather than NaN,
# so presumably only age_approx is actually imputed here - confirm whether the
# -1 codes should be converted to NaN first.
imputer = MissForest()
df_train[['sex_cat', 'age_approx', 'anatom_site_general_challenge_cat']] = imputer.fit_transform(df_train[['sex_cat', 'age_approx', 'anatom_site_general_challenge_cat']])

In [None]:
print("Sex nan:", df_train.sex_cat.isna().sum())
print("anatom_site_general_challenge nan:", df_train.anatom_site_general_challenge_cat.isna().sum())
print("age_approx nan:", df_train.age_approx.isna().sum())

In [None]:
# Hold out 20% of the metadata rows for validation. The split is seeded and
# stratified on the target so the reported AUC values are reproducible and
# both parts keep the same share of positive cases.
train, test = train_test_split(df_train, test_size=0.2, random_state=42,
                               stratify=df_train.target)

print(len(train))
print(len(test))

X_train, y_train = train[['sex_cat', 'age_approx', 'anatom_site_general_challenge_cat']], train.target
X_test, y_test = test[['sex_cat', 'age_approx', 'anatom_site_general_challenge_cat']], test.target

In [None]:
# Per-sample weights for XGBoost derived from balanced class weights.
classes = np.unique(y_train)
class_weights = list(class_weight.compute_class_weight(class_weight='balanced',
                                                       classes=classes,
                                                       y=y_train))

# Look the weight up by label. The previous `class_weights[val-1]` mapped
# label 0 to index -1 (the last class) and label 1 to index 0, i.e. it swapped
# the two weights.
weight_by_label = dict(zip(classes, class_weights))

weights_train = np.array([weight_by_label[val] for val in y_train], dtype='float')
weights_test = np.array([weight_by_label[val] for val in y_test], dtype='float')

In [None]:
# XGBoost matrices for the metadata train / validation splits, carrying the
# per-sample weights computed above.
train_mat = xgb.DMatrix(X_train, y_train, weight=weights_train)
test_mat = xgb.DMatrix(X_test, y_test, weight=weights_test)

In [None]:
class XGBoostClassifier(xgb.XGBClassifier):
    """Thin scikit-learn style wrapper around `xgb.train` for grid search.

    Booster parameters live in `self.params`; the number of boosting rounds
    is kept separately in `self.num_boost_round` and can be supplied under the
    scikit-learn name `n_estimators`. The objective is always
    'binary:logistic'.
    """

    def __init__(self, num_boost_round=10, **params):
        """Store booster parameters; `n_estimators` overrides `num_boost_round`."""
        self.clf = None
        self.num_boost_round = num_boost_round
        # The original tested for 'num_boost_round', which can never appear in
        # **params because it is a named parameter, and then popped a
        # different key. Test the key that is actually popped.
        if 'n_estimators' in params:
            self.num_boost_round = params.pop('n_estimators')
        self.params = params
        self.params.update({'objective': 'binary:logistic'})

    def fit(self, X, y, num_boost_round=None, weights=None):
        """Train the booster on (X, y) with optional per-sample `weights`.

        Returns self, following the scikit-learn convention.
        """
        num_boost_round = num_boost_round or self.num_boost_round
        # Pass label and weight by keyword so the weights cannot be bound to
        # a different positional parameter of DMatrix (the positional order
        # after `label` has not been the same across xgboost releases).
        dtrain = xgb.DMatrix(X, label=y, weight=weights)
        self.clf = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=num_boost_round)
        return self

    def predict(self, X):
        """Return the booster's raw predictions for X (not hard class labels)."""
        return self.clf.predict(xgb.DMatrix(X))

    def predict_proba(self, X):
        """Return an (n_samples, 2) array of [P(class 0), P(class 1)]."""
        dtest = xgb.DMatrix(X)
        classone_probs = self.clf.predict(dtest)
        classzero_probs = 1.0 - classone_probs
        return np.vstack((classzero_probs, classone_probs)).transpose()

    def get_params(self, deep=True):
        """Return the booster parameter dict (`deep` is ignored)."""
        return self.params

    def set_params(self, **params):
        """Update booster parameters; `n_estimators` sets the round count."""
        if 'n_estimators' in params:
            self.num_boost_round = params.pop('n_estimators')
        self.params.update(params)
        return self

In [None]:
# Hyper-parameter grid: 3 values for each of 5 parameters (243 combinations),
# each evaluated with 3-fold cross-validation on ROC AUC.
params = {
    'max_depth':[3, 5, 10], 
    'learning_rate': [0.5, 0.1, 0.001],
    'n_estimators': [300, 500, 1500],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.01, 0.75, 1]
}
xgbc = XGBoostClassifier(objective='binary:logistic')
clf = GridSearchCV(xgbc, params, scoring='roc_auc', cv=3)

# `weights` is passed through as a fit parameter to XGBoostClassifier.fit.
clf.fit(X_train, y_train, weights=weights_train)

In [None]:
# result of grid search
# Retrain on the full training split with the best grid-search parameters.
# `best_params_` holds only the grid keys, so the objective has to be added
# back (otherwise xgb.train falls back to its default objective), and
# `n_estimators` is the number of boosting rounds, not a booster parameter.
params = dict(clf.best_params_)
num_boost_round = params.pop('n_estimators')
params['objective'] = 'binary:logistic'

bst = xgb.train(params, train_mat, num_boost_round)
roc_auc_score(y_train, bst.predict(train_mat))

In [None]:
roc_auc_score(y_test, bst.predict(test_mat))

### predict the test set using augmented images

In [None]:
# CFG['batch_size'] = 256

# cnt_test   = count_data_items(files_test)
# steps      = cnt_test / (CFG['batch_size'] * REPLICAS) * CFG['tta_steps']
# ds_testAug = get_dataset(files_test, CFG, augment=True, repeat=True, 
#                          labeled=False, return_image_names=False)

# probs = model.predict(ds_testAug, verbose=1, steps=steps)

# probs = np.stack(probs)
# probs = probs[:cnt_test * CFG['tta_steps']]
# probs = np.stack(np.split(probs, CFG['tta_steps'], axis=0), axis=1)
# probs = np.mean(probs, axis=1)

In [None]:
# Test-time augmentation: predict tta_steps augmented passes over the test set
# and average them per image.
# NOTE: this mutates the shared CFG, so later get_dataset calls use batch size 256.
CFG['batch_size'] = 256

cnt_test   = count_data_items(files_test)
# Enough steps to cover the repeated test stream tta_steps times.
steps      = cnt_test / (CFG['batch_size'] * REPLICAS) * CFG['tta_steps']
ds_testAug = get_dataset(files_test, CFG, augment=True, repeat=True, 
                         labeled=False, return_image_names=False)

probs = model.predict(ds_testAug, verbose=1, steps=steps)

# presumably (net_count, n_predictions, 1) after stacking the per-head outputs - TODO confirm
probs = np.stack(probs)
# Drop predictions beyond tta_steps full passes, then split axis 1 into one
# chunk per pass and average over the passes.
probs = probs[:,:cnt_test * CFG['tta_steps']]
probs = np.stack(np.split(probs, CFG['tta_steps'], axis=1), axis=1)
probs = np.mean(probs, axis=1)

### sort predictions to have the same order as the submission
The submission is sorted by image_name, but the dataset yields the images in a different order.
Traverse the test dataset once again and capture the image_names. Then join this list of image_names with the predictions and sort by image_name.

In [None]:
len(probs)

In [None]:
# One un-augmented pass over the test set, collecting only the image names
# (in dataset order).
ds = get_dataset(files_test, CFG, augment=False, repeat=False,
                 labeled=False, return_image_names=True)

image_names = np.array(
    [name.numpy().decode("utf-8") for _, name in ds.unbatch()])

### write a submission file for each submodel

In [None]:
# One submission file per sub-model. The first axis of `probs` indexes the
# sub-models, so row i holds the predictions of model i. The previous
# `probs[:,0]` selected the first image of every model instead, which does not
# match the length of image_names.
for i in range(probs.shape[0]):
    submission = pd.DataFrame(dict(
        image_name = image_names,
        target     = probs[i, :, 0]))

    submission = submission.sort_values('image_name')
    submission.to_csv(f'submission_model_{i}.csv', index=False)

### write a submission file using the mean of all submodels

In [None]:
# Blend: average the sub-model predictions per image, then order by name.
submission = pd.DataFrame({
    'image_name': image_names,
    'target': probs[:, :, 0].mean(axis=0),
})
submission = submission.sort_values(by='image_name')
submission.to_csv('submission_models_blended.csv', index=False)

In [None]:
# Convert the categorical metadata columns of the test set and store their
# integer codes in matching `<column>_cat` columns.
for col in ('sex', 'anatom_site_general_challenge'):
    df_test[col] = pd.Categorical(df_test[col])
    df_test[f'{col}_cat'] = df_test[col].cat.codes

In [None]:
# Fill missing values in the test metadata.
# NOTE(review): a new imputer is fitted on the test set itself instead of
# reusing the one fitted on the training data - confirm this is intended.
imputer = MissForest()
df_test[['sex_cat', 'age_approx', 'anatom_site_general_challenge_cat']] = imputer.fit_transform(df_test[['sex_cat', 'age_approx', 'anatom_site_general_challenge_cat']])

In [None]:
# NOTE: rebinds X_test, which previously held the validation split, to the
# features of the real test set.
X_test = df_test[['sex_cat', 'age_approx', 'anatom_site_general_challenge_cat']]

In [None]:
# Metadata-only predictions from the booster; index by image_name so they can
# be joined with the image-model predictions.
df_test['xgb_pred'] = bst.predict(xgb.DMatrix(X_test))
df_test = df_test.set_index('image_name')

In [None]:
probs.shape

In [None]:
# One row per image, one integer-named column per sub-model, plus the
# xgb_pred column joined on the image_name index.
pred_df = pd.DataFrame(probs[:,:,0]).transpose()
pred_df['image_name'] = image_names
pred_df = pred_df.set_index('image_name')
pred_df = pd.concat([pred_df, df_test['xgb_pred']], axis=1)

In [None]:
df_test[df_test.index=='ISIC_1581247']

In [None]:
pred_df

In [None]:
# Average the sub-model columns, then blend 95% image models with 5% XGBoost.
# The column labels 0..n-1 come from the first axis of `probs`, so they are
# derived from it instead of the hard-coded range(7).
model_cols = list(range(probs.shape[0]))
pred_df['final_prediction'] = np.average(pred_df[model_cols].values, axis=1)
pred_df['final_prediction'] = pred_df['final_prediction']*0.95+pred_df['xgb_pred']*0.05

In [None]:
pred_df

In [None]:
list(range(6))

In [None]:
# Write the final blended submission, ordered by image name.
pred_df = pred_df.sort_index()
pred_df = pred_df.assign(
    target=pred_df['final_prediction'],
    image_name=pred_df.index,
)
pred_df[['image_name', 'target']].to_csv(
    'submission_models_blended_with_dense.csv', index=False)