#  Combining embeddings

In this kernel I train a model that takes as input the [embeddings](https://www.kaggle.com/cdeotte/rapids-cuml-knn-find-duplicates) produced by encoding images of the same size with several different EfficientNets, together with the image metadata. The resulting model has access to the different features that each pretrained network identifies. This approach could be extended to different image sizes encoded with the same EfficientNet (and to all combinations of network and image size).

In principle, the predictions of models trained in this way should be relatively uncorrelated with those of models trained using [this](https://www.kaggle.com/cdeotte/triple-stratified-kfold-with-tfrecords) notebook, which I assume a lot of people used.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.keras.layers as L

from kaggle_datasets import KaggleDatasets

from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold

import random
from functools import reduce

import scipy


# Load the data

In [None]:
# Load the pre-computed embeddings: one array per EfficientNet variant for
# each of the three image sets (2020 train, external train, test).
DIM = 384
EFFN_LIST = [3, 4, 5]
BATCH_SIZE = 128
PATH_TO_EMBEDDINGS = '../input/embeddings-extractor/'

embed, embed_ext, embed_test = [], [], []
for EFFN in EFFN_LIST:
    # Each target list receives the file that matches its image set.
    for bucket, pattern in (
        (embed, 'embed_train_%i_%i.npy'),
        (embed_ext, 'embed_ext_%i_%i.npy'),
        (embed_test, 'embed_test_%i_%i.npy'),
    ):
        bucket.append(np.load(PATH_TO_EMBEDDINGS + pattern % (DIM, EFFN)))

# Image names stored alongside the embeddings; used below to re-order the
# csv rows so that they line up with the embedding rows.
names, names_ext, names_test = (
    np.load(PATH_TO_EMBEDDINGS + fname)
    for fname in ('names_train.npy', 'names_ext.npy', 'names_test.npy')
)

In [None]:
# np.concatenate((oof_df['pred'].loc[names],embed),axis=1)

# LOAD TRAIN AND TEST CSV
# Every table is indexed by image name, re-ordered to follow the order of the
# saved embedding names, and then given a fresh 0..n-1 integer index.
test = (
    pd.read_csv('../input/siim-isic-melanoma-classification/test.csv')
    .set_index('image_name', drop=True)
    .loc[names_test]
    .reset_index()
)
print('Test csv shape', test.shape)

train = (
    pd.read_csv('../input/melanoma-%ix%i/train.csv'%(DIM,DIM))
    .set_index('image_name', drop=True)
    .loc[names]
    .reset_index()
)
train['target'] = train['target'].astype('float32')
print('Train csv shape', train.shape)

train_ext = (
    pd.read_csv('../input/isic2019-384x384/train.csv')
    .set_index('image_name', drop=True)
    .loc[names_ext]
    .reset_index()
)
train_ext['target'] = train_ext['target'].astype('float32')
print('Train 2019 csv shape', train_ext.shape)

print('Displaying train.csv below...')
train.head()

# Concatenate and preprocess the data

In [None]:
# Everything has been ordered in the same way, so now all that needs doing is concatenation
cat_enc = OneHotEncoder(drop='first')
# num_enc = MinMaxScaler()
num_enc = StandardScaler()
# One *independent* scaler per network. `[MinMaxScaler()] * n` would place the
# same object in every slot, so each fit would overwrite the previous one and
# embed_enc[i] could not be reused later for network i.
embed_enc = [MinMaxScaler() for _ in EFFN_LIST]
numeric_features = ['age_approx']
cat_features = ['sex','anatom_site_general_challenge']

# Meta data encoders are fitted on the 2020 and external training rows together.
all_train = pd.concat((train,train_ext),axis=0)
cats = cat_enc.fit_transform(all_train[cat_features].fillna('0')).toarray()
nums = num_enc.fit_transform(all_train[numeric_features].fillna(0))
labels = pd.concat((train.target.astype('int32'),train_ext.target.astype('int32')),axis=0).to_numpy()

# Scale each network's embeddings with a scaler fitted on train + external
# embeddings, then apply the same scaling to the test embeddings.
embed_ENC=[]; embed_ext_ENC=[]; embed_encT=[]
for scaler, emb, emb_ext, emb_test in zip(embed_enc, embed, embed_ext, embed_test):
    scaler.fit(np.concatenate((emb, emb_ext), axis=0))
    embed_ENC.append(scaler.transform(emb))
    embed_ext_ENC.append(scaler.transform(emb_ext))
    embed_encT.append(scaler.transform(emb_test))
Xtrain = np.concatenate((cats,nums),axis=1)

# Test meta data goes through the already-fitted encoders (transform only).
cats = cat_enc.transform(test[cat_features].fillna('0')).toarray()
nums = num_enc.transform(test[numeric_features].fillna(0))
Xtest = np.concatenate((cats,nums),axis=1)

In [None]:
# Show the (rows, features) shape of each scaled training-embedding array.
[emb.shape for emb in embed_ENC]

In [None]:
# For every network: take the standard deviation across the feature axis of
# each row, then report the average of those per-row values.
for emb in embed_ENC:
    std_all = emb.std(axis=1)
    embed_std = std_all.mean()
    print('Mean standard deviation is {}'.format(embed_std))

# Set hyperparameters

In [None]:
# Seed TensorFlow's global random generator.
tf.random.set_seed(5)

# Per-class weights handed to model.fit (class 1 counts 50x class 0).
weights = {0: 1, 1: 50}

FOLDS = 5          # number of cross-validation splits
SEED = 42          # random_state of the fold splitter
REPLICAS = 1       # scales the learning rate and the step counts
DISPLAY_PLOT = 1   # truthy -> draw the training curves after each fold
EPOCHS = 100
TTA = 11           # number of augmented prediction passes that are averaged
batch_size = 128
VERBOSE = 0

# Build the model

In [None]:
# Feature widths of the three embedding inputs and of the meta data input.
dim1, dim2, dim3 = (enc.shape[1] for enc in embed_ENC[:3])
meta_dim = Xtrain.shape[1]


def build_model(ls=0.05):
    """Build and compile the multi-input classifier.

    Each of the three embedding inputs goes through its own Dropout + Dense
    branch (a third of the input width). The branches are concatenated and
    passed through a 1024-512-256-128 dense stack, joined with the meta data
    input, and reduced through 128-64-32-16 dense layers to one sigmoid unit.

    Args:
        ls: label-smoothing factor given to the binary cross-entropy loss.

    Returns:
        A compiled ``tf.keras.Model`` whose inputs are ordered
        ``(meta, embedding 1, embedding 2, embedding 3)``.
    """
    widths = (dim1, dim2, dim3)
    embed_inputs = [tf.keras.layers.Input(shape=(width,)) for width in widths]
    meta_inp = tf.keras.layers.Input(shape=(meta_dim,))

    # One small compression branch per embedding source.
    branches = []
    for branch_inp, width in zip(embed_inputs, widths):
        hidden = L.Dropout(0.2)(branch_inp)
        branches.append(L.Dense(int(width/3), activation='relu')(hidden))

    # Shared trunk over the concatenated image features.
    x = L.concatenate(tuple(branches))
    for units in (1024, 512, 256, 128):
        x = L.Dropout(0.3)(x)
        x = L.Dense(int(units), activation='relu')(x)

    # Meta data joins only after the image features have been condensed.
    xm = L.concatenate((x, meta_inp))
    for units in (128, 64, 32, 16):
        xm = L.Dropout(0.3)(xm)
        xm = L.Dense(units, activation='relu')(xm)
    xm = L.Dropout(0.2)(xm)
    xm = L.Dense(1, activation='sigmoid')(xm)

    model = tf.keras.Model(inputs=(meta_inp, *embed_inputs), outputs=xm)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.000000125* REPLICAS * batch_size),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=ls),
        metrics=['AUC'],
    )
    return model

# Train the model

This has been repurposed from [this](https://www.kaggle.com/cdeotte/triple-stratified-kfold-with-tfrecords) notebook.

In [None]:
def get_lr_callback(batch_size=8):
    """Return a Keras LearningRateScheduler with ramp / hold / decay phases.

    The rate rises linearly from the start value to a peak that scales with
    ``REPLICAS * batch_size``, optionally holds at the peak, and afterwards
    decays exponentially towards a floor value.

    Args:
        batch_size: batch size used to scale the peak learning rate.

    Returns:
        A ``tf.keras.callbacks.LearningRateScheduler`` instance.
    """
    start_lr = 0.000005
    peak_lr = 0.00000125 * REPLICAS * batch_size
#     peak_lr = 0.0000125 * REPLICAS * batch_size
    floor_lr = 0.000001
    ramp_epochs = 50
    hold_epochs = 0
    decay_rate = 0.99

    def lrfn(epoch):
        # Linear warm-up phase.
        if epoch < ramp_epochs:
            return (peak_lr - start_lr) / ramp_epochs * epoch + start_lr
        # Plateau phase (empty while hold_epochs is 0).
        if epoch < ramp_epochs + hold_epochs:
            return peak_lr
        # Exponential decay towards the floor.
        return (peak_lr - floor_lr) * decay_rate**(epoch - ramp_epochs - hold_epochs) + floor_lr

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

In [None]:
# helper function for loading data

def data_augment(data,mean=0.0,std1=0.02):
    """Return ``data`` with element-wise Gaussian noise added.

    Args:
        data: tensor to perturb; the noise uses its shape and dtype.
        mean: mean of the noise distribution.
        std1: standard deviation of the noise distribution.
    """
    noise = tf.random.normal(
        shape=tf.shape(data), mean=mean, stddev=std1, dtype=data.dtype
    )
    return data + noise

# How to augment meta data and image embeddings?

def aug(X):
    """Apply ``data_augment`` (default noise settings) to every element of X.

    Returns the augmented elements as a tuple, in the same order.
    """
    return tuple(data_augment(part) for part in X)

def get_dataset(X,embed,y,augment=True,repeat=True,batch=batch_size):
    """Wrap features and labels in a batched ``tf.data.Dataset``.

    Args:
        X: meta data features; becomes the first model input.
        embed: list of embedding arrays, one per network, in model-input order.
        y: labels aligned row-wise with ``X``.
        augment: when true, add Gaussian noise to every input via ``aug``.
        repeat: when true, the dataset cycles over the data indefinitely.
        batch: batch size.

    Returns:
        A dataset yielding ``((X, *embed), y)`` batches. Rows keep their
        original order; no shuffling is applied.
    """
    def _noisy(features, label):
        return aug(features), label

    features = (X, *embed)
    dataset = tf.data.Dataset.from_tensor_slices((features, y))
    dataset = dataset.repeat() if repeat else dataset
    dataset = dataset.map(_noisy) if augment else dataset
    return dataset.batch(batch)

In [None]:
# Cross-validated training. Folds are formed over the 15 tfrecord groups of
# the 2020 training data; the external data is added to the training part of
# every fold and is never used for validation.
ids = train['image_name']
idsO = test['image_name']
data_sub = Xtest
# Default strategy for single GPU
strategy = tf.distribute.get_strategy()

# skf = StratifiedKFold(n_splits=FOLDS,shuffle=True,random_state=SEED)
skf = KFold(n_splits=FOLDS,shuffle=True,random_state=SEED)

oof_pred = []; oof_tar = []; oof_val = []; oof_names = []; oof_folds = []
preds = np.zeros((test.shape[0],1))       # fold-averaged test predictions
preds_all = np.zeros((test.shape[0],FOLDS))  # per-fold test predictions

# Stack 2020 and external embeddings so that they share the row layout of
# Xtrain / labels (2020 rows first, external rows after).
XX_data = [np.concatenate([emb, emb_ext], axis=0)
           for emb, emb_ext in zip(embed_ENC, embed_ext_ENC)]

# Row positions of the external samples inside the stacked arrays. The end
# bound is exclusive, so it must be ext_start + ext_num to include the last
# external sample.
ext_start = len(train)
ext_num = len(train_ext)
ext_idx = np.arange(ext_start, ext_start + ext_num)

for fold,(idxT2,idxV2) in enumerate(skf.split(np.arange(15))):

    idxT = train.loc[train.tfrecord.isin(idxT2)].index.values #2020 train
    idxV = train.loc[train.tfrecord.isin(idxV2)].index.values #2020 valid

    # Training rows: this fold's 2020 training rows plus all external rows.
    ind = np.concatenate((idxT, ext_idx))
    X = Xtrain[ind]; y = labels[ind]
    XX = [elem[ind] for elem in XX_data]

    # Validation rows come from the 2020 data only.
    X_val = Xtrain[idxV]; y_val = labels[idxV]
    XX_val = [emb[idxV] for emb in embed_ENC]
    n_val = X_val.shape[0]
    n_test = Xtest.shape[0]

    # BUILD MODEL
    tf.keras.backend.clear_session()
    with strategy.scope():
        model = build_model()

    # SAVE BEST MODEL EACH FOLD
    sv = tf.keras.callbacks.ModelCheckpoint(
        'fold-%i.h5'%fold, monitor='val_loss', verbose=0, save_best_only=True,
        save_weights_only=True, mode='min', save_freq='epoch')

    # Train the model. The training dataset repeats forever, so the epoch
    # length has to be given explicitly as a whole number of batches.
    steps_per_epoch = X.shape[0] // (batch_size * REPLICAS)
    history = model.fit(get_dataset(X,XX,y),epochs=EPOCHS,steps_per_epoch=steps_per_epoch,
                        verbose=VERBOSE,class_weight=weights,
#                         callbacks=[sv,get_lr_callback(batch_size=batch_size)],
                        callbacks=[sv],
                        validation_data=get_dataset(X_val,XX_val, y_val,augment=False,repeat=False))

    model.load_weights('fold-%i.h5'%fold)

    # PREDICT OOF USING TTA
    # The augmented dataset repeats, so TTA passes over the data are obtained
    # by predicting at least TTA * n rows and trimming the surplus. Dividing
    # by (batch_size - 10) deliberately over-estimates the number of steps.
    STEPS = int(np.ceil(TTA*n_val/(batch_size-10)/REPLICAS))
    pred = model.predict( get_dataset(X_val,XX_val,y_val), steps=STEPS )[:TTA*n_val]
    # Column j of the Fortran-ordered reshape holds the j-th pass.
    oof_pred.append( np.mean(pred.reshape((n_val,TTA),order='F'),axis=1) )

    # GET OOF TARGETS AND NAMES
    oof_tar.append( y_val )
    oof_names.append( ids[idxV] )
    oof_folds.append( np.ones_like(oof_tar[-1],dtype='int8')*fold )

    # PREDICT TEST USING TTA
    STEPS = int(np.ceil(TTA*n_test/(batch_size-10)/REPLICAS))
    psub = model.predict( get_dataset(Xtest,embed_encT,np.zeros(len(Xtest))), steps=STEPS  )[:TTA*n_test]
    pstore = np.mean(psub.reshape((len(Xtest),TTA),order='F'),axis=1)
    preds_all[:,fold] = pstore
    preds[:,0] += pstore*1/FOLDS

    # REPORT RESULTS
    auc = roc_auc_score(oof_tar[-1],oof_pred[-1])
    oof_val.append( np.max( history.history['val_auc'] ) )
    print('#### FOLD %i OOF AUC without TTA = %.3f, with TTA = %.3f'%(fold+1,oof_val[-1],auc))

    # PLOT TRAINING
    if DISPLAY_PLOT:
        plt.figure(figsize=(15,5))
        plt.plot(np.arange(EPOCHS),history.history['auc'],'-o',label='Train AUC',color='#ff7f0e')
        plt.plot(np.arange(EPOCHS),history.history['val_auc'],'-o',label='Val AUC',color='#1f77b4')
        # Dedicated names for the marker coordinates so that the training
        # arrays of this fold are not overwritten.
        best_ep = np.argmax( history.history['val_auc'] ); best_auc = np.max( history.history['val_auc'] )
        xdist = plt.xlim()[1] - plt.xlim()[0]; ydist = plt.ylim()[1] - plt.ylim()[0]
        plt.scatter(best_ep,best_auc,s=200,color='#1f77b4'); plt.text(best_ep-0.03*xdist,best_auc-0.13*ydist,'max auc\n%.2f'%best_auc,size=14)
        plt.ylabel('AUC',size=14); plt.xlabel('Epoch',size=14)
        plt.legend(loc=2)
        plt2 = plt.gca().twinx()
        plt2.plot(np.arange(EPOCHS),history.history['loss'],'-o',label='Train Loss',color='#2ca02c')
        plt2.plot(np.arange(EPOCHS),history.history['val_loss'],'-o',label='Val Loss',color='#d62728')
        min_ep = np.argmin( history.history['val_loss'] ); min_loss = np.min( history.history['val_loss'] )
        ydist = plt.ylim()[1] - plt.ylim()[0]
        plt.scatter(min_ep,min_loss,s=200,color='#d62728'); plt.text(min_ep-0.03*xdist,min_loss+0.05*ydist,'min loss',size=14)
        plt.ylabel('Loss',size=14)
        plt.legend(loc=3)
        plt.show()

# COMPUTE OVERALL OOF AUC
oof_c = np.concatenate(oof_pred); true = np.concatenate(oof_tar);
names = np.concatenate(oof_names); folds = np.concatenate(oof_folds)
auc = roc_auc_score(true,oof_c)
print('Overall OOF AUC with TTA = %.5f'%auc)

dd_oof = pd.DataFrame(dict(image_name = names, target=true, pred = oof_c, fold=folds))

submission = pd.DataFrame(dict(image_name=idsO, target=preds[:,0]))
submission = submission.sort_values('image_name')
submission.to_csv('submission.csv', index=False)
submission.head()