# Update:
    
* Added another EfficientNet model's predictions to the stacking.

In [None]:
import numpy as np 
import pandas as pd
from tqdm import tqdm
from PIL import Image
import random
import os
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn import metrics
from sklearn import preprocessing
from bayes_opt import BayesianOptimization
from catboost import CatBoostClassifier
import gc

In [None]:
## seed every random number generator this notebook touches
def seed_everything(seed):
    """Seed the global Python and NumPy RNGs and export ``PYTHONHASHSEED``.

    Parameters
    ----------
    seed : int
        Value handed to ``random.seed`` and ``np.random.seed``; its string
        form is written to the ``PYTHONHASHSEED`` environment variable.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    hash_seed = str(seed)
    os.environ['PYTHONHASHSEED'] = hash_seed
    
# let's start by seeding everything with a fixed seed (42)
seed_everything(42)

# load the competition tables and attach every image model's predictions
def read_data():
    """Read the competition CSVs and merge in the image-model predictions.

    For each of the four EfficientNet models, the training-set prediction
    file and the test-set prediction file are read, the ROC AUC of the
    training-set predictions against ``target`` is printed, and the
    prediction column is merged into ``train`` / ``test`` on ``image_name``.
    The merged columns are named ``predictions``, ``predictions_1``,
    ``predictions_2`` and ``predictions_3`` (in the order the models are
    listed below).

    Returns
    -------
    tuple
        ``(train, test, sub)``: the merged train frame, the merged test
        frame, and the sample submission frame as read from disk.
    """
    train = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/train.csv')
    test = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/test.csv')
    sub = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/sample_submission.csv')

    # (train predictions csv, test predictions csv, message printed with the score)
    model_specs = [
        ('/kaggle/input/eff-b3/EfficientNetB3_256.csv',
         '/kaggle/input/eff-b3/sub_EfficientNetB3_256.csv',
         'EfficientnetB3 model for 256 x 256 images out of folds roc auc score is {}'),
        ('/kaggle/input/effb3-512/EfficientNetB3_512.csv',
         '/kaggle/input/effb3-512/sub_EfficientNetB3_512.csv',
         'EfficientnetB3 model for 512 x 512 images out of folds roc auc score is {}'),
        ('/kaggle/input/effb3-384/EfficientNetB3_384.csv',
         '/kaggle/input/effb3-384/sub_EfficientNetB3_384.csv',
         'EfficientnetB3 model for 384 x 384 images out of folds roc auc score is {}, this model includes meta data'),
        ('/kaggle/input/eff-256v2/EfficientNetB0_256.csv',
         '/kaggle/input/eff-256v2/sub_EfficientNetB0_256.csv',
         'EfficientnetB0 model for 256 x 256 images out of folds roc auc score is {}, this model includes meta data'),
    ]

    # read every prediction file first, train file then test file per model
    oof_frames, test_frames = [], []
    for oof_path, test_path, _ in model_specs:
        oof_frames.append(pd.read_csv(oof_path))
        test_frames.append(pd.read_csv(test_path))

    # score all models before reporting any of them
    scores = [
        metrics.roc_auc_score(oof['target'], oof['predictions'])
        for oof in oof_frames
    ]
    for (_, _, message), score in zip(model_specs, scores):
        print(message.format(score))

    # give each model a distinct prediction column name
    for position, test_frame in enumerate(test_frames):
        column = 'predictions' if position == 0 else f'predictions_{position}'
        test_frame.columns = ['image_name', column]
        oof_frames[position] = oof_frames[position][['image_name', 'predictions']].rename(
            columns={'predictions': column}
        )

    for oof in oof_frames:
        train = train.merge(oof, on = 'image_name')
    for test_frame in test_frames:
        test = test.merge(test_frame, on = 'image_name')

    return train, test, sub

def _image_sizes(image_names, image_dir):
    """Return an ``(n, 2)`` float array holding ``img.size`` for each image.

    Each file ``<image_dir>/<name>.jpg`` is opened with PIL only to read its
    ``size`` attribute, which PIL documents as ``(width, height)``.
    """
    sizes = np.zeros((len(image_names), 2))
    for i, name in enumerate(tqdm(image_names)):
        # the context manager closes the file handle as soon as the size is
        # read, instead of leaving one open handle per image behind
        with Image.open(os.path.join(image_dir, f'{name}.jpg')) as img:
            sizes[i] = img.size
    return sizes


def feature_engineering(train, test):
    """Add image width (``w``) and height (``h``) columns to both frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with an ``image_name`` column; the matching JPEG files are
        looked up in the competition's ``jpeg/train/`` and ``jpeg/test/``
        folders. Both frames are modified in place.

    Returns
    -------
    tuple
        The same ``(train, test)`` frames, now carrying ``w`` and ``h``.
    """
    # size of images
    trn_sizes = _image_sizes(
        train['image_name'].values,
        '/kaggle/input/siim-isic-melanoma-classification/jpeg/train/',
    )
    test_sizes = _image_sizes(
        test['image_name'].values,
        '/kaggle/input/siim-isic-melanoma-classification/jpeg/test/',
    )
    train['w'] = trn_sizes[:, 0]
    train['h'] = trn_sizes[:, 1]
    test['w'] = test_sizes[:, 0]
    test['h'] = test_sizes[:, 1]

    return train, test

def encode_categorical(train, test):
    """Fill missing values and label-encode the categorical columns.

    ``sex`` and ``anatom_site_general_challenge`` have their missing values
    replaced with ``'unknown'`` and are then label-encoded to integers.
    Missing ``age_approx`` values in both frames are filled with the mean
    age computed over train and test together, and missing ``patient_id``
    values in ``train`` are filled with ``'unknown'``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to clean; both are modified in place.

    Returns
    -------
    tuple
        The same ``(train, test)`` frames after cleaning.
    """
    for col in ['sex', 'anatom_site_general_challenge']:
        # assign the result back rather than calling fillna(inplace=True) on
        # a column selection: the chained in-place form does not write
        # through to the frame when pandas Copy-on-Write is active
        train[col] = train[col].fillna('unknown')
        test[col] = test[col].fillna('unknown')
        encoder = preprocessing.LabelEncoder()
        # fit on both frames so a category that only occurs in test does not
        # make transform() raise; when every test category also occurs in
        # train the resulting codes are the same as fitting on train alone
        encoder.fit(pd.concat([train[col], test[col]], ignore_index = True))
        train[col] = encoder.transform(train[col])
        test[col] = encoder.transform(test[col])
    # mean age over train and test combined, ignoring missing values
    age_approx = np.nanmean(np.concatenate([np.array(train['age_approx']), np.array(test['age_approx'])]))
    train['age_approx'] = train['age_approx'].fillna(age_approx)
    test['age_approx'] = test['age_approx'].fillna(age_approx)
    train['patient_id'] = train['patient_id'].fillna('unknown')
    return train, test

def train_and_evaluate_cat(train, test, cat_params, verbose_eval, folds = 5):
    """Cross-validate a CatBoost classifier and average its test predictions.

    The training rows are split with ``GroupKFold`` on ``patient_id``. One
    ``CatBoostClassifier`` is fitted per fold with the held-out fold as its
    evaluation set; its positive-class probabilities fill the out-of-fold
    vector and are averaged into the test prediction.

    Parameters
    ----------
    train, test : pandas.DataFrame
        ``train`` must contain ``target`` and ``patient_id``; every other
        column not in the exclusion list below is used as a feature and
        must also exist in ``test``.
    cat_params : dict
        Keyword arguments for ``CatBoostClassifier``.
    verbose_eval : bool or int
        Forwarded to ``CatBoostClassifier.fit``; this function's own
        progress messages are skipped when it compares equal to ``False``.
    folds : int, default 5
        Number of group folds.

    Returns
    -------
    tuple
        ``(rauc, y_pred)``: the ROC AUC of the out-of-fold predictions and
        the fold-averaged test probabilities.

    NOTE(review): each fold's held-out rows also drive early stopping and
    best-model selection, so the reported score may be optimistic.
    """
    # define useful features
    excluded = ['image_name', 'patient_id', 'diagnosis', 'benign_malignant', 'target', 'source']
    features = [col for col in train.columns if col not in excluded]
    # deliberate comparison with False (not truthiness) so that values such
    # as None keep printing exactly as before
    verbose = verbose_eval != False  # noqa: E712
    if verbose:
        print('Training with features: ', features)

    # group k-folds so validation patients are unseen during training
    kf = GroupKFold(n_splits = folds)
    x_all = train[features]
    y_all = train['target']
    x_test = test[features]

    oof_pred = np.zeros(len(train))
    y_pred = np.zeros(len(test))

    for fold, (tr_ind, val_ind) in enumerate(kf.split(train, groups = train['patient_id'])):
        if verbose:
            print('\n')
            print('-'*50)
            print(f'Training fold {fold + 1}')
        x_train, x_val = x_all.iloc[tr_ind], x_all.iloc[val_ind]
        # the split yields positions, so select by position; label-based
        # lookup is only right when the index is the default 0..n-1 range
        y_train, y_val = y_all.iloc[tr_ind], y_all.iloc[val_ind]

        model = CatBoostClassifier(**cat_params)
        model.fit(x_train, y_train, eval_set = (x_val, y_val), cat_features = ['sex', 'anatom_site_general_challenge'],
                  use_best_model = True, early_stopping_rounds = 50, verbose_eval = verbose_eval)

        oof_pred[val_ind] = model.predict_proba(x_val)[:, 1]

        # each fold contributes an equal share of the final test prediction
        y_pred += model.predict_proba(x_test)[:, 1] / folds

    rauc = metrics.roc_auc_score(y_all, oof_pred)
    if verbose:
        print(f'Our oof roc auc score for our cat boost model is {rauc}')

    gc.collect()

    return rauc, y_pred

# build the modelling frames: read the tables with the image-model
# predictions merged in, add the image width/height columns, then fill
# missing values and label-encode the categorical columns
train, test, sub = read_data()
train, test = feature_engineering(train, test)
train, test = encode_categorical(train, test)

In [None]:
# objective function evaluated by the bayesian optimization search
def run_cat_bayesian(learning_rate, depth, bagging_temperature, colsample_bylevel):
    """Score one CatBoost hyperparameter sample by its out-of-fold ROC AUC.

    The sampled values are combined with the fixed settings below (``depth``
    is truncated to an integer) and passed, together with the module-level
    ``train`` and ``test`` frames, to ``train_and_evaluate_cat`` with its
    output silenced.

    Returns
    -------
    float
        The ROC AUC returned by ``train_and_evaluate_cat``.
    """
    params = dict(
        learning_rate=learning_rate,
        eval_metric='AUC',
        loss_function='Logloss',
        random_seed=42,
        task_type='CPU',
        depth=int(depth),
        bagging_temperature=bagging_temperature,
        colsample_bylevel=colsample_bylevel,
    )

    # only the score matters to the optimizer; the test predictions are dropped
    score, _ = train_and_evaluate_cat(train, test, params, False)
    return score

# search space handed to the bayesian optimizer: (lower, upper) per parameter
bounds_cat = dict(
    learning_rate=(0.1, 0.5),
    depth=(2, 12),
    bagging_temperature=(0.0, 2.0),
    colsample_bylevel=(0.5, 1.0),
)

cat_bo = BayesianOptimization(run_cat_bayesian, bounds_cat, random_state = 42)
cat_bo.maximize(
    init_points = 100,
    n_iter = 100,
    acq = 'ucb',
    xi = 0.0,
    alpha = 1e-6,
)

# rebuild the CatBoost parameters from the best point the search found
best_params = cat_bo.max['params']
params = dict(
    learning_rate=best_params['learning_rate'],
    eval_metric='AUC',
    loss_function='Logloss',
    random_seed=42,
    task_type='CPU',
    depth=int(best_params['depth']),
    bagging_temperature=best_params['bagging_temperature'],
    colsample_bylevel=best_params['colsample_bylevel'],
)

# retrain with the tuned hyperparameters, logging every 50 iterations
roc_auc, y_pred = train_and_evaluate_cat(train, test, params, 50)

# write the averaged test predictions out as the submission file
test['target'] = y_pred
sub = test[['image_name', 'target']]
sub.to_csv('cat_baseline_sub.csv', index = False)

We can build many image models and use their predictions as features for our stacked model. This is just a tutorial; it is not intended to get a high score.

Another thing to mention is that there may be more metadata features to exploit.