In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn import metrics

from scipy.stats import rankdata
from bayes_opt import BayesianOptimization



# Read the train, test and sample-submission CSVs from the competition input folder.
DATA_DIR = '/kaggle/input/siim-isic-melanoma-classification'
train, test, sub = (
    pd.read_csv(f'{DATA_DIR}/{table}.csv')
    for table in ('train', 'test', 'sample_submission')
)


## Load OOF and Test predictions

Our single best models are:
* https://www.kaggle.com/cdeotte/triple-stratified-kfold-with-tfrecords
* https://www.kaggle.com/hiramcho/melanoma-efficientnetb6-with-attention-mechanism
* https://www.kaggle.com/digvijayyadav/getting-started-with-tfrecords

It is important that all models use the same cross-validation fold strategy, so that the ensemble does not overfit the out-of-fold (OOF) predictions.

In [None]:
# Names of the input folders under /kaggle/input that hold each base model's
# outputs; each folder is read for an oof.csv and a submission.csv.
models = [ "getting-started-with-tfrecords", "melanoma-efficientnetb6-with-attention-mechanism", "triple-stratified-kfold-with-tfrecords"]

for model in models:
    dirname = "/kaggle/input/" + model

    # Out-of-fold predictions: report the single-model AUC before blending.
    _oof = pd.read_csv(os.path.join(dirname, "oof.csv"))
    score = metrics.roc_auc_score(_oof['target'], _oof['pred'])
    print(f"{model}: OOF auc:{score:.4}")

    # Store the prediction under the model's name and drop the `target` and
    # (optional) `fold` columns before merging.
    _oof = _oof.rename(columns={"pred": model}).drop(columns=["target", "fold"], errors="ignore")

    # Remove a column left over from a previous run of this cell first: merging the
    # same model twice would yield `<model>_x` / `<model>_y` columns and break every
    # later `train[model]` lookup. The merge is an inner join (pandas default), so
    # rows of `train` without an OOF prediction are dropped.
    train = train.drop(columns=[model], errors="ignore").merge(_oof, on="image_name")

    # Test-set predictions: the two columns are renamed positionally.
    _sub = pd.read_csv(os.path.join(dirname, "submission.csv"))
    _sub.columns = ["image_name", model]
    test = test.drop(columns=[model], errors="ignore").merge(_sub, on="image_name")


In [None]:
# Preview the first five rows of the merged training frame.
train.head(n=5)

# OOF Ensembling

## Avg, Rank, Pow Avg

In [None]:
# Three unweighted blends of the per-model OOF columns. For every model the
# transformed prediction is divided by its own maximum, the scaled columns are
# summed, and the sum is divided by the number of models.
blend_transforms = {
    "pred_rank": lambda preds: preds.rank(),
    "pred_power": lambda preds: np.power(preds, 2),
    "pred_avg": lambda preds: preds,
}

for blend_name, transform in blend_transforms.items():
    running_total = pd.Series(0, index=train.index)
    for c in models:
        transformed = transform(train[c])
        running_total = running_total + transformed / transformed.max()
    train[blend_name] = running_total / len(models)

# Report the OOF ROC AUC of each blend against the training target.
for label, blend_name in (("avg", "pred_avg"), ("rank", "pred_rank"), ("pow", "pred_power")):
    score = metrics.roc_auc_score(train['target'], train[blend_name])
    print(f'OOF {label}_auc:{score}')

### Submissions

In [None]:
# Rank blend on the test set: each model's ranks are scaled by their maximum,
# then averaged over the models.
rank_total = pd.Series(0.0, index=test.index)
for c in models:
    ranks = test[c].rank()
    rank_total = rank_total + ranks / ranks.max()
test["target"] = rank_total / len(models)

# Write the two submission columns and preview them.
sub = test.loc[:, ["image_name", "target"]]
sub.to_csv("submission_rank.csv", index=False)
sub.head()

In [None]:
# Power blend on the test set: each model's squared predictions are scaled by
# their maximum, then averaged over the models.
power_total = pd.Series(0.0, index=test.index)
for c in models:
    squared = np.power(test[c], 2)
    power_total = power_total + squared / squared.max()
test["target"] = power_total / len(models)

# Write the two submission columns and preview them.
sub = test.loc[:, ["image_name", "target"]]
sub.to_csv("submission_pow.csv", index=False)
sub.head()

In [None]:
# Plain average on the test set: each model's predictions are scaled by their
# maximum, then averaged over the models.
avg_total = pd.Series(0.0, index=test.index)
for c in models:
    preds = test[c]
    avg_total = avg_total + preds / preds.max()
test["target"] = avg_total / len(models)

# Write the two submission columns and preview them.
sub = test.loc[:, ["image_name", "target"]]
sub.to_csv("submission_avg.csv", index=False)
sub.head()

## Weighted Avg (Bayesian Optimization)


In [None]:
def dim_optimizer(df_oof, features, init_points=20, n_iter=30):
    """Search for blend weights that maximise the OOF ROC AUC.

    One weight in [0, 1] is tuned per entry of ``features`` with
    ``BayesianOptimization``. The objective is the ROC AUC of
    ``sum_i w_i * df_oof[features[i]]`` against ``df_oof['target']``.

    Parameters
    ----------
    df_oof : DataFrame
        Must contain a ``target`` column and one column per feature.
    features : sequence of str
        Column names to blend. Any length >= 1 is accepted (the original
        implementation only handled exactly three).
    init_points : int
        Forwarded to ``optimizer.maximize`` as ``init_points``.
    n_iter : int
        Forwarded to ``optimizer.maximize`` as ``n_iter``.

    Returns
    -------
    tuple
        The best weights found, in the same order as ``features``.

    Raises
    ------
    ValueError
        If ``features`` is empty.
    """
    features = list(features)
    if not features:
        raise ValueError("features must contain at least one column name")

    # One bounded parameter per feature, named c0, c1, ... as before.
    weight_names = [f"c{i}" for i in range(len(features))]
    pbounds = {name: (0.0, 1.0) for name in weight_names}

    def blend_auc(**weights):
        # The optimizer passes one keyword argument per key of `pbounds`.
        blend = sum(weights[name] * df_oof[col] for name, col in zip(weight_names, features))
        return metrics.roc_auc_score(df_oof['target'], blend)

    optimizer = BayesianOptimization(
        f=blend_auc,
        pbounds=pbounds,
        random_state=42,
    )
    optimizer.maximize(
        init_points=init_points,
        n_iter=n_iter,
    )

    best = optimizer.max
    best_weights = tuple(best["params"][name] for name in weight_names)

    t = best["target"]
    weight_report = ", ".join(f"{name}:{w}" for name, w in zip(weight_names, best_weights))
    print ( f'bo auc:{t}, {weight_report}' )

    return best_weights


# Tune one weight per base model on the OOF predictions, then list each
# model name next to its weight.
c0, c1, c2 = dim_optimizer(train, models, init_points=40, n_iter=40)
for model_name, weight in zip(models[:3], (c0, c1, c2)):
    print(model_name, weight)

In [None]:
def bo_pred(df, weights=None, features=None):
    """Return the weighted sum of prediction columns of ``df``.

    Parameters
    ----------
    df : DataFrame
        Frame holding one prediction column per feature.
    weights : sequence of float, optional
        One weight per feature. Defaults to the module-level ``(c0, c1, c2)``.
    features : sequence of str, optional
        Column names to combine. Defaults to the first ``len(weights)`` entries
        of the module-level ``models`` list.

    Returns
    -------
    Series
        ``weights[0] * df[features[0]] + weights[1] * df[features[1]] + ...``

    Raises
    ------
    ValueError
        If no feature is given or the number of weights and features differ.
    """
    if weights is None:
        weights = (c0, c1, c2)
    weights = list(weights)

    if features is None:
        features = models[:len(weights)]
    features = list(features)

    if not features:
        raise ValueError("at least one feature column is required")
    if len(weights) != len(features):
        raise ValueError(
            f"got {len(weights)} weights for {len(features)} features"
        )

    terms = [w * df[col] for w, col in zip(weights, features)]
    # Start from the first term so the additions happen left to right.
    return sum(terms[1:], terms[0])

# Apply the optimized weights to the OOF columns and report the blend's ROC AUC.
oof_blend = bo_pred(train)
train["pred"] = oof_blend
score = metrics.roc_auc_score(train["target"], train["pred"])
print(f"auc bo:{score}")



### Submission

In [None]:
# Apply the optimized weights to the test predictions, write the two
# submission columns and preview them.
test["target"] = bo_pred(test)

sub = test.loc[:, ["image_name", "target"]]
sub.to_csv("submission_bo.csv", index=False)
sub.head()