## Catboost Stacking Solution

During this competition I tried out stacking based on Catboost, Lightgbm and Xgboost. Catboost gave the best result of 0.9406 with few input predictions, but I did not get far with creating good input predictions and performing optimizations. In this notebook I want to demonstrate the stacking process with Catboost and show that this can lead to good results.

In notebook https://www.kaggle.com/cdeotte/forward-selection-oof-ensemble-0-942-private Chris Deotte gives a sample ensemble based on forward selection achieving 0.9420 on private LB. Here I use the same input OOF and submission files, but use Catboost Classifier for stacking the predictions. In addition to the input predictions I add a single feature given by Giba in https://www.kaggle.com/titericz/simple-baseline.

The predictions on the different folds sometimes do not compare well for Catboost, with the AUC dropping for the combined OOF predictions. Therefore I use the mean AUC of the folds for hyperparameter selection. The hyperparameters are taken directly from the above-mentioned prediction I used for the competition.

The resulting predictions achieve 0.9432 on private LB.

In [None]:
import pandas as pd, numpy as np, os
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier

In [None]:
# Directory that holds the per-model OOF ("oof") and submission ("sub") CSV files.
PATH = '../input/melanoma-oof-and-sub/'

# Short aliases for the verbose metadata column names of the competition CSVs.
columns_map = dict(
    age_approx='age',
    anatom_site_general_challenge='location',
)

def get_train():
    """Load the training metadata and attach every OOF prediction file.

    Reads the competition ``train.csv``, renames columns per ``columns_map``
    and replaces missing ``sex`` / ``location`` values with ``'unknown'``.
    Every file in ``PATH`` whose name contains ``'oof'`` is then inner-merged
    on ``image_name``; its ``pred`` column becomes ``p_<k>``, where ``k`` is
    the position of the file in sorted filename order.  The ``fold`` column
    is taken from the first OOF file.

    Returns:
        pandas.DataFrame: training metadata with one ``p_<k>`` column per
        OOF file plus a ``fold`` column (absent if there are no OOF files).
    """
    df = pd.read_csv('../input/siim-isic-melanoma-classification/train.csv')
    df = df.rename(columns=columns_map)
    # Assign the result back: ``df[col].fillna(..., inplace=True)`` acts on a
    # temporary selection and does not update ``df`` under Copy-on-Write.
    df['sex'] = df['sex'].fillna('unknown')
    df['location'] = df['location'].fillna('unknown')

    filenames = sorted(f for f in os.listdir(PATH) if 'oof' in f)

    for k, filename in enumerate(filenames):
        df_preds = pd.read_csv(f'{PATH}{filename}')
        df = df.merge(
            df_preds[['image_name', 'pred']].rename(columns={'pred': f'p_{k}'}),
            how='inner',
            on='image_name',
        )
        if k == 0:
            # Look the fold up by image_name.  Assigning df_preds['fold']
            # directly would align by row position, which mislabels folds
            # whenever the OOF file is ordered differently from train.csv.
            fold_by_image = (
                df_preds.drop_duplicates('image_name')
                .set_index('image_name')['fold']
            )
            df['fold'] = df['image_name'].map(fold_by_image)

    return df

def get_test():
    """Load the test metadata and attach every submission prediction file.

    Reads the competition ``test.csv``, renames columns per ``columns_map``
    and replaces missing ``sex`` / ``location`` values with ``'unknown'``.
    Every file in ``PATH`` whose name contains ``'sub'`` is then inner-merged
    on ``image_name``; its ``target`` column becomes ``p_<k>``, where ``k``
    is the position of the file in sorted filename order.

    NOTE(review): ``p_<k>`` only matches the training column of the same
    name if the sorted 'sub' files line up one-to-one with the sorted 'oof'
    files -- confirm against the contents of ``PATH``.

    Returns:
        pandas.DataFrame: test metadata with one ``p_<k>`` column per
        submission file.
    """
    df = pd.read_csv('../input/siim-isic-melanoma-classification/test.csv')
    df = df.rename(columns=columns_map)
    # Assign the result back: ``df[col].fillna(..., inplace=True)`` acts on a
    # temporary selection and does not update ``df`` under Copy-on-Write.
    df['sex'] = df['sex'].fillna('unknown')
    df['location'] = df['location'].fillna('unknown')

    filenames = sorted(f for f in os.listdir(PATH) if 'sub' in f)

    for k, filename in enumerate(filenames):
        df_preds = pd.read_csv(f'{PATH}{filename}')
        df = df.merge(
            df_preds[['image_name', 'target']].rename(columns={'target': f'p_{k}'}),
            how='inner',
            on='image_name',
        )

    return df

def add_features(df_train, df_test):
    """Append one-hot columns and a smoothed target-encoding feature ``ll``.

    One-hot columns are created per frame for ``sex`` (prefix ``sex``) and
    ``location`` (prefix ``anatom``).  ``ll`` is the mean training target of
    each (sex, age, location) group, shrunk towards the global training mean
    with a pseudo-count of 15; rows without a matching group receive the
    global mean.

    Args:
        df_train: training frame with ``sex``, ``age``, ``location`` and
            ``target`` columns.
        df_test: test frame with ``sex``, ``age`` and ``location`` columns.

    Returns:
        tuple: the extended ``(df_train, df_test)`` frames.
    """
    smoothing = 15
    group_keys = ['sex', 'age', 'location']

    def with_dummies(frame, column, prefix):
        # Append the one-hot encoding of `column` to the right of `frame`.
        return pd.concat([frame, pd.get_dummies(frame[column], prefix=prefix)], axis=1)

    for column, prefix in (('sex', 'sex'), ('location', 'anatom')):
        df_train = with_dummies(df_train, column, prefix)
        df_test = with_dummies(df_test, column, prefix)

    # Group statistics come from the training targets only.
    prior = df_train.target.mean()
    stats = df_train.groupby(group_keys)['target'].agg(['mean', 'count']).reset_index()
    stats['ll'] = ((stats['mean'] * stats['count']) + (prior * smoothing)) / (stats['count'] + smoothing)
    encoding = stats.drop(columns=['mean', 'count'])

    def with_encoding(frame):
        # Left-join the encoding; groups unseen in training fall back to the prior.
        merged = frame.merge(encoding, on=group_keys, how='left')
        merged['ll'] = merged['ll'].fillna(prior)
        return merged

    return with_encoding(df_train), with_encoding(df_test)

In [None]:
def train(df, params, features, nfolds=5, verbose_eval=10):
    """Train one CatBoost stacker per fold and collect out-of-fold predictions.

    For each fold ``0 .. nfolds-1`` a ``CatBoostClassifier`` is fitted on the
    rows of all other folds, with the held-out fold as evaluation set and
    early stopping after 50 rounds.  The fitted model is saved to
    ``model_<fold>`` in the working directory.  The per-fold AUC, the AUC of
    the combined OOF predictions and the mean fold AUC are printed.

    Args:
        df: frame with ``image_name``, ``target``, ``fold`` and all
            ``features`` columns.
        params: keyword arguments passed to ``CatBoostClassifier``.
        features: names of the columns used as model inputs.
        nfolds: number of folds; fold ids are expected to be ``0..nfolds-1``.
        verbose_eval: logging period forwarded to ``CatBoostClassifier.fit``.

    Returns:
        pandas.DataFrame: ``image_name``, ``target``, ``fold`` and the
        out-of-fold prediction ``pred`` for every row of ``df``.
    """
    target = 'target'
    df_oof = df[['image_name', 'target', 'fold']].copy()
    scores = []

    for fold in range(nfolds):
        print(f'==========> Training fold {fold} <==========')
        valid_mask = df.fold == fold
        df_train = df[~valid_mask]
        df_valid = df[valid_mask]

        X_train, y_train = df_train[features], df_train[target]
        X_valid, y_valid = df_valid[features], df_valid[target]

        model = CatBoostClassifier(**params)
        model.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid),
                  use_best_model=True,
                  early_stopping_rounds=50,
                  # Honour the caller's setting (was hard-coded to 10).
                  verbose_eval=verbose_eval)

        y_preds = model.predict_proba(X_valid)[:, 1]
        score_fold = roc_auc_score(y_valid, y_preds)
        print(f'auc roc score fold {fold} auc: {score_fold}')
        scores.append(score_fold)
        # df_oof shares df's index, so the boolean mask writes the fold's
        # predictions by label and is safe for any index of df.
        df_oof.loc[valid_mask, 'pred'] = y_preds

        model.save_model(f'model_{fold}')

    score = roc_auc_score(df_oof[target], df_oof['pred'])
    print(f'auc roc score: {score}')
    print(f'Mean auc roc score per fold: {sum(scores)/len(scores):0.5f}')

    return df_oof

In [None]:
def predict(df, modelpaths, features):
    """Average the positive-class probabilities of several saved models.

    Args:
        df: frame with an ``image_name`` column and all ``features`` columns.
        modelpaths: paths of saved CatBoost models to load and average.
        features: names of the columns passed to the models, in order.

    Returns:
        pandas.DataFrame: ``image_name`` plus ``target``, the unweighted mean
        of the models' class-1 probabilities.
    """
    print('======================> Start Predictions <======================')
    feature_matrix = df[features].values
    model_count = len(modelpaths)

    # Running mean: each model contributes 1/model_count of its probability.
    averaged = np.zeros(df.shape[0])
    for k, modelpath in enumerate(modelpaths):
        print(f'Predicting with model {k}')
        model = CatBoostClassifier()
        model.load_model(modelpath)
        averaged += model.predict_proba(feature_matrix)[:, 1] / model_count

    df_preds = df[['image_name']].copy()
    df_preds['target'] = averaged
    return df_preds

In [None]:
def create_submission(df_preds):
    """Write ``df_preds.target`` into the sample submission as ``submissions.csv``.

    The predictions are copied by row position, so ``df_preds`` must have the
    same number of rows as the sample submission, in the same order.
    """
    sample_path = "../input/siim-isic-melanoma-classification/sample_submission.csv"
    submission = pd.read_csv(sample_path)
    flat_target = df_preds.target.values.reshape(-1)
    submission.target = flat_target
    submission.to_csv('submissions.csv', index=False)

In [None]:
# Build the stacking inputs: metadata plus base-model predictions for train
# and test, then add the one-hot and target-encoding ('ll') features.
df_train = get_train()
df_test = get_test()
df_train, df_test = add_features(df_train, df_test)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Inputs of the stacker: the target-encoding feature 'll' plus a selected
# subset of base-model predictions.  'p_<k>' is the prediction of the k-th
# file in sorted filename order (see get_train / get_test).
features = [
    'll',
    'p_1',
    'p_3',
    'p_8',
    'p_10',
    'p_12',
    'p_21',
    'p_26',
    'p_37',
]

# Keyword arguments that train() passes to CatBoostClassifier.
params = {
    'loss_function': 'CrossEntropy',
    'eval_metric': 'AUC',
    'bootstrap_type': 'Bayesian',
    'bagging_temperature': 0.5,
    'grow_policy': 'SymmetricTree',
    'depth': 3,
    'learning_rate': 0.03,
    'iterations': 2000,
}

# Fit one model per fold; each is saved as 'model_<fold>' and the
# out-of-fold predictions are returned.
df_oof = train(df_train, params, features, nfolds=5)

In [None]:
# Paths of the five per-fold models saved by train(); predict() averages
# their class-1 probabilities on the test frame.
modelpaths = [f'model_{k}' for k in range(5)]
df_preds = predict(df_test, modelpaths, features)

In [None]:
# Write the averaged test predictions to 'submissions.csv'.
create_submission(df_preds)