In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import itertools
import os
from glob import glob

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import roc_auc_score

import seaborn as sns
sns.set(font_scale=1.4)

import matplotlib.pyplot as plt

In [None]:
# Competition data directory (train / test / sample submission CSVs live here).
PATH = '/kaggle/input/tabular-playground-series-mar-2021/'
train = pd.read_csv(f'{PATH}train.csv')
test = pd.read_csv(f'{PATH}test.csv')
sample_submission = pd.read_csv(f'{PATH}sample_submission.csv')

# Load Submission + Out Of Fold Predictions from each model

In [None]:
xgb_path  = '/kaggle/input/tabmar21-xgb-for-stacking/'
lgbm_path  = '/kaggle/input/tabmar21-lgbm-encoded-sub-300321/'
dae1_path = '/kaggle/input/tabmar21-daesub1/'
dae2_path = '/kaggle/input/tabmar21-daesub2/'
dae3_path = '/kaggle/input/tabmar21-daesub3/'
dae4_path = '/kaggle/input/tabmar21-daesub4/'
dae5_path = '/kaggle/input/tabmar21-dae-mlp-finalexpts1/'
dae6_path = '/kaggle/input/tabmar21-dae-narrow-290321/'
dae7_path = '/kaggle/input/tabmar21-dae-run005/'
dae8_path = '/kaggle/input/tabmar21-dae-binned1/'
dae9_path = '/kaggle/input/tabmar21-dae-run3st4/'

descrs = ['xgb', 'lgbm', 'dae1', 'dae2', 'dae3', 'dae4', 'dae5','dae6', 'dae7', 'dae8',  'dae9',]

# Directories in the same order as `descrs`, each paired with the glob pattern that
# matches its test-prediction files ('*test*' for xgb/lgbm, '*submission*' for the DAE runs).
model_dirs = [xgb_path, lgbm_path, dae1_path, dae2_path, dae3_path, dae4_path,
              dae5_path, dae6_path, dae7_path, dae8_path, dae9_path]
sub_patterns = ['*test*', '*test*'] + ['*submission*'] * (len(model_dirs) - 2)

# glob() gives no ordering guarantee, but the rest of the notebook pairs OOF files,
# submission files and tags purely by position. Sorting makes that pairing
# reproducible, and globbing each directory once keeps `oof_counts` and `oofs`
# derived from the same listing.
oofs_by_model = [sorted(glob(model_dir + '*oof*')) for model_dir in model_dirs]
subs_by_model = [sorted(glob(model_dir + pattern))
                 for model_dir, pattern in zip(model_dirs, sub_patterns)]

# Fail early if a model has a different number of OOF and submission files:
# a mismatch would silently shift which weight is applied to which submission.
for descr, model_oofs, model_subs in zip(descrs, oofs_by_model, subs_by_model):
    assert len(model_oofs) == len(model_subs), (
        f'{descr}: {len(model_oofs)} oof file(s) but {len(model_subs)} submission file(s)')

# Number of OOF files found per model (used later to build the per-file tags).
oof_counts = [len(model_oofs) for model_oofs in oofs_by_model]

# Flat, position-aligned lists of file paths.
oofs = [path for model_oofs in oofs_by_model for path in model_oofs]
subs = [path for model_subs in subs_by_model for path in model_subs]

oofs_dfs = [pd.read_csv(x) for x in oofs]
subs_dfs = [pd.read_csv(x) for x in subs]

print('Out of Fold links', oofs)

print( '  -   ')

print('Submission links', subs)

In [None]:
# The blend pairs OOF and submission files by position, so the two lists
# must contain the same number of files.
assert len(oofs) == len(subs)

In [None]:
# Report how many OOF files and how many submission files were found.
print('Counts of Inputs', f'{len(oofs)} {len(subs)}', sep='\n')

In [None]:
# One tag per OOF file: '<model descr>_<file index within that model>',
# in the same order as the flattened `oofs` list.
output_tags = [
    f'{descr}_{file_idx}'
    for n_files, descr in zip(oof_counts, descrs)
    for file_idx in range(n_files)
]
print('Model Tags')
output_tags

# Individual Model Scores

In [None]:
# ROC AUC of each model's out-of-fold predictions against the target stored in its own OOF file.
for tag, oof_df in zip(output_tags, oofs_dfs):
    auc = roc_auc_score(oof_df['target'], oof_df['oof_prediction'])
    print(tag, auc)

# Model Prediction Correlations

Generally, a blend performs better when there is some variation between the models' predictions; if all sets of predictions are 100% correlated, blending provides no benefit.

In [None]:
# One column of OOF predictions per model file, labelled with its tag.
# Built with a single concat rather than growing a frame inside a loop, which
# re-copies every previously added column on each iteration.
combined_oofs = pd.concat([oof_df[['oof_prediction']] for oof_df in oofs_dfs],
                          axis=1)
combined_oofs.columns = output_tags

# Pairwise correlation between the models' OOF predictions.
combined_oofs_corr = combined_oofs.corr()

sns.set(font_scale=1.3)

fig, axes = plt.subplots(figsize=(12,12))

# Colour scale is clipped to [0.98, 1] so small differences between
# highly correlated models remain visible.
sns.heatmap(combined_oofs_corr,
            annot=True,
            vmin=0.98,
            vmax=1,
            fmt='.3f',
            cmap='seismic_r',
            linewidth=1,
            annot_kws={"fontsize":8},
            ax=axes)

axes.set_title('Model OOF Prediction Correlations')
plt.tight_layout()

In [None]:
import optuna
from optuna.samplers import TPESampler
# Raise Optuna's log level to WARNING so the study does not flood the cell output.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Number of trials the Optuna study in run_optimise() runs.
OPTUNA_TRIALS = 5000

def run_optimise():
    """Search for blend weights that maximise the ROC AUC of the weighted OOF average.

    Each trial samples one raw weight per entry of ``output_tags`` (uniformly in
    [0, 3]), rescales the weights to sum to 1, blends the ``oof_prediction``
    columns of ``oofs_dfs`` with them and scores the blend against
    ``train['target']``.

    Reads the module-level ``oofs_dfs``, ``output_tags``, ``train`` and
    ``OPTUNA_TRIALS``.

    Returns
    -------
    pandas.DataFrame
        ``study.trials_dataframe()``: one row per trial, with the AUC in ``value``
        and the raw (un-normalised) weights in ``params_oof_weights_<tag>`` columns.
    """
    print('running Optuna')

    # Bounds of the raw weight sampled for every model.
    l_default = 0.000
    u_default = 3.0

    # Loop-invariant inputs: pull the arrays out of the DataFrames once
    # instead of on every one of the trials.
    oof_columns = [oof_df['oof_prediction'].values for oof_df in oofs_dfs]
    target = train['target']
    n_rows = len(train)

    def create_model(trial):
        """Sample one raw weight per model tag for this trial."""
        # suggest_float replaces the deprecated suggest_uniform; the parameter
        # names (and therefore the trials_dataframe columns) are unchanged.
        return [trial.suggest_float(f'oof_weights_{tag}', l_default, u_default)
                for tag in output_tags]

    def objective(trial):
        """Return the ROC AUC of the OOF blend for this trial's weights."""
        model_weights = np.array(create_model(trial))
        model_weights = model_weights / model_weights.sum()

        # Accumulate model by model, in file order.
        oof_blend = np.zeros((n_rows,))
        for column, weight in zip(oof_columns, model_weights):
            oof_blend += column * weight

        return roc_auc_score(target, oof_blend)

    # Seeded sampler so repeated runs explore the same sequence of weights.
    study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=OPTUNA_TRIALS)

    return study.trials_dataframe()

In [None]:
#%%time
# Run the weight search; the result is the study's trials as a DataFrame.
optim_settings = run_optimise()

In [None]:
# Display the trials table returned by run_optimise().
optim_settings

In [None]:
# Columns of the trials table that hold the sampled weight of each model.
params = [f'params_oof_weights_{tag}' for tag in output_tags]

# Rescale every trial's weights so that they sum to 1 across models.
row_totals = optim_settings[params].sum(axis=1)
optim_settings[params] = optim_settings[params].div(row_totals, axis=0)

# Best-scoring trials first, with a fresh 0..n-1 index.
optim_settings = optim_settings.sort_values('value', ascending=False).reset_index(drop=True)
optim_settings[['value'] + params].head(10)

In [None]:
# One scatter panel per model: normalised weight (x) against trial ROC AUC (y).
n_cols = 5
# Ceil division gives exactly as many rows as needed (no empty extra row when the
# model count is a multiple of n_cols); at least one row is always created.
n_rows = max(1, -(-len(params) // n_cols))
# squeeze=False keeps `axes` two-dimensional even when there is a single row,
# so the [row, col] indexing below also works with fewer than n_cols models.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20,16),
                         sharex=True, sharey=True, squeeze=False)
for count, p in enumerate(params):
    ax = axes[count // n_cols, count % n_cols]
    ax.scatter(x=optim_settings[p], y=optim_settings['value'], color='Red')
    ax.set_title(output_tags[count] + ' weight vs ROCAUC score')
plt.tight_layout()

In [None]:
# Number of best trials whose normalised weights are averaged into the final blend.
CUTOFF = 10
# head() is end-exclusive, so exactly CUTOFF rows are averaged. The previous
# label slice .loc[0:CUTOFF] is end-inclusive and took CUTOFF + 1 rows.
# Relies on optim_settings already being sorted best-first.
weightings = optim_settings[params].head(CUTOFF).mean(axis=0).values
print('weightings to use')
print({a:np.round(b,4) for a,b in zip(output_tags, weightings)})

In [None]:
# Sanity check: each trial's weights were normalised to sum to 1 before
# averaging, so this total is expected to be ~1.
weightings.sum()

In [None]:
print('Generating Blended OOF and Submission Predictions')

# Weighted sums of the per-model predictions, accumulated model by model.
train_predictions = np.zeros((len(train),))
test_predictions = np.zeros((len(test),))
for model_idx, weight in enumerate(weightings):
    oof_df = oofs_dfs[model_idx]
    sub_df = subs_dfs[model_idx]
    train_predictions += oof_df['oof_prediction'] * weight
    test_predictions += sub_df['target'] * weight

# Overlay the two prediction distributions to compare train (OOF) against test.
sns.kdeplot(train_predictions, color='Green')
sns.kdeplot(test_predictions, color='Red')

plt.title('Blended Train OOF and Test Predictions')
plt.legend(['Train', 'Test'], facecolor='White')
plt.tight_layout()

In [None]:
# ROC AUC of the blended out-of-fold predictions against the training target.
print('Blended ROC AUC Score', roc_auc_score(train['target'], train_predictions))

In [None]:
# Overwrite the sample submission's target with the blended test predictions
# (assigned by position) and write it out without the index column.
sample_submission['target']=test_predictions
sample_submission.to_csv('submission.csv', index=False)
# Preview the first rows of the file just written.
sample_submission.head(10)

Added: to make the individual model outputs available without having to make every input dataset public, this notebook also writes out each model's OOF and test prediction files.

In [None]:
# Re-export every model's OOF and test predictions under its tag.
# index=False keeps the files column-for-column identical to the frames that were
# loaded (no extra unnamed index column), matching how submission.csv is written.
for descr, oof_file, sub_file in zip(output_tags, oofs_dfs, subs_dfs):
    oof_file.to_csv(f'oof_predictions_model_{descr}.csv', index=False)
    sub_file.to_csv(f'test_predictions_model_{descr}.csv', index=False)