In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import itertools
import os

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

import seaborn as sns
sns.set(font_scale=1.4)

import matplotlib.pyplot as plt

def pl(nr=1, nc=1, fs1=20, fs2=7):
    """Create a figure holding an ``nr`` x ``nc`` grid of subplots.

    Parameters
    ----------
    nr, nc : number of subplot rows / columns.
    fs1, fs2 : figure width / height, forwarded together as ``figsize``.

    Returns
    -------
    The ``(fig, axes)`` pair produced by ``plt.subplots``.
    """
    return plt.subplots(nrows=nr, ncols=nc, figsize=(fs1, fs2))

In [None]:
PATH = '/kaggle/input/tabular-playground-series-jan-2021/'
LABEL = 'target'

# Read the three competition CSVs stored under PATH, in a fixed order.
train, test, sample_submission = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Feature columns: every column of train whose name contains 'cont'.
FT_COLS = [col for col in train.columns if 'cont' in col]

In [None]:
# Flag rows whose label is below 4 as outliers (stored as a boolean column).
train['outlier_filter'] = (train[LABEL] < 4).values
print('# outliers', train['outlier_filter'].sum())

# Complementary mask: True for every row that is NOT flagged.
ol_filt = ~train['outlier_filter']

Note: this notebook and the NN notebooks have been tidied up since the original submission. The score is unchanged.

# Links to notebooks for submissions to be blended

NN Models

V1 with more features

https://www.kaggle.com/davidedwards1/jan21-tabplayground-nn-final-more-features


V2 with fewer features

https://www.kaggle.com/davidedwards1/jan21-tabplayground-nn-final-fewer-features

**Please note: I took the parameters for my LGBM and XGB models from public notebooks. Thanks to the original authors for their work on the tree model parameters.**

https://www.kaggle.com/hamditarek/tabular-playground-series-xgboost-lightgbm


https://www.kaggle.com/hamzaghanmi/xgboost-hyperparameter-tuning-using-optuna

# Load Inputs (OOF, Test predictions)

In [None]:
# The lists below are aligned by position: entry i of each describes model i.
SUBMISSION_DESCR = ['LGBM', 'XGB', 'KERAS', 'KERAS FEWER FTS']
colors = ['Blue', 'Green', 'Red', 'Pink']

# Directory holding each model's saved predictions.
SUBMISSION_PATHS = [
    '/kaggle/input/jan21-lgbm-submission/',
    '/kaggle/input/jan21-tabular-xgb-sub/',
    '/kaggle/input/jan21-tabplayground-nn1-output/',
    '/kaggle/input/jan21-tabplayground-nn2-output/',
]

# Number of models being blended.
NSUBS = len(SUBMISSION_PATHS)

# Prediction Distributions

In [None]:
f, a = pl(nr=1, nc=2, fs1=18, fs2=5)
oof_ax, test_ax = a

# The training-label distribution is drawn first, in black, on both panels.
for panel in (oof_ax, test_ax):
    sns.histplot(train[LABEL], color='Black', alpha=0.5, ax=panel)

# Load each model's out-of-fold and test predictions, in SUBMISSION_PATHS order.
oof_list = []
submission_list = []
for folder in SUBMISSION_PATHS:
    oof_list.append(pd.read_csv(folder + 'oof_predictions.csv')['oof_prediction'].values)
    submission_list.append(pd.read_csv(folder + 'submission.csv')['target'].values)

# Overlay one histogram per model: OOF on the left panel, test on the right.
for oof_preds, test_preds, shade in zip(oof_list, submission_list, colors):
    sns.histplot(oof_preds, color=shade, alpha=0.5, ax=oof_ax)
    sns.histplot(test_preds, color=shade, alpha=0.3, ax=test_ax)

oof_ax.set_title('Train Labels & OOFs')
test_ax.set_title('Train Labels & Test Predictions')
for panel in (oof_ax, test_ax):
    panel.set_xlim(6, 10)

oof_ax.legend(
    ['LABEL', 'LGBM', 'XGB', 'KERAS', 'KERAS2'],
    facecolor='White',
)
test_ax.legend(
    ['LABEL', 'LGBM - TEST PRD.', 'XGB - TEST PRD.', 'KERAS - TEST PRD.', 'KERAS2 - TEST PRD.'],
    facecolor='White',
)

plt.tight_layout()

In [None]:
# Print the smallest and largest test prediction of every model.
print('min and max predictions')
for model_name, test_preds in zip(SUBMISSION_DESCR, submission_list):
    print(model_name, test_preds.min(), test_preds.max())

# Check CV outcome depending on weighting of model

In [None]:
# Candidate raw weights: 0.05 .. 0.95 in steps of 0.05, bracketed by 0.001 and
# 0.999. k / 20 is a correctly rounded division, so each value equals the
# corresponding two-decimal literal exactly.
weights_range = [0.001] + [k / 20 for k in range(1, 20)] + [0.999]

In [None]:
%%time
# Score every combination of raw weights (one weight per model) on the
# out-of-fold predictions.
#
# The table size is derived from weights_range and NSUBS instead of being
# hard-coded, and no loop variables are created, so the axes variable `a`
# used by the plotting cells is no longer overwritten.
raw_grid = np.array(list(itertools.product(weights_range, repeat=NSUBS)))

# Normalise each combination so that its weights sum to 1.
blend_wts = raw_grid / raw_grid.sum(axis=1, keepdims=True)

# Per-model OOF residuals, one column per model.
# Assumes every entry of oof_list is row-aligned with train.
oof_residuals = np.column_stack(oof_list) - train[LABEL].values[:, None]

# Because the weights sum to 1, (blend - label) == residuals @ w, hence
# MSE(w) = w' G w with G = R'R / n. G is only NSUBS x NSUBS, so the full-length
# blend does not have to be rebuilt for every weight combination.
gram = oof_residuals.T @ oof_residuals / len(train)
oof_rmse = np.sqrt(np.einsum('ij,jk,ik->i', blend_wts, gram, blend_wts))

# Results table: the NSUBS normalised weights followed by the OOF RMSE,
# with rows in itertools.product order.
output_wts = np.column_stack([blend_wts, oof_rmse])

# Look at outcomes

Weights suggest approx 40% keras total across 2 models, 40% LGBM, 20% XGB

In [None]:
wt_cols = ['wt_lgbm', 'wt_xgb', 'wt_keras', 'wt_keras2']

# Wrap the raw results array in a labelled frame: one weight column per model
# plus the blend's OOF error.
output_wts = pd.DataFrame(data=output_wts, columns=wt_cols + ['oof_error'])

f, a = pl(nc=NSUBS, fs1=20)

# One panel per model: blending weight against the resulting OOF error.
for panel, wt_col, shade, model_name in zip(a, wt_cols, colors, SUBMISSION_DESCR):
    panel.scatter(x=output_wts[wt_col], y=output_wts['oof_error'], color=shade)
    panel.set_title(model_name)
    panel.set_xlabel('blending weight')
    panel.set_ylabel('oof_score')

plt.tight_layout()

In [None]:
# Order the blends by ascending OOF error and renumber the rows from 0.
output_wts = (
    output_wts
    .sort_values(by='oof_error')
    .reset_index(drop=True)
)
output_wts.head(n=10)

In [None]:
# Select the final weights: column-wise mean over rows labelled 0..200
# (.loc slicing is inclusive, so that is 201 rows).
top_blends = output_wts.loc[:200, ['wt_lgbm', 'wt_xgb', 'wt_keras', 'wt_keras2']]
selected_wts = top_blends.mean(axis=0)

print(selected_wts.sum())
selected_wts

# Estimate final CV error and create mix / submission

Note: I suspect my CV error is somewhat underestimated, due to label encoding etc. across CV folds.

In [None]:
# Blend the test and out-of-fold predictions with the selected weights.
#
# selected_wts is a Series labelled by column name ('wt_lgbm', ...), so the
# weights are taken positionally from its underlying array. Indexing the
# Series with an integer key relies on a deprecated positional fallback.
blend_weights = selected_wts.values

final_test_preds = np.zeros((len(sample_submission),))
final_oof_preds = np.zeros((len(train),))

for model_wt, test_preds, oof_preds in zip(blend_weights, submission_list, oof_list):
    final_test_preds += test_preds * model_wt
    final_oof_preds += oof_preds * model_wt

# RMSE of the blended out-of-fold predictions against the training labels.
print('final CV error', np.sqrt(mse(train[LABEL], final_oof_preds)))

In [None]:
f, a = pl(nr=1, nc=2, fs1=15, fs2=5)
labels_ax, blend_ax = a

# Left panel: training-label density only.
sns.kdeplot(train[LABEL], color='Black', alpha=0.5, ax=labels_ax)

# Right panel: the same label density with the blended test predictions on top.
sns.kdeplot(train[LABEL], color='Black', alpha=0.5, ax=blend_ax)
sns.kdeplot(final_test_preds, color='Green', alpha=0.5, ax=blend_ax)

labels_ax.set_title('Train Labels')
blend_ax.set_title('Train Labels & Test Predictions')
for panel in (labels_ax, blend_ax):
    panel.set_xlim(4.5, 10)

blend_ax.legend(['LABEL', 'Test Predictions'], facecolor='White')

plt.tight_layout()

In [None]:
# Put the blended test predictions into the sample file's 'target' column,
# write the result to disk without the index, and preview the first five rows.
sample_submission['target'] = final_test_preds
sample_submission.to_csv(path_or_buf='submission.csv', index=False)
sample_submission.head()