![](https://marychin.org/download/kaggle/tabmar.png)

Update 10th March:
* Plot ROC curves:
  - manual back-of-the-envelope calculation (excellent refresher);
  - ```sklearn.metrics.plot_roc_curve```.
* ```roc_auc_score``` with and without ```average='micro'``` option: no difference found.

This is the first walkthrough of the March Playground:
* identifying troublesome features such as ```cat10```, ```cat5```, ```cat8```, ```cat7``` and others, which require handling;
* running quick-and-dirty baselines (without parameter tweaking) using LightGBM, XGBoost, CatBoost and Random Forests;
* looking at gains and feature rankings from LightGBM, XGBoost, CatBoost and Random Forests;
* running BorutaShap, which reports each feature as either confirmed important, unimportant or tentative.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
sns.set_palette('hot')

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, plot_roc_curve
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import sys, glob, copy, warnings, time
warnings.simplefilter('ignore') # once | error | always | default | module

inp = '/kaggle/input/tabular-playground-series-mar-2021/'

In [None]:
# Read every CSV under `inp` into `df`, keyed by file stem, and remember each
# file's column set in `features`. A one-line summary is printed per file.
df = {}
features = {}
header_fmt = '{:18s}{:>10s}{:>5s}{:>5s}'
row_fmt = '{:18s}{:10,d}{:5d}{:5d}'
print(header_fmt.format('FILE', 'ROWS', 'COLS', 'NULL'))
for csv_path in glob.glob(f'{inp}/*.csv'):
    # '.../train.csv' -> 'train'
    label = csv_path.split('/')[-1].split('.')[0]
    frame = pd.read_csv(csv_path, index_col='id')
    n_rows, n_cols = frame.shape
    # Number of columns that contain at least one missing value.
    null_cols = frame.isna().any().sum()
    df[label] = frame
    features[label] = set(frame.columns.to_list())
    print(row_fmt.format(label, n_rows, n_cols, null_cols))

In [None]:
# True when sample_submission and test carry the same ids in the same order,
# in which case predictions can be written into sample_submission row by row.
(df['sample_submission'].index == df['test'].index).all()

In [None]:
# True when train's column set is exactly test's columns plus
# sample_submission's columns, i.e. no column is missing or extra.
features['train'] == features['test'].union(features['sample_submission'])

In [None]:
# Eyeball five randomly drawn training rows.
df['train'].sample(5)

## Datatypes

In [None]:
# Print the training columns grouped by dtype, with a count per dtype.
sr = pd.DataFrame(df['train'].dtypes, columns=['dtype'])
divider = '=' * 10
for kind, members in sr.groupby('dtype'):
    column_names = members.index.to_list()
    print('{:2d} columns of dtype {}\n{}'.format(len(column_names), kind, divider))
    print(column_names, '\n')

In [None]:
# Distinct values taken by the target column.
df['train']['target'].unique()
# target is in fact categorical, not continuous.

In [None]:
# Summary statistics restricted to the float-typed columns.
df['train'].describe(include='float')

In [None]:
# Summary statistics restricted to the object-typed columns.
df['train'].describe(include='object')

In [None]:
# Column-name lists reused throughout the notebook:
#   'cat' - columns whose name starts with 'cat'
#   'con' - columns whose name starts with 'con'
#   'num' - columns of float or int dtype
# This rebinds `features`, replacing the per-file column sets built at load time.
train_columns = df['train'].columns
features = {
    'cat': [name for name in train_columns if name.startswith('cat')],
    'con': [name for name in train_columns if name.startswith('con')],
    'num': df['train'].select_dtypes(include=[float, int]).columns.to_list(),
}
features

## Categories & encoding

In [None]:
# Distinct values of every categorical feature, kept as sets so that train and
# test can be compared with set algebra:
#   unik['train'][feature] / unik['test'][feature]
unik = {'train': {}, 'test': {}}
print('{:<8s}{} {}'.format('FEATURE', 'NUNIQUE', 'UNIQUE VALUES IN TRAIN'))
# Walk the features from fewest to most distinct values; the ones printed last
# are the troublesome high-cardinality features.
for feature in df['train'][features['cat']].nunique().sort_values().index:
    for dataset in ('train', 'test'):
        unik[dataset][feature] = set(df[dataset][feature].unique())
    # A set has no order, so sort at display time to get a stable, readable listing.
    print('{:<8s}{:7d} {}'.format(feature, len(unik['train'][feature]), sorted(unik['train'][feature])))

In [None]:
# Report every categorical feature whose value set differs between train and test.
print('{:<8s}{:76s}'.format('FEATURE', 'UNIQUE VALUES IN TRAIN'))
for feature in features['cat']:
    train_values = unik['train'][feature]
    test_values = unik['test'][feature]
    if train_values == test_values:
        continue
    print('in train but not in test:', feature, train_values - test_values)
    print('in test but not in train:', feature, test_values - train_values)

In [None]:
# Fit one ordinal encoder on train and test stacked together, so a category
# that appears in only one of the two still receives a code.
ncoda = OrdinalEncoder().fit(pd.concat([ df['train'][features['cat']], 
                             df['test'][features['cat']] ]))
# For sanity check only; will be deleted real soon:
# (deep copy of all loaded frames, taken before the in-place encoding below)
orig = copy.deepcopy(df)
for dataset in ['train', 'test']:
    # Replace the categorical columns with their codes, then cast the codes to int.
    df[dataset][features['cat']] = ncoda.transform(df[dataset][features['cat']])
    df[dataset][features['cat']] = df[dataset][features['cat']].astype(int)# .astype('category')
# Show the category list learned for each encoded column.
ncoda.categories_

In [None]:
# Just a pedantic sanity check.
# Decoding the encoded columns must reproduce the original values exactly.
for dataset in ('train', 'test'):
    restored = ncoda.inverse_transform(df[dataset][features['cat']])
    assert (restored == orig[dataset][features['cat']]).all().all()
del orig   # Deleted as promised.

In [None]:
# Column dtypes and non-null counts after encoding.
df['train'].info()

## Distribution: categorical features by target

In [None]:
# Long-format count table: one row per (categorical feature, category value,
# target class), indexed by '<feature>_<category>_<target>'.
# Rows are collected in a list and the frame is built once, which avoids the
# quadratic cost of growing a DataFrame with pd.concat inside the loop.
records, row_labels = [], []
for feature in features['cat']:
    for group, group_data in df['train'].groupby(feature):
        target_counts = group_data['target'].value_counts()
        # Flag categories whose per-class counts collapse to a single distinct
        # value: either only one target class is present, or the classes tie.
        # NOTE(review): if only the single-class case is wanted, test
        # len(target_counts) == 1 instead - confirm intent.
        if target_counts.nunique() == 1:
            print(feature, group)
        # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
        for target, count in target_counts.items():
            records.append({'feature': feature,
                            'feature_category': group,
                            'target': target,
                            'count': count})
            row_labels.append(f'{feature}_{group}_{target}')
valuecount2D = pd.DataFrame(records, index=row_labels,
                            columns=['feature', 'feature_category', 'target', 'count'])
valuecount2D

In [None]:
# Sanity check: the tabulated count for cat18 == 3 with target == 1 must match
# a direct row count on the training frame.
auto = valuecount2D.loc['cat18_3_1', 'count']
manual = len(df['train'].query('target==1 and cat18==3'))
print('sane' if auto == manual else 'insane')

In [None]:
# One horizontal bar chart per categorical feature, split by target, laid out
# on a grid three panels wide.
cols = 3
rows = int(np.ceil(len(features['cat']) / cols))
fig, ax = plt.subplots(rows, cols, figsize=(15, 7*rows), sharex=True)
# As before, start with well-behaved features, with the problematic ones at the end, in that order.
ordered_features = df['train'][features['cat']].nunique().sort_values().index
for nfeature, feature in enumerate(ordered_features):
    grid_row, grid_col = divmod(nfeature, cols)
    tis_ax = ax[grid_row][grid_col]
    feature_counts = valuecount2D[valuecount2D['feature'] == feature]
    sns.barplot(data=feature_counts, x='count', y='feature_category', hue='target',
                orient='h', ax=tis_ax, palette='hot')
    tis_ax.set_title(feature)
# As warned by earlier text output we find cat5 and cat10 screaming for attention.

## Distribution: continuous features

In [None]:
%%time
# One violin per continuous feature, all drawn on a single wide figure.
plt.figure(figsize=(15, 5))
sns.violinplot(data=df['train'][ features['con'] ])

In [None]:
# Scatter each continuous feature's median against its mean. A feature whose
# mean equals its median sits on the dash-dot diagonal; distance from that
# diagonal shows how far the two summary statistics disagree.
tmp = df['train'][features['con']]
xx = tmp.mean()
yy = tmp.median()
plt.figure(figsize=(10, 10))
# Draw the true y = x reference over the full range of both statistics. Joining
# (min mean, min median) to (max mean, max median) is generally not y = x.
lo = min(xx.min(), yy.min())
hi = max(xx.max(), yy.max())
plt.plot([lo, hi], [lo, hi], 'y-.')
plt.plot(xx, yy, '.r')
# Label each point with its feature name, nudged right so the text clears the marker.
for x, y, z in zip(xx, yy, tmp.columns):
    plt.text(x+.005, y, z)
plt.axis('equal')
plt.xlabel('feature mean')
plt.ylabel('feature median')

## Distribution: continuous features by target

In [None]:
# Density histogram of every continuous feature, coloured by target, on a grid
# two panels wide.
cols = 2
rows = int(np.ceil(len(features['con']) / cols))
fig, ax = plt.subplots(rows, cols, figsize=(15, 5*rows))
for nfeature, feature in enumerate(features['con']):
    grid_row, grid_col = divmod(nfeature, cols)
    sns.histplot(data=df['train'], y=feature, hue='target', stat='density',
                 ax=ax[grid_row, grid_col], palette='hot')

## 2D flood maps: how feature pairs cross-talk
Seaborn has one-liners for this, but they run for an eternity without returning. Hence this dirty hack.

In [None]:
# Pairwise 2D occupancy maps of the continuous features over train and test combined.
traintest = pd.concat([df['train'], df['test']])
# Replace every continuous value by the index of its bin (32 equal-width bins per feature).
binned = traintest[features['con']].apply(lambda x: pd.cut(x, bins=32, labels=False))
plt.figure(figsize=(15, 15))
nfeatures = len(features['con'])
# Fill only the lower triangle of an nfeatures x nfeatures grid:
# grid row aa, grid column bb, with bb < aa.
for aa in range(1, nfeatures):
    for bb in range(aa):
        plt.subplot(nfeatures, nfeatures, aa*nfeatures + bb + 1)
        # Count rows per (bin of feature aa, bin of feature bb); after unstack the
        # heatmap rows are bins of feature aa and the columns are bins of feature bb.
        sns.heatmap(binned.groupby(features['con'][aa]).apply(lambda x: x[features['con'][bb]].value_counts()).unstack(), 
                    square=True, cmap='hot', cbar=False, xticklabels=False, yticklabels=False)
        plt.axis('off')
# First grid column: switch the axis back on and label it with the row's feature name.
# NOTE(review): this relies on plt.subplot re-selecting the axes created above
# rather than replacing them - confirm for the matplotlib version in use.
for tmp in range(1, nfeatures):
    plt.subplot(nfeatures, nfeatures, nfeatures*tmp+1)
    plt.axis('on'); plt.ylabel(features['con'][tmp])
# Bottom grid row: switch the axis back on and label it with the column's feature name.
for tmp in range(nfeatures-1):
    plt.subplot(nfeatures, nfeatures, nfeatures*(nfeatures-1)+tmp+1)
    plt.axis('on'); plt.xlabel(features['con'][tmp])
# Bottom grid row, except its first panel: blank the y-label so only the first column carries one.
for tmp in range(1, nfeatures-1):
    plt.subplot(nfeatures, nfeatures, nfeatures*(nfeatures-1)+tmp+1)
    plt.ylabel('')

In [None]:
%%time
# Pairwise correlation of the columns of the combined train+test frame,
# saved to corr.csv and drawn as an annotated heatmap.
corr = traintest.corr()
corr.to_csv('corr.csv')
plt.figure(figsize=(15, 15))
# The mask hides the upper triangle and the diagonal, leaving each pair shown once.
sns.heatmap(corr, mask=np.triu(np.ones_like(corr, dtype=bool)), annot=True, fmt='.1f', linewidths=.5, square=True, cmap='hot', annot_kws={'size': 10}, cbar_kws={"shrink": .5})

In [None]:
# For every column keep its largest correlation with any *other* column.
# After an ascending sort the last entry is the self-correlation (1.0), so the
# second-to-last entry is the strongest cross-correlation.
# .iloc makes the positional lookup explicit: plain [-2] on a label-indexed
# Series is deprecated in pandas 2.x.
# NOTE(review): values are signed, so a strong negative correlation would not
# surface here; compare absolute values if that matters.
slimcorr = pd.Series(
    {feature: corr[feature].sort_values().iloc[-2] for feature in corr.columns},
    dtype=float,
)
slimcorr.sort_values(ascending=False)
# output reports no correlation too high; therefore too premature to drop any feature

## 4 baselines before tuning

In [None]:
# Separate the predictors from the target, then hold out a validation split.
dataX = df['train'].drop(columns='target')
datay = df['train']['target'].copy()
trainX, validX, trainy, validy = train_test_split(dataX, datay)

def trainNpredict(model):
    """Fit ``model`` on the training split and score it on the validation split.

    Reads the notebook-level ``trainX``, ``trainy``, ``validX`` and ``validy``.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``.

    Returns
    -------
    tuple
        ``(model, seconds, roc_auc)``: the estimator passed in (now fitted),
        the wall-clock seconds spent on fit + predict, and the validation
        ROC AUC computed with ``average='micro'``.
    """
    # Time only the model work; scoring and printing are excluded so that the
    # timings compare the estimators rather than console I/O.
    tic = time.perf_counter()
    pred = model.fit(trainX, trainy).predict_proba(validX)[:, 1]
    elapsed = time.perf_counter() - tic
    roc_auc = roc_auc_score(validy, pred, average='micro')
    # Both variants are printed to show that `average` changes nothing for a binary target.
    print("roc_auc_score(validy, pred, average='micro') =", roc_auc)
    print("roc_auc_score(validy, pred) =", roc_auc_score(validy, pred))
    return model, elapsed, roc_auc

# Per-baseline results keyed by a short label: the estimator, the elapsed
# seconds and the validation ROC AUC returned by trainNpredict.
model, tictoc, roc_auc = {}, pd.Series(dtype=float), pd.Series(dtype=float)

In [None]:
# Random-forest baseline (200 trees, depth capped at 7); results are stored under 'rf'.
label = 'rf'
model[label], tictoc[label], roc_auc.loc[label] = trainNpredict(RandomForestClassifier(n_estimators=200, max_depth=7))

In [None]:
# LightGBM baseline with is_unbalance switched on; results are stored under 'lgb'.
label = 'lgb'
lgb_result = trainNpredict(LGBMClassifier(is_unbalance=True))
model[label], tictoc[label], roc_auc.loc[label] = lgb_result

In [None]:
# XGBoost baseline; results are stored under 'xgb'.
label = 'xgb'
# Negative-to-positive row ratio over the full training frame, handed to
# XGBoost as scale_pos_weight (and reused by the CatBoost baseline).
target_column = df['train']['target']
scale_pos_weight = target_column.eq(0).sum() / target_column.eq(1).sum()
xgb_result = trainNpredict(XGBClassifier(scale_pos_weight=scale_pos_weight))
model[label], tictoc[label], roc_auc.loc[label] = xgb_result

In [None]:
# CatBoost baseline, using the scale_pos_weight computed for the XGBoost
# baseline; results are stored under 'cat'.
label = 'cat'
cat_result = trainNpredict(CatBoostClassifier(scale_pos_weight=scale_pos_weight))
model[label], tictoc[label], roc_auc.loc[label] = cat_result

In [None]:
# Baselines ranked by validation ROC AUC, best first.
roc_auc.sort_values(ascending=False)

In [None]:
# Baselines ranked by elapsed time, fastest first.
tictoc.sort_values()

## Plot ROC curves

In [None]:
# Manual back-of-the-envelope ROC: sweep 100 thresholds across each model's
# predicted probabilities and plot the false-positive rate against the
# true-positive rate.
plt.figure(figsize=(7, 5))
kolor = {'rf': 'r',
         'lgb': 'g',
         'xgb': 'b',
         'cat': 'k'}
# A ROC curve normalises by class size, not by the total row count:
#   FPR = FP / (number of negatives),  TPR = TP / (number of positives).
is_positive = datay == 1
is_negative = datay == 0
n_positive = is_positive.sum()
n_negative = is_negative.sum()
for k, v in model.items():
    # The scores are also stored as a column of df['train'], as before.
    # NOTE(review): re-running the split cell afterwards would pick these
    # columns up as features - confirm whether the column is needed elsewhere.
    df['train'][k] = v.predict_proba(dataX)[:, 1]
    scores = df['train'][k]
    thresholds = np.linspace(scores.min(), scores.max(), 100)
    fpr = [((scores > threshold) & is_negative).sum() / n_negative for threshold in thresholds]
    tpr = [((scores > threshold) & is_positive).sum() / n_positive for threshold in thresholds]
    # One plot call per model instead of one per threshold.
    plt.plot(fpr, tpr, '.', color=kolor[k], label=k)
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend()

In [None]:
# auto: sklearn.metrics.plot_roc_curve
# Draw every model on one shared axes. Without `ax`, each call opens its own
# figure and the colour-coded curves cannot be compared.
# NOTE(review): dataX contains the rows the models were fitted on, so these
# curves are optimistic; validX/validy would give a held-out view.
roc_fig, roc_ax = plt.subplots(figsize=(7, 5))
for k, v in model.items():
    plot_roc_curve(v, dataX, datay, name=k, color=kolor[k], ax=roc_ax)

## Pick the best baseline, submit and see

In [None]:
# Predict the test set with the baseline that has the highest validation
# ROC AUC and write the probabilities to submission.csv.
best_label = roc_auc.idxmax()
best_model = model[best_label]
df['sample_submission']['target'] = best_model.predict_proba(df['test'])[:, 1]
df['sample_submission'].to_csv('submission.csv')

## Gain

In [None]:
# feature_importances_ of every fitted baseline, one column per model and one
# row per training feature; displayed as integer ranks ordered by the LightGBM column.
gain = pd.DataFrame(
    {treetype: fitted.feature_importances_ for treetype, fitted in model.items()},
    index=trainX.columns,
)
gain.rank().astype(int).sort_values(by='lgb')

## BorutaShap

In [None]:
if 'BorutaShap' not in sys.modules:
    !pip install BorutaShap
from BorutaShap import BorutaShap

In [None]:
# Run BorutaShap over the full training data with an XGBoost classifier built
# with tree_method='gpu_hist', ranking features by SHAP importance.
# Alternatives left for experimentation: importance_measure='gini', or
# model=LGBMClassifier().
boruta_estimator = XGBClassifier(tree_method='gpu_hist')
Feature_Selector = BorutaShap(model=boruta_estimator, importance_measure='shap')
# Further fit options not used here: sample=False, train_or_test='test', normalize=True, verbose=True.
Feature_Selector.fit(X=dataX, y=datay, n_trials=1000, verbose=False)

In [None]:
# Save the BorutaShap results to borutashap.csv and plot all features.
Feature_Selector.results_to_csv(filename='borutashap.csv')
Feature_Selector.plot(which_features='all')  # X_size=15, figsize=(12,8), y_scale='log'

In [None]:
# Features BorutaShap accepted.
Feature_Selector.accepted

In [None]:
# Features BorutaShap suggests removing.
Feature_Selector.features_to_remove