![](https://marychin.org/download/kaggle/tabfeb.png)

# First steps: a few checkboxes
* Any null values? - No, all clear. See **Quick look** section.
* Is the data skewed, a bit like [Mechanism of Action](https://www.kaggle.com/c/lish-moa/submissions)? - Yes, severely. See the **Distribution** section. Without a fix, this is going to plague learning and prediction whether we use gradient boosters, neural networks or other techniques.
* Any outliers?
* Any highly-correlated features? No. See **Correlation** section.
* Any hurdle in submission process (like [Jane Street](https://www.kaggle.com/c/jane-street-market-prediction))? No; straightforward submission this time.

Note that cells beginning with Jupyter magic ```%%time``` take longer to run. The output of the cell indicates how long. Seaborn's ```pairplot``` is, for instance, notorious for taking a loooong time to run.

## What I like about this competition
Quick turnaround time! We can test parameters, get the output quickly, adjust and test again over and over. Even submissions return the score almost instantly. Excellent for experimenting.

In [None]:
# Numerical, data-frame and plotting stack.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use the "whitegrid" seaborn theme and make 'hot' the default colour palette.
sns.set_theme(style="whitegrid")
sns.set_palette('hot')

# scikit-learn helpers plus the regressors that are compared further down.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
import sys, glob, copy, warnings
# Silence all warnings raised from here on.
warnings.simplefilter("ignore")

# Directory the competition CSV files are read from.
inp = '/kaggle/input/tabular-playground-series-feb-2021/'

## Quick look

In [None]:
# Read every CSV found in the input directory into `df` (keyed by the file name
# without its extension), remember each frame's column names in `features`, and
# print one summary line per file.
df = {}
features = {}
header_fmt = '{:18s}{:>10s}{:>5s}{:>5s}'
row_fmt = '{:18s}{:10,d}{:5d}{:5d}'
print(header_fmt.format('FILE', 'ROWS', 'COLS', 'NULL'))
for csv_path in glob.glob(f'{inp}/*.csv'):
    basename = csv_path.split('/')[-1]
    label = basename.split('.')[0]
    frame = pd.read_csv(csv_path, index_col='id')
    df[label] = frame
    features[label] = set(frame.columns.to_list())
    n_rows, n_cols = frame.shape
    # NULL column of the report: number of columns holding at least one missing value.
    n_null_cols = frame.isna().any().sum()
    print(row_fmt.format(label, n_rows, n_cols, n_null_cols))

In [None]:
# Element-wise comparison of the two indexes: True only if the sample submission
# and the test set list the same ids in the same positions.
(df['sample_submission'].index == df['test'].index).all()
# Output confirms good behavior; no worries.

In [None]:
# `features` holds one set of column names per CSV at this point; check that the
# train columns are exactly the test columns plus the sample-submission columns.
features['train'] == features['test'].union(features['sample_submission'])
# Output confirms no surprises; nothing tricky; nothing fancy.
# This wasn't the case for competitions like Jane Street and Data Science Bowl.

In [None]:
# Peek at the last rows of the training frame.
df['train'].tail()

In [None]:
# Summary statistics restricted to the object-dtype columns of the training frame.
df['train'].describe(include='object')

In [None]:
# Rebuild `features` as two lists of column names: categorical ('cat') and
# continuous ('con').
train_columns = df['train'].columns
is_categorical = train_columns.str.startswith('cat')
features = {
    'cat': train_columns[is_categorical].to_list(),
    # every float column, which includes target
    'con': df['train'].select_dtypes(include='float').columns.to_list(),
}
features

## Categorical features

In [None]:
# For each categorical feature, list the distinct values seen in train and say
# whether test shows the same set (otherwise print the values seen in test).
print('{:<8s}{:76s}{}'.format('FEATURE', 'VALUES IN TRAIN', 'VALUES IN TEST'))
for feature in features['cat']:
    train_levels = sorted(df['train'][feature].unique())
    test_levels = sorted(df['test'][feature].unique())
    test_column = 'same here' if train_levels == test_levels else str(test_levels)
    print('{:<8s}{:76s}'.format(feature, str(train_levels)) + test_column)
# Output shows
# - agreement in all columns except cat6;
# - unique values in test are a subset of unique values in train.

In [None]:
# Fit an ordinal encoder on the categorical columns of train, then replace those
# columns in both train and test by their integer codes.
cat_cols = features['cat']
ncoda = OrdinalEncoder()
ncoda.fit(df['train'][cat_cols])
# For sanity check only; will be deleted real soon:
orig = copy.deepcopy(df)
for dataset in ('train', 'test'):
    frame = df[dataset]  # same object as df[dataset], so the frame is updated in place
    frame[cat_cols] = ncoda.transform(frame[cat_cols])
    frame[cat_cols] = frame[cat_cols].astype(int)  # .astype('category')
ncoda.categories_

In [None]:
# Just a pedantic sanity check: decoding the integer codes must give back the
# original categorical values, for train and for test.
for dataset in ('train', 'test'):
    roundtrip_ok = (ncoda.inverse_transform(df[dataset][features['cat']]) == orig[dataset][features['cat']]).all().all()
    assert roundtrip_ok
del orig

## What we've got so far

In [None]:
# Column dtypes and non-null counts of the training frame after encoding.
df['train'].info()

## Distribution

In [None]:
# Bar charts of the relative frequency of every level of every categorical
# feature, ordered from the most to the least imbalanced feature.
cols = 3
rows = int(np.ceil(len(features['cat'])/cols))
# squeeze=False keeps `ax` two-dimensional even when a single row of panels is
# enough, so the [row, col] indexing below never fails.
fig, ax = plt.subplots(rows, cols, figsize=(15, 5*rows), sharex=True, squeeze=False)
valuecounts = {}
max2min = pd.Series(dtype=float)
for feature in features['cat']:
    # Relative frequency of each level, ordered by level.
    valuecounts[feature] = df['train'][feature].value_counts(normalize=True).sort_index()
    # Imbalance measure: most frequent level over least frequent level.
    max2min[feature] = valuecounts[feature].max()/valuecounts[feature].min()
# plot histogram starting with the most imbalanced feature, in that order
for nfeature, feature in enumerate(max2min.sort_values(ascending=False).index):
    tis_ax = ax[nfeature//cols, nfeature%cols]
    sns.barplot(x=valuecounts[feature].values, y=valuecounts[feature].index, orient='h', ax=tis_ax, palette='hot')
    tis_ax.set_title(f'{feature}: {max2min[feature]:7.1e}')
    print(feature, '_'*10)
    for idx, value in valuecounts[feature].items():
        print('{:2d} ({:7.1e})'.format(idx, value), end='\t')
    print()
# Output confirms our premonition of an unbalanced distribution.

In [None]:
# Plot distribution of target, broken down into contributing components of each categorical feature, starting with the most imbalanced feature, in that order.
# squeeze=False keeps `ax` two-dimensional even when rows == 1, so the
# [row, col] indexing below always works.
fig, ax = plt.subplots(rows, cols, figsize=(15, 5*rows), sharex=True, squeeze=False)
for nfeature, feature in enumerate(max2min.sort_values(ascending=False).index):
    sns.histplot(data=df['train'], x='target', stat='density', hue=feature, ax=ax[nfeature//cols, nfeature%cols], palette='hot')

In [None]:
# Density histogram of every continuous column (target included), two panels per row.
cols = 2
rows = int(np.ceil(len(features['con'])/cols))
# squeeze=False keeps `ax` two-dimensional even when one row of panels is
# enough, so the [row, col] indexing below always works.
fig, ax = plt.subplots(rows, cols, figsize=(15, 5*rows), squeeze=False)
for nfeature, feature in enumerate(features['con']):
    sns.histplot(df['train'][feature], stat='density', ax=ax[nfeature//cols, nfeature%cols], palette='hot')

In [None]:
# Skewness (top panel) and kurtosis (bottom panel) of every continuous column.
fig, ax = plt.subplots(2, 1, figsize=(15, 10))
con_frame = df['train'][features['con']]
skew_kurtosis = pd.DataFrame({'skew': con_frame.skew(), 'kurtosis': con_frame.kurtosis()})
top_panel, bottom_panel = ax
sns.barplot(data=skew_kurtosis, x=skew_kurtosis.index, y='skew', orient='v', palette='hot', ax=top_panel)
sns.barplot(data=skew_kurtosis, x=skew_kurtosis.index, y='kurtosis', orient='v', palette='hot', ax=bottom_panel)

In [None]:
# Scatter each selected column's mean against its median; points away from the
# mean == median diagonal indicate an asymmetric distribution.
tmp = df['train'][df['train'].columns[df['train'].columns.str.startswith('con')]]
xx = tmp.mean()
yy = tmp.median()
plt.figure(figsize=(10, 10))
# Reference diagonal where mean equals median. Both axes use the same limits so
# the line really is y = x (joining (min mean, min median) to (max mean, max
# median) is not, in general).
lo = min(xx.min(), yy.min())
hi = max(xx.max(), yy.max())
plt.plot([lo, hi], [lo, hi], 'y-.')
plt.plot(xx, yy, '.r')
# Iterating over the frame yields its column names, used here as point labels.
for x, y, z in zip(xx, yy, tmp):
    plt.text(x+.005, y, z)
plt.axis('equal')
plt.xlabel('feature mean')
plt.ylabel('feature median')

In [None]:
%%time
plt.figure(figsize=(15, 5))
sns.violinplot(data=df['train'][ df['train'].columns[df['train'].columns.str.startswith('cont')] ], palette='hot')

## Correlation

In [None]:
%%time
# Stack the train rows on top of the test rows; `traintest` is reused by the
# following cells. NOTE(review): test presumably has no target column, so target
# would be NaN for those rows -- confirm.
traintest = pd.concat([df['train'], df['test']])
# Scatter-plot matrix of every pair of columns.
sns.pairplot(traintest, palette='hot')
# Pairplot reveals Tabular Playground's fingerprint unlikely to be found in Featured Competitions.

## 2D flood maps: how feature pairs cross-talk
The fingerprints in the previous cell don't tell us very much; they are scatter plots, which do not indicate frequency. Flood maps would tell us more. Seaborn has a one-liner for that, but it runs for an eternity without returning. Here, therefore, is a dirty hack.

In [None]:
# Replace every continuous value by the integer index of the bin (out of 32) it
# falls into, column by column.
binned = traintest[features['con']].apply(lambda x: pd.cut(x, bins=32, labels=False))
plt.figure(figsize=(15, 15))
nfeatures = len(features['con'])
# Lower triangle of an nfeatures x nfeatures grid: one heat map per pair (aa, bb) with bb < aa.
for aa in range(1, nfeatures):
    for bb in range(aa):
        plt.subplot(nfeatures, nfeatures, aa*nfeatures + bb + 1)
        # For each bin of feature aa, count how often each bin of feature bb occurs;
        # unstack() spreads the bb bins over the columns, giving the 2-D table that is drawn.
        sns.heatmap(binned.groupby(features['con'][aa]).apply(lambda x: x[features['con'][bb]].value_counts()).unstack(), 
                    square=True, cmap='hot', cbar=False, xticklabels=False, yticklabels=False)
        plt.axis('off')
# First-column panels: switch the axes back on and label each with its row feature.
# plt.subplot is called again with the same position to make that panel current
# (assumes matplotlib hands back the axes already drawn there -- TODO confirm for
# the installed version).
for tmp in range(1, nfeatures):
    plt.subplot(nfeatures, nfeatures, nfeatures*tmp+1)
    plt.axis('on'); plt.ylabel(features['con'][tmp])
# Bottom-row panels: switch the axes back on and label each with its column feature.
for tmp in range(nfeatures-1):
    plt.subplot(nfeatures, nfeatures, nfeatures*(nfeatures-1)+tmp+1)
    plt.axis('on'); plt.xlabel(features['con'][tmp])
# Bottom-row panels other than the first: blank out the y-label.
for tmp in range(1, nfeatures-1):
    plt.subplot(nfeatures, nfeatures, nfeatures*(nfeatures-1)+tmp+1)
    plt.ylabel('')

In [None]:
%%time
corr = traintest.corr()
corr.to_csv('corr.csv')   # Best to save a copy as it takes ages to run.
plt.figure(figsize=(10, 10))
sns.heatmap(corr, mask=np.triu(np.ones_like(corr, dtype=bool)), square=True, cmap='hot', cbar_kws={"shrink": .5})
corr

In [None]:
# For every column keep its second-largest correlation coefficient: the largest
# is the column's correlation with itself, so the runner-up is its strongest
# (signed) correlation with any other column.
# .iloc makes the positional lookup explicit; plain [-2] on a label-indexed
# Series relies on a positional fallback that pandas has deprecated.
slimcorr = pd.Series(
    {feature: corr[feature].sort_values().iloc[-2] for feature in corr.columns},
    dtype=float,
)
slimcorr.sort_values(ascending=False)
# output reports no correlation too high; therefore too premature to drop any feature

## First stab

In [None]:
# Separate the predictors from the target, then hold out part of the rows for validation.
dataX = df['train'].copy()
datay = dataX.pop('target')
# Fixed seed so that the same rows land in the validation set on every run,
# which keeps the model comparison below repeatable.
trainX, validX, trainy, validy = train_test_split(dataX, datay, random_state=42)

## Set the bar low

In [None]:
def trainNpredict(model):
    """Fit *model* on the training split and score it on the validation split.

    Uses the notebook-level ``trainX``/``trainy`` for fitting,
    ``validX``/``validy`` for scoring and ``dataX`` for the returned
    predictions. Prints the validation RMSE.

    Args:
        model: Estimator whose ``fit(X, y)`` returns the fitted estimator and
            which provides ``predict(X)``.

    Returns:
        Tuple ``(predictions for every row of dataX, validation RMSE)``.
    """
    valid_pred = model.fit(trainX, trainy).predict(validX)
    # Take the square root explicitly: the ``squared=False`` argument of
    # mean_squared_error is deprecated and absent from recent scikit-learn
    # releases, whereas this form works on every version.
    valid_rmse = np.sqrt(mean_squared_error(validy, valid_pred))
    print('rmse =', valid_rmse)
    return model.predict(dataX), valid_rmse

# Containers filled by every model below: full-data predictions and validation
# RMSE, both keyed by model name.
pred = {}
rmse = pd.Series(dtype=float)
baseline_pred, baseline_rmse = trainNpredict(DummyRegressor(strategy='median'))
pred['dummy median'] = baseline_pred
rmse.loc['dummy median'] = baseline_rmse
# All 4 dummy models are courtesy of https://www.kaggle.com/inversion/get-started-feb-tabular-playground-competition.

## Next, raise the bar marginally higher

In [None]:
# Ordinary least squares with an intercept; stores its predictions and validation RMSE.
pred['linear regression'], rmse.loc['linear regression'] = trainNpredict(LinearRegression(fit_intercept=True))

In [None]:
# Lasso fitted without an intercept (unlike the linear regression above, which fits one).
pred['lasso'], rmse.loc['lasso'] = trainNpredict(Lasso(fit_intercept=False))

In [None]:
# Random forest of 50 trees using all available cores (n_jobs=-1); %time reports how long the statement takes.
%time pred['random forest'], rmse.loc['random forest'] = trainNpredict(RandomForestRegressor(n_estimators=50, n_jobs=-1))

In [None]:
# LightGBM regressor with its default settings; %time reports how long the statement takes.
%time pred['lgb'], rmse.loc['lgb'] = trainNpredict(LGBMRegressor())

In [None]:
# XGBoost regressor with its default settings; %time reports how long the statement takes.
%time pred['xgb'], rmse.loc['xgb'] = trainNpredict(XGBRegressor())

In [None]:
# Validation RMSE of every model tried so far, smallest (best) first.
rmse.sort_values()

In [None]:
# One panel per model: predicted against true target over all of dataX, with the
# y = x line as reference and the validation RMSE in the title.
ncols = 2
# Never fewer than the original 3 rows, but grow the grid when more than six
# models have been stored in `pred` (a fixed 3x2 grid would then raise).
nrows = max(3, int(np.ceil(len(pred)/ncols)))
plt.figure(figsize=(10, 15))
target_range = [datay.min(), datay.max()]
for nmodel, model_name in enumerate(pred):
    plt.subplot(nrows, ncols, 1+nmodel)
    plt.plot(datay, pred[model_name], '.')
    plt.plot(target_range, target_range, 'y-.')
    plt.grid(True)
    plt.xlabel('true_y')
    plt.ylabel('pred_y')
    plt.title('{}: {:5.1f}'.format(model_name, rmse[model_name]))

## Pick the best dummy, submit and see

In [None]:
# Refit a default LightGBM regressor on all training rows, predict the test set
# and write the predictions out in the sample-submission layout.
final_model = LGBMRegressor().fit(dataX, datay)
df['sample_submission']['target'] = final_model.predict(df['test'])
df['sample_submission'].to_csv('dummy_submission.csv')

## BorutaShap

In [None]:
if 'BorutaShap' not in sys.modules:
    !pip install BorutaShap
from BorutaShap import BorutaShap
Feature_Selector = BorutaShap(model=LGBMRegressor(), importance_measure='shap', classification=False)
Feature_Selector.fit(X=dataX, y=datay, n_trials=150) # sample=False, train_or_test = 'test', normalize=True, verbose=True)
Feature_Selector.plot(which_features='all')