In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.metrics import get_scorer, SCORERS

# Single seed shared by the row sampling, the K-fold shuffling and XGBoost.
SEED = 123

# XGBoost compared with mean baseline
In this notebook I show how poor the predictions for this dataset are, even when the score is among the best in this competition.

In [None]:
# Locate the competition files and read the labelled / unlabelled tables.
DBPATH = Path('/kaggle/input/tabular-playground-series-aug-2021/')
train, sub_test = (pd.read_csv(DBPATH / csv_name) for csv_name in ('train.csv', 'test.csv'))

# Column names: one hundred feature columns (f0 ... f99) and the single target column.
FEATURES = list(map(lambda i: f"f{i}", range(100)))
TARGETS = ['loss']

# Feature matrix and target frame (the target stays a one-column DataFrame).
X = train[FEATURES]
y = train[TARGETS]

In [None]:
# Train XGBoost on 10% of the data (for speed) with the hyperparameters found by an
# Optuna search (see kaggle_projects/tps-aug-2021/1_xgb_optuna), using 5-fold CV.
# Each fold's model predicts the held-out test split and the submission set; the
# per-fold predictions are averaged into `test_preds` and `sub_preds`.

train_frac_samples = 0.1 # Fraction of samples used for training. TO IMPROVE >> Increase train_frac_samples up to 0.9
X_train = X.sample(frac=train_frac_samples, random_state=SEED)
# X_train.index holds row *labels*, so the matching targets are selected with .loc
y_train = y.loc[X_train.index, :]
# Use all the rest for testing
X_test = X.drop(X_train.index)
y_test = y.drop(X_train.index)

test_preds = np.zeros(X_test.shape[0]) # Fold-averaged predictions on my own held-out split
sub_preds = np.zeros(sub_test.shape[0]) # Fold-averaged predictions for the submission

kf = KFold(n_splits=5, random_state=SEED, shuffle=True) # TO IMPROVE >> n_splits up to 10
rmse = []

# Hyperparameters are identical for every fold, so build the dict once.
# Passed as a dict because 'lambda' is a reserved word in Python.
xgb_params = {
    'tree_method': 'auto',
    'n_estimators': 300,
    'lambda': 0.019333692109917443,
    'alpha': 8.194847496009798,
    'colsample_bytree': 0.5471185761751851,
    'subsample': 0.584376759562932,
    'learning_rate': 0.031035459267598212,
    'max_depth': 17,
    'min_child_weight': 217,
    'seed': SEED,
}

for fold, (trn_idx, val_idx) in enumerate(kf.split(X_train, y_train), start=1):
    # kf.split yields positions *within X_train*: index X_train / y_train with them.
    # Indexing the full X / y instead would pick the first len(X_train) rows of the
    # whole dataset, ignoring the random sample and overlapping with X_test.
    X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]
    y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]

    model = xgb.XGBRegressor(**xgb_params)
    # NOTE(review): newer XGBoost releases appear to expect early_stopping_rounds on
    # the constructor rather than on fit() -- confirm against the installed version.
    model.fit(X_trn, y_trn,
              early_stopping_rounds=20, eval_set=[(X_val, y_val)],
              verbose=0)

    # Accumulate 1/n_splits of each fold's prediction -> mean over folds at the end
    test_preds += model.predict(X_test) / kf.n_splits
    sub_preds += model.predict(sub_test[FEATURES]) / kf.n_splits

    # sqrt(MSE) is the RMSE; equivalent to squared=False without relying on that keyword
    rmse.append(np.sqrt(mean_squared_error(y_val, model.predict(X_val))))
    print(f'Kfold #{fold}, RMSE (validation): {rmse[-1]}')

In [None]:
# Score the fold-averaged predictions (test_preds) on the held-out split rather than
# only the model from the last fold, so the number matches how sub_preds is built.
# sqrt(MSE) is the RMSE; equivalent to squared=False without relying on that keyword.
rmse_test = np.sqrt(mean_squared_error(y_test, test_preds))
print(f"RMSE (test): {rmse_test}")

Considering that the best RMSE score as of today (24 Aug 2021) is 7.846, my result does not sound bad!

...but how good are these predictions for a real application?

In [None]:
# Write the submission CSV: one row per test id with the fold-averaged prediction.
# File name fixed from 'subssion.csv' to the conventional 'submission.csv'.
pd.DataFrame({'id': sub_test['id'], 'loss': sub_preds}).to_csv('submission.csv', index=False)

In [None]:
# Parity plot: fold-averaged predictions (test_preds) against the held-out targets.
# Using test_preds instead of the last fold's model shows the same ensemble that is
# written to the submission.
fig, ax = plt.subplots(figsize=[5, 5], dpi=100)
ax.scatter(test_preds, y_test, c='b', s=0.1, alpha=0.01)
ax.plot([0, 40], [0, 40], c='k', ls='--')  # y = x reference: perfect predictions lie on it
ax.set_xlabel('Predictions')
ax.set_ylabel('Targets')
ax.grid()
plt.show()

In [None]:
# Side-by-side histograms on shared axes: distribution of the held-out targets vs.
# distribution of the fold-averaged predictions (test_preds), not just the last fold.
fig, axs = plt.subplots(ncols=2, sharey=True, sharex=True, figsize=[10,4])
axs[0].hist(y_test, bins=100)
axs[0].set_title('Targets')
axs[1].hist(test_preds, bins=100)
axs[1].set_title('Predictions')
plt.show()

One can notice that almost all the predictions are between 5 and 10, while the targets span from 0 to 40.
It does not matter much to me that we are using regression and therefore get continuous values: one can round them later to get discrete integers.

So how does a dummy model, which simply returns the mean or the median value of the training set, perform?

In [None]:
# Fit constant-prediction baselines (mean and median of the training target) and
# evaluate each of them on the held-out split with every scorer registered in sklearn.
dummy_dict = {'mean': {}, 'median': {}}
for k in dummy_dict:
    baseline = DummyRegressor(strategy=k)
    baseline.fit(X_train, y_train)
    dummy_dict[k]['model'] = baseline
    # A dummy model ignores its input, so one placeholder sample returns the constant
    dummy_dict[k]['value'] = baseline.predict([0])[0] # get the mean/median value
    for scorer_str in SCORERS.keys(): # get all scorers
        scorer = get_scorer(scorer_str)
        try:
            dummy_dict[k][scorer_str] = scorer(estimator=baseline, X=X_test, y_true=y_test)
        except Exception:
            # The scorer does not make sense for this problem (e.g. it needs class
            # probabilities). `except Exception` keeps Ctrl-C / SystemExit working,
            # which a bare `except:` would swallow.
            dummy_dict[k][scorer_str] = 'SKIP'
pd.DataFrame(dummy_dict)

We can note how, just using the mean value of the training set target (6.81184), one can achieve an RMSE score of 7.944.

Considering that the best score as of today is 7.846, and assuming that the score on this test partition is transferable to the hidden test set of the competition, the whole challenge is to achieve just a 0.1 RMSE improvement!