# Kaggle Tabular Competition Feb 2021 - Baseline

In this notebook, I explore the [Kaggle Tabular Playground Series - Feb 2021](https://www.kaggle.com/c/tabular-playground-series-feb-2021/overview) competition and build a baseline model for it.

## Data exploration

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
import optuna

In [None]:
def read_data(data_dir):
    """Load the three competition CSV files stored in ``data_dir``.

    Every file is read with its ``id`` column as the index.

    Parameters
    ----------
    data_dir : str
        Directory containing ``train.csv``, ``test.csv`` and
        ``sample_submission.csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, sample_submission)``, in that order.
    """
    def _load(file_name):
        # All three files are keyed by the same `id` column.
        return pd.read_csv(os.path.join(data_dir, file_name), index_col='id')

    return _load('train.csv'), _load('test.csv'), _load('sample_submission.csv')

In [None]:
# Directory holding the competition CSVs, relative to the notebook's working directory.
DAT_DIR = '../input/tabular-playground-series-feb-2021'
# Load the train, test and sample-submission frames, each indexed by `id`.
train, test, sample_submission = read_data(DAT_DIR)

In [None]:
# Summary statistics of the training set.
train.describe()

In [None]:
# Summary statistics of the test set, for comparison with the training set.
test.describe()

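Judging by the summary statistics, the train and test data look quite similar. Next, let's build a baseline model with XGBoost. Before that, we separate the target from the features, label-encode the categorical columns, and split the training data into training and validation sets.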

In [None]:
# Separate the regression target from the feature columns.
y_train = train['target']
X_train = train.drop(columns=['target'])

In [None]:
# Integer-encode every object-typed feature column, applying one shared
# mapping per column to both the train and the test frame.
for c in X_train.select_dtypes(['object']):
    enc = LabelEncoder()
    # Fit on the categories of BOTH frames: an encoder fitted on the training
    # column alone raises ValueError in `transform` as soon as the test column
    # contains a category that never occurs in training. When the test
    # categories are a subset of the training ones, the learned classes (and
    # therefore the integer codes) are the same as with a train-only fit.
    enc.fit(pd.concat([X_train[c], test[c]]))
    X_train[c] = enc.transform(X_train[c])
    test[c] = enc.transform(test[c])

In [None]:
# Fraction of the training rows held out for validation.
valid_pct = 0.3

# A fixed random_state keeps the split identical from run to run.
X_tr, X_va, y_tr, y_va = train_test_split(X_train, y_train, test_size=valid_pct, random_state=123)

In [None]:
# Sanity check: report the sizes of the training and validation partitions.
print(f'X_tr.shape = {X_tr.shape}, X_va.shape = {X_va.shape}, y_tr.shape = {y_tr.shape}, y_va.shape={y_va.shape}')

## Baseline Model

In [None]:
def score_dataset(X, y, model=None, cv_folds=2):
    """Return the cross-validated RMSE of ``model`` on ``(X, y)``.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix, passed straight to ``cross_val_score``.
    y : array-like or Series
        Regression target, passed straight to ``cross_val_score``.
    model : estimator, optional
        Estimator to evaluate. ``None`` (the default) evaluates a freshly
        constructed ``XGBRegressor()`` with default hyperparameters.
    cv_folds : int, default 2
        Forwarded to ``cross_val_score`` as ``cv``.

    Returns
    -------
    float
        Square root of the mean of the per-fold mean squared errors.
    """
    if model is None:
        # Built per call rather than in the signature: a default written as
        # `model=XGBRegressor()` is constructed once, when the function is
        # defined, and that single instance is then shared by every call.
        model = XGBRegressor()
    # scikit-learn scorers follow "greater is better", so the MSE of each fold
    # comes back negated.
    neg_mse = cross_val_score(
        model, X, y, cv=cv_folds, scoring="neg_mean_squared_error",
    )
    return np.sqrt(-neg_mse.mean())

In [None]:
# Baseline: a default XGBRegressor, cross-validated on the full encoded training set.
score = score_dataset(X_train, y_train)
print(f'RMSE: = {score:.4f}')

Next, let us do some hyperparameter tuning to see if we can improve the results.

In [None]:
def objective(trial):
    """Optuna objective: score one sampled XGBoost configuration.

    Draws a value for each tuned hyperparameter from ``trial``, builds an
    ``XGBRegressor`` from them and returns the value ``score_dataset``
    computes for it on the training partition ``(X_tr, y_tr)``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        The score returned by ``score_dataset`` for the sampled model.
    """
    # Learning rate and both regularisation strengths are sampled on a log scale.
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 8000),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
    }
    candidate = XGBRegressor(**params)
    return score_dataset(X_tr, y_tr, candidate)

# Seed the sampler so that a Restart & Run All proposes the same sequence of
# trials; an unseeded sampler explores different hyperparameters on every run.
# TPESampler is the sampler Optuna uses when none is given, so only the
# seeding changes. The seed matches the random_state used for the
# train/validation split.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=123),
)
# Only a handful of trials: each one cross-validates a model with thousands of trees.
study.optimize(objective, n_trials=5)
xgb_params = study.best_params

In [None]:
# Hard-coded hyperparameters for the final model. This assignment replaces
# whatever `study.best_params` stored in `xgb_params`; the values are
# presumably the best trial of an earlier tuning run -- TODO confirm.
xgb_params = dict(
    max_depth=7,
    learning_rate=0.002368706913117573,
    n_estimators=3842,
    min_child_weight=4,
    colsample_bytree=0.6612496396706031,
    subsample=0.6060764549240347,
    reg_alpha=0.18899174723187226,
    reg_lambda=30.33470416661318,
)

In [None]:
# Cross-validated RMSE of the tuned configuration on the full training set.
# Scored through `score_dataset` (2 folds, like the baseline) rather than a
# second hand-written copy of the same computation, so the baseline and tuned
# numbers are guaranteed to be measured identically.
final_xgb = XGBRegressor(**xgb_params)
score = score_dataset(X_train, y_train, final_xgb, cv_folds=2)
print(f'RMSE: = {score:.4f}')

## Output

In [None]:
# Refit the chosen configuration on all of the training rows (nothing held out).
final_xgb = XGBRegressor(**xgb_params)
final_xgb.fit(X_train, y_train)

In [None]:
# Predict the target for the label-encoded test rows; predictions follow the row order of `test`.
y_test = final_xgb.predict(test)

In [None]:
# `y_test` follows the row order of `test`, so attach the test ids to the
# predictions first and only then arrange them in the order of the sample
# submission. Labelling the raw array directly with `sample_submission.index`
# silently pairs predictions with the wrong ids whenever the two files are
# ordered differently. With identical ordering the written file is unchanged.
# Assumes the test ids are unique (reindex raises on duplicate labels).
final_submission = (
    pd.Series(y_test, name='target', index=test.index)
    .reindex(sample_submission.index)
)
final_submission.to_csv('submission.csv')