---
# [Tabular Playground Series - Mar 2022][1]
---
**Comments**: Thanks to the authors of these great earlier notebooks:

1. [[TPS JAN 22] Base XGB & LGB][2]
2. [XGBoost Stepwise Tuning Using Optuna][3]

---
[1]: https://www.kaggle.com/c/tabular-playground-series-mar-2022
[2]: https://www.kaggle.com/ranjeetshrivastav/tps-jan-22-base-xgb-lgb
[3]: https://www.kaggle.com/code/para24/xgboost-stepwise-tuning-using-optuna/notebook

# 0. Settings

In [None]:
# Import dependencies 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline

import os
import pathlib
import gc
import sys
import re
import math 
import random
import time 
from tqdm import tqdm 
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import plot_importance

import optuna

print('import done!')

In [None]:
# global config
config = {}

# For reproducible results    
def seed_all(s):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `s`."""
    os.environ['PYTHONHASHSEED'] = str(s)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(s)
    print('Seeds setted!')
global_seed = 42
seed_all(global_seed)

# 1. Data Preprocessing

## 1.1 Data Check

In [None]:
# Relative paths of the competition CSV files (train, test, sample submission).
data_config = {'train_csv_path': '../input/tabular-playground-series-mar-2022/train.csv',
              'test_csv_path': '../input/tabular-playground-series-mar-2022/test.csv',
              'sample_submission_path': '../input/tabular-playground-series-mar-2022/sample_submission.csv',
              }

# Load the three files into DataFrames.
train_df = pd.read_csv(data_config['train_csv_path'])
test_df = pd.read_csv(data_config['test_csv_path'])
submission_df = pd.read_csv(data_config['sample_submission_path'])

# Quick sanity check: (rows, columns) of each frame, then a preview of the training data.
print(train_df.shape, test_df.shape, submission_df.shape)
train_df.head()

In [None]:
train_df.dtypes

In [None]:
def print_unique_category(df, column):
    """Print how many distinct values ``df[column]`` holds, then the values.

    Args:
        df: DataFrame to inspect.
        column: Name of the column whose distinct values are reported.
    """
    values = df[column]
    print(f'unique_category_number: {values.nunique()}')
    print(f'categories: {values.unique()}')
    print()

print_unique_category(train_df, 'direction')
print_unique_category(train_df, 'x')
print_unique_category(train_df, 'y')

In [None]:
# Null Value Check
def null_val_check(df):
    """Verify that no column of ``df`` contains null values.

    Prints ``'No Null values.'`` when the check passes.

    Args:
        df: DataFrame to check.

    Raises:
        AssertionError: If any column holds nulls. The message lists every
            offending column together with its null count.
    """
    null_counts = df.isnull().sum()
    offenders = null_counts[null_counts > 0]
    if len(offenders) > 0:
        # Raised explicitly instead of via `assert` so the check is not
        # stripped when Python runs with -O.
        raise AssertionError(
            ' '.join(f'{key} has {count} null values.'
                     for key, count in offenders.items())
        )
    print('No Null values.')
    
null_val_check(train_df)
null_val_check(test_df)

In [None]:
train_df.tail()

In [None]:
print_unique_category(test_df, 'direction')
print_unique_category(test_df, 'x')
print_unique_category(test_df, 'y')
test_df.head()

In [None]:
test_df.tail()

In [None]:
submission_df.head()

## 1.2 Feature Engineering

In [None]:
train_df = train_df.drop(['row_id'], axis=1)
test_df = test_df.drop(['row_id'], axis=1)

train_df.head()

In [None]:
def make_date_features(df, column='time', drop_column=True):
    """Return a copy of ``df`` extended with calendar features.

    The columns ``year``, ``month``, ``day``, ``dayofweek``, ``hour`` and
    ``minute`` are derived from ``df[column]`` (parsed with
    ``pd.to_datetime``) and appended to the result. The input frame is left
    untouched.

    Args:
        df: Source DataFrame containing the timestamp column.
        column: Name of the timestamp column to parse.
        drop_column: If True, the timestamp column is removed from the result.

    Returns:
        A new DataFrame with the added feature columns.
    """
    # Parse once into a local accessor instead of a temporary 'datetime'
    # column, so nothing is written to the caller's frame and an existing
    # column of that name is not clobbered.
    stamps = pd.to_datetime(df[column]).dt
    out = df.copy()
    for part in ('year', 'month', 'day', 'dayofweek', 'hour', 'minute'):
        out[part] = getattr(stamps, part)
    if drop_column:
        out = out.drop(columns=[column])
    return out

train_df = make_date_features(train_df)
train_df = train_df.drop(['year'], axis=1) # Because the 'year' column only contains '1991'.

test_df = make_date_features(test_df)
test_df = test_df.drop(['year'], axis=1)

train_df.head()

In [None]:
# Time-based split: rows from September form the validation set, rows from
# July and August form the training set; all earlier months are discarded.
valid_df = train_df.query('month == 9').reset_index(drop=True)
train_df = train_df.query('month == 7 or month == 8').reset_index(drop=True)
# Training and validation rows together, used later to refit on all kept data.
all_df = pd.concat([train_df, valid_df])

print(f'all_df length: {len(all_df)}')
print(f'train_df length: {len(train_df)}')
print(f'valid_df length: {len(valid_df)}')
valid_df.head()

In [None]:
# Separate the target column ('congestion') from the features for each split.
y_all = all_df['congestion']
X_all = all_df.drop(['congestion'], axis=1)

y_train = train_df['congestion'] 
X_train = train_df.drop(['congestion'], axis=1)

y_valid = valid_df['congestion'] 
X_valid = valid_df.drop(['congestion'], axis=1)

# Shapes of every feature/target pair, plus the test features, as a sanity check.
print(X_all.shape, y_all.shape)
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(test_df.shape)

In [None]:
# One-hot encode the categorical columns; every other column is passed through.
categorical_features = ['direction', 'dayofweek']
# sparse_threshold=0 makes the transformer always return a dense array, so the
# pd.DataFrame(...) construction below does not depend on the data's density.
ct = ColumnTransformer([('one_hot', OneHotEncoder(), categorical_features)],
                       remainder="passthrough", sparse_threshold=0)
ct.fit(X_train)

encoded_X_train = ct.transform(X_train)
print(encoded_X_train.shape)

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
one_hot_encoder = ct.named_transformers_['one_hot']
if hasattr(one_hot_encoder, 'get_feature_names_out'):
    feature_columns = one_hot_encoder.get_feature_names_out(categorical_features)
else:
    feature_columns = one_hot_encoder.get_feature_names(categorical_features)
print(feature_columns)

# The transformer emits the one-hot columns first, followed by the
# passed-through columns in their original order.
passthrough_columns = [col for col in X_train.columns if col not in categorical_features]
columns = list(feature_columns) + passthrough_columns

encoded_X_train_df = pd.DataFrame(encoded_X_train, columns=columns)
encoded_X_train_df.head()

In [None]:
# Apply the already-fitted transformer `ct` to the remaining feature sets and
# wrap each result in a DataFrame that uses the same column names.
encoded_X_all = ct.transform(X_all)
encoded_X_all_df = pd.DataFrame(encoded_X_all, columns=columns)

encoded_X_valid = ct.transform(X_valid)
encoded_X_valid_df = pd.DataFrame(encoded_X_valid, columns=columns)

encoded_X_test = ct.transform(test_df)
encoded_X_test_df = pd.DataFrame(encoded_X_test, columns=columns)

print(encoded_X_all_df.shape, encoded_X_valid_df.shape, encoded_X_test_df.shape)

# 2. Model Training

## 2.1 XGBRegressor

In [None]:
# Baseline model with default hyperparameters, early-stopped on validation MAE.
# 'reg:squarederror' is the current name of the squared-error objective; the
# old alias 'reg:linear' is deprecated.
regressor = XGBRegressor(objective='reg:squarederror', seed=global_seed)
regressor.fit(encoded_X_train_df, y_train, verbose=True, early_stopping_rounds=10,
            eval_metric='mae', eval_set=[(encoded_X_valid_df, y_valid)])

In [None]:
fig, ax = plt.subplots(1,1,figsize=(20,12))
plot_importance(regressor,ax=ax, xlabel=None)
plt.title('XGB Feature importance')
plt.show()

## 2.2 [Stepwise Tuning][1] of Hyperparameters with Optuna

[1]: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258

In [None]:
def objective(trial, X_train, y_train, X_valid, y_valid, group, params=None):
    """Optuna objective: train an XGBRegressor and return its validation MAE.

    Only the hyperparameters belonging to ``group`` are sampled from ``trial``;
    everything else comes from ``params`` or from the fixed defaults below.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Training features and target.
        X_valid, y_valid: Validation data, used for early stopping and scoring.
        group: Which hyperparameter group to tune: '1' (tree shape),
            '2' (regularisation), '3' (sampling) or '4' (learning rate and
            number of boosting rounds). Any other value samples nothing.
        params: Previously chosen hyperparameters to hold fixed. The mapping
            is copied and never modified.

    Returns:
        Mean absolute error of the fitted model on the validation set.
    """
    # Imported here because the file only imports the `sklearn` package and
    # its `compose` / `preprocessing` submodules explicitly.
    from sklearn.metrics import mean_absolute_error

    # Work on a private copy: the caller's dict is shared between trials
    # (which may run concurrently) and must not be mutated.
    params = dict(params) if params else {}

    ## Initial Learning Parameters
    params['eta'] = 0.3
    params['num_boost_round'] = 1000

    if group == '1':
        params['max_depth'] = trial.suggest_int('max_depth', 2, 10)
        params['min_child_weight'] = trial.suggest_int('min_child_weight', 1, 10)
    elif group == '2':
        params['gamma'] = trial.suggest_float('gamma', 0.5, 1)
        params['reg_lambda'] = trial.suggest_float('reg_lambda', 1e-1, 10, log=True)
        params['reg_alpha'] = trial.suggest_float('reg_alpha', 1e-3, 10, log=True)
    elif group == '3':
        params['subsample'] = trial.suggest_float('subsample', 0.5, 1)
        params['colsample_bytree'] = trial.suggest_float('colsample_bytree', 0, 1)
    elif group == '4':
        params['eta'] = trial.suggest_float('eta', 0.05, 0.5, step=0.01)
        params['num_boost_round'] = trial.suggest_int('num_boost_round', 100, 1000)

    # The scikit-learn wrapper takes the number of boosting rounds as
    # `n_estimators`; a `num_boost_round` keyword would not set it. The Optuna
    # parameter keeps its original name, only the constructor argument changes.
    params['n_estimators'] = params.pop('num_boost_round')

    estimator = xgb.XGBRegressor(objective='reg:squarederror',
                                 verbosity=0,
                                 n_jobs=-1,
                                 random_state=global_seed,
                                 seed=global_seed,
                                 tree_method='gpu_hist',
                                 predictor='gpu_predictor',
                                 gpu_id=0,
                                 **params)

    estimator.fit(X_train, y_train, verbose=False, early_stopping_rounds=20,
            eval_metric='mae', eval_set=[(X_valid, y_valid)])

    preds = estimator.predict(X_valid)
    score = mean_absolute_error(y_valid, preds)
    print(f'mae_score: {score}')
    return score

In [None]:
def execute_optimization(group, n_trials, direction='minimize', params=None):
    """Run one Optuna study that tunes a single hyperparameter group.

    Args:
        group: Identifier of the hyperparameter group, forwarded to
            ``objective``.
        n_trials: Number of trials to run.
        direction: Optimisation direction passed to ``optuna.create_study``.
        params: Hyperparameters fixed by earlier steps. The mapping is
            copied and never modified.

    Returns:
        The best parameters found by the study (``study.best_params``).
    """
    base_params = dict(params) if params else {}
    study = optuna.create_study(direction=direction)
    # Trials run concurrently (n_jobs=-1), so each one receives its own copy
    # of the fixed parameters instead of sharing one mutable dict.
    study.optimize(lambda trial: objective(trial, encoded_X_train_df, y_train,
                                           encoded_X_valid_df, y_valid,
                                           group=group, params=dict(base_params)),
                   n_trials=n_trials, n_jobs=-1)

    print("BEST CV SCORE", study.best_value)
    print('------------------------------------------------')
    print(f"OPTIMAL GROUP - {group} PARAMS: ", study.best_params)
    print('------------------------------------------------')

    return study.best_params

In [None]:
def stepwise_optimization(n_trials=15):
    """Tune the hyperparameter groups '1' to '4' one after another.

    Each group is optimised with ``n_trials`` trials while the best values
    found for the earlier groups are passed along, and its best parameters
    are merged into the running result.

    Returns:
        Dict with the accumulated best parameters of all groups.
    """
    tuned = {}
    group_order = ('1', '2', '3', '4')

    for group in group_order:
        print(f"=========================== Optimizing Group - {group} ============================")
        best_of_group = execute_optimization(group=group, n_trials=n_trials,
                                             direction='minimize', params=tuned)
        tuned.update(best_of_group)
        print(f"PARAMS after optimizing GROUP - {group}: ", tuned)
        # Two empty lines separate the groups in the log.
        print('\n')

    print("=========================== FINAL OPTIMAL PARAMETERS ============================")
    print(tuned)

    return tuned

In [None]:
final_params = stepwise_optimization()

In [None]:
# The tuned parameters store the number of boosting rounds under the key
# 'num_boost_round', but the scikit-learn wrapper expects `n_estimators`;
# rename it so the tuned value is actually applied to the final model.
model_params = dict(final_params)
if 'num_boost_round' in model_params:
    model_params['n_estimators'] = model_params.pop('num_boost_round')

# 'reg:squarederror' replaces the deprecated objective alias 'reg:linear'.
regressor = xgb.XGBRegressor(objective='reg:squarederror',
                             verbosity=0,
                             n_jobs=-1,
                             random_state=global_seed,
                             seed=global_seed,
                             tree_method='gpu_hist',
                             predictor='gpu_predictor',
                             gpu_id=0,
                             **model_params)

# Alternative: fit on the training split only, early-stopped on validation MAE.
#regressor.fit(encoded_X_train_df, y_train, verbose=False, early_stopping_rounds=20, 
#              eval_metric='mae', eval_set=[(encoded_X_valid_df, y_valid)])

# use all data
regressor.fit(encoded_X_all_df, y_all, verbose=False)

# 3. Prediction and Submission

In [None]:
# Predict on the encoded test features, store the predictions in the
# 'congestion' column of the sample submission and write it to submission.csv.
pred = regressor.predict(encoded_X_test_df)
submission_df['congestion'] = pred
submission_df.to_csv('submission.csv', index=False)
submission_df.head()