---
# [Tabular Playground Series - Jan 2022][1]
---

**Comments**: Thanks to the authors of these great earlier notebooks:

[[TPS JAN 22] Base XGB & LGB][2]

[TPS 01 2022 CatBoost w/ Optuna & seed averaging][3]


---
[1]: https://www.kaggle.com/c/tabular-playground-series-jan-2022
[2]: https://www.kaggle.com/ranjeetshrivastav/tps-jan-22-base-xgb-lgb
[3]: https://www.kaggle.com/adamwurdits/tps-01-2022-catboost-seed-averaging?scriptVersionId=84848139

# 0. Settings

In [None]:
# Import dependencies 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline

import os
import pathlib
import gc
import sys
import re
import math 
import random
import time 
from tqdm import tqdm 
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold 
from sklearn.model_selection import StratifiedKFold 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

import xgboost as xgb
import optuna

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers.experimental import preprocessing

import transformers 
import datasets 

print('import done!')

In [None]:
# global config
# Empty placeholder dict; nothing in the visible notebook reads or writes keys on it.
config = {}

# Shorthand for TensorFlow's tf.data AUTOTUNE constant; not referenced again in the visible notebook.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# For reproducible results
def seed_all(s):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random``, NumPy and TensorFlow with the same value,
    then sets the ``TF_CUDNN_DETERMINISTIC`` and ``PYTHONHASHSEED``
    environment variables and prints a confirmation message.

    Args:
        s: Seed value handed to each seeding function.
    """
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(s)
    os.environ.update({
        'TF_CUDNN_DETERMINISTIC': '1',
        'PYTHONHASHSEED': str(s),
    })
    print('Seeds setted!')


# Single seed reused by the model cells further down the notebook.
global_seed = 42
seed_all(global_seed)

# 1. Data Preprocessing

## 1.1 Data Check

In [None]:
# Locations of the competition CSV files.
data_config = {
    'train_csv_path': '../input/tabular-playground-series-jan-2022/train.csv',
    'test_csv_path': '../input/tabular-playground-series-jan-2022/test.csv',
    'sample_submission_path': '../input/tabular-playground-series-jan-2022/sample_submission.csv',
}

# Read the three frames in one pass: train, test, then the sample submission.
train_df, test_df, submission_df = (
    pd.read_csv(data_config[path_key])
    for path_key in ('train_csv_path', 'test_csv_path', 'sample_submission_path')
)

print(train_df.shape, test_df.shape, submission_df.shape)
train_df.head()

In [None]:
# Row count followed by a blank line, then the per-column dtypes as the cell output.
print(train_df.shape[0], end='\n\n')
train_df.dtypes

In [None]:
def unique_category(df, column):
    """Print the number of distinct values in ``df[column]`` and the values themselves.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to summarise.

    Returns:
        None. The summary is written to stdout, followed by a blank line.
    """
    values = df[column]
    summary_lines = (
        f'unique_category_number: {values.nunique()}',
        f'cagetories: {values.unique()}',
        '',
    )
    for line in summary_lines:
        print(line)

# Summarise each categorical column of the training data in turn.
for categorical_column in ('country', 'store', 'product'):
    unique_category(train_df, categorical_column)

In [None]:
# Number of missing values per column.
train_df.isna().sum()

## 1.2 Feature Engineering

In [None]:
def date_features(df):
    """Add calendar columns derived from ``df['date']``, modifying ``df`` in place.

    The ``date`` column is converted with ``pd.to_datetime`` and four columns
    are added from it: ``year``, ``month``, ``day`` and ``dayofweek``.

    Args:
        df: DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame object that was passed in.
    """
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates
    # Each name is both the new column and the ``.dt`` accessor attribute.
    for part in ('year', 'month', 'day', 'dayofweek'):
        df[part] = getattr(parsed_dates.dt, part)
    return df

# Derive the calendar columns, then discard the raw date column from both frames.
train_df = date_features(train_df).drop(columns=['date'])
test_df = date_features(test_df).drop(columns=['date'])

train_df.head()

In [None]:
# Separate the target from the predictors; copies keep train_df/test_df untouched.
y_train_df = train_df['num_sold'].copy()
X_train_df = train_df.drop(columns=['num_sold']).copy()

X_test_df = test_df.copy()

X_train_df.shape, y_train_df.shape, X_test_df.shape

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns that are one-hot encoded; every other column is passed through unchanged.
categorical_features = ['country', 'product', 'store', 'dayofweek', 'month', 'day']
ct = ColumnTransformer([('one_hot', OneHotEncoder(), categorical_features)], remainder="passthrough")
ct.fit(X_train_df)

encoded_X_train = ct.transform(X_train_df)
encoded_X_test = ct.transform(X_test_df)
print(encoded_X_train.shape)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out` and fall back only on older releases.
one_hot_encoder = ct.transformers_[0][1]
if hasattr(one_hot_encoder, 'get_feature_names_out'):
    feature_columns = one_hot_encoder.get_feature_names_out(categorical_features)
else:
    feature_columns = one_hot_encoder.get_feature_names(categorical_features)
print(feature_columns)

# The passthrough columns come after the one-hot block, in their original order.
columns = [column for column in X_train_df.columns if column not in categorical_features]
columns = list(feature_columns) + columns

# NOTE(review): `.toarray()` assumes the transformer returned a sparse matrix — confirm
# if the feature set changes enough to push the output above `sparse_threshold`.
encoded_X_train_df = pd.DataFrame(encoded_X_train.toarray(), columns=columns)
encoded_X_test_df = pd.DataFrame(encoded_X_test.toarray(), columns=columns)

# Both columns are dropped from the model inputs.
encoded_X_train_df = encoded_X_train_df.drop(['row_id', 'year'], axis=1)
encoded_X_test_df = encoded_X_test_df.drop(['row_id', 'year'], axis=1)
encoded_X_train_df.head()

# 2. Model Training

## 2.1 TimeSeriesSplit

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Preview the train/validation sizes produced by a 3-way time-series split.
tscv = TimeSeriesSplit(n_splits=3)

for fold, (train_index, test_index) in enumerate(tscv.split(encoded_X_train_df)):
    X_train = encoded_X_train_df.iloc[train_index]
    X_valid = encoded_X_train_df.iloc[test_index]
    y_train = y_train_df.iloc[train_index]
    y_valid = y_train_df.iloc[test_index]

    print(*(part.shape for part in (X_train, y_train, X_valid, y_valid)))

## 2.2 Model Construction

In [None]:
# Import the metric explicitly: `import sklearn` alone does not guarantee that the
# `sklearn.metrics` submodule is loaded.
from sklearn.metrics import mean_squared_error

# Baseline XGBoost model scored with a 3-way time-series split.
tscv = TimeSeriesSplit(n_splits=3)
cross_rmse = []

for fold, (train_index, test_index) in enumerate(tscv.split(encoded_X_train_df)):
    X_train, X_valid = encoded_X_train_df.iloc[train_index], encoded_X_train_df.iloc[test_index]
    y_train, y_valid = y_train_df.iloc[train_index], y_train_df.iloc[test_index]

    print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

    # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
    reg_xgb = xgb.XGBRegressor(objective='reg:squarederror', seed=global_seed)
    # NOTE(review): the validation fold drives early stopping and is also the fold
    # scored below, so the reported RMSE may be optimistic.
    # NOTE(review): newer xgboost releases expect `early_stopping_rounds` and
    # `eval_metric` on the constructor instead of `fit` — confirm the installed version.
    reg_xgb.fit(X_train, y_train, verbose=False, early_stopping_rounds=10, eval_metric='rmse', eval_set=[(X_valid, y_valid)])

    y_pred = reg_xgb.predict(X_valid)
    score = np.sqrt(mean_squared_error(y_valid, y_pred))
    cross_rmse.append(score)

print(f"CROSS_RMSE {np.mean(cross_rmse)}")

## 2.3 Hyperparameter Tuning

In [None]:
def objective(trial, X_df, y_df):
    """Optuna objective: cross-validate an XGBoost regressor with sampled hyperparameters.

    Hyperparameters are drawn from ``trial``, a 100-tree ``XGBRegressor`` is
    evaluated with ``cross_validate`` on the module-level ``tscv`` splitter,
    and the mean squared error of the final split is returned.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_df: Feature matrix passed to ``cross_validate``.
        y_df: Target values passed to ``cross_validate``.

    Returns:
        Mean squared error on the last split (lower is better).
    """
    # `suggest_float` replaces the deprecated `suggest_uniform` / `suggest_loguniform`.
    params = {
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'eta': trial.suggest_float('eta', 0.05, 0.5),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'min_child_weight': trial.suggest_int('min_child_weight', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10, log=True),
        #'subsample': trial.suggest_uniform('subsample', 0.5, 1),
        #'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0, 1),
    }

    # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
    # `random_state` alone seeds the model; the duplicate `seed` kwarg was redundant.
    estimator = xgb.XGBRegressor(objective='reg:squarederror',
                                 n_estimators=100,
                                 verbosity=0,
                                 n_jobs=-1,
                                 random_state=global_seed,
                                 **params)

    scores = cross_validate(estimator, X_df, y_df, cv=tscv,
                            scoring='neg_mean_squared_error',
                            return_train_score=True)
    print(f'scores: {scores}')

    # Only the last split is scored, as before; indexing with -1 removes the
    # dependence on n_splits == 3. The scorer returns negated MSE, so flip the sign.
    # NOTE(review): confirm whether the mean over all splits was intended instead.
    score = float(-scores['test_score'][-1])
    return score

In [None]:
# Seed the sampler so the hyperparameter search is repeatable, in line with the
# notebook's seed_all() setup. TPE is Optuna's default single-objective sampler,
# and 'minimize' is the default direction, stated here explicitly because the
# objective returns an error value.
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=global_seed))
study.optimize(lambda trial: objective(trial, encoded_X_train_df, y_train_df), n_trials=100)

print(study.best_params)    
print(study.best_value)

In [None]:
# Refit on the full training set with the best hyperparameters found by the study.
# 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
best_reg = xgb.XGBRegressor(objective='reg:squarederror', seed=global_seed, **study.best_params)
# No eval_set is supplied, so the former `eval_metric='rmse'` argument had nothing
# to evaluate; it is dropped because newer xgboost releases reject it in `fit`.
best_reg.fit(encoded_X_train_df, y_train_df, verbose=True)

best_reg.get_params()

# 3. Prediction and Submission

In [None]:
# Predict on the encoded test features with the refitted model; the shape is shown as the cell output.
pred = best_reg.predict(encoded_X_test_df)
pred.shape

In [None]:
# Write the predictions into the sample-submission frame and save it without the index column.
# NOTE(review): predictions are written as returned by the model (no rounding or clipping) — confirm this is intended.
submission_df['num_sold'] = pred 
submission_df.to_csv("submission.csv", index=False)
submission_df.head()