# imports & variables

In [None]:
import numpy as np
import pandas as pd
import random 
import os 
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, TimeSeriesSplit

from xgboost import XGBRegressor
import optuna

# Input CSV locations and the path the final submission is written to.
TRAIN_PATH = "../input/tabular-playground-series-jan-2022/train.csv"
TEST_PATH = "../input/tabular-playground-series-jan-2022/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/tabular-playground-series-jan-2022/sample_submission.csv"
SUBMISSION_PATH = "submission.csv"

# Column names: ID and DATE are dropped before modelling, TARGET is the value being predicted.
ID = "row_id"
TARGET = "num_sold"
DATE = "date"

# Fraction of the training rows held out for validation inside the Optuna objective.
TEST_SIZE = 0.2

# Optuna settings: number of trials, early-stopping rounds handed to XGBRegressor.fit,
# and the optimisation direction of the study.
OPTUNA_TRIALS = 500
OPTUNA_ESR = 50
OPTUNA_DIRECTION = "minimize"

# NFOLD is the number of TimeSeriesSplit splits; TREE_METHOD and BOOSTER are forwarded to XGBRegressor.
NFOLD = 100
TREE_METHOD = 'gpu_hist'
BOOSTER = "gbtree"

# Single seed shared by every random number generator used in this notebook.
SEED = 2002


def seed_everything(seed=SEED):
    """Seed the `random` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to all three sources.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything()

# load & preprocess

In [None]:
def _add_date_features(df):
    """Parse ``df[DATE]`` to datetime and append calendar feature columns in place.

    Args:
        df: DataFrame containing the DATE column.

    Returns:
        The same DataFrame, for convenient chaining.
    """
    df[DATE] = pd.to_datetime(df[DATE])
    dates = df[DATE].dt
    df['year'] = dates.year
    df['month'] = dates.month
    df['day'] = dates.day
    df['day_of_year'] = dates.dayofyear
    # NOTE: despite its name this column holds the number of days in the month
    # (days_in_month); the name is kept so the feature set stays unchanged.
    df['day_of_month'] = dates.days_in_month
    df['day_of_week'] = dates.dayofweek
    # NOTE: `weekday` carries the same values as `dayofweek`; kept to preserve
    # the existing feature set.
    df['weekday'] = dates.weekday
    return df


# Apply exactly the same feature engineering to both frames.
train = _add_date_features(pd.read_csv(TRAIN_PATH))
test = _add_date_features(pd.read_csv(TEST_PATH))

# One-hot encode every object-typed (categorical) column found in train.
cat_cols = train.select_dtypes('object').columns.tolist()
train = pd.get_dummies(train, columns=cat_cols)
test = pd.get_dummies(test, columns=cat_cols)

# The two frames are encoded independently, so a category present in only one
# of them would yield mismatched columns. Force test onto train's feature
# columns (everything except the target): missing dummies become 0, dummies
# unseen in train are dropped, and the column order matches train.
test = test.reindex(columns=train.columns.drop(TARGET), fill_value=0)

# build model & OOF predict 

In [None]:
# Separate the target from the model inputs; the id and raw date columns are
# not used as features in either frame.
X_test = test.drop([ID, DATE], axis=1)
X = train.drop([ID, DATE, TARGET], axis=1)
y = train[TARGET]

# search best param
def smape(actual, predicted):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``|predicted - actual| / ((|actual| + |predicted|) / 2)``;
    the result is the mean of those terms multiplied by 100 (range 0-200).

    Args:
        actual: Array-like of true values (list, ndarray or Series).
        predicted: Array-like of predictions, broadcastable against `actual`.

    Returns:
        The SMAPE as a float. Terms where both values are 0 contribute 0
        instead of NaN, so one exact zero prediction cannot poison the mean.
    """
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    numerator = np.abs(predicted - actual)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2

    # 0/0 occurs only when actual == predicted == 0, i.e. a perfect prediction.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

def objective(trial, data=X, target=y):
    """Optuna objective: fit an XGBRegressor on a hold-out split and return its SMAPE.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        data: Feature matrix to split into train/validation parts.
        target: Target values aligned with `data`.

    Returns:
        SMAPE (percent) of the fitted model on the validation part; lower is better.
    """
    # shuffle=False keeps row order: the first (1 - TEST_SIZE) share of rows is
    # used for training and the remainder for validation (presumably the most
    # recent dates, assuming the rows are sorted by date - verify).
    # Local names avoid shadowing the module-level X_test.
    X_train, X_valid, y_train, y_valid = train_test_split(
        data, target, test_size=TEST_SIZE, random_state=SEED, shuffle=False
    )

    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'eta': trial.suggest_float('eta', 1e-5, 0.1),
        # suggest_float(step=...) / suggest_float(log=True) replace the
        # deprecated suggest_discrete_uniform / suggest_loguniform.
        'subsample': trial.suggest_float('subsample', 0.1, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.9, step=0.1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 1e5, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 1e5, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 1e5, log=True),
        'gamma': trial.suggest_float('gamma', 1e-5, 1e5, log=True),
        'predictor': "gpu_predictor",
        # Early stopping monitors MAPE; the value returned to Optuna is SMAPE.
        'eval_metric': 'mape'
    }

    model = XGBRegressor(**params,
                         tree_method=TREE_METHOD,
                         booster=BOOSTER,
                         random_state=SEED)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=OPTUNA_ESR, verbose=False)
    valid_preds = model.predict(X_valid)

    return smape(y_valid, valid_preds)

# Run the hyper-parameter search and report what it found.
study = optuna.create_study(direction=OPTUNA_DIRECTION)
study.optimize(objective, n_trials=OPTUNA_TRIALS)

finished_trials = len(study.trials)
print('Number of finished trials:', finished_trials)
print('Best trial:', study.best_trial.params)

# Best sampled hyper-parameters, reused when fitting the fold models.
params = study.best_params
print(params)

# Time-series cross-validation: fit one model per split with the tuned
# hyper-parameters, score it on that split's validation rows and average the
# test-set predictions of all fold models.
timeSeriesSplit = TimeSeriesSplit(NFOLD)

preds = np.zeros(len(X_test))
scores = []

for fold, (train_idx, valid_idx) in enumerate(timeSeriesSplit.split(X)):

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    # random_state matches the setting used in the Optuna objective so that
    # row/column subsampling is reproducible from run to run.
    model = XGBRegressor(**params,
                         tree_method=TREE_METHOD,
                         booster=BOOSTER,
                         random_state=SEED)
    model.fit(X_train, y_train, verbose=False)

    preds_valid = model.predict(X_valid)
    score = smape(y_valid, preds_valid)
    scores.append(score)

    print(f"Fold: {fold + 1} Score: {score}")
    print("")

    # Each fold contributes an equal share to the final test prediction.
    preds += model.predict(X_test) / timeSeriesSplit.n_splits

print(f"\nOverall Validation Score: {np.mean(scores)}")

# submit

In [None]:
# Fill the sample submission's target column with the averaged predictions,
# write it to disk and preview the first rows.
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH).assign(**{TARGET: preds})
sub.to_csv(SUBMISSION_PATH, index=False)
sub.head()