In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from xgboost import XGBRegressor
import optuna
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition files in a fixed order: training rows, test rows and
# the sample submission that is later used as the output template.
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jan-2022/train.csv',
        '../input/tabular-playground-series-jan-2022/test.csv',
        '../input/tabular-playground-series-jan-2022/sample_submission.csv',
    )
)

In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# Preview the first rows of the raw test frame.
test.head()

In [None]:
# Report (rows, columns) for both frames.
for caption, frame in (('Train shape :- ', train), ('Test shape :- ', test)):
    print(caption, frame.shape)

In [None]:
# Convert the 'date' column of both frames to datetime so the `.dt`
# accessor can be used on it.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Summary statistics restricted to object-dtype columns of the training frame.
train.describe(include='O')

In [None]:
# Names of the object-dtype columns in train; these are one-hot encoded later.
cat_cols = list(train.select_dtypes(include='object').columns)

In [None]:
# Output column name -> attribute of the pandas `.dt` accessor it is read from.
# NOTE(review): 'day_of_month' is filled from `days_in_month` (length of the
# month, 28-31), not the day number, and 'weekday' repeats 'day_of_week'
# (`weekday` is a pandas alias of `dayofweek`). Names and values are kept
# unchanged so the feature set fed to the model stays the same.
_DATE_PARTS = {
    'year': 'year',
    'month': 'month',
    'day': 'day',
    'day_of_year': 'dayofyear',
    'day_of_month': 'days_in_month',
    'day_of_week': 'dayofweek',
    'weekday': 'weekday',
}


def _add_date_features(frame):
    """Add calendar columns derived from ``frame['date']`` to ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a datetime-typed ``'date'`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, with one new column per entry of
        ``_DATE_PARTS`` (added in that order).
    """
    for column, accessor_attr in _DATE_PARTS.items():
        frame[column] = getattr(frame['date'].dt, accessor_attr)
    return frame


# Apply the identical transformation to both splits so their columns match.
train = _add_date_features(train)
test = _add_date_features(test)

In [None]:
# One-hot encode the categorical columns of both splits.
train = pd.get_dummies(train, columns=cat_cols)
test  = pd.get_dummies(test, columns=cat_cols)

# Encoding each split on its own only creates indicator columns for the
# categories that occur in that split. Re-index test onto train's feature
# layout (everything except the target) so both frames expose the same
# columns in the same order; indicators absent from test are filled with 0.
feature_layout = [col for col in train.columns if col != 'num_sold']
test = test.reindex(columns=feature_layout, fill_value=0)

In [None]:
# Preview the training frame after the date features and one-hot encoding.
train.head()

In [None]:
# Separate the target, then strip the columns that are not model inputs.
y = train['num_sold']
train = train.drop(columns=['num_sold', 'date', 'row_id'])
test = test.drop(columns=['date', 'row_id'])

In [None]:
def smape(actual, predicted):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``mean(|predicted - actual| / ((|actual| + |predicted|) / 2)) * 100``.

    Parameters
    ----------
    actual : array-like
        Ground-truth values.
    predicted : array-like
        Predicted values, paired with ``actual`` by position.

    Returns
    -------
    float
        SMAPE in the range [0, 200]. Pairs where both values are zero
        contribute 0 (a perfect prediction) instead of NaN.
    """
    # Work on plain float arrays so lists, Series and ndarrays all behave the
    # same and are paired by position.
    actual = np.atleast_1d(np.asarray(actual, dtype=float))
    predicted = np.atleast_1d(np.asarray(predicted, dtype=float))

    numerator = np.abs(predicted - actual)
    denominator = (np.abs(actual) + np.abs(predicted)) / 2

    # A zero denominator only happens when actual == predicted == 0, i.e. the
    # error is zero; leave those entries at 0 rather than dividing 0 by 0.
    ratio = np.zeros_like(numerator)
    nonzero = denominator != 0
    ratio[nonzero] = numerator[nonzero] / denominator[nonzero]

    return np.mean(ratio) * 100

In [None]:
def objective(trial, data=train, target=y):
    """Optuna objective: fit one XGBoost regressor and return its hold-out SMAPE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    data : pandas.DataFrame
        Feature matrix; defaults to the module-level ``train`` frame.
    target : pandas.Series
        Target values aligned with ``data``; defaults to the module-level ``y``.

    Returns
    -------
    float
        SMAPE (percent) on the last 30% of the rows; lower is better.
    """
    # shuffle=False keeps the original row order, so the hold-out set is the
    # trailing 30% of rows (presumably the most recent dates - confirm the
    # input is sorted by date).
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.3, random_state=0, shuffle=False
    )

    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 15),
        'eta': trial.suggest_float('eta', 0.005, 0.1),
        # step=0.1 samples on the grid 0.2, 0.3, ..., 0.9.
        'subsample': trial.suggest_float('subsample', 0.2, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9, step=0.1),
        # log=True samples uniformly in log space across eight orders of magnitude.
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 1e4, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e4, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1e4, log=True),
        'gamma': trial.suggest_float('gamma', 1e-4, 1e4, log=True),
        'predictor': "gpu_predictor",
        # NOTE(review): early stopping monitors MAPE while the value returned
        # to Optuna is SMAPE - confirm this mismatch is intended.
        'eval_metric': 'mape'
    }

    model = XGBRegressor(**params,
                         tree_method='gpu_hist',
                         random_state=2021)
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=False)
    preds = model.predict(test_x)
    score = smape(test_y, preds)

    return score

In [None]:
# Seed the sampler so repeated runs propose the same sequence of
# hyper-parameters; without it every Restart & Run All explores a different
# path through the search space.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=2021),
)
study.optimize(objective, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Optimisation-history plot for the finished study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# EDF plot of the study (Optuna's `plot_edf`).
optuna.visualization.plot_edf(study)

In [None]:
# Hyper-parameter importance plot for the study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Slice plot of the study (Optuna's `plot_slice`).
optuna.visualization.plot_slice(study)

In [None]:
# Parallel-coordinate plot of the study's trials.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Keep the best trial's sampled hyper-parameters under a short name and show them.
params = study.best_params
print(params)

In [None]:
%%time
folds = TimeSeriesSplit(10)

preds = np.zeros(len(test))
scores = []

for fold, (train_idx, valid_idx) in enumerate(folds.split(train)):
    
    X_train, y_train = train.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = train.iloc[valid_idx], y.iloc[valid_idx]
    
    model = XGBRegressor(booster='gbtree',
                         tree_method='gpu_hist',
                         predictor='gpu_predictor')
    
    model.fit(X_train, y_train,
              verbose=False)
    
    preds_valid = model.predict(X_valid)
    score = smape(y_valid, preds_valid)
    scores.append(score)
    
    print(f"Fold: {fold + 1} Score: {score}")
    print('||'*30)
    
    preds += model.predict(test) / folds.n_splits

print(f"\nOverall Validation Score: {np.mean(scores)}")

In [None]:
# Bracket assignment always writes a DataFrame column; attribute-style
# assignment would only set a Python attribute if the column were missing.
ss['num_sold'] = preds
ss.to_csv('xgb.csv', index=False)
ss.head()