In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# in the order os.walk yields them, then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
import gc
import holidays

from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor
from xgboost import plot_importance

# Install a blanket "ignore" filter: every Python warning raised after this
# point is suppressed for the rest of the session.
warnings.filterwarnings("ignore")
# Single seed shared by the train/validation split, the XGBoost models and
# the Optuna sampler.
seed = 512

In [None]:
# Load the competition training CSV and show its first rows as a quick preview.
df_train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
df_train.head()

In [None]:
# Define Model Evaluation functions
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent (0-200).

    Computes ``mean(200 * |y_true - y_pred| / (|y_true| + |y_pred|))``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        The SMAPE score; 0 means a perfect prediction.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(y_true - y_pred)
    # Both magnitudes go through abs() so a negative observation cannot
    # shrink (or cancel) the denominator.
    scale = np.abs(y_true) + np.abs(y_pred)
    # When truth and prediction are both zero the prediction is exact: count
    # that pair as 0 instead of letting 0/0 turn the whole mean into NaN.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio * 200)

def evaluate_model(model, x, y):
    """Return the SMAPE of ``model``'s predictions on ``x`` against targets ``y``."""
    return smape(y, model.predict(x))

In [None]:
#Define data pre-processing functions
def label_encoder(df):
    """Integer-encode the ``country``, ``store`` and ``product`` columns.

    Categories are numbered in sorted order, so the code assigned to a value
    depends only on which values are present, not on the order in which rows
    appear. This keeps separately encoded frames (e.g. train and test)
    consistent as long as they contain the same set of categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three categorical columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the three columns replaced by integer codes.
        Missing values are left as NaN; the input frame is not modified.
    """
    df = df.copy()
    for column in ('country', 'store', 'product'):
        categories = sorted(df[column].dropna().unique())
        codes = {category: code for code, category in enumerate(categories)}
        # map() does a plain per-value lookup (no dtype-downcasting step as
        # with replace()).
        df[column] = df[column].map(codes)
    return df

def preprocess_dates(df):
    """Derive calendar features from the ``date`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column parseable by ``pandas.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``date`` converted to datetime and these columns
        added: ``weekday``, ``week``, ``month``, ``quarter``, ``year``,
        ``day_of_year``, ``day_of_month``, ``is_month_start``,
        ``is_month_end`` and ``weekend`` (1 for Saturday/Sunday, else 0).
        The input frame is not modified.
    """
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])
    df['weekday'] = df['date'].dt.weekday
    # ISO week 53 is folded into week 52. clip() replaces the former chained
    # assignment `df['week'][mask] = 52`, which emits SettingWithCopyWarning
    # and silently does nothing when pandas Copy-on-Write is enabled.
    df['week'] = df['date'].dt.isocalendar().week.astype('int').clip(upper=52)
    df['month'] = df['date'].dt.month
    df['quarter'] = df['date'].dt.quarter
    df['year'] = df['date'].dt.year
    df['day_of_year'] = df['date'].dt.day_of_year
    df['day_of_month'] = df['date'].dt.day
    df['is_month_start'] = df['date'].dt.is_month_start
    df['is_month_end'] = df['date'].dt.is_month_end
    # weekday is 0 (Monday) .. 6 (Sunday); 5 and 6 are the weekend.
    df['weekend'] = (df['weekday'] >= 5).astype('int')
    return df

def preprocess_holidays(df):
    """Add an ``is_holiday`` flag based on Finnish, Norwegian and Swedish holidays.

    The holiday calendars for country codes ``FI``, ``NO`` and ``SE`` (years
    2015-2019) are merged into a single set of dates, so a row is flagged when
    its date is a holiday in *any* of the three countries, regardless of the
    row's own ``country`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column (datetime, or anything
        ``pandas.to_datetime`` can parse).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an integer ``is_holiday`` column (1 or 0).
        The input frame is not modified.
    """
    years = [2015, 2016, 2017, 2018, 2019]
    holiday_dates = set()
    for country_code in ('FI', 'NO', 'SE'):
        calendar = holidays.CountryHoliday(country=country_code, years=years)
        holiday_dates.update(calendar.keys())
    holiday_index = pd.to_datetime(sorted(holiday_dates))

    df = df.copy()
    # Coerce to datetime so the flag is not silently all-zero when `date` is
    # still a string column, and use the vectorised isin() instead of scanning
    # the whole holiday list once per row.
    df['is_holiday'] = pd.to_datetime(df['date']).isin(holiday_index).astype('int64')
    return df

def preprocess_timeseries(df):
    """Add cyclical (sin/cos) encodings of the calendar features.

    Each feature is first mapped onto the unit circle with
    ``angle = 2 * pi * value / period`` so that the end of a cycle sits next
    to its start (e.g. December next to January). Taking sin/cos of the raw
    integers would treat them as radians and produce values with no relation
    to the feature's period.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``day_of_year``, ``month``, ``weekday`` and
        ``quarter`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``sin_<feature>`` and ``cos_<feature>`` columns
        added for each of the four features. The input frame is not modified.
    """
    df = df.copy()
    # Length of one full cycle per feature; 365.25 averages over leap years.
    periods = {'day_of_year': 365.25, 'month': 12, 'weekday': 7, 'quarter': 4}
    angles = {name: 2 * np.pi * df[name] / period for name, period in periods.items()}
    # Sin of date values
    for name in periods:
        df['sin_' + name] = np.sin(angles[name])
    # Cos of date values
    for name in periods:
        df['cos_' + name] = np.cos(angles[name])
    return df

In [None]:
# Load the training CSV into `train_df`, the frame that the preprocessing and
# modelling cells work on, and display it.
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-jan-2022/train.csv", sep=',')
train_df

In [None]:
# Run the training frame through the four preprocessing steps, in order.
train_df = (
    train_df
    .pipe(label_encoder)
    .pipe(preprocess_dates)
    .pipe(preprocess_holidays)
    .pipe(preprocess_timeseries)
)
train_df

In [None]:
# Features are every column except the identifier, the raw date and the target.
all_features = train_df.drop(columns=['row_id', 'date', 'num_sold'])
all_targets = train_df['num_sold']
# Unshuffled split: the last 25% of the rows become the hold-out set.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    all_targets,
    test_size=0.25,
    random_state=seed,
    shuffle=False,
)

## Model: XGBoost
https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBRegressor

In [None]:
# Hand-picked baseline hyper-parameters for the XGBoost regressor.
params = {'n_estimators': 1000,
          'max_depth': 50,
          'subsample': 1.0,
          'eta': 0.3,
          'colsample_bytree': 1.0,
          'gamma': 0.0,
          'min_child_weight': 1,
          'reg_alpha': 1
         }

model =  XGBRegressor(**params,
                      random_state=seed,
                      early_stopping_rounds=300,
                      verbosity=0)

# Early stopping needs a validation set to monitor: XGBoost documents
# `early_stopping_rounds` as requiring at least one entry in `eval_set`.
# Without it the setting is either ignored or makes `fit` raise, depending on
# the XGBoost version. `verbose=100` logs the metric every 100 rounds.
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)
score = evaluate_model(model, X_test, y_test)
print(score)   # Public score = 7.36938 (recorded before eval_set was added)

In [None]:
# Plot the feature importances of the fitted baseline `model`.
plot_importance(model)

# Optuna Optimization

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: fit an XGBRegressor with sampled hyper-parameters.

    The model is trained on the notebook-level ``X_train``/``y_train`` and
    scored with SMAPE on ``X_test``/``y_test``; the study minimises the
    returned value.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        SMAPE of the fitted model on the hold-out set.
    """
    # Parameters are sampled in the same order as before so a seeded sampler
    # draws the same sequence.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 80),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'eta': trial.suggest_float('eta', 0.01, 0.5),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        # The trial key is spelled 'gammma' (three m's); it is kept as-is
        # because the best-trial lookup later in the notebook reads that key.
        'gamma': trial.suggest_float('gammma', 0.0, 0.5),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.7, 1.0),
    }

    model = XGBRegressor(**params,
                         random_state=seed,
                         early_stopping_rounds=300,
                         verbosity=0)

    # Early stopping requires a validation set; without `eval_set` it is
    # either ignored or makes `fit` raise, depending on the XGBoost version.
    # verbose=False keeps per-round metric logging out of the trial output.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    return evaluate_model(model, X_test, y_test)

In [None]:
# Seeded TPE sampler so the sequence of sampled hyper-parameters is repeatable.
tpe_sampler = optuna.samplers.TPESampler(seed=seed)
study = optuna.create_study(direction="minimize", sampler=tpe_sampler)

# Search under a wall-clock budget of 9 hours (expressed in seconds);
# a fixed trial count (e.g. n_trials=500) is the alternative stopping rule.
study.optimize(objective, timeout=9 * 60 * 60)

In [None]:
# Report the best score found by the study, then display its parameters.
best_trial = study.best_trial
print('Best trial score:', best_trial.value)
best_trial.params

In [None]:
# Create model with best trial parameters
best_params = study.best_trial.params
params = {'n_estimators': best_params['n_estimators'],
          'max_depth': best_params['max_depth'],
          'subsample': best_params['subsample'],
          'eta': best_params['eta'],
          'colsample_bytree': best_params['colsample_bytree'],
          # The study stores this value under the key 'gammma' (three m's).
          'gamma': best_params['gammma'],
          'min_child_weight': best_params['min_child_weight'],
          'reg_alpha': best_params['reg_alpha']
         }

best_model =  XGBRegressor(**params,
                           random_state=seed,
                           early_stopping_rounds=300,
                           verbosity=0)

# Early stopping requires a validation set: without `eval_set` the setting is
# either ignored or makes `fit` raise, depending on the XGBoost version.
# `verbose=100` logs the validation metric every 100 rounds.
best_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)
score = evaluate_model(best_model, X_test, y_test)
print(score)

## Submission

In [None]:
# Train best model with all train data
X_train = train_df.drop(['row_id', 'date', 'num_sold'], axis=1)
y_train = train_df['num_sold']

# Every row is now used for training, so there is no validation set left for
# early stopping to monitor. Disable it before refitting; otherwise recent
# XGBoost versions raise because no eval_set is supplied.
best_model.set_params(early_stopping_rounds=None)
best_model.fit(X_train, y_train, verbose=True)

In [None]:
# Load the competition test set and apply the same four preprocessing steps
# as for the training frame, in the same order.
real_test_df = (
    pd.read_csv("/kaggle/input/tabular-playground-series-jan-2022/test.csv", sep=',')
    .pipe(label_encoder)
    .pipe(preprocess_dates)
    .pipe(preprocess_holidays)
    .pipe(preprocess_timeseries)
)
# Model inputs: everything except the identifier and the raw date.
X_real_test = real_test_df.drop(columns=['row_id', 'date'])

In [None]:
# Predict the test rows and pair each prediction with its row identifier.
target = np.squeeze(best_model.predict(X_real_test))
row_id = real_test_df['row_id'].values
submission_columns = {'row_id' : row_id, 'num_sold' : target}
submission = pd.DataFrame(submission_columns)

In [None]:
# Preview the first rows of the submission frame before writing it out.
submission.head()

In [None]:
# Write the submission to submission.csv in the working directory, without the index column.
submission.to_csv('submission.csv', index=False)