## Import packages

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import xgboost as xgb

from datetime import date, datetime
import time
import calendar

import matplotlib.pyplot as plt
import seaborn as sns

## Load dataset

In [None]:
# Kaggle input directory that holds the competition CSV files.
path = '../input/store-sales-time-series-forecasting/'

# Core tables: training rows, rows to predict, and the submission template.
train, test, sub = (
    pd.read_csv(path + name)
    for name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Auxiliary tables that are joined onto train/test further down.
oil, holiday, store, tran = (
    pd.read_csv(path + name)
    for name in ('oil.csv', 'holidays_events.csv', 'stores.csv', 'transactions.csv')
)

## Preprocess datasets

* Add calendar features: weekday, year, month, day, and a payday flag (the 15th and the last day of each month)

In [None]:
def preprocess_train(df):
    """Add calendar features to a sales frame and drop its ``id`` column.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with an ``id`` column and a ``date`` column holding
            ISO-formatted date strings.

    Returns:
        The same DataFrame with ``date`` converted to ``datetime.date`` and
        the integer columns ``weekday`` (Monday = 0), ``year``, ``month``,
        ``day`` and ``payday`` appended.
    """
    days = df['date'].map(date.fromisoformat)
    df['date'] = days
    df['weekday'] = days.map(date.weekday)
    for part in ('year', 'month', 'day'):
        df[part] = days.map(lambda d, part=part: getattr(d, part))

    # Number of days in the row's month, used to spot month-end dates.
    df['eomd'] = days.map(lambda d: calendar.monthrange(d.year, d.month)[1])

    # Payday flag: 1 on the 15th and on the last day of the month, else 0.
    is_payday = df['day'].eq(df['eomd']) | df['day'].eq(15)
    df['payday'] = is_payday.astype(int)

    df.drop(columns=['id', 'eomd'], inplace=True)
    return df

# Apply the same calendar feature engineering to both splits.
train, test = preprocess_train(train), preprocess_train(test)

* Fill missing oil prices with the average oil price of the same month

In [None]:
def preprocess_oil(oil):
    """Fill gaps in the daily oil price series and parse its dates.

    Missing ``dcoilwtico`` values are replaced by the mean of the observed
    prices in the same year-month. A month without any observed price stays
    missing.

    Args:
        oil: DataFrame with a ``date`` column of ISO-formatted strings and a
            numeric ``dcoilwtico`` column.

    Returns:
        A new DataFrame with ``dcoilwtico`` imputed and ``date`` converted to
        ``datetime.date`` objects. The input frame is left untouched.
    """
    # Work on a copy so the caller's frame does not pick up scratch columns.
    oil = oil.copy()

    # Year-month key such as 201701, taken from the ISO date string.
    month_key = oil['date'].map(lambda x: int(x.replace('-', '')[:6]))
    month_avg = oil.groupby(month_key)['dcoilwtico'].transform('mean')

    # Assign the filled column back explicitly: a chained in-place fillna on
    # a column selection does not reliably write through to the frame.
    oil['dcoilwtico'] = oil['dcoilwtico'].fillna(month_avg)

    oil['date'] = oil['date'].map(date.fromisoformat)
    return oil

# Replace the raw oil table with its gap-filled, date-parsed version.
oil = preprocess_oil(oil)

* Separate holidays into three locale types (national, regional and local) and split out events (FIFA World Cup etc.); the earthquake events ("Terremoto Manabi") are kept in their own table

In [None]:
def preprocess_holiday(df):
    """Filter the holiday calendar and split out event and earthquake rows.

    The ``date`` column of the input frame is converted to ``datetime.date``
    in place. Rows flagged as transferred and rows of type ``'Work Day'`` are
    then removed.

    Args:
        df: DataFrame with the columns ``date`` (ISO strings), ``type``,
            ``description`` and ``transferred``.

    Returns:
        Tuple ``(kept, event, earthquake)``: ``kept`` is every remaining row
        (events included), ``event`` is the rows of type ``'Event'`` whose
        description does not start with ``'Terremoto Manabi'``, and
        ``earthquake`` is the event rows whose description does.
    """
    df['date'] = df['date'].map(date.fromisoformat)

    keep = df['transferred'].eq(False) & df['type'].ne('Work Day')
    df = df[keep]

    events = df[df['type'].eq('Event')]
    is_quake = events['description'].str.startswith('Terremoto Manabi')
    return df, events[is_quake.eq(False)], events[is_quake]

holiday, event, earthquake = preprocess_holiday(holiday)

# Events and earthquake days: keep the date plus a renamed description column.
event = event[['date', 'description']].rename(columns={'description': 'event_name'})
earthquake = earthquake[['date', 'description']].rename(columns={'description': 'earthquake'})

# NOTE(review): `holiday` still contains the 'Event' rows (preprocess_holiday
# only removes transferred days and work days), so events with a National
# locale would also show up in h_national - confirm this is intended.

# Local holidays, later joined by date and city.
h_local = (
    holiday.loc[holiday['locale'] == 'Local', ['date', 'locale_name', 'description']]
    .rename(columns={'locale_name': 'city', 'description': 'local_holiday_name'})
)

# Regional holidays, later joined by date and state.
h_regional = (
    holiday.loc[holiday['locale'] == 'Regional', ['date', 'locale_name', 'description']]
    .rename(columns={'locale_name': 'state', 'description': 'regional_holiday_name'})
)

# National holidays, later joined by date only.
h_national = (
    holiday.loc[holiday['locale'] == 'National', ['date', 'description']]
    .rename(columns={'description': 'national_holiday_name'})
)

## Merge datasets

In [None]:
def _join_lookup(df, lookup, keys, fill_value):
    """Left-join ``lookup`` onto ``df`` by ``keys`` without ever adding rows.

    If the lookup table holds several rows for one key (for example two
    holidays on the same date), a plain left merge would emit the matching
    ``df`` row once per lookup row. Only the first lookup row per key is
    used, so the result always has exactly ``len(df)`` rows in the original
    order. Missing values are filled only in the columns added by the join.
    """
    lookup = lookup.drop_duplicates(subset=keys, keep='first')
    merged = df.merge(lookup, on=keys, how='left')
    added = [col for col in merged.columns if col not in df.columns]
    if added:
        merged[added] = merged[added].fillna(fill_value)
    return merged


def merge_tables(df):
    """Attach oil, store, event, holiday and transaction data to a sales frame.

    Reads the module-level tables ``oil``, ``store``, ``event``,
    ``earthquake``, ``h_local``, ``h_regional``, ``h_national`` and ``tran``.

    Args:
        df: DataFrame with at least ``date`` and ``store_nbr`` columns.

    Returns:
        A new DataFrame with one row per input row. Unmatched text columns
        (store attributes, event and holiday names) are filled with the
        string ``'0'``; unmatched numeric columns (oil price, transactions)
        are filled with ``0``.
    """
    # Dates absent from the oil table get a price of 0, which is the value
    # the previous whole-frame fill produced once the column was cast to float.
    df = _join_lookup(df, oil, ['date'], 0)
    df = _join_lookup(df, store, ['store_nbr'], '0')
    df = _join_lookup(df, event, ['date'], '0')
    df = _join_lookup(df, earthquake, ['date'], '0')
    df = _join_lookup(df, h_local, ['date', 'city'], '0')
    df = _join_lookup(df, h_regional, ['date', 'state'], '0')
    df = _join_lookup(df, h_national, ['date'], '0')
    # NOTE(review): `tran['date']` is not parsed anywhere in the visible code
    # while `df['date']` holds datetime.date objects, so this join presumably
    # never matches and `transactions` ends up 0 everywhere - confirm before
    # relying on the feature.
    df = _join_lookup(df, tran, ['date', 'store_nbr'], 0)

    return df

# Enrich both splits with the same auxiliary tables.
train, test = merge_tables(train), merge_tables(test)

In [None]:
# Make sure the oil price column is numeric in both frames before modelling.
for frame in (train, test):
    frame['dcoilwtico'] = frame['dcoilwtico'].astype(float)

In [None]:
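# Alternative encoding (disabled): one-hot encode the categorical columns
# instead of label encoding them.
# train = pd.get_dummies(train, columns=['family', 'store_nbr', 'city', 'state', 'type', 'cluster'])
# test = pd.get_dummies(test, columns=['family', 'store_nbr', 'city', 'state', 'type', 'cluster'])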

## Label encoding

In [None]:
# Columns that are replaced by integer codes before training.
cat_features = ['family', 'store_nbr', 'city', 'state', 'type', 'cluster',
                'event_name', 'earthquake', 'local_holiday_name', 'regional_holiday_name', 'national_holiday_name']
for col in cat_features:
    le = LabelEncoder()
    # Fit on the values of both frames: an encoder fitted on train alone makes
    # `transform` raise for any category that only occurs in test. When test
    # holds no unseen category the resulting codes are unchanged.
    le.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

## Set train period and validation period

In [None]:
def preprocess_dataset(df, train_date: list, valid_date: list):
    """Flag each row as belonging to the training or the validation period.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``date`` column.
        train_date: Dates that make up the training period.
        valid_date: Dates that make up the validation period.

    Returns:
        The same DataFrame with two boolean columns added: ``is_train`` and
        ``is_valid``.
    """
    # Vectorised membership test; the previous per-row `x in list` scanned
    # the whole date list in Python for every row of the frame.
    df['is_train'] = df['date'].isin(train_date)
    df['is_valid'] = df['date'].isin(valid_date)
    return df


# Distinct dates in order of first appearance (assumes `train` is sorted by
# date - TODO confirm). The last 31 dates form the validation period and the
# 196 dates before them form the training period.
unique_dates = train['date'].unique()
train_date = unique_dates[-227:-31].tolist()
valid_date = unique_dates[-31:].tolist()
train = preprocess_dataset(train, train_date, valid_date)

In [None]:
# Report the first and last date of each period as a sanity check.
train_start, train_end = min(train_date), max(train_date)
valid_start, valid_end = min(valid_date), max(valid_date)
print('train date from {} to {}'.format(train_start, train_end))
print('valid date from {} to {}'.format(valid_start, valid_end))

## Set X and y

In [None]:
# Target on the log(1 + sales) scale; log1p is the numerically accurate form
# of log(1 + x) and is inverted by exp(x) - 1 when writing the submission.
y = np.log1p(train['sales'])

# Feature frames: the raw date and the year are not used as model inputs.
# X_train keeps the is_train / is_valid flags, which the training function
# uses to split the rows and then removes.
X_train = train.drop(columns=['date', 'sales', 'year'])
X_test = test.drop(columns=['date', 'year'])

## Set XGB params

In [None]:
# Parameters handed to `xgb.train` (native XGBoost API).
# NOTE(review): 'gpu_hist', 'gpu_id' and 'gpu_predictor' select GPU training
# and prediction, so a CUDA device is presumably required; these options are
# deprecated in XGBoost 2.x in favour of `device='cuda'` with
# `tree_method='hist'` - confirm against the installed version.
# NOTE(review): the long float literals look like the output of a
# hyperparameter search - TODO confirm their origin.
xgb_params = {
    'tree_method': 'gpu_hist', 
    'gpu_id': 0,
    'predictor': 'gpu_predictor', 
    'verbosity': 2,
    # Squared-error regression on the log-transformed target, tracked as RMSE.
    'objective': 'reg:squarederror', 
    'eval_metric': 'rmse', 
    'random_state': 2021,
    'learning_rate': 0.009948916127719946,
    'subsample': 0.9963593946651406,
    'colsample_bytree': 0.8056779523100791,
    'reg_alpha': 10.0,
    'reg_lambda': 0.1801543144548864,
    'min_child_weight': 47,
}

## Run XGB

In [None]:
def basic_xgboost(X_train, y, xgb_params, X_test):
    """Train an XGBoost model with early stopping and predict the test set.

    Args:
        X_train: Feature frame that also carries the boolean columns
            ``is_train`` and ``is_valid`` marking the two periods.
        y: Target Series sharing the index of ``X_train``.
        xgb_params: Parameter dict passed to ``xgb.train``.
        X_test: Feature frame to predict.

    Returns:
        Tuple ``(xgb_oof, model, evals_result, xgb_pred, y_val, X_val)``:
        validation predictions, the trained booster, the per-round metric
        history, test predictions as a Series named ``'xgb_pred'``, and the
        validation targets and features.
    """
    start = time.time()

    # Split with boolean masks instead of materialising index lists.
    split_flags = ['is_train', 'is_valid']
    X_tr = X_train.loc[X_train['is_train'].astype(bool)].drop(columns=split_flags)
    X_val = X_train.loc[X_train['is_valid'].astype(bool)].drop(columns=split_flags)
    y_tr = y.loc[X_tr.index]
    y_val = y.loc[X_val.index]

    xgb_train = xgb.DMatrix(X_tr, label=y_tr)
    xgb_valid = xgb.DMatrix(X_val, label=y_val)
    # Early stopping watches the last entry of this list, i.e. 'eval'.
    evallist = [(xgb_train, 'train'), (xgb_valid, 'eval')]
    evals_result = dict()

    model = xgb.train(params=xgb_params, dtrain=xgb_train, evals=evallist, evals_result=evals_result,
                      verbose_eval=5000, num_boost_round=100000, early_stopping_rounds=100)

    # `iteration_range` is half-open and `best_iteration` is a zero-based
    # index, so the end must be best_iteration + 1 to include the best round.
    best_rounds = (0, model.best_iteration + 1)

    xgb_oof = model.predict(xgb_valid, iteration_range=best_rounds)

    xgb_test = xgb.DMatrix(X_test)
    xgb_pred = pd.Series(model.predict(xgb_test, iteration_range=best_rounds),
                         name='xgb_pred')

    elapsed = time.time() - start
    # Take the root explicitly: the `squared=False` switch of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    error_value = float(np.sqrt(mean_squared_error(y_val, xgb_oof)))
    print(f"xgb rmse: {error_value:.6f}, elapsed time: {elapsed:.2f}sec\n")

    return xgb_oof, model, evals_result, xgb_pred, y_val, X_val

In [None]:
%%time
# Train once and keep the validation predictions, booster, metric history,
# test predictions and the validation targets/features for later cells.
oof, model, evals_result, pred, y_val, X_val = basic_xgboost(
    X_train=X_train, y=y, xgb_params=xgb_params, X_test=X_test)

## Plot error

In [None]:
# One row per validation sample on a fresh 0..n-1 index: the encoded store
# and family, the model prediction and the target (both on the log scale).
df_error = (
    X_val[['store_nbr', 'family']]
    .reset_index(drop=True)
    .assign(oof=pd.Series(oof), y_valid=y_val.reset_index(drop=True))
)

In [None]:
# good
# Predicted vs. actual validation values for one store/family pair. The
# filter values are label-encoded codes, not the raw store number or family.
subset = df_error[(df_error['store_nbr'] == 1) & (df_error['family'] == 12)]
y_oof = subset['oof'].tolist()
# Use a separate name so the global `y_val` Series returned by the training
# call is not overwritten (the df_error cell needs it when re-run).
y_true = subset['y_valid'].tolist()
sns.lineplot(x=range(len(y_oof)), y=y_oof, label='oof')
sns.lineplot(x=range(len(y_true)), y=y_true, label='y_valid')

In [None]:
# need to improve
# Predicted vs. actual validation values for one store/family pair. The
# filter values are label-encoded codes, not the raw store number or family.
subset = df_error[(df_error['store_nbr'] == 1) & (df_error['family'] == 14)]
y_oof = subset['oof'].tolist()
# Use a separate name so the global `y_val` Series returned by the training
# call is not overwritten (the df_error cell needs it when re-run).
y_true = subset['y_valid'].tolist()
sns.lineplot(x=range(len(y_oof)), y=y_oof, label='oof')
sns.lineplot(x=range(len(y_true)), y=y_true, label='y_valid')

## Create file for submission

In [None]:
# Assigning a Series aligns on the index, so a prediction vector of the wrong
# length would be truncated or padded with NaN without any error. Fail loudly.
if len(pred) != len(sub):
    raise ValueError(
        f"prediction count ({len(pred)}) does not match submission rows ({len(sub)})"
    )

# Clip negative log-scale predictions to 0, then invert the log(1 + sales)
# transform. expm1 is the accurate form of exp(x) - 1; float64 matches the
# precision of the previous element-wise computation.
sub['sales'] = np.expm1(pred.astype('float64').clip(lower=0)).to_numpy()
sub.to_csv('submission.csv', index=False)

## Print feature importance

In [None]:
# feature importance
# Per-feature scores reported by the booster's get_fscore(), highest first.
scores = model.get_fscore()
fi = pd.DataFrame({
    'feature': list(scores.keys()),
    'importance': list(scores.values()),
})
display(fi.sort_values('importance', ascending=False))