In [None]:
import numpy as np # linear algebra
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

import os
# Print the full path of every file found under the Kaggle input directory.
for input_path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

In [None]:
# Locations of the competition CSV files, relative to the notebook's working directory.
train_datapath = '../input/demand-forecasting-kernels-only/train.csv'  # labelled history (includes 'sales')
test_datapath = '../input/demand-forecasting-kernels-only/test.csv'  # rows to predict
submission_datapath = '../input/demand-forecasting-kernels-only/sample_submission.csv'  # sample submission file

In [None]:
# Read the three competition CSVs into DataFrames in one pass.
df_train, df_test, df_submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_datapath, test_datapath, submission_datapath)
)

In [None]:
# Preview the first five rows of the raw training data.
df_train.head(n=5)

In [None]:
# Preview the first five rows of the raw test data.
df_test.head(n=5)

In [None]:
# Preview the first five rows of the sample submission file.
df_submission.head(n=5)

In [None]:
def convert_dates(x):
    """Replace the ``date`` column with ``month``, ``year`` and ``dayofweek`` columns.

    The input frame is not modified; a new DataFrame is returned, so the
    caller's original data stays intact.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        Copy of ``x`` without ``date`` and with the integer columns
        ``month``, ``year`` and ``dayofweek`` appended (in that order).

    Raises
    ------
    KeyError
        If ``x`` has no ``date`` column.
    """
    dates = pd.to_datetime(x['date'])
    # drop() and assign() both return new frames, so ``x`` itself is untouched.
    return x.drop(columns='date').assign(
        month=dates.dt.month,
        year=dates.dt.year,
        dayofweek=dates.dt.dayofweek,  # pandas convention: Monday=0 ... Sunday=6
    )

In [None]:
# Swap the raw 'date' column for month/year/dayofweek features.
# The result no longer has a 'date' column, so this cell cannot be re-run on it.
df_train = convert_dates(x=df_train)

In [None]:
# Check the date-derived columns on the training data.
df_train.head(n=5)

In [None]:
# Apply the same date feature extraction to the test data.
# The result no longer has a 'date' column, so this cell cannot be re-run on it.
df_test = convert_dates(x=df_test)

In [None]:
# Check the date-derived columns on the test data.
df_test.head(n=5)

In [None]:
def add_avg(x):
    """Append per-group mean-sales features without modifying the input frame.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with ``item``, ``store``, ``dayofweek``, ``month`` and ``sales``
        columns.

    Returns
    -------
    pandas.DataFrame
        New frame equal to ``x`` plus two columns:

        * ``daily_avg``   - mean ``sales`` per (item, store, dayofweek)
        * ``monthly_avg`` - mean ``sales`` per (item, store, month)

    Raises
    ------
    KeyError
        If any of the grouping columns or ``sales`` is missing.
    """
    def group_mean(keys):
        # transform('mean') broadcasts each group's mean back onto its rows,
        # so the result lines up with x's index.
        return x.groupby(keys)['sales'].transform('mean')

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    return x.assign(
        daily_avg=group_mean(['item', 'store', 'dayofweek']),
        monthly_avg=group_mean(['item', 'store', 'month']),
    )

In [None]:
# Attach the daily_avg / monthly_avg mean-sales features to the training data.
df_train = add_avg(x=df_train)

In [None]:
# Check the newly added average-sales columns.
df_train.head(n=5)

In [None]:
# Lookup table: mean sales per (item, store, dayofweek), later joined onto the test data.
daily_avg = df_train.groupby(['item','store','dayofweek'], as_index=False)['sales'].mean()

In [None]:
# Lookup table: mean sales per (item, store, month), later joined onto the test data.
monthly_avg = df_train.groupby(['item','store','month'], as_index=False)['sales'].mean()

In [None]:
def merge(x,y,col,col_name):
    """Left-join the ``sales`` column of ``y`` onto ``x`` under a new name.

    Parameters
    ----------
    x : pandas.DataFrame
        Left frame; every one of its rows is kept.
    y : pandas.DataFrame
        Lookup frame holding the key columns in ``col`` plus a ``sales`` column.
    col : list of str
        Names of the key columns, present in both frames.
    col_name : str
        Name given to the ``sales`` values brought in from ``y``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns of ``x`` followed by ``col_name``. Rows of
        ``x`` without a match in ``y`` get NaN. Neither input is modified.
    """
    # Rename on the lookup side *before* joining. Renaming afterwards breaks
    # when ``x`` has its own 'sales' column: pandas would suffix both to
    # sales_x / sales_y and the rename would silently do nothing.
    lookup = y.rename(columns={'sales': col_name})
    # sort=True is kept from the original call: the result is ordered by the
    # join keys rather than by the row order of ``x``.
    return pd.merge(x, lookup, how='left', on=col, sort=True)

In [None]:
# Bring the training-set averages onto the test rows so both frames share the same feature columns.
df_test = merge(x=df_test, y=daily_avg, col=['item','store','dayofweek'], col_name='daily_avg')
df_test = merge(x=df_test, y=monthly_avg, col=['item','store','month'], col_name='monthly_avg')

In [None]:
# Spot-check ten random test rows; the fixed seed keeps the displayed sample
# identical across "Restart & Run All" executions.
df_test.sample(10, random_state=123)

In [None]:
# Hold out 20% of the training rows for validation.
# The target is selected with df_train['sales'] rather than df_train.pop('sales'):
# pop() removes the column from df_train in place, so re-running the cell would
# raise KeyError and df_train would silently lose its target column.
x_train, x_test, y_train, y_test = train_test_split(
    df_train.drop(columns='sales'),
    df_train['sales'],
    random_state=123,
    test_size=0.2,
)

# XGBoost

Instead of NumPy arrays or pandas DataFrames, XGBoost uses DMatrix objects. A DMatrix can contain both the features and the target. If you have already loaded your data into NumPy arrays X and y, you can create a DMatrix with: <br>

xgb.DMatrix(X, label=y)

https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f - Tutorial Link
https://xgboost.readthedocs.io/en/latest/parameter.html - Documentation

In [None]:
def XGBmodel(x_train,x_test,y_train,y_test):
    """Train an XGBoost regressor with early stopping on a held-out split.

    Parameters
    ----------
    x_train, y_train
        Features and target used to fit the model.
    x_test, y_test
        Features and target of the held-out split, used only as the
        evaluation set that drives early stopping.

    Returns
    -------
    xgboost.Booster
        The trained booster.
    """
    matrix_train = xgb.DMatrix(x_train, label=y_train)
    matrix_test = xgb.DMatrix(x_test, label=y_test)
    params = {
        # 'reg:squarederror' is the current name of the squared-error regression
        # objective; the former alias 'reg:linear' is deprecated.
        'objective': 'reg:squarederror',
        # Mean absolute error is reported on the evaluation set ('rmse' also works).
        'eval_metric': 'mae',
    }
    model = xgb.train(
        params=params,
        dtrain=matrix_train,
        num_boost_round=200,
        # Stop when the evaluation metric has not improved for 20 consecutive rounds.
        early_stopping_rounds=20,
        evals=[(matrix_test, 'test')],
    )
    return model

# Fit the booster; the held-out split doubles as the early-stopping evaluation set.
model = XGBmodel(x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test)

In [None]:
# Predict on the held-out split using only the trees up to the best early-stopped
# iteration, so the validation metrics describe the same model that produces the
# submission. iteration_range is half-open, hence best_iteration + 1.
x_test_pred = model.predict(
    xgb.DMatrix(x_test),
    iteration_range=(0, model.best_iteration + 1),
)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

In [None]:
# Mean squared error of the held-out predictions.
mean_squared_error(y_test, x_test_pred)

In [None]:
# Root mean squared error: square root of the MSE on the held-out split.
root_mean_sqaure_error_RMSE = sqrt(mean_squared_error(y_test, x_test_pred))
root_mean_sqaure_error_RMSE

In [None]:
# Mean absolute error of the held-out predictions (the metric used for early stopping).
mean_absolute_error(y_test, x_test_pred)

In [None]:
# Move 'id' out of the test features into the submission frame.
# pop() removes the column from df_test in place, leaving only model features behind.
submission = df_test.pop('id').to_frame()

In [None]:
# Preview the submission frame (ids only at this point).
submission.head(n=5)

In [None]:
# Predict the test set with the trees up to the best early-stopped iteration.
# iteration_range replaces the ntree_limit / best_ntree_limit pair, which is
# deprecated (and absent from recent XGBoost releases); the range is half-open,
# hence best_iteration + 1. best_iteration is set because the model was trained
# with early stopping.
y_pred = model.predict(
    xgb.DMatrix(df_test),
    iteration_range=(0, model.best_iteration + 1),
)

In [None]:
# Attach the predicted sales next to the ids.
submission = submission.assign(sales=y_pred)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Final look at the submission: ids with their predicted sales.
submission.head(n=5)