In [None]:
import pandas as pd
import numpy as np

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively walk the Kaggle input mount and print the full path of every file.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the competition's training CSV inside the Kaggle input mount.
TRAIN_CSV = "/kaggle/input/demand-forecasting-kernels-only/train.csv"
train = pd.read_csv(TRAIN_CSV)

In [None]:
# Peek at the first five rows of the training data.
train.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the training frame.
train.describe()

25% of the rows have sales below 30, 50% have sales below 47, and 75% have sales below 70. The highest sales value is 231.

In [None]:
# Location of the competition's test CSV inside the Kaggle input mount.
TEST_CSV = "/kaggle/input/demand-forecasting-kernels-only/test.csv"
test = pd.read_csv(TEST_CSV)

In [None]:
# Peek at the first five rows of the test data.
test.head(n=5)

# Quick Visualizations

In [None]:
def add_date_features(frame):
    """Parse ``frame['date']`` and add calendar feature columns in place.

    The ``date`` column is converted with ``pd.to_datetime`` and four columns
    derived from it are added, in this order: ``weekday`` (``dt.dayofweek``),
    ``dayofyear``, ``year`` and ``month``.

    Re-running the function on an already processed frame is harmless: the
    conversion is a no-op on datetime data and the derived columns are simply
    recomputed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing a ``date`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, returned for convenience.
    """
    frame['date'] = pd.to_datetime(frame['date'])
    calendar = frame['date'].dt
    frame['weekday'] = calendar.dayofweek
    frame['dayofyear'] = calendar.dayofyear
    frame['year'] = calendar.year
    frame['month'] = calendar.month
    return frame


# Use one helper for both sets so the training and test features cannot drift apart.
add_date_features(train)
add_date_features(test)

In [None]:
import plotly.express as px

# One box per calendar year showing the distribution of the sales values.
yearly_sales_fig = px.box(x=train['year'], y=train['sales'], title="Yearly Sales")
yearly_sales_fig

Sales seem to have increased from year to year, but not by much.

In [None]:
# Sales values plotted against their date.
daily_sales_fig = px.line(x=train['date'], y=train['sales'], title="Daily Sales")
daily_sales_fig

There seems to be a seasonal pattern, with sales peaking around June.

In [None]:
# One box per calendar month showing the distribution of the sales values.
monthly_sales_fig = px.box(x=train['month'], y=train['sales'], title="Monthly Sales")
monthly_sales_fig

Sales are lowest at the beginning and end of the year and highest during June–August. Could this be an effect of the summer holidays?

# Missing Values

In [None]:
# Number of missing entries in each column of the training data.
train.isna().sum()

There are no missing values.

# Outlier detection

In [None]:
# Skewness of each numeric column. `train` also holds the datetime `date`
# column at this point; skewness is undefined for datetimes and pandas >= 2.0
# raises TypeError unless non-numeric columns are excluded explicitly.
train.skew(numeric_only=True)

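The skewness of every numeric column lies between -1 and +1, so none of the distributions is heavily skewed. This suggests there are no extreme outliers, although skewness alone is not a formal outlier test.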

# Modeling

In [None]:
# List the columns now present in the training frame (original columns plus the date features).
train.columns

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target and the raw timestamp.
X = train.drop(columns=['sales', 'date'])
y = train['sales']

# Hold out 20% of the rows as a validation set (seeded for reproducibility).
# NOTE(review): train_test_split shuffles rows by default, so validation rows
# are interleaved in time with training rows; a date-based hold-out would
# mimic forecasting more closely. Confirm this is intended.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=123
)

for description, part in (
    ("Shape of training features:", X_train),
    ("Shape of training labels:", y_train),
    ("Shape of validation features:", X_val),
    ("Shape of validation labels:", y_val),
):
    print(description, part.shape)

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

In [None]:
# Baseline: XGBoost regressor with library defaults and a fixed seed.
xgb = XGBRegressor(random_state=123)

# 5-fold cross-validation on the training split. scikit-learn reports the
# NEGATED mean squared error, so higher (closer to zero) is better.
XBG_score = cross_val_score(
    estimator=xgb,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=15,
)

In [None]:
# Flip the sign of the negated-MSE scores to get the mean cross-validated MSE.
mean_cv_mse = -XBG_score.mean()
print("MSE:", mean_cv_mse)
print("RMSE:", np.sqrt(mean_cv_mse))

Since this competition is judged based on the SMAPE score, we will be using that.

In [None]:
def smape(A, F):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 / n * sum(2 * |F - A| / (|A| + |F|))`` where ``n`` is
    ``len(A)``.

    Inputs are converted to NumPy float arrays first, so values are compared
    by position; passing two pandas Series with different indexes no longer
    triggers index alignment (which would inject NaNs). A term whose actual
    and forecast are both zero is a perfect prediction and contributes 0
    rather than the NaN produced by 0/0.

    Parameters
    ----------
    A : array-like
        Actual (observed) values.
    F : array-like
        Forecast values, broadcastable against ``A``.

    Returns
    -------
    float
        SMAPE in the range [0, 200].

    Raises
    ------
    ValueError
        If ``A`` is empty, since the mean is undefined.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)

    n_obs = len(actual)
    if n_obs == 0:
        raise ValueError("smape() requires at least one observation")

    numerator = 2.0 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)

    # Divide only where the denominator is non-zero; the pre-filled zeros
    # cover the 0/0 case (actual == forecast == 0).
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100 / n_obs * np.sum(terms)

In [None]:
# Fit the baseline on the training split and score it on the same rows.
# This is an in-sample (optimistic) score.
xgb.fit(X_train, y_train)
training_predictions = xgb.predict(X_train)
baseline_train_smape = smape(y_train, training_predictions)
print("SMAPE score:", baseline_train_smape)

Let's try to tune the hyperparameters to improve the score

# Hyperparameter tuning

In [None]:
# Display the baseline estimator and its current parameter settings.
xgb

In [None]:
# Search space for the Bayesian hyperparameter search.
#
# Each dimension is a (low, high) range tuple. scikit-optimize turns an
# all-integer tuple into an Integer dimension and a float tuple into a Real
# dimension, so the optimiser can use the ordering of the values. The earlier
# np.arange grids had more than three entries each and were therefore treated
# as Categorical dimensions with hundreds or thousands of unordered
# categories.
#
# Bounds are the start/stop values of the original grids (both ends inclusive).
param_grid = {
    'max_depth': (3, 12),
    'min_child_weight': (0.0001, 0.5),
    'gamma': (0.0, 40.0),
    'learning_rate': (0.0005, 0.3),
    'subsample': (0.01, 1.0),
}

In [None]:
#Bayesian optimization over hyper parameters.

from skopt import BayesSearchCV

In [None]:
# Bayesian search over `param_grid` with 3-fold CV, optimising negated MSE.
tuned_XGB = BayesSearchCV(
    estimator=xgb,
    search_spaces=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    random_state=123,
    verbose=15,
)

In [None]:
# Run the hyperparameter search on the training split.
tuned_XGB.fit(X=X_train, y=y_train)

In [None]:
# Disabled snippet kept as a bare string literal, so none of it executes.
# If enabled it would play a 440 Hz tone for 1000 ms through `winsound`
# (a Windows-only standard-library module) — presumably as a "search finished"
# notification; confirm before re-enabling or delete the cell.
"""
import winsound
duration = 1000  # milliseconds
freq = 440  # Hz
winsound.Beep(freq, duration)
"""

In [None]:
# Display the estimator refit with the best hyperparameters found by the search.
tuned_XGB.best_estimator_

In [None]:
# In-sample SMAPE of the tuned model, on the same rows it was trained on.
training_predictions = tuned_XGB.predict(X_train)
tuned_train_smape = smape(y_train, training_predictions)
print("SMAPE score:", tuned_train_smape)

# Using validation set

In [None]:
# SMAPE of the tuned model on the held-out validation split.
validation_predictions = tuned_XGB.predict(X_val)
tuned_val_smape = smape(y_val, validation_predictions)
print("SMAPE score:", tuned_val_smape)

# Using Test set

In [None]:
# Build the test feature matrix by dropping the submission id and the raw timestamp.
X = test.drop(columns=['id', 'date'])

# Predict sales for every test row with the tuned model.
test_predictions = tuned_XGB.predict(X)

In [None]:
# Assemble the submission: one row per test id with its predicted sales.
final_test = pd.DataFrame({'id': test['id'], 'sales': test_predictions})
print(final_test.head())

# Write the submission file without the index column.
final_test.to_csv("submission.csv", index=False)