## Problem Statement

In this competition we were given a dataset for two fictitious stores and asked to build a forecast predicting which will have better sales going forward.

## Loading Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import  GridSearchCV, train_test_split
import category_encoders as ce
from sklearn.metrics import mean_absolute_error, make_scorer

First we'll load our train dataset and define a feature engineering function that we can reuse on our test set. In order to capture the importance of different time series features, we'll add month, year, day, weekday, and quarter as features.

In [None]:
#Load the training data; the path is relative to the notebook's working directory
input_df = pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv")

def initial_engineering(data):

    '''Add calendar features and normalise column types for train or test data.

    The frame passed in is modified directly and the same object is returned.
    New columns are appended in this order: month, year, day, weekday, quarter,
    month_start, month_end, quarter_start, quarter_end.
    '''

    #Parse the date column once and reuse the result for the first features
    parsed_dates = pd.DatetimeIndex(data['date'])
    data['month'] = parsed_dates.month.astype(object)
    data['year'] = parsed_dates.year
    data['day'] = parsed_dates.day

    #Identifier columns are stored with object dtype
    for column in ('country', 'store', 'product'):
        data[column] = data[column].astype(object)

    #Replace the raw dates with real datetimes, then derive the remaining
    #calendar attributes through the .dt accessor
    data['date'] = parsed_dates
    calendar_attributes = (
        ('weekday', 'dayofweek'),
        ('quarter', 'quarter'),
        ('month_start', 'is_month_start'),
        ('month_end', 'is_month_end'),
        ('quarter_start', 'is_quarter_start'),
        ('quarter_end', 'is_quarter_end'),
    )
    for feature, attribute in calendar_attributes:
        data[feature] = getattr(data['date'].dt, attribute).astype(object)
    return data

#Add the date-derived features to the training frame. initial_engineering
#modifies its argument in place and returns it, so data and input_df refer to
#the same object after this line.
data = initial_engineering(input_df)

def add_previous_sales(data):

    '''Add the previous day's sales as a predictor.

    The frame is sorted by date and then by the encoded product, country and
    store columns. Each row's previous_sales is the num_sold found one day's
    worth of rows earlier in that ordering; the rows of the earliest date have
    no predecessor and receive the overall mean of num_sold instead.

    This assumes every date holds the same number of rows in the same
    product/country/store order -- TODO confirm for new data sources.

    Parameters
    ----------
    data : DataFrame with date, num_sold and the encoded columns used for
        sorting (product_1..3, country_1..2, store_1..2).

    Returns
    -------
    A sorted copy of data with a previous_sales column appended; the input
    frame itself is not modified.
    '''

    #Use the minimum rather than a [0] lookup: [0] selects by index label, so
    #it ignores the ordering and fails when no row is labelled 0
    first_date = data['date'].min()
    data_points = int((data['date'] == first_date).sum()) #Rows per day

    data = data.sort_values(by = ['date', 'product_1','product_2', 'product_3',
                                  'country_1','country_2', 'store_1', 'store_2'])

    #Shift num_sold down by one day's worth of rows, positionally, and pad the
    #first day with the mean
    sold = data['num_sold'].to_numpy()
    placeholder = np.full(data_points, data['num_sold'].mean())
    data['previous_sales'] = np.concatenate(
        [placeholder, sold[:len(sold) - data_points]])
    return data



## Data Visualization

Before we perform any modeling we want to explore any relationships between our predictor and dependent variables.

In [None]:
#Line plot of every num_sold record against its date
sales_axes = data.plot(
    x='date',
    y='num_sold',
    figsize=(15, 6),
    linestyle='--',
    color='b',
    markerfacecolor='r',
    markersize=10,
)
sales_axes.set_xlabel('Year-Month')
sales_axes.set_ylabel('Number Sold')


Based on the above time series graph we can see a clear seasonal trend with sales peaking in December and January of each year. 

Next we'll break our graph down by country, product, and store to determine if there are any differences based on these variables.

In [None]:
#relplot is a figure-level function that builds its own FacetGrid figure, so
#no plt.figure() call is made beforehand (it would only leave an empty figure
#behind). Panel size is controlled through height instead.
#One panel per store (rows) and country (columns), one line per product.
chart = sns.relplot(x="date", y="num_sold", hue="product",
            col="country", row="store", height=3,
            kind="line", estimator=None, data=data)
chart.set_xticklabels(rotation=45, horizontalalignment='right')

Breaking our data down, we can see that all products follow a similar seasonal trend regardless of product type, country, or store.

However, while they each exhibit the peaks at the same time there are several differences worth noting:
* Product sales are impacted by store; this can be seen clearly comparing hat and mug sales in Finland for KaggleRama versus KaggleMart
* Sales differ across country; this can be seen comparing KaggleRama sales for Finland, Norway, and Sweden


In [None]:
#Autocorrelation and partial autocorrelation of the num_sold column
sold_series = data['num_sold']
tsa_plots = sm.graphics.tsa
tsa_plots.plot_acf(sold_series)
tsa_plots.plot_pacf(sold_series)


Given that we'll be using XGBoost as our model of choice, we must first encode our categorical variables to numeric. We'll use OneHotEncoder for this so that our algorithm doesn't mistakenly assume a numeric relationship between different levels of our categorical variables.

In [None]:
#Run the feature engineering step on the training frame
data = initial_engineering(data)

#One-hot encode every categorical column; the encoder is fitted on the
#training frame and then applied to it
categorical_columns = [
    'weekday', 'quarter', 'product', 'store', 'country', 'month', 'day',
    'month_start', 'month_end', 'quarter_start', 'quarter_end',
]
encoder = ce.OneHotEncoder(cols=categorical_columns)
data = encoder.fit(data).transform(data)

#Append the previous day's sales as an extra predictor
data = add_previous_sales(data)


## Modeling

In [None]:
#Modeling
#row_id is an identifier, not a predictor
data = data.drop(['row_id'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(['num_sold'], axis=1), data['num_sold'], test_size=0.2, random_state=0)

#The raw date is dropped from the feature matrices
x_train = x_train.drop(['date'], axis=1)
x_test = x_test.drop(['date'], axis=1)

#Defining and fitting initial model
xgb_model = xgb.XGBRegressor()
xgb_model.fit(x_train, y_train)

#Defining parameters and performing hyperparameter tuning
params = {
    "colsample_bytree": [0.4, 0.5, 0.6],
    "gamma": [0.25],
    "learning_rate": [0.1],
    "max_depth": [3, 5, 7],
    "n_estimators": [100, 150],
    "subsample": [0.3, 0.4, 0.5],
}

#MAE is an error, so the scorer must be declared lower-is-better. With the
#make_scorer default (greater_is_better=True) the grid search would rank the
#parameter set with the highest error as the best one. The scores stored in
#cv_results_ are consequently the negated MAE.
gs = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=3,
    n_jobs=12,
    scoring=make_scorer(mean_absolute_error, greater_is_better=False),
    verbose=1,
)
fitted_model = gs.fit(x_train, y_train)


#Evaluating test accuracy
#NOTE(review): this scores the default-parameter xgb_model fitted above, not
#the tuned gs.best_estimator_ -- confirm which model is meant to be reported
prediction = xgb_model.predict(x_test)
MAE = mean_absolute_error(y_test, prediction)
print(MAE)


Viewing results

In [None]:
#Collect the cross-validation results of the grid search into a table
grid_scores = pd.DataFrame(gs.cv_results_)

#Mean test score against colsample_bytree: one panel per
#(n_estimators, max_depth) pair, one line per subsample value
facet_layout = dict(col="param_max_depth", row="param_n_estimators",
                    hue="param_subsample")
chart = sns.relplot(data=grid_scores,
                    x="param_colsample_bytree", y="mean_test_score",
                    kind="line", estimator=None, height=3,
                    **facet_layout)

#Feature importances of xgb_model
xgb.plot_importance(xgb_model)

## Making Predictions

In [None]:
def make_prediction(test, model, data):

    '''Predict num_sold for the test set one day at a time.

    Every day's rows receive a previous_sales feature before prediction: the
    first test day uses the num_sold values of the trailing rows of the
    training frame, and each later day uses the predictions made for the day
    before it.

    Parameters
    ----------
    test : DataFrame with row_id, date and the encoded feature columns
        (including product_1..3, country_1..2, store_1..2 used for sorting).
    model : fitted estimator exposing predict().
    data : training DataFrame containing num_sold. Assumed to be ordered like
        the sorted test frame (by date, then product/country/store columns) so
        that its trailing rows line up with the first test day -- TODO confirm
        against the caller.

    Returns
    -------
    DataFrame with columns row_id and num_sold, in the sorted test order.
    '''

    test = test.sort_values(by = ['date', 'product_1','product_2', 'product_3',
                                  'country_1','country_2', 'store_1', 'store_2'])
    test_id = test['row_id']
    test = test.drop(['row_id'], axis = 1)

    daily_predictions = []
    previous_sales = None
    #test is sorted by date, so unique() yields the days in ascending order
    for day in test['date'].unique():
        #Copy so that adding a column never writes into a slice of test
        prediction_set = test[test['date'] == day].copy()

        if previous_sales is None:
            #First test day: use the trailing training sales. The row count is
            #taken from the test day itself (a test date need not exist in
            #data), and the values are taken positionally because the train
            #and test indexes do not line up.
            previous_sales = data['num_sold'].tail(len(prediction_set)).to_numpy()

        prediction_set['previous_sales'] = previous_sales
        days_prediction = np.asarray(
            model.predict(prediction_set.drop(['date'], axis=1)))
        daily_predictions.append(days_prediction)
        #This day's predictions become the next day's previous_sales
        previous_sales = days_prediction

    if daily_predictions:
        predictions = np.concatenate(daily_predictions, axis = 0)
    else:
        predictions = np.array([])

    return pd.DataFrame({ 'row_id': test_id, 'num_sold': predictions })

#Loading testdata and performing initial feature engineering
test = pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv")
test = initial_engineering(test)

#Encoding categorical variables
#The encoder is fitted on the training features, not on the test set. A
#one-hot encoder's category-to-column assignment depends on the data it was
#fitted on, so an encoder fitted separately on the test set is not guaranteed
#to produce columns that mean the same thing as the columns the model was
#trained on (weekday is one example, since the two files start on different
#weekdays). num_sold is dropped so the fitted frame has the same columns as test.
train_features = initial_engineering(
    pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv")
).drop(['num_sold'], axis=1)
encoder = ce.OneHotEncoder(cols=['weekday', 'quarter', 'product', 'store', 'country', 'month', 'day',
                                 'month_start', 'month_end', 'quarter_start', 'quarter_end'])
encoder.fit(train_features)
test = encoder.transform(test)


#Making prediction and outputting csv file
#NOTE(review): xgb_model is the default-parameter model, not the grid-search
#result -- confirm this is the model intended for the submission
out_df= make_prediction(test, xgb_model, data)
out_df.to_csv('python_out_3.csv', index=False,)