In [22]:
import sys
import os
sys.path.append(os.path.abspath('..'))

from paths import (TRANSFORMED_DATA_DIR, VALIDATION_DIR)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder

from sklearn.feature_selection import mutual_info_regression

from sklearn.model_selection import TimeSeriesSplit

from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.pipeline import Pipeline

from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import mlflow
import mlflow.sklearn

import optuna

# Utilities
import pickle
import json
from datetime import datetime
import warnings

# Jupyter settings
%config IPCompleter.greedy=True
#Deactivate scientific notation
pd.options.display.float_format = '{:.2f}'.format

#Deactivate warnings
import warnings
warnings.filterwarnings("ignore")

%config IPCompleter.greedy=True

In [23]:
# Load validation data
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'work.parquet')
df

Unnamed: 0_level_0,state_id,store_id,dept_id,cat_id,item_id,wm_yr_wk,d,sales,sell_price,year,month,wday,weekday,event_name_1,event_type_1,event_name_2,event_type_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_090,11249,d_704,0,1.25,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_120,11249,d_704,0,4.98,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_202,11249,d_704,20,4.28,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_252,11249,d_704,34,1.48,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_288,11249,d_704,0,,2013,1,4,Tuesday,NewYear,National,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-30,CA,CA_4,FOODS_3,FOODS,FOODS_3_329,11544,d_1767,5,2.98,2015,11,3,Monday,,,,
2015-11-30,CA,CA_4,FOODS_3,FOODS,FOODS_3_555,11544,d_1767,4,1.68,2015,11,3,Monday,,,,
2015-11-30,CA,CA_4,FOODS_3,FOODS,FOODS_3_586,11544,d_1767,9,1.68,2015,11,3,Monday,,,,
2015-11-30,CA,CA_4,FOODS_3,FOODS,FOODS_3_587,11544,d_1767,13,2.48,2015,11,3,Monday,,,,


In [12]:
# Load validation data
data = pd.read_parquet(VALIDATION_DIR / 'validation.parquet')
data

Unnamed: 0_level_0,state_id,store_id,dept_id,cat_id,item_id,wm_yr_wk,d,sales,sell_price,year,month,wday,weekday,event_name_1,event_type_1,event_name_2,event_type_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2015-12-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_090,11544,d_1768,57,1.50,2015,12,4,Tuesday,,,,
2015-12-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_120,11544,d_1768,109,4.98,2015,12,4,Tuesday,,,,
2015-12-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_202,11544,d_1768,17,4.58,2015,12,4,Tuesday,,,,
2015-12-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_252,11544,d_1768,29,1.58,2015,12,4,Tuesday,,,,
2015-12-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_288,11544,d_1768,25,1.50,2015,12,4,Tuesday,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-31,CA,CA_4,FOODS_3,FOODS,FOODS_3_329,11548,d_1798,6,2.98,2015,12,6,Thursday,,,,
2015-12-31,CA,CA_4,FOODS_3,FOODS,FOODS_3_555,11548,d_1798,5,1.68,2015,12,6,Thursday,,,,
2015-12-31,CA,CA_4,FOODS_3,FOODS,FOODS_3_586,11548,d_1798,11,1.68,2015,12,6,Thursday,,,,
2015-12-31,CA,CA_4,FOODS_3,FOODS,FOODS_3_587,11548,d_1798,16,2.48,2015,12,6,Thursday,,,,


### Data cleaning

In [24]:
# Function to clean and prepare data
def clean_data(df):
    
    # Modify data types
    temp = df.astype({'month': 'O', 'wday': 'O'})             
    
    # Impute missing values
    temp.loc[df['event_name_1'].isna(), 'event_name_1'] = 'No_event'
    
    def impute_mode(records):
        # Calculate the mode of the price for that product
        mode = records.sell_price.mode()[0]
        # Impute missing values
        records.loc[records.sell_price.isna(), 'sell_price'] = mode
        # Return all records of the product
        return records

    temp = temp.groupby('item_id', group_keys=False).apply(impute_mode)
      
    return temp

# Test it
cleaned_data = clean_data(df)
cleaned_data

Unnamed: 0_level_0,state_id,store_id,dept_id,cat_id,item_id,wm_yr_wk,d,sales,sell_price,year,month,wday,weekday,event_name_1,event_type_1,event_name_2,event_type_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_090,11249,d_704,0,1.25,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA,CA_4,FOODS_3,FOODS,FOODS_3_090,11249,d_704,33,1.25,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_120,11249,d_704,0,4.98,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA,CA_4,FOODS_3,FOODS,FOODS_3_120,11249,d_704,0,4.98,2013,1,4,Tuesday,NewYear,National,,
2013-01-01,CA,CA_3,FOODS_3,FOODS,FOODS_3_202,11249,d_704,20,4.28,2013,1,4,Tuesday,NewYear,National,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-30,CA,CA_4,FOODS_3,FOODS,FOODS_3_586,11544,d_1767,9,1.68,2015,11,3,Monday,No_event,,,
2015-11-30,CA,CA_3,FOODS_3,FOODS,FOODS_3_587,11544,d_1767,26,2.48,2015,11,3,Monday,No_event,,,
2015-11-30,CA,CA_4,FOODS_3,FOODS,FOODS_3_587,11544,d_1767,13,2.48,2015,11,3,Monday,No_event,,,
2015-11-30,CA,CA_3,FOODS_3,FOODS,FOODS_3_714,11544,d_1767,11,1.58,2015,11,3,Monday,No_event,,,


### Feature engineering

In [18]:
def create_variables(x):
    
    # Intermittent demand
    
    def stock_break(sales, n = 5):
        zero_sales = pd.Series(np.where(sales == 0,1,0))
        num_zeros = zero_sales.rolling(n).sum()
        stock_break = np.where(num_zeros == n,1,0)
        return(stock_break)
    
    x = x.sort_values(by = ['store_id','item_id','date'])
    x['stock_break_3'] = x.groupby(['store_id','item_id']).sales.transform(lambda x: stock_break(x, 3)).values
    x['stock_break_7'] = x.groupby(['store_id','item_id']).sales.transform(lambda x: stock_break(x,7)).values
    x['stock_break_15'] = x.groupby(['store_id','item_id']).sales.transform(lambda x: stock_break(x,15)).values
    
    
    #LAGS
    
    def create_lags(x, variable, num_lags = 7):
        lags = pd.DataFrame()
        for each in range(1,num_lags+1):
            lags[variable + '_lag_'+ str(each)] = x[variable].shift(each)
        return(lags)
    
    lags_sell_price_df = (x.groupby(['store_id', 'item_id'])
                        .apply(lambda x: create_lags(x = x, variable = 'sell_price', num_lags= 7))
                        .reset_index()
                        .set_index('date'))
    
    lags_stock_break_3_df = (x.groupby(['store_id','item_id'])
                            .apply(lambda x: create_lags(x = x, variable = 'stock_break_3', num_lags= 1))
                            .reset_index()
                            .set_index('date'))
    
    lags_stock_break_7_df = (x.groupby(['store_id','item_id'])
                            .apply(lambda x: create_lags(x = x, variable = 'stock_break_7', num_lags= 1))
                            .reset_index()
                            .set_index('date'))
    
    lags_stock_break_15_df = (x.groupby(['store_id','item_id'])
                            .apply(lambda x: create_lags(x = x, variable = 'stock_break_15', num_lags= 1))
                            .reset_index()
                            .set_index('date'))
    
    lags_sales_df = (x.groupby(['store_id','item_id'])
                    .apply(lambda x: create_lags(x = x, variable = 'sales', num_lags= 15))
                    .reset_index()
                    .set_index('date'))
    
    
    #MOVING WINDOWS
    
    def min_moving(x, variable, num_periods = 7):
        minm = pd.DataFrame()
        for each in range(2,num_periods+1):
            minm[variable + '_minm_' + str(each)] = x[variable].shift(1).rolling(each).min()
        return(minm)
    
    def moving_average(x, variable, num_periods = 7):
        mm = pd.DataFrame()
        for each in range(2,num_periods+1):
            mm[variable + '_mm_' + str(each)] = x[variable].shift(1).rolling(each).mean()
        return(mm)
    
    def max_moving(x, variable, num_periods = 7):
        maxm = pd.DataFrame()
        for each in range(2,num_periods+1):
            maxm[variable + '_maxm_' + str(each)] = x[variable].shift(1).rolling(each).max()
        return(maxm)
    
    min_moving_df = (x.groupby(['store_id','item_id'])
                  .apply(lambda x: min_moving(x = x, variable = 'sales', num_periods= 15))
                  .reset_index()
                  .set_index('date'))
    
    moving_average_df = (x.groupby(['store_id','item_id'])
                    .apply(lambda x: moving_average(x = x, variable = 'sales', num_periods= 15))
                    .reset_index()
                    .set_index('date'))
    
    max_moving_df = (x.groupby(['store_id','item_id'])
                    .apply(lambda x: max_moving(x = x, variable = 'sales', num_periods= 15))
                    .reset_index()
                    .set_index('date'))
    
    
    # Join all dataframes

    x_joined = pd.concat([x,
                      lags_sell_price_df,
                      lags_stock_break_3_df,
                      lags_stock_break_7_df,
                      lags_stock_break_15_df,
                      lags_sales_df,
                      min_moving_df,
                      moving_average_df,
                      max_moving_df], axis = 1)

    # Eliminate duplicate columns
    x_joined = x_joined.loc[:,~x_joined.columns.duplicated()]
    
    x_joined.dropna(inplace=True)
    
    x_joined.drop(columns = ['sell_price','stock_break_3','stock_break_7','stock_break_15'],
                  inplace=True)
    
    # Create a single variable for the product
    x_joined.insert(loc=0,column='product',value=x_joined.store_id + '_'+ x_joined.item_id)
    x_joined = x_joined.drop(columns = ['store_id','item_id'])
    
    return(x_joined)

In [25]:
result = create_variables(cleaned_data)
result

Unnamed: 0_level_0,product,state_id,dept_id,cat_id,wm_yr_wk,d,sales,year,month,wday,...,sales_maxm_6,sales_maxm_7,sales_maxm_8,sales_maxm_9,sales_maxm_10,sales_maxm_11,sales_maxm_12,sales_maxm_13,sales_maxm_14,sales_maxm_15
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-05-05,CA_3_FOODS_3_090,CA,FOODS_3,FOODS,11315,d_828,175,2013,5,2,...,266.0,266.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0,276.0
2014-04-20,CA_3_FOODS_3_090,CA,FOODS_3,FOODS,11412,d_1178,245,2014,4,2,...,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0,243.0
2014-06-15,CA_3_FOODS_3_090,CA,FOODS_3,FOODS,11420,d_1234,168,2014,6,2,...,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0
2013-05-05,CA_3_FOODS_3_120,CA,FOODS_3,FOODS,11315,d_828,0,2013,5,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2014-04-20,CA_3_FOODS_3_120,CA,FOODS_3,FOODS,11412,d_1178,59,2014,4,2,...,72.0,80.0,80.0,80.0,80.0,87.0,87.0,87.0,87.0,87.0
2014-06-15,CA_3_FOODS_3_120,CA,FOODS_3,FOODS,11420,d_1234,49,2014,6,2,...,89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,109.0,109.0
2013-05-05,CA_3_FOODS_3_202,CA,FOODS_3,FOODS,11315,d_828,48,2013,5,2,...,50.0,50.0,50.0,50.0,50.0,50.0,50.0,56.0,56.0,56.0
2014-04-20,CA_3_FOODS_3_202,CA,FOODS_3,FOODS,11412,d_1178,76,2014,4,2,...,63.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0
2014-06-15,CA_3_FOODS_3_202,CA,FOODS_3,FOODS,11420,d_1234,3,2014,6,2,...,3.0,3.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,42.0
2013-05-05,CA_3_FOODS_3_252,CA,FOODS_3,FOODS,11315,d_828,41,2013,5,2,...,70.0,70.0,70.0,70.0,70.0,70.0,70.0,78.0,78.0,78.0
