# FEATURE ENGINEERING

## IMPORT PACKAGES


In [1]:
import sys
import os
sys.path.append(os.path.abspath('..')) 

from paths import (
    TRANSFORMED_DATA_DIR,
    VALIDATION_DIR
)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder
#from category_encoders import TargetEncoder

%config IPCompleter.greedy=True

## IMPORT DATA

In [3]:
# Read the categorical and numerical tables from the transformed-data directory.
cat_path = TRANSFORMED_DATA_DIR / 'cat_result_quality.parquet'
num_path = TRANSFORMED_DATA_DIR / 'num_result_quality.parquet'
cat = pd.read_parquet(cat_path)
num = pd.read_parquet(num_path)

## NEW FEATURES

We already have the following features:

1. Date Components:

- Year, Month, Day: These are fundamental components that can help capture seasonal trends and patterns in the data.

- Weekday: This can help identify weekly patterns, such as increased sales on weekends.

- Week of the Year: This can capture seasonal effects that occur at specific times of the year.

2. Calendar Variables:

- Holidays and Events: Identifying whether a date corresponds to a holiday or special event can be crucial, as these often lead to spikes in sales.

- Promotional Periods: Variables indicating promotional events can help capture the impact of marketing efforts on sales.

We are creating the following features:

- Lag variables: values of a variable at previous time steps. For example, if you are forecasting sales, a lag variable might be the sales from the previous day, week, or month.

    - Purpose:
    
        - Capturing Temporal Dependencies: Lag variables help capture the temporal dependencies in time series data. For instance, sales today may depend on sales from the previous day or week.

        - Improving Forecast Accuracy: Including lagged values can improve the accuracy of forecasting models by providing historical context.

- Rolling windows: statistics computed over a moving window of a variable. For example, a rolling window feature might be the mean of the sales over the previous 7, 14, or 30 days.

    - Purpose:

        - Smoothing Trends: Rolling windows help smooth out short-term fluctuations and highlight longer-term trends in the data. This can be particularly useful in time series forecasting.

        - Capturing Seasonality: Rolling statistics can help capture seasonal patterns that may not be evident from individual time points. For example, a rolling average can show how sales typically increase during certain months or seasons.

        - Feature Engineering: Rolling window features can serve as additional predictors in your forecasting model, providing more context about recent trends.


For this phase, we need to join the cat and num dataframes again.

In [4]:
# Identifier columns that are discarded right after the join.
columns_to_drop = [
    'id',
    'dept_id',
    'cat_id',
    'state_id',
]

In [5]:
# Place the categorical and numerical columns side by side (column-wise concat),
# then discard the identifier columns; names missing from the frame are ignored.
df = pd.concat([cat, num], axis=1)
df = df.drop(columns=columns_to_drop, errors='ignore')
df

Unnamed: 0_level_0,item_id,store_id,d,weekday,wday,month,year,event_name_1,event_type_1,sales,wm_yr_wk,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-01-01,FOODS_3_090,CA_3,d_704,Tuesday,4,1,2013,NewYear,National,0,11249,1.25
2013-01-01,FOODS_3_090,CA_4,d_704,Tuesday,4,1,2013,NewYear,National,33,11249,1.25
2013-01-01,FOODS_3_120,CA_3,d_704,Tuesday,4,1,2013,NewYear,National,0,11249,4.98
2013-01-01,FOODS_3_120,CA_4,d_704,Tuesday,4,1,2013,NewYear,National,0,11249,4.98
2013-01-01,FOODS_3_202,CA_3,d_704,Tuesday,4,1,2013,NewYear,National,20,11249,4.28
...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-30,FOODS_3_586,CA_4,d_1767,Monday,3,11,2015,No_event,No_event,9,11544,1.68
2015-11-30,FOODS_3_587,CA_3,d_1767,Monday,3,11,2015,No_event,No_event,26,11544,2.48
2015-11-30,FOODS_3_587,CA_4,d_1767,Monday,3,11,2015,No_event,No_event,13,11544,2.48
2015-11-30,FOODS_3_714,CA_3,d_1767,Monday,3,11,2015,No_event,No_event,11,11544,1.58


### Intermittent Demand Variable

This variable flags whether a series has just gone through several consecutive days with zero sales.

We define a stock break as the last n days (including the current day) all having zero sales.

We can create several by changing n.

In [6]:
def stock_break(sales, n = 5):
    """Flag the positions where the last ``n`` observations were all zero.

    Parameters
    ----------
    sales : array-like
        Values compared element-wise against 0 (a pandas Series in this
        notebook).
    n : int, default 5
        Length of the trailing window, current observation included.

    Returns
    -------
    numpy.ndarray
        1 where the current and the previous ``n - 1`` observations are all
        zero, 0 otherwise. The first ``n - 1`` positions are always 0 because
        their window is incomplete.
    """
    # Positional 0/1 indicator of zero-sales observations (the input index is discarded).
    zero_flags = pd.Series(np.where(sales == 0, 1, 0))
    # A window sum equal to n means every observation in the window was zero.
    zeros_in_window = zero_flags.rolling(n).sum()
    return np.where(zeros_in_window == n, 1, 0)

In [7]:
# Order the rows by series (store, item) and chronologically within each series.
sort_keys = ['store_id', 'item_id', 'date']
df = df.sort_values(by = sort_keys)
df

Unnamed: 0_level_0,item_id,store_id,d,weekday,wday,month,year,event_name_1,event_type_1,sales,wm_yr_wk,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-01-01,FOODS_3_090,CA_3,d_704,Tuesday,4,1,2013,NewYear,National,0,11249,1.25
2013-01-02,FOODS_3_090,CA_3,d_705,Wednesday,5,1,2013,No_event,No_event,224,11249,1.25
2013-01-03,FOODS_3_090,CA_3,d_706,Thursday,6,1,2013,No_event,No_event,241,11249,1.25
2013-01-04,FOODS_3_090,CA_3,d_707,Friday,7,1,2013,No_event,No_event,232,11249,1.25
2013-01-05,FOODS_3_090,CA_3,d_708,Saturday,1,1,2013,No_event,No_event,301,11250,1.25
...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-26,FOODS_3_714,CA_4,d_1763,Thursday,6,11,2015,Thanksgiving,National,22,11543,1.58
2015-11-27,FOODS_3_714,CA_4,d_1764,Friday,7,11,2015,No_event,No_event,10,11543,1.58
2015-11-28,FOODS_3_714,CA_4,d_1765,Saturday,1,11,2015,No_event,No_event,17,11544,1.58
2015-11-29,FOODS_3_714,CA_4,d_1766,Sunday,2,11,2015,No_event,No_event,7,11544,1.58


In [11]:
# 3-day stock-break flag, computed separately for each store-item series.
# The result is assigned as a raw array, i.e. by position rather than by index label.
sales_by_series = df.groupby(['store_id', 'item_id'])['sales']
df['stock_break_3'] = sales_by_series.transform(lambda x: stock_break(x, 3)).values

In [9]:
# 7-day stock-break flag, computed separately for each store-item series.
# The result is assigned as a raw array, i.e. by position rather than by index label.
sales_by_series = df.groupby(['store_id', 'item_id'])['sales']
df['stock_break_7'] = sales_by_series.transform(lambda x: stock_break(x, 7)).values

In [10]:
# 15-day stock-break flag, computed separately for each store-item series.
# The result is assigned as a raw array, i.e. by position rather than by index label.
sales_by_series = df.groupby(['store_id', 'item_id'])['sales']
df['stock_break_15'] = sales_by_series.transform(lambda x: stock_break(x, 15)).values

### Lag variables

Let's create lag variables over the following variables:

* sales: lags of 15 days
* sell_price: lags of 7 days
* stock_break: lag of 1 day

In [12]:
def create_lags(df, variable, num_lags = 7):
    """Build lagged copies of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to lag; rows are shifted in their current order.
    variable : str
        Name of the column to lag.
    num_lags : int, default 7
        Largest lag to generate; lags 1..num_lags are produced.

    Returns
    -------
    pandas.DataFrame
        One column per lag, named ``<variable>_lag_<k>``, indexed like ``df``.
        The first ``k`` rows of lag ``k`` are NaN.
    """
    lagged_columns = {
        f'{variable}_lag_{step}': df[variable].shift(step)
        for step in range(1, num_lags + 1)
    }
    return pd.DataFrame(lagged_columns)

In [13]:
# Lagged sell prices (1 to 7 days back), computed independently per store-item series.
# Only the lagged column is selected before apply, so the grouping columns are not
# handed to create_lags (passing them is deprecated in recent pandas). store_id and
# item_id are recovered from the group keys in the result index by reset_index().
lags_sell_price_df = (df.groupby(['store_id', 'item_id'])[['sell_price']]
                      .apply(lambda x: create_lags(df = x, variable = 'sell_price', num_lags = 7))
                      .reset_index()
                      .set_index('date'))

  .apply(lambda x: create_lags(df = x, variable = 'sell_price', num_lags = 7))


In [14]:
lags_sell_price_df

Unnamed: 0_level_0,store_id,item_id,sell_price_lag_1,sell_price_lag_2,sell_price_lag_3,sell_price_lag_4,sell_price_lag_5,sell_price_lag_6,sell_price_lag_7
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-01-01,CA_3,FOODS_3_090,,,,,,,
2013-01-02,CA_3,FOODS_3_090,1.25,,,,,,
2013-01-03,CA_3,FOODS_3_090,1.25,1.25,,,,,
2013-01-04,CA_3,FOODS_3_090,1.25,1.25,1.25,,,,
2013-01-05,CA_3,FOODS_3_090,1.25,1.25,1.25,1.25,,,
...,...,...,...,...,...,...,...,...,...
2015-11-26,CA_4,FOODS_3_714,1.58,1.58,1.58,1.58,1.58,1.58,1.58
2015-11-27,CA_4,FOODS_3_714,1.58,1.58,1.58,1.58,1.58,1.58,1.58
2015-11-28,CA_4,FOODS_3_714,1.58,1.58,1.58,1.58,1.58,1.58,1.58
2015-11-29,CA_4,FOODS_3_714,1.58,1.58,1.58,1.58,1.58,1.58,1.58


In [15]:
# Previous-day value of the 3-day stock-break flag, per store-item series.
# Only the lagged column is selected before apply, so the grouping columns are not
# handed to create_lags (passing them is deprecated in recent pandas). store_id and
# item_id are recovered from the group keys in the result index by reset_index().
lags_stock_break_3_df = (df.groupby(['store_id','item_id'])[['stock_break_3']]
                            .apply(lambda x: create_lags(df = x, variable = 'stock_break_3', num_lags= 1))
                            .reset_index()
                            .set_index('date'))

  .apply(lambda x: create_lags(df = x, variable = 'stock_break_3', num_lags= 1))


In [16]:
# Previous-day value of the 7-day stock-break flag, per store-item series.
# Only the lagged column is selected before apply, so the grouping columns are not
# handed to create_lags (passing them is deprecated in recent pandas). store_id and
# item_id are recovered from the group keys in the result index by reset_index().
lags_stock_break_7_df = (df.groupby(['store_id','item_id'])[['stock_break_7']]
                            .apply(lambda x: create_lags(df = x, variable = 'stock_break_7', num_lags= 1))
                            .reset_index()
                            .set_index('date'))

  .apply(lambda x: create_lags(df = x, variable = 'stock_break_7', num_lags= 1))


In [17]:
# Previous-day value of the 15-day stock-break flag, per store-item series.
# Only the lagged column is selected before apply, so the grouping columns are not
# handed to create_lags (passing them is deprecated in recent pandas). store_id and
# item_id are recovered from the group keys in the result index by reset_index().
lags_stock_break_15_df = (df.groupby(['store_id','item_id'])[['stock_break_15']]
                            .apply(lambda x: create_lags(df = x, variable = 'stock_break_15', num_lags= 1))
                            .reset_index()
                            .set_index('date'))

  .apply(lambda x: create_lags(df = x, variable = 'stock_break_15', num_lags= 1))


In [18]:
# Lagged sales (1 to 15 days back), computed independently per store-item series.
# Only the lagged column is selected before apply, so the grouping columns are not
# handed to create_lags (passing them is deprecated in recent pandas). store_id and
# item_id are recovered from the group keys in the result index by reset_index().
lags_sales_df = (df.groupby(['store_id','item_id'])[['sales']]
                    .apply(lambda x: create_lags(df = x, variable = 'sales', num_lags= 15))
                    .reset_index()
                    .set_index('date'))

  .apply(lambda x: create_lags(df = x, variable = 'sales', num_lags= 15))


### Rolling windows variables

Let's create rolling window variables over `sales`, using the following statistics:

* rolling minimum
* rolling mean
* rolling maximum

Each statistic is computed for window sizes from 2 to 15 days, using only the days before the current one.

In [19]:
def min_rolling_window(df, variable, num_periods = 7):
    """Rolling minimum of a column over the preceding rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; rows are used in their current order.
    variable : str
        Name of the column to summarise.
    num_periods : int, default 7
        Largest window size; one column is produced for each size 2..num_periods.

    Returns
    -------
    pandas.DataFrame
        Columns ``<variable>_minm_<k>``, indexed like ``df``. The column is shifted
        by one row before rolling, so each window ends at the previous row and
        never includes the current value.
    """
    rolling_minima = {
        f'{variable}_minm_{window}': df[variable].shift(1).rolling(window).min()
        for window in range(2, num_periods + 1)
    }
    return pd.DataFrame(rolling_minima)

In [20]:
def mean_rolling_window(df, variable, num_periods = 7):
    """Rolling mean of a column over the preceding rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; rows are used in their current order.
    variable : str
        Name of the column to summarise.
    num_periods : int, default 7
        Largest window size; one column is produced for each size 2..num_periods.

    Returns
    -------
    pandas.DataFrame
        Columns ``<variable>_mm_<k>``, indexed like ``df``. The column is shifted
        by one row before rolling, so each window ends at the previous row and
        never includes the current value.
    """
    rolling_means = {
        f'{variable}_mm_{window}': df[variable].shift(1).rolling(window).mean()
        for window in range(2, num_periods + 1)
    }
    return pd.DataFrame(rolling_means)

In [21]:
def max_rolling_window(df, variable, num_periods = 7):
    """Rolling maximum of a column over the preceding rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; rows are used in their current order.
    variable : str
        Name of the column to summarise.
    num_periods : int, default 7
        Largest window size; one column is produced for each size 2..num_periods.

    Returns
    -------
    pandas.DataFrame
        Columns ``<variable>_maxm_<k>``, indexed like ``df``. The column is shifted
        by one row before rolling, so each window ends at the previous row and
        never includes the current value.
    """
    rolling_maxima = {
        f'{variable}_maxm_{window}': df[variable].shift(1).rolling(window).max()
        for window in range(2, num_periods + 1)
    }
    return pd.DataFrame(rolling_maxima)

In [22]:
# Rolling minimum of sales (windows of 2 to 15 days), per store-item series.
# Only the sales column is selected before apply, so the grouping columns are not
# handed to min_rolling_window (passing them is deprecated in recent pandas).
# store_id and item_id are recovered from the group keys by reset_index().
min_rolling_window_df = (df.groupby(['store_id','item_id'])[['sales']]
                          .apply(lambda x: min_rolling_window(df = x, variable = 'sales', num_periods= 15))
                          .reset_index()
                          .set_index('date'))

  .apply(lambda x: min_rolling_window(df = x, variable = 'sales', num_periods= 15))


In [23]:
# Rolling mean of sales (windows of 2 to 15 days), per store-item series.
# Only the sales column is selected before apply, so the grouping columns are not
# handed to mean_rolling_window (passing them is deprecated in recent pandas).
# store_id and item_id are recovered from the group keys by reset_index().
mean_rolling_window_df = (df.groupby(['store_id','item_id'])[['sales']]
                          .apply(lambda x: mean_rolling_window(df = x, variable = 'sales', num_periods= 15))
                          .reset_index()
                          .set_index('date'))

  .apply(lambda x: mean_rolling_window(df = x, variable = 'sales', num_periods= 15))


In [24]:
# Rolling maximum of sales (windows of 2 to 15 days), per store-item series.
# Only the sales column is selected before apply, so the grouping columns are not
# handed to max_rolling_window (passing them is deprecated in recent pandas).
# store_id and item_id are recovered from the group keys by reset_index().
max_rolling_window_df = (df.groupby(['store_id','item_id'])[['sales']]
                          .apply(lambda x: max_rolling_window(df = x, variable = 'sales', num_periods= 15))
                          .reset_index()
                          .set_index('date'))

  .apply(lambda x: max_rolling_window(df = x, variable = 'sales', num_periods= 15))


## PREPARE DATAFRAMES

### Join all generated dataframes

In [25]:
# Column-wise concatenation of the base table with every generated feature table.
# NOTE(review): the date index is not unique, so this presumably relies on every
# frame having exactly the same row order (store_id, item_id, date) — confirm.
feature_frames = [
    df,
    lags_sell_price_df,
    lags_stock_break_3_df,
    lags_stock_break_7_df,
    lags_stock_break_15_df,
    lags_sales_df,
    min_rolling_window_df,
    mean_rolling_window_df,
    max_rolling_window_df,
]
df_joined = pd.concat(feature_frames, axis = 1)

# store_id and item_id arrive once per feature table: keep only the first
# occurrence of every column name
repeated_column = df_joined.columns.duplicated()
df_joined = df_joined.loc[:, ~repeated_column]
df_joined

Unnamed: 0_level_0,item_id,store_id,d,weekday,wday,month,year,event_name_1,event_type_1,sales,...,sales_maxm_6,sales_maxm_7,sales_maxm_8,sales_maxm_9,sales_maxm_10,sales_maxm_11,sales_maxm_12,sales_maxm_13,sales_maxm_14,sales_maxm_15
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,FOODS_3_090,CA_3,d_704,Tuesday,4,1,2013,NewYear,National,0,...,,,,,,,,,,
2013-01-02,FOODS_3_090,CA_3,d_705,Wednesday,5,1,2013,No_event,No_event,224,...,,,,,,,,,,
2013-01-03,FOODS_3_090,CA_3,d_706,Thursday,6,1,2013,No_event,No_event,241,...,,,,,,,,,,
2013-01-04,FOODS_3_090,CA_3,d_707,Friday,7,1,2013,No_event,No_event,232,...,,,,,,,,,,
2013-01-05,FOODS_3_090,CA_3,d_708,Saturday,1,1,2013,No_event,No_event,301,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-26,FOODS_3_714,CA_4,d_1763,Thursday,6,11,2015,Thanksgiving,National,22,...,15.0,15.0,15.0,15.0,15.0,17.0,17.0,17.0,17.0,17.0
2015-11-27,FOODS_3_714,CA_4,d_1764,Friday,7,11,2015,No_event,No_event,10,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
2015-11-28,FOODS_3_714,CA_4,d_1765,Saturday,1,11,2015,No_event,No_event,17,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
2015-11-29,FOODS_3_714,CA_4,d_1766,Sunday,2,11,2015,No_event,No_event,7,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0


### Eliminate the nulls that have been generated by the new variables

In [26]:
# Discard every row that has a missing value in any column (e.g. the first rows of
# each series, whose lag and rolling windows are still incomplete).
df_joined = df_joined.dropna()

### Eliminate the variables that we are not going to need for modeling

In [29]:
# Columns removed before modeling: identifiers plus the same-day price and
# stock-break values (only their lagged versions are kept).
to_eliminate = [
    'd',
    'wm_yr_wk',
    'sell_price',
    'stock_break_3',
    'stock_break_7',
    'stock_break_15',
]

In [30]:
# Remove the columns listed in to_eliminate.
df_joined = df_joined.drop(columns=to_eliminate)

In [38]:

df_joined

Unnamed: 0_level_0,item_id,store_id,weekday,wday,month,year,event_name_1,event_type_1,sales,sell_price_lag_1,...,sales_maxm_6,sales_maxm_7,sales_maxm_8,sales_maxm_9,sales_maxm_10,sales_maxm_11,sales_maxm_12,sales_maxm_13,sales_maxm_14,sales_maxm_15
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-16,FOODS_3_090,CA_3,Wednesday,5,1,2013,No_event,No_event,191,1.25,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
2013-01-17,FOODS_3_090,CA_3,Thursday,6,1,2013,No_event,No_event,170,1.25,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
2013-01-18,FOODS_3_090,CA_3,Friday,7,1,2013,No_event,No_event,224,1.25,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
2013-01-19,FOODS_3_090,CA_3,Saturday,1,1,2013,No_event,No_event,362,1.25,...,281.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
2013-01-20,FOODS_3_090,CA_3,Sunday,2,1,2013,No_event,No_event,255,1.25,...,362.0,362.0,362.0,362.0,362.0,362.0,362.0,362.0,362.0,362.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-26,FOODS_3_714,CA_4,Thursday,6,11,2015,Thanksgiving,National,22,1.58,...,15.0,15.0,15.0,15.0,15.0,17.0,17.0,17.0,17.0,17.0
2015-11-27,FOODS_3_714,CA_4,Friday,7,11,2015,No_event,No_event,10,1.58,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
2015-11-28,FOODS_3_714,CA_4,Saturday,1,11,2015,No_event,No_event,17,1.58,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
2015-11-29,FOODS_3_714,CA_4,Sunday,2,11,2015,No_event,No_event,7,1.58,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0


In [43]:
df_joined.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 20980 entries, 2013-01-16 to 2015-11-30
Data columns (total 76 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   item_id               20980 non-null  object 
 1   store_id              20980 non-null  object 
 2   weekday               20980 non-null  object 
 3   wday                  20980 non-null  object 
 4   month                 20980 non-null  object 
 5   year                  20980 non-null  object 
 6   event_name_1          20980 non-null  object 
 7   event_type_1          20980 non-null  object 
 8   sales                 20980 non-null  int64  
 9   sell_price_lag_1      20980 non-null  float64
 10  sell_price_lag_2      20980 non-null  float64
 11  sell_price_lag_3      20980 non-null  float64
 12  sell_price_lag_4      20980 non-null  float64
 13  sell_price_lag_5      20980 non-null  float64
 14  sell_price_lag_6      20980 non-null  float64
 15  se

In [42]:
# Store the calendar fields with object dtype so the dtype-based split below
# routes them to the categorical table.
calendar_columns = ['weekday', 'wday', 'month', 'year']
df_joined = df_joined.astype({column: 'object' for column in calendar_columns})

### Identify the target

In [44]:
# The variable to predict: sales per store, item and day.
target = df_joined['sales']

### Split the dataframe into categorical and numerical

In [45]:
# Categorical table: every column stored with object dtype.
cat = df_joined.select_dtypes(include=object)

In [46]:
cat

Unnamed: 0_level_0,item_id,store_id,weekday,wday,month,year,event_name_1,event_type_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-01-16,FOODS_3_090,CA_3,Wednesday,5,1,2013,No_event,No_event
2013-01-17,FOODS_3_090,CA_3,Thursday,6,1,2013,No_event,No_event
2013-01-18,FOODS_3_090,CA_3,Friday,7,1,2013,No_event,No_event
2013-01-19,FOODS_3_090,CA_3,Saturday,1,1,2013,No_event,No_event
2013-01-20,FOODS_3_090,CA_3,Sunday,2,1,2013,No_event,No_event
...,...,...,...,...,...,...,...,...
2015-11-26,FOODS_3_714,CA_4,Thursday,6,11,2015,Thanksgiving,National
2015-11-27,FOODS_3_714,CA_4,Friday,7,11,2015,No_event,No_event
2015-11-28,FOODS_3_714,CA_4,Saturday,1,11,2015,No_event,No_event
2015-11-29,FOODS_3_714,CA_4,Sunday,2,11,2015,No_event,No_event


In [47]:
# Numerical table: every column that is not object dtype (this includes the target, sales).
num = df_joined.select_dtypes(exclude=object)

In [48]:
num

Unnamed: 0_level_0,sales,sell_price_lag_1,sell_price_lag_2,sell_price_lag_3,sell_price_lag_4,sell_price_lag_5,sell_price_lag_6,sell_price_lag_7,stock_break_3_lag_1,stock_break_7_lag_1,...,sales_maxm_6,sales_maxm_7,sales_maxm_8,sales_maxm_9,sales_maxm_10,sales_maxm_11,sales_maxm_12,sales_maxm_13,sales_maxm_14,sales_maxm_15
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-16,191,1.25,1.25,1.25,1.25,1.25,1.25,1.25,0.0,0.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
2013-01-17,170,1.25,1.25,1.25,1.25,1.25,1.25,1.25,0.0,0.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
2013-01-18,224,1.25,1.25,1.25,1.25,1.25,1.25,1.25,0.0,0.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
2013-01-19,362,1.25,1.25,1.25,1.25,1.25,1.25,1.25,0.0,0.0,...,281.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
2013-01-20,255,1.25,1.25,1.25,1.25,1.25,1.25,1.25,0.0,0.0,...,362.0,362.0,362.0,362.0,362.0,362.0,362.0,362.0,362.0,362.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-11-26,22,1.58,1.58,1.58,1.58,1.58,1.58,1.58,0.0,0.0,...,15.0,15.0,15.0,15.0,15.0,17.0,17.0,17.0,17.0,17.0
2015-11-27,10,1.58,1.58,1.58,1.58,1.58,1.58,1.58,0.0,0.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
2015-11-28,17,1.58,1.58,1.58,1.58,1.58,1.58,1.58,0.0,0.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
2015-11-29,7,1.58,1.58,1.58,1.58,1.58,1.58,1.58,0.0,0.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0


## CATEGORICAL VARIABLES TRANSFORMATION

### One Hot Encoding

#### Variables to apply OHE

In [49]:
# Categorical columns to one-hot encode.
var_ohe = [
    'year',
    'month',
    'wday',
    'weekday',
    'event_name_1',
    'event_type_1',
]

#### Instantiate the OneHotEncoder

In [50]:
# Dense output (a plain array instead of a sparse matrix); categories not seen
# during fit are ignored at transform time instead of raising an error.
ohe = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
)

#### Train and apply the OneHotEncoder

In [51]:
cat

Unnamed: 0_level_0,item_id,store_id,weekday,wday,month,year,event_name_1,event_type_1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-01-16,FOODS_3_090,CA_3,Wednesday,5,1,2013,No_event,No_event
2013-01-17,FOODS_3_090,CA_3,Thursday,6,1,2013,No_event,No_event
2013-01-18,FOODS_3_090,CA_3,Friday,7,1,2013,No_event,No_event
2013-01-19,FOODS_3_090,CA_3,Saturday,1,1,2013,No_event,No_event
2013-01-20,FOODS_3_090,CA_3,Sunday,2,1,2013,No_event,No_event
...,...,...,...,...,...,...,...,...
2015-11-26,FOODS_3_714,CA_4,Thursday,6,11,2015,Thanksgiving,National
2015-11-27,FOODS_3_714,CA_4,Friday,7,11,2015,No_event,No_event
2015-11-28,FOODS_3_714,CA_4,Saturday,1,11,2015,No_event,No_event
2015-11-29,FOODS_3_714,CA_4,Sunday,2,11,2015,No_event,No_event


In [52]:
# Learn the categories of the selected columns and encode them in one step.
ohe_input = cat[var_ohe]
cat_ohe = ohe.fit_transform(ohe_input)

#### Save as dataframe

In [53]:
# Wrap the encoded array in a dataframe, using the encoder's generated feature
# names as column names (the index is a fresh positional one).
ohe_column_names = ohe.get_feature_names_out()
cat_ohe = pd.DataFrame(cat_ohe, columns = ohe_column_names)

### Target Encoding

In [57]:
from category_encoders import TargetEncoder 

#### Variables to apply TE

In [54]:
# Categorical columns to target-encode.
var_te = [
    'year',
    'month',
    'wday',
    'weekday',
    'event_name_1',
    'event_type_1',
]

#### Instantiate the TargetEncoder

In [58]:
# return_df=False: the encoder hands back an array rather than a dataframe.
# min_samples_leaf is the encoder's smoothing setting (see the category_encoders
# TargetEncoder documentation for its exact effect).
te = TargetEncoder(
    return_df = False,
    min_samples_leaf=100,
)

#### Train and apply the TargetEncoder

In [59]:
# Fit the target encoder on the selected categorical columns, with sales as the
# target, and encode those same rows in one step.
# NOTE(review): the encoder is fitted on all rows of df_joined (no train/validation
# split is visible in this notebook), so each encoding is derived from targets it
# will later help predict — confirm this is acceptable for the validation scheme.
cat_te = te.fit_transform(cat[var_te], y = target)

#### Save as dataframe

In [60]:
# Name every encoded column after its source variable plus a '_te' suffix
names_te = [f'{name}_te' for name in var_te]

# Wrap the encoded values in a dataframe with those column names
cat_te = pd.DataFrame(data = cat_te, columns = names_te)

## JOIN ALL TRANSFORMED DATAFRAMES

### Include all generated dataframes in a list



We retrieve the segmentation variables from df_joined.

In [61]:
# Keep the series keys and turn the date index into a regular column, which
# leaves a positional index on the result.
segmentation_columns = ['store_id', 'item_id']
from_df_joined = df_joined.loc[:, segmentation_columns].reset_index()

from_df_joined.head(2)

Unnamed: 0,date,store_id,item_id
0,2013-01-16,CA_3,FOODS_3_090
1,2013-01-17,CA_3,FOODS_3_090


### Join all dataframes

In [62]:
# The date index of num is dropped so that every piece carries the same
# positional index before the column-wise join.
num_positional = num.reset_index(drop=True)
dataframes = [from_df_joined, cat_ohe, cat_te, num_positional]

In [63]:
# Join the keys, the encoded categorical columns and the numerical columns side by side.
df_transformed = pd.concat(dataframes, axis = 'columns')

df_transformed

Unnamed: 0,date,store_id,item_id,year_2013,year_2014,year_2015,month_1,month_2,month_3,month_4,...,sales_maxm_6,sales_maxm_7,sales_maxm_8,sales_maxm_9,sales_maxm_10,sales_maxm_11,sales_maxm_12,sales_maxm_13,sales_maxm_14,sales_maxm_15
0,2013-01-16,CA_3,FOODS_3_090,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
1,2013-01-17,CA_3,FOODS_3_090,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
2,2013-01-18,CA_3,FOODS_3_090,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
3,2013-01-19,CA_3,FOODS_3_090,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,281.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0,351.0
4,2013-01-20,CA_3,FOODS_3_090,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,362.0,362.0,362.0,362.0,362.0,362.0,362.0,362.0,362.0,362.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20975,2015-11-26,CA_4,FOODS_3_714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,15.0,15.0,15.0,15.0,15.0,17.0,17.0,17.0,17.0,17.0
20976,2015-11-27,CA_4,FOODS_3_714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
20977,2015-11-28,CA_4,FOODS_3_714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
20978,2015-11-29,CA_4,FOODS_3_714,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0


## SAVE DATASET AFTER DATA TRANSFORMATION

In [64]:
# Destination file for the transformed dataset, inside the transformed-data directory
path_df_transformed = TRANSFORMED_DATA_DIR / 'df_transformed.parquet'

In [65]:
# Write the transformed dataset to the destination file in Parquet format
df_transformed.to_parquet(path_df_transformed)