# Data preprocessing, Part 4

## Import modules

In [1]:
import cudf
import numpy as np
import pandas as pd
import gc

## Load data

In [2]:
# Input/output locations: the raw data directory and the directory used for
# intermediate (processed) artifacts that this notebook reads and writes.
raw_data_dir, processed_data_dir = "./data/", "./processed_data/"

In [3]:
# NOTE: unpickling can execute arbitrary code; only load pickles that come
# from a trusted source (presumably an earlier part of this pipeline).
#
# Keep only the columns needed for lag features *before* constructing the
# cuDF frame, so that unused columns are never copied into GPU memory.
grid_df = cudf.DataFrame(
    pd.read_pickle(processed_data_dir + "grid_df_part1.pkl")[["id", "day_id", "sales"]]
)

# Minimum lag, in days: the smallest lag generated and the shift applied
# before computing rolling-window statistics.
SHIFT_DAY = 28

## Generate lag features

**Lag features** are the values of the target variable at prior timestamps. Lag features are useful because what happened in the past often influences what will happen in the future. In our example, we generate lag features by reading the sales amount X days prior, where X = 28, 29, ..., 42.

In [4]:
# Lags of SHIFT_DAY, SHIFT_DAY + 1, ..., SHIFT_DAY + 14 days (15 lags in total).
LAG_DAYS = list(range(SHIFT_DAY, SHIFT_DAY + 15))

# Need to first ensure that rows in each time series are sorted by day_id
# NOTE(review): day_id is a categorical column, so this assumes its category
# order is chronological (d_1, d_2, ...) — confirm against the upstream notebook.
grid_df = grid_df.sort_values(["id", "day_id"])

# Build the per-series grouping once and reuse it for every lag. Shifting
# within each `id` group keeps one series' sales out of another series' lags;
# the first `lag` rows of every series are left null.
sales_by_id = grid_df.groupby(["id"])["sales"]
grid_df = grid_df.assign(
    **{f"sales_lag_{lag}": sales_by_id.shift(lag) for lag in LAG_DAYS}
)

# Drop the helper so it does not keep the pre-assign frame alive in memory.
del sales_by_id

In [5]:
# Preview the frame with its new lag columns (the repr shows only the first
# and last rows).
grid_df

Unnamed: 0,id,day_id,sales,sales_lag_28,sales_lag_29,sales_lag_30,sales_lag_31,sales_lag_32,sales_lag_33,sales_lag_34,sales_lag_35,sales_lag_36,sales_lag_37,sales_lag_38,sales_lag_39,sales_lag_40,sales_lag_41,sales_lag_42
13225,FOODS_1_001_CA_1_evaluation,d_1,3.0,,,,,,,,,,,,,,,
13226,FOODS_1_001_CA_1_evaluation,d_2,0.0,,,,,,,,,,,,,,,
13227,FOODS_1_001_CA_1_evaluation,d_3,0.0,,,,,,,,,,,,,,,
13228,FOODS_1_001_CA_1_evaluation,d_4,1.0,,,,,,,,,,,,,,,
13229,FOODS_1_001_CA_1_evaluation,d_5,4.0,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47734672,HOUSEHOLD_2_516_WI_3_evaluation,d_1965,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47734673,HOUSEHOLD_2_516_WI_3_evaluation,d_1966,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47734674,HOUSEHOLD_2_516_WI_3_evaluation,d_1967,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47734675,HOUSEHOLD_2_516_WI_3_evaluation,d_1968,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Compute rolling window statistics

In the previous cell, we used the value of sales at a single timestamp to generate lag features. To capture richer information about the past, let us also get the distribution of the sales value over multiple timestamps, by computing **rolling window statistics**. Rolling window statistics are statistics (e.g. mean, standard deviation) over a time duration in the past. Rolling window statistics complement lag features and provide more information about the past behavior of the target variable.

Read more about lag features and rolling window statistics in [Introduction to feature engineering for time series forecasting](https://medium.com/data-science-at-microsoft/introduction-to-feature-engineering-for-time-series-forecasting-620aa55fcab0).

In [6]:
# Shift by 28 days and apply windows of various sizes
print(f"Shift size: {SHIFT_DAY}")
for i in [7, 14, 30, 60, 180]:
    print(f"    Window size: {i}")
    grid_df[f"rolling_mean_{i}"] = (
        grid_df.groupby(["id"])["sales"]
        .shift(SHIFT_DAY)
        .rolling(i)
        .mean()
        .astype(np.float32)
    )
    grid_df[f"rolling_std_{i}"] = (
        grid_df.groupby(["id"])["sales"]
        .shift(SHIFT_DAY)
        .rolling(i)
        .std()
        .astype(np.float32)
    )

Shift size: 28
    Window size: 7
    Window size: 14
    Window size: 30
    Window size: 60
    Window size: 180


In [7]:
# List the column names to confirm the lag and rolling-window features exist.
grid_df.columns

Index(['id', 'day_id', 'sales', 'sales_lag_28', 'sales_lag_29', 'sales_lag_30',
       'sales_lag_31', 'sales_lag_32', 'sales_lag_33', 'sales_lag_34',
       'sales_lag_35', 'sales_lag_36', 'sales_lag_37', 'sales_lag_38',
       'sales_lag_39', 'sales_lag_40', 'sales_lag_41', 'sales_lag_42',
       'rolling_mean_7', 'rolling_std_7', 'rolling_mean_14', 'rolling_std_14',
       'rolling_mean_30', 'rolling_std_30', 'rolling_mean_60',
       'rolling_std_60', 'rolling_mean_180', 'rolling_std_180'],
      dtype='object')

In [8]:
# Inspect the dtype of every column in the feature frame.
grid_df.dtypes

id                  category
day_id              category
sales                float32
sales_lag_28         float32
sales_lag_29         float32
sales_lag_30         float32
sales_lag_31         float32
sales_lag_32         float32
sales_lag_33         float32
sales_lag_34         float32
sales_lag_35         float32
sales_lag_36         float32
sales_lag_37         float32
sales_lag_38         float32
sales_lag_39         float32
sales_lag_40         float32
sales_lag_41         float32
sales_lag_42         float32
rolling_mean_7       float32
rolling_std_7        float32
rolling_mean_14      float32
rolling_std_14       float32
rolling_mean_30      float32
rolling_std_30       float32
rolling_mean_60      float32
rolling_std_60       float32
rolling_mean_180     float32
rolling_std_180      float32
dtype: object

Once lag features and rolling window statistics are computed, persist them to disk.

In [10]:
# Convert the cuDF frame to pandas, then pickle it into the processed-data
# directory.
lags_path = processed_data_dir + "lags_df_28.pkl"
grid_df.to_pandas().to_pickle(lags_path)