# Data preprocessing, Part 3

## Import modules

In [1]:
import cudf
import numpy as np
import pandas as pd
import cupy as cp
import gc

## Load data

In [2]:
# Directory holding the raw input CSV files.
raw_data_dir = "./data/"
# Directory holding the intermediate pickles read and written by the preprocessing notebooks.
processed_data_dir = "./processed_data/"

In [3]:
# Read the calendar table with cuDF, then rename its "d" column to "day_id"
# so that it shares a key name with grid_df.
calendar_df = cudf.read_csv(raw_data_dir + "calendar.csv")
calendar_df = calendar_df.rename(columns={"d": "day_id"})

In [4]:
# Load the Part 1 grid (stored as a pandas pickle), convert it to a cuDF
# DataFrame, and keep only the two key columns needed in this notebook.
# NOTE: unpickling executes arbitrary code; only load pickles you produced yourself.
grid_df = cudf.from_pandas(
    pd.read_pickle(processed_data_dir + "grid_df_part1.pkl")
)[["id", "day_id"]]
grid_df

Unnamed: 0,id,day_id
0,FOODS_1_001_CA_1_evaluation,d_1537
1,FOODS_1_001_CA_1_evaluation,d_1538
2,FOODS_1_001_CA_1_evaluation,d_1539
3,FOODS_1_001_CA_1_evaluation,d_1540
4,FOODS_1_001_CA_1_evaluation,d_1541
...,...,...
47735392,HOUSEHOLD_2_516_WI_3_evaluation,d_52
47735393,HOUSEHOLD_2_516_WI_3_evaluation,d_53
47735394,HOUSEHOLD_2_516_WI_3_evaluation,d_54
47735395,HOUSEHOLD_2_516_WI_3_evaluation,d_55


## Generate date-related features

We first identify the date in each row of `grid_df` using information from `calendar_df`.

In [5]:
# Calendar columns to attach to grid_df; "day_id" is the join key.
icols = [
    "date",
    "day_id",
    "event_name_1", "event_type_1",
    "event_name_2", "event_type_2",
    "snap_CA", "snap_TX", "snap_WI",
]
calendar_features = calendar_df[icols]
# Left join keeps every row of grid_df. Note that the displayed result comes
# back in a different row order than the input, so do not rely on row order
# after this merge.
grid_df = grid_df.merge(calendar_features, how="left", on=["day_id"])
grid_df

Unnamed: 0,id,day_id,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,FOODS_1_001_TX_3_evaluation,d_1522,2015-03-30,,,,,0,0,0
1,FOODS_1_001_TX_3_evaluation,d_1523,2015-03-31,,,,,0,0,0
2,FOODS_1_001_TX_3_evaluation,d_1524,2015-04-01,,,,,1,1,0
3,FOODS_1_001_TX_3_evaluation,d_1525,2015-04-02,,,,,1,0,1
4,FOODS_1_001_TX_3_evaluation,d_1526,2015-04-03,,,,,1,1,1
...,...,...,...,...,...,...,...,...,...,...
47735392,HOUSEHOLD_2_516_WI_3_evaluation,d_52,2011-03-21,,,,,0,0,0
47735393,HOUSEHOLD_2_516_WI_3_evaluation,d_53,2011-03-22,,,,,0,0,0
47735394,HOUSEHOLD_2_516_WI_3_evaluation,d_54,2011-03-23,,,,,0,0,0
47735395,HOUSEHOLD_2_516_WI_3_evaluation,d_55,2011-03-24,,,,,0,0,0


In [6]:
# Event and SNAP columns are stored as categoricals to save memory.
categorical_cols = [
    "event_name_1",
    "event_type_1",
    "event_name_2",
    "event_type_2",
    "snap_CA",
    "snap_TX",
    "snap_WI",
]
for col in categorical_cols:
    grid_df[col] = grid_df[col].astype("category")

# Parse the "date" column into a timestamp type so the .dt accessor can be used.
grid_df["date"] = cudf.to_datetime(grid_df["date"])

Using the `date` column, we can generate related features, such as day, week, or month.

In [7]:
# Derive calendar features from the "date" timestamp column.
dates = grid_df["date"]

grid_df["tm_d"] = dates.dt.day.astype(np.int8)  # day of the month
grid_df["tm_w"] = dates.dt.isocalendar().week.astype(np.int8)  # ISO week of the year
grid_df["tm_m"] = dates.dt.month.astype(np.int8)  # month of the year

# Year, stored as an offset from the earliest year present in the data
grid_df["tm_y"] = dates.dt.year
grid_df["tm_y"] = (grid_df["tm_y"] - grid_df["tm_y"].min()).astype(np.int8)

# Which week in the month? Computed as ceil(day of month / 7).
grid_df["tm_wm"] = cp.ceil(grid_df["tm_d"].to_cupy() / 7).astype(np.int8)

# Which day in the week?
grid_df["tm_dw"] = dates.dt.dayofweek.astype(np.int8)

# Whether the day falls on the weekend (day-of-week value of 5 or more)
grid_df["tm_w_end"] = (grid_df["tm_dw"] >= 5).astype(np.int8)

# The raw date is no longer needed; drop the column and release the alias too
del grid_df["date"]
del dates

In [8]:
# Display the frame to inspect the newly added tm_* feature columns.
grid_df

Unnamed: 0,id,day_id,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,tm_d,tm_w,tm_m,tm_y,tm_wm,tm_dw,tm_w_end
0,FOODS_1_001_TX_3_evaluation,d_1522,,,,,0,0,0,30,14,3,4,5,0,0
1,FOODS_1_001_TX_3_evaluation,d_1523,,,,,0,0,0,31,14,3,4,5,1,0
2,FOODS_1_001_TX_3_evaluation,d_1524,,,,,1,1,0,1,14,4,4,1,2,0
3,FOODS_1_001_TX_3_evaluation,d_1525,,,,,1,0,1,2,14,4,4,1,3,0
4,FOODS_1_001_TX_3_evaluation,d_1526,,,,,1,1,1,3,14,4,4,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47735392,HOUSEHOLD_2_516_WI_3_evaluation,d_52,,,,,0,0,0,21,12,3,0,3,0,0
47735393,HOUSEHOLD_2_516_WI_3_evaluation,d_53,,,,,0,0,0,22,12,3,0,4,1,0
47735394,HOUSEHOLD_2_516_WI_3_evaluation,d_54,,,,,0,0,0,23,12,3,0,4,2,0
47735395,HOUSEHOLD_2_516_WI_3_evaluation,d_55,,,,,0,0,0,24,12,3,0,4,3,0


Now we can persist the table to the disk.

In [9]:
# Convert the cuDF frame to pandas and pickle it for the later notebooks.
output_path = processed_data_dir + "grid_df_part3.pkl"
grid_df.to_pandas().to_pickle(output_path)