# Data preprocessing, Part 2

## Import modules

In [1]:
import cudf
import numpy as np
import pandas as pd
import gc

## Load data

In [2]:
# Directory prefix for the raw input CSV files (sell_prices.csv, calendar.csv)
raw_data_dir = "./data/"
# Directory prefix for the pickled grid_df tables read and written by this notebook
processed_data_dir = "./processed_data/"

In [3]:
# Sell prices per (store_id, item_id, wm_yr_wk), read into a cuDF DataFrame
prices_df = cudf.read_csv(raw_data_dir + "sell_prices.csv")
# Calendar table with one row per day; its "d" column is renamed to "day_id"
calendar_df = cudf.read_csv(raw_data_dir + "calendar.csv").rename(
    columns={"d": "day_id"}
)

In [4]:
# Preview the sell-price table that was just loaded
prices_df

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26
...,...,...,...,...
6841116,WI_3,FOODS_3_827,11617,1.00
6841117,WI_3,FOODS_3_827,11618,1.00
6841118,WI_3,FOODS_3_827,11619,1.00
6841119,WI_3,FOODS_3_827,11620,1.00


In [5]:
# Preview the calendar table (the original "d" column now appears as "day_id")
calendar_df

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,day_id,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2016-06-15,11620,Wednesday,5,6,2016,d_1965,,,,,0,1,1
1965,2016-06-16,11620,Thursday,6,6,2016,d_1966,,,,,0,0,0
1966,2016-06-17,11620,Friday,7,6,2016,d_1967,,,,,0,0,0
1967,2016-06-18,11621,Saturday,1,6,2016,d_1968,,,,,0,0,0


## Generate price-related features

Let us engineer additional features that are related to the sale price. We consider the distribution of the price of a given product over time and ask how the current price compares to the historical trend.

In [6]:
# For every (store_id, item_id) pair, compute summary statistics of sell_price
# over all weeks and broadcast each statistic back onto the pair's rows:
#   price_max  - highest price
#   price_min  - lowest price
#   price_std  - standard deviation of the price
#   price_mean - mean (average) price
group_keys = ["store_id", "item_id"]
for stat in ("max", "min", "std", "mean"):
    prices_df[f"price_{stat}"] = prices_df.groupby(group_keys)[
        "sell_price"
    ].transform(stat)

We also consider the ratio of the current price to the max price.

In [7]:
# Current price as a fraction of the item's highest price in this store;
# equals 1.0 on rows where the item is sold at its maximum price.
prices_df["price_norm"] = prices_df["sell_price"] / prices_df["price_max"]

Some items have a very stable price, whereas other items respond to inflation quickly and rise in price. To capture the price elasticity, we count the number of unique price values for a given product over time.

In [8]:
# Number of distinct sell_price values observed for each (store_id, item_id)
# pair, broadcast to every row of that pair
prices_df["price_nunique"] = prices_df.groupby(["store_id", "item_id"])[
    "sell_price"
].transform("nunique")

We also consider, for a given price, how many distinct items in the same store are sold at that exact price (the count includes the item itself).

In [9]:
# Number of distinct item_id values that share the same (store_id, sell_price)
# combination. The grouping ignores wm_yr_wk, so items are counted across all
# weeks, and the count includes the row's own item.
prices_df["item_nunique"] = prices_df.groupby(["store_id", "sell_price"])[
    "item_id"
].transform("nunique")

In [10]:
# Inspect prices_df with the price feature columns added so far
prices_df

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,price_max,price_min,price_std,price_mean,price_norm,price_nunique,item_nunique
0,CA_1,HOBBIES_1_001,11325,9.58,9.58,8.26,0.152139,8.285714,1.000000,3,3
1,CA_1,HOBBIES_1_001,11326,9.58,9.58,8.26,0.152139,8.285714,1.000000,3,3
2,CA_1,HOBBIES_1_001,11327,8.26,9.58,8.26,0.152139,8.285714,0.862213,3,5
3,CA_1,HOBBIES_1_001,11328,8.26,9.58,8.26,0.152139,8.285714,0.862213,3,5
4,CA_1,HOBBIES_1_001,11329,8.26,9.58,8.26,0.152139,8.285714,0.862213,3,5
...,...,...,...,...,...,...,...,...,...,...,...
6841116,WI_3,FOODS_3_827,11617,1.00,1.00,1.00,0.000000,1.000000,1.000000,1,142
6841117,WI_3,FOODS_3_827,11618,1.00,1.00,1.00,0.000000,1.000000,1.000000,1,142
6841118,WI_3,FOODS_3_827,11619,1.00,1.00,1.00,0.000000,1.000000,1.000000,1,142
6841119,WI_3,FOODS_3_827,11620,1.00,1.00,1.00,0.000000,1.000000,1.000000,1,142


Another useful way to put prices in context is to compare the current price of a product to its price in the previous week, and to its average price within the same calendar month or the same calendar year.

In [11]:
# Build a lookup with one row per wm_yr_wk carrying that week's "month" and
# "year". A week that spans two months (or years) keeps a single row, chosen by
# drop_duplicates' default `keep` setting.
calendar_cols = ["wm_yr_wk", "month", "year"]
week_to_month_map = calendar_df[calendar_cols].drop_duplicates(subset=["wm_yr_wk"])

# Attach "month" and "year" to prices_df, then order the rows by week within
# each (store_id, item_id). Ascending wm_yr_wk also puts months and years in
# ascending order.
prices_df = (
    prices_df
    .merge(week_to_month_map, on=["wm_yr_wk"], how="left")
    .sort_values(["store_id", "item_id", "wm_yr_wk"])
)

In [12]:
# Ratio of the current price to the price in the preceding row of the same
# (store_id, item_id) group. This relies on prices_df already being sorted by
# wm_yr_wk within each group, so the preceding row is the previous recorded week.
# The first row of each group has no predecessor, so its ratio is null.
prices_df["price_momentum"] = prices_df["sell_price"] / prices_df.groupby(
    ["store_id", "item_id"]
)["sell_price"].shift(1)
# Ratio of the current price to the item's mean price over all rows that share
# the same "month" value.
# NOTE(review): "month" is the calendar month number taken from calendar_df and
# "year" is not part of the grouping keys, so this mean pools the same month
# across every year and includes the current row. It is NOT the average of the
# previous month. Confirm this is the intended feature.
prices_df["price_momentum_m"] = prices_df["sell_price"] / prices_df.groupby(
    ["store_id", "item_id", "month"]
)["sell_price"].transform("mean")
# Ratio of the current price to the item's mean price within the same "year".
# NOTE(review): the mean covers the current year (including the current row),
# not the previous year. Confirm this is the intended feature.
prices_df["price_momentum_y"] = prices_df["sell_price"] / prices_df.groupby(
    ["store_id", "item_id", "year"]
)["sell_price"].transform("mean")

In [13]:
# "month" and "year" were only needed to build the momentum features; drop them.
for helper_col in ("month", "year"):
    del prices_df[helper_col]

# Downcast every floating-point price column to float32 to save memory.
columns = [
    "sell_price",
    "price_max",
    "price_min",
    "price_std",
    "price_mean",
    "price_norm",
    "price_momentum",
    "price_momentum_m",
    "price_momentum_y",
]
prices_df = prices_df.astype({col: np.float32 for col in columns})

In [14]:
# Check the column dtypes after the float32 conversion
prices_df.dtypes

store_id             object
item_id              object
wm_yr_wk              int64
sell_price          float32
price_max           float32
price_min           float32
price_std           float32
price_mean          float32
price_norm          float32
price_nunique         int32
item_nunique          int32
price_momentum      float32
price_momentum_m    float32
price_momentum_y    float32
dtype: object

## Bring price-related features into `grid_df`

We load `grid_df` from the Part 1 notebook and bring in columns from `prices_df`.

In [15]:
# Read the pickled table with pandas, then convert it to a cuDF DataFrame.
# NOTE: unpickling can execute arbitrary code; only load a grid_df_part1.pkl
# that was produced by a trusted run of the Part 1 notebook.
grid_df = cudf.from_pandas(pd.read_pickle(processed_data_dir + "grid_df_part1.pkl"))

In [16]:
# Join the price features onto grid_df, then keep only the row identifiers
# ("id", "day_id") plus the columns that the merge with prices_df contributed.
# All other columns that grid_df had before the merge are dropped.
original_columns = list(grid_df)
grid_df = grid_df.merge(prices_df, on=["store_id", "item_id", "wm_yr_wk"], how="left")

# Columns that were not in grid_df before the merge, i.e. the price features
new_columns = [col for col in list(grid_df) if col not in original_columns]

# Select each identifier exactly once. The previous code prepended
# ["id", "day_id"] a second time, so both labels were requested twice.
columns_to_keep = ["id", "day_id"] + new_columns
grid_df = grid_df[columns_to_keep]
grid_df

Unnamed: 0,id,day_id,sell_price,price_max,price_min,price_std,price_mean,price_norm,price_nunique,item_nunique,price_momentum,price_momentum_m,price_momentum_y
0,FOODS_1_001_CA_2_evaluation,d_1040,2.24,2.24,2.00,1.095719e-01,2.169362,1.0,2,61,1.0,1.019868,1.0
1,FOODS_1_001_CA_2_evaluation,d_1041,2.24,2.24,2.00,1.095719e-01,2.169362,1.0,2,61,1.0,1.019868,1.0
2,FOODS_1_001_CA_2_evaluation,d_1042,2.24,2.24,2.00,1.095719e-01,2.169362,1.0,2,61,1.0,1.019868,1.0
3,FOODS_1_001_CA_2_evaluation,d_1043,2.24,2.24,2.00,1.095719e-01,2.169362,1.0,2,61,1.0,1.019868,1.0
4,FOODS_1_001_CA_2_evaluation,d_1044,2.24,2.24,2.00,1.095719e-01,2.169362,1.0,2,61,1.0,1.024958,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47735392,HOUSEHOLD_2_516_WI_2_evaluation,d_884,5.94,5.94,5.94,3.648122e-14,5.940000,1.0,1,47,1.0,1.000000,1.0
47735393,HOUSEHOLD_2_516_WI_2_evaluation,d_885,5.94,5.94,5.94,3.648122e-14,5.940000,1.0,1,47,1.0,1.000000,1.0
47735394,HOUSEHOLD_2_516_WI_2_evaluation,d_886,5.94,5.94,5.94,3.648122e-14,5.940000,1.0,1,47,1.0,1.000000,1.0
47735395,HOUSEHOLD_2_516_WI_2_evaluation,d_887,5.94,5.94,5.94,3.648122e-14,5.940000,1.0,1,47,1.0,1.000000,1.0


We persist the combined table to disk.

In [17]:
# Convert the cuDF frame to pandas and pickle it as grid_df_part2.pkl under
# processed_data_dir
grid_df.to_pandas().to_pickle(processed_data_dir + "grid_df_part2.pkl")