# Data preprocessing, Part 6

## Import modules

In [1]:
import cudf
import numpy as np
import pandas as pd
import gc
import glob
import pathlib
import gcsfs

Enter the name of the Cloud Storage bucket you used in `start_here.ipynb`.

In [2]:
# Name of the Cloud Storage bucket that the upload cells below write to.
# Replace the placeholder with the real bucket name before running them.
bucket_name = "<Put the name of the bucket here>"

## Filter by store and product department and create data segments

After combining all columns produced in the previous notebooks, we filter the rows of the data set by `store_id` and `dept_id` to create segments. Each segment is saved as a pickle file and then uploaded to Cloud Storage.

In [4]:
# Directories: inputs come from the previous notebooks, segments are written locally
processed_data_dir = "./processed_data/"
segmented_data_dir = "./segmented_data/"
pathlib.Path(segmented_data_dir).mkdir(exist_ok=True)

# Store IDs: CA_1..CA_4, TX_1..TX_3, WI_1..WI_3
STORES = [
    f"{state}_{number}"
    for state, n_stores in (("CA", 4), ("TX", 3), ("WI", 3))
    for number in range(1, n_stores + 1)
]

# Department IDs: HOBBIES_1..2, HOUSEHOLD_1..2, FOODS_1..3
DEPTS = [
    f"{category}_{number}"
    for category, n_depts in (("HOBBIES", 2), ("HOUSEHOLD", 2), ("FOODS", 3))
    for number in range(1, n_depts + 1)
]

# Columns taken from grid_df_part2.pkl
grid2_colnm = ["sell_price"]
grid2_colnm += [f"price_{stat}" for stat in ("max", "min", "std", "mean", "norm")]
grid2_colnm += ["price_nunique", "item_nunique"]
grid2_colnm += ["price_momentum" + suffix for suffix in ("", "_m", "_y")]

# Columns taken from grid_df_part3.pkl
grid3_colnm = [
    f"event_{field}_{slot}" for slot in (1, 2) for field in ("name", "type")
]
grid3_colnm += [f"snap_{state}" for state in ("CA", "TX", "WI")]
grid3_colnm += [f"tm_{part}" for part in ("d", "w", "m", "y", "wm", "dw", "w_end")]

# Columns taken from lags_df_28.pkl: sales_lag_28..sales_lag_42, followed by
# rolling mean/std pairs for each window size
lag_colnm = [f"sales_lag_{shift}" for shift in range(28, 43)]
lag_colnm += [
    f"rolling_{stat}_{window}"
    for window in (7, 14, 30, 60, 180)
    for stat in ("mean", "std")
]

# Columns taken from target_encoding_df.pkl
target_enc_colnm = [
    f"enc_{group}_{stat}"
    for group in ("store_id_dept_id", "item_id_state_id")
    for stat in ("mean", "std")
]

In [5]:
def _load_feature_columns(file_name, feature_columns):
    """
    Load the merge keys plus selected feature columns from a processed pickle file

    Parameters
    ----------
    file_name: Name of the pickle file inside processed_data_dir.
    feature_columns: Names of the columns to keep in addition to "id" and "day_id".

    Returns
    -------
    A cudf.DataFrame with the columns "id", "day_id" and feature_columns.
    """
    host_df = pd.read_pickle(processed_data_dir + file_name)
    # Select the columns while the data is still a pandas object, so that the
    # columns we do not need are never converted to cudf.
    return cudf.DataFrame(host_df[["id", "day_id"] + feature_columns])


def prepare_data(store, dept=None):
    """
    Filter and clean data according to stores and product departments

    Parameters
    ----------
    store: Filter data by retaining rows whose store_id matches this parameter.
    dept: Filter data by retaining rows whose dept_id matches this parameter.
          This parameter can be set to None to indicate that we shouldn't filter by dept_id.

    Returns
    -------
    A cudf.DataFrame with the filtered rows of grid_df_part1.pkl, left-joined on
    ("id", "day_id") with the columns listed in grid2_colnm, grid3_colnm, lag_colnm
    and target_enc_colnm. The "id" column is removed and "day_id" is converted to
    int16. The "dept_id" column is kept only when dept is None.

    Raises
    ------
    ValueError: If store is None.
    """
    if store is None:
        raise ValueError("store parameter must not be None")

    grid_df = cudf.DataFrame(pd.read_pickle(processed_data_dir + "grid_df_part1.pkl"))

    if dept is None:
        grid_df = grid_df[grid_df["store_id"] == store]
    else:
        # dept_id holds a single value once we filter by it, so it is dropped
        grid_df = grid_df[
            (grid_df["store_id"] == store) & (grid_df["dept_id"] == dept)
        ].drop(columns=["dept_id"])
    grid_df = grid_df.drop(columns=["release_week", "wm_yr_wk", "store_id", "state_id"])

    # Join the remaining feature tables one at a time; each table is released
    # before the next one is loaded to keep peak memory usage down.
    feature_sources = (
        ("grid_df_part2.pkl", grid2_colnm),
        ("grid_df_part3.pkl", grid3_colnm),
        ("lags_df_28.pkl", lag_colnm),
        ("target_encoding_df.pkl", target_enc_colnm),
    )
    for file_name, feature_columns in feature_sources:
        feature_df = _load_feature_columns(file_name, feature_columns)
        grid_df = grid_df.merge(feature_df, on=["id", "day_id"], how="left")
        del feature_df
    gc.collect()

    grid_df = grid_df.drop(columns=["id"])
    # Strip the first two characters of every day_id and store the rest as int16
    # (presumably the values look like "d_<n>" -- confirm against the earlier notebooks).
    grid_df["day_id"] = (
        grid_df["day_id"].to_pandas().astype("str").str[2:].astype(np.int16)
    )

    return grid_df

In [6]:
# First save the segment to the disk
# Store-level segments come first, followed by one segment per (store, department) pair
segment_keys = [(store, None) for store in STORES]
segment_keys += [(store, dept) for store in STORES for dept in DEPTS]

for store, dept in segment_keys:
    if dept is None:
        print(f"Processing store {store}...")
        segment_file = f"combined_df_store_{store}.pkl"
    else:
        print(f"Processing (store {store}, department {dept})...")
        segment_file = f"combined_df_store_{store}_dept_{dept}.pkl"

    grid_df = prepare_data(store=store, dept=dept)
    grid_df.to_pandas().to_pickle(segmented_data_dir + segment_file)
    # Release the segment before building the next one
    del grid_df
    gc.collect()

Processing store CA_1...
Processing store CA_2...
Processing store CA_3...
Processing store CA_4...
Processing store TX_1...
Processing store TX_2...
Processing store TX_3...
Processing store WI_1...
Processing store WI_2...
Processing store WI_3...
Processing (store CA_1, department HOBBIES_1)...
Processing (store CA_1, department HOBBIES_2)...
Processing (store CA_1, department HOUSEHOLD_1)...
Processing (store CA_1, department HOUSEHOLD_2)...
Processing (store CA_1, department FOODS_1)...
Processing (store CA_1, department FOODS_2)...
Processing (store CA_1, department FOODS_3)...
Processing (store CA_2, department HOBBIES_1)...
Processing (store CA_2, department HOBBIES_2)...
Processing (store CA_2, department HOUSEHOLD_1)...
Processing (store CA_2, department HOUSEHOLD_2)...
Processing (store CA_2, department FOODS_1)...
Processing (store CA_2, department FOODS_2)...
Processing (store CA_2, department FOODS_3)...
Processing (store CA_3, department HOBBIES_1)...
Processing (store C

In [7]:
# Then copy the segment to Cloud Storage
fs = gcsfs.GCSFileSystem()

segment_files = glob.glob(segmented_data_dir + "*")
for local_path in segment_files:
    print(f"Uploading {local_path}...")
    # Keep only the file name, so every segment lands directly under the bucket
    remote_path = f"{bucket_name}/{pathlib.Path(local_path).name}"
    fs.put_file(local_path, remote_path)

Uploading ./segmented_data/combined_df_store_WI_1_dept_FOODS_3.pkl...
Uploading ./segmented_data/combined_df_store_TX_2_dept_FOODS_3.pkl...
Uploading ./segmented_data/combined_df_store_CA_2_dept_HOBBIES_2.pkl...
Uploading ./segmented_data/combined_df_store_WI_2_dept_HOBBIES_2.pkl...
Uploading ./segmented_data/combined_df_store_CA_1_dept_FOODS_3.pkl...
Uploading ./segmented_data/combined_df_store_TX_1_dept_FOODS_1.pkl...
Uploading ./segmented_data/combined_df_store_CA_3_dept_HOBBIES_2.pkl...
Uploading ./segmented_data/combined_df_store_CA_3_dept_HOUSEHOLD_2.pkl...
Uploading ./segmented_data/combined_df_store_WI_2_dept_HOBBIES_1.pkl...
Uploading ./segmented_data/combined_df_store_TX_1.pkl...
Uploading ./segmented_data/combined_df_store_CA_4_dept_HOBBIES_1.pkl...
Uploading ./segmented_data/combined_df_store_WI_3_dept_FOODS_1.pkl...
Uploading ./segmented_data/combined_df_store_WI_1_dept_HOBBIES_2.pkl...
Uploading ./segmented_data/combined_df_store_WI_1_dept_HOUSEHOLD_2.pkl...
Uploading ./s

In [8]:
# Also upload the product weights
fs = gcsfs.GCSFileSystem()
weights_file = "product_weights.pkl"
# The file keeps the same name in the bucket as in processed_data_dir
fs.put_file(processed_data_dir + weights_file, f"{bucket_name}/{weights_file}")