# Data preprocessing, Part 5

## Import modules

In [1]:
import cudf
import numpy as np
import pandas as pd
import gc

## Load data

In [2]:
# Directory paths (relative to the working directory) for raw and processed data.
raw_data_dir, processed_data_dir = "./data/", "./processed_data/"

In [3]:
# Read the part-1 grid pickle with pandas, then wrap it in a cuDF DataFrame
# so the following cells operate on a cuDF object.
grid_path = processed_data_dir + "grid_df_part1.pkl"
grid_df = cudf.DataFrame(pd.read_pickle(grid_path))
grid_df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day_id,sales,release_week,wm_yr_wk
0,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1537,1.0,11101,11511
1,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1538,0.0,11101,11511
2,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1539,2.0,11101,11511
3,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1540,0.0,11101,11511
4,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1541,0.0,11101,11512
...,...,...,...,...,...,...,...,...,...,...
47735392,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,d_52,0.0,11101,11108
47735393,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,d_53,0.0,11101,11108
47735394,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,d_54,0.0,11101,11108
47735395,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,d_55,0.0,11101,11108


## Target encoding

Categorical variables present challenges to many machine learning algorithms, such as XGBoost. One way to overcome this is **target encoding**, where each category is replaced with a statistic of the target variable computed over the rows that belong to that category. In this example, the target is `sales`, and we use its mean and standard deviation for each (`store_id`, `dept_id`) and each (`item_id`, `state_id`) combination.

Read more about target encoding in [Target-encoding Categorical Variables](https://towardsdatascience.com/dealing-with-categorical-variables-by-using-target-encoder-a0f1733a4c69).

In [4]:
# Column groups to encode; each group yields one "mean" and one "std" feature.
icols = [["store_id", "dept_id"], ["item_id", "state_id"]]
new_columns = []

for col in icols:
    print(f"Encoding columns {col}")
    # Feature names follow the pattern enc_<col1>_<col2>_<stat>.
    prefix = "enc_" + "_".join(col) + "_"
    for stat in ("mean", "std"):
        feature = prefix + stat
        # Broadcast the per-group statistic of `sales` back onto every row.
        per_row_stat = grid_df.groupby(col)["sales"].transform(stat)
        grid_df[feature] = per_row_stat.astype(np.float32)
        new_columns.append(feature)

Encoding columns ['store_id', 'dept_id']
Encoding columns ['item_id', 'state_id']


In [5]:
# Keep only the row keys plus the encoding features collected in `new_columns`.
keep_columns = ["id", "day_id", *new_columns]
grid_df = grid_df[keep_columns]
grid_df

Unnamed: 0,id,day_id,enc_store_id_dept_id_mean,enc_store_id_dept_id_std,enc_item_id_state_id_mean,enc_item_id_state_id_std
0,FOODS_1_001_CA_1_evaluation,d_1537,1.613112,3.216672,0.873390,1.666305
1,FOODS_1_001_CA_1_evaluation,d_1538,1.613112,3.216672,0.873390,1.666305
2,FOODS_1_001_CA_1_evaluation,d_1539,1.613112,3.216672,0.873390,1.666305
3,FOODS_1_001_CA_1_evaluation,d_1540,1.613112,3.216672,0.873390,1.666305
4,FOODS_1_001_CA_1_evaluation,d_1541,1.613112,3.216672,0.873390,1.666305
...,...,...,...,...,...,...
47735392,HOUSEHOLD_2_516_WI_3_evaluation,d_52,0.261486,0.666380,0.083276,0.301445
47735393,HOUSEHOLD_2_516_WI_3_evaluation,d_53,0.261486,0.666380,0.083276,0.301445
47735394,HOUSEHOLD_2_516_WI_3_evaluation,d_54,0.261486,0.666380,0.083276,0.301445
47735395,HOUSEHOLD_2_516_WI_3_evaluation,d_55,0.261486,0.666380,0.083276,0.301445


In [6]:
# Display the dtype of each remaining column.
grid_df.dtypes

id                           category
day_id                       category
enc_store_id_dept_id_mean     float32
enc_store_id_dept_id_std      float32
enc_item_id_state_id_mean     float32
enc_item_id_state_id_std      float32
dtype: object

Once we have computed the target encoding, we persist the table to disk.

In [7]:
# Convert the cuDF frame to pandas and pickle it into the processed-data directory.
output_path = processed_data_dir + "target_encoding_df.pkl"
grid_df.to_pandas().to_pickle(output_path)