In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from category_encoders import TargetEncoder
from cstm_pkg_grp_9.data.sets import pop_target
from joblib import dump
import os

# Load Data

In [2]:
# Read the cleaned train/test CSVs from the interim data folder.
train_csv = "../../data/interim/train_data_cleaned.csv"
test_csv = "../../data/interim/test_data_cleaned.csv"

train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

# Verify Data

## Train data

In [3]:
# Inspect column dtypes, row count and memory footprint of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34720691 entries, 0 to 34720690
Data columns (total 9 columns):
 #   Column         Dtype  
---  ------         -----  
 0   item_id        object 
 1   dept_id        object 
 2   cat_id         object 
 3   store_id       object 
 4   state_id       object 
 5   sales_revenue  float64
 6   year           int64  
 7   month          int64  
 8   day            int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 2.3+ GB


In [4]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,sales_revenue,year,month,day
0,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,5.52,2011,1,29
1,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,3.12,2011,1,29
2,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,0.0,2011,1,29
3,HOBBIES_1_012,HOBBIES_1,HOBBIES,CA_1,CA,0.0,2011,1,29
4,HOBBIES_1_015,HOBBIES_1,HOBBIES,CA_1,CA,2.8,2011,1,29


In [5]:
# Confirm the (rows, columns) dimensions of the training frame.
train_data.shape

(34720691, 9)

## Test data

In [6]:
# Inspect column dtypes, row count and memory footprint of the test frame.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12160986 entries, 0 to 12160985
Data columns (total 9 columns):
 #   Column         Dtype  
---  ------         -----  
 0   item_id        object 
 1   dept_id        object 
 2   cat_id         object 
 3   store_id       object 
 4   state_id       object 
 5   sales_revenue  float64
 6   year           int64  
 7   month          int64  
 8   day            int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 835.0+ MB


In [7]:
# Preview the first rows of the test frame.
test_data.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,sales_revenue,year,month,day
0,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0.0,2015,4,19
1,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0.0,2015,4,19
2,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0.0,2015,4,19
3,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,18.56,2015,4,19
4,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,8.64,2015,4,19


The data appears to be intact and meets the requirements.

# Data Transformation

In [8]:
# Separate the 'sales_revenue' target from the feature columns of each split.
# NOTE(review): pop_target is a project helper; whether it mutates the input
# frame in place is not visible here — confirm before re-running this cell.
target_col = 'sales_revenue'
features_train, target_train = pop_target(train_data, target_col)
features_test, target_test = pop_target(test_data, target_col)

## Target Mean Encoding

The 'item_id' feature contains 3049 unique values. One-hot encoding it would greatly increase the dimensionality of the data. Therefore, target mean encoding will be applied.

In [9]:
# Target-encode only the item identifier column.
item_cols = ['item_id']
tme = TargetEncoder(cols=item_cols)

### Train Set

In [10]:
# Fit the target encoder on the training split and encode it in the same call.
tm_features_train = tme.fit_transform(X=features_train, y=target_train)

In [11]:
# Check the dimensions of the target-encoded training features.
tm_features_train.shape

(34720691, 8)

### Test Set

In [12]:
# Encode the test split with the encoder already fitted on the training split (no refit).
tm_features_test = tme.transform(X=features_test)

In [13]:
# Check the dimensions of the target-encoded test features.
tm_features_test.shape

(12160986, 8)

In [14]:
# Preview the training features after target encoding (item_id is now numeric).
tm_features_train.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,year,month,day
0,2.248204,HOBBIES_1,HOBBIES,CA_1,CA,2011,1,29
1,1.537946,HOBBIES_1,HOBBIES,CA_1,CA,2011,1,29
2,1.89118,HOBBIES_1,HOBBIES,CA_1,CA,2011,1,29
3,2.724936,HOBBIES_1,HOBBIES,CA_1,CA,2011,1,29
4,3.205972,HOBBIES_1,HOBBIES,CA_1,CA,2011,1,29


## Ordinal Encoding

The remaining categorical features (dept_id, store_id, cat_id and state_id) will be converted to integer codes using an ordinal encoder.

In [16]:
# Ordinal encoder configured to emit integer codes (dtype=int).
oe = OrdinalEncoder(dtype=int)

In [17]:
# Fit the ordinal encoder on the training categoricals and overwrite those
# columns in place with their integer codes.
ordinal_cols = ['dept_id', 'store_id', 'cat_id', 'state_id']
tm_features_train[ordinal_cols] = oe.fit_transform(tm_features_train[ordinal_cols])

In [18]:
# Apply the already-fitted ordinal encoder to the test categoricals and
# overwrite those columns in place with their integer codes.
ordinal_cols = ['dept_id', 'store_id', 'cat_id', 'state_id']
tm_features_test[ordinal_cols] = oe.transform(tm_features_test[ordinal_cols])

In [19]:
# Check the dimensions of the training features after ordinal encoding.
tm_features_train.shape

(34720691, 8)

In [20]:
# Preview the training features after ordinal encoding (categoricals are now integer codes).
tm_features_train.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,year,month,day
0,2.248204,3,1,0,0,2011,1,29
1,1.537946,3,1,0,0,2011,1,29
2,1.89118,3,1,0,0,2011,1,29
3,2.724936,3,1,0,0,2011,1,29
4,3.205972,3,1,0,0,2011,1,29


## Standard Scaler

To keep all features on a comparable scale, standard scaling (zero mean, unit variance) will be applied.

In [21]:
# Scaler instance; it is fitted on the training features in a later cell.
scaler = StandardScaler()

In [23]:
# Fit the scaler on the encoded training features and standardise them.
train_scaled = scaler.fit_transform(tm_features_train)

# The scaler returns an array by default, so the frame has to be rebuilt.
# Carry over the original row index as well as the column labels: the target is
# re-attached later with a label-aligned column assignment, and a fresh
# RangeIndex would only line up if the source index happened to be 0..n-1.
train_df_final = pd.DataFrame(
    train_scaled,
    index=tm_features_train.index,
    columns=tm_features_train.columns,
)

In [24]:
# Preview the standardised training features.
train_df_final.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,year,month,day
0,-0.328719,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152
1,-0.454687,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152
2,-0.392039,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152
3,-0.244169,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152
4,-0.158854,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152


In [25]:
# Standardise the test features with the statistics learned from the training split (no refit).
test_scaled = scaler.transform(tm_features_test)

# The scaler returns an array by default, so the frame has to be rebuilt.
# Carry over the original row index as well as the column labels: the target is
# re-attached later with a label-aligned column assignment, and a fresh
# RangeIndex would only line up if the source index happened to be 0..n-1.
test_df_final = pd.DataFrame(
    test_scaled,
    index=tm_features_test.index,
    columns=tm_features_test.columns,
)

### Adding target back

In [26]:
# Re-attach the target to each scaled feature frame as a 'sales_revenue' column.
# The assignment aligns on the row index of the frame and the target.
for frame, target in ((train_df_final, target_train), (test_df_final, target_test)):
    frame['sales_revenue'] = target

### Saving Transformers

In [27]:
# Persist the fitted target encoder so the same item_id encoding can be reapplied later.
# joblib.dump does not create missing parent folders, so ensure the directory exists first.
os.makedirs('../../models/predictive', exist_ok=True)
dump(tme, '../../models/predictive/target_mean_encoder_1.joblib')

['../../models/predictive/target_mean_encoder_1.joblib']

In [28]:
# Persist the fitted ordinal encoder so the same category codes can be reapplied later.
# joblib.dump does not create missing parent folders, so ensure the directory exists first.
os.makedirs('../../models/predictive', exist_ok=True)
dump(oe, '../../models/predictive/ordinal_encoder_1.joblib')

['../../models/predictive/ordinal_encoder_1.joblib']

In [29]:
# Persist the fitted scaler so the same standardisation can be reapplied later.
# joblib.dump does not create missing parent folders, so ensure the directory exists first.
os.makedirs('../../models/predictive', exist_ok=True)
dump(scaler, '../../models/predictive/scaler_1.joblib')

['../../models/predictive/scaler_1.joblib']

## Saving csvs

In [30]:
# Write the processed train/test frames (features plus target) to the processed-data folder.
folder_path = "../../data/processed"

# DataFrame.to_csv fails if the parent directory is missing; create it up front
# so the expensive transformations above are not lost to a late I/O error.
os.makedirs(folder_path, exist_ok=True)

# index=False: the row index is not written to the files.
train_df_final.to_csv(os.path.join(folder_path, 'train_processed_1.csv'), index=False)
test_df_final.to_csv(os.path.join(folder_path, 'test_processed_1.csv'), index=False)