Encode categorical variables (e.g. .cat.codes). Feature engineering. Scale/normalize numerical data. Split train/test sets. Save scalers, encoders, mappings

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import joblib

  from pandas.core import (


In [2]:
df = pd.read_csv('./data/clean/no_outliers.csv')
df.head()

Unnamed: 0,Category,DIY,Sell,Color 1,Color 2,Source,Seasonal Availability,Mannequin Piece,Style,Villager Equippable
0,accessories,No,122,White,Colorful,Able Sisters,All Year,No,Active,Yes
1,accessories,No,122,Black,Colorful,Able Sisters,All Year,No,Active,Yes
2,accessories,No,35,Beige,White,Able Sisters,All Year,Yes,Active,No
3,accessories,No,122,Yellow,Yellow,Able Sisters,All Year,Yes,Cute,No
4,accessories,No,620,Yellow,Red,Birthday,All Year,No,Gorgeous,Yes


In [3]:
# All the features are categorical except for the target
label = 'Sell'
feats = df.drop(label, axis=1).columns
feats

Index(['Category', 'DIY', 'Color 1', 'Color 2', 'Source',
       'Seasonal Availability', 'Mannequin Piece', 'Style',
       'Villager Equippable'],
      dtype='object')

# Encode categorical features

In [4]:
for feat in feats:
    df[feat] = df[feat].astype('category')

In [5]:
df[feats].head()

Unnamed: 0,Category,DIY,Color 1,Color 2,Source,Seasonal Availability,Mannequin Piece,Style,Villager Equippable
0,accessories,No,White,Colorful,Able Sisters,All Year,No,Active,Yes
1,accessories,No,Black,Colorful,Able Sisters,All Year,No,Active,Yes
2,accessories,No,Beige,White,Able Sisters,All Year,Yes,Active,No
3,accessories,No,Yellow,Yellow,Able Sisters,All Year,Yes,Cute,No
4,accessories,No,Yellow,Red,Birthday,All Year,No,Gorgeous,Yes


In [6]:
cat_mappings = dict()
for feat in feats:
    feat_mapping = dict(enumerate(df[feat].cat.categories, 1))
    feat_mapping[0] = 'unknown'
    reverse_mapping = {v: k for k, v in feat_mapping.items()}
    cat_mappings[f'{feat}_rev'] = feat_mapping
    cat_mappings[feat] = reverse_mapping

In [7]:
cat_mappings['Category']

{'accessories': 1,
 'bags': 2,
 'bottoms': 3,
 'dresses': 4,
 'shoes': 5,
 'socks': 6,
 'tops': 7,
 'unknown': 0}

In [8]:
cat_mappings['Category_rev']

{1: 'accessories',
 2: 'bags',
 3: 'bottoms',
 4: 'dresses',
 5: 'shoes',
 6: 'socks',
 7: 'tops',
 0: 'unknown'}

In [9]:
feat_encoded = [df[feat].map(cat_mappings[feat]).values for feat in feats]

In [10]:
feat_encoded = np.stack(feat_encoded, axis=1)
feat_encoded

array([[ 1,  1, 13, ...,  1,  1,  2],
       [ 1,  1,  2, ...,  1,  1,  2],
       [ 1,  1,  1, ...,  2,  1,  1],
       ...,
       [ 7,  1,  8, ...,  1,  5,  2],
       [ 7,  1, 14, ...,  1,  5,  2],
       [ 7,  1,  7, ...,  1,  1,  2]])

# Embedding sizes

In [11]:
cat_sizes = [len(df[feat].cat.categories.values) for feat in feats]

In [12]:
emb_sizes = [(size, min(50, (size+1)//2)) for size in cat_sizes]
emb_sizes

[(7, 4), (2, 1), (14, 7), (14, 7), (15, 8), (5, 3), (2, 1), (6, 3), (2, 1)]

# Train Test Split

In [13]:
X = feat_encoded
y = df[label].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Scale target label ('Sell') to 0 - 1 range

In [14]:
scaler = MinMaxScaler()
y_train = scaler.fit_transform(np.reshape(y_train, (-1, 1)))
y_test = scaler.transform(np.reshape(y_test, (-1, 1)))

In [15]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2952, 9), (739, 9), (2952, 1), (739, 1))

# Save data, encode mappings, and scaler

In [16]:
joblib.dump((X_train, X_test, y_train, y_test), './data/preprocessed/train_test_data.pkl')

preprocessing = {
    'label_scaler': scaler,
    'category_mappings': cat_mappings
}
joblib.dump(preprocessing, './data/preprocessed/preprocessing.pkl')

joblib.dump(emb_sizes, "data/preprocessed/embedding_sizes.pkl")

['data/preprocessed/embedding_sizes.pkl']