In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.encoding import RareLabelEncoder

In [None]:
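# NOTE: disabled alternative, kept for reference only. It loads a single CSV
# file and splits it locally; the next cell reads separate train/test files instead.

# # load the dataset from Kaggle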

# data = pd.read_csv('houseprice.csv')
# data.head()

# # let's separate into training and testing set

# X_train, X_test, y_train, y_test = train_test_split(
#     data.drop(['Id', 'SalePrice'], axis=1),
#     data['SalePrice'],
#     test_size=0.3,
#     random_state=0,
# )

# X_train.shape, X_test.shape

In [3]:
# Load the separate train and test CSV files
train_df = pd.read_csv('../data/house-prices/train.csv')
test_df = pd.read_csv('../data/house-prices/test.csv')

# Training set: 'SalePrice' is the target; 'Id' and the target are
# excluded from the feature matrix
y_train = train_df['SalePrice']
X_train = train_df.drop(columns=['Id', 'SalePrice'])

# Test set: only 'Id' is dropped here (the test file may not contain a
# 'SalePrice' column, so no target is extracted)
X_test = test_df.drop(columns=['Id'])

print("X_train :", X_train.shape)
print("X_test :", X_test.shape)

X_train : (1460, 79)
X_test : (1459, 79)


## OrdinalEncoder

In [4]:
# Variables that are rare-label grouped and then ordinal-encoded below.
# The order of this list is the order in which the encoder sees the columns.
cols = [
    'Alley',
    'MasVnrType',
    # Bsmt* columns
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical',
    'FireplaceQu',
    # Garage* columns
    'GarageType', 'GarageFinish', 'GarageQual',
]

In [5]:
# Group infrequent labels of the variables in `cols` under a single 'Rare'
# label before ordinal encoding, to avoid errors when encoding.
# n_categories=2: variables with at least 2 distinct labels are processed.

rare_label_enc = RareLabelEncoder(n_categories=2, variables=cols)

# Fill missing values ONLY in the columns that are going to be encoded.
# Calling fillna('Missing') on the whole frame would also write the string
# 'Missing' into numeric columns that contain NaN, silently turning them into
# mixed-type object columns. A per-column dict leaves other columns untouched.
missing_fill = {col: 'Missing' for col in cols}

X_train = rare_label_enc.fit_transform(X_train.fillna(missing_fill))
X_test = rare_label_enc.transform(X_test.fillna(missing_fill))

In [6]:
# Replace the categories by integers: the wrapper applies scikit-learn's
# OrdinalEncoder only to the variables listed in `cols`.

encoder = SklearnTransformerWrapper(
    transformer=OrdinalEncoder(),
    variables=cols,
)

# Learn the category-to-integer mappings from the training set only
encoder.fit(X_train)

In [7]:
# The fitted scikit-learn transformer is exposed by the wrapper as
# `transformer_`, so its learned attributes can be inspected directly.
# `categories_` holds one array of labels per encoded variable; the position
# of a label inside its array is the integer code it is mapped to.

encoder.transformer_.categories_

[array(['Missing', 'Rare'], dtype=object),
 array(['BrkFace', 'Missing', 'Rare', 'Stone'], dtype=object),
 array(['Ex', 'Gd', 'Rare', 'TA'], dtype=object),
 array(['Rare', 'TA'], dtype=object),
 array(['Av', 'Gd', 'Mn', 'No', 'Rare'], dtype=object),
 array(['ALQ', 'BLQ', 'GLQ', 'LwQ', 'Rare', 'Rec', 'Unf'], dtype=object),
 array(['Rare', 'Unf'], dtype=object),
 array(['FuseA', 'Rare', 'SBrkr'], dtype=object),
 array(['Gd', 'Missing', 'Rare', 'TA'], dtype=object),
 array(['Attchd', 'BuiltIn', 'Detchd', 'Missing', 'Rare'], dtype=object),
 array(['Fin', 'Missing', 'RFn', 'Unf'], dtype=object),
 array(['Missing', 'Rare', 'TA'], dtype=object)]

In [8]:
# Apply the fitted encoder to both sets (the mappings were learned on train)

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

# Fraction of missing values per encoded column in the training set
X_train[cols].isna().mean()

Alley           0.0
MasVnrType      0.0
BsmtQual        0.0
BsmtCond        0.0
BsmtExposure    0.0
BsmtFinType1    0.0
BsmtFinType2    0.0
Electrical      0.0
FireplaceQu     0.0
GarageType      0.0
GarageFinish    0.0
GarageQual      0.0
dtype: float64

In [9]:
# Preview the first rows of the encoded columns in the test set
X_test[cols].head()

Unnamed: 0,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual
0,0.0,1.0,3.0,1.0,3.0,5.0,0.0,2.0,1.0,0.0,3.0,2.0
1,0.0,0.0,3.0,1.0,3.0,0.0,1.0,2.0,1.0,0.0,3.0,2.0
2,0.0,1.0,1.0,1.0,3.0,2.0,1.0,2.0,3.0,0.0,0.0,2.0
3,0.0,0.0,3.0,1.0,3.0,2.0,1.0,2.0,0.0,0.0,0.0,2.0
4,0.0,1.0,1.0,1.0,3.0,0.0,1.0,2.0,1.0,0.0,2.0,2.0
