In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from feature_engine.wrappers import SklearnTransformerWrapper

In [None]:
# NOTE(review): dead code. This cell is entirely commented out and nothing in
# it runs. It is a loading path that read a single 'houseprice.csv' and split
# it into train/test sets with train_test_split (30% test, random_state=0).
# Consider deleting the cell if that single-file path is no longer needed.
# data = pd.read_csv('houseprice.csv')
# data.head()

# # let's separate into training and testing set

# X_train, X_test, y_train, y_test = train_test_split(
#     data.drop(['Id', 'SalePrice'], axis=1), data['SalePrice'], test_size=0.3, random_state=0)

# X_train.shape, X_test.shape

In [3]:
# Load the pre-split training and test sets from their own CSV files
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/house-prices/train.csv',
                     '../data/house-prices/test.csv')
)

# Training data: SalePrice is the target; Id and SalePrice are removed
# from the feature matrix
y_train = train_df['SalePrice']
X_train = train_df.drop(columns=['Id', 'SalePrice'])

# Test data: only Id is removed here (no SalePrice column is dropped,
# so no y_test is created)
X_test = test_df.drop(columns=['Id'])

# Report the (rows, columns) of both feature matrices
print("X_train :", X_train.shape)
print("X_test :", X_test.shape)

X_train : (1460, 79)
X_test : (1459, 79)


## Scaling

In [4]:
# Names of the numerical variables to standardise, in their original column order.
#
# select_dtypes(include='number') is used instead of comparing each dtype with
# 'O': the "not object" test also lets through non-numeric dtypes such as
# category, datetime or a dedicated string dtype, which the StandardScaler
# cannot handle. For a frame whose non-object columns are all int/float the
# resulting list is the same.
cols = X_train.select_dtypes(include='number').columns.tolist()

cols

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [5]:
# Standardise only the numerical variables listed in `cols`: the feature_engine
# wrapper applies the sklearn StandardScaler to that subset of columns.
#
# NOTE(review): missing values are replaced by 0 in every column before
# fitting, so the zeros take part in the learnt statistics (e.g. GarageYrBlt is
# a year, yet its learnt mean is ~1869 with a scale of ~454). Consider a
# dedicated imputation step instead.

scaler = SklearnTransformerWrapper(
    variables=cols,
    transformer=StandardScaler(),
)

scaler.fit(X_train.fillna(0))

In [6]:
# Scale both sets with the statistics learnt from the training set, applying
# the same fill-with-0 treatment of missing values that was used for fitting.
# The names X_train / X_test are rebound to the scaled frames, so running this
# cell a second time would scale the already-scaled data again.
X_train, X_test = (
    scaler.transform(frame.fillna(0))
    for frame in (X_train, X_test)
)

In [7]:
# Per-variable means learnt by the StandardScaler during fit.
# `transformer_` is the fitted scaler held by the wrapper; the array has one
# entry per variable in `cols`, in the same order.
scaler.transformer_.mean_

array([5.68972603e+01, 5.76232877e+01, 1.05168281e+04, 6.09931507e+00,
       5.57534247e+00, 1.97126781e+03, 1.98486575e+03, 1.03117123e+02,
       4.43639726e+02, 4.65493151e+01, 5.67240411e+02, 1.05742945e+03,
       1.16262671e+03, 3.46992466e+02, 5.84452055e+00, 1.51546370e+03,
       4.25342466e-01, 5.75342466e-02, 1.56506849e+00, 3.82876712e-01,
       2.86643836e+00, 1.04657534e+00, 6.51780822e+00, 6.13013699e-01,
       1.86873973e+03, 1.76712329e+00, 4.72980137e+02, 9.42445205e+01,
       4.66602740e+01, 2.19541096e+01, 3.40958904e+00, 1.50609589e+01,
       2.75890411e+00, 4.34890411e+01, 6.32191781e+00, 2.00781575e+03])

In [8]:
# Per-variable scale (standard deviation) learnt by the StandardScaler during
# fit, one entry per variable in `cols`, in the same order as the means.
scaler.transformer_.scale_

array([4.22860820e+01, 3.46524309e+01, 9.97784611e+03, 1.38252284e+00,
       1.11241818e+00, 3.01925588e+01, 2.06383353e+01, 1.80669468e+02,
       4.55941866e+02, 1.61264017e+02, 4.41715605e+02, 4.38555057e+02,
       3.86455322e+02, 4.36378914e+02, 4.86064268e+01, 5.25300394e+02,
       5.18732867e-01, 2.38670868e-01, 5.50727099e-01, 5.02713131e-01,
       8.15498620e-01, 2.20262727e-01, 1.62483655e+00, 6.44445572e-01,
       4.53541893e+02, 7.47059036e-01, 2.13731608e+02, 1.25295863e+02,
       6.62333334e+01, 6.10982138e+01, 2.93072887e+01, 5.57383170e+01,
       4.01635452e+01, 4.95953090e+02, 2.70270015e+00, 1.32764022e+00])

In [9]:
# Sanity check: after scaling, the mean of each variable is 0 up to
# floating-point error (the values shown are on the order of 1e-15 or smaller).
X_train[cols].mean()

MSSubClass      -8.455945e-17
LotFrontage      9.490126e-17
LotArea         -5.840077e-17
OverallQual      1.387018e-16
OverallCond      3.540547e-16
YearBuilt        1.046347e-15
YearRemodAdd     4.496860e-15
MasVnrArea      -3.893385e-17
BsmtFinSF1      -2.433366e-17
BsmtFinSF2      -3.406712e-17
BsmtUnfSF       -6.600504e-17
TotalBsmtSF      2.457699e-16
1stFlrSF         6.509253e-17
2ndFlrSF        -1.825024e-17
LowQualFinSF     1.216683e-17
GrLivArea       -1.277517e-16
BsmtFullBath     2.311697e-17
BsmtHalfBath     2.433366e-17
FullBath         1.180182e-16
HalfBath         2.083569e-17
BedroomAbvGr     2.141362e-16
KitchenAbvGr     4.501726e-16
TotRmsAbvGrd    -1.022014e-16
Fireplaces      -4.866731e-18
GarageYrBlt      1.764190e-16
GarageCars       1.216683e-16
GarageArea      -1.216683e-17
WoodDeckSF       5.596741e-17
OpenPorchSF      3.041707e-17
EnclosedPorch   -2.311697e-17
3SsnPorch        4.866731e-18
ScreenPorch      5.475072e-17
PoolArea         1.946692e-17
MiscVal   

In [10]:
# Sanity check: after scaling, the std of each variable is ~1.
# It is not exactly 1 because pandas .std() uses the sample estimator (ddof=1)
# by default, whereas the StandardScaler divides by the population std (ddof=0).
# With 1460 training rows the ratio is sqrt(1460 / 1459) ~= 1.000343.

X_train[cols].std()

MSSubClass       1.000343
LotFrontage      1.000343
LotArea          1.000343
OverallQual      1.000343
OverallCond      1.000343
YearBuilt        1.000343
YearRemodAdd     1.000343
MasVnrArea       1.000343
BsmtFinSF1       1.000343
BsmtFinSF2       1.000343
BsmtUnfSF        1.000343
TotalBsmtSF      1.000343
1stFlrSF         1.000343
2ndFlrSF         1.000343
LowQualFinSF     1.000343
GrLivArea        1.000343
BsmtFullBath     1.000343
BsmtHalfBath     1.000343
FullBath         1.000343
HalfBath         1.000343
BedroomAbvGr     1.000343
KitchenAbvGr     1.000343
TotRmsAbvGrd     1.000343
Fireplaces       1.000343
GarageYrBlt      1.000343
GarageCars       1.000343
GarageArea       1.000343
WoodDeckSF       1.000343
OpenPorchSF      1.000343
EnclosedPorch    1.000343
3SsnPorch        1.000343
ScreenPorch      1.000343
PoolArea         1.000343
MiscVal          1.000343
MoSold           1.000343
YrSold           1.000343
dtype: float64