In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from feature_engine.wrappers import SklearnTransformerWrapper

In [4]:
# Load the train and test splits, which are stored as separate CSV files
train_df = pd.read_csv('../data/house-prices/train.csv')
test_df = pd.read_csv('../data/house-prices/test.csv')

# Training split: 'SalePrice' is the target; the feature matrix is
# everything except 'Id' and the target itself
y_train = train_df['SalePrice']
X_train = train_df.drop(columns=['Id', 'SalePrice'])

# Test split: only 'Id' is dropped here (no target column is removed)
X_test = test_df.drop(columns=['Id'])

print("X_train :", X_train.shape)
print("X_test :", X_test.shape)

X_train : (1460, 79)
X_test : (1459, 79)


In [5]:
# Fraction of missing values in each of the two columns to be mean-imputed
X_train.loc[:, ['LotFrontage', 'MasVnrArea']].isna().mean()

LotFrontage    0.177397
MasVnrArea     0.005479
dtype: float64

## SimpleImputer

### Mean imputation

In [6]:
# Mean imputation via scikit-learn's SimpleImputer, wrapped so that it is
# applied only to the columns named in `variables`
mean_imputer = SimpleImputer(strategy='mean')
imputer = SklearnTransformerWrapper(
    transformer=mean_imputer,
    variables=['LotFrontage', 'MasVnrArea'],
)

# Learn the fill values from the training features only
imputer.fit(X_train)

In [7]:
# The fitted SimpleImputer is exposed on the wrapper as `transformer_`;
# its `statistics_` attribute holds the mean learned for each imputed
# column (presumably in the same order as `variables` — confirm).

imputer.transformer_.statistics_

array([ 70.04995837, 103.68526171])

In [8]:
# Fill the missing values in both splits using the fitted imputer;
# the imputed frames replace the original X_train / X_test
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

# Check the share of missing values left in the imputed training columns
X_train[['LotFrontage', 'MasVnrArea']].isna().mean()

LotFrontage    0.0
MasVnrArea     0.0
dtype: float64

### Frequent category imputation

In [10]:
# Columns of the raw training frame that are object-typed AND contain
# at least one missing value, kept in their original column order
is_object = train_df.dtypes == 'O'
has_missing = train_df.isnull().any()
cols = train_df.columns[is_object & has_missing].tolist()

train_df[cols].head()

Unnamed: 0,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,,Attchd,RFn,TA,TA,,,
1,,,Gd,TA,Gd,ALQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,,,
2,,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,,,
3,,,TA,Gd,No,ALQ,Unf,SBrkr,Gd,Detchd,Unf,TA,TA,,,
4,,BrkFace,Gd,TA,Av,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,,,


In [11]:
# Most-frequent-category imputation, applied only to the columns in `cols`.
# This rebinds `imputer`, replacing the mean imputer fitted earlier.
mode_imputer = SimpleImputer(strategy='most_frequent')
imputer = SklearnTransformerWrapper(
    transformer=mode_imputer,
    variables=cols,
)

# Learn the most frequent category of each column from the training features.
# NOTE(review): the first training rows show no value at all for Alley,
# PoolQC, Fence and MiscFeature — confirm that filling these with the
# most frequent category is the intended treatment.
imputer.fit(X_train)

In [12]:
# The fitted SimpleImputer is exposed on the wrapper as `transformer_`;
# its `statistics_` attribute holds the most frequent value learned for
# each imputed column (presumably in the same order as `cols` — confirm).

imputer.transformer_.statistics_

array(['Grvl', 'BrkFace', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd',
       'Attchd', 'Unf', 'TA', 'TA', 'Gd', 'MnPrv', 'Shed'], dtype=object)

In [13]:
# Fill the missing categories in both splits using the fitted imputer;
# the imputed frames replace the original X_train / X_test
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

# Check the share of missing values left in the imputed training columns
X_train[cols].isna().mean()

Alley           0.0
MasVnrType      0.0
BsmtQual        0.0
BsmtCond        0.0
BsmtExposure    0.0
BsmtFinType1    0.0
BsmtFinType2    0.0
Electrical      0.0
FireplaceQu     0.0
GarageType      0.0
GarageFinish    0.0
GarageQual      0.0
GarageCond      0.0
PoolQC          0.0
Fence           0.0
MiscFeature     0.0
dtype: float64

In [14]:
# Preview the imputed categorical columns of the test split
X_test.loc[:, cols].head()

Unnamed: 0,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,Grvl,BrkFace,TA,TA,No,Rec,LwQ,SBrkr,Gd,Attchd,Unf,TA,TA,Gd,MnPrv,Shed
1,Grvl,BrkFace,TA,TA,No,ALQ,Unf,SBrkr,Gd,Attchd,Unf,TA,TA,Gd,MnPrv,Gar2
2,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,TA,Attchd,Fin,TA,TA,Gd,MnPrv,Shed
3,Grvl,BrkFace,TA,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,Fin,TA,TA,Gd,MnPrv,Shed
4,Grvl,BrkFace,Gd,TA,No,ALQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,Gd,MnPrv,Shed
