In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Read the training and test sets from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Report the (rows, columns) of each frame.
for label, frame in (
    ("The Shape of train df :", train),
    ('The shape of test df :', test),
):
    print(label, frame.shape)

The Shape of train df : (1460, 81)
The shape of test df : (1459, 80)


In [3]:
# Separate the target column from the predictors; the test frame is copied as-is.
y_train = train['SalePrice']
X_train = train.drop(columns=['SalePrice'])
X_test = test.copy()

# Report the shape of each piece.
for label, frame in (
    ('Shape of X_train', X_train),
    ('Shape of y_train', y_train),
    ('Shape of X_test', X_test),
):
    print(label, frame.shape)

Shape of X_train (1460, 80)
Shape of y_train (1460,)
Shape of X_test (1459, 80)


All fitting is done on X_train.
y_train is kept aside because it is the output (target) column.
X_test: the fill values that SimpleImputer learns from X_train will also be used to fill the NaNs in X_test.

# Missing Value Imputation

In [4]:
# Count the NaN entries in every column of the training predictors.
isnull_sum = X_train.isna().sum()
isnull_sum

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 80, dtype: int64

In [5]:
# Collect the names of all numeric (int64 / float64) columns; the ones that
# actually contain missing values are filtered out of this list afterwards.
numeric_dtypes = ['int64', 'float64']
num_vars = X_train.select_dtypes(include=numeric_dtypes).columns
num_vars

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')

In [6]:
# Number of numeric columns selected into num_vars.
num_vars.shape

(37,)

In [7]:
# Keep only the numeric columns that contain at least one NaN.
num_vars_miss = [
    column
    for column in X_train[num_vars]
    if X_train[column].isnull().any()
]


In [8]:
# Display the numeric columns that have missing values.
num_vars_miss

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [9]:
# Collect the names of all categorical (object dtype) columns; the ones that
# actually contain missing values are filtered out of this list afterwards.
cat_vars = X_train.select_dtypes(include=['object']).columns

In [10]:
# Keep only the categorical columns that contain at least one NaN.
cat_var_miss = [
    column
    for column in X_train[cat_vars]
    if X_train[column].isnull().any()
]

In [11]:
# Display the categorical columns that have missing values.
cat_var_miss

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

Now suppose we want to fill the NaN values differently for each group of columns: the numerical columns with a statistic such as the 'mean' (or 'median'), half of the categorical columns with the 'mode', and the rest of the categorical columns with a constant value.

Here we can't use a single SimpleImputer to solve this problem directly, because one SimpleImputer applies the same strategy to every column it is given.

We need to create a Pipeline for each strategy and route the right columns to each one with a ColumnTransformer.

In [12]:
# Columns grouped by the imputation strategy each group should receive.

# Numeric columns.
num_var_mean = ['LotFrontage']
num_var_median = ['MasVnrArea', 'GarageYrBlt']

# Categorical columns to be filled with their most frequent value.
cat_var_mode = [
    'Alley', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'FireplaceQu',
]

# Categorical columns to be filled with a constant placeholder.
cat_var_missing = [
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]

In [13]:
# One single-step pipeline per imputation strategy.
num_var_mean_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])
num_var_median_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])
cat_var_mode_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])
cat_var_missing_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
])

In [15]:
# Bind each imputation pipeline to the columns it should handle.
# Each entry is (name, pipeline, columns).
preprocessor = ColumnTransformer(
    transformers=[
        ('mean_imputer', num_var_mean_imputer, num_var_mean),
        ('median_imputer', num_var_median_imputer, num_var_median),
        ('mode_imputer', cat_var_mode_imputer, cat_var_mode),
        ('missing_imputer', cat_var_missing_imputer, cat_var_missing),
    ]
)

In [16]:
# The strategies and the columns they apply to are now declared.
# Fit the preprocessor on the training predictors so each imputer learns its fill value.
preprocessor.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('mean_imputer',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0))],
                                          verbose=False),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(memory=None,
                     

In [17]:
# NOTE(review): there are no call parentheses here, so this cell only displays
# the bound method object; no data is transformed by this line.
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('mean_imputer',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0))],
                                          verbose=False),
                                 ['LotFrontage']),
                                ('median_imputer',
                               

In [18]:
# Fill value(s) learned by the mean strategy during fit.
mean_pipeline = preprocessor.named_transformers_['mean_imputer']
mean_pipeline.named_steps['imputer'].statistics_

array([70.04995837])

In [19]:
# Fill value(s) learned by the median strategy during fit.
median_pipeline = preprocessor.named_transformers_['median_imputer']
median_pipeline.named_steps['imputer'].statistics_

array([   0., 1980.])

In [20]:
# Apply the fitted imputers to the training predictors.
X_train_clean = preprocessor.transform(X_train)

In [21]:
# Apply the same fitted imputers to the test predictors (transform only, no refit).
X_test_clean = preprocessor.transform(X_test)

In [22]:
# Display the transformed training data.
X_train_clean

array([[65.0, 196.0, 2003.0, ..., 'missing', 'missing', 'missing'],
       [80.0, 0.0, 1976.0, ..., 'missing', 'missing', 'missing'],
       [68.0, 162.0, 2001.0, ..., 'missing', 'missing', 'missing'],
       ...,
       [66.0, 0.0, 1941.0, ..., 'missing', 'GdPrv', 'Shed'],
       [68.0, 0.0, 1950.0, ..., 'missing', 'missing', 'missing'],
       [75.0, 0.0, 1965.0, ..., 'missing', 'missing', 'missing']],
      dtype=object)

In [23]:
# List the fitted (name, transformer, columns) entries of the preprocessor.
preprocessor.transformers_

[('mean_imputer',
  Pipeline(memory=None,
           steps=[('imputer',
                   SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                                 missing_values=nan, strategy='mean',
                                 verbose=0))],
           verbose=False),
  ['LotFrontage']),
 ('median_imputer',
  Pipeline(memory=None,
           steps=[('imputer',
                   SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                                 missing_values=nan, strategy='median',
                                 verbose=0))],
           verbose=False),
  ['MasVnrArea', 'GarageYrBlt']),
 ('mode_imputer',
  Pipeline(memory=None,
           steps=[('imputer',
                   SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                                 missing_values=nan, strategy='most_frequent',
                                 verbose=0))],
           verbose=False),
  ['Alley',
   'MasVnrType',
   'BsmtQu

In [24]:
# The transformed array has one column per imputed variable, in the order the
# transformers were declared, so the labels are rebuilt in that same order.
imputed_columns = num_var_mean + num_var_median + cat_var_mode + cat_var_missing

# X_train_clean is a single object-dtype array (numeric and string columns are
# stacked together), so a frame built from it would have every column typed as
# `object`. The numeric columns are cast back to float64, and the row index of
# X_train is reused so the result stays aligned with X_train / y_train.
X_train_clean_miss_var = pd.DataFrame(
    X_train_clean,
    columns=imputed_columns,
    index=X_train.index,
).astype({column: 'float64' for column in num_var_mean + num_var_median})

In [25]:
# Display the imputed columns as a labelled DataFrame.
X_train_clean_miss_var

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,65,196,2003,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,missing,missing,missing
1,80,0,1976,Grvl,,Gd,TA,Gd,ALQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing
2,68,162,2001,Grvl,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing
3,60,0,1998,Grvl,,TA,Gd,No,ALQ,Unf,SBrkr,Gd,Detchd,Unf,TA,TA,missing,missing,missing
4,84,350,2000,Grvl,BrkFace,Gd,TA,Av,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,62,0,1999,Grvl,,Gd,TA,No,Unf,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing
1456,85,119,1978,Grvl,Stone,Gd,TA,No,ALQ,Rec,SBrkr,TA,Attchd,Unf,TA,TA,missing,MnPrv,missing
1457,66,0,1941,Grvl,,TA,Gd,No,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,missing,GdPrv,Shed
1458,68,0,1950,Grvl,,TA,TA,Mn,GLQ,Rec,FuseA,Gd,Attchd,Unf,TA,TA,missing,missing,missing


In [26]:
# Sanity check: every imputed column should now report zero missing values.
X_train_clean_miss_var.isna().sum()

LotFrontage     0
MasVnrArea      0
GarageYrBlt     0
Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Electrical      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64