## Note : 
- we use Pipeline and ColumnTransformer to apply custom SimpleImputer strategies to our custom batch of columns

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# loading and cleaning NA columns of the dataset

In [4]:
df = pd.read_csv('./Datasets/house_prices/train.csv')

In [11]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [12]:
drop_cols = df.isnull().mean()*100
drop_cols = drop_cols[drop_cols>25].keys()
drop_cols

Index(['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')

In [13]:
df2 = df.drop(drop_cols,axis = 1)
df2.shape

(1460, 76)

# imputation using sklearn: SimpleImputer, Pipeline, ColumnTransformer

In [14]:
# numerical columns having NAN calues

num_nan_cols = df2.select_dtypes(include=['int64','float64']).isnull().sum()
num_nan_cols = num_nan_cols[num_nan_cols>0].keys()
num_nan_cols

Index(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], dtype='object')

In [16]:
# catagorical columns having NAN calues

cat_nan_cols = df2.select_dtypes(include=['O']).isnull().sum()
cat_nan_cols = cat_nan_cols[cat_nan_cols>0].keys()
cat_nan_cols

Index(['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond'],
      dtype='object')

#### The following way of filling NAN values is completely just for demonstration purpose
- in actual dataset we must have domain knowledge to try out different missing data handling strategies

In [19]:
# segmenting different columns as per their strategy

mean_cols = ['LotFrontage']
median_cols = ['MasVnrArea', 'GarageYrBlt']
mode_cols = ['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1']
constant_cols = ['BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish','GarageQual', 'GarageCond']

In [26]:
# Pipelining each imputer : kind of initializing an imputer for each strategy

# Pipeline(steps=[('name','imputer')])

mean_imputer = Pipeline(steps=[('imputer',SimpleImputer(strategy='mean'))])
median_imputer = Pipeline(steps=[('imputer',SimpleImputer(strategy='median'))])
mode_imputer = Pipeline(steps=[('imputer',SimpleImputer(strategy='most_frequent'))])
constant_imputer = Pipeline(steps=[('imputer',SimpleImputer(strategy='constant',fill_value='missing'))])

In [28]:
# ColumnTransformer : to apply each imputer to respective columns desired

# ColumnTransformer(transformers=[(name1 , pipeline_imputer_1 , list_of_cols_1),
#                                (name2, pipeline_imputer_1 , list_of_cols_2),
#                                (name3, pipeline_imputer_1 , list_of_cols_3)...])

preprocessor = ColumnTransformer(transformers=[('mean_transformer' , mean_imputer , mean_cols),
                               ('median_transformer' , median_imputer , median_cols),
                               ('mode_transformer' , mode_imputer , mode_cols),
                               ('constant_transformer' , constant_imputer , constant_cols)])

In [44]:
# creating a copy

df3 = df2.copy()

In [45]:
# calling fit to apply all our settings to dataset

preprocessor.fit(df3)

ColumnTransformer(transformers=[('mean_transformer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_transformer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_transformer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1']),
                                ('constant_transformer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missi

In [35]:
# to see statistics:

preprocessor.named_transformers_['mean_transformer'].named_steps['imputer'].statistics_

array([70.04995837])

In [36]:
preprocessor.named_transformers_['median_transformer'].named_steps['imputer'].statistics_

array([   0., 1980.])

In [37]:
preprocessor.named_transformers_['mode_transformer'].named_steps['imputer'].statistics_

array(['None', 'TA', 'TA', 'No', 'Unf'], dtype=object)

In [38]:
preprocessor.named_transformers_['constant_transformer'].named_steps['imputer'].statistics_

array(['missing', 'missing', 'missing', 'missing', 'missing', 'missing'],
      dtype=object)

In [46]:
# the imputed 2D array we got

df4 = preprocessor.transform(df3)
df4

array([[65.0, 196.0, 2003.0, ..., 'RFn', 'TA', 'TA'],
       [80.0, 0.0, 1976.0, ..., 'RFn', 'TA', 'TA'],
       [68.0, 162.0, 2001.0, ..., 'RFn', 'TA', 'TA'],
       ...,
       [66.0, 0.0, 1941.0, ..., 'RFn', 'TA', 'TA'],
       [68.0, 0.0, 1950.0, ..., 'Unf', 'TA', 'TA'],
       [75.0, 0.0, 1965.0, ..., 'Fin', 'TA', 'TA']], dtype=object)

In [48]:
df4.shape

(1460, 14)

In [50]:
df4 = pd.DataFrame(df4, columns=mean_cols+median_cols+mode_cols+constant_cols)
df4.head()

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,GarageType,GarageFinish,GarageQual,GarageCond
0,65.0,196.0,2003.0,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,Attchd,RFn,TA,TA
1,80.0,0.0,1976.0,,Gd,TA,Gd,ALQ,Unf,SBrkr,Attchd,RFn,TA,TA
2,68.0,162.0,2001.0,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,Attchd,RFn,TA,TA
3,60.0,0.0,1998.0,,TA,Gd,No,ALQ,Unf,SBrkr,Detchd,Unf,TA,TA
4,84.0,350.0,2000.0,BrkFace,Gd,TA,Av,GLQ,Unf,SBrkr,Attchd,RFn,TA,TA


In [51]:
df4.isnull().sum().sum()

0

In [53]:
df5 = df2.copy()
df5.isnull().sum().sum()

868

In [57]:
# updating the whole dataframe

df5.update(df4)

In [56]:
df5.isnull().sum().sum()

0

In [58]:
df5['LotFrontage'].head()

0    65.0
1    80.0
2    68.0
3    60.0
4    84.0
Name: LotFrontage, dtype: object