In [1]:
#import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Different imputation strategies for different variables

In [27]:
# Read the training and test splits from disk, echoing each frame's shape
train = pd.read_csv('data/train.csv')
print(train.shape)
test = pd.read_csv('data/test.csv')
print(test.shape)
# Work on a copy so the raw `test` frame stays untouched
X_test = test.copy()

(1460, 81)
(1459, 80)


In [28]:
# Separate the predictors from the target column
x_train = train.drop(columns=["SalePrice"])  # feature matrix
y_train = train.loc[:, "SalePrice"]          # target vector

In [29]:
# Number of missing entries per feature column
isnull_sum = x_train.isna().sum()
isnull_sum

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 80, dtype: int64

In [30]:
# Numeric columns that contain at least one missing value
num_vars = x_train.select_dtypes(include=['int64','float64']).columns
num_vars_miss = isnull_sum[num_vars].loc[lambda counts: counts > 0].index.tolist()

num_vars_miss

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [31]:
# Categorical (object-dtype) columns that contain at least one missing value
cat_vars = x_train.select_dtypes(include=["O"]).columns
cat_vars_miss = isnull_sum[cat_vars].loc[lambda counts: counts > 0].index.tolist()

cat_vars_miss

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [32]:
# Group the columns with missing values by the imputation strategy
# each group is meant to receive.

# numeric -> mean
num_var_mean = ["LotFrontage"]

# numeric -> median
num_var_median = ['MasVnrArea', 'GarageYrBlt']

# categorical -> most frequent value
cat_vars_mode = [
    'Alley', 'MasVnrType', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'Electrical', 'FireplaceQu',
]

# categorical -> constant placeholder
# NOTE(review): no active imputer consumes this group in the visible cells.
cat_vars_constant = [
    'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PoolQC', 'Fence',
    'MiscFeature',
]

In [33]:
# One single-step pipeline per imputation strategy. The step is named
# "imputer" so the fitted SimpleImputer can be looked up via named_steps.
num_var_mean_imputer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy='mean'))]
)
num_var_median_imputer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy='median'))]
)
cat_vars_mode_imputer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy='most_frequent'))]
)
# Constant-fill imputer, currently disabled:
# cat_vars_constant_imputer = Pipeline(
#     steps=[("imputer", SimpleImputer(strategy='constant', fill_value='missing'))]
# )

In [34]:
# Route each column group to its imputer; every entry is a
# (name, transformer, columns) tuple. Only the listed columns appear in the
# transformer's output (assuming the default `remainder` setting).
preprocessor = ColumnTransformer(
    transformers=[
        ('mean_imputer', num_var_mean_imputer, num_var_mean),
        ('median_imputer', num_var_median_imputer, num_var_median),
        ('mode_imputer', cat_vars_mode_imputer, cat_vars_mode),
        # NOTE(review): constant imputer is disabled, so cat_vars_constant is
        # not imputed. If re-enabled, keep the (name, transformer, columns) order:
        # ('constant_imputer', cat_vars_constant_imputer, cat_vars_constant),
    ]
)

In [36]:
# Learn the fill values on the training features only, then apply the same
# fitted imputers to the test features.
X_train_clean = preprocessor.fit_transform(x_train)
X_test_clean = preprocessor.transform(X_test)

In [37]:
# Fill value learned by the "mean_imputer" step (one entry per column in num_var_mean)
preprocessor.named_transformers_["mean_imputer"].named_steps["imputer"].statistics_

array([70.04995837])

In [38]:
# Fill values learned by the "mode_imputer" step (one entry per column in cat_vars_mode)
preprocessor.named_transformers_["mode_imputer"].named_steps["imputer"].statistics_

array(['Grvl', 'None', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd'],
      dtype=object)

In [40]:
# Wrap the imputed array in a DataFrame with the original column names.
# The transformer stacks float and string outputs into a single object-dtype
# array, so a plain DataFrame(...) would type every column as object;
# infer_objects() restores numeric dtypes for the numeric columns.
# Reusing x_train's index keeps rows aligned with `train` when the frames
# are concatenated column-wise later.
X_train_clean_miss_var = pd.DataFrame(
    X_train_clean,
    columns=num_var_mean + num_var_median + cat_vars_mode,
    index=x_train.index,
).infer_objects()



In [41]:
# Preview the first rows of the imputed columns
X_train_clean_miss_var.head()

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu
0,65,196,2003,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,Gd
1,80,0,1976,Grvl,,Gd,TA,Gd,ALQ,Unf,SBrkr,TA
2,68,162,2001,Grvl,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,TA
3,60,0,1998,Grvl,,TA,Gd,No,ALQ,Unf,SBrkr,Gd
4,84,350,2000,Grvl,BrkFace,Gd,TA,Av,GLQ,Unf,SBrkr,TA


In [42]:
# Sanity check: total count of missing values remaining across the imputed columns
X_train_clean_miss_var.isnull().sum().sum()

0

In [44]:
# Category counts for Alley in the raw training data (missing values are not counted)
train['Alley'].value_counts()

Grvl    50
Pave    41
Name: Alley, dtype: int64

In [45]:
# Same counts after imputation: every previously missing Alley entry now
# carries the most frequent category, so that category's count grows.
X_train_clean_miss_var['Alley'].value_counts()

Grvl    1419
Pave      41
Name: Alley, dtype: int64

In [46]:
# no missing values variables index
# Derived from the per-column null counts instead of a hand-maintained list of
# positions, so it stays correct if the columns or their missingness change.
remainder_vars_index = [
    position
    for position, n_missing in enumerate(isnull_sum)
    if n_missing == 0
]

In [47]:
# Map the positional indices of the fully populated columns back to their names
remainder_vars = isnull_sum.index[remainder_vars_index].tolist()
remainder_vars

['Id',
 'MSSubClass',
 'MSZoning',
 'LotArea',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SaleType',
 'SaleCondition']

In [48]:
# Concatenate the imputed columns with the columns that had no missing values.
# NOTE(review): the columns in cat_vars_constant are in neither piece, so they
# are absent from X_train — confirm this is intended.
X_train = pd.concat(
    (X_train_clean_miss_var, train.loc[:, remainder_vars]),
    axis="columns",
)

In [49]:
# (rows, columns) of the assembled training feature frame
X_train.shape


(1460, 73)