In [23]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [3]:
# Folder that contains train.csv and test.csv. Set the HOUSE_PRICES_DATA_DIR
# environment variable to point somewhere else; when it is unset the original
# location is used, so existing runs keep working unchanged.
DATA_DIR = Path(os.environ.get(
    'HOUSE_PRICES_DATA_DIR',
    'C:\\Users\\prasad jadhav\\Downloads\\house-prices-advanced-regression-techniques',
))

# Both files are read from the same folder, so the path is spelled out only once.
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')

In [4]:
# Report the (rows, columns) shape of each raw frame, one per line.
print(
    f'Shape of train dataset: {train.shape}',
    f'Shape of test dataset: {test.shape}',
    sep='\n',
)

Shape of train dataset: (1460, 81)
Shape of test dataset: (1459, 80)


In [5]:
# Text preview of the leading rows of each raw frame (train first, then test).
print(train.head(), test.head(), sep='\n')

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [8]:
# Split the training frame into predictors and the target column.
X_train = train.drop(columns='SalePrice')
y_train = train['SalePrice']

print('Shape of train dataset:', X_train.shape)
# This line reports y_train, not the test set, so it is labelled as the target.
print('Shape of train target:', y_train.shape)

Shape of train dataset: (1460, 80)
Shape of test dataset: (1460,)


- Missing Value Imputation

In [11]:
# Number of missing entries per feature column (Series indexed by column name).
null_value = X_train.isna().sum()
null_value

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
Length: 80, dtype: int64

_Numerical Missing Value_

In [18]:
# Columns stored as int64 or float64.
num_var = X_train.select_dtypes(include=['int64', 'float64']).columns
# Keep, in column order, only those with at least one missing value.
num_var_null = list(filter(lambda name: null_value[name] > 0, num_var))
num_var_null

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

_Categorical Missing Value_

In [19]:
# Columns stored with the object dtype.
cat_var = X_train.select_dtypes(include=['object']).columns
# Keep, in column order, only those with at least one missing value.
cat_var_null = list(filter(lambda name: null_value[name] > 0, cat_var))
cat_var_null

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

***Create Pipeline***

In [21]:
# Column groups, one per imputation strategy applied later in the notebook.

# Numeric column filled with its mean.
num_var_mean = [
    'LotFrontage',
]

# Numeric columns filled with their median.
num_var_median = [
    'MasVnrArea',
    'GarageYrBlt',
]

# Categorical columns filled with their most frequent value.
# NOTE(review): for sparsely populated columns the mode fill will dominate the
# column after imputation — confirm that is the intended treatment.
cat_var_mode = [
    'Alley',
    'MasVnrType',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'Electrical',
    'FireplaceQu',
]

# Categorical columns filled with a constant placeholder.
# This rebinds cat_var_null, which earlier held every categorical column
# that has missing values; from here on it holds only this subset.
cat_var_null = [
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature',
]

In [24]:
# One single-step pipeline per strategy. The step is named 'Imputer' in all four,
# which is the key used later to look up the fitted statistics.
num_var_mean_imputer = Pipeline([
    ('Imputer', SimpleImputer(strategy='mean')),
])
num_var_median_imputer = Pipeline([
    ('Imputer', SimpleImputer(strategy='median')),
])
cat_var_mode_imputer = Pipeline([
    ('Imputer', SimpleImputer(strategy='most_frequent')),
])
# Missing entries become the literal string 'Null'.
cat_var_null_imputer = Pipeline([
    ('Imputer', SimpleImputer(strategy='constant', fill_value='Null')),
])

In [25]:
# Route each column group to its imputer. Each entry is (name, pipeline, columns).
# No `remainder` is given, so columns outside these four groups are dropped
# from the transformed output.
preprocessor = ColumnTransformer(
    transformers=[
        ('mean_imputer', num_var_mean_imputer, num_var_mean),
        ('median_imputer', num_var_median_imputer, num_var_median),
        ('mode_imputer', cat_var_mode_imputer, cat_var_mode),
        ('null_imputer', cat_var_null_imputer, cat_var_null),
    ],
)

In [26]:
# Fit every imputer on the training features only; the fitted statistics are
# reused unchanged when the test frame is transformed.
preprocessor.fit(X_train)

In [27]:
# NOTE(review): there are no parentheses, so this only displays the bound method
# object; no data is transformed by this cell.
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('Imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('Imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('Imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('null_imput

In [28]:
# Fill value learned by the mean imputer (a Pipeline can be indexed by step name).
preprocessor.named_transformers_['mean_imputer']['Imputer'].statistics_

array([70.04995837])

In [29]:
# Cross-check: mean of the raw column, for comparison with the imputer's statistic.
train.loc[:, 'LotFrontage'].mean()

70.04995836802665

In [31]:
# Most frequent value learned for each column in the mode group, in group order.
preprocessor.named_transformers_['mode_imputer']['Imputer'].statistics_

array(['Grvl', 'None', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd'],
      dtype=object)

In [34]:
# Apply the already-fitted imputers to the training features, then to the test frame.
X_train_clean, test_clean = map(preprocessor.transform, (X_train, test))

In [35]:
# Display the imputed training array. It is an object-dtype array because
# numeric and string columns are combined in a single result.
X_train_clean

array([[65.0, 196.0, 2003.0, ..., 'Null', 'Null', 'Null'],
       [80.0, 0.0, 1976.0, ..., 'Null', 'Null', 'Null'],
       [68.0, 162.0, 2001.0, ..., 'Null', 'Null', 'Null'],
       ...,
       [66.0, 0.0, 1941.0, ..., 'Null', 'GdPrv', 'Shed'],
       [68.0, 0.0, 1950.0, ..., 'Null', 'Null', 'Null'],
       [75.0, 0.0, 1965.0, ..., 'Null', 'Null', 'Null']], dtype=object)

In [36]:
# Display the imputed test array, filled with the statistics fitted on X_train.
test_clean

array([[80.0, 0.0, 1961.0, ..., 'Null', 'MnPrv', 'Null'],
       [81.0, 108.0, 1958.0, ..., 'Null', 'Null', 'Gar2'],
       [74.0, 0.0, 1997.0, ..., 'Null', 'MnPrv', 'Null'],
       ...,
       [160.0, 0.0, 1960.0, ..., 'Null', 'Null', 'Null'],
       [62.0, 0.0, 1980.0, ..., 'Null', 'MnPrv', 'Shed'],
       [74.0, 94.0, 1993.0, ..., 'Null', 'Null', 'Null']], dtype=object)

In [39]:
# Inspect the fitted transformers. The final 'remainder' entry lists, by position,
# the columns that were not assigned to any imputer and were dropped.
preprocessor.transformers_

[('mean_imputer',
  Pipeline(steps=[('Imputer', SimpleImputer())]),
  ['LotFrontage']),
 ('median_imputer',
  Pipeline(steps=[('Imputer', SimpleImputer(strategy='median'))]),
  ['MasVnrArea', 'GarageYrBlt']),
 ('mode_imputer',
  Pipeline(steps=[('Imputer', SimpleImputer(strategy='most_frequent'))]),
  ['Alley',
   'MasVnrType',
   'BsmtQual',
   'BsmtCond',
   'BsmtExposure',
   'BsmtFinType1',
   'BsmtFinType2',
   'Electrical',
   'FireplaceQu']),
 ('null_imputer',
  Pipeline(steps=[('Imputer',
                   SimpleImputer(fill_value='Null', strategy='constant'))]),
  ['GarageType',
   'GarageFinish',
   'GarageQual',
   'GarageCond',
   'PoolQC',
   'Fence',
   'MiscFeature']),
 ('remainder',
  'drop',
  [0,
   1,
   2,
   4,
   5,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   27,
   28,
   29,
   34,
   36,
   37,
   38,
   39,
   40,
   41,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
  

In [45]:
# Rebuild a labelled frame from the transformed array. The columns come out in
# the order the groups were passed to the ColumnTransformer, so the same
# concatenation supplies the names.
# The array is object-dtype, which would leave every column (including the
# numeric ones) as object; infer_objects() restores numeric dtypes where the
# values allow it and leaves string columns untouched. The original row index
# is kept so rows stay aligned with X_train / y_train.
X_train_clean_null = pd.DataFrame(
    X_train_clean,
    columns=num_var_mean + num_var_median + cat_var_mode + cat_var_null,
    index=X_train.index,
).infer_objects()
X_train_clean_null.head()

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,65.0,196.0,2003.0,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,Null,Null,Null
1,80.0,0.0,1976.0,Grvl,,Gd,TA,Gd,ALQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,Null,Null,Null
2,68.0,162.0,2001.0,Grvl,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,Null,Null,Null
3,60.0,0.0,1998.0,Grvl,,TA,Gd,No,ALQ,Unf,SBrkr,Gd,Detchd,Unf,TA,TA,Null,Null,Null
4,84.0,350.0,2000.0,Grvl,BrkFace,Gd,TA,Av,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,Null,Null,Null


In [44]:
# Total count of missing cells left after imputation (per-column sums, then summed).
X_train_clean_null.isna().sum().sum()

0

In [46]:
# Category frequencies in the raw column; missing entries are excluded from the counts.
train['Alley'].value_counts(dropna=True)

Grvl    50
Pave    41
Name: Alley, dtype: int64

In [47]:
# Same column after mode imputation, for comparison with the raw frequencies.
X_train_clean_null['Alley'].value_counts(dropna=True)

Grvl    1419
Pave      41
Name: Alley, dtype: int64

In [48]:
# Frequencies after constant imputation; previously missing entries appear as 'Null'.
X_train_clean_null['MiscFeature'].value_counts(dropna=True)

Null    1406
Shed      49
Gar2       2
Othr       2
TenC       1
Name: MiscFeature, dtype: int64

***Thank You***