In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.impute import SimpleImputer

# Importing dataset

In [3]:
# Read the training split of the house-prices dataset into a DataFrame.
train_csv_path = './Datasets/house_prices/train.csv'
df = pd.read_csv(train_csv_path)

In [4]:
# Lift pandas' display limits so every column and every row is shown (no truncation).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Dropping the columns with more than 25% missing values

In [5]:
# Percentage of missing values in each column; del_cols keeps only the
# names of the columns where that percentage is above 25.
missing_pct = df.isnull().mean() * 100
del_cols = missing_pct[missing_pct > 25].keys()

In [6]:
# Build the cleaned frame without the high-missing-value columns; df itself is left untouched.
df2 = df.drop(columns=del_cols)
df2.shape

(1460, 76)

# Missing value imputation (sklearn.impute.SimpleImputer)

### SimpleImputer(  ) parameters

- missing_values : the placeholder that marks a missing value (default `np.nan`); every occurrence of this value is imputed
- strategy : 
    - 'mean' : fills the NA values with the MEAN value of the column // [default value]
    - 'median' : fills the NA values with the MEDIAN value of the column
    - 'most_frequent' : fills the NA values with the MODE value of the column
    - 'constant' : fills the NA values with the CONSTANT/VARIABLE value we provided
- fill_value : if we selected strategy = 'constant', we provide the replacement value here

### SimpleImputer attributes
- .statistics_ : provides the replacement for NA values for each column according to the strategy

### SimpleImputer methods
- .fit( ) : calculates the replacement for the NAN values according to the mentioned strategy
- .transform( ) : applies the values computed in fit to respective NAN columns
- .fit_transform( ) : does the work of both fit and transform in a single call

# Numerical value imputation Using SimpleImputer

In [7]:
# selecting the numerical (int64 / float64) columns of the cleaned frame

numeric_dtypes = ['int64', 'float64']
num_cols = df2.select_dtypes(include=numeric_dtypes)

In [8]:
# names of the numerical columns that contain at least one null value

num_null_counts = num_cols.isnull().sum()
null_num_cols = num_null_counts[num_null_counts > 0].keys()
null_num_cols

Index(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], dtype='object')

In [9]:
# creating an imputer object that replaces NaN values with the mean of the column

mean_imputer = SimpleImputer(strategy='mean')

In [10]:
# computing the mean (the imputer's strategy) of every numerical column that has null values

mean_imputer.fit(df2[null_num_cols])

SimpleImputer()

In [11]:
# replacement value (the column mean computed by fit) for the NaN values of each column

mean_imputer.statistics_

array([  70.04995837,  103.68526171, 1978.50616389])

In [12]:
# imputing the NaN values (transform returns a separate 2D array; it does not assign the values to the original dataset)

mean_imputer.transform(df2[null_num_cols])

array([[  65.,  196., 2003.],
       [  80.,    0., 1976.],
       [  68.,  162., 2001.],
       ...,
       [  66.,    0., 1941.],
       [  68.,    0., 1950.],
       [  75.,    0., 1965.]])

In [13]:
# upating the values in dataset

df[null_num_cols] = mean_imputer.transform(df2[null_num_cols])

In [14]:
df[null_num_cols].isnull().sum().sum()

0

# Categorical value imputation Using SimpleImputer

In [15]:
# names of the categorical (object-dtype) columns that contain at least one null value

cat_null_counts = df2.select_dtypes(include=['O']).isnull().sum()
null_cat_cols = cat_null_counts[cat_null_counts > 0].keys()
null_cat_cols

Index(['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond'],
      dtype='object')

In [16]:
# creating an imputer object that replaces NaN values with the most frequent value (mode) of the column

mode_imputer = SimpleImputer(strategy='most_frequent')

In [17]:
# fit the imputer and transform in a single step, then write the imputed values back into df2

imputed_cat_values = mode_imputer.fit_transform(df2[null_cat_cols])
df2[null_cat_cols] = imputed_cat_values

In [18]:
# replacement value (most frequent value) used for the NaN values of each categorical column
mode_imputer.statistics_

array(['None', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Attchd', 'Unf',
       'TA', 'TA'], dtype=object)

In [19]:
# total number of null values left in the imputed categorical columns of df2
df2[null_cat_cols].isnull().sum().sum()

0