In [2]:
import numpy as np 
import pandas as pd
# IterativeImputer is still experimental: this import must come first to enable it explicitly
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer, MissingIndicator


Create a DataFrame 

In [3]:
# Toy dataset used throughout the notebook: three numeric columns,
# each containing exactly one missing entry (NaN).
columns_with_gaps = dict(
    A=[1, 2, np.nan, 4],
    B=[5, np.nan, 7, 8],
    C=[9, np.nan, 4, 4],
)
data = pd.DataFrame(columns_with_gaps)

print(data)

     A    B    C
0  1.0  5.0  9.0
1  2.0  NaN  NaN
2  NaN  7.0  4.0
3  4.0  8.0  4.0


# 1. SimpleImputer

This class provides a basic imputation strategy for missing values. It replaces missing values with either the mean, median, most frequent or a constant value of each column.

**1.1. Mean Strategy**

Here each missing value is replaced by the mean of the observed values in its column.
Example: for column 'A', the mean is (1+2+4)/3 = 2.333,
so the missing value is replaced with 2.333.

In [4]:
# Mean imputation: every NaN is filled with the mean of the observed values in its column.
simp_imputer_mean = SimpleImputer(strategy='mean')
# Rebuild a DataFrame from the imputed values, restoring both the column labels and
# the row index of `data` so the result stays aligned with the input frame.
data_simp_imp_mean = pd.DataFrame(
    simp_imputer_mean.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
print(data_simp_imp_mean)

          A         B         C
0  1.000000  5.000000  9.000000
1  2.000000  6.666667  5.666667
2  2.333333  7.000000  4.000000
3  4.000000  8.000000  4.000000


**1.2. Median Strategy**

Here the missing values got replaced by the Median of the respective columns.
Example : For Column 'B' , the list is [5,7,8] so 7 becomes the median.
So the missing value got replaced with 7

In [5]:
# Median imputation: every NaN is filled with the median of the observed values in its column.
simp_imputer_median = SimpleImputer(strategy='median')
# Rebuild a DataFrame from the imputed values, restoring both the column labels and
# the row index of `data` so the result stays aligned with the input frame.
data_simp_imp_median = pd.DataFrame(
    simp_imputer_median.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
print(data_simp_imp_median)

     A    B    C
0  1.0  5.0  9.0
1  2.0  7.0  4.0
2  2.0  7.0  4.0
3  4.0  8.0  4.0


**1.3. Most Frequent Strategy**

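Here each missing value is replaced by the most frequent value of its column.
Example: for column 'C', 4 is the most frequent value,
so the missing value is replaced with 4. When several values are tied for most frequent (as in columns 'A' and 'B', where every value occurs exactly once), the smallest of them is used, which is why 'A' is filled with 1 and 'B' with 5.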

In [6]:
# Most-frequent imputation: every NaN is filled with the most common observed value
# of its column.
simp_imputer_mfreq = SimpleImputer(strategy='most_frequent')
# Rebuild a DataFrame from the imputed values, restoring both the column labels and
# the row index of `data` so the result stays aligned with the input frame.
data_simp_imp_mfreq = pd.DataFrame(
    simp_imputer_mfreq.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
print(data_simp_imp_mfreq)

     A    B    C
0  1.0  5.0  9.0
1  2.0  5.0  4.0
2  1.0  7.0  4.0
3  4.0  8.0  4.0


**1.4. Constant Strategy**

Here the missing values are replaced by a constant value that we supply.
For the constant strategy we need to pass one extra parameter, 'fill_value', holding the constant we want to fill in place of the missing values.

In [7]:
# Constant imputation: every NaN is replaced by the user-supplied fill_value (99 here).
simp_imputer_const = SimpleImputer(strategy='constant', fill_value=99)
# Rebuild a DataFrame from the imputed values, restoring both the column labels and
# the row index of `data` so the result stays aligned with the input frame.
data_simp_imp_const = pd.DataFrame(
    simp_imputer_const.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
print(data_simp_imp_const)

      A     B     C
0   1.0   5.0   9.0
1   2.0  99.0  99.0
2  99.0   7.0   4.0
3   4.0   8.0   4.0


Since we passed 99 as the constant value, it was filled in at every missing position.

# 2. KNNImputer

This class provides imputation using k-Nearest Neighbors. It replaces missing values by taking the average of the k-nearest neighbors.

We can tune the 'n_neighbors' parameter as required. Each missing value is imputed with the average of that feature over the 'n_neighbors' nearest rows in the data set.

In [8]:
# k-NN imputation with k=2: each NaN is filled with the average of that column
# over the 2 nearest rows.
knn_imputer = KNNImputer(n_neighbors=2)
# Rebuild a DataFrame from the imputed values, restoring both the column labels and
# the row index of `data` so the result stays aligned with the input frame.
data_imp_knn = pd.DataFrame(
    knn_imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
print(data_imp_knn)

     A    B    C
0  1.0  5.0  9.0
1  2.0  6.5  6.5
2  2.5  7.0  4.0
3  4.0  8.0  4.0


In [9]:
# k-NN imputation with k=3: each NaN is filled with the average of that column
# over the 3 nearest rows.
knn_imputer = KNNImputer(n_neighbors=3)
# Rebuild a DataFrame from the imputed values, restoring both the column labels and
# the row index of `data` so the result stays aligned with the input frame.
data_imp_knn = pd.DataFrame(
    knn_imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
print(data_imp_knn)

          A         B         C
0  1.000000  5.000000  9.000000
1  2.000000  6.666667  5.666667
2  2.333333  7.000000  4.000000
3  4.000000  8.000000  4.000000


In [10]:
# k-NN imputation with k=5, which is more than the 4 rows `data` contains;
# this cell checks what happens when more neighbors are requested than exist.
knn_imputer = KNNImputer(n_neighbors=5)
# Rebuild a DataFrame from the imputed values, restoring both the column labels and
# the row index of `data` so the result stays aligned with the input frame.
data_imp_knn = pd.DataFrame(
    knn_imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
print(data_imp_knn)

          A         B         C
0  1.000000  5.000000  9.000000
1  2.000000  6.666667  5.666667
2  2.333333  7.000000  4.000000
3  4.000000  8.000000  4.000000


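For each column, at most 3 rows have an observed value and can therefore serve as neighbors, so increasing 'n_neighbors' beyond 3 does not change the result. (With all 3 available neighbors used, the imputed value is simply the column mean, which is why the output matches the SimpleImputer mean strategy.)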

# 3. IterativeImputer

This class provides imputation using a multivariate regression model. It iteratively fills in missing values based on the values of other features in the dataset.

This is a strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion.

The 'max_iter' parameter defines the maximum number of imputation rounds to perform before returning the imputations computed during the final round.

In [11]:
# Iterative imputation capped at 5 rounds; random_state is fixed so reruns are reproducible.
iter_imputer = IterativeImputer(max_iter=5, random_state=0)
# Rebuild a DataFrame from the imputed values, restoring both the column labels and
# the row index of `data` so the result stays aligned with the input frame.
data_imp_iter = pd.DataFrame(
    iter_imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
print(data_imp_iter)

          A         B         C
0  1.000000  5.000000  9.000000
1  2.000000  5.911005  7.231407
2  3.592707  7.000000  4.000000
3  4.000000  8.000000  4.000000




In [12]:
# Iterative imputation capped at 10 rounds; random_state is fixed so reruns are reproducible.
iter_imputer = IterativeImputer(max_iter=10, random_state=0)
# Rebuild a DataFrame from the imputed values, restoring both the column labels and
# the row index of `data` so the result stays aligned with the input frame.
data_imp_iter = pd.DataFrame(
    iter_imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
print(data_imp_iter)

          A         B         C
0  1.000000  5.000000  9.000000
1  2.000000  5.913682  7.234543
2  3.592878  7.000000  4.000000
3  4.000000  8.000000  4.000000


In [13]:
# Iterative imputation capped at 15 rounds; random_state is fixed so reruns are reproducible.
iter_imputer = IterativeImputer(max_iter=15, random_state=0)
# Rebuild a DataFrame from the imputed values, restoring both the column labels and
# the row index of `data` so the result stays aligned with the input frame.
data_imp_iter = pd.DataFrame(
    iter_imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
)
print(data_imp_iter)

          A         B         C
0  1.000000  5.000000  9.000000
1  2.000000  5.913682  7.234543
2  3.592878  7.000000  4.000000
3  4.000000  8.000000  4.000000


After a certain number of iterations the imputed values converge, so raising 'max_iter' further does not change them (the results for max_iter=10 and max_iter=15 are identical).

# 4. MissingIndicator

This class provides a binary indicator of the missing values in a dataset. It returns a similar data matrix with an indication of whether the place has a missing value or not.

In [14]:
# Boolean mask of missing entries. features='all' requests one mask column per input
# column, so the mask has the same shape as `data`.
indicator = MissingIndicator(features='all')
missing_values = indicator.fit_transform(data)
# Label the mask from `data` itself instead of hard-coding 'A'/'B'/'C', and reuse
# data's index: pd.concat(axis=1) aligns rows on index labels, so a mask carrying a
# fresh 0..n-1 index would misalign with any `data` that has a non-default index.
missing_flags = pd.DataFrame(
    missing_values,
    columns=[f"{col}_missing" for col in data.columns],
    index=data.index,
)
data_indicator = pd.concat([data, missing_flags], axis=1)
print(data_indicator)

     A    B    C  A_missing  B_missing  C_missing
0  1.0  5.0  9.0      False      False      False
1  2.0  NaN  NaN      False       True       True
2  NaN  7.0  4.0       True      False      False
3  4.0  8.0  4.0      False      False      False
