In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling 

For various reasons, many real-world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets, however, are incompatible with scikit-learn estimators, which assume that all values in an array are numerical and that all have and hold meaning. A basic strategy for using incomplete datasets is to discard entire rows and/or columns containing missing values. However, this comes at the price of losing data that may be valuable (even though incomplete). A better strategy is to impute the missing values, i.e., to infer them from the known part of the data. 
1. Univariate vs. Multivariate Imputation
2. Univariate feature imputation
3. Multivariate feature imputation

The SimpleImputer class provides basic strategies for imputing missing values. Missing values can be imputed with a provided constant value, or using the statistics (mean, median or most frequent) of each column in which the missing values are located. This class also allows for different missing values encodings.

The following snippet demonstrates how to replace missing values, encoded as np.nan, using the mean value of the columns (axis 0) that contain the missing values:

In [2]:
from sklearn.impute import SimpleImputer
# Mean imputer: every np.nan is replaced by the mean of its own column.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Toy 3x2 array with a single missing entry in the first column.
x = np.array([[1, 2], [np.nan, 3], [7, 6]])
# With no axis argument, nanmean averages every non-NaN entry of the whole
# array ((1+2+3+7+6)/5); it is not the per-column mean the imputer fills in.
np.nanmean(x)

3.8

In [3]:
# Median of all non-NaN entries of x (no axis given), not a per-column median.
np.nanmedian(x)

3.0

In [4]:
# Learn the column means from x and fill its NaN in a single step.
imp.fit_transform(x)

array([[1., 2.],
       [4., 3.],
       [7., 6.]])

In [5]:
# Plain nested list (not an ndarray) with one missing value in each column.
X = [[np.nan, 2], [6, np.nan], [7, 6]]
X

[[nan, 2], [6, nan], [7, 6]]

In [6]:
# Refit on X: each NaN becomes the mean of the observed values in its column
# (first column: (6+7)/2, second column: (2+6)/2).
print(imp.fit_transform(X))

[[6.5 2. ]
 [6.  4. ]
 [7.  6. ]]


The SimpleImputer class also supports sparse matrices:

Supported values for the `strategy` parameter: ['mean', 'median', 'most_frequent', 'constant']

In [7]:
import scipy.sparse as sp
# Sparse input in which -1 (rather than NaN) is the missing-value marker.
X = sp.csc_matrix([[5, 2], [0, -1], [8, 4]])
imp = SimpleImputer(missing_values=-1, strategy='mean')
# Fit only; the transform is applied in the following cells.
imp.fit(X)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=-1, strategy='mean', verbose=0)

In [8]:
# Convert the imputed sparse result to a dense array for display.
imp.transform(X).toarray()

array([[5., 2.],
       [0., 3.],
       [8., 4.]])

In [9]:
# New data with -1 markers, filled using the column means learned by imp.fit(X).
# The first-column fill value is 13/3, i.e. the 0 in X counted as an observed value.
X_test = sp.csc_matrix([[-1, 2], [6, -1], [7, 6]])
print(imp.transform(X_test).toarray())

[[4.33333333 2.        ]
 [6.         3.        ]
 [7.         6.        ]]


In [10]:
import pandas as pd
# Two categorical columns, each with exactly one missing entry.
category_rows = [
    ["a", "x"],
    [np.nan, "y"],
    ["a", np.nan],
    ["b", "y"],
]
df = pd.DataFrame(category_rows, dtype="category")

df

Unnamed: 0,0,1
0,a,x
1,,y
2,a,
3,b,y


In [11]:
# Mode imputation: each missing entry is replaced by its column's most frequent
# value, which also works for the string categories held in df.
imp = SimpleImputer(strategy="most_frequent")
print(imp.fit_transform(df))

[['a' 'x']
 ['a' 'y']
 ['a' 'y']
 ['b' 'y']]


In [12]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [87]:

# Multivariate imputer with a capped iteration count and a fixed random_state.
imp = IterativeImputer(max_iter=10, random_state=0)
imp

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=0,
                 sample_posterior=False, tol=0.001, verbose=0)

In [89]:

# Training rows: wherever both columns are observed, the second is twice the first.
imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]])

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=0,
                 sample_posterior=False, tol=0.001, verbose=0)

In [91]:

# Unseen rows, each missing exactly one of the two features.
X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
X_test

[[nan, 2], [6, nan], [nan, 6]]

In [93]:

# The fitted model has learned that the second feature is double the first,
# so each missing value is filled accordingly (rounded here for display).
print(np.round(imp.transform(X_test)))

[[ 1.  2.]
 [ 6. 12.]
 [ 3.  6.]]


In [35]:
from sklearn.impute import MissingIndicator
# Integer matrix in which -1 is the placeholder for a missing value.
rows_with_placeholders = [
    [-1, -1, 1, 3],
    [4, -1, 0, -1],
    [8, -1, 1, 0],
]
X = np.array(rows_with_placeholders)
X

array([[-1, -1,  1,  3],
       [ 4, -1,  0, -1],
       [ 8, -1,  1,  0]])

In [36]:
# Treat -1 as the missing marker; the other parameters keep their defaults.
indicator = MissingIndicator(missing_values=-1)

In [37]:
# Show the estimator's parameters, including the defaults.
indicator

MissingIndicator(error_on_new=True, features='missing-only', missing_values=-1,
                 sparse='auto')

In [38]:
# Boolean mask with one column per feature of X that contained a -1.
mask_missing_values_only = indicator.fit_transform(X)
mask_missing_values_only

array([[ True,  True, False],
       [False,  True,  True],
       [False,  True, False]])

In [39]:
# Indices of the input columns that the mask columns correspond to.
indicator.features_

array([0, 1, 3], dtype=int64)

In [30]:
# features="all" requests a mask column for every input column, not only for
# the columns that contained missing values at fit time.
# NOTE(review): the ValueError recorded below this cell says X contained NaN
# when it was run, so X was presumably stale state from another cell rather
# than the integer array with -1 placeholders. TODO: re-run in notebook order.
indicator = MissingIndicator(missing_values=-1, features="all")
mask_all = indicator.fit_transform(X)
# NOTE(review): a bare expression in the middle of a cell is not displayed;
# only the cell's last expression is.
mask_all



indicator.features_

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [31]:
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.tree import DecisionTreeClassifier
# Load iris and blank out about half of the entries at random so that there
# is something to impute.
X, y = load_iris(return_X_y=True)
# Seeded generator so the missing-value pattern is the same on every run.
rng = np.random.RandomState(0)
# The `np.bool` alias has been removed from NumPy; builtin `bool` is the
# supported spelling of the same dtype.
mask = rng.randint(0, 2, size=X.shape).astype(bool)
X[mask] = np.nan
# Only the training labels are used afterwards, so the test labels are dropped.
X_train, X_test, y_train, _ = train_test_split(X, y, test_size=100,
                                               random_state=0)

In [None]:
# Concatenate the mean-imputed features with the boolean missing-value
# indicators so downstream steps see both.
union_steps = [
    ('features', SimpleImputer(strategy='mean')),
    ('indicators', MissingIndicator()),
]
transformer = FeatureUnion(transformer_list=union_steps)
# fit returns the estimator itself, so `transformer` stays bound to the same object.
transformer.fit(X_train, y_train)
results = transformer.transform(X_test)
results.shape


In [None]:
# Build the classifier pipeline here so that `clf` exists on a fresh kernel:
# the imputation/indicator FeatureUnion feeds a decision tree.
clf = make_pipeline(transformer, DecisionTreeClassifier())
clf = clf.fit(X_train, y_train)
# One predicted class label per test row.
results = clf.predict(X_test)
results.shape