In [1]:
# Filling in missing values with scikit-learn's imputers

In [1]:
from sklearn.impute import SimpleImputer
import numpy as np

In [2]:
# Mean-strategy imputer: NaN entries are treated as missing.
imp = SimpleImputer(strategy="mean", missing_values=np.nan)

# 3x2 toy array with a single missing value in the first column.
x = np.array(
    [
        [1, 2],
        [np.nan, 3],
        [7, 6],
    ]
)

# Mean over all non-NaN entries of the whole array (no axis is given).
np.nanmean(x)

3.8

In [3]:
# Median of all non-NaN entries of x; no axis is given, so the whole array is used.
np.nanmedian(x)

3.0

In [4]:
# Fit the mean imputer on x and return x with its NaN entry filled in.
# The fill value is computed per column (here the mean of 1 and 7).
imp.fit_transform(x)

array([[1., 2.],
       [4., 3.],
       [7., 6.]])

In [5]:
# Plain nested list (not an ndarray) with one missing entry in each column.
X = [
    [np.nan, 2],
    [6, np.nan],
    [7, 6],
]
X

[[nan, 2], [6, nan], [7, 6]]

In [6]:
import scipy.sparse as sp 
# Sparse (CSC) training matrix; here -1, not NaN, is the missing-value marker.
X = sp.csc_matrix(
    [
        [1, 2],
        [0, -1],
        [8, 4],
    ]
)

# Rebind `imp` to a mean imputer that treats -1 as missing, then fit it on X.
imp = SimpleImputer(strategy="mean", missing_values=-1)
imp.fit(X)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=-1, strategy='mean', verbose=0)

In [7]:
# Impute the -1 entries of the sparse matrix, then densify the result for display.
imp.transform(X).toarray()

array([[1., 2.],
       [0., 3.],
       [8., 4.]])

In [8]:
# Unseen sparse data in which -1 again marks the missing entries.
X_test = sp.csc_matrix(
    [
        [-1, 2],
        [6, -1],
        [7, 6],
    ]
)

# Fill the -1 entries with the statistics learned during fit and show a dense view.
print(imp.transform(X_test).toarray())

[[3. 2.]
 [6. 3.]
 [7. 6.]]


In [9]:
import pandas as pd
# Two categorical columns, each containing exactly one missing (NaN) entry.
df = pd.DataFrame(
    data=[
        ["a", "x"],
        [np.nan, "y"],
        ["a", np.nan],
        ["b", "y"],
    ],
    dtype="category",
)
df

Unnamed: 0,0,1
0,a,x
1,,y
2,a,
3,b,y


In [10]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [11]:
# Rebind `imp` to an iterative imputer; the fixed random_state keeps runs repeatable.
imp = IterativeImputer(
    random_state=0,
    max_iter=10,
)
imp

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=0,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [12]:
# Training rows: in every complete row the second feature is twice the first;
# the last two rows each have one missing entry.
imp.fit(
    [
        [1, 2],
        [3, 6],
        [4, 8],
        [np.nan, 3],
        [7, np.nan],
    ]
)

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=0,
                 sample_posterior=False, skip_complete=False, tol=0.001,
                 verbose=0)

In [13]:
# Rows to impute: every row is missing exactly one of its two features.
X_test = [
    [np.nan, 2],
    [6, np.nan],
    [np.nan, 6],
]
X_test

[[nan, 2], [6, nan], [nan, 6]]

In [16]:
# Impute the missing entries of X_test with the fitted iterative imputer and
# round the result so the printed values are easy to read.
print(np.round(imp.transform(X_test)))

[[ 1.  2.]
 [ 6. 12.]
 [ 3.  6.]]


In [17]:
from sklearn.impute import MissingIndicator
# Toy integer matrix; the -1 entries play the role of missing values below.
X = np.array(
    [
        [-1, -1, 1, 3],
        [4, -1, 0, -1],
        [8, -1, 1, 0],
    ]
)
X

array([[-1, -1,  1,  3],
       [ 4, -1,  0, -1],
       [ 8, -1,  1,  0]])

In [18]:
# Indicator transformer that treats the value -1 as "missing".
indicator = MissingIndicator(missing_values=-1)

In [19]:
# Echo the estimator so its parameters (including defaults) are shown.
indicator

MissingIndicator(error_on_new=True, features='missing-only', missing_values=-1,
                 sparse='auto')

In [20]:
# Fit the indicator on X and build a boolean mask that is True where X equals -1.
# The result has fewer columns than X: the column without any -1 is left out.
mask_missing_values_only = indicator.fit_transform(X)
mask_missing_values_only

array([[ True,  True, False],
       [False,  True,  True],
       [False,  True, False]])

In [21]:
# Indices of the input columns that the mask columns correspond to.
indicator.features_

array([0, 1, 3], dtype=int64)

In [22]:
# With features='all' the indicator keeps one mask column per input feature,
# whether or not that feature contains any missing value.
indicator = MissingIndicator(missing_values=-1, features='all')
mask_all = indicator.fit_transform(X)
# Only the last expression of a cell is echoed, so a bare `mask_all` placed
# before this line had no effect; inspect `mask_all` in its own cell if needed.
indicator.features_

array([0, 1, 2, 3])

In [23]:
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.neighbors import KNeighborsClassifier

In [26]:
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.model_selection import train_test_split

In [27]:
# Load the iris features and labels as plain arrays.
X, y = load_iris(return_X_y=True)

# Random boolean mask with the same shape as X, marking the entries that will
# be turned into missing values. A seeded generator makes the notebook
# reproducible, and the builtin `bool` replaces the `np.bool` alias, which was
# removed from NumPy (deprecated in 1.20, gone in 1.24).
rng = np.random.RandomState(0)
mask = rng.randint(0, 2, size=X.shape).astype(bool)

In [28]:
# Blank out the masked entries in place, turning them into missing values.
X[mask] = np.nan

# Hold out 100 samples for testing; the fixed random_state keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=100,
    random_state=0,
)

In [30]:
# Concatenate two views of the data side by side: the mean-imputed features
# and the boolean missing-value indicators. Fit both on the training split.
transformer = FeatureUnion(
    transformer_list=[
        ("features", SimpleImputer(strategy="mean")),
        ("indicators", MissingIndicator()),
    ]
).fit(X_train, y_train)

# Apply the fitted union to the held-out samples and check the output width.
results = transformer.transform(X_test)
results.shape

(100, 8)

In [31]:
# k-nearest-neighbours classifier with its default parameters.
clf = KNeighborsClassifier()

In [32]:
# X_train / X_test still contain NaN, and KNeighborsClassifier raises
# "ValueError: Input contains NaN" when fitted on them directly. Chain the
# imputation + indicator transformer in front of the classifier so the data
# is completed before it reaches the estimator.
clf = make_pipeline(transformer, KNeighborsClassifier())
clf = clf.fit(X_train, y_train)
results = clf.predict(X_test)
results.shape

(100,)