### Simple examples of data imputation with scikit-learn
#### (read and play)

In [0]:
import numpy as np
import pandas as pd
from io import StringIO

Creating some data with missing values

In [0]:
# Three data rows, five columns; the empty fields become missing values (NaN).
csvdata = '''
A,B,C,D,E
1,2,3,4,
5,6,,8,
0,,11,12,13
'''

# Wrap the text in a file-like object so read_csv can parse it.
csv_buffer = StringIO(csvdata)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D,E
0,1,2.0,3.0,4,
1,5,6.0,,8,
2,0,,11.0,12,13.0


Radical choice: delete whole column

In [0]:
# Drop column "E" entirely. `columns=` states the intent directly (no axis
# number to remember), and rebinding `df` avoids mutating the frame in place.
df = df.drop(columns=["E"])
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,4
1,5,6.0,,8
2,0,,11.0,12


Recreating the original DataFrame

In [0]:
# Re-parse the CSV text so `df` is back to its original, unmodified state.
csv_buffer = StringIO(csvdata)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D,E
0,1,2.0,3.0,4,
1,5,6.0,,8,
2,0,,11.0,12,13.0


Less radical: delete only the rows with a missing value in column "C"

In [0]:
# Keep only the rows where "C" is present.
# `how` and `thresh` are mutually exclusive in pandas: passing both (even
# `thresh=None`) raises TypeError on recent versions, so only `how` is given.
df = df.dropna(axis=0, how="any", subset=["C"])
df

Unnamed: 0,A,B,C,D,E
0,1,2.0,3.0,4,
2,0,,11.0,12,13.0


If you do not specify `subset`, `dropna` deletes every row that has a missing value in any column

In [0]:
# With no `subset`, every column is checked: a row is dropped as soon as
# any of its values is missing.
# `how` and `thresh` are mutually exclusive in pandas: passing both (even
# `thresh=None`) raises TypeError on recent versions, so only `how` is given.
df = df.dropna(axis=0, how="any")
df

Unnamed: 0,A,B,C,D,E


Imputing with scikit-learn

In [0]:
from sklearn.impute import SimpleImputer

In [0]:
# Start again from the unmodified data before trying the imputers.
csv_buffer = StringIO(csvdata)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D,E
0,1,2.0,3.0,4,
1,5,6.0,,8,
2,0,,11.0,12,13.0


Imputing mean values

In [0]:
# The imputer is given column "C" as a single-column 2-D array.
c_values = df["C"].values.reshape(-1, 1)

# Fit on the column, then replace each NaN using the 'mean' strategy.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df["C"] = imp.fit(c_values).transform(c_values)
df

Unnamed: 0,A,B,C,D,E
0,1,2.0,3.0,4,
1,5,6.0,7.0,8,
2,0,,11.0,12,13.0


In [0]:
# Reload the data so the next example starts with the NaN in "C" restored.
csv_buffer = StringIO(csvdata)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D,E
0,1,2.0,3.0,4,
1,5,6.0,,8,
2,0,,11.0,12,13.0


Imputing a constant value

In [0]:
# The imputer is given column "C" as a single-column 2-D array.
c_values = df["C"].values.reshape(-1, 1)

# With strategy='constant', every NaN is replaced by `fill_value` (200 here).
imp = SimpleImputer(missing_values=np.nan, fill_value=200, strategy='constant')
df["C"] = imp.fit(c_values).transform(c_values)
df

Unnamed: 0,A,B,C,D,E
0,1,2.0,3.0,4,
1,5,6.0,200.0,8,
2,0,,11.0,12,13.0


In [0]:
# Reload the data once more so every original NaN is back.
csv_buffer = StringIO(csvdata)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D,E
0,1,2.0,3.0,4,
1,5,6.0,,8,
2,0,,11.0,12,13.0


Iterative imputing (experimental)

In [0]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [0]:
# Remember the column labels so they can be reattached to the transformed
# values when the DataFrame is rebuilt below.
columns = df.columns

# Fit on the whole frame; random_state=0 fixes the imputer's random seed.
imp_mean = IterativeImputer(random_state=0).fit(df)

# Rebuild a DataFrame from the imputed values with the original column names.
imputed_values = imp_mean.transform(df)
df = pd.DataFrame(imputed_values, columns=columns)
df



Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,13.0
1,5.0,6.0,7.272594,8.0,13.0
2,0.0,6.94924,11.0,12.0,13.0
