In [160]:
import numpy as np
import pandas as pd

In [161]:
df = pd.read_csv('mydata.csv')
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,,6.0,'Good'
2,7,,9.0,'Excellent'
3,10,11.0,12.0,
4,13,14.0,15.0,'Excellent'
5,16,17.0,,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


There are a lot of missing values in this file. We already covered many possibilities. One of them is do replace a column by the mean value:

In [162]:
df['B']=df['B'].fillna(value=df['B'].mean())
df['C']=df['C'].fillna(value=df['C'].mean())

A different approach is required for column D where you want to use the most frequent value

In [163]:
df['D']=df['D'].fillna(value=df['D'].value_counts().index[0])

In [164]:
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,11.166667,6.0,'Good'
2,7,11.166667,9.0,'Excellent'
3,10,11.0,12.0,'Excellent'
4,13,14.0,15.0,'Excellent'
5,16,17.0,11.428571,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


Of course do you want to do this all the time ?

You first initialize an instance of the SimpleImputer class by indicating the strategy (mean) as well as specifying the missing values that you want to locate (np.nan):

In [165]:
from sklearn.impute import SimpleImputer

In [166]:
df = pd.read_csv('mydata.csv')
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

Once the instance is created, you use the `fit()` function to fit the imputer on the column(s) that you want to work on:

In [167]:
imputer = imputer.fit(df[['B','C']])

You can now use the `transform()` function to fill the missing values based on the strategy you specified in the initializer of the SimpleImputer class:

In [168]:
df[['B','C']] = imputer.transform(df[['B','C']])

In [169]:
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,11.166667,6.0,'Good'
2,7,11.166667,9.0,'Excellent'
3,10,11.0,12.0,
4,13,14.0,15.0,'Excellent'
5,16,17.0,11.428571,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


Now doing column D:

In [170]:
imputer = SimpleImputer(strategy='most_frequent', 
                        missing_values=np.nan)
imputer = imputer.fit(df[['D']])
df[['D']] = imputer.transform(df[['D']])

In [171]:
df

Unnamed: 0,A,B,C,D
0,1,2.0,3.0,'Good'
1,4,11.166667,6.0,'Good'
2,7,11.166667,9.0,'Excellent'
3,10,11.0,12.0,'Excellent'
4,13,14.0,15.0,'Excellent'
5,16,17.0,11.428571,'Fair'
6,19,12.0,12.0,'Excellent'
7,20,11.0,23.0,'Fair'


This can still be done much better

In [172]:
from sklearn.compose import ColumnTransformer

In [173]:
df = pd.read_csv('mydata.csv')
df.values

array([[1, 2.0, 3.0, "'Good'"],
       [4, nan, 6.0, "'Good'"],
       [7, nan, 9.0, "'Excellent'"],
       [10, 11.0, 12.0, nan],
       [13, 14.0, 15.0, "'Excellent'"],
       [16, 17.0, nan, "'Fair'"],
       [19, 12.0, 12.0, "'Excellent'"],
       [20, 11.0, 23.0, "'Fair'"]], dtype=object)

In [174]:
First_step=('replace_with_mean',
            SimpleImputer(strategy='mean',missing_values=np.nan),
            [1,2])

Second_step=('replace_with_mostfrequent',
             SimpleImputer(strategy='most_frequent',missing_values=np.nan),
            [3])

In [175]:
my_transformer = ColumnTransformer(transformers=[First_step,Second_step],remainder='passthrough')


In [176]:
my_transformer.fit(df.values)

ColumnTransformer(remainder='passthrough',
                  transformers=[('replace_with_mean', SimpleImputer(), [1, 2]),
                                ('replace_with_mostfrequent',
                                 SimpleImputer(strategy='most_frequent'),
                                 [3])])

In [177]:
Y=my_transformer.transform(df.values)

In [178]:
pd.DataFrame(Y,columns=['A','B','C','D'])

Unnamed: 0,A,B,C,D
0,2.0,3.0,'Good',1
1,11.166667,6.0,'Good',4
2,11.166667,9.0,'Excellent',7
3,11.0,12.0,'Excellent',10
4,14.0,15.0,'Excellent',13
5,17.0,11.428571,'Fair',16
6,12.0,12.0,'Excellent',19
7,11.0,23.0,'Fair',20
