## Imputation - Handling Missing Values 

In statistics, imputation is the process of replacing missing data with substituted values.

In [75]:
import numpy as np 
import pandas as pd 

# Load the horse-colic CSV straight from GitHub.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
dataframe = pd.read_csv(
    url,
    header=None,    # do not treat the first row as column names; columns get integer labels
    na_values='?',  # parse '?' cells as NaN
)

In [67]:
# Preview the loaded frame; the rendered output elides the middle rows and columns.
dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1.0,1,533886,,120.0,70.0,4.0,,4.0,2.0,...,55.0,65.0,,,3.0,2,3205,0,0,2
296,2.0,1,527702,37.2,72.0,24.0,3.0,2.0,4.0,2.0,...,44.0,,3.0,3.3,3.0,1,2208,0,0,1
297,1.0,1,529386,37.5,72.0,30.0,4.0,3.0,4.0,1.0,...,60.0,6.8,,,2.0,1,3205,0,0,2
298,1.0,1,530612,36.5,100.0,24.0,3.0,3.0,3.0,1.0,...,50.0,6.0,3.0,3.4,1.0,1,2208,0,0,1


In [68]:
# Per-column dtype and non-null count; a count below the number of entries means the column has missing values.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 28 columns):
0     299 non-null float64
1     300 non-null int64
2     300 non-null int64
3     240 non-null float64
4     276 non-null float64
5     242 non-null float64
6     244 non-null float64
7     231 non-null float64
8     253 non-null float64
9     268 non-null float64
10    245 non-null float64
11    256 non-null float64
12    244 non-null float64
13    196 non-null float64
14    194 non-null float64
15    53 non-null float64
16    198 non-null float64
17    182 non-null float64
18    271 non-null float64
19    267 non-null float64
20    135 non-null float64
21    102 non-null float64
22    299 non-null float64
23    300 non-null int64
24    300 non-null int64
25    300 non-null int64
26    300 non-null int64
27    300 non-null int64
dtypes: float64(21), int64(7)
memory usage: 65.8 KB


In [69]:
# Number of missing (NaN) entries in each column.
dataframe.isna().sum()

0       1
1       0
2       0
3      60
4      24
5      58
6      56
7      69
8      47
9      32
10     55
11     44
12     56
13    104
14    106
15    247
16    102
17    118
18     29
19     33
20    165
21    198
22      1
23      0
24      0
25      0
26      0
27      0
dtype: int64

## Simple Imputer

In [70]:
from sklearn.impute import SimpleImputer

In [71]:
# Mean imputation: learn each column's mean from the data, then use it to
# replace the NaNs in that same data. The result is kept in a separate
# variable, so `dataframe` itself is not modified here.
imputer = SimpleImputer(strategy='mean').fit(dataframe)
dataframe_numpy = imputer.transform(dataframe)

In [72]:
# The imputer returned a plain NumPy array, so the DataFrame's column labels and index are not carried over.
dataframe_numpy

array([[2.00000e+00, 1.00000e+00, 5.30101e+05, ..., 0.00000e+00,
        0.00000e+00, 2.00000e+00],
       [1.00000e+00, 1.00000e+00, 5.34817e+05, ..., 0.00000e+00,
        0.00000e+00, 2.00000e+00],
       [2.00000e+00, 1.00000e+00, 5.30334e+05, ..., 0.00000e+00,
        0.00000e+00, 1.00000e+00],
       ...,
       [1.00000e+00, 1.00000e+00, 5.29386e+05, ..., 0.00000e+00,
        0.00000e+00, 2.00000e+00],
       [1.00000e+00, 1.00000e+00, 5.30612e+05, ..., 0.00000e+00,
        0.00000e+00, 1.00000e+00],
       [1.00000e+00, 1.00000e+00, 5.34618e+05, ..., 0.00000e+00,
        0.00000e+00, 2.00000e+00]])

In [73]:
# Mean-impute every column and keep the result as a DataFrame.
# fit_transform returns a bare NumPy array, so the original column labels and
# index are re-attached explicitly. The frame is rebuilt and rebound instead of
# being assigned through `.iloc[:, :]`: that form writes into the existing
# columns, and whether the integer columns end up int64 or float64 then depends
# on the installed pandas version. Rebuilding always yields float64 columns.
imputer = SimpleImputer(strategy='mean')
dataframe = pd.DataFrame(
    imputer.fit_transform(dataframe),
    columns=dataframe.columns,
    index=dataframe.index,
)

In [74]:
# Re-check the non-null counts after mean imputation; every column should now report the full number of entries.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 28 columns):
0     300 non-null float64
1     300 non-null float64
2     300 non-null float64
3     300 non-null float64
4     300 non-null float64
5     300 non-null float64
6     300 non-null float64
7     300 non-null float64
8     300 non-null float64
9     300 non-null float64
10    300 non-null float64
11    300 non-null float64
12    300 non-null float64
13    300 non-null float64
14    300 non-null float64
15    300 non-null float64
16    300 non-null float64
17    300 non-null float64
18    300 non-null float64
19    300 non-null float64
20    300 non-null float64
21    300 non-null float64
22    300 non-null float64
23    300 non-null float64
24    300 non-null float64
25    300 non-null float64
26    300 non-null float64
27    300 non-null float64
dtypes: float64(28)
memory usage: 65.8 KB


## Fillna

In [77]:
# Replace the missing entries of column 15 with that column's median.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on the `dataframe[15]` selection is chained
# assignment, which can act on a temporary object and, with pandas
# Copy-on-Write enabled, never updates `dataframe`.
# NOTE(review): on a top-to-bottom run `dataframe` has already been fully
# imputed in the Simple Imputer section, so this only has an effect if the
# dataset is reloaded first - confirm the intended cell order.
dataframe[15] = dataframe[15].fillna(dataframe[15].median())

In [76]:
# Median of each column (pandas skips NaN values by default).
dataframe.median()

0          1.00
1          1.00
2     530305.50
3         38.20
4         64.00
5         24.50
6          3.00
7          2.00
8          3.00
9          1.00
10         3.00
11         3.00
12         2.00
13         2.00
14         1.00
15         5.00
16         3.00
17         4.00
18        45.00
19         7.50
20         2.00
21         2.25
22         1.00
23         1.00
24      2673.50
25         0.00
26         0.00
27         2.00
dtype: float64

In [78]:
# Re-check the non-null counts and dtypes after the median fill.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 28 columns):
0     300 non-null float64
1     300 non-null int64
2     300 non-null int64
3     300 non-null float64
4     300 non-null float64
5     300 non-null float64
6     300 non-null float64
7     300 non-null float64
8     300 non-null float64
9     300 non-null float64
10    300 non-null float64
11    300 non-null float64
12    300 non-null float64
13    300 non-null float64
14    300 non-null float64
15    300 non-null float64
16    300 non-null float64
17    300 non-null float64
18    300 non-null float64
19    300 non-null float64
20    300 non-null float64
21    300 non-null float64
22    300 non-null float64
23    300 non-null int64
24    300 non-null int64
25    300 non-null int64
26    300 non-null int64
27    300 non-null int64
dtypes: float64(21), int64(7)
memory usage: 65.8 KB
