### Handling missing values using SimpleImputer

#### sklearn.impute.SimpleImputer

- Imputation transformer for completing missing values.
- New in version 0.20: SimpleImputer replaces the previous sklearn.preprocessing.Imputer

In [104]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [128]:
# Show floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

In [129]:
# Load the Titanic passenger data; the path is relative to the notebook's working directory.
df = pd.read_csv(r'./data/titanic_feature.csv')

In [130]:
df.shape

(891, 12)

In [131]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [132]:
# Count missing values per column to see which features need imputing.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [133]:
# Mean of the non-missing ages (pandas skips NaN by default); this is the value
# the default 'mean' imputation strategy fills in for the missing entries.
df['Age'].mean()

29.69911764705882

In [134]:
# Defaults spelled out: NaN entries are replaced by the mean of their column.
age_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [135]:
type(age_imputer)

sklearn.impute._base.SimpleImputer

In [136]:
# Fit the imputer on the Age column and fill its missing entries in one step.
# The imputer expects 2-D (n_samples, n_features) input, hence the double-bracket
# column selection; ravel() flattens the (n, 1) result so that a plain 1-D array
# is stored in the new column instead of relying on implicit squeezing.
df['age_imputed'] = age_imputer.fit_transform(df[['Age']].to_numpy()).ravel()

In [137]:
# Rows where Age was missing: the original NaN next to the imputed value.
# isna() already yields a boolean mask, so no comparison with True is needed.
df.loc[df['Age'].isna(), ['Age', 'age_imputed']].head()

Unnamed: 0,Age,age_imputed
5,,29.699118
17,,29.699118
19,,29.699118
26,,29.699118
28,,29.699118


In [138]:
# Example 2: a small table with Country, Age, Salary and Purchased columns.
dataset = pd.read_csv(r'./data/Data.csv')

In [139]:
# Display the records that contain at least one missing value.
# Row *positions* (not index labels) are stored, because the result is used with
# .iloc and to index NumPy arrays, both of which are positional. Labels and
# positions only coincide while the frame keeps its default RangeIndex.
indx_missing_val = np.flatnonzero(dataset.isna().any(axis=1).to_numpy())

dataset.iloc[indx_missing_val]

Unnamed: 0,Country,Age,Salary,Purchased
4,Germany,40.0,,Yes
6,Spain,,52000.0,No


In [140]:
# Column means of Age and Salary (NaN entries are skipped), for comparison
# with the values the imputer fills in.
dataset[['Age', 'Salary']].mean()

Age          38.777778
Salary    63777.777778
dtype: float64

In [141]:
# Replace NaN entries with the mean of their column.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

In [142]:
# Fit on the columns at positions 1 and 2 (Age, Salary), impute them, and show
# only the rows that originally had a missing value.
imputer.fit_transform(dataset.iloc[:, [1, 2]].to_numpy())[indx_missing_val]

array([[   40.        , 63777.77777778],
       [   38.77777778, 52000.        ]])

In [144]:
# Write the imputed Age/Salary values back into the rows that had missing entries.
dataset.iloc[indx_missing_val, [1, 2]] = imputer.fit_transform(
    dataset.iloc[:, [1, 2]].to_numpy()
)[indx_missing_val]

In [145]:
dataset.iloc[indx_missing_val, [1,2]] 

Unnamed: 0,Age,Salary
4,40.0,63777.777778
6,38.777778,52000.0


In [153]:
# Example 3: horse colic dataset, fetched over HTTP.
# NOTE(review): raw.github.com looks like a legacy hostname (presumably redirecting to
# raw.githubusercontent.com) — confirm the URL still resolves.
url = 'https://raw.github.com/shekhar270779/Learn_Data_Wrangling/master/data/horse-colic.csv'

# header=None: columns get integer labels instead of names taken from the first row.
# na_values='?': cells containing '?' are read as NaN.
horse = pd.read_csv(url, header=None, na_values='?')

- The horse colic dataset describes the medical characteristics of horses with colic and whether they survived or not.
- Colic in horses is defined as abdominal pain.
- The dataset marks missing values with a question mark ('?').
- While loading the dataset, we pass `na_values='?'` so that these markers are read as missing values (NaN).


In [159]:
horse.shape

(300, 28)

In [154]:
horse.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1


In [165]:
# Find the five columns with the most missing values (they are mean-imputed below).

missing = horse.isna().sum()

# Sort the per-column NaN counts in descending order and keep the first five labels.
indx_top5_missing_cols = missing.sort_values(ascending=False).index[:5]

indx_top5_missing_cols

Int64Index([15, 21, 20, 17, 14], dtype='int64')

In [170]:
# indx_top5_missing_cols holds column *labels* (it comes from a Series index),
# so select with label-based .loc rather than positional .iloc. The two only
# agree while the column labels happen to equal their positions.
X = horse.loc[:, indx_top5_missing_cols]
X.head()

Unnamed: 0,15,21,20,17,14
0,,,,5.0,
1,,2.0,2.0,2.0,
2,,,,1.0,
3,5.0,5.3,3.0,,2.0
4,,,,,


In [171]:
# Per-column means of the selected columns; pandas skips NaN when averaging.
X.mean(axis=0)

15    4.707547
21    3.019608
20    2.037037
17    3.692308
14    1.582474
dtype: float64

In [175]:
# Mean-impute the selected columns and write them back into `horse`.
# Note: this overwrites the NaN entries of `horse` in place.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

X_transformed = imputer.fit_transform(X)

# indx_top5_missing_cols holds column labels, so assign through label-based .loc
# instead of positional .iloc.
horse.loc[:, indx_top5_missing_cols] = X_transformed

In [176]:
# First five rows of the imputed columns; the columns are selected by label
# because indx_top5_missing_cols holds column labels, not positions.
horse.loc[:, indx_top5_missing_cols].head()

Unnamed: 0,15,21,20,17,14
0,4.707547,3.019608,2.037037,5.0,1.582474
1,4.707547,2.0,2.0,2.0,1.582474
2,4.707547,3.019608,2.037037,1.0,1.582474
3,5.0,5.3,3.0,3.692308,2.0
4,4.707547,3.019608,2.037037,3.692308,1.582474
