# missing value imputation using sklearn
## Different strategies for different variables (numerical & categorical) with scikit-learn

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Read the training and test sets from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [3]:
# Preview the first five rows of the training data (5 is head()'s default).
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Separate the target column from the predictors; the test frame is copied as-is.
y_train = train['Survived']
x_train = train.drop('Survived', axis=1)
x_test = test.copy()
print(x_train.shape, y_train.shape, x_test.shape)

(891, 11) (891,) (418, 11)


## missing value imputation

In [5]:
# Number of missing values in each column of the training predictors.
isnull_sum = x_train.isna().sum()
isnull_sum

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Numerical (int64 / float64) columns, then the subset with at least one missing value.
num_var = x_train.select_dtypes(include=['int64', 'float64']).columns
# Boolean mask over num_var (same order), applied to the Index and converted to a plain list.
num_vars_miss = num_var[(isnull_sum.loc[num_var] > 0).to_numpy()].tolist()
num_vars_miss

['Age']

In [7]:
# Object-dtype (categorical / text) columns, then the subset with at least one missing value.
cat_var = x_train.select_dtypes(include=['O']).columns
# Boolean mask over cat_var (same order), applied to the Index and converted to a plain list.
cat_vars_miss = cat_var[(isnull_sum.loc[cat_var] > 0).to_numpy()].tolist()
cat_vars_miss

['Cabin', 'Embarked']

In [8]:
# Column groups, one per imputation strategy (consumed by the ColumnTransformer cell).
num_var_mean=['Age']  # numerical column to fill with the column mean
cat_vars_mode=['Cabin']  # categorical column to fill with its most frequent value
cat_var_missing=['Embarked']  # categorical column to fill with the constant 'missing'
# NOTE(review): Cabin is missing in 687 of 891 training rows while Embarked is missing in
# only 2; mode-imputing Cabin makes a single cabin value dominate the column. Swapping the
# two groups may be what was intended -- confirm.

In [9]:
# One single-step pipeline per imputation strategy.
# SimpleImputer accepts 'mean', 'median', 'most_frequent' or 'constant'. 'mode' is not a
# valid strategy and is rejected when the imputer is fitted, so the mode pipeline must use
# 'most_frequent'.
# NOTE(review): these pipelines are not referenced by the other cells visible in this
# notebook (the ColumnTransformer builds its own SimpleImputer instances) -- confirm
# whether they are still needed.
num_var_mean_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])
cat_var_mode_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))])
cat_var_missing_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing'))])

In [10]:
# Route each column group to its own imputer. Columns that are not listed are dropped from
# the output (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(transformers=[
    # Numerical columns: fill with the column mean (SimpleImputer's default strategy).
    ('mean_imputer', SimpleImputer(strategy='mean'), num_var_mean),
    # Categorical columns: fill with the most frequent value (the mode).
    ('mode_imputer', SimpleImputer(strategy='most_frequent'), cat_vars_mode),
    # Categorical columns: fill with the constant string 'missing'.
    ('constant_imputer', SimpleImputer(fill_value='missing', strategy='constant'), cat_var_missing),
])

In [11]:
# Learn the imputation statistics from the training predictors.
preprocessor.fit(X=x_train)

In [12]:
# NOTE(review): `transform` is only referenced here, not called, so this cell displays the
# bound-method repr rather than transformed data. The actual transform is done in a later cell.
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer', SimpleImputer(), ['Age']),
                                ('mode_imputer',
                                 SimpleImputer(strategy='most_frequent'),
                                 ['Cabin']),
                                ('constant_imputer',
                                 SimpleImputer(fill_value='missing',
                                               strategy='constant'),
                                 ['Embarked'])])>

In [13]:
# List the fitted transformers; the trailing 'remainder' entry shows which columns
# (by position) are dropped because no imputer was assigned to them.
print(preprocessor.transformers_)


[('mean_imputer', SimpleImputer(), ['Age']), ('mode_imputer', SimpleImputer(strategy='most_frequent'), ['Cabin']), ('constant_imputer', SimpleImputer(fill_value='missing', strategy='constant'), ['Embarked']), ('remainder', 'drop', [0, 1, 2, 3, 5, 6, 7, 8])]


In [14]:
# Pull the fitted mean imputer out of the ColumnTransformer by name and show the fill
# value it learned for its column(s).
mean_imputer = preprocessor.named_transformers_.mean_imputer
mean_imputer.statistics_

array([29.69911765])

In [15]:
# Mean of the Survived column in the training data (the survival rate, assuming the column
# is coded 0/1 as the previewed rows suggest).
train.loc[:, 'Survived'].mean()

0.3838383838383838

In [16]:
# Apply the imputers fitted on the training predictors to both the training and test sets.
X_train_clean = preprocessor.transform(X=x_train)
X_test_clean = preprocessor.transform(X=x_test)

In [17]:
# Imputed training data as returned by the ColumnTransformer: a NumPy array (object dtype in
# the output shown) holding only the imputed columns, in transformer order.
X_train_clean

array([[22.0, 'B96 B98', 'S'],
       [38.0, 'C85', 'C'],
       [26.0, 'B96 B98', 'S'],
       ...,
       [29.69911764705882, 'B96 B98', 'S'],
       [26.0, 'C148', 'C'],
       [32.0, 'B96 B98', 'Q']], dtype=object)

In [20]:
# Wrap the imputed array in a DataFrame, naming the columns in the same order as the
# ColumnTransformer's transformers (mean -> mode -> constant groups).
# The transformer output is an object-dtype array, so every column would otherwise be
# object-typed; cast the numerical columns back to float so numeric operations
# (describe, arithmetic, model fitting) treat them as numbers.
X_train_clean_miss_var = pd.DataFrame(
    X_train_clean,
    columns=num_var_mean + cat_vars_mode + cat_var_missing,
).astype({col: 'float64' for col in num_var_mean})
X_train_clean_miss_var

Unnamed: 0,Age,Cabin,Embarked
0,22.0,B96 B98,S
1,38.0,C85,C
2,26.0,B96 B98,S
3,35.0,C123,S
4,35.0,B96 B98,S
...,...,...,...
886,27.0,B96 B98,S
887,19.0,B42,S
888,29.699118,B96 B98,S
889,26.0,C148,C


In [22]:
# Total number of missing cells left after imputation (summed over columns, then overall).
X_train_clean_miss_var.isna().sum().sum()

0

In [23]:
# Cabin frequencies before imputation; missing values are excluded (dropna=True is the default).
train['Cabin'].value_counts(dropna=True)

Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64

In [24]:
# Cabin frequencies after imputation, for comparison with the raw counts: the most frequent
# cabin value absorbs every row that was previously missing.
X_train_clean_miss_var['Cabin'].value_counts(dropna=True)

Cabin
B96 B98        691
G6               4
C23 C25 C27      4
C22 C26          3
F33              3
              ... 
E34              1
C7               1
C54              1
E36              1
C148             1
Name: count, Length: 147, dtype: int64