## MICE Algorithm

First, we import the libraries we need, such as NumPy and pandas.
Then we read the input file and check which fields contain missing values.

In [7]:
import numpy as np
import pandas as pd
import re
import sys
from sklearn import metrics
from statsmodels.imputation import mice
from sklearn.model_selection import train_test_split

#### Check which field has missing value

In [8]:
# Load the training CSV, then discard its first column.
df = pd.read_csv("train.csv")
df = df.iloc[:, 1:]
print('File read succesfully !', f'Shape of original file: {df.shape}')
# Per-column tally of missing (NaN) entries.
print(df.isna().sum())

File read succesfully ! Shape of original file: (891, 11)
Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64


In [9]:
# Detection of missing values

def detect_missing(data=None):
    """Report which columns contain missing values.

    Prints a blank line, then either the percentage of missing entries
    for every affected column or a message saying nothing is missing.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is
        used, so existing ``detect_missing()`` calls keep working.

    Returns
    -------
    list
        Labels of the columns that have at least one missing value,
        in column order.
    """
    # Fall back to the notebook global only when no frame is passed in.
    frame = df if data is None else data
    null_counts = frame.isnull().sum()
    total_samples = frame.shape[0]
    print()
    # Keep only the columns whose missing-value count is non-zero.
    missing = [(column, count) for column, count in null_counts.items() if count]
    if missing:
        print('Following columns contains missing values : ')
        for column, count in missing:
            print("{} : {:.2f} %".format(column, (count / total_samples) * 100))
    else:
        print("None of the columns contains missing values !")
    return [column for column, _ in missing]

In [10]:
# Collect the names of the columns reported as having missing values and show them.
null_column_list = detect_missing()
print(null_column_list)


Following columns contains missing values : 
Age : 19.87 %
Cabin : 77.10 %
Embarked : 0.22 %
['Age', 'Cabin', 'Embarked']


#### Deal with NaN values using the MICE algorithm

In [12]:
# Tunables for the imputation run.
MICE_ITERATIONS = 100  # number of full MICE sweeps over the incomplete columns
RANDOM_SEED = 0        # arbitrary fixed seed

df_mice = df.copy()

# Encode Embarked as numeric codes so it is picked up as a numeric feature below.
embarked_mapping = {"S": 1, "C": 2, "Q": 3}
df_mice['Embarked'] = df_mice['Embarked'].map(embarked_mapping)

# Encode Cabin by the first run of letters in the cabin string.
# "U" (code 7) is a placeholder for missing cabins and is turned back into NaN
# so that MICE imputes it. Letters that are not in `deck` also become NaN via map().
deck = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4, "F": 5, "G": 6, "U": 7}
df_mice['Cabin'] = (
    df_mice['Cabin']
    .fillna("U")
    # str.extract yields NaN (instead of raising) when a value has no letters.
    .str.extract(r"([a-zA-Z]+)", expand=False)
    .map(deck)
    # Assign the result back: a chained `inplace=True` replace does not update
    # the frame when pandas Copy-on-Write is active.
    .replace({deck["U"]: np.nan})
)

# Only numeric columns are handed to MICE.
numeric_features = df_mice.select_dtypes(include="number").columns.tolist()
# Columns that actually need imputing, determined before any values are filled in.
operated_cols = [column for column in numeric_features if df_mice[column].isnull().any()]

# The imputation is stochastic; seed NumPy's global RNG so a re-run gives the same
# values (assumes statsmodels draws from np.random - TODO confirm).
np.random.seed(RANDOM_SEED)
imp = mice.MICEData(df_mice[numeric_features])
imp.update_all(n_iter=MICE_ITERATIONS)

print(f'Operating on following features : {operated_cols}')
# Copy the imputed values back into the working frame (aligned on the index).
for column in operated_cols:
    df_mice[column] = imp.data[column]

# Reverse the numeric encodings so the columns hold their original labels again.
embarked_mapping = {code: port for port, code in embarked_mapping.items()}
df_mice['Embarked'] = df_mice['Embarked'].map(embarked_mapping)
deck_mapping = {code: letter for letter, code in deck.items() if letter != "U"}
df_mice['Cabin'] = df_mice['Cabin'].map(deck_mapping)

# Sanity check: every column handed to MICE must now be complete.
assert not df_mice[operated_cols].isnull().any().any(), "imputation left missing values"

Operating on following features : ['Age', 'Cabin', 'Embarked']


In [13]:
# Per-column tally of the NaN entries that remain in the imputed frame.
print(df_mice.isna().sum())

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64


#### Check missing value after filling

In [14]:
# Re-check: number of NaN entries per column of the imputed frame.
print(df_mice.isna().sum())

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64


In [15]:
# Show the imputed frame as the cell's output.
df_mice

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,F,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,F,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,F,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,F,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,32.0,1,2,W./C. 6607,23.4500,G,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C,C


#### Write the CSV file after filling the missing values using MICE

In [7]:
print("Write to output csv file ")
# Save the imputed frame. The untouched input `df` still contains the NaNs,
# so writing it would discard the result of the imputation.
df_mice.to_csv('output_mice.csv')

Write to output csv file 
