# **Missing Indicator Imputation**

### **Import libraries, Load and Analyze Dataset**

In [67]:
import numpy as np
import pandas as pd

In [68]:
from sklearn.impute import MissingIndicator,SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [69]:
# Read only the three columns used in this notebook from the training CSV.
# NOTE(review): the path is an absolute Google Drive location; presumably it
# only resolves in a Colab session with Drive mounted — confirm before sharing.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Learning Machine Learning/train.csv',
    usecols=['Age', 'Fare', 'Survived'],
)
# Preview the first rows to confirm the load.
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


### **Create X & Y for Train-Test-Split**

In [70]:
# Separate the label from the predictors: `y` is the 'Survived' column and
# `X` keeps every other column of `df`.
y = df.loc[:, 'Survived']
X = df.drop('Survived', axis='columns')

In [71]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [72]:
# Peek at the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,Age,Fare
30,40.0,27.7208
10,4.0,16.7
873,47.0,9.0
182,9.0,31.3875
876,20.0,9.8458


### **Baseline: Imputation Without the Missing Indicator**

In [73]:
# Baseline: fill missing values with a SimpleImputer left at its defaults.
# The fill values are learned from the training split only and then reused
# unchanged on the test split.
si = SimpleImputer()
si.fit(X_train)
X_train_trf = si.transform(X_train)
X_test_trf = si.transform(X_test)

In [74]:
# Inspect the imputed training array; its two columns follow X_train's
# column order (Age, Fare).
X_train_trf

array([[ 40.        ,  27.7208    ],
       [  4.        ,  16.7       ],
       [ 47.        ,   9.        ],
       ...,
       [ 71.        ,  49.5042    ],
       [ 29.78590426, 221.7792    ],
       [ 29.78590426,  25.925     ]])

### **Baseline Accuracy with Logistic Regression**

In [75]:
# Baseline model: logistic regression trained on the imputed features
# (no missing-value flag yet) and scored on the held-out split.
# LogisticRegression and accuracy_score come from the notebook's import cell,
# so this cell no longer re-imports them mid-way.
clf = LogisticRegression()
clf.fit(X_train_trf, y_train)
y_pred = clf.predict(X_test_trf)
# Last expression: fraction of test labels predicted correctly.
accuracy_score(y_test, y_pred)

0.6145251396648045

### **Missing Indicator**

In [76]:
# Fit a MissingIndicator on the training features so it learns which columns
# contain missing values. `fit` returns the estimator, so it can be chained.
mi = MissingIndicator().fit(X_train)
# Display the fitted estimator.
mi

In [77]:
# NOTE(review): this builds a new, unfitted MissingIndicator and discards it.
# It is never assigned, so the fitted `mi` is unaffected. It looks like a
# leftover echo of the estimator repr — confirm and remove.
MissingIndicator()

In [78]:
# Column indices the fitted indicator produces a mask for. The output lists
# only index 0, which is the first column of X_train ('Age').
mi.features_

array([0])

### **Transform: Train Missing**

In [79]:
# Boolean mask for the training rows: one column per index in mi.features_,
# True where that value is missing.
X_train_missing = mi.transform(X_train)

In [80]:
# NOTE(review): this echoes the entire mask; a slice such as
# X_train_missing[:10] or a count of True values would be easier to read.
X_train_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

### **Transform: Test Missing**

In [81]:
# Apply the indicator fitted on the training split to the test rows, so both
# masks cover the same column(s).
X_test_missing = mi.transform(X_test)

In [82]:
# NOTE(review): this echoes the entire mask; a slice such as
# X_test_missing[:10] or a count of True values would be easier to read.
X_test_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [83]:
# Append the missing-value mask as an 'Age_NA' feature on both splits.
# - `.assign` returns a new frame instead of writing a column into the frame
#   handed back by train_test_split, which can raise SettingWithCopyWarning.
# - `.ravel()` flattens the (n_rows, 1) mask to 1-D so it maps onto a single
#   column explicitly.
X_train = X_train.assign(Age_NA=X_train_missing.ravel())
X_test = X_test.assign(Age_NA=X_test_missing.ravel())

In [84]:
# Show the training features together with the new 'Age_NA' flag column.
X_train

Unnamed: 0,Age,Fare,Age_NA
30,40.0,27.7208,False
10,4.0,16.7000,False
873,47.0,9.0000,False
182,9.0,31.3875,False
876,20.0,9.8458,False
...,...,...,...
534,30.0,8.6625,False
584,,8.7125,True
493,71.0,49.5042,False
527,,221.7792,True


### **Check Accuracy with the Missing Indicator Added**

In [85]:
# Impute again, this time on the frames that carry the 'Age_NA' flag column.
# A fresh default imputer is fitted on the training split and reused on the
# test split.
si = SimpleImputer()
si.fit(X_train)
X_train_trf2 = si.transform(X_train)
X_test_trf2 = si.transform(X_test)

In [86]:
# Same logistic-regression check as the baseline, now trained on the features
# that include the 'Age_NA' flag, so the two accuracies can be compared.
# LogisticRegression and accuracy_score come from the notebook's import cell;
# the duplicate mid-cell imports are dropped.
clf = LogisticRegression()
clf.fit(X_train_trf2, y_train)
y_pred = clf.predict(X_test_trf2)
# Last expression: fraction of test labels predicted correctly.
accuracy_score(y_test, y_pred)

0.6312849162011173