# Capturing NaN values with a new feature

## It works well when the data are missing not at random (MNAR)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read only the columns this notebook works with from the training CSV.
df = pd.read_csv(
    "train.csv",
    usecols=["Age", "Fare", "Survived"],
)

In [4]:
# Display the loaded frame (pandas truncates the middle rows in the output).
df

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.2500
1,1,38.0,71.2833
2,1,26.0,7.9250
3,1,35.0,53.1000
4,0,35.0,8.0500
...,...,...,...
886,0,27.0,13.0000
887,1,19.0,30.0000
888,0,,23.4500
889,1,26.0,30.0000


In [5]:
# Number of missing entries in each column.
df.isna().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [6]:
# Fraction of missing entries in each column (mean of the boolean mask).
df.isna().mean()

Survived    0.000000
Age         0.198653
Fare        0.000000
dtype: float64

In [16]:
def impute_nan(df,variable,median):
    """Add median-imputed and random-sample-imputed copies of a column to ``df``.

    Two columns are written to ``df`` in place; ``df[variable]`` itself is
    left untouched:

    * ``<variable>_median`` -- ``df[variable]`` with missing values replaced
      by ``median``.
    * ``<variable>_random`` -- ``df[variable]`` with each missing value
      replaced by a value drawn from the observed (non-missing) values of
      the same column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    variable : str
        Name of the column to impute.
    median : scalar
        Fill value used for the ``_median`` column; it is supplied by the
        caller and is not computed here.

    Returns
    -------
    None
        The result is the two new columns on ``df``.

    Notes
    -----
    Sampling uses ``random_state=0`` so repeated runs give the same fill
    values. Values are drawn without replacement when enough observed values
    exist; when there are more missing than observed values, they are drawn
    with replacement instead of raising. If the column has no observed values
    at all there is nothing to draw from, so ``<variable>_random`` keeps its
    missing values.
    """
    missing_mask = df[variable].isnull()

    df[variable + "_median"] = df[variable].fillna(median)
    df[variable + "_random"] = df[variable]

    observed = df[variable].dropna()
    n_missing = int(missing_mask.sum())
    if n_missing == 0 or observed.empty:
        # Nothing to fill, or no observed values to sample from.
        return

    # Fall back to sampling with replacement only when the pool of observed
    # values is smaller than the number of gaps to fill.
    random_sample = observed.sample(
        n_missing,
        replace=n_missing > len(observed),
        random_state=0,
    )

    # Assign positionally (rows in mask order) so the fill does not depend on
    # index alignment and also works when df has duplicate index labels.
    df.loc[missing_mask, variable + "_random"] = random_sample.to_numpy()

In [7]:
# Indicator feature: 1 where Age is missing, 0 where it is present.
df["Age_NaN"] = df["Age"].isna().astype(int)

In [9]:
# Display the frame to check the new Age_NaN indicator column.
df

Unnamed: 0,Survived,Age,Fare,Age_NaN
0,0,22.0,7.2500,0
1,1,38.0,71.2833,0
2,1,26.0,7.9250,0
3,1,35.0,53.1000,0
4,0,35.0,8.0500,0
...,...,...,...,...
886,0,27.0,13.0000,0
887,1,19.0,30.0000,0
888,0,,23.4500,1
889,1,26.0,30.0000,0


In [15]:
# Mean of the Age column.
df["Age"].mean()

29.69911764705882

In [17]:
# Assign the filled column back rather than calling fillna(inplace=True) on
# df['Age']: the in-place call acts on an intermediate Series, which pandas
# warns about and which does not update df when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [18]:
# Display the frame after filling Age; Age_NaN was computed before the fill,
# so it still marks the rows whose Age was originally missing.
df

Unnamed: 0,Survived,Age,Fare,Age_NaN
0,0,22.0,7.2500,0
1,1,38.0,71.2833,0
2,1,26.0,7.9250,0
3,1,35.0,53.1000,0
4,0,35.0,8.0500,0
...,...,...,...,...
886,0,27.0,13.0000,0
887,1,19.0,30.0000,0
888,0,28.0,23.4500,1
889,1,26.0,30.0000,0


## The indicator column tells the model which rows originally had a missing value that was later filled in by imputation.

# Advantages
1. Easy to implement.
2. Captures the importance of missing values.

# Disadvantage
1. This creates additional features (curse of dimensionality).
   
   The curse of dimensionality refers to the way the volume of the feature space grows explosively as dimensions are added, which in turn causes an exponential increase in the computational effort required to process and analyse the data.