# Day 13: Random Sample Imputation | Numeric Data


# Import Libraries


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Import Dataset


In [2]:
# Read just the three columns this exercise works with.
df = pd.read_csv(
    'train.csv',
    usecols=['Age', 'Fare', 'Survived'],
)

In [3]:
# Preview the first rows to confirm the selected columns were loaded.
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


# Check for missing (null) values


In [4]:
# Share of missing entries per column, expressed as a percentage.
df.isna().mean().mul(100)

Survived     0.00000
Age         19.86532
Fare         0.00000
dtype: float64

# Create X & y


In [5]:
# Target is the Survived column; every remaining column is a feature.
y = df['Survived']
X = df.drop(columns='Survived')

# Apply Train Test Split


In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [7]:
# Display the training features; Age still contains missing values at this point.
X_train

Unnamed: 0,Age,Fare
30,40.0,27.7208
10,4.0,16.7000
873,47.0,9.0000
182,9.0,31.3875
876,20.0,9.8458
...,...,...
534,30.0,8.6625
584,,8.7125
493,71.0,49.5042
527,,221.7792


# Create a new column in both train and test


In [8]:
# Add a working copy of Age to each split; the imputed values are written to
# Age_imputed so the original Age column stays available for comparison.
# assign() returns a new DataFrame, so the frames returned by train_test_split
# are not mutated in place (avoids SettingWithCopyWarning), and re-running this
# cell simply resets Age_imputed to the raw Age values.
X_train = X_train.assign(Age_imputed=X_train['Age'])
X_test = X_test.assign(Age_imputed=X_test['Age'])

In [9]:
# Last rows of the test split: Age_imputed mirrors Age, including its NaN entries.
X_test.tail()

Unnamed: 0,Age,Fare,Age_imputed
89,24.0,8.05,24.0
80,22.0,9.0,22.0
846,,69.55,
870,26.0,7.8958,26.0
251,29.0,10.4625,29.0


In [10]:
# First rows of the test split with the new Age_imputed column.
X_test.head()

Unnamed: 0,Age,Fare,Age_imputed
707,42.0,26.2875,42.0
37,21.0,8.05,21.0
615,24.0,65.0,24.0
169,28.0,56.4958,28.0
68,17.0,7.925,17.0


In [11]:
# Last rows of the training split: Age_imputed mirrors Age, including its NaN entries.
X_train.tail()

Unnamed: 0,Age,Fare,Age_imputed
534,30.0,8.6625,30.0
584,,8.7125,
493,71.0,49.5042,71.0
527,,221.7792,
168,,25.925,


In [12]:
# First rows of the training split with the new Age_imputed column.
X_train.head()

Unnamed: 0,Age,Fare,Age_imputed
30,40.0,27.7208,40.0
10,4.0,16.7,4.0
873,47.0,9.0,47.0
182,9.0,31.3875,9.0
876,20.0,9.8458,20.0


# Replace missing values in Age_imputed


In [13]:
# Random-sample imputation: every missing Age is replaced by a value drawn at
# random (without replacement) from the observed, non-null Age values of the
# TRAINING split. The test split is filled from training values as well, so
# nothing is learned from the test data.
#
# A seeded RandomState is created here so that Restart & Run All (or re-running
# just this cell) always produces the same fill values. One generator is shared
# by both draws so the test fill is not a prefix of the training fill.
rng = np.random.RandomState(42)
observed_train_age = X_train['Age'].dropna()

# The masks come from the untouched Age column, so the number of rows selected
# always equals the number of values sampled, even when this cell is re-run
# after Age_imputed has already been filled.
train_missing = X_train['Age'].isnull()
test_missing = X_test['Age'].isnull()

# Single .loc assignment instead of chained indexing (df[col][mask] = ...):
# chained assignment writes to a temporary object, triggers
# SettingWithCopyWarning, and does nothing under pandas Copy-on-Write.
# .values strips the sampled rows' index so values are assigned by position.
X_train.loc[train_missing, 'Age_imputed'] = observed_train_age.sample(train_missing.sum(), random_state=rng).values

X_test.loc[test_missing, 'Age_imputed'] = observed_train_age.sample(test_missing.sum(), random_state=rng).values

# Review the randomly sampled values


In [14]:
# Draw one random value from the non-null training ages to see what a single
# fill value looks like. No seed is passed, so the result changes on every run.
X_train['Age'].dropna().sample(1).values

array([2.])

In [15]:
# Number of training rows whose Age is missing (how many values must be drawn).
X_train['Age'].isna().sum()

148

In [20]:
# One random draw of observed training ages, sized to the number of missing
# ages. No seed is passed, so each execution returns a different array.
X_train['Age'].dropna().sample(X_train['Age'].isnull().sum()).values

array([ 9.  , 29.  , 16.  , 32.  , 50.  , 64.  , 18.  , 34.  , 15.  ,
       44.  , 34.  , 27.  , 58.  , 21.  , 31.  , 26.  , 23.  , 62.  ,
        4.  , 27.  , 71.  , 40.5 , 21.  , 25.  , 26.  , 49.  , 47.  ,
       19.  , 18.  , 31.  , 40.  , 30.  , 24.  , 47.  , 36.5 , 30.  ,
       27.  , 32.  , 19.  , 28.  , 31.  ,  1.  , 39.  , 18.  , 35.  ,
       20.  , 26.  , 20.  , 20.  , 30.  , 21.  , 19.  , 30.  , 38.  ,
       31.  ,  3.  , 24.  , 31.  , 36.  , 29.  , 40.5 , 14.  , 58.  ,
       41.  , 53.  , 16.  , 19.  ,  3.  , 20.  , 29.  , 71.  , 60.  ,
       10.  , 54.  , 29.  , 21.  , 14.  , 29.  , 19.  ,  6.  , 16.  ,
       36.  , 54.  , 34.  , 48.  , 33.  , 22.  , 40.  , 38.  , 29.  ,
       28.  , 30.  , 22.  , 22.  ,  9.  , 32.  , 20.  , 24.  , 42.  ,
       13.  , 25.  , 47.  , 36.  , 30.  , 30.  , 18.  , 45.  , 23.  ,
       40.  , 33.  , 45.  , 32.  , 34.5 , 24.  , 19.  , 35.  , 31.  ,
       50.  , 56.  , 23.  , 40.  , 22.  , 26.  , 58.  , 11.  , 18.  ,
       40.  , 21.  ,

In [21]:
# Display the training split after imputation: rows whose Age is NaN now carry
# a sampled value in Age_imputed.
X_train

Unnamed: 0,Age,Fare,Age_imputed
30,40.0,27.7208,40.0
10,4.0,16.7000,4.0
873,47.0,9.0000,47.0
182,9.0,31.3875,9.0
876,20.0,9.8458,20.0
...,...,...,...
534,30.0,8.6625,30.0
584,,8.7125,2.0
493,71.0,49.5042,71.0
527,,221.7792,35.0


# Compare Original Age and Imputed Age


In [1]:
# Overlay the density of the original Age values and the imputed column to
# check that random-sample imputation preserved the shape of the distribution.
# Requires the import cell (seaborn as sns, matplotlib.pyplot as plt) to have
# been run in the current kernel; otherwise this cell fails with a NameError.
#
# sns.distplot is deprecated; sns.kdeplot draws the same density-only curve.
# NaNs are dropped explicitly so the call behaves the same across seaborn versions.
fig, ax = plt.subplots()
sns.kdeplot(X_train['Age'].dropna(), label='Original', ax=ax)
sns.kdeplot(X_train['Age_imputed'].dropna(), label='Imputed', ax=ax)
ax.set(title='Age: original vs. random-sample imputed', xlabel='Age')
ax.legend()
plt.show()

NameError: name 'sns' is not defined

# Compare Variable Variance


In [23]:
# Print the variance of Age before and after imputation; similar values
# indicate that the fill did not distort the spread of the variable.
for label, column in (
    ('Original variable variance: ', 'Age'),
    ('Variance after random imputation: ', 'Age_imputed'),
):
    print(label, X_train[column].var())

Original variable variance:  204.34951339046142
Variance after random imputation:  206.3582311224893
