In [1]:
import numpy as np
import pandas as pd

# 1. Missing Categorical Values

In [2]:
# Read only the three categorical predictors and the label from the Adult training CSV.
adult = pd.read_csv(
    "adult_train.csv",
    usecols=["workclass", "occupation", "native.country", "target"],
)

In [3]:
# (rows, columns) of the loaded frame: a quick check that only the four requested columns came in.
adult.shape

(32561, 4)

In [4]:
# Peek at five random rows. The fixed seed makes the preview identical on every
# Restart & Run All instead of showing different rows each time.
adult.sample(5, random_state=4)

Unnamed: 0,workclass,occupation,native.country,target
7227,Private,Machine-op-inspct,United-States,<=50K
21990,Private,Prof-specialty,United-States,<=50K
28759,Self-emp-not-inc,Machine-op-inspct,United-States,>50K
14012,Private,Adm-clerical,United-States,>50K
17757,,,United-States,>50K


In [5]:
# Count the missing entries in each column.
adult.isna().sum()

workclass         1836
occupation        1843
native.country     583
target               0
dtype: int64

In [6]:
# Split the frame into the categorical predictors (X) and the income label (y).
X = adult.loc[:, ['workclass', 'occupation', 'native.country']]
y = adult.loc[:, 'target']

In [7]:
# Display the predictor frame (pandas truncates the middle rows of long frames).
X

Unnamed: 0,workclass,occupation,native.country
0,State-gov,Adm-clerical,United-States
1,Self-emp-not-inc,Exec-managerial,United-States
2,Private,Handlers-cleaners,United-States
3,Private,Handlers-cleaners,United-States
4,Private,Prof-specialty,Cuba
...,...,...,...
32556,Private,Tech-support,United-States
32557,Private,Machine-op-inspct,United-States
32558,Private,Adm-clerical,United-States
32559,Private,Adm-clerical,United-States


In [8]:
# Display the label series.
y

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: target, Length: 32561, dtype: object

In [9]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training and hold out the rest; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=4,
)

# a. Mode Imputation

In [10]:
from sklearn.impute import SimpleImputer

In [11]:
# Mode imputation: every missing entry is replaced by the most frequent value
# (the mode) of its own column.
si = SimpleImputer(strategy="most_frequent")

In [12]:
# Build the inputs from the raw frame X instead of from X_train/X_test themselves.
# This cell reassigns those two names, so reading them back would make the cell depend
# on how often (and in which order) it has been run. train_test_split keeps the pandas
# index, so y_train.index / y_test.index select exactly the rows of the original split.
X_train = si.fit_transform(X.loc[y_train.index])   # learn the column modes from the training rows only
X_test = si.transform(X.loc[y_test.index])         # fill the test rows with the training modes

In [13]:
# statistics_ holds the fill value learned for each column during fit
# (with strategy='most_frequent' this is each column's mode).
si.statistics_

array([' Private', ' Craft-repair', ' United-States'], dtype=object)

In [14]:
# Inspect the imputed training features (fit_transform returned a NumPy array, not a DataFrame).
X_train

array([[' Private', ' Exec-managerial', ' United-States'],
       [' Private', ' Other-service', ' United-States'],
       [' Private', ' Other-service', ' United-States'],
       ...,
       [' Private', ' Adm-clerical', ' United-States'],
       [' Private', ' Prof-specialty', ' United-States'],
       [' Private', ' Other-service', ' United-States']], dtype=object)

In [15]:
# Inspect the imputed test features (also a NumPy array).
X_test

array([[' Private', ' Transport-moving', ' United-States'],
       [' Private', ' Craft-repair', ' United-States'],
       [' Local-gov', ' Exec-managerial', ' United-States'],
       ...,
       [' Private', ' Machine-op-inspct', ' United-States'],
       [' Private', ' Handlers-cleaners', ' United-States'],
       [' Private', ' Other-service', ' United-States']], dtype=object)

In [16]:
# Training labels; the series keeps the original row labels from the source frame as its index.
y_train

31674      >50K
19303     <=50K
5535      <=50K
8693      <=50K
14777     <=50K
          ...  
22401     <=50K
17093     <=50K
27063     <=50K
8366       >50K
17530     <=50K
Name: target, Length: 26048, dtype: object

In [17]:
# Held-out labels; the series keeps the original row labels from the source frame as its index.
y_test

28762     <=50K
4823      <=50K
3106       >50K
11293     <=50K
7008      <=50K
          ...  
19894     <=50K
31986      >50K
21968     <=50K
28263     <=50K
32398     <=50K
Name: target, Length: 6513, dtype: object

# b. Missing Category Imputation

In [18]:
# Missing-category imputation: treat "missing" as its own category by writing the
# literal label 'Missing' into every empty cell.
si = SimpleImputer(
    strategy="constant",
    fill_value="Missing",
)

In [19]:
# X_train/X_test are reassigned with already-imputed arrays by the preceding imputation
# cell, so they no longer contain any missing values and the 'Missing' label would never
# be written. Rebuild the raw split from X instead: train_test_split keeps the pandas
# index, so y_train.index / y_test.index select exactly the original training/test rows.
X_train = si.fit_transform(X.loc[y_train.index])
X_test = si.transform(X.loc[y_test.index])

In [20]:
# With strategy='constant', statistics_ just repeats fill_value once per column.
si.statistics_ 

array(['Missing', 'Missing', 'Missing'], dtype=object)

# c. Missing Indicator

In [21]:
# Mode imputation plus missing indicators. add_indicator=True appends boolean columns to
# the imputed output: True where the original value was missing, False where it was present.
# NOTE: scikit-learn emits one indicator column per feature that contained missing values
# when the imputer was fitted, not a single column for the whole row.
si = SimpleImputer(strategy='most_frequent', add_indicator= True)

In [22]:
# Fit on the raw split, not on X_train/X_test: the preceding imputation cells reassign
# those names with already-filled arrays, and an imputer fitted on data without missing
# values produces no indicator columns at all. train_test_split keeps the pandas index,
# so y_train.index / y_test.index select exactly the original training/test rows of X.
X_train = si.fit_transform(X.loc[y_train.index])   # imputed features followed by the indicator columns
X_test = si.transform(X.loc[y_test.index])         # same columns, using what was learned on the training rows

# 2. Missing Numeric Values


In [23]:
# Read the two numeric predictors and the regression target from the housing CSV.
housing = pd.read_csv(
    "housing.csv",
    usecols=["total_bedrooms", "median_income", "median_house_value"],
)

In [24]:
# (rows, columns) of the loaded housing frame.
housing.shape

(20640, 3)

In [25]:
# Peek at five random rows. The fixed seed makes the preview identical on every
# Restart & Run All instead of showing different rows each time.
housing.sample(5, random_state=44)

Unnamed: 0,total_bedrooms,median_income,median_house_value
3974,142.0,5.6159,340500.0
6067,937.0,5.9716,262100.0
11805,1269.0,2.8194,111300.0
5597,135.0,1.3333,187500.0
13777,563.0,3.3365,122800.0


In [26]:
# Count the missing entries in each column.
housing.isna().sum()

total_bedrooms        207
median_income           0
median_house_value      0
dtype: int64

In [27]:
# Split the frame into the numeric predictors (X) and the house-value target (y).
X = housing.loc[:, ['total_bedrooms', 'median_income']]
y = housing.loc[:, 'median_house_value']

In [28]:
# Display the numeric predictor frame (pandas truncates the middle rows of long frames).
X

Unnamed: 0,total_bedrooms,median_income
0,129.0,8.3252
1,1106.0,8.3014
2,190.0,7.2574
3,235.0,5.6431
4,280.0,3.8462
...,...,...
20635,374.0,1.5603
20636,150.0,2.5568
20637,485.0,1.7000
20638,409.0,1.8672


In [29]:
# Display the regression target series.
y

0        452600.0
1        358500.0
2        352100.0
3        341300.0
4        342200.0
           ...   
20635     78100.0
20636     77100.0
20637     92300.0
20638     84700.0
20639     89400.0
Name: median_house_value, Length: 20640, dtype: float64

In [30]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training and hold out the rest; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=44,
)

# a. Mean/Median Imputation

In [31]:
from sklearn.impute import SimpleImputer

In [32]:
# Mean imputation: each missing number is replaced by the mean of its column.
imputer1 = SimpleImputer(strategy="mean")


In [33]:
# Build the inputs from the raw frame X instead of from X_train/X_test themselves.
# This cell reassigns those two names, so reading them back would make the cell depend
# on how often (and in which order) it has been run. train_test_split keeps the pandas
# index, so y_train.index / y_test.index select exactly the rows of the original split.
X_train = imputer1.fit_transform(X.loc[y_train.index])   # learn the column means from the training rows only
X_test = imputer1.transform(X.loc[y_test.index])         # fill the test rows with the training means

In [34]:
# Per-column means learned during fit; these are the values written into the gaps.
imputer1.statistics_ 

array([537.24755322,   3.87753818])

In [35]:
# Inspect the imputed training features (a NumPy array after fit_transform).
X_train

array([[675.    ,   2.6618],
       [372.    ,   3.9261],
       [812.    ,   2.532 ],
       ...,
       [308.    ,   6.    ],
       [264.    ,   5.1408],
       [387.    ,   1.5517]])

In [36]:
# Median imputation: like mean imputation, but each gap is filled with the column
# median instead.
imputer2 = SimpleImputer(strategy="median")

In [37]:
# Fit on the raw split, not on X_train/X_test: the preceding imputation cell reassigns
# those names with already-filled arrays, so the median would be computed on data that
# includes the fill values and nothing would be left to impute. train_test_split keeps
# the pandas index, so y_train.index / y_test.index select the original rows of X.
X_train = imputer2.fit_transform(X.loc[y_train.index])   # learn the column medians from the training rows only
X_test = imputer2.transform(X.loc[y_test.index])         # fill the test rows with the training medians

In [38]:
# Per-column medians learned during fit; these are the values written into the gaps.
imputer2.statistics_ 

array([438.    ,   3.5391])

# b. Arbitrary Value Imputation

In [39]:
# Arbitrary-value imputation: flag every gap with the sentinel value -1.
imputer3 = SimpleImputer(
    strategy="constant",
    fill_value=-1,
)

In [40]:
# Fit on the raw split, not on X_train/X_test: the preceding imputation cells reassign
# those names with already-filled arrays, so no gap would remain for the -1 sentinel.
# train_test_split keeps the pandas index, so y_train.index / y_test.index select the
# original training/test rows of X.
X_train = imputer3.fit_transform(X.loc[y_train.index])
X_test = imputer3.transform(X.loc[y_test.index])

In [41]:
# With strategy='constant', statistics_ just repeats fill_value once per column.
imputer3.statistics_ 

array([-1., -1.])

# c. KNN Imputation

In [42]:
from sklearn.impute import KNNImputer

In [43]:
# KNN imputation: fill each gap from the 3 nearest rows, weighting closer
# neighbours more heavily (weights='distance').
knn = KNNImputer(
    n_neighbors=3,
    weights="distance",
)

In [44]:
# Fit on the raw split, not on X_train/X_test: the preceding imputation cells reassign
# those names with already-filled arrays, which would leave the KNN imputer nothing to
# estimate. train_test_split keeps the pandas index, so y_train.index / y_test.index
# select the original training/test rows of X.
X_train = knn.fit_transform(X.loc[y_train.index])   # neighbours are drawn from the training rows
X_test = knn.transform(X.loc[y_test.index])         # test gaps are filled from the fitted training rows