In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/income_evaluation.csv', na_values=' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
df[' education'].unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

In [3]:
df.isna().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [4]:
# hours per week
np.random.seed(seed=0)
h = np.random.choice(a=df.index, replace=False, size=20)
df.loc[h, ' hours-per-week'] = np.nan

In [5]:
# age
np.random.seed(seed=10)
a = np.random.choice(a=df.index, replace=False, size=28)
df.loc[a, 'age'] = np.nan

In [6]:
df.isna().sum()

age                  28
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week      20
 native-country     583
 income               0
dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(' income', 1), df[' income'],
                                                   test_size=0.2, random_state=5)

In [10]:
from sklearn.impute import KNNImputer

In [21]:
knn = KNNImputer(n_neighbors=5, add_indicator=True)

In [14]:
X_train['age'].dtypes

dtype('float64')

In [15]:
X_train.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country'],
      dtype='object')

In [17]:
num = [col for col in X_train.columns if X_train[col].dtypes != 'O']

In [19]:
X_train[num].head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
21425,55.0,238216,9,0,0,40.0
28707,24.0,306460,9,0,0,40.0
4455,48.0,213140,4,0,0,40.0
2231,36.0,127306,13,0,0,40.0
18864,53.0,103586,13,0,0,55.0


In [22]:
knn.fit(X_train[num])

KNNImputer(add_indicator=True)

In [23]:
knn.transform(X_train[num])

array([[5.50000e+01, 2.38216e+05, 9.00000e+00, ..., 4.00000e+01,
        0.00000e+00, 0.00000e+00],
       [2.40000e+01, 3.06460e+05, 9.00000e+00, ..., 4.00000e+01,
        0.00000e+00, 0.00000e+00],
       [4.80000e+01, 2.13140e+05, 4.00000e+00, ..., 4.00000e+01,
        0.00000e+00, 0.00000e+00],
       ...,
       [8.50000e+01, 1.66027e+05, 9.00000e+00, ..., 5.00000e+01,
        0.00000e+00, 0.00000e+00],
       [3.60000e+01, 4.69056e+05, 9.00000e+00, ..., 2.50000e+01,
        0.00000e+00, 0.00000e+00],
       [2.60000e+01, 1.98163e+05, 1.40000e+01, ..., 4.00000e+01,
        0.00000e+00, 0.00000e+00]])

In [27]:
pd.DataFrame(knn.transform(X_train[num])).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,55.0,238216.0,9.0,0.0,0.0,40.0,0.0,0.0
1,24.0,306460.0,9.0,0.0,0.0,40.0,0.0,0.0
2,48.0,213140.0,4.0,0.0,0.0,40.0,0.0,0.0
3,36.0,127306.0,13.0,0.0,0.0,40.0,0.0,0.0
4,53.0,103586.0,13.0,0.0,0.0,55.0,0.0,0.0


In [30]:
X_test[num].isna().sum()

age                5
 fnlwgt            0
 education-num     0
 capital-gain      0
 capital-loss      0
 hours-per-week    1
dtype: int64

In [31]:
knn.transform(X_test[num])

array([[3.20000e+01, 2.60954e+05, 7.00000e+00, ..., 3.00000e+01,
        0.00000e+00, 0.00000e+00],
       [3.10000e+01, 2.36391e+05, 1.00000e+01, ..., 4.00000e+01,
        0.00000e+00, 0.00000e+00],
       [5.90000e+01, 1.75689e+05, 1.00000e+01, ..., 1.40000e+01,
        0.00000e+00, 0.00000e+00],
       ...,
       [2.60000e+01, 1.77482e+05, 1.20000e+01, ..., 4.50000e+01,
        0.00000e+00, 0.00000e+00],
       [4.70000e+01, 2.58498e+05, 1.00000e+01, ..., 5.20000e+01,
        0.00000e+00, 0.00000e+00],
       [4.50000e+01, 1.60962e+05, 1.00000e+01, ..., 3.50000e+01,
        0.00000e+00, 0.00000e+00]])

In [33]:
pd.DataFrame(knn.transform(X_test[num])).isna().sum().sum()

0