In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the income data set. Fields in this CSV carry a leading space (see the
# raw column labels printed further down), so the missing-value marker that
# has to be matched is ' ?' with the space included.
df = pd.read_csv(
    'data/income_evaluation.csv',
    na_values=[' ?'],
)

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Inspect the column labels exactly as they were read from the CSV.
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [5]:
# Remove surrounding whitespace from every column label, then display the
# cleaned labels.
df.columns = [label.strip() for label in df.columns]
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [6]:
# Count missing values per column (only categorical columns have any so far).
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

### Just to spice things up a little, I'll inject missing values into the numerical columns as well

In [7]:
# Show the row labels as a NumPy array; these are the values sampled from in
# the cells below.
df.index.to_numpy()

array([    0,     1,     2, ..., 32558, 32559, 32560], dtype=int64)

In [8]:
# Seed NumPy's global RNG so the same rows are selected on every run.
np.random.seed(15)

# Draw 40 distinct row labels, show them, and blank out 'age' for those rows.
r = np.random.choice(df.index, size=40, replace=False)
print(r)
df.loc[r, 'age'] = np.nan

[10125 11478  4224  6592 21910 25737 32017 13595 16559 12752 13786 18301
 21099 12408 28467 12247 27397 26561 29923 14741 25613 26132 17718 15920
 17821 30011  8622  9580 30129 14653 26693 29771 23620  9708   807  5212
  2463 14875  4138   767]


In [9]:
# Re-seed with a different value so a different set of rows is selected.
np.random.seed(25)

# Draw 40 distinct row labels, show them, and blank out 'hours-per-week'
# for those rows.
s = np.random.choice(df.index, size=40, replace=False)
print(s)
df.loc[s, 'hours-per-week'] = np.nan

[ 3254 30026  7221 23481 15318 27004 32462  1033 19921 23753 18252 16482
  5790 32463  5545  5010 22026  1334 27182 28010 30808 30029  6859  5682
 23453 23589 21217 26579  7209 32356  4188 20018  1648 22219 25007 15516
 10767 20086 20713  5973]


In [10]:
# Re-count missing values: 'age' and 'hours-per-week' should now show 40 each.
df.isna().sum()

age                 40
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week      40
native-country     583
income               0
dtype: int64

In [11]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# The target column is dropped by keyword (columns='income'): passing the axis
# positionally, as in df.drop('income', 1), is deprecated since pandas 1.3 and
# raises a TypeError on pandas 2.x.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='income'),
    df['income'],
    test_size=0.2,
    random_state=40,
)

In [12]:
# Numeric features: every training column whose dtype is not object.
num_cols = [name for name, dtype in X_train.dtypes.items() if dtype != 'O']
num_cols

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [13]:
# Categorical features: object-dtype training columns, leaving 'education' out.
cat_cols = [
    name
    for name, dtype in X_train.dtypes.items()
    if dtype == 'O' and name != 'education'
]
cat_cols

['workclass',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

## The not-so-good and unscalable approach

In [14]:
# Step 1 of the manual approach: median-impute the numeric columns and append
# missing-indicator columns. Columns not listed here are discarded.
ct1 = ColumnTransformer(
    transformers=[
        ('si_num', SimpleImputer(strategy='median', add_indicator=True), num_cols),
    ],
    remainder='drop',
)

In [15]:
# Fit ct1 on the training data and preview the transformed columns.
pd.DataFrame(ct1.fit_transform(X_train)).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,32.0,185027.0,12.0,0.0,1887.0,40.0,0.0,0.0
1,20.0,163911.0,10.0,0.0,0.0,40.0,0.0,0.0
2,46.0,368561.0,13.0,0.0,0.0,55.0,0.0,0.0
3,34.0,164280.0,6.0,0.0,0.0,40.0,0.0,0.0
4,72.0,108796.0,15.0,0.0,0.0,40.0,0.0,0.0


In [16]:
# Both transformers are handed the same raw numeric columns, so the scaler
# works on the original values rather than on the imputer's output; the two
# results are placed side by side in the transformed array.
ct2 = ColumnTransformer(
    transformers=[
        ('si_num', SimpleImputer(strategy='median', add_indicator=True), num_cols),
        ('rob_num', RobustScaler(), num_cols),
    ],
    remainder='drop',
)

In [17]:
# Fit ct2 and preview: imputed columns come first, scaled copies follow.
pd.DataFrame(ct2.fit_transform(X_train)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,32.0,185027.0,12.0,0.0,1887.0,40.0,0.0,0.0,-0.25,0.054502,0.666667,0.0,1887.0,0.0
1,20.0,163911.0,10.0,0.0,0.0,40.0,0.0,0.0,-0.85,-0.121984,0.0,0.0,0.0,0.0
2,46.0,368561.0,13.0,0.0,0.0,55.0,0.0,0.0,0.45,1.588461,1.0,0.0,0.0,3.0
3,34.0,164280.0,6.0,0.0,0.0,40.0,0.0,0.0,-0.15,-0.1189,-1.333333,0.0,0.0,0.0
4,72.0,108796.0,15.0,0.0,0.0,40.0,0.0,0.0,1.75,-0.582629,1.666667,0.0,0.0,0.0


In [18]:
# Imputation only, for both column groups: median for numeric columns, the
# constant string 'missing' for categorical ones. Each imputer also appends
# its own missing-indicator columns.
ct3 = ColumnTransformer(
    transformers=[
        ('si_num', SimpleImputer(strategy='median', add_indicator=True), num_cols),
        (
            'si_cat',
            SimpleImputer(strategy='constant', fill_value='missing', add_indicator=True),
            cat_cols,
        ),
    ],
    remainder='drop',
)

In [19]:
# Fit ct3 and preview; the output columns are now addressed by position only.
pd.DataFrame(ct3.fit_transform(X_train)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,32,185027,12,0,1887,40,0,0,Private,Married-civ-spouse,Craft-repair,Husband,White,Male,Ireland,False,False,False
1,20,163911,10,0,0,40,0,0,missing,Never-married,missing,Own-child,White,Female,United-States,True,True,False
2,46,368561,13,0,0,55,0,0,Private,Married-civ-spouse,Sales,Husband,White,Male,United-States,False,False,False
3,34,164280,6,0,0,40,0,0,Private,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,Portugal,False,False,False
4,72,108796,15,0,0,40,0,0,missing,Married-civ-spouse,missing,Husband,White,Male,United-States,True,True,False


In [20]:
# Positions 0-5: the six imputed numeric columns in ct3's output.
list(range(6))

[0, 1, 2, 3, 4, 5]

In [21]:
# Positions 8-17: the seven imputed categorical columns followed by the three
# categorical missing-indicator columns in ct3's output.
list(range(8, 18))

[8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [22]:
# Second stage of the manual approach, applied to ct3's output by position:
# scale positions 0-5 and one-hot encode positions 8-17. Positions 6 and 7
# (the numeric missing indicators) are not listed in either transformer.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# - confirm against the installed version before upgrading.
ct4 = ColumnTransformer(
    transformers=[
        ('rob_num', RobustScaler(), list(range(6))),
        (
            'ohe_cat',
            OneHotEncoder(handle_unknown='ignore', sparse=False),
            list(range(8, 18)),
        ),
    ]
)

In [23]:
# Keep ct3's output so it can be fed to ct4 as a second, separate step.
xtf = ct3.fit_transform(X_train)

In [24]:
# Fit ct4 on the already-imputed array and show the scaled/encoded result.
ct4.fit_transform(xtf)

array([[-0.25      ,  0.05450188,  0.66666667, ...,  0.        ,
         1.        ,  0.        ],
       [-0.85      , -0.12198358,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 0.45      ,  1.58846108,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.8       , -0.59094547,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [-0.9       ,  1.36967628, -1.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.2       , -1.28713364, -0.33333333, ...,  0.        ,
         1.        ,  0.        ]])

## The good and scalable approach

In [25]:
# Numeric branch: median imputation followed by robust scaling, chained so the
# scaler receives the imputed values.
pp_num = Pipeline(
    steps=[
        ('num_imp', SimpleImputer(strategy='median', add_indicator=False)),
        ('rob_num', RobustScaler()),
    ]
)

# Categorical branch: fill gaps with the constant 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# - confirm against the installed version before upgrading.
pp_cat = Pipeline(
    steps=[
        (
            'cat_imp',
            SimpleImputer(strategy='constant', fill_value='missing', add_indicator=False),
        ),
        ('ohe_cat', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ]
)

In [26]:
from sklearn.impute import MissingIndicator

In [27]:
# Single transformer for the whole frame: missing-value flags computed over
# all training columns, plus the numeric and categorical pipelines applied to
# their respective column lists.
ct = ColumnTransformer(
    transformers=[
        ('mi', MissingIndicator(), X_train.columns),
        ('pp_num', pp_num, num_cols),
        ('pp_cat', pp_cat, cat_cols),
    ]
)

In [29]:
# Fit the combined transformer on the training data and keep the result.
xt = ct.fit_transform(X_train)
xt

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [30]:
# Preview the transformed training matrix as a table.
pd.DataFrame(xt).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,87,88,89,90,91,92,93,94,95,96
0,0.0,0.0,0.0,0.0,0.0,-0.25,0.054502,0.666667,0.0,1887.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,0.0,0.0,-0.85,-0.121984,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.45,1.588461,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,-0.15,-0.1189,-1.333333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1.0,0.0,0.0,1.75,-0.582629,1.666667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [33]:
# Total number of missing entries left after preprocessing (expected: 0).
pd.isna(xt).sum()

0

In [31]:
# Preview the untouched held-out features.
X_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
26470,40.0,Private,188291,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40.0,United-States
1104,34.0,Self-emp-not-inc,196791,Assoc-acdm,12,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,25.0,United-States
85,53.0,Private,346253,HS-grad,9,Divorced,Sales,Own-child,White,Female,0,0,35.0,United-States
16639,39.0,Private,435638,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,40.0,United-States
28097,22.0,Private,324922,HS-grad,9,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,50.0,United-States


In [32]:
# Apply the already-fitted transformer to the test set (transform only, no refit).
ct.transform(X_test)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [34]:
# End-to-end estimator: the preprocessing ColumnTransformer followed by a
# decision tree.
# NOTE(review): the tree is created without a random_state - consider pinning
# one if the score must be reproducible when this cell is re-run on its own.
pipe_final = Pipeline(
    steps=[
        ('ct_step', ct),
        ('model', DecisionTreeClassifier()),
    ]
)

In [35]:
# Fit preprocessing and model together on the raw training frame.
pipe_final.fit(X_train, y_train)

Pipeline(steps=[('ct_step',
                 ColumnTransformer(transformers=[('mi', MissingIndicator(),
                                                  Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')),
                                                 ('pp_num',
                                                  Pipeline(steps=[('num_imp',
                                                                   SimpleImputer(strategy='med...
                                                   'education-num',
                                                   'capital-gain',
                                                   'capital-loss',
                                                   'hours-per-week']),
                                                 ('pp_cat',
                                        

In [36]:
# Predict on the raw test frame; the pipeline applies the preprocessing itself.
pipe_final.predict(X_test)

array([' >50K', ' >50K', ' <=50K', ..., ' <=50K', ' <=50K', ' <=50K'],
      dtype=object)

In [37]:
# Score the fitted pipeline on the held-out set.
pipe_final.score(X_test, y_test)

0.8165208045447566

In [39]:
# Start drilling into the fitted pipeline: its steps, keyed by name.
pipe_final.named_steps

{'ct_step': ColumnTransformer(transformers=[('mi', MissingIndicator(),
                                  Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
       dtype='object')),
                                 ('pp_num',
                                  Pipeline(steps=[('num_imp',
                                                   SimpleImputer(strategy='median')),
                                                  ('rob_num', RobustScaler())]),
                                  ['age', 'fnlwgt', 'education-num',
                                   'capital-gain', 'capital-loss',
                                   'hours-per-week']),
                                 ('pp_cat',
                                  Pipeline(steps=[('cat_imp',
                                                   SimpleImputer(fill_value='missing'

In [40]:
# The preprocessing step on its own.
pipe_final.named_steps['ct_step']

ColumnTransformer(transformers=[('mi', MissingIndicator(),
                                 Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')),
                                ('pp_num',
                                 Pipeline(steps=[('num_imp',
                                                  SimpleImputer(strategy='median')),
                                                 ('rob_num', RobustScaler())]),
                                 ['age', 'fnlwgt', 'education-num',
                                  'capital-gain', 'capital-loss',
                                  'hours-per-week']),
                                ('pp_cat',
                                 Pipeline(steps=[('cat_imp',
                                                  SimpleImputer(fill_value='missing',
                        

In [41]:
# The fitted transformers inside the ColumnTransformer, keyed by name.
pipe_final.named_steps['ct_step'].named_transformers_

{'mi': MissingIndicator(),
 'pp_num': Pipeline(steps=[('num_imp', SimpleImputer(strategy='median')),
                 ('rob_num', RobustScaler())]),
 'pp_cat': Pipeline(steps=[('cat_imp',
                  SimpleImputer(fill_value='missing', strategy='constant')),
                 ('ohe_cat',
                  OneHotEncoder(handle_unknown='ignore', sparse=False))])}

In [42]:
# The categorical sub-pipeline.
pipe_final.named_steps['ct_step'].named_transformers_['pp_cat']

Pipeline(steps=[('cat_imp',
                 SimpleImputer(fill_value='missing', strategy='constant')),
                ('ohe_cat',
                 OneHotEncoder(handle_unknown='ignore', sparse=False))])

In [43]:
# The steps of the categorical sub-pipeline, keyed by name.
pipe_final.named_steps['ct_step'].named_transformers_['pp_cat'].named_steps

{'cat_imp': SimpleImputer(fill_value='missing', strategy='constant'),
 'ohe_cat': OneHotEncoder(handle_unknown='ignore', sparse=False)}

In [44]:
# The one-hot encoder at the end of the categorical sub-pipeline.
pipe_final.named_steps['ct_step'].named_transformers_['pp_cat'].named_steps['ohe_cat']

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [45]:
# Drill down to the fitted one-hot encoder inside the categorical branch.
ohe_cat = (
    pipe_final.named_steps['ct_step']
    .named_transformers_['pp_cat']
    .named_steps['ohe_cat']
)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Prefer the new method when it exists and
# fall back to the old one, so this cell runs on either side of that change.
if hasattr(ohe_cat, 'get_feature_names_out'):
    ohe_feature_names = ohe_cat.get_feature_names_out()
else:
    ohe_feature_names = ohe_cat.get_feature_names()
ohe_feature_names

array(['x0_ Federal-gov', 'x0_ Local-gov', 'x0_ Never-worked',
       'x0_ Private', 'x0_ Self-emp-inc', 'x0_ Self-emp-not-inc',
       'x0_ State-gov', 'x0_ Without-pay', 'x0_missing', 'x1_ Divorced',
       'x1_ Married-AF-spouse', 'x1_ Married-civ-spouse',
       'x1_ Married-spouse-absent', 'x1_ Never-married', 'x1_ Separated',
       'x1_ Widowed', 'x2_ Adm-clerical', 'x2_ Armed-Forces',
       'x2_ Craft-repair', 'x2_ Exec-managerial', 'x2_ Farming-fishing',
       'x2_ Handlers-cleaners', 'x2_ Machine-op-inspct',
       'x2_ Other-service', 'x2_ Priv-house-serv', 'x2_ Prof-specialty',
       'x2_ Protective-serv', 'x2_ Sales', 'x2_ Tech-support',
       'x2_ Transport-moving', 'x2_missing', 'x3_ Husband',
       'x3_ Not-in-family', 'x3_ Other-relative', 'x3_ Own-child',
       'x3_ Unmarried', 'x3_ Wife', 'x4_ Amer-Indian-Eskimo',
       'x4_ Asian-Pac-Islander', 'x4_ Black', 'x4_ Other', 'x4_ White',
       'x5_ Female', 'x5_ Male', 'x6_ Cambodia', 'x6_ Canada',
       'x6_ Ch