In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

In [3]:
df = pd.read_csv('data/income_evaluation.csv', na_values=' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.shape

(32561, 15)

In [5]:
df.isna().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [6]:
df.dropna(inplace=True)

In [7]:
df.shape

(30162, 15)

In [9]:
df.columns = df.columns.str.strip()
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [10]:
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('income', axis=1), df.income,
                                                   test_size=0.2, random_state=0)

### Step 1: Scale the numerical columns
### Step 2: One Hot Encode the categorical columns

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30162 entries, 0 to 32560
Data columns (total 15 columns):
age               30162 non-null int64
workclass         30162 non-null object
fnlwgt            30162 non-null int64
education         30162 non-null object
education-num     30162 non-null int64
marital-status    30162 non-null object
occupation        30162 non-null object
relationship      30162 non-null object
race              30162 non-null object
sex               30162 non-null object
capital-gain      30162 non-null int64
capital-loss      30162 non-null int64
hours-per-week    30162 non-null int64
native-country    30162 non-null object
income            30162 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [14]:
num_cols = [col for col in X_train.columns if X_train[col].dtypes!='O']
num_cols

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [16]:
cat_cols = [col for col in X_train.columns if X_train[col].dtypes=='O']
cat_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [20]:
X_train[num_cols].head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
30461,34,185216,10,0,0,40
18186,51,95128,9,0,0,40
24974,50,280278,9,0,0,40
25659,52,40200,10,0,0,35
13876,31,49325,9,0,0,40


In [21]:
r = RobustScaler()
r.fit(X_train[num_cols])

RobustScaler()

In [23]:
X_train_num_scaled = r.transform(X_train[num_cols])

In [25]:
X_test_num_scaled = r.transform(X_test[num_cols])

In [28]:
X_train[cat_cols].head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
30461,Private,Some-college,Never-married,Adm-clerical,Own-child,White,Male,United-States
18186,Private,HS-grad,Divorced,Craft-repair,Not-in-family,White,Male,United-States
24974,Private,HS-grad,Widowed,Prof-specialty,Unmarried,Black,Female,United-States
25659,Self-emp-not-inc,Some-college,Widowed,Craft-repair,Not-in-family,Black,Male,United-States
13876,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States


In [29]:
o = OneHotEncoder(sparse=False, handle_unknown='ignore')
o.fit(X_train[cat_cols])

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [32]:
X_train_cat_encoded = o.transform(X_train[cat_cols])

In [33]:
X_test_cat_encoded = o.transform(X_test[cat_cols])

In [37]:
pd.DataFrame(np.concatenate((X_train_num_scaled, X_train_cat_encoded), axis=1))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
0,-0.157895,0.057533,0.00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.736842,-0.693958,-0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.684211,0.850516,-0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.789474,-1.152153,0.00,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.315789,-1.076035,-0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.105263,1.676949,-1.00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,0.842105,-0.242761,-0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.105263,0.308678,0.25,0.0,0.0,-1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,0.473684,-0.131408,1.00,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,-0.052632,-1.061320,0.75,0.0,0.0,4.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [38]:
pd.DataFrame(np.concatenate((X_test_num_scaled, X_test_cat_encoded), axis=1))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
0,0.368421,-0.804695,-0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.842105,1.066033,0.00,0.0,0.0,-4.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.052632,0.064991,0.00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.368421,-1.108551,0.25,0.0,0.0,1.8,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.263158,-0.373168,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-0.684211,0.943351,-0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,-0.842105,0.123550,0.00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,-0.263158,0.142877,0.75,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,-0.105263,-0.294138,-0.25,0.0,0.0,4.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,-0.368421,1.232785,-0.25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Column Transformer enters!

In [39]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [41]:
df[['education', 'education-num']].head()

Unnamed: 0,education,education-num
0,Bachelors,13
1,Bachelors,13
2,HS-grad,9
3,11th,7
4,Bachelors,13


In [42]:
cat_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [43]:
ct = ColumnTransformer([
    ('step1', RobustScaler(), num_cols),
    ('step2', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['workclass',
                                                                     'marital-status', 'occupation',
                                                                     'relationship', 'race', 'sex',
                                                                     'native-country']),
], remainder='drop')

In [44]:
ct.fit(X_train)

ColumnTransformer(transformers=[('step1', RobustScaler(),
                                 ['age', 'fnlwgt', 'education-num',
                                  'capital-gain', 'capital-loss',
                                  'hours-per-week']),
                                ('step2',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['workclass', 'marital-status', 'occupation',
                                  'relationship', 'race', 'sex',
                                  'native-country'])])

In [45]:
ct.transform(X_train)

array([[-0.15789474,  0.05753301,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.73684211, -0.69395807, -0.25      , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.68421053,  0.85051594, -0.25      , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.68421053, -0.88395799,  0.75      , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.63157895,  0.71112538,  0.75      , ...,  1.        ,
         0.        ,  0.        ],
       [-0.15789474,  0.03218245, -0.25      , ...,  1.        ,
         0.        ,  0.        ]])

In [46]:
ct.transform(X_test)

array([[ 0.36842105, -0.80469473, -0.25      , ...,  1.        ,
         0.        ,  0.        ],
       [-0.84210526,  1.06603325,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.05263158,  0.06499053,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.42105263,  0.80389393, -1.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.21052632,  1.4814855 ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.63157895, -0.21916266,  0.75      , ...,  0.        ,
         0.        ,  0.        ]])

In [48]:
pd.DataFrame(ct.transform(X_train)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,78,79,80,81,82,83,84,85,86,87
0,-0.157895,0.057533,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.736842,-0.693958,-0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.684211,0.850516,-0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.789474,-1.152153,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.315789,-1.076035,-0.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [49]:
ct.transformers_

[('step1',
  RobustScaler(),
  ['age',
   'fnlwgt',
   'education-num',
   'capital-gain',
   'capital-loss',
   'hours-per-week']),
 ('step2',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['workclass',
   'marital-status',
   'occupation',
   'relationship',
   'race',
   'sex',
   'native-country']),
 ('remainder', 'drop', [3])]

In [54]:
ct.transformers_[0][1].scale_

array([1.90000e+01, 1.19879e+05, 4.00000e+00, 1.00000e+00, 1.00000e+00,
       5.00000e+00])

In [57]:
ct.transformers_[1][1].get_feature_names()

array(['x0_ Federal-gov', 'x0_ Local-gov', 'x0_ Private',
       'x0_ Self-emp-inc', 'x0_ Self-emp-not-inc', 'x0_ State-gov',
       'x0_ Without-pay', 'x1_ Divorced', 'x1_ Married-AF-spouse',
       'x1_ Married-civ-spouse', 'x1_ Married-spouse-absent',
       'x1_ Never-married', 'x1_ Separated', 'x1_ Widowed',
       'x2_ Adm-clerical', 'x2_ Armed-Forces', 'x2_ Craft-repair',
       'x2_ Exec-managerial', 'x2_ Farming-fishing',
       'x2_ Handlers-cleaners', 'x2_ Machine-op-inspct',
       'x2_ Other-service', 'x2_ Priv-house-serv', 'x2_ Prof-specialty',
       'x2_ Protective-serv', 'x2_ Sales', 'x2_ Tech-support',
       'x2_ Transport-moving', 'x3_ Husband', 'x3_ Not-in-family',
       'x3_ Other-relative', 'x3_ Own-child', 'x3_ Unmarried', 'x3_ Wife',
       'x4_ Amer-Indian-Eskimo', 'x4_ Asian-Pac-Islander', 'x4_ Black',
       'x4_ Other', 'x4_ White', 'x5_ Female', 'x5_ Male', 'x6_ Cambodia',
       'x6_ Canada', 'x6_ China', 'x6_ Columbia', 'x6_ Cuba',
       'x6_ Dominican

In [58]:
num_cols

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [64]:
ct1 = ColumnTransformer([
    ('step1', RobustScaler(), ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',]),
    ('step2', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['workclass',
                                                                     'marital-status', 'occupation',
                                                                     'relationship', 'race', 'sex',
                                                                     'native-country']),
    ('step3', 'passthrough', ['hours-per-week']),
    ('step4', 'drop', ['education'])
], remainder='drop')

In [66]:
pd.DataFrame(ct1.fit_transform(X_train)).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,78,79,80,81,82,83,84,85,86,87
0,-0.157895,0.057533,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,40.0
1,0.736842,-0.693958,-0.25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,40.0
2,0.684211,0.850516,-0.25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,40.0
3,0.789474,-1.152153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,35.0
4,-0.315789,-1.076035,-0.25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,40.0
