In [16]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# init data

In [17]:
cwd = os.getcwd()
root_path = os.path.dirname(cwd)
data_path = os.path.join(root_path, 'data')

In [18]:
data_df = pd.read_csv(os.path.join(data_path, 'adult.csv'))

# fill empty values (simple imputer)

In [20]:
data_df = data_df.drop(columns=['fnlwgt'])

In [21]:
data_df.dtypes

age                int64
workclass         object
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
income_class       int64
dtype: object

In [32]:
X = data_df.drop(columns=['income', 'education-num', 'income_class'])
y = data_df['income']

In [33]:
X.columns

Index(['age', 'workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
       'hours-per-week', 'native-country'],
      dtype='object')

In [19]:
imputer_nulls_cat = SimpleImputer(strategy='constant', fill_value='?')
imputer_nulls_num = SimpleImputer(strategy='mean')

In [38]:
std_scaler = StandardScaler()
range_scaler = MinMaxScaler((-1, 1))
one_hot_encoder = OneHotEncoder(sparse_output=False)
ed_ord_categories = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th',
    'HS-grad', 'Some-college', 'Assoc-acdm', 'Assoc-voc', 'Prof-school',
    'Bachelors', 'Masters', 'Doctorate']
ordinal_encoder = OrdinalEncoder(categories=[ed_ord_categories])

In [39]:
transform_num_std = Pipeline(steps=[
    ('imputer', imputer_nulls_num),
    ('std_scaler', std_scaler)
])
transform_num_range = Pipeline(steps=[
    ('imputer', imputer_nulls_num),
    ('range_scaler', range_scaler)
])
transform_cat_one_hot = Pipeline(steps=[
    ('imputer', imputer_nulls_cat),
    ('one_hot_encoder', one_hot_encoder)
])
transform_cat_ordinal = Pipeline(steps=[
    ('imputer', imputer_nulls_cat),
    ('ordinal_encoder', ordinal_encoder)
])

In [40]:
cols_num_std = [
    'capital-gain', 'capital-loss'
]
cols_num_range = [
    'age', 'hours-per-week'
]
cols_cat_one_hot = [
        'workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'
]
cols_cat_ordinal = [
    'education'
]

In [41]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_std', transform_num_std, cols_num_std),
        ('num_range', transform_num_range, cols_num_range),
        ('cat_one_hot', transform_cat_one_hot, cols_cat_one_hot),
        ('cat_ordinal', transform_cat_ordinal, cols_cat_ordinal)
    ]
)

In [36]:
preprocessor

In [43]:
preres = preprocessor.fit_transform(X)

In [44]:
preres.shape

(48842, 107)

In [45]:
preres

array([[ 0.14693247, -0.2171271 , -0.39726027, ...,  0.        ,
         0.        , 13.        ],
       [-0.14480353, -0.2171271 , -0.09589041, ...,  0.        ,
         0.        , 13.        ],
       [-0.14480353, -0.2171271 , -0.42465753, ...,  0.        ,
         0.        ,  8.        ],
       ...,
       [-0.14480353, -0.2171271 , -0.42465753, ...,  0.        ,
         0.        , 13.        ],
       [ 0.58722034, -0.2171271 , -0.26027397, ...,  0.        ,
         0.        , 13.        ],
       [-0.14480353, -0.2171271 , -0.50684932, ...,  0.        ,
         0.        , 13.        ]])