# <font color = 'lightblue'><b>Column Transformer</b></font>

In [19]:
import numpy as np
import pandas as pd

In [20]:
# Toy employee table: two numeric columns (each with one missing value),
# three categorical columns, and a binary 'target' label.
column_names = ['age', 'salary', 'gender', 'education', 'department', 'target']
records = [
    (25,     50000,  'M', 'Bachelor', 'HR',          'yes'),
    (30,     60000,  'F', 'Master',   'Engineering', 'no'),
    (np.nan, 70000,  'M', 'PhD',      'HR',          'yes'),
    (35,     np.nan, 'F', 'Bachelor', 'Marketing',   'no'),
    (40,     100000, 'F', 'Master',   'Engineering', 'yes'),
]
data = pd.DataFrame(records, columns=column_names)

data

Unnamed: 0,age,salary,gender,education,department,target
0,25.0,50000.0,M,Bachelor,HR,yes
1,30.0,60000.0,F,Master,Engineering,no
2,,70000.0,M,PhD,HR,yes
3,35.0,,F,Bachelor,Marketing,no
4,40.0,100000.0,F,Master,Engineering,yes


## <font color = 'lightblue'><b>Preprocessing Steps</b></font>
#### `Imputation`: Fill missing values in the numerical features (`age`, `salary`) with the column mean.
#### `Scaling`: Standardize the numerical features.
#### `One-Hot Encoding`: Apply to gender and department.
#### `Ordinal Encoding`: Apply to education.
#### `Label Encoding`: Apply to target.

In [21]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [22]:
# Numeric branch of the preprocessing: replace missing values with the
# column mean first, then standardize the imputed values.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [31]:
# Route each group of columns through its own transformer. The transformed
# blocks are concatenated side by side in the order they are listed here:
# numeric columns, then the one-hot columns, then the ordinal column.
preprocessor = ColumnTransformer(
    transformers = [
        # Numeric columns: mean imputation followed by standardization.
        ('num', numerical_transformer, ['age', 'salary']),
        # Nominal columns: one indicator column per category.
        ('one-hot-encoding', OneHotEncoder(), ['gender', 'department']),
        # Ordered categories: the explicit list fixes the integer codes
        # (Bachelor -> 0, Master -> 1, PhD -> 2).
        ('ordinal', OrdinalEncoder(categories=[['Bachelor', 'Master', 'PhD']]), ['education'])
    ],
    # OneHotEncoder produces sparse output, and with the default threshold
    # (0.3) the stacked result turns into a sparse matrix as soon as the
    # overall density falls below it. The result is wrapped in a DataFrame
    # further down, so always return a dense array.
    sparse_threshold=0,
)

In [32]:
# Encoder for the target labels; kept outside the ColumnTransformer because
# it is fitted on y rather than on the feature columns.
le = LabelEncoder()

In [33]:
# Separate the label column from the predictors.
y = data['target']
X = data.drop(columns='target')

In [34]:
# Fit every transformer on the feature frame and transform it in one pass.
X_preprocessed = preprocessor.fit_transform(X)
# Fit the label encoder on the target and convert the labels to integer codes.
y_preprocessed = le.fit_transform(y)

In [36]:
# Take the column labels from the fitted transformer instead of hard-coding
# them: a hand-written list silently depends on the transformer order and on
# the category order the one-hot encoder discovered during fit, so it would
# mislabel columns (or fail on a length mismatch) as soon as the data changes.
# ColumnTransformer prefixes each name with "<transformer name>__" by default;
# that prefix is stripped so the labels stay 'age', 'gender_F', ...
# NOTE: requires a scikit-learn version in which Pipeline and SimpleImputer
# implement get_feature_names_out.
feature_names = [
    name.split('__', 1)[-1]
    for name in preprocessor.get_feature_names_out()
]
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names)

In [37]:
# Wrap the encoded labels in a single-column frame named 'target'.
y_preprocessed_df = pd.DataFrame({'target': y_preprocessed})

In [40]:
# Put the encoded target next to the transformed features; the two frames
# are aligned on their row index.
df = pd.concat((X_preprocessed_df, y_preprocessed_df), axis='columns')

In [41]:
# Show the combined frame: transformed features plus the encoded target.
df

Unnamed: 0,age,salary,gender_F,gender_M,department_Engineering,department_HR,department_Marketing,education,target
0,-1.5,-1.195229,0.0,1.0,0.0,1.0,0.0,0.0,1
1,-0.5,-0.597614,1.0,0.0,1.0,0.0,0.0,1.0,0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,1
3,0.5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
4,1.5,1.792843,1.0,0.0,1.0,0.0,0.0,1.0,1
