<a href="https://colab.research.google.com/github/rafasacaan/the-notebook/blob/main/sklearn/sklearn_column_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [68]:
from sklearn.datasets import fetch_openml

# Download the Titanic dataset from OpenML as pandas objects:
# X receives the feature columns and y the target.
X, y = fetch_openml(
    name="titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
    parser="auto",
)
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [69]:
# Dimensions of the feature frame as (n_rows, n_columns).
X.shape

(1309, 13)

In [None]:
!pip install skrub

In [73]:
import pandas as pd

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Binarizer
from skrub import SelectCols
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score

In [74]:
from sklearn.compose import ColumnTransformer

**Feature pipeline**

We can use existing scikit-learn transformers, or write our own transformer classes, and pass them to a `ColumnTransformer`.

In [75]:
from sklearn.base import BaseEstimator, TransformerMixin

In [128]:
# Transformer functions and classes


class CustomTransformer(BaseEstimator, TransformerMixin):
    """Multiply a single DataFrame column by a constant factor.

    Parameters
    ----------
    column_name : str
        Label of the column to scale.
    factor : int or float, default=2
        Multiplier applied to the column. The default keeps the original
        "double the column" behaviour.
    """

    def __init__(self, column_name, factor=2):
        # Constructor arguments are stored unchanged under the same names.
        self.column_name = column_name
        self.factor = factor

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no state to learn."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``column_name`` multiplied by ``factor``.

        The input frame itself is left untouched.
        """
        X_transformed = X.copy()
        # Vectorised multiplication instead of a Python-level ``apply`` per element.
        X_transformed[self.column_name] = (
            X_transformed[self.column_name] * self.factor
        )
        return X_transformed


# Age features: replace missing ages with a constant, then emit two 0/1
# flags side by side (age above 18, age above 12).
# The step names are the ones make_pipeline / make_union would generate.
age_pipe = Pipeline(
    steps=[
        ("simpleimputer", SimpleImputer(strategy="constant", fill_value=19)),
        (
            "featureunion",
            FeatureUnion(
                transformer_list=[
                    ("binarizer-1", Binarizer(threshold=18)),
                    ("binarizer-2", Binarizer(threshold=12)),
                ]
            ),
        ),
    ]
)

In [129]:
# Helper function to get feature names from a fitted ColumnTransformer.
# Names come back in output-column order, without the "<step name>__" prefix.

def get_feature_names(column_transformer, input_features):
    """List the output column names produced by a fitted ColumnTransformer.

    Parameters
    ----------
    column_transformer : fitted ColumnTransformer
        Only its ``transformers_`` attribute, a sequence of
        ``(name, transformer, columns)`` tuples, is read.
    input_features : pandas.DataFrame
        The frame the transformer was fitted on. Its ``columns`` are used to
        turn positional column selections into labels.

    Returns
    -------
    list
        One name per output column, in output order. Columns that are
        dropped contribute nothing.
    """

    def _selected_labels(columns):
        # A bare string selects exactly one column; do not iterate over it.
        if isinstance(columns, str):
            return [columns]
        if isinstance(columns, (list, tuple)) and all(
            isinstance(col, str) for col in columns
        ):
            return list(columns)
        # Positions, boolean masks and slices are resolved against the frame.
        picked = input_features.columns[columns]
        # Scalar indexing yields a single label, anything else an Index.
        return list(picked) if getattr(picked, "ndim", 0) else [picked]

    def _output_names(transformer, labels):
        namer = getattr(transformer, "get_feature_names_out", None)
        if namer is None:
            # "passthrough" and transformers without naming support:
            # fall back to the labels of the columns they received.
            return labels
        # Prefer names derived from the real input labels, then the
        # transformer's own defaults. A Pipeline is asked as a whole so that
        # only its final output is counted, not every intermediate step.
        for args in ((labels,), ()):
            try:
                return list(namer(*args))
            except (AttributeError, ValueError, TypeError):
                # e.g. a pipeline step that cannot provide names, or labels
                # that do not match what the transformer was fitted on.
                continue
        return labels

    feature_names = []

    for _name, transformer, columns in column_transformer.transformers_:
        # Dropped columns (explicit or via remainder="drop") never reach the output.
        if isinstance(transformer, str) and transformer == "drop":
            continue

        labels = _selected_labels(columns)
        if not labels:
            # An empty selection produces no output columns.
            continue

        feature_names.extend(_output_names(transformer, labels))

    return feature_names

In [144]:
# The transformer

# A note about ColumnTransformer:
# its transformers are not chained, i.e. the output of one entry is not
# the input of the next. Instead, every entry is fed its columns
# from the initial dataframe.

# Each entry is (step name, transformer, columns handed to it).
# The outputs of the entries are placed side by side in the order listed.
column_trf = ColumnTransformer(
    [
        (
            "one_hot_features",
            OneHotEncoder(),
            ["pclass", "sex"],
        ),
        (
            "age_pipe",
            age_pipe,
            ["age"],
        ),
        (
            "fare_mul",
            CustomTransformer("fare"),
            ["fare"],
        ),
        (
            "age_mul",
            CustomTransformer("age"),
            ["age"],
        ),
        (
            "given_features",
            "passthrough",
            ["fare", "age"],
        ),
    ]
)

column_trf

In [145]:
# Fit on the full frame and check the shape of the transformed matrix.
column_trf.fit_transform(X).shape

(1309, 11)

In [146]:
# First five transformed rows (note: this fits the transformer again).
column_trf.fit_transform(X)[:5,:]

array([[  1.    ,   0.    ,   0.    ,   1.    ,   0.    ,   1.    ,
          1.    , 422.675 ,  58.    , 211.3375,  29.    ],
       [  1.    ,   0.    ,   0.    ,   0.    ,   1.    ,   0.    ,
          0.    , 303.1   ,   1.8334, 151.55  ,   0.9167],
       [  1.    ,   0.    ,   0.    ,   1.    ,   0.    ,   0.    ,
          0.    , 303.1   ,   4.    , 151.55  ,   2.    ],
       [  1.    ,   0.    ,   0.    ,   0.    ,   1.    ,   1.    ,
          1.    , 303.1   ,  60.    , 151.55  ,  30.    ],
       [  1.    ,   0.    ,   0.    ,   1.    ,   0.    ,   1.    ,
          1.    , 303.1   ,  50.    , 151.55  ,  25.    ]])

Modeling pipeline

In [147]:
# Chain preprocessing and classifier into one estimator, so cross-validation
# fits the column transformer together with the model on each training split.
model_pipeline = make_pipeline(
    column_trf,
    HistGradientBoostingClassifier(random_state=42)
)

# Store the scores under a descriptive name: `_` is IPython's "last output"
# variable, and assigning to it interferes with the output history.
cv_scores = cross_val_score(model_pipeline, X, y, cv=10, n_jobs=-1)
cv_scores.mean(), cv_scores

(0.7700058719906048,
 array([0.74045802, 0.84732824, 0.86259542, 0.83969466, 0.84732824,
        0.83969466, 0.73282443, 0.63358779, 0.64885496, 0.70769231]))