In [1]:
from sklearn.datasets import fetch_openml 

In [2]:
# Fetch version 1 of the OpenML "titanic" dataset, split into a feature
# frame (X) and a target (y).
X, y = fetch_openml(
    "titanic",
    version=1,
    return_X_y=True,
    as_frame=True,
)

In [11]:
# Preview the first three rows of the feature frame.
X.head(n=3)

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [55]:
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder, Binarizer
from sklearn.impute import SimpleImputer
from skrub import SelectCols
from sklearn.ensemble import HistGradientBoostingClassifier

# Branch 1: select pclass and sex, then one-hot encode them as a dense array.
onehot_branch = make_pipeline(
    SelectCols(["pclass", "sex"]),
    OneHotEncoder(sparse_output=False),
)

# Branch 2: select age, replace missing values with the constant 19, then
# derive two 0/1 flags from it (age > 18 and age > 12). Because the fill
# value is 19, rows with a missing age end up flagged on both thresholds.
age_flags = make_union(Binarizer(threshold=18), Binarizer(threshold=12))
age_branch = make_pipeline(
    SelectCols("age"),
    SimpleImputer(strategy="constant", fill_value=19),
    age_flags,
)

# Branch 3: forward fare and age as selected, with no further transformation.
raw_branch = SelectCols(["fare", "age"])

# Stack the outputs of the three branches side by side.
feat_pipe = make_union(onehot_branch, age_branch, raw_branch)

# Feed the combined features into a histogram-based gradient boosting classifier.
pipe = make_pipeline(feat_pipe, HistGradientBoostingClassifier())

# Final expression of the cell: lets the notebook render the pipeline.
pipe

In [64]:
# NOTE(review): this cell rebuilds feat_pipe with the same steps and
# parameters as the earlier definition in this notebook.
feat_branches = [
    # pclass and sex, one-hot encoded into a dense array.
    make_pipeline(
        SelectCols(["pclass", "sex"]),
        OneHotEncoder(sparse_output=False),
    ),
    # age with missing values filled by 19, turned into two 0/1 flags
    # (age > 18 and age > 12).
    make_pipeline(
        SelectCols("age"),
        SimpleImputer(strategy="constant", fill_value=19),
        make_union(
            Binarizer(threshold=18),
            Binarizer(threshold=12),
        ),
    ),
    # fare and age forwarded as selected.
    SelectCols(["fare", "age"]),
]

# Stack the branch outputs side by side, in the order listed above.
feat_pipe = make_union(*feat_branches)

In [84]:
import numpy as np
# Imported in this cell because it is the first one (in file order) to use
# these helpers; the only other import of them sits in a later cell, so a
# top-to-bottom run on a fresh kernel would otherwise raise NameError here.
from sklearn.compose import make_column_selector, make_column_transformer

# Pass through only the columns whose dtype is not numeric, to inspect what
# a dtype-based column selector picks up from X.
make_column_transformer(
    ("passthrough", make_column_selector(dtype_exclude=np.number))
).fit_transform(X)

array([['Allen, Miss. Elisabeth Walton', 'female', '24160', ..., 'S',
        '2', 'St Louis, MO'],
       ['Allison, Master. Hudson Trevor', 'male', '113781', ..., 'S',
        '11', 'Montreal, PQ / Chesterville, ON'],
       ['Allison, Miss. Helen Loraine', 'female', '113781', ..., 'S',
        nan, 'Montreal, PQ / Chesterville, ON'],
       ...,
       ['Zakarian, Mr. Mapriededer', 'male', '2656', ..., 'C', nan, nan],
       ['Zakarian, Mr. Ortin', 'male', '2670', ..., 'C', nan, nan],
       ['Zimmerman, Mr. Leo', 'male', '315082', ..., 'S', nan, nan]],
      dtype=object)

In [79]:
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector

# Two 0/1 indicators computed side by side from the same input:
# value > 18 and value > 12.
age_flags = make_union(Binarizer(threshold=18), Binarizer(threshold=12))

# Replace missing values with the constant 19 first, so the binarizers
# never see NaN; imputed rows therefore end up flagged on both thresholds.
age_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value=19),
    age_flags,
)

# (transformer, columns) pairs, applied column-wise and concatenated in order.
# "age" appears twice on purpose: once as thresholded flags, once unchanged.
column_specs = [
    (OneHotEncoder(), ["pclass", "sex"]),
    (age_pipe, ["age"]),
    ("passthrough", ["fare", "age"]),
]
col_tfm = make_column_transformer(*column_specs)

# Final expression of the cell: lets the notebook render the transformer.
col_tfm

In [85]:
from sklearn.model_selection import cross_val_score

# Column transformer followed by a histogram-based gradient boosting classifier.
mod_pipe = make_pipeline(col_tfm, HistGradientBoostingClassifier())

# 10-fold cross-validation using all available cores. The scores get a
# descriptive name instead of `_`, which IPython reserves for the previous
# cell output; assigning to `_` shadows that shortcut.
col_tfm_scores = cross_val_score(mod_pipe, X, y, cv=10, n_jobs=-1)

# Show the mean score followed by the per-fold scores.
col_tfm_scores.mean(), col_tfm_scores

(0.7700058719906048,
 array([0.74045802, 0.84732824, 0.86259542, 0.83969466, 0.84732824,
        0.83969466, 0.73282443, 0.63358779, 0.64885496, 0.70769231]))

In [89]:
# Simpler feature set: one-hot encode sex only, and forward fare, age and
# pclass untouched. Each entry is (name, transformer, columns).
simp_tfm = ColumnTransformer(
    transformers=[
        ("one_hot_features", OneHotEncoder(), ["sex"]),
        ("given_features", "passthrough", ["fare", "age", "pclass"]),
    ]
)

# Same classifier as before, this time on top of the simpler transformer.
mod_pipe = make_pipeline(simp_tfm, HistGradientBoostingClassifier())

# 10-fold cross-validation using all available cores. The scores get a
# descriptive name instead of `_`, which IPython reserves for the previous
# cell output; assigning to `_` shadows that shortcut.
simp_tfm_scores = cross_val_score(mod_pipe, X, y, cv=10, n_jobs=-1)

# Show the mean score followed by the per-fold scores.
simp_tfm_scores.mean(), simp_tfm_scores

(0.7639107457428069,
 array([0.7480916 , 0.83969466, 0.85496183, 0.82442748, 0.80916031,
        0.83969466, 0.74045802, 0.61068702, 0.64885496, 0.72307692]))