In [1]:
import pandas as pd

In [4]:
# Load the raw dataset from a CSV file located relative to the notebook
data = pd.read_csv('../data/data_workflow.csv')
# Show the loaded frame as the cell output
data

Unnamed: 0,age,bmi,children,smoker,region,charges
0,19.0,27.900,0,True,southwest,16884.92400
1,18.0,33.770,1,False,southeast,1725.55230
2,,33.000,3,False,southeast,4449.46200
3,33.0,22.705,0,False,northwest,21984.47061
4,32.0,28.880,0,False,northwest,3866.85520
...,...,...,...,...,...,...
1333,50.0,30.970,3,False,northwest,10600.54830
1334,18.0,31.920,0,False,northeast,2205.98080
1335,18.0,36.850,0,False,southeast,1629.83350
1336,21.0,25.800,0,False,southwest,2007.94500


In [3]:
# Number of leading rows used for training; the remaining rows are held out for testing
TRAIN_SIZE = 1100

# Separate the target column from the features
y = data['charges']
X = data.drop(columns='charges')

# Positional, non-shuffled split
X_train, X_test = X.iloc[:TRAIN_SIZE], X.iloc[TRAIN_SIZE:]
y_train, y_test = y.iloc[:TRAIN_SIZE], y.iloc[TRAIN_SIZE:]

## Pipeline

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [6]:
# Preprocess 'age': impute missing values, then standardize
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

# Fit on the training column, then show the transformed values
age_train = X_train[['age']]
pipe.fit(age_train)
pipe.transform(age_train)

array([[-1.45565969],
       [-1.52644066],
       [ 0.        ],
       ...,
       [-1.24331678],
       [ 0.88011225],
       [-1.03097388]])

In [9]:
pipe.steps

[('imputer', SimpleImputer()), ('scaler', StandardScaler())]

In [11]:
pipe['scaler']

### Column transformer

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [13]:
# Numerical branch: impute missing values, then scale
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Apply "num_transformer" and "cat_transformer" side by side, each on its own columns.
# Every entry is a (name, transformer, columns) tuple.
numeric_branch = ('num_tr', num_transformer, ['age', 'bmi'])
categorical_branch = ('cat_tr', cat_transformer, ['smoker', 'region'])
preprocessor = ColumnTransformer(transformers=[numeric_branch, categorical_branch])

In [14]:
preprocessor

In [15]:
# Fit the preprocessor on the training features and keep the transformed array
X_train_transformed = preprocessor.fit_transform(X_train)

# Show the first rows before and after preprocessing
n_preview = 3
display(
    X_train.head(n_preview),
    pd.DataFrame(X_train_transformed).head(n_preview),
)

Unnamed: 0,age,bmi,children,smoker,region
0,19.0,27.9,0,True,southwest
1,18.0,33.77,1,False,southeast
2,,33.0,3,False,southeast


Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.45566,-0.479092,0.0,1.0,0.0,0.0,0.0,1.0
1,-1.526441,0.492337,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.36491,1.0,0.0,0.0,0.0,1.0,0.0


We are missing the **column names**

In [16]:
def _imputer_feature_names_out(self, names=None):
    """Fallback ``get_feature_names_out`` for ``SimpleImputer``.

    Returns the feature names recorded during ``fit`` (``feature_names_in_``).
    The ``names`` argument is accepted for signature compatibility and ignored.
    """
    return self.feature_names_in_


# Only install the fallback when the installed scikit-learn does not already
# provide the method, so a native implementation is never overwritten.
if not hasattr(SimpleImputer, "get_feature_names_out"):
    SimpleImputer.get_feature_names_out = _imputer_feature_names_out

In [18]:
# NOTE: despite its name, `scaler` holds a SimpleImputer, not a scaler
scaler = SimpleImputer()
# Fit on a single-column DataFrame so that the column name is recorded
scaler.fit(data[['age']])
# Display the output feature names reported by the fitted imputer
scaler.get_feature_names_out()

array(['age'], dtype=object)

In [21]:
scaler.feature_names_in_

array(['age'], dtype=object)

In [24]:
# Rebuild the transformed training set as a DataFrame labelled with the output feature names
feature_names = preprocessor.get_feature_names_out()
X_train_transformed = pd.DataFrame(preprocessor.transform(X_train), columns=feature_names)
X_train_transformed

Unnamed: 0,num_tr__age,num_tr__bmi,cat_tr__smoker_False,cat_tr__smoker_True,cat_tr__region_northeast,cat_tr__region_northwest,cat_tr__region_southeast,cat_tr__region_southwest
0,-1.455660,-0.479092,0.0,1.0,0.0,0.0,0.0,1.0
1,-1.526441,0.492337,1.0,0.0,0.0,0.0,1.0,0.0
2,0.000000,0.364910,1.0,0.0,0.0,0.0,1.0,0.0
3,-0.464726,-1.338815,1.0,0.0,0.0,1.0,0.0,0.0
4,-0.535507,-0.316911,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
1095,-1.526441,0.091850,1.0,0.0,1.0,0.0,0.0,0.0
1096,0.809331,0.689271,0.0,1.0,1.0,0.0,0.0,0.0
1097,-1.243317,0.492337,1.0,0.0,0.0,0.0,1.0,0.0
1098,0.880112,0.013242,1.0,0.0,1.0,0.0,0.0,0.0


By default, `preprocessor` drops all columns that are not specified. This can be changed using `remainder='passthrough'`.

In [25]:
# Same two branches as before, but columns not listed in any branch are kept
preprocessor = ColumnTransformer(
    transformers=[
        ('num_tr', num_transformer, ['age', 'bmi']),
        ('cat_tr', cat_transformer, ['region', 'smoker']),
    ],
    remainder='passthrough',
)
preprocessor

In [26]:
pd.DataFrame(preprocessor.fit_transform(X_train)).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.45566,-0.479092,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,-1.526441,0.492337,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,0.0,0.36491,0.0,0.0,1.0,0.0,1.0,0.0,3.0


### Custom: Function transformer

In [32]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

In [33]:
# Create a transformer that rounds data to 2 decimals (for instance!).
# The extra argument is forwarded through `kw_args` instead of wrapping
# np.round in a lambda: the transformer then stays picklable, and there is
# no leftover intermediate assignment that rounds to 0 decimals.
rounder = FunctionTransformer(np.round, kw_args={'decimals': 2})

`FunctionTransformer` only works for stateless transformations. It cannot store information during `fit()`.

### New transformer

In [34]:
from sklearn.base import TransformerMixin, BaseEstimator

In [35]:
class MyCustomTranformer(TransformerMixin, BaseEstimator):
    """Skeleton for a custom, stateful transformer.

    BaseEstimator generates the get_params() and set_params() methods that all
    Pipelines require. TransformerMixin creates the fit_transform() method from
    fit() and transform().

    As written, the transformer learns nothing and returns its input unchanged;
    replace the bodies of fit() and transform() with the real logic.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn whatever state is needed from the training data.

        Store what needs to be kept from .fit(X_train) as instance attributes.

        Returns:
            self, so that calls can be chained: .fit(X).transform(X)
            (fit_transform() relies on this).
        """
        return self

    def transform(self, X, y=None):
        """Apply the transformation to X.

        Return the result as a DataFrame for integration into a
        ColumnTransformer. This placeholder returns X unchanged.
        """
        return X

In [36]:
# Instantiate the custom transformer
my_transformer = MyCustomTranformer()
# Fit on the training set only
my_transformer.fit(X_train)
# Apply the same fitted transformer to both splits
my_transformer.transform(X_train)
my_transformer.transform(X_test)

### Feature union ||

Applies transformations *in parallel*

In [37]:
from sklearn.pipeline import FeatureUnion

In [38]:
def bmi_age_ratio(df):
    """Return a one-column DataFrame holding ``bmi / age`` for each row of ``df``."""
    return pd.DataFrame(df["bmi"] / df["age"])


# Create a custom transformer that divides one column by another (bmi / age).
# A named function is used instead of a lambda so that the transformer, and any
# pipeline containing it, can be pickled.
bmi_age_ratio_constructor = FunctionTransformer(bmi_age_ratio)

union = FeatureUnion([
    ('preprocess', preprocessor), # columns 0-8
    ('bmi_age_ratio', bmi_age_ratio_constructor) # new column 9
])

union

It is not necessary to name the steps explicitly: the `make_*` helpers generate the names automatically.

In [39]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer

In [40]:
# Pipeline built with explicitly named steps
Pipeline([
    ('my_name_for_imputer', SimpleImputer()),
    ('my_name_for_scaler', StandardScaler())
])

# Equivalent to the following, where the step names are generated automatically
# (they appear later as 'simpleimputer' and 'standardscaler' in get_params())
make_pipeline(SimpleImputer(), StandardScaler())

In [41]:
# Numerical and categorical branches, built without naming the steps
cat_transformer = OneHotEncoder()
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())

# (transformer, columns) pairs; columns listed in neither pair are passed through
numeric_branch = (num_transformer, ['age', 'bmi'])
categorical_branch = (cat_transformer, ['smoker', 'region'])
preproc_basic = make_column_transformer(
    numeric_branch,
    categorical_branch,
    remainder='passthrough',
)

# Concatenate the basic preprocessing with the engineered ratio feature
preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)

preproc_full

`make_column_selector` selects features based on `dtype`

In [42]:
from sklearn.compose import make_column_selector

In [43]:
# Select columns by dtype instead of by name:
# float64 columns feed the numerical branch, object and bool columns the categorical one
num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

In [44]:
X_train.dtypes

age         float64
bmi         float64
children      int64
smoker         bool
region       object
dtype: object

### Final pipeline

In [45]:
# Column selectors: float columns are treated as numerical, object/bool columns as categorical
num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

# Transformers applied to each group of columns
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

# Columns matched by neither selector are passed through unchanged
preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough',
)

# Append the engineered ratio feature to the preprocessed columns
preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)

preproc_full

### Full pipe

In [46]:
from sklearn.linear_model import Ridge

In [47]:
# Preprocessor: each branch receives its columns according to their dtype
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

float_cols = make_column_selector(dtype_include=['float64'])
object_bool_cols = make_column_selector(dtype_include=['object','bool'])

preproc = make_column_transformer(
    (num_transformer, float_cols),
    (cat_transformer, object_bool_cols),
    remainder='passthrough',
)

# Chain the preprocessor with the estimator
pipe = make_pipeline(preproc, Ridge())
pipe

In [48]:
# Fit the whole Pipeline (preprocessing + model) on the training set
pipe.fit(X_train, y_train)

# Predict on the first two test rows
pipe.predict(X_test.head(2))

# Evaluate on the held-out set (displayed as the cell output)
pipe.score(X_test, y_test)

0.7472459359430914

### Cross validation

In [49]:
from sklearn.model_selection import cross_val_score

In [50]:
# Cross-validate the Pipeline on the training set, then average the fold scores
# (pass a different `scoring` to change the metric)
cv_scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2')
cv_scores.mean()

0.7463235584349777

### Grid Search

In [51]:
from sklearn.model_selection import GridSearchCV

In [53]:
# Inspect the parameters of all of the pipe's components to find the one you want to Grid Search
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7f2df936ba60>),
                                   ('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7f2df94a9a90>)])),
  ('ridge', Ridge())],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer',
                                

In [54]:
# Hyper-parameters to explore; any component of the Pipeline can be reached,
# however deeply nested, by joining the step names with double underscores
param_grid = {
    'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
    'ridge__alpha': [0.1, 0.5, 1, 5, 10],
}

# Instantiate the Grid Search
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5, scoring="r2")

grid_search.fit(X_train, y_train)

grid_search.best_params_

{'columntransformer__pipeline__simpleimputer__strategy': 'median',
 'ridge__alpha': 1}

In [55]:
pipe_tuned = grid_search.best_estimator_
pipe_tuned

### Cache to avoid repeated computations

In [56]:
from tempfile import mkdtemp
from shutil import rmtree

In [None]:
# Create a temp folder
cachedir = mkdtemp()

# Instantiate the Pipeline with the cache parameter
# NOTE(review): `steps` is not defined in any cell shown in this notebook; supply the
# list of (name, estimator) tuples before running this cell. Also note that this
# rebinds the name `pipe` used by earlier cells.
pipe = Pipeline(steps, memory=cachedir)

# Clear the cache directory after the cross-validation
rmtree(cachedir)

### Debug a Pipe

In [58]:
# Access the components of a Pipeline with `named_steps`
pipe_tuned.named_steps.keys()

dict_keys(['columntransformer', 'ridge'])

In [59]:
# Check intermediate steps by applying the already-fitted preprocessing step.
# `transform` is used rather than `fit_transform` so that inspecting the tuned
# pipeline does not re-fit one of its components as a side effect.
pipe_tuned.named_steps["columntransformer"].transform(X_train).shape

(1100, 9)

### Export pipeline

In [60]:
import pickle

In [61]:
# Export Pipeline as pickle file
with open("pipeline.pkl", "wb") as file:
    pickle.dump(pipe_tuned, file)

# Load Pipeline from pickle file; the context manager closes the file handle
# once loading is done. Only unpickle files from a trusted source: loading a
# pickle can execute arbitrary code.
with open("pipeline.pkl", "rb") as file:
    my_pipeline = pickle.load(file)

my_pipeline.score(X_test, y_test)

0.7472449607503219

## Auto ML
### Tree-based Pipeline Optimization Tool (TPOT)

In [None]:
!pip install tpot

In [63]:
import os
from tpot import TPOTRegressor



In [65]:
# Fit the preprocessor on the training features only, then reuse the fitted
# preprocessor to transform the test features
X_train_preproc = preproc_basic.fit_transform(X_train)
X_test_preproc = preproc_basic.transform(X_test)

In [67]:
# Instantiate TPOTRegressor (the target `charges` is continuous, scored with r2)
# NOTE(review): no random_state is passed, so the search is presumably not
# reproducible from one run to the next; confirm and set a seed if needed
tpot = TPOTRegressor(generations=4, population_size=20, verbosity=2, scoring='r2', n_jobs=-1, cv=2)

# Process autoML with TPOT
tpot.fit(X_train_preproc, y_train)

# Score on the preprocessed test set (displayed as the cell output)
tpot.score(X_test_preproc, y_test)

Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8449696700358738

Generation 2 - Current best internal CV score: 0.8461908465451355

Generation 3 - Current best internal CV score: 0.8461908465451355

Generation 4 - Current best internal CV score: 0.8484267595595457

Best pipeline: RandomForestRegressor(AdaBoostRegressor(input_matrix, learning_rate=0.001, loss=linear, n_estimators=100), bootstrap=False, max_features=0.4, min_samples_leaf=9, min_samples_split=16, n_estimators=100)
0.8615160628858889


