**Today's challenge:**
1. preprocessing<br>
    1.1 impute missing values and scale numerical features<br>
    1.2 encode categorical features<br>
    1.3 other preprocesssing <br><br>
    
2. fine tune model and save 

# Preprocessing pipelines

In [1]:
# Import relevant libraries
import pandas as pd
import numpy as np


In [2]:
#Import the data
data = pd.read_csv('data_workflow.csv')

data.head(5)

Unnamed: 0,age,bmi,children,smoker,region,charges
0,19.0,27.9,0,True,southwest,16884.924
1,18.0,33.77,1,False,southeast,1725.5523
2,,33.0,3,False,southeast,4449.462
3,33.0,22.705,0,False,northwest,21984.47061
4,32.0,28.88,0,False,northwest,3866.8552


In [3]:
# Create training and testing set
X = data.drop(columns='charges')
y = data['charges']

X_train = X[:1100]
y_train = y[:1100]
X_test = X[1100:]
y_test = y[1100:]

In [5]:
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

## Impute missing values and scale numerical features

In [6]:
from sklearn.pipeline import Pipeline
# Preprocess "age"
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

pipe.fit(X_train[['age']])
pipe.transform(X_train[['age']])

array([[-1.45565969],
       [-1.52644066],
       [ 0.        ],
       ...,
       [-1.24331678],
       [ 0.88011225],
       [-1.03097388]])

In [9]:
# access steps

pipe[]
pipe['imputer']

SimpleImputer()

In [14]:
pipe[0]

SimpleImputer()

## Encode categorical features

In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Impute then Scale for numerical variables: 
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())])

# Encode categorical variables
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# cat_transformer = Pipeline([
#     ('imputer', #Categorical),
#     ('scaler', StandardScaler())])

# Paralellize "num_transformer" and "One hot encoder"
preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['age','bmi']),
    ('cat_tr', cat_transformer, ['smoker', 'region'])])

In [16]:
 # visualizing pipelines in HTML
from sklearn import set_config; set_config(display='diagram')
preprocessor

In [None]:
#How many columns do we expect to see?


In [17]:
X_train_transformed = preprocessor.fit_transform(X_train)

display(X_train.head(3))
display(pd.DataFrame(X_train_transformed).head(3))

Unnamed: 0,age,bmi,children,smoker,region
0,19.0,27.9,0,True,southwest
1,18.0,33.77,1,False,southeast
2,,33.0,3,False,southeast


Unnamed: 0,0,1,2,3,4,5,6,7
0,-1.45566,-0.479092,0.0,1.0,0.0,0.0,0.0,1.0
1,-1.526441,0.492337,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.36491,1.0,0.0,0.0,0.0,1.0,0.0


In [18]:
#Where are the column names?

# This should be fixed in Scikit-Learn 1.0.2: all transformers will have this method.
# SimpleImputer does not have a get_feature_names_out, so we need to add it manually.
SimpleImputer.get_feature_names_out = (lambda self, names=None: self.feature_names_in_)

In [19]:
# Get your feature
preprocessor.get_feature_names_out()

array(['num_tr__age', 'num_tr__bmi', 'cat_tr__smoker_False',
       'cat_tr__smoker_True', 'cat_tr__region_northeast',
       'cat_tr__region_northwest', 'cat_tr__region_southeast',
       'cat_tr__region_southwest'], dtype=object)

In [20]:
pd.DataFrame(X_train_transformed, 
             columns=preprocessor.get_feature_names_out()
            ).head()

Unnamed: 0,num_tr__age,num_tr__bmi,cat_tr__smoker_False,cat_tr__smoker_True,cat_tr__region_northeast,cat_tr__region_northwest,cat_tr__region_southeast,cat_tr__region_southwest
0,-1.45566,-0.479092,0.0,1.0,0.0,0.0,0.0,1.0
1,-1.526441,0.492337,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.36491,1.0,0.0,0.0,0.0,1.0,0.0
3,-0.464726,-1.338815,1.0,0.0,0.0,1.0,0.0,0.0
4,-0.535507,-0.316911,1.0,0.0,0.0,1.0,0.0,0.0


In [21]:
preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['age','bmi']),
    ('cat_tr', cat_transformer, ['region','smoker'])],
    remainder='passthrough')
preprocessor

In [22]:
pd.DataFrame(preprocessor.fit_transform(X_train)).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.45566,-0.479092,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,-1.526441,0.492337,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,0.0,0.36491,0.0,0.0,1.0,0.0,1.0,0.0,3.0


In [23]:
# # Get your feature
preprocessor.get_feature_names_out()

array(['num_tr__age', 'num_tr__bmi', 'cat_tr__region_northeast',
       'cat_tr__region_northwest', 'cat_tr__region_southeast',
       'cat_tr__region_southwest', 'cat_tr__smoker_False',
       'cat_tr__smoker_True', 'remainder__children'], dtype=object)

## Other preprocessing

### Custom: Function Transformer

In [24]:
from sklearn.preprocessing import FunctionTransformer

# Create a transformer that compresses data to 2 digits (for instance!)
rounder = FunctionTransformer(np.round)
rounder = FunctionTransformer(lambda array: np.round(array, decimals=2))

**Reminder of lambda function**

In [25]:
# #lambda arguments : expression
x = lambda a : a + 10
print(x(5))

15


In [26]:
# Add it at the end of our numerical transformer
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('rounder', rounder)])

preprocessor = ColumnTransformer([
    ('num_tr', num_transformer, ['bmi', 'age']),
    ('cat_tr', cat_transformer, ['region', 'smoker'])],
    remainder='passthrough')
preprocessor

In [27]:
#Let's see the impact of the function transformer
pd.DataFrame(preprocessor.fit_transform(X_train)).head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.48,-1.46,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.49,-1.53,0.0,0.0,1.0,0.0,1.0,0.0,1.0


**IMPORTANT** Can only be applied to state-less transformations

### Transformers under the hood

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator

class MyCustomTranformer(TransformerMixin, BaseEstimator): 
    # BaseEstimator generates get_params() and set_params() methods that all pipelines require
    # TransformerMixin creates fit_transform() method from fit() and transform()
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        # Store here what needs to be stored during .fit(X_train) as instance attributes.
        # Return "self" to allow chaining .fit().transform()
       
        pass
    
    def transform(self, X, y=None):
        # Return result as dataframe for integration into ColumnTransformer
        pass

In [None]:
my_transformer = MyCustomTranformer()

my_transformer.fit(X_train)

my_transformer.transform(X_train)

my_transformer.transform(X_test)

### Feature Union

In [28]:
from sklearn.pipeline import FeatureUnion

# Create a custom transformer that multiplies two columns
bmi_age_ratio_constructor = FunctionTransformer(lambda df: pd.DataFrame(df["bmi"] / df["age"]))

union = FeatureUnion([
    ('preprocess', preprocessor), # columns 0-8
    ('bmi_age_ratio', bmi_age_ratio_constructor) # new colums 9
])
union

In [30]:
X

Unnamed: 0,age,bmi,children,smoker,region
0,19.0,27.900,0,True,southwest
1,18.0,33.770,1,False,southeast
2,,33.000,3,False,southeast
3,33.0,22.705,0,False,northwest
4,32.0,28.880,0,False,northwest
...,...,...,...,...,...
1333,50.0,30.970,3,False,northwest
1334,18.0,31.920,0,False,northeast
1335,18.0,36.850,0,False,southeast
1336,21.0,25.800,0,False,southwest


In [32]:
union = FeatureUnion([
    (#pass through data frame)
    ('bmi_age_ratio', bmi_age_ratio_constructor) # new colums 9,
        remainder = 'passthrough'
])


union.fit(X)


In [33]:
pd.DataFrame(union.fit_transform(X)).head()

Unnamed: 0,0
0,1.468421
1,1.876111
2,
3,0.68803
4,0.9025


In [29]:
pd.DataFrame(union.fit_transform(X_train)).head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.48,-1.46,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.468421


In [35]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer

Pipeline([
    ('my_name_for_imputer', SimpleImputer()),
    ('my_name_for_scaler', StandardScaler())
])

# Equivalent to
make_pipeline(SimpleImputer(), StandardScaler())

In [36]:
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc_basic = make_column_transformer((num_transformer, ['age', 'bmi']),
                                       (cat_transformer, ['smoker', 'region']),
                                       remainder='passthrough')

preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

In [37]:
#make_column_selector selects features automatically based on dtype

from sklearn.compose import make_column_selector

num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

X_train.dtypes

age         float64
bmi         float64
children      int64
smoker         bool
region       object
dtype: object

In [38]:
#Complete preprocessing pipeline

num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
num_col = make_column_selector(dtype_include=['float64'])

cat_transformer = OneHotEncoder()
cat_col = make_column_selector(dtype_include=['object','bool'])

preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough')

preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

# Tuning the pipeline

## Including models in pipelines

In [39]:
from sklearn.linear_model import Ridge

# Preprocessor
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

preproc = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=['float64'])),
    (cat_transformer, make_column_selector(dtype_include=['object','bool'])),
    remainder='passthrough')

# Add Estimator
pipe = make_pipeline(preproc, Ridge())
pipe

## Using the pipeline

In [None]:
# X_train
# y_train

In [40]:
# Train pipeline
pipe.fit(X_train,y_train)

# Make predictions
pipe.predict(X_test.iloc[0:2])

# Score model
pipe.score(X_test,y_test)

0.7472459359430912

## Cross validating the pipeline

In [41]:
from sklearn.model_selection import cross_val_score

# Cross validate pipeline
cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2').mean()

0.7463235584349777

In [None]:
# pipe.get_params()

In [43]:
from sklearn.model_selection import GridSearchCV

# Inspect all pipe components parameters to find the one you want to gridsearch
pipe.get_params()

# Instanciate grid search
grid_search = GridSearchCV(
    pipe, 
    param_grid={
        # Access any component of the pipeline, as far back as you want
        'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
        'ridge__alpha': [0.1, 0.5, 1, 5, 10]},
    cv=5,
    scoring="r2")

In [47]:
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x135f28490>),
                                   ('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x135e46f70>)])),
  ('ridge', Ridge())],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer',
                                      

In [44]:
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'columntransformer__pipeline__simpleimputer__strategy': 'median',
 'ridge__alpha': 1}

In [45]:
pipe_tuned = grid_search.best_estimator_

## Debugging the pipeline

In [48]:
# Access component of pipeline with `name_steps`
pipe_tuned.named_steps.keys()



dict_keys(['columntransformer', 'ridge'])

In [49]:
# Check intermediate steps
pipe_tuned.named_steps["columntransformer"].fit_transform(X_train).shape

(1100, 9)

## Exporting models/pipelines

In [None]:
import pickle

# Export pipeline as pickle file
with open("pipeline.pkl", "wb") as file:
    pickle.dump(pipe_tuned, file)

# Load pipeline from pickle file
my_pipeline = pickle.load(open("pipeline.pkl","rb"))

my_pipeline.score(X_test, y_test)