In [1]:
import pandas as pd
import numpy as np

In [None]:
# TODO: the CSV file is missing from the repository and still has to be added
data = pd.read_csv("data_workflow.csv")
# Preview the first rows of the loaded table
data.head()

In [5]:
from sklearn.model_selection import train_test_split

# Separate the target column ("charges") from the explanatory features.
y = data['charges']
X = data.drop(columns='charges')

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

Nous allons réaliser les traitements suivants, dans une même pipeline:


- imputation des valeurs manquantes
- scaling des features numériques
- encodage des features catégorielles
- entraînement du modèle

In [6]:
# Preprocess the "age" column
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Two chained steps: fill missing values with the median, then standardise.
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('standard_scaler', StandardScaler()),
])

# fit() returns the pipeline itself, so fitting and transforming can be chained.
pipeline.fit(X_train[['age']]).transform(X_train[['age']])

array([[ 0.26853419],
       [-0.01403379],
       [ 0.40981817],
       [-1.42687366],
       [ 1.46944808],
       [-1.21494768],
       [ 1.11623811],
       [-1.42687366],
       [ 1.61073206],
       [-1.42687366],
       [ 0.76302814],
       [-1.0030217 ],
       [ 1.68137406],
       [ 0.76302814],
       [ 0.0566082 ],
       [ 1.39880608],
       [-1.14430569],
       [ 0.48046017],
       [-0.43788575],
       [ 1.11623811],
       [ 0.76302814],
       [-0.22595977],
       [-1.14430569],
       [-0.50852774],
       [ 0.26853419],
       [-0.08467578],
       [-0.01403379],
       [ 1.11623811],
       [-1.35623167],
       [ 0.1272502 ],
       [-0.86173771],
       [-0.93237971],
       [-1.49751565],
       [-0.01403379],
       [-0.50852774],
       [ 1.2575221 ],
       [ 1.04559611],
       [-0.57916974],
       [ 0.1272502 ],
       [ 0.62174415],
       [-1.28558967],
       [-1.07366369],
       [-1.14430569],
       [-1.42687366],
       [-1.0030217 ],
       [ 0

In [74]:
# Access the steps: a pipeline can be indexed, e.g. pipeline[0] for its first step
pipeline #[0]

**Column transformer**

Son rôle est d'appliquer, en parallèle, des traitements différents à des colonnes spécifiques.

In [76]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Numeric branch: imputation followed by scaling.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encoding; categories unseen during fit are ignored.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Each branch is applied to its own list of columns and the results are
# concatenated side by side.
preprocessor = ColumnTransformer(transformers=[
    ('num_tr', num_transformer, ['age', 'bmi']),
    ('cat_tr', cat_transformer, ['smoker', 'region']),
])

In [77]:
 # Render estimators and pipelines as HTML diagrams
from sklearn import set_config
set_config(display='diagram')
preprocessor

In [78]:
# Learn the preprocessing parameters on the training set and transform it.
X_train_transformed = preprocessor.fit_transform(X_train)

# Show the first three rows before and after preprocessing.
display(X_train.head(3), pd.DataFrame(X_train_transformed).head(3))

Unnamed: 0,age,bmi,children,smoker,region
1046,43.0,25.08,0,False,northeast
682,39.0,35.3,2,True,southwest
1037,45.0,30.495,1,True,northwest


Unnamed: 0,0,1,2,3,4,5,6,7
0,0.268471,-0.920153,1.0,0.0,1.0,0.0,0.0,0.0
1,-0.014097,0.792507,0.0,1.0,0.0,0.0,0.0,1.0
2,0.409755,-0.012711,0.0,1.0,0.0,1.0,0.0,0.0


In [15]:
# Names of the input columns seen by the ColumnTransformer when it was fitted
preprocessor.feature_names_in_

array(['age', 'bmi', 'children', 'smoker', 'region'], dtype=object)

In [16]:
# Names of the output columns, each prefixed with the name of the transformer that produced it
preprocessor.get_feature_names_out()

array(['num_tr__age', 'num_tr__bmi', 'cat_tr__smoker_False',
       'cat_tr__smoker_True', 'cat_tr__region_northeast',
       'cat_tr__region_northwest', 'cat_tr__region_southeast',
       'cat_tr__region_southwest'], dtype=object)

In [17]:
# Wrap the transformed array in a DataFrame labelled with the generated feature names.
pd.DataFrame(X_train_transformed, columns=preprocessor.get_feature_names_out()).head()

Unnamed: 0,num_tr__age,num_tr__bmi,cat_tr__smoker_False,cat_tr__smoker_True,cat_tr__region_northeast,cat_tr__region_northwest,cat_tr__region_southeast,cat_tr__region_southwest
0,0.268471,-0.920153,1.0,0.0,1.0,0.0,0.0,0.0
1,-0.014097,0.792507,0.0,1.0,0.0,0.0,0.0,1.0
2,0.409755,-0.012711,0.0,1.0,0.0,1.0,0.0,0.0
3,-1.426937,0.390317,1.0,0.0,0.0,0.0,0.0,1.0
4,1.469386,1.563371,0.0,1.0,0.0,0.0,0.0,1.0


La variable 'children' n'a pas été traitée par le ColumnTransformer : par défaut (remainder='drop'), les colonnes qui ne sont listées dans aucun transformer ne sont pas renvoyées.

In [18]:
# Same kind of preprocessing, except that columns not listed in any transformer
# are passed through unchanged instead of being dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, ['age', 'bmi']),
        ('cat_transformer', cat_transformer, ['region', 'smoker']),
    ],
    remainder='passthrough',
)

preprocessor

In [19]:
# Fit and transform the training set, then label the columns with the generated feature names.
pd.DataFrame(
    data=preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
).head(3)

Unnamed: 0,num_transformer__age,num_transformer__bmi,cat_transformer__region_northeast,cat_transformer__region_northwest,cat_transformer__region_southeast,cat_transformer__region_southwest,cat_transformer__smoker_False,cat_transformer__smoker_True,remainder__children
0,0.268471,-0.920153,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.014097,0.792507,0.0,0.0,0.0,1.0,0.0,1.0,2.0
2,0.409755,-0.012711,0.0,1.0,0.0,0.0,0.0,1.0,1.0


On peut également appliquer dans une pipeline des fonctions quelconques en les encapsulant grâce à **FunctionTransformer**.

Créons un Transformer pour arrondir les données dans notre dataframe

In [22]:
from sklearn.preprocessing import FunctionTransformer

# Transformer that rounds every value to 2 decimals.
# np.round is passed directly, with its keyword argument supplied through kw_args,
# instead of being wrapped in a lambda: a FunctionTransformer built on a lambda
# cannot be pickled, which would prevent exporting a pipeline that contains it.
rounder = FunctionTransformer(np.round, kw_args={"decimals": 2})


In [33]:
from sklearn.pipeline import FeatureUnion


def compute_bmi_age_ratio(df):
    """Return a one-column DataFrame holding the row-wise ratio bmi / age.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it must contain the "bmi" and "age" columns.

    Returns
    -------
    pandas.DataFrame
        A single column equal to df["bmi"] / df["age"].
    """
    return pd.DataFrame(df["bmi"] / df["age"])


# A named function is used instead of a lambda so that the transformer (and any
# pipeline containing it) can be pickled; when reloading in another session the
# function must be defined or importable there.
bmi_age_ratio = FunctionTransformer(compute_bmi_age_ratio)

# Concatenate the preprocessed features and the engineered ratio side by side.
union = FeatureUnion([
    ('preprocess', preprocessor),
    ('bmi_age_ratio', bmi_age_ratio),
])

union

**Quelques raccourcis**

In [34]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline, make_union

# Explicit form: every step is named by hand.
Pipeline(steps=[
    ('my_name_for_imputer', SimpleImputer()),
    ('my_name_for_scaler', StandardScaler()),
])

# Shortcut form: same steps, with step names generated automatically.
make_pipeline(SimpleImputer(), StandardScaler())

In [79]:
# Numeric branch (imputation then scaling) and categorical branch (one-hot encoding).
cat_transformer = OneHotEncoder()
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())

# Columns not listed in either branch are passed through unchanged.
preproc_basic = make_column_transformer(
    (num_transformer, ['age', 'bmi']),
    (cat_transformer, ['smoker', 'region']),
    remainder='passthrough',
)

# Append the engineered bmi/age ratio next to the preprocessed columns.
preproc_full = make_union(preproc_basic, bmi_age_ratio)
preproc_full

On aurait pu aussi utiliser make_column_selector pour sélectionner les colonnes à traiter par leur dtype

In [37]:
# Inspect the dtype of each training column
X_train.dtypes

age         float64
bmi         float64
children      int64
smoker         bool
region       object
dtype: object

In [38]:
from sklearn.compose import make_column_selector

# Selector for the float64 columns
num_col = make_column_selector(dtype_include=['float64'])
# Selector for the object and bool columns
cat_col = make_column_selector(dtype_include=['object','bool'])

In [41]:
# Column selectors based on dtype.
num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

# Transformers applied to each group of columns.
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

# Columns matched by neither selector are passed through unchanged.
preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough',
)

# Append the engineered bmi/age ratio next to the preprocessed columns.
preproc_full = make_union(preproc_basic, bmi_age_ratio)
preproc_full

**Rajoutons l'entrainement du modèle à notre pipeline**

In [42]:
from sklearn.linear_model import Ridge

# Preprocessing: columns are routed to each branch according to their dtype.
cat_transformer = OneHotEncoder()
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())

preproc = make_column_transformer(
    (num_transformer, make_column_selector(dtype_include=['float64'])),
    (cat_transformer, make_column_selector(dtype_include=['object','bool'])),
    remainder='passthrough',
)

# Full pipeline: preprocessing followed by the Ridge model.
pipe = make_pipeline(preproc, Ridge())
pipe

### Entrainement et résultats

In [47]:
# Imported in this cell so that it runs on a fresh kernel (cross_val_score is used below).
from sklearn.model_selection import cross_val_score

# Train the whole pipeline (preprocessing + model) on the training set.
pipe.fit(X_train, y_train)

# Predictions for the first two test rows (not displayed: it is not the cell's last expression).
pipe.predict(X_test.iloc[0:2])

# Scores: mean 5-fold cross-validated R² on the training set, then score on the held-out test set.
print(f"Score cross-validé moyen sur le train set: {cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2').mean()}")
print(f"Score sur le test set:{pipe.score(X_test,y_test)}")

Score cross-validé moyen sur le train set: 0.7303205220105584
Score sur le test set:0.7616723797868186


In [48]:
from sklearn.model_selection import cross_val_score

# Cross-validate the whole pipeline (5 folds, R² scoring) and average the fold scores
cross_val_score(pipe, X_train, y_train, cv=5, scoring='r2').mean()

0.7303205220105584

### Grid Search dans une pipeline

On veut vérifier quelle combinaison des paramètres du préprocessing et de l'entrainement donne les meilleurs résultats

On peut pour cela faire un GridSearch sur n'importe quel composant de la pipeline, avec la syntaxe : nom_etape__nom_transformer__nom_hyperparam

In [53]:
from sklearn.model_selection import GridSearchCV

# List every parameter of every component of the pipeline
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7f4f428005e0>),
                                   ('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7f4f42803f10>)])),
  ('ridge', Ridge())],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer',
                                

In [54]:
# Look only at the entry holding the 'columntransformer' step
pipe.get_params()['columntransformer']

In [55]:
grid_search = GridSearchCV(
    estimator=pipe,
    # Grid of hyper-parameters to try: one preprocessing option and one model option.
    param_grid={
        'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
        'ridge__alpha': [0.1, 0.5, 1, 5, 10],
    },
    scoring="r2",
    cv=5,
)

# Fits the whole pipeline for each combination, then refits it with the best parameters found.
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'columntransformer__pipeline__simpleimputer__strategy': 'median',
 'ridge__alpha': 5}

On enregistre la pipeline entrainée avec les meilleurs estimateurs

In [56]:
# Keep the pipeline refitted by the grid search with the best parameter combination
pipe_tuned = grid_search.best_estimator_

**Mettre des transformations en cache pour économiser du temps de calcul**

Certaines opérations d'une pipeline peuvent être mises en cache afin de ne pas être recalculées :

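les transformers déjà ajustés, ce qui évite de les recalculer lorsque seuls les hyperparamètres du modèle changent (par exemple pendant un GridSearch)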

In [58]:
from tempfile import mkdtemp
from shutil import rmtree


# Create a temporary directory that will serve as the cache location
cachedir = mkdtemp()

# Same preprocessing + Ridge pipeline, with caching enabled through the memory argument
# NOTE(review): this rebinds `pipe` to a new, unfitted pipeline
pipe = make_pipeline(preproc, Ridge(), memory=cachedir)

# Delete the cache directory once it is no longer needed
# NOTE(review): no fit happens between creation and deletion in this cell
rmtree(cachedir)

**Déboguer sa pipeline**

In [60]:
# acceder a chacun des composants
# List the names under which each component of the pipeline can be accessed
pipe_tuned.named_steps.keys()

dict_keys(['columntransformer', 'ridge'])

In [59]:
# vérifier une étape intermédiaire
# Inspect the output shape of the (already fitted) preprocessing step.
# transform() is used rather than fit_transform() so that checking the step
# does not refit a component of the tuned pipeline in place.
pipe_tuned.named_steps["columntransformer"].transform(X_train).shape

(896, 9)

### Exporter sa pipeline entrainée

In [None]:
# Python's pickle module can serialise most Python objects, including fitted pipelines
import pickle

from pathlib import Path
import os

# Destination of the exported file (placeholder directory: replace with a real path)
export_path = ".../.../"
pipeline_file = os.path.join(export_path, "pipeline.pkl")

# Export the trained pipeline
with open(pipeline_file, "wb") as file:
    pickle.dump(pipe_tuned, file)

# Reload the pipeline; the context manager guarantees the file handle is closed.
# NOTE: only unpickle files you trust, unpickling can execute arbitrary code.
with open(pipeline_file, "rb") as file:
    my_pipeline = pickle.load(file)

# Evaluate the reloaded pipeline on the test set
my_pipeline.score(X_test, y_test)

# AutoML

In [66]:
import os
from tpot import TPOTRegressor

# Fit the preprocessing on the training set only, then apply the same fitted
# transformation to the test set
X_train_preproc = preproc_basic.fit_transform(X_train)
X_test_preproc = preproc_basic.transform(X_test)

In [None]:
# TPOT's search is stochastic: a fixed random_state makes the run reproducible
# (same seed as the train/test split).
tpot = TPOTRegressor(
    generations=4,
    population_size=20,
    verbosity=2,
    scoring='r2',
    n_jobs=-1,
    cv=2,
    random_state=42,
)

# Search for a model on the preprocessed training data
tpot.fit(X_train_preproc, y_train)

# Score of the selected model on the preprocessed test set
print(tpot.score(X_test_preproc, y_test))