In [1]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import sys
import joblib

# Show NumPy arrays in full (no truncation), with 3 decimals, on wide lines.
large_width = 400
np.set_printoptions(threshold=sys.maxsize, precision=3, linewidth=large_width)

# Columns used in this notebook; indexing with this list also fixes their order.
columns = [
    'season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit',
    'temp', 'atemp', 'hum', 'windspeed', 'rentals',
]
A_rentals = pd.read_csv(
    "https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/data/daily-bike-share.csv"
)[columns]

# SciKit-Learn Pipelines

![SKLearn](sklearn.png)

Rodrigo Agundez - 06 April 2023 - [SciKit-Learn Pipelines Tutorial](https://towardsdatascience.com/step-by-step-tutorial-of-sci-kit-learn-pipeline-62402d5629b6)

# SciKit-Learn Pipelines

- ## Transformer: Data in, data out.
- ## Estimator: Data in, state out -> Transformer
- ## Predictor: Data in, data out.
- ## Model: Data in, state out -> Predictor

In [2]:
# Peek at five randomly drawn rows; no random_state is set, so the rows change on every run.
A_rentals.sample(n=5)

Unnamed: 0,season,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,rentals
31,1,2,0,2,1,2,0.192174,0.23453,0.829565,0.053213,47
704,4,12,0,3,1,1,0.438333,0.428012,0.485,0.324021,331
224,3,8,0,6,0,2,0.685833,0.645858,0.729583,0.211454,1504
115,2,4,0,2,1,1,0.631667,0.594083,0.729167,0.3265,678
700,4,12,0,6,0,2,0.298333,0.316904,0.806667,0.059704,951


# For numerical columns

- Impute missing values with the mean
- Standardize to zero mean and unit variance

In [3]:
# Columns routed through the numeric branch.
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']

# Steps run in the order listed: fill missing values with the column mean,
# then standardize with StandardScaler.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# For categorical columns

- Impute missing values
- Ordinal and OneHot encode

In [4]:
# --- One-hot branch ---------------------------------------------------------
# NOTE(review): the name is a typo of "onehot_features"; it is kept because the
# ColumnTransformer cell further down refers to it by this exact name.
# NOTE(review): 'weathersit' also appears in ordinal_features below, so it is
# encoded by both branches. Confirm this duplication is intentional.
onthot_features = ["season", "weathersit"]

# Missing entries are replaced by the constant -1 before encoding;
# drop='first' omits the first category's indicator column for each feature.
onehot_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
        ('encoder', OneHotEncoder(sparse_output=False, drop='first')),
    ]
)

# --- Ordinal branch ---------------------------------------------------------
ordinal_features = ['mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

# Same constant imputation, followed by integer codes per category.
ordinal_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
        ('encoder', OrdinalEncoder()),
    ]
)

# Create single object for all operations

In [5]:
# Each entry is (name, transformer, columns): every branch sees only its own
# columns. remainder='drop' is scikit-learn's default, spelled out here to make
# it visible that columns not listed in any branch are discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
        ('ordinal', ordinal_transformer, ordinal_features),
        ('onehot', onehot_transformer, onthot_features),
    ],
    remainder='drop',
)

# Single-step pipeline holding the preprocessor; the bare name on the last
# line makes the notebook render it.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
pipeline

# Let's use it

In [6]:
# Fit every step on the data and transform it in the same call.
X = pipeline.fit_transform(A_rentals)

# The result is wrapped in a DataFrame only for display; head() shows 5 rows by default.
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-0.826662,-0.679946,1.250171,-0.387892,0.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-0.721095,-0.740652,0.479113,0.749602,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-1.634657,-1.749767,-1.339274,0.746632,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.61478,-1.61027,-0.263182,-0.389829,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.467414,-1.504971,-1.341494,-0.046307,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save it

In [7]:
# Write the fitted pipeline to a file in the current working directory.
joblib.dump(pipeline, filename='pipeline.joblib')

['pipeline.joblib']

# And reuse it

In [8]:
# Clear the interactive namespace (-f: force, without asking for confirmation)
# so the cells below cannot rely on anything defined earlier in the notebook.
%reset -f

# The reset also removed the earlier imports, so bring back what is needed here.
import joblib
import pandas as pd

# Same CSV URL as the one loaded in the first cell of the notebook.
new_data = pd.read_csv("https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/data/daily-bike-share.csv")

In [9]:
# Restore the pipeline written by joblib.dump earlier in the notebook.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
pipeline = joblib.load(filename='pipeline.joblib')

# Only transform() is called here: the loaded pipeline is used as saved, not refitted.
X = pipeline.transform(new_data)
pd.DataFrame(X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-0.826662,-0.679946,1.250171,-0.387892,0.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-0.721095,-0.740652,0.479113,0.749602,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-1.634657,-1.749767,-1.339274,0.746632,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.61478,-1.61027,-0.263182,-0.389829,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.467414,-1.504971,-1.341494,-0.046307,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Custom Transformer

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class MultiplyColumns(BaseEstimator, TransformerMixin):
    """Multiply selected DataFrame columns by a constant factor.

    Parameters
    ----------
    by : scalar, default=1
        Factor the selected columns are multiplied by.
    columns : list of column labels, default=None
        Columns to multiply. When ``None`` (or any other falsy value, such
        as an empty list), every column of the input is multiplied.
    """

    def __init__(self, by=1, columns=None):
        # Constructor arguments are stored unchanged under the same names,
        # which is what BaseEstimator's parameter handling expects.
        self.by = by
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected columns multiplied by ``by``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input data; must expose ``.columns`` and label-based column access.
        y : ignored
            Accepted only for signature compatibility.

        Returns
        -------
        pandas.DataFrame
            A new frame. ``X`` itself is left untouched, so calling
            ``transform`` repeatedly on the same input gives the same result.
        """
        cols_to_transform = self.columns if self.columns else list(X.columns)
        # Work on a copy: assigning into X directly would modify the caller's
        # DataFrame and compound the multiplication on every further call.
        X_out = X.copy()
        X_out[cols_to_transform] = X_out[cols_to_transform] * self.by
        return X_out

In [11]:
# The custom transformer plugs into a Pipeline like any built-in step.
Pipeline([("multiply_columns", MultiplyColumns())])

# [SparkML Intro](03-sparkml_introduction.ipynb)