This tutorial illustrates how to use transformers to build a pipeline in a machine-learning project. The benefits are:   

- Make models easier to use, as it's a single object you can save/restore.   

- Make models more reproducible because all preprocessing and modelling steps are done together.   

- Reduce the chance of data leakage, because all operations are applied separately to the training and validation sets.

- Make hyperparameter search easier because it's a single object.

[credit](https://queirozf.com/entries/scikit-learn-pipelines-custom-pipelines-and-pandas-integration).   

[useful](https://campus.datacamp.com/courses/case-study-school-budgeting-with-machine-learning-in-python/improving-your-model?ex=1): Case Study: School Budgeting with Machine Learning in Python

In [2]:
import sklearn
import pandas as pd
import numpy as np
import scipy

In [3]:
# Show the versions of the libraries imported above.
sklearn.__version__, pd.__version__, np.__version__, scipy.__version__

('0.22.1', '1.0.1', '1.18.1', '1.4.1')

## Custom dataframe transformer   
which applies an arbitrary function to a pandas DataFrame

In [4]:
from sklearn.pipeline import Pipeline

class DataframeFunctionTransformer:
    """Pipeline step that applies a user-supplied function to its input.

    The step is stateless: ``fit`` learns nothing, and ``transform`` simply
    hands its input to the wrapped function and returns the result.
    """

    def __init__(self, func):
        # Callable invoked by ``transform`` with the input as its only argument.
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return this instance so that calls can be chained."""
        return self

    def transform(self, input_df, **transform_params):
        """Return ``self.func(input_df)``; extra keyword arguments are ignored."""
        result = self.func(input_df)
        return result

In [5]:
def process_dataframe(input_df):
    """Return a copy of ``input_df`` whose ``text`` column is upper-cased.

    The caller's frame is left untouched: ``assign`` builds a new DataFrame
    instead of writing the column back into ``input_df``. Missing values in
    ``text`` are passed through unchanged rather than raising.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing a string column named ``"text"``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, in the same order.

    Raises
    ------
    KeyError
        If ``input_df`` has no ``"text"`` column.
    """
    return input_df.assign(text=input_df["text"].str.upper())

In [6]:
# Small example frame: a numeric id and a mixed-case text column.
df = pd.DataFrame(
    data={
        "id": [1, 2, 3, 4],
        "text": ["foo", "Bar", "BAz", "quux"],
    }
)

In [7]:
df

Unnamed: 0,id,text
0,1,foo
1,2,Bar
2,3,BAz
3,4,quux


In [8]:
# The step name describes what process_dataframe does: it upper-cases the
# "text" column.
pipeline = Pipeline([
    ("uppercase", DataframeFunctionTransformer(process_dataframe))
])

In [9]:
# Run the single-step pipeline on df; the displayed result has the text
# column upper-cased.
pipeline.fit_transform(df)

Unnamed: 0,id,text
0,1,FOO
1,2,BAR
2,3,BAZ
3,4,QUUX


## Sparse to dense matrix   

For example, you may need to add a step that turns a sparse matrix into a dense matrix, if you need to use a method that requires dense matrices such as GaussianNB or PCA:

In [10]:
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.base import TransformerMixin,BaseEstimator

from sklearn.pipeline import Pipeline

In [11]:
# Toy feature matrix (8 samples x 6 features), stored as CSR.
data = scipy.sparse.csr_matrix(
    np.array(
        [
            [1, 0, 0, 0, 0, 0],
            [0, 1, 0, 0, 0, 0],
            [1, 0, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0],
            [0, 0, 0, 1, 0, 0],
            [1, 0, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
            [1, 1, 0, 0, 0, 0],
        ],
        dtype=np.float64,
    )
)

# One binary label per row of ``data``.
target = np.array([1, 1, 1, 0, 0, 0, 1, 1])

In [12]:
class ToDenseTransformer():
    """Stateless pipeline step that converts a scipy sparse matrix to a dense array."""

    # here you define the operation it should perform
    def transform(self, X, y=None, **fit_params):
        """Return the contents of sparse ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used rather than ``todense()``: the latter returns
        a ``numpy.matrix``, which newer scikit-learn releases refuse as
        estimator input. ``y`` and any extra keyword arguments are ignored.
        """
        return X.toarray()

    # just return self
    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return this instance so calls can be chained."""
        return self

# The sparse input is densified first so that PCA receives a dense matrix.
pipeline = Pipeline(steps=[
    ("to_dense", ToDenseTransformer()),
    ("pca", PCA()),
    ("clf", DecisionTreeClassifier()),
])

# fit() returns the pipeline itself, so the prediction can be chained onto it.
pipeline.fit(data, target).predict(data)

array([1, 1, 1, 0, 0, 1, 1, 1])

## Select Columns Transformer   
to pick only the columns we need.

In [13]:
import pandas as pd

from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.pipeline import Pipeline

class SelectColumnsTransformer():
    """Stateless pipeline step that keeps only the requested DataFrame columns."""

    def __init__(self, columns=None):
        # Column labels to keep; ``None`` (the default) means "keep every column".
        self.columns = columns

    def transform(self, X, **transform_params):
        """Return a copy of ``X`` restricted to ``self.columns``.

        With ``columns=None`` the whole frame is copied; indexing with
        ``None`` would otherwise raise ``KeyError``. A copy is returned in
        both cases, so later steps cannot modify the caller's frame. Extra
        keyword arguments are ignored.

        Raises
        ------
        KeyError
            If a requested column is not present in ``X``.
        """
        if self.columns is None:
            return X.copy()
        return X[self.columns].copy()

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; return this instance so calls can be chained."""
        return self

In [14]:
# Example frame with one missing age.
df = pd.DataFrame(
    data={
        "name": ["alice", "bob", "charlie", "david", "edward"],
        "age": [24, 32, np.nan, 38, 20],
    }
)

# A pipeline whose only step keeps the "name" column.
pipe = Pipeline(steps=[
    ("selector", SelectColumnsTransformer(columns=["name"])),
])

pipe.fit_transform(df)

Unnamed: 0,name
0,alice
1,bob
2,charlie
3,david
4,edward


## Column transformer: missing imputation  
to fill missing values

In [15]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [16]:
# Example frame; the third row has no age recorded.
df = pd.DataFrame(
    data={
        "name": ["alice", "bob", "charlie", "david", "edward"],
        "age": [24, 32, np.nan, 38, 20],
    }
)

df.head()

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,
3,david,38.0
4,edward,20.0


In [17]:
# Impute missing ages with the column mean; all remaining columns are passed
# through untouched.
transformer_step = ColumnTransformer(
    transformers=[
        ("impute_mean", SimpleImputer(strategy="mean"), ["age"]),
    ],
    remainder="passthrough",
)

In [18]:
# Wrap the column transformer in a one-step pipeline.
pipe = Pipeline(steps=[("transformer", transformer_step)])

In [19]:
pipe.fit(df)

# The transformed array is labelled 'age' then 'name' (imputed column first,
# passed-through column second); the final selection restores the original
# column order.
pd.DataFrame(pipe.transform(df), columns=["age", "name"]).loc[:, ["name", "age"]]

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,28.5
3,david,38.0
4,edward,20.0


## ColumnTransformer with OneHotEncoder   

Create a single Pipeline that takes a DataFrame as input, does preprocessing (for all columns) using a ColumnTransformer and trains a DecisionTreeClassifier on top of it.

In [20]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [19]:
# Example frame: one categorical feature, one numeric feature with a missing
# value, and a binary target.
df = pd.DataFrame(
    data={
        "favorite_color": ["blue", "green", "red", "green", "blue"],
        "age": [10, 15, 10, np.nan, 10],
        "target": [1, 0, 1, 0, 1],
    }
)
df

Unnamed: 0,favorite_color,age,target
0,blue,10.0,1
1,green,15.0,0
2,red,10.0,1
3,green,,0
4,blue,10.0,1


In [22]:
# define individual transformers in a pipeline
# One small preprocessing pipeline per kind of column.
categorical_preprocessing = Pipeline(steps=[("ohe", OneHotEncoder())])
numerical_preprocessing = Pipeline(steps=[("imputation", SimpleImputer())])

# Route each column to the preprocessing pipeline that should handle it.
preprocess = ColumnTransformer(
    transformers=[
        ("categorical_preprocessing", categorical_preprocessing, ["favorite_color"]),
        ("numerical_preprocessing", numerical_preprocessing, ["age"]),
    ]
)

# Full model: preprocessing followed by the classifier.
pipeline = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", DecisionTreeClassifier()),
])

# Split the frame into the feature columns and the label column, then fit.
df_target = df["target"]
df_features = df[["favorite_color", "age"]]

pipeline.fit(df_features, df_target)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('categorical_preprocessing',
                                                  Pipeline(memory=None,
                                                           steps=[('ohe',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='error',
                                                                                 sparse=True))],
                                