## Test sklearn pipeline  
- 사이킷런 파이프라인 사용법 정리
- https://towardsdatascience.com/sklearn-pipelines-for-the-modern-ml-engineer-9-techniques-you-cant-ignore-637788f05df5

<div style="text-align: right"> <b>Author : Kwang Myung Yu</b></div>
<div style="text-align: right"> Initial upload: 2023.7. 7</div>
<div style="text-align: right"> Last update: 2023. 7. 7</div>

In [1]:
import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from scipy import stats
import warnings; warnings.filterwarnings('ignore')
#plt.style.use('ggplot')
plt.style.use('seaborn-whitegrid')
%matplotlib inline

### 기본 사용법

In [2]:
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error

dataset = fetch_california_housing(as_frame=True)
data = dataset["data"]

X, y = data.drop("MedInc", axis=1), data["MedInc"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

In [3]:
pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("model", RandomForestRegressor()),
    ]
)

pipeline

In [4]:
pipeline.fit(X_train, y_train)

y_preds = pipeline.predict(X_test)
score = mean_absolute_error(y_test, y_preds)
score

0.586892160342322

In [5]:
from sklearn.pipeline import Pipeline, make_pipeline


pipe1 = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("model", RandomForestRegressor()),
    ]
)
pipe1.steps

[('imputer', SimpleImputer()),
 ('scaler', StandardScaler()),
 ('model', RandomForestRegressor())]

In [6]:
pipe2 = make_pipeline(
        SimpleImputer(strategy="mean"),
        StandardScaler(),
        RandomForestRegressor(),
)
pipe2.steps

[('simpleimputer', SimpleImputer()),
 ('standardscaler', StandardScaler()),
 ('randomforestregressor', RandomForestRegressor())]

In [7]:
from sklearn.preprocessing import OrdinalEncoder, QuantileTransformer
from sklearn.compose import ColumnTransformer

# column names 
nums = ["num_1", "num_2", "num_3"]
cats = ["cat_1", "cat_2", "cat_3"]  

numeric_pipe = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="mean")),
        ("std_scaler", StandardScaler()),
    ]
)

categorical_pipe = Pipeline(
    steps = [
        ("cat_imputer", SimpleImputer(strategy="most_frequenct")),
        ("cat_encoder", OrdinalEncoder()),
    ]
)

transformers = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipe, nums),
        ("categorical", categorical_pipe, cats)
    ]
)

full_pipe = Pipeline(
    steps = [
        ("transformers", transformers),
        ("model", RandomForestRegressor())
        ]
)

full_pipe

In [8]:
import numpy as np
from sklearn.compose import make_column_selector

nums = make_column_selector(dtype_include=np.number)
cats = make_column_selector(dtype_exclude=np.number)


numeric_pipe = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="mean")),
        ("std_scaler", StandardScaler()),
    ]
)

categorical_pipe = Pipeline(
    steps = [
        ("cat_imputer", SimpleImputer(strategy="most_frequenct")),
        ("cat_encoder", OrdinalEncoder()),
    ]
)

transformers = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipe, nums),
        ("categorical", categorical_pipe, cats)
    ]
)

full_pipe = Pipeline(
    steps = [
        ("transformers", transformers),
        ("model", RandomForestRegressor())
        ]
)

full_pipe

In [9]:
# column names 
nums = ["num_1", "num_2", "num_3"]
cats = ["cat_1", "cat_2", "cat_3"] 
cols = nums + cats

pattern = "^num"
filtered_columns = make_column_selector(pattern)

In [10]:
from sklearn.preprocessing import FunctionTransformer

def num_missing_row(X: pd.DataFrame, y = None):
    
    num_missing = X.isnull().sum(axis = 1)
    
    X["num_missing"] = num_missing
    
    return X

In [11]:
pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("custom_tr", FunctionTransformer(func=num_missing_row)),
        ("model", RandomForestRegressor()),
    ]
)

pipeline

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

In [13]:
pipe1 = make_pipeline(
    SimpleImputer(),
    StandardScaler()
)

pipe2 = make_pipeline(
    SimpleImputer(),
    StandardScaler(),
    RandomForestRegressor()
)

In [14]:
isinstance(pipe1, Tr)

NameError: name 'Tr' is not defined

In [None]:
pipe1.fit_transform(X_train)

array([[-0.67982045, -0.42260285, -0.04788574, ..., -0.08931351,
        -1.33951218,  1.2453071 ],
       [-0.36270041,  0.07313842, -0.24261163, ..., -0.04479956,
        -0.49669489, -0.27749403],
       [-1.1555005 ,  0.17585475, -0.00857821, ..., -0.07522769,
         1.68994775, -0.70687401],
       ...,
       [ 0.58865969, -0.59153751, -0.04099521, ...,  0.01719968,
        -0.75890471,  0.60123711],
       [-1.07622049,  0.39014836, -0.06718056, ...,  0.00482034,
         0.90331827, -1.18618191],
       [ 1.85713982, -0.82961574, -0.08774465, ..., -0.08166962,
         0.99228232, -1.41585028]])

In [None]:
from sklearn.compose import TransformedTargetRegressor

log_tr = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

rf_log = TransformedTargetRegressor(
    regressor=RandomForestRegressor(), 
    transformer=log_tr
)

pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("model", rf_log),
    ]
)

pipeline

In [None]:
log_tr = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("model", RandomForestRegressor()),
    ]
)

full_pipe = TransformedTargetRegressor(
    regressor=pipe, transformer=log_tr
)

full_pipe

In [None]:
from sklearn.model_selection import cross_validate

pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("model", RandomForestRegressor()),
    ]
)

results = cross_validate(
    estimator=pipeline,
    X=X,
    y=y,
    cv=5,
    n_jobs=-1,
    scoring=["r2", "neg_mean_absolute_error"],
)
results

{'fit_time': array([6.27660012, 6.11180091, 6.36151314, 6.12932897, 6.00369835]),
 'score_time': array([0.06216192, 0.05139208, 0.06381297, 0.05522013, 0.05743074]),
 'test_r2': array([0.72823224, 0.7560302 , 0.71545186, 0.64936128, 0.7204181 ]),
 'test_neg_mean_absolute_error': array([-0.67588259, -0.66452244, -0.75323841, -0.67489057, -0.73166572])}

In [None]:
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer()),
        ("scaler", StandardScaler()),
        ("model", RandomForestRegressor()),
    ]
)

param_grid = {
    "imputer__strategy" : ["mean", "median"],
    "scaler__with_mean": [True, False],
    "model__n_estimators": [100, 1000]
}

search = GridSearchCV(
    pipeline, 
    param_grid, 
    cv=5, 
)

# search.fit(X, y)

In [None]:
search.best_estimator_

In [None]:
search.best_params_

{'imputer__strategy': 'mean',
 'model__n_estimators': 1000,
 'scaler__with_mean': False}

In [None]:
search.best_score_

0.7153231282076915