# Scikit-Learn Pipelines — End-to-end ML lifecycle (one command per code cell)
# This notebook-style script demonstrates how to use scikit-learn Pipelines across the data lifecycle:
# data ingestion → validation → preprocessing/feature engineering → model training → tuning → evaluation → deployment → monitoring.
# Each code cell contains one primary command with an inline comment explaining its purpose. Cells that perform heavy I/O or network access are commented out.


# Installation (commented) — run in your environment if needed
# - Only install extras you actually need (faiss/mlflow/onnx/etc.)

In [None]:
# !pip install scikit-learn pandas numpy joblib  # core tools (commented)


# Imports and global config

In [None]:
import numpy as np  # numerical utilities used throughout


In [None]:
import pandas as pd  # tabular data handling used for ColumnTransformer with selectors


In [None]:
from sklearn import set_config  # control estimator display and diagram rendering


In [None]:
set_config(display='diagram')  # global scikit-learn setting: show Pipeline/ColumnTransformer objects as diagrams in rich notebook UIs


# Sample dataset — use a small built-in dataset for safe demos (no network)

In [None]:
from sklearn.datasets import load_iris  # classic multiclass classification dataset


In [None]:
iris = load_iris(as_frame=True)  # load the bundled Iris data; as_frame=True makes the returned Bunch expose a pandas DataFrame via .frame


In [None]:
df = iris.frame  # DataFrame holding the feature columns together with the 'target' column (split apart in the next cells)


In [None]:
# Feature matrix: every column except the label, kept as a DataFrame so dtype-based column selectors keep working.
X = df.drop(columns="target")


In [None]:
y = df['target']  # label column as a pandas Series (the classification target)


# Data ingestion patterns (commented to avoid I/O/network)

In [None]:
# pd.read_csv('data/train.csv')  # read CSV from local path


In [None]:
# pd.read_parquet('data/train.parquet')  # read Parquet (requires pyarrow/fastparquet)


In [None]:
# import sqlalchemy; engine = sqlalchemy.create_engine('sqlite:///db.sqlite'); pd.read_sql('SELECT * FROM table', engine)  # SQL ingest (commented)


In [None]:
# from sklearn.datasets import fetch_openml; fetch_openml(name='titanic', version=1, as_frame=True)  # OpenML fetcher (may download)


# Basic data validation and quick checks

In [None]:
df.info()  # print a concise summary of the frame: column dtypes and non-null counts


In [None]:
df.describe(include='all')  # summary statistics for every column, not only the numeric ones


In [None]:
df.isna().sum()  # number of missing values per column (sanity check before choosing an imputation strategy)


# Train/validation/test splitting and CV splitters

In [None]:
from sklearn.model_selection import train_test_split  # basic splitting utility


In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit  # common CV splitters


In [None]:
# Shuffled 5-fold splitter that preserves label proportions in every fold (seeded for reproducibility).
StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)


In [None]:
TimeSeriesSplit(n_splits=4)  # 4-split splitter intended for ordered/time-indexed data (shown for reference; not used below)


# Preprocessing with ColumnTransformer — numeric + categorical branches

In [None]:
from sklearn.compose import ColumnTransformer, make_column_selector  # column-wise transformer utilities


In [None]:
from sklearn.impute import SimpleImputer  # impute missing values per column type


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # scaling for numeric; OHE for categoricals


In [None]:
num_sel = make_column_selector(dtype_include=np.number)  # callable that picks the numeric columns by dtype when the ColumnTransformer is fitted


In [None]:
cat_sel = make_column_selector(dtype_exclude=np.number)  # complement of num_sel: every column whose dtype is not numeric (e.g. object/categorical)


In [None]:
from sklearn.pipeline import Pipeline  # imported here because this cell runs before the pipeline import cell further down

# Stand-alone numeric branch: median-impute, then standardise.
# Built as a real Pipeline (not a pandas Series of estimators) so it can be fitted or plugged into a ColumnTransformer.
numeric_pipeline = Pipeline([('impute', SimpleImputer(strategy='median')), ('scale', StandardScaler())])


In [None]:
from sklearn.pipeline import Pipeline  # imported here because this cell runs before the pipeline import cell further down

# Stand-alone categorical branch: fill gaps with the most frequent value, then one-hot encode
# (categories unseen during fit are ignored at transform time).
# Built as a real Pipeline (not a pandas Series of estimators) so it can be fitted or plugged into a ColumnTransformer.
categorical_pipeline = Pipeline([('impute', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))])


In [None]:
from sklearn.pipeline import Pipeline  # explicit import instead of an inline __import__ lookup

# ColumnTransformer with one nested Pipeline per column type:
#   'num' -> median imputation + standard scaling on the columns picked by num_sel
#   'cat' -> most-frequent imputation + one-hot encoding on the columns picked by cat_sel
# The nested pipelines are passed directly; no assignment expressions are used, so the name
# `Pipeline` keeps referring to the class and no stray module-level names are created.
preprocess = ColumnTransformer(
    transformers=[
        ('num', Pipeline([('impute', SimpleImputer(strategy='median')), ('scale', StandardScaler())]), num_sel),
        ('cat', Pipeline([('impute', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]), cat_sel),
    ]
)


# Feature engineering primitives (examples)

In [None]:
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer, FunctionTransformer  # engineered features


In [None]:
PolynomialFeatures(degree=2, include_bias=False)  # degree-2 polynomial/interaction terms without the constant bias column (example only; not used below)


In [None]:
# Example only: bucketize continuous features into 5 quantile-based bins, one-hot encoding the bin index.
KBinsDiscretizer(
    n_bins=5,
    strategy='quantile',
    encode='onehot',
)


In [None]:
FunctionTransformer(np.log1p, feature_names_out='one-to-one')  # wrap a NumPy function as a transformer; 'one-to-one' keeps the input feature names as output names


# Target transformation wrapper (for regression pipelines)

In [None]:
from sklearn.compose import TransformedTargetRegressor  # apply transform to y during fit/predict


# End-to-end Pipeline: preprocessing + model

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline  # pipeline composition helpers


In [None]:
from sklearn.linear_model import LogisticRegression  # classifier for iris demo


In [None]:
# Full classification pipeline: column-wise preprocessing followed by a logistic-regression model.
clf_pipe = Pipeline(
    steps=[
        ('prep', preprocess),
        ('model', LogisticRegression(random_state=42, max_iter=1000)),
    ]
)


In [None]:
# Fit preprocessing and model together, using the training split only.
clf_pipe.fit(X=X_train, y=y_train)


In [None]:
# Predicted labels for the held-out test rows (the rows pass through preprocessing and the model).
y_pred = clf_pipe.predict(X=X_test)


In [None]:
# Score of the fitted pipeline on the held-out test split (accuracy for this classifier).
clf_pipe.score(X=X_test, y=y_test)


# Cross-validation with pipelines

In [None]:
from sklearn.model_selection import cross_validate  # evaluate with CV across folds


In [None]:
# 5-fold cross-validation of the whole pipeline, reporting accuracy and macro-F1 on the validation folds only.
cross_validate(
    clf_pipe,
    X,
    y,
    scoring=['accuracy', 'f1_macro'],
    cv=5,
    return_train_score=False,
)


# Hyperparameter search — grid/random search over pipeline parameters

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  # model selection tools


In [None]:
# Search space for the 'model' step; pipeline parameters are addressed as <step name>__<parameter>.
param_grid = dict(
    model__C=[0.1, 1.0, 10.0],
    model__penalty=['l2'],
    model__solver=['lbfgs'],
)


In [None]:
# Exhaustive 5-fold grid search over param_grid, wrapping the full pipeline (not fitted in this cell).
grid = GridSearchCV(
    estimator=clf_pipe,
    param_grid=param_grid,
    n_jobs=None,
    cv=5,
)


In [None]:
# grid.fit(X, y)  # run search (commented to keep notebook fast); uncomment to execute and inspect grid.best_params_


In [None]:
# Randomized-search stub: 8 draws of C from 20 log-spaced candidates between 1e-3 and 1e1, 5-fold CV (not fitted here).
rand = RandomizedSearchCV(
    estimator=clf_pipe,
    param_distributions={'model__C': np.logspace(-3, 1, 20)},
    n_iter=8,
    random_state=42,
    cv=5,
)


# Access transformed feature names (after fitting the preprocess step)

In [None]:
# Output feature names produced by the fitted 'prep' step (requires clf_pipe to have been fitted).
clf_pipe['prep'].get_feature_names_out()


# Calibration and thresholding (classification)

In [None]:
from sklearn.calibration import CalibratedClassifierCV  # wrap classifier to calibrate predicted probabilities


In [None]:
from sklearn.base import clone  # used to give this pipeline its own, unfitted copy of the preprocessor

# Pipeline whose final step calibrates the classifier's probabilities (isotonic regression, 3-fold internal CV).
# `preprocess` is cloned because Pipeline fits its steps in place: passing the very same object that
# clf_pipe already holds would make fitting this pipeline silently refit clf_pipe's preprocessing step.
calibrated_pipe = Pipeline(
    steps=[
        ('prep', clone(preprocess)),
        ('cal', CalibratedClassifierCV(LogisticRegression(max_iter=1000, random_state=42), cv=3, method='isotonic')),
    ]
)


In [None]:
# Fit the calibrated pipeline on the training split.
calibrated_pipe.fit(X=X_train, y=y_train)


In [None]:
# Calibrated probability of one class for each test row:
# column 1 when the problem is binary, otherwise column 0 (one-vs-rest view of the first class).
proba = calibrated_pipe.predict_proba(X_test)[
    :, 1 if len(np.unique(y)) == 2 else 0
]


In [None]:
from sklearn.metrics import precision_recall_curve  # curve for threshold selection


In [None]:
# Precision/recall/threshold arrays for threshold tuning (demo).
# With more than two classes the labels are binarized as "first class vs. rest", matching the column chosen for `proba`.
precision_recall_curve(
    y_test if len(np.unique(y)) <= 2 else (y_test == np.unique(y)[0]).astype(int),
    proba,
)


# Evaluation metrics with pipelines

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix  # common metrics


In [None]:
# Accuracy of the pipeline's test-set predictions.
accuracy_score(y_true=y_test, y_pred=y_pred)


In [None]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
f1_score(y_true=y_test, y_pred=y_pred, average='macro')


In [None]:
# Text report with precision, recall and F1 for each class.
classification_report(y_true=y_test, y_pred=y_pred)


In [None]:
# Confusion matrix of true vs. predicted labels, for error analysis.
confusion_matrix(y_true=y_test, y_pred=y_pred)


# Regression example: Pipeline + TransformedTargetRegressor (log-transform target)

In [None]:
from sklearn.datasets import make_regression  # synthetic regression data


In [None]:
# Small synthetic regression dataset (200 samples, 6 features).
# With the default bias of 0.0 the generated target is spread around zero and therefore contains
# values below -1, for which np.log1p is NaN. The regression pipeline in this section log-transforms
# the target with log1p, so shift the target to start at 0 to keep that transform defined everywhere.
Xr, yr = make_regression(n_samples=200, n_features=6, noise=10.0, random_state=42)
yr = yr - yr.min()


In [None]:
from sklearn.linear_model import Ridge  # regularized linear regressor


In [None]:
# Numeric-only preprocessing for the regression demo: impute (SimpleImputer defaults) then standardise.
# Columns are addressed by integer position, covering every column of Xr.
# Uses the Pipeline class already imported above instead of a dynamic __import__ lookup.
reg_prep = ColumnTransformer(
    [
        (
            'num',
            Pipeline([('impute', SimpleImputer()), ('scale', StandardScaler())]),
            list(range(Xr.shape[1])),
        ),
    ]
)


In [None]:
# Regression pipeline: preprocessing, then Ridge trained on log1p(y); predictions are mapped back with expm1.
# NOTE: log1p is only defined for targets greater than -1.
reg_pipe = Pipeline(
    [
        ('prep', reg_prep),
        (
            'model',
            TransformedTargetRegressor(
                regressor=Ridge(alpha=1.0, random_state=42),
                func=np.log1p,
                inverse_func=np.expm1,
            ),
        ),
    ]
)


In [None]:
# Fit preprocessing, target transform and regressor in one call.
reg_pipe.fit(X=Xr, y=yr)


In [None]:
# R^2 on the same synthetic data the pipeline was fitted on (demo only, not a held-out estimate).
reg_pipe.score(X=Xr, y=yr)


# Deployment: persist and load trained pipelines (commented to avoid I/O)

In [None]:
# import joblib; joblib.dump(clf_pipe, 'iris_pipeline.joblib')  # save fitted pipeline to disk


In [None]:
# import joblib; loaded = joblib.load('iris_pipeline.joblib'); loaded.predict(X_test)  # load and run inference (commented)


# Inference pattern: predict on a new sample row

In [None]:
sample_row = X_test.iloc[[0]]  # list indexer ([[0]]) returns a one-row DataFrame rather than a Series, so column names/dtypes survive for the preprocess selectors


In [None]:
# Predict the label for the single-row batch using the full fitted pipeline.
clf_pipe.predict(X=sample_row)


# Monitoring: compute rolling/periodic metrics and calibration; simple drift proxy

In [None]:
from sklearn.metrics import brier_score_loss  # calibration-sensitive loss for probabilities


In [None]:
from sklearn.model_selection import cross_val_predict  # out-of-fold predictions for unbiased monitoring estimates


In [None]:
# Out-of-fold class probabilities: each row is predicted by a pipeline fitted on the other folds (5-fold CV).
oof_proba = cross_val_predict(
    clf_pipe,
    X,
    y,
    method='predict_proba',
    cv=5,
)


In [None]:
# Baseline Brier score on the out-of-fold probabilities, usable as a monitoring reference.
# With more than two classes the labels are binarized as "first class vs. rest" and column 0 is scored;
# in the binary case the labels are used as-is with column 1.
brier_score_loss(
    y if len(np.unique(y)) <= 2 else (y == np.unique(y)[0]).astype(int),
    oof_proba[:, 1 if len(np.unique(y)) == 2 else 0],
)


In [None]:
pd.DataFrame(X_train).mean(numeric_only=True)  # per-feature means of the training split: the reference values for the toy drift check


In [None]:
pd.DataFrame(X_test).mean(numeric_only=True)  # per-feature means of the "new" batch; compare by eye against the training means (manual drift check idea)


# Production tips and optional integrations (commented)
# - Class imbalance: use imbalanced-learn `Pipeline` with `SMOTE` inside CV only (to avoid leakage).
# - Model registry/tracking: MLflow `mlflow.sklearn.log_model`, experiments, metrics.
# - Export for other runtimes: `skl2onnx` to ONNX, or `joblib` persistence for Python services.
# - Batch/online inference: keep the same preprocessing inside the pipeline to avoid train/serve skew.

In [None]:
# # imbalanced-learn example (commented):
# # !pip install imbalanced-learn
# # from imblearn.pipeline import Pipeline as ImbPipeline
# # from imblearn.over_sampling import SMOTE
# # imb_pipe = ImbPipeline([('prep', preprocess), ('smote', SMOTE(random_state=42)), ('model', LogisticRegression(max_iter=1000))])
