In [None]:
import pandas as pd
from numpy import mean, std
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

from beaverfe import BeaverPipeline
from beaverfe.transformations import (
    ColumnSelection,
    MathematicalOperations,
    MissingValuesHandler,
    NonLinearTransformation,
    Normalization,
    NumericalBinning,
    OutliersHandler,
    ScaleTransformation,
)

# 1. Get the dataset

In [None]:
# Fetch the Iris bunch; its feature matrix and label vector are combined below.
data = load_iris()

# One frame holding every feature column plus the class label as "target".
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Separate the label vector from the feature matrix used by the pipeline.
y = df["target"]
x = df.drop(columns="target")

# 2. Define transformations

In [None]:
# Column labels referenced by the steps below, named once so every step
# uses exactly the same strings.
sepal_length = "sepal length (cm)"
sepal_width = "sepal width (cm)"
petal_length = "petal length (cm)"
petal_width = "petal width (cm)"

# Per-column strategy for missing values; "fill_knn" gets its own n_neighbors.
missing_values_step = MissingValuesHandler(
    transformation_options={
        sepal_length: "fill_mode",
        sepal_width: "fill_knn",
        petal_length: "fill_mode",
        petal_width: "most_frequent",
    },
    n_neighbors={sepal_width: 5},
)

# Per-column (action, detection method) pairs, with the extra parameters
# each detection method is configured with.
outliers_step = OutliersHandler(
    transformation_options={
        sepal_length: ("cap", "iqr"),
        sepal_width: ("cap", "zscore"),
        petal_length: ("median", "lof"),
        petal_width: ("median", "iforest"),
    },
    thresholds={sepal_length: 1.5, sepal_width: 2.5},
    lof_params={petal_length: {"n_neighbors": 20}},
    iforest_params={petal_width: {"contamination": 0.1}},
)

non_linear_step = NonLinearTransformation(
    transformation_options={
        sepal_length: "yeo_johnson",
        petal_length: "exponential",
    }
)

normalization_step = Normalization(
    transformation_options={sepal_length: "l1", sepal_width: "l2"}
)

# Per-column (strategy, number of bins) pairs.
binning_step = NumericalBinning(
    transformation_options={
        sepal_length: ("uniform", 5),
        sepal_width: ("quantile", 6),
        petal_length: ("kmeans", 7),
    }
)

# Pairwise column operations given as (left column, right column, operation).
math_step = MathematicalOperations(
    operations_options=[
        (sepal_length, sepal_width, "add"),
        (petal_length, petal_width, "multiply"),
    ]
)

# Every original column is scaled with the same "min_max" option.
scale_step = ScaleTransformation(
    transformation_options={
        column: "min_max"
        for column in (sepal_length, sepal_width, petal_length, petal_width)
    }
)

# Keep the four original columns plus the two "<left>__<op>__<right>" names.
selection_step = ColumnSelection(
    features=[
        sepal_length,
        sepal_width,
        petal_length,
        petal_width,
        f"{sepal_length}__add__{sepal_width}",
        f"{petal_length}__multiply__{petal_width}",
    ]
)

# Assemble the steps in the order they are applied.
transformer = BeaverPipeline(
    [
        missing_values_step,
        outliers_step,
        non_linear_step,
        normalization_step,
        binning_step,
        math_step,
        scale_step,
        selection_step,
    ]
)

# 3. Define the pipeline

In [None]:
# Chain the feature-engineering transformer ("t") with a k-nearest-neighbours
# classifier ("m") left at its default settings.
model = KNeighborsClassifier()
pipe = Pipeline([("t", transformer), ("m", model)])

# 4. Evaluate

In [None]:
# Stratified 5-fold cross-validation repeated 3 times, seeded for reproducible splits.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=1)

# Accuracy of the whole pipeline on each split, computed on all available cores.
scores = cross_val_score(
    estimator=pipe,
    X=x,
    y=y,
    cv=cv,
    scoring="accuracy",
    n_jobs=-1,
)

# Report mean accuracy with its standard deviation in parentheses.
print(f"{mean(scores):.3f} ({std(scores):.3f})")