In [2]:
import sys
from pathlib import Path

# Absolute path of the "src" directory that sits next to the current
# working directory's parent (i.e. <cwd>/../src).
src_path = Path.cwd().parent.joinpath("src").resolve()

# Compare against the *resolved* form of every non-empty sys.path entry so
# that a differently spelled but equivalent path is not added a second time.
_src_entry = str(src_path)
_resolved_entries = {str(Path(entry).resolve()) for entry in filter(None, sys.path)}

if _src_entry not in _resolved_entries:
    # Put it first so modules under src/ take precedence on import.
    sys.path.insert(0, _src_entry)

In [3]:
from sklearn.datasets import fetch_california_housing
from automl_engine.ml.automl.supervised import RegressionAnalyzer

# Load the California housing dataset as a pandas DataFrame.
housing_bunch = fetch_california_housing(as_frame=True)
df = housing_bunch.frame

# Analyzer configuration: predict "MedHouseVal", fix the seed, and ask the
# analyzer to leave "AveBedrms" out of the feature set.
analyzer_options = {
    "target_column": "MedHouseVal",
    "random_state": 42,
    "drop_unused_columns": True,
    "ignore_columns": ["AveBedrms"],
}
ra = RegressionAnalyzer(df, **analyzer_options)

# Dump the analyzer's resolved state for a quick sanity check:
# first the dtypes of the retained data, then each configuration attribute.
print(ra.data.dtypes)
for attribute_name in (
    "target_column",
    "feature_columns",
    "ignore_columns",
    "column_types",
    "random_state",
):
    print(getattr(ra, attribute_name))

MedHouseVal    float64
AveRooms       float64
HouseAge       float64
AveOccup       float64
MedInc         float64
Longitude      float64
Population     float64
Latitude       float64
dtype: object
MedHouseVal
['MedInc', 'HouseAge', 'AveRooms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
['AveBedrms']
{'MedInc': 'float64', 'HouseAge': 'float64', 'AveRooms': 'float64', 'AveBedrms': 'float64', 'Population': 'float64', 'AveOccup': 'float64', 'Latitude': 'float64', 'Longitude': 'float64', 'MedHouseVal': 'float64'}
42


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler

# Columns routed to each preprocessing branch.
numeric_columns = ["MedInc", "HouseAge", "AveRooms", "Population", "AveOccup"]
geo_columns = ["Latitude", "Longitude"]

# Numeric features: impute missing values with the median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num = Pipeline(steps=numeric_steps)

# Geographic coordinates: split each one into 10 quantile bins and
# one-hot encode the bin membership as dense columns.
geo = KBinsDiscretizer(
    strategy="quantile",
    quantile_method="averaged_inverted_cdf",
    n_bins=10,
    encode="onehot-dense",
)

# Combine both branches; any column not listed above is dropped.
branches = [
    ("num", num, numeric_columns),
    ("geo", geo, geo_columns),
]
preprocess = ColumnTransformer(transformers=branches, remainder="drop")

# Apply the preprocessing and build the cross-validation splits (cv=5).
prepared_data = ra.prepare(cv=5, preprocess=preprocess)

# Show every section returned by prepare().
for section, payload in prepared_data.items():
    print(f"{section}: {payload}")


data: {'X': array([[ 2.34476576,  0.98214266,  0.62855945, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.33223796, -0.60701891,  0.32704136, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.7826994 ,  1.85618152,  1.15562047, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.14259331, -0.92485123, -0.09031802, ...,  0.        ,
         0.        ,  0.        ],
       [-1.05458292, -0.84539315, -0.04021111, ...,  0.        ,
         0.        ,  0.        ],
       [-0.78012947, -1.00430931, -0.07044252, ...,  0.        ,
         0.        ,  0.        ]]), 'y': 0        4.526
1        3.585
2        3.521
3        3.413
4        3.422
         ...  
20635    0.781
20636    0.771
20637    0.923
20638    0.847
20639    0.894
Name: MedHouseVal, Length: 20640, dtype: float64, 'is_preprocessed': True}
cv: {'strategy': 'KFold', 'n_splits': 5, 'splits': [{'fold': 0, 'train_idx': array([    1,     2,     4, ..., 20636, 20637, 20639

In [5]:
# Training configuration for a first baseline run.
# NOTE(review): the recorded output of this cell is
# "TypeError: run_supervised() got an unexpected keyword argument 'engine'".
# No 'engine' argument is passed here, so the failure presumably originates
# inside the library's train() implementation - confirm against automl_engine.
train_options = {
    "algorithms": None,          # start with the defaults
    "metrics": None,             # start with the defaults
    "primary_metric_key": None,  # defer to the default, if one exists
    "search_method": "grid",     # grid search is the recommended starting point
    "optuna_trials": 50,
    "optuna_timeout": None,
}
trained = ra.train(**train_options)

trained.summarize()


TypeError: run_supervised() got an unexpected keyword argument 'engine'