In [None]:
from __future__ import annotations
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Default column names for the baseline feature set.
# These are only starting values: edit both lists to match the columns in your CSV.
BASIC_NUMERIC = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "year_built",
    "lat",
    "long",
]
BASIC_CATEG = [
    "zipcode",
    "condition",
    "grade",
    "waterfront",
    "view",
]


In [None]:
def build_preprocessor(
    numeric_cols: list[str] = BASIC_NUMERIC,
    categ_cols: list[str] = BASIC_CATEG
) -> ColumnTransformer:
    """Assemble the (unfitted) preprocessing transformer for the feature columns.

    Two branches are combined in a single ``ColumnTransformer``:

    * ``"num"`` -- applied to ``numeric_cols``; fills missing values with the
      column median.
    * ``"cat"`` -- applied to ``categ_cols``; fills missing values with the
      most frequent value, then one-hot encodes. The encoder is created with
      ``handle_unknown="ignore"``.

    Parameters
    ----------
    numeric_cols:
        Columns routed to the numeric branch. Defaults to ``BASIC_NUMERIC``.
    categ_cols:
        Columns routed to the categorical branch. Defaults to ``BASIC_CATEG``.

    Returns
    -------
    ColumnTransformer
        A new transformer; nothing is fitted here.
    """
    # Numeric branch: imputation only.
    numeric_steps = [("imputer", SimpleImputer(strategy="median"))]
    numeric_branch = Pipeline(steps=numeric_steps)

    # Categorical branch: impute first so the encoder never sees missing values.
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
    categorical_branch = Pipeline(steps=categorical_steps)

    branches = [
        ("num", numeric_branch, numeric_cols),
        ("cat", categorical_branch, categ_cols),
    ]
    return ColumnTransformer(transformers=branches)

In [None]:
def load_training_frame(csv_path: str) -> pd.DataFrame:
    """Read the training CSV and keep only the rows that have a target value.

    Parameters
    ----------
    csv_path:
        Location of the CSV file; handed directly to ``pandas.read_csv``.

    Returns
    -------
    pd.DataFrame
        The loaded data with every row whose ``price`` is missing removed.
        The surviving rows keep their original index labels.

    Raises
    ------
    ValueError
        If the CSV has no ``price`` column.
    """
    target = "price"
    frame = pd.read_csv(csv_path)

    if target in frame.columns:
        # A row without a target cannot be used for training, so discard it.
        return frame.dropna(subset=[target])

    raise ValueError("Training CSV must include a 'price' column as target.")

Hi testing collab