# 02 — Baseline Model

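This notebook trains a baseline scikit-learn pipeline: simple preprocessing (median imputation for numeric features, most-frequent imputation plus one-hot encoding for categorical features) followed by a `RandomForestRegressor`. Edit `config/params.yaml` to change the features, target, train/test split, or model parameters.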

In [None]:

import yaml, numpy as np, pandas as pd, joblib
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from src.utils.data_loading import load_melbourne_csv
from src.features.build_features import subset_focus_area, basic_clean

# Load the experiment configuration. The context manager closes the file
# handle deterministically instead of leaving it open until garbage
# collection; the explicit encoding avoids depending on the platform default.
with open('../config/params.yaml', encoding='utf-8') as cfg_file:
    cfg = yaml.safe_load(cfg_file)

# Load the raw CSV, restrict it to the configured suburbs, then clean it.
# NOTE(review): the '../' prefix presumably exists because `data_path` is
# written relative to the project root while this notebook runs from a
# subdirectory — confirm against the repo layout.
df = load_melbourne_csv('../' + cfg['data_path'])
df = subset_focus_area(df, cfg['focus_suburbs'])
df = basic_clean(df)

target = cfg['target']
# Keep only the configured features that are actually present in the frame.
# Configured names that are missing from `df` are skipped silently.
num_feats = [c for c in cfg['numeric_features'] if c in df.columns]
cat_feats = [c for c in cfg['categorical_features'] if c in df.columns]

# Feature matrix (numeric columns first, then categorical) and target array.
X = df[num_feats + cat_feats]
y = df[target].values

# Numeric columns: fill missing values with the column median.
num_steps = [('imp', SimpleImputer(strategy='median'))]
num_tf = Pipeline(steps=num_steps)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at predict
# time, and the encoder emits a dense array.
cat_steps = [
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('oh', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
cat_tf = Pipeline(steps=cat_steps)

# Route each feature group through its own transformer.
pre = ColumnTransformer(
    transformers=[
        ('num', num_tf, num_feats),
        ('cat', cat_tf, cat_feats),
    ]
)

# Random forest hyperparameters come straight from the config file.
model = RandomForestRegressor(**cfg['model']['params'])

# Full pipeline: preprocessing followed by the regressor.
pipe = Pipeline(steps=[('pre', pre), ('model', model)])

# Hold out a test split; its size and the random seed are read from the config.
Xtr, Xte, ytr, yte = train_test_split(
    X,
    y,
    test_size=cfg['test_size'],
    random_state=cfg['random_state'],
)

# Fit preprocessing + model on the training split, then predict the hold-out.
pipe.fit(Xtr, ytr)
pred = pipe.predict(Xte)

# Report mean absolute error and root mean squared error on the hold-out set.
mae = mean_absolute_error(yte, pred)
mse = mean_squared_error(yte, pred)
rmse = np.sqrt(mse)
print(f"MAE: ${mae:,.0f}  |  RMSE: ${rmse:,.0f}")

# Residual histogram: distribution of (actual - predicted) on the hold-out set.
err = yte - pred
ax = plt.gca()  # the same current Axes the pyplot wrappers would draw on
ax.hist(err, bins=50, edgecolor='black')
ax.set_title('Residuals')
ax.set_xlabel('Actual - Predicted')
ax.set_ylabel('Count')
plt.show()

# Persist the fitted pipeline (preprocessing + model) to disk.
import os

model_path = '../models/baseline.joblib'
# Create the output directory if it does not exist yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipe, model_path)
