# House price regression: XGBoost pipeline tuning, hold-out evaluation, and model export

In [4]:
import sys, os
sys.path.append(os.path.abspath(".."))  # this makes src/ visible
os.makedirs("../models", exist_ok=True)
import numpy as np, pandas as pd, joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from src.pipeline import add_features, build_preprocessor

# Train, tune, evaluate and persist the house-price regression pipeline.
#
# Steps: load the training CSV -> engineer features -> hold out a validation
# split -> grid-search an XGBoost regressor wrapped in the project
# preprocessor -> report hold-out metrics -> dump the best pipeline to disk.

# --- Configuration: everything a reader might want to tune ----------------
DATA_PATH = "../data/train.csv"
MODEL_PATH = "../models/house_price_pipeline.joblib"
SEED = 42          # shared by the train/valid split and the XGBoost model
TEST_SIZE = 0.2    # fraction of rows held out for validation
CV_FOLDS = 5       # folds used inside the grid search

# --- Load data and build the design matrix / target -----------------------
df = pd.read_csv(DATA_PATH)
df = add_features(df)
preprocessor, feature_cols, y_col = build_preprocessor(df)

X = df[feature_cols]
# build_preprocessor reports the target column name. When it is anything
# other than 'SalePriceLog', fall back to log1p of the raw 'SalePrice' column.
if y_col == 'SalePriceLog':
    y = df[y_col]
else:
    y = np.log1p(df['SalePrice'])

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# --- Model pipeline and hyper-parameter search ----------------------------
pipe = Pipeline([
    ("prep", preprocessor),
    ("model", XGBRegressor(random_state=SEED, tree_method="hist")),
])

# 3 * 3 * 2 * 2 * 2 = 72 candidate settings; the "model__" prefix routes each
# parameter to the "model" step of the pipeline.
param_grid = {
    "model__n_estimators": [400, 600, 800],
    "model__max_depth": [3, 4, 5],
    "model__learning_rate": [0.05, 0.1],
    "model__subsample": [0.8, 1.0],
    "model__colsample_bytree": [0.8, 1.0],
}

# NOTE(review): n_jobs=-1 parallelises the search across all cores while the
# XGBoost estimator keeps its own default threading. The XGBoost docs warn
# that this combination can cause thread contention; consider pinning the
# model to n_jobs=1 during the search -- confirm on the target machine.
cv = GridSearchCV(
    pipe,
    param_grid,
    scoring="neg_root_mean_squared_error",
    cv=CV_FOLDS,
    n_jobs=-1,
    verbose=1,
)
cv.fit(X_train, y_train)
# The scorer is *negative* RMSE (higher is better), so flip the sign to report it.
print("Best params:", cv.best_params_, "CV RMSE:", -cv.best_score_)

# --- Hold-out evaluation --------------------------------------------------
# best_estimator_ is the winning pipeline refit on the training split only;
# the validation rows were never seen during the search.
best_model = cv.best_estimator_
pred = best_model.predict(X_valid)
# RMSE is taken as sqrt(MSE) rather than via mean_squared_error(squared=False),
# because that keyword was deprecated in scikit-learn 1.4.
rmse = mean_squared_error(y_valid, pred) ** 0.5
r2 = r2_score(y_valid, pred)
print("Holdout RMSE:", round(rmse, 4), "R2:", round(r2, 4))

# --- Persist the fitted pipeline ------------------------------------------
# Create the target directory here so the save step does not depend on setup
# code elsewhere in the notebook.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(best_model, MODEL_PATH)
print(f"Saved model to {MODEL_PATH}")

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best params: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 800, 'model__subsample': 0.8} CV RMSE: 0.12428175915941435
Holdout RMSE: 0.1299 R2: 0.9096
Saved model to ../models/house_price_pipeline.joblib
