In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.inspection import permutation_importance

USE_SYNTHETIC = True         
CSV_PATH = "path/to/your.csv"
TARGET_COL = "target"

RANDOM_SEED = 42
TEST_SIZE   = 0.25

# 1) Data
def make_synthetic_real_estate(n=1500, seed=42):
    rng = np.random.default_rng(seed)
    sqft = rng.normal(1800, 450, size=n).clip(400, 5000)
    bed  = rng.integers(1, 6, size=n)
    loc  = rng.choice(["A","B","C"], size=n, p=[0.45, 0.4, 0.15])

    # true signal (hundreds of thousands)
    loc_eff = {"A":0.0, "B":0.25, "C":0.6}
    noise = rng.normal(0, 0.18, size=n)
    y = 0.0012*sqft + 0.17*bed + np.vectorize(loc_eff.get)(loc) + noise

    X = pd.DataFrame({"sqft": sqft, "bed": bed, "loc": loc})
    y = pd.Series(y, name="price")
    return X, y

if USE_SYNTHETIC:
    X, y = make_synthetic_real_estate()
    print("Using synthetic data:", X.shape, "Target = price")
else:
    df = pd.read_csv(CSV_PATH)
    if TARGET_COL not in df.columns:
        raise ValueError(f"'{TARGET_COL}' not found in CSV columns.")
    y = df[TARGET_COL]
    X = df.drop(columns=[TARGET_COL])
    print("Using CAPSTONE data:", X.shape, "Target =", TARGET_COL)

# 2) Split & Preprocess
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

cat_cols = [c for c in X.columns if X[c].dtype == 'object']
num_cols = [c for c in X.columns if X[c].dtype != 'object']

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols),
    ]
)

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# 3) Baselines: Mean & Random Forest
y_pred_mean = np.full_like(np.asarray(y_test, dtype=float), fill_value=float(np.mean(y_train)))
mae_mean  = mean_absolute_error(y_test, y_pred_mean)
rmse_mean = mean_squared_error(y_test, y_pred_mean, squared=False)
print(f"Baseline (mean)  MAE={mae_mean:.4f}  RMSE={rmse_mean:.4f}")

rf = Pipeline([
    ("prep", preprocess),
    ("rf", RandomForestRegressor(
        n_estimators=400,
        max_depth=None,
        min_samples_leaf=1,
        random_state=RANDOM_SEED,
        n_jobs=-1
    ))
])
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_mae  = mean_absolute_error(y_test, rf_pred)
rf_rmse = mean_squared_error(y_test, rf_pred, squared=False)
print(f"RandomForest     MAE={rf_mae:.4f}  RMSE={rf_rmse:.4f}")

# 4) Gradient Boosting (base model)
# (sequential trees fit residuals; LR shrinks each step)
gbr_base = Pipeline([
    ("prep", preprocess),
    ("gbr", GradientBoostingRegressor(
        learning_rate=0.05,   # smaller -> more trees often needed
        n_estimators=600,     # number of weak learners
        max_depth=2,          # tree depth (complexity)
        min_samples_leaf=5,   # regularization (minimum samples per leaf)
        subsample=0.9,        # stochastic boosting (regularization)
        random_state=RANDOM_SEED
    ))
])
gbr_base.fit(X_train, y_train)
gbr_pred = gbr_base.predict(X_test)
gbr_mae  = mean_absolute_error(y_test, gbr_pred)
gbr_rmse = mean_squared_error(y_test, gbr_pred, squared=False)
print(f"GradientBoost    MAE={gbr_mae:.4f}  RMSE={gbr_rmse:.4f}")

# 5) Sweep: Learning rate × Estimators
# (Shows the LR–n_estimators tradeoff)
def sweep_lr_estimators(lrs=(0.2, 0.1, 0.05, 0.02), n_ests=(100, 300, 600, 1000)):
    rows = []
    for lr in lrs:
        for n_est in n_ests:
            model = Pipeline([
                ("prep", preprocess),
                ("gbr", GradientBoostingRegressor(
                    learning_rate=lr,
                    n_estimators=n_est,
                    max_depth=2,
                    min_samples_leaf=5,
                    subsample=0.9,
                    random_state=RANDOM_SEED
                ))
            ])
            model.fit(X_train, y_train)
            pred = model.predict(X_test)
            mae = mean_absolute_error(y_test, pred)
            rows.append({"learning_rate": lr, "n_estimators": n_est, "MAE": mae})
    return pd.DataFrame(rows).sort_values(["learning_rate", "n_estimators"])

sweep_lr_ne = sweep_lr_estimators()
print("\nLR × Estimators sweep (lower MAE is better):")
print(sweep_lr_ne.to_string(index=False))

# Plot: for each LR, MAE vs n_estimators
for lr in sorted(sweep_lr_ne["learning_rate"].unique()):
    sub = sweep_lr_ne[sweep_lr_ne["learning_rate"] == lr].sort_values("n_estimators")
    plt.figure(figsize=(6,3.5))
    plt.plot(sub["n_estimators"], sub["MAE"], marker="o")
    plt.title(f"MAE vs n_estimators (learning_rate={lr})")
    plt.xlabel("n_estimators")
    plt.ylabel("MAE")
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.show()

# 6) Sweep: Tree depth + Regularization
# (Depth = weak learner complexity; min_samples_leaf/subsample = regularization)
def sweep_depth_reg(depths=(1,2,3), leaves=(1,5,10), subs=(1.0, 0.9, 0.7), lr=0.05, n_est=600):
    rows = []
    for d in depths:
        for leaf in leaves:
            for sub in subs:
                model = Pipeline([
                    ("prep", preprocess),
                    ("gbr", GradientBoostingRegressor(
                        learning_rate=lr,
                        n_estimators=n_est,
                        max_depth=d,
                        min_samples_leaf=leaf,
                        subsample=sub,
                        random_state=RANDOM_SEED
                    ))
                ])
                model.fit(X_train, y_train)
                pred = model.predict(X_test)
                mae = mean_absolute_error(y_test, pred)
                rows.append({
                    "max_depth": d,
                    "min_samples_leaf": leaf,
                    "subsample": sub,
                    "MAE": mae
                })
    return pd.DataFrame(rows).sort_values(["max_depth","min_samples_leaf","subsample"])

sweep_reg = sweep_depth_reg()
print("\nDepth + Regularization sweep (lower MAE is better):")
print(sweep_reg.to_string(index=False))

# Plots: one per depth; MAE vs min_samples_leaf, grouped by subsample
for d in sorted(sweep_reg["max_depth"].unique()):
    subd = sweep_reg[sweep_reg["max_depth"] == d]
    pivot = subd.pivot_table(index="min_samples_leaf", columns="subsample", values="MAE", aggfunc="mean")
    plt.figure(figsize=(6,3.5))
    for sub in pivot.columns:
        plt.plot(pivot.index, pivot[sub], marker="o", label=f"subsample={sub}")
    plt.title(f"MAE vs min_samples_leaf (max_depth={d})")
    plt.xlabel("min_samples_leaf")
    plt.ylabel("MAE")
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.legend()
    plt.show()

# 7) Final model + Feature Importance (Permutation)
final_cfg = {
    "learning_rate": 0.05,
    "n_estimators": 800,
    "max_depth": 2,
    "min_samples_leaf": 5,
    "subsample": 0.9
}
final_gbr = Pipeline([
    ("prep", preprocess),
    ("gbr", GradientBoostingRegressor(random_state=RANDOM_SEED, **final_cfg))
])
final_gbr.fit(X_train, y_train)
pred_final = final_gbr.predict(X_test)
mae_final  = mean_absolute_error(y_test, pred_final)
rmse_final = mean_squared_error(y_test, pred_final, squared=False)
print(f"\nFINAL GBR (cfg={final_cfg})  MAE={mae_final:.4f}  RMSE={rmse_final:.4f}")

# Permutation importance works on the trained regressor and transformed X
prep = final_gbr.named_steps["prep"]
reg  = final_gbr.named_steps["gbr"]

X_test_trans = prep.transform(X_test)

# Build feature names (numeric + one-hot categories)
feat_names = []
feat_names += num_cols
if len(cat_cols) > 0:
    ohe = prep.named_transformers_["cat"].named_steps["ohe"]
    for col, cats in zip(cat_cols, ohe.categories_):
        feat_names += [f"{col}={c}" for c in cats]

perm = permutation_importance(reg, X_test_trans, y_test, n_repeats=5, random_state=RANDOM_SEED, n_jobs=-1)
importances = pd.DataFrame({
    "feature": feat_names,
    "importance_mean": perm.importances_mean,
    "importance_std": perm.importances_std
}).sort_values("importance_mean", ascending=False)

print("\nTop features (permutation importance):")
print(importances.head(12).to_string(index=False))

plt.figure(figsize=(7,4))
topk = importances.head(10).iloc[::-1]
plt.barh(topk["feature"], topk["importance_mean"])
plt.title("Top 10 features (Permutation Importance)")
plt.xlabel("Decrease in score")
plt.tight_layout()
plt.show()


Matplotlib is building the font cache; this may take a moment.


Using synthetic data: (1500, 3) Target = price
Numeric columns: ['sqft', 'bed']
Categorical columns: ['loc']


<class 'TypeError'>: got an unexpected keyword argument 'squared'