# 12 — Mini Project: Prediction Pipeline

## Project brief
Build and evaluate a model to predict house prices from synthetic features.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# One seeded generator feeds every random draw in this cell; the draws happen
# in the order written below, so reordering them would change the dataset.
rng = np.random.default_rng(seed=123)
n = 500

# Synthetic feature table with n rows: a clipped, normally distributed floor
# area, integer bedroom count and age, and a neighbourhood label.
df = pd.DataFrame(
    dict(
        sqft=np.clip(rng.normal(loc=1600, scale=500, size=n), 400, 5000),
        bedrooms=rng.integers(low=1, high=6, size=n),  # upper bound exclusive -> 1..5
        age=rng.integers(low=0, high=60, size=n),      # upper bound exclusive -> 0..59
        neighborhood=rng.choice(["A", "B", "C", "D"], size=n),
    )
)

# Price is a linear function of the numeric features ...
base = (
    50000
    + 180 * df["sqft"]
    + 10000 * df["bedrooms"]
    - 800 * df["age"]
)
# ... plus a fixed offset per neighbourhood ...
neigh_boost = df["neighborhood"].map(
    {
        "A": 90000,
        "B": 50000,
        "C": 20000,
        "D": 0,
    }
)
# ... plus zero-mean Gaussian noise.
noise = rng.normal(loc=0, scale=25000, size=n)

df["price"] = base + neigh_boost + noise

df.head()

Unnamed: 0,sqft,bedrooms,age,neighborhood,price
0,1105.439325,4,35,B,352042.766765
1,1416.106674,2,20,A,442385.04749
2,2243.962631,4,42,A,597843.110004
3,1696.98721,1,29,A,442717.440423
4,2060.11545,4,43,A,537408.626974


In [3]:
# Target is the price column; every other column of df is a feature.
y = df["price"]
X = df.drop(columns="price")

num_cols = ["sqft", "bedrooms", "age"]
cat_cols = ["neighborhood"]

# Numeric columns: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent label, then one-hot
# encode. handle_unknown="ignore" keeps predict() from raising on a category
# that was not present when the encoder was fitted.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own sub-pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)

model = RandomForestRegressor(n_estimators=200, random_state=42)

# Preprocessing lives inside the pipeline, so it is fitted on the training
# rows only when pipe.fit() is called.
pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", model),
    ]
)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)

print("MAE:", round(mean_absolute_error(y_test, pred), 2))
print("R2:", round(r2_score(y_test, pred), 3))

MAE: 28001.84
R2: 0.893


## Extension ideas
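- Try GradientBoostingRegressor or XGBoost
- Add cross-validation
- Inspect feature importances

Note: the cell below demonstrates these three steps on a small, separate toy dataset (randomly generated `Hours_Studied` and `Score` columns), not on the house-price data above. The two columns are drawn independently of each other, so there is nothing for the model to learn and the negative R² values it reports are expected.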

In [4]:
# Step 1: Import libraries
# numpy, pandas, train_test_split and the two metrics are already imported by
# the first code cell; cross_val_score and GradientBoostingRegressor are new here.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
# Optional: use XGBoost if installed
# from xgboost import XGBRegressor

# Step 2: Create sample dataset
# The toy data uses its own names (study_df, X_study, y_study, ...) so that it
# does not overwrite the house-price `df`, `X`, `y` and train/test splits from
# the cells above. Reusing those names would make the earlier pipeline cell
# fail on re-run, because `df` would no longer contain a "price" column.
#
# The legacy global seed and the order of the two randint calls are kept
# unchanged so the generated numbers (and the recorded output) stay the same.
np.random.seed(42)
study_df = pd.DataFrame({
    "Hours_Studied": np.random.randint(1, 11, 20),
    "Score": np.random.randint(50, 101, 20)
})

X_study = study_df[["Hours_Studied"]]
y_study = study_df["Score"]

# Step 3: Split data (20 rows -> 16 for training, 4 for testing)
X_study_train, X_study_test, y_study_train, y_study_test = train_test_split(
    X_study, y_study, test_size=0.2, random_state=42
)

# Step 4: Train Gradient Boosting Regressor
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_study_train, y_study_train)

# Step 5: Predictions and evaluation
# NOTE: Hours_Studied and Score are sampled independently above, so there is
# no real relationship to learn. A negative R² (worse than always predicting
# the mean) is therefore the expected outcome here, not a bug in the model.
y_pred = gbr.predict(X_study_test)
mae = mean_absolute_error(y_study_test, y_pred)
r2 = r2_score(y_study_test, y_pred)

print("Gradient Boosting Results:")
print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.2f}")

# Step 6: Cross-validation
# With 20 rows and cv=5 each validation fold holds only 4 samples, so the
# per-fold R² scores vary widely.
cv_scores = cross_val_score(gbr, X_study, y_study, cv=5, scoring='r2')
print(f"5-Fold Cross-Validation R² scores: {cv_scores}")
print(f"Mean CV R²: {cv_scores.mean():.2f}")

# Step 7: Feature importance
# There is a single feature, so it receives all of the importance.
importances = gbr.feature_importances_
for col, importance in zip(X_study.columns, importances):
    print(f"Feature: {col}, Importance: {importance:.2f}")


Gradient Boosting Results:
MAE: 18.87
R²: -0.20
5-Fold Cross-Validation R² scores: [-4.66212376 -0.46721401 -1.14522897 -0.43115519 -1.61172012]
Mean CV R²: -1.66
Feature: Hours_Studied, Importance: 1.00
