In [1]:
# ============================================================ 
# # ONTARIO REAL ESTATE AI DEMO PIPELINE 
# # datasets → cleans → merges → trains → saves model 
# # ============================================================

# import dependencies

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib
import os

In [2]:
N_ROWS = 10000
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

os.makedirs("data", exist_ok= True)
os.makedirs("models", exist_ok=True)

In [3]:
df_prop = pd.DataFrame([generate_property_row() for _ in range(N_ROWS)]) 
df_prop.to_csv("data/property_data.csv", index=False)

prop = df_prop.copy()
tracker = df_tracker.copy()
cmhc = df_cmhc.copy()

prop["year"] = pd.to_datetime(prop["sale_date"]).dt.year
prop["city_norm"] = prop["city"].str.lower()
tracker["city_norm"] = tracker["municipality"].str.lower()
cmhc["city_norm"] = cmhc["region"].str.lower()

merged = (
    prop.merge(tracker, on=["city_norm", "year"], how="left")
        .merge(cmhc, on=["city_norm", "year"], how="left")
)

NameError: name 'generate_property_row' is not defined

In [None]:
TARGET = "price" 

numeric_features = ["beds","baths","sqft","lot_size","year_built", 
                    "housing_starts","new_units","long_term_care_beds", 
                    "aru_units","avg_price_region","price_index"] 

categorical_features = ["property_type","city","postal_code"]

X = merged[numeric_features + categorical_features] 
y = merged[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

preprocessor = ColumnTransformer(
    transformers=[ 
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

model = XGBRegressor( n_estimators=500, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, objective="reg:squarederror", random_state=42 ) 
pipeline = Pipeline([ ("preprocess", preprocessor), ("model", model) ]) 
pipeline.fit(X_train, y_train) 
preds = pipeline.predict(X_test) 

print("------------------------------------------------")
print("Any NaN in predictions:", np.isnan(preds).any()) 
print("Any Inf in predictions:", np.isinf(preds).any()) 
print("Prediction range:", preds.min(), preds.max()) 
print("Actual price range:", y_test.min(), y_test.max())
print("------------------------------------------------")
print("MAE:", mean_absolute_error(y_test, preds)) 
print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))#, squared=False)) 
print("R²:", r2_score(y_test, preds))