In [None]:
# ============================================================ 
# # ONTARIO REAL ESTATE AI DEMO PIPELINE 
# # datasets → cleans → merges → trains → saves model 
# # ============================================================

# import dependencies

import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib
import os

1. Configuration

In [25]:
N_ROWS = 10000
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

os.makedirs("data", exist_ok= True)
os.makedirs("models", exist_ok=True)

2. Synthetic Property Data Generator

In [26]:
cities = [ ("Toronto", "M5V", 43.6426, -79.3871, 900000, 1400000), 
    ("Mississauga", "L5B", 43.5890, -79.6441, 650000, 1000000), 
    ("Brampton", "L6P", 43.7315, -79.7624, 600000, 950000), 
    ("Ottawa", "K1A", 45.4215, -75.6972, 550000, 900000), 
    ("Hamilton", "L8P", 43.2557, -79.8711, 500000, 850000), 
    ("London", "N6A", 42.9849, -81.2453, 400000, 700000), 
    ("Waterloo", "N2L", 43.4643, -80.5204, 500000, 850000), 
    ("Burlington", "L7R", 43.3255, -79.7990, 650000, 1100000), 
    ("Oakville", "L6J", 43.4675, -79.6877, 900000, 1600000), 
]

property_types = ["Detached", "Semi-Detached", "Townhouse", "Condo"]

start_date = datetime(2022, 1, 1)
end_date = datetime(2025, 12, 31)
days_range = (end_date - start_date).days

def generate_property_row():
    city, postal, base_lat, base_lon, price_min, price_max = random.choice(cities)
    beds = np.random.choice([1,2,3,4,5], p=[0.1, 0.25, 0.35, 0.2, 0.1])
    baths = np.random.choice([1,2,3,4], p =[0.2, 0.5,0.25,0.05])
    ptype = random.choice(property_types)

    if ptype == "Condo":
        sqft = int(np.random.normal(800 + beds * 150, 120))
        lot_size = 0
    elif ptype == "Townhouse":
        sqft = int(np.random.normal(1200 + beds *200, 150))
        lot_size = int(np.random.normal(2000, 400))
    else :
        sqft = int(np.random.normal(1500 + beds * 250, 200))
        lot_size = int(np.random.normal(3500, 800))
    
    sqft = max(400, sqft)
    lot_size = max(0, lot_size)

    year_built = int(np.random.choice(range(1965, 2023)))

    offset_days = np.random.randint(0, days_range + 1)
    sale_date = start_date + timedelta(days=int(offset_days))
    
    base_price = np.random.uniform(price_min, price_max) 
    age = sale_date.year - year_built 
    age_factor = max(0.7, 1.1 - age * 0.01) 
    size_factor = 1.0 + (sqft - 1500) / 8000.0 
    beds_factor = 1.0 + (beds - 3) * 0.05

    type_factor = { 
        "Condo": 0.85, 
        "Townhouse": 0.95, 
        "Semi-Detached": 1.0, 
        "Detached": 1.1, 
    }[ptype]

    type_factor = { 
        "Condo": 0.85, 
        "Townhouse": 0.95, 
        "Semi-Detached": 1.0, 
        "Detached": 1.1, 
    }[ptype]
    
    noise = np.random.normal(1.0, 0.08)
    price = int(max(200000, base_price * age_factor * size_factor * beds_factor * type_factor * noise))

    lat = base_lat + np.random.normal(0, 0.01) 
    lon = base_lon + np.random.normal(0, 0.01)

    return { 
        "beds": beds, 
        "baths": baths, 
        "sqft": sqft, 
        "lot_size": lot_size, 
        "year_built": year_built, 
        "property_type": ptype, 
        "city": city, 
        "postal_code": postal, 
        "lat": round(lat, 6), 
        "lon": round(lon, 6), 
        "sale_date": sale_date.strftime("%Y-%m-%d"), 
        "price": price, 
    }



3. ONTARIO HOUSING TRACKER (STATIC)

In [27]:
df_tracker = pd.DataFrame({ 
    "municipality": ["Toronto","Mississauga","Brampton","Ottawa","Hamilton","London","Waterloo","Burlington","Oakville"], 
    "year": [2023]*9, 
    "housing_starts": [28500,8200,7600,6800,5400,4900,3500,2900,3100], 
    "new_units": [31000,9000,8100,7200,5800,5200,3800,3100,3400], 
    "long_term_care_beds": [1200,300,250,400,180,150,100,80,90], 
    "aru_units": [850,210,190,260,140,120,90,60,70] 
}) 
df_tracker.to_csv("data/ontario_housing_tracker.csv", index=False)

4. CMHC PRICE INDEX (STATIC)

In [28]:
df_cmhc = pd.DataFrame({ 
    "region": ["Toronto","Mississauga","Brampton","Ottawa","Hamilton","London","Waterloo","Burlington","Oakville"], 
    "year": [2023]*9, 
    "avg_price_region": [1120000,890000,820000,710000,690000,540000,760000,880000,1350000], 
    "price_index": [178,162,155,148,145,132,150,160,190] 
}) 
df_cmhc.to_csv("data/cmhc_prices.csv", index=False)

5. MERGE ALL DATASETS

In [29]:
df_prop = pd.DataFrame([generate_property_row() for _ in range(N_ROWS)]) 
df_prop.to_csv("data/property_data.csv", index=False)

prop = df_prop.copy()
tracker = df_tracker.copy()
cmhc = df_cmhc.copy()

prop["year"] = pd.to_datetime(prop["sale_date"]).dt.year
prop["city_norm"] = prop["city"].str.lower()
tracker["city_norm"] = tracker["municipality"].str.lower()
cmhc["city_norm"] = cmhc["region"].str.lower()

merged = (
    prop.merge(tracker, on=["city_norm", "year"], how="left")
        .merge(cmhc, on=["city_norm", "year"], how="left")
)

6. TRAIN MODEL

In [30]:
TARGET = "price" 

numeric_features = ["beds","baths","sqft","lot_size","year_built", 
                    "housing_starts","new_units","long_term_care_beds", 
                    "aru_units","avg_price_region","price_index"] 

categorical_features = ["property_type","city","postal_code"]

X = merged[numeric_features + categorical_features] 
y = merged[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 

preprocessor = ColumnTransformer(
    transformers=[ 
        ("num", "passthrough", numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

model = XGBRegressor( n_estimators=500, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, objective="reg:squarederror", random_state=42 ) 
pipeline = Pipeline([ ("preprocess", preprocessor), ("model", model) ]) 
pipeline.fit(X_train, y_train) 
preds = pipeline.predict(X_test) 

print("------------------------------------------------")
print("Any NaN in predictions:", np.isnan(preds).any()) 
print("Any Inf in predictions:", np.isinf(preds).any()) 
print("Prediction range:", preds.min(), preds.max()) 
print("Actual price range:", y_test.min(), y_test.max())
print("------------------------------------------------")
print("MAE:", mean_absolute_error(y_test, preds)) 
print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))#, squared=False)) 
print("R²:", r2_score(y_test, preds))


------------------------------------------------
Any NaN in predictions: False
Any Inf in predictions: False
Prediction range: 283358.3 1.8309342e+06
Actual price range: 219538 2140893
------------------------------------------------
MAE: 102526.1015625
RMSE: 132382.04687947684
R²: 0.775572657585144


7. SAVE MODEL 

In [32]:
joblib.dump(pipeline, "models/ontario_price_model.pkl") 
print("\nModel saved to models/ontario_price_model.pkl")


Model saved to models/ontario_price_model.pkl
