### Imports and paths

In [1]:
import os
from pathlib import Path

import pandas as pd
import geopandas as gpd

# Resolve the repository root whether the kernel was started in the repo root
# itself or inside its "notebooks" subfolder.
_cwd = Path.cwd()
REPO_ROOT = _cwd.parent if _cwd.name == "notebooks" else _cwd

# All data folders live under <repo>/data, one subfolder per pipeline stage.
DATA_DIR = REPO_ROOT / "data"
RAW_DIR, INTERIM_DIR, PROCESSED_DIR = (
    DATA_DIR / stage for stage in ("raw", "interim", "processed")
)

# Only the interim folder is created here; the call is a no-op if it exists.
INTERIM_DIR.mkdir(parents=True, exist_ok=True)

print("REPO_ROOT:", REPO_ROOT)


REPO_ROOT: C:\code\pyspark-playground\Covercheck-Toronto


### Load neighbourhood polygons

In [2]:
# Build the path from RAW_DIR instead of a "../data/..." string: the relative
# form only resolves when the kernel's working directory is notebooks/, while
# RAW_DIR is derived from REPO_ROOT and works from either location.
nbhd_path = RAW_DIR / "boundaries" / "toronto_neighbourhoods.geojson"

gdf = gpd.read_file(nbhd_path)
print("Loaded polygons:", gdf.shape)
print("CRS:", gdf.crs)
gdf.head(2)


Loaded polygons: (158, 12)
CRS: EPSG:4326


Unnamed: 0,_id,AREA_ID,AREA_ATTR_ID,PARENT_AREA_ID,AREA_SHORT_CODE,AREA_LONG_CODE,AREA_NAME,AREA_DESC,CLASSIFICATION,CLASSIFICATION_CODE,OBJECTID,geometry
0,1,2502366,26022881,,174,174,South Eglinton-Davisville,South Eglinton-Davisville (174),Not an NIA or Emerging Neighbourhood,,17824737.0,"MULTIPOLYGON (((-79.38635 43.69783, -79.38623 ..."
1,2,2502365,26022880,,173,173,North Toronto,North Toronto (173),Not an NIA or Emerging Neighbourhood,,17824753.0,"MULTIPOLYGON (((-79.39744 43.70693, -79.39837 ..."


### Inspecting columns

In [3]:
# Show every column label so the candidate lists used for auto-detection
# can be checked against what the file actually contains.
gdf.columns.tolist()

['_id',
 'AREA_ID',
 'AREA_ATTR_ID',
 'PARENT_AREA_ID',
 'AREA_SHORT_CODE',
 'AREA_LONG_CODE',
 'AREA_NAME',
 'AREA_DESC',
 'CLASSIFICATION',
 'CLASSIFICATION_CODE',
 'OBJECTID',
 'geometry']

### Auto-detecting the required columns

In [4]:
def pick_first_existing(cols, candidates):
    """Return the first candidate that names an existing column, ignoring case.

    Parameters
    ----------
    cols : iterable
        Column labels to search (e.g. ``df.columns``). Labels that are not
        strings are ignored, since they can never match a string candidate.
    candidates : iterable of str
        Candidate column names in priority order; the first one found wins.

    Returns
    -------
    The matching label exactly as it appears in ``cols`` (original casing),
    or ``None`` when no candidate matches.
    """
    # Map UPPERCASE label -> original label. Non-string labels (pandas allows
    # e.g. integer column names) are skipped rather than crashing on .upper().
    by_upper = {c.upper(): c for c in cols if isinstance(c, str)}
    for cand in candidates:
        key = cand.upper()
        if key in by_upper:
            return by_upper[key]
    return None

# Each lookup lists candidate column names in priority order; the first one
# present in the boundary file (case-insensitively) is used.
area_id_col = pick_first_existing(
    gdf.columns,
    ["AREA_ID", "AREA_LONG_CODE", "NEIGHBOURHOOD_ID", "id"],
)
area_name_col = pick_first_existing(
    gdf.columns,
    ["AREA_NAME", "AREA_LONG_NAME", "NAME", "NEIGHBOURHOOD_NAME"],
)
nbhd_id_col = pick_first_existing(
    gdf.columns,
    ["AREA_SHORT_CODE", "HOOD_158", "NBHD_ID", "NEIGHBOURHOOD_158", "SHORT_CODE"],
)

# Report what was detected so an unresolved column shows up as None.
detected = (
    ("area_id_col", area_id_col),
    ("area_name_col", area_name_col),
    ("nbhd_id_col", nbhd_id_col),
)
for label, value in detected:
    print(f"{label}:", value)

# Stop here if any required column could not be resolved.
if any(value is None for _, value in detected):
    raise ValueError(
        "Could not auto-detect one of the required columns.\n"
        "Run `list(gdf.columns)` and then manually set area_id_col, area_name_col, nbhd_id_col."
    )


area_id_col: AREA_ID
area_name_col: AREA_NAME
nbhd_id_col: AREA_SHORT_CODE


### Build the dimension table and save it

In [5]:
# Keep only the identifier, name and geometry columns, under stable names.
dim = gdf[[area_id_col, area_name_col, nbhd_id_col, "geometry"]].copy()
dim = dim.rename(columns={
    area_id_col: "area_id",
    area_name_col: "area_name",
    nbhd_id_col: "nbhd_id"
})

# Check area_id for nulls BEFORE the string cast: astype(str) can turn missing
# values into literal strings such as "nan"/"None", after which isna() would
# never flag them and the check would pass vacuously.
assert dim["area_id"].notna().all(), "area_id has nulls"

# clean types
dim["area_id"] = dim["area_id"].astype(str)
# errors="coerce" turns unparsable codes into missing values, which the
# null check below then catches; Int64 is the nullable integer dtype.
dim["nbhd_id"] = pd.to_numeric(dim["nbhd_id"], errors="coerce").astype("Int64")

# basic checks
assert dim["nbhd_id"].notna().all(), "nbhd_id has nulls"
assert dim["nbhd_id"].nunique() >= 100, "nbhd_id unique count seems too low — check mapping"
# helps to catch future boundary file issues
assert dim["area_id"].is_unique, "area_id has duplicates"
assert dim["nbhd_id"].is_unique, "nbhd_id has duplicates"

out_path = INTERIM_DIR / "dim_neighbourhoods.parquet"
dim.to_parquet(out_path, index=False)
print("Saved:", out_path)
dim.head()


Saved: C:\code\pyspark-playground\Covercheck-Toronto\data\interim\dim_neighbourhoods.parquet


Unnamed: 0,area_id,area_name,nbhd_id,geometry
0,2502366,South Eglinton-Davisville,174,"MULTIPOLYGON (((-79.38635 43.69783, -79.38623 ..."
1,2502365,North Toronto,173,"MULTIPOLYGON (((-79.39744 43.70693, -79.39837 ..."
2,2502364,Dovercourt Village,172,"MULTIPOLYGON (((-79.43411 43.66015, -79.43537 ..."
3,2502363,Junction-Wallace Emerson,171,"MULTIPOLYGON (((-79.4387 43.66766, -79.43841 4..."
4,2502362,Yonge-Bay Corridor,170,"MULTIPOLYGON (((-79.38404 43.64497, -79.38502 ..."
