### Load base and dim

In [8]:
from pathlib import Path
import pandas as pd

# Resolve the repository root: if the kernel was started inside notebooks/,
# the root is one directory up; otherwise the working directory is the root.
working_dir = Path.cwd()
REPO_ROOT = working_dir.parent if working_dir.name == "notebooks" else working_dir

# Data layout under the repository root.
DATA_DIR = REPO_ROOT / "data"
INTERIM_DIR = DATA_DIR.joinpath("interim")
PROCESSED_DIR = DATA_DIR.joinpath("processed")

# Input files for this step.
base_path = PROCESSED_DIR.joinpath("base_nbhd_day.parquet")
dim_path = INTERIM_DIR.joinpath("dim_neighbourhoods.parquet")

base, dim = pd.read_parquet(base_path), pd.read_parquet(dim_path)

# Quick sanity check on the shapes, then preview the first rows of base.
print(f"base: {base.shape}")
print(f"dim: {dim.shape}")
base.head(2)


base: (692514, 4)
dim: (158, 4)


Unnamed: 0,AREA_ID,AREA_NAME,date,collision_count
0,2502366,South Eglinton-Davisville,2014-01-01,0
1,2502366,South Eglinton-Davisville,2014-01-02,0


### Finding the base key column automatically

In [9]:
# Accepted spellings of the area key column, in priority order.
possible_keys = ["AREA_ID", "area_id"]

# Keep the first candidate that is actually a column of base.
matching_keys = [key for key in possible_keys if key in base.columns]
base_key = matching_keys[0] if matching_keys else None

# Fail loudly if neither spelling exists, listing the columns that were found.
if base_key is None:
    raise ValueError(f"Could not find AREA_ID/area_id in base. Columns: {list(base.columns)}")

print("Using base key:", base_key)


Using base key: AREA_ID


### Merge, validate and save

In [10]:
# Attach nbhd_id / area_name from dim to every row of base, check the result,
# and write it to PROCESSED_DIR / "base_nbhd_day_w158.parquet".
before_rows = len(base)

# Patch to avoid the int-vs-string merge error: report the dtype of each key
# column, then force both to str so the join keys are comparable.
print(f"Base key ({base_key}) type: {base[base_key].dtype}")
print(f"Dim key (area_id) type: {dim['area_id'].dtype}")
# NOTE: this rewrites the key columns of `base` and `dim` in place, so any cell
# run after this one sees string keys. Re-running the cell is harmless because
# casting an already-str column to str is a no-op.
base[base_key] = base[base_key].astype(str)
dim["area_id"] = dim["area_id"].astype(str)

# validate="many_to_one" makes pandas raise MergeError up front when area_id is
# not unique in dim, instead of silently duplicating base rows in the result.
merged = base.merge(
    dim[["area_id", "nbhd_id", "area_name"]],
    left_on=base_key,
    right_on="area_id",
    how="left",
    validate="many_to_one",
)

# Secondary guard: a left join against a unique right key must keep the row count.
assert len(merged) == before_rows, "Row count changed after merge - should not happen"

# Rows of base whose key had no match in dim come back with a null nbhd_id.
missing = merged["nbhd_id"].isna().sum()
print("Missing nbhd_id:", missing)
if missing > 0:
    # Show a sample of the unmatched keys before aborting.
    print(merged.loc[merged["nbhd_id"].isna(), [base_key]].drop_duplicates().head(10))
    raise ValueError("Some AREA_ID values did not map to dim_neighbourhoods. Fix dim mapping first.")

# Persist only after every check above has passed.
out_path = PROCESSED_DIR / "base_nbhd_day_w158.parquet"
merged.to_parquet(out_path, index=False)
print("Saved:", out_path)

merged.head()


Base key (AREA_ID) type: int32
Dim key (area_id) type: str
Missing nbhd_id: 0
Saved: C:\code\pyspark-playground\Covercheck-Toronto\data\processed\base_nbhd_day_w158.parquet


Unnamed: 0,AREA_ID,AREA_NAME,date,collision_count,area_id,nbhd_id,area_name
0,2502366,South Eglinton-Davisville,2014-01-01,0,2502366,174,South Eglinton-Davisville
1,2502366,South Eglinton-Davisville,2014-01-02,0,2502366,174,South Eglinton-Davisville
2,2502366,South Eglinton-Davisville,2014-01-03,2,2502366,174,South Eglinton-Davisville
3,2502366,South Eglinton-Davisville,2014-01-04,0,2502366,174,South Eglinton-Davisville
4,2502366,South Eglinton-Davisville,2014-01-05,1,2502366,174,South Eglinton-Davisville


#### After merging we have both `AREA_NAME` and `area_name`. This is harmless for now, but for modelling we want a single canonical name field to avoid confusion.