In [1]:
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
tqdm.pandas()

from shapely.geometry import MultiPolygon, Polygon
from shapely.validation import make_valid

In [2]:
from constants import CENSUS_25

In [3]:
# Load GTA municipalities boundary
gta = gpd.read_file("../data/geo/regions/TMUN.gpkg")

# Ensure geometry validity
gta["geometry"] = gta["geometry"].apply(make_valid)

# --- 1. Reproject to a projected CRS (meters) ---
# Toronto is in UTM Zone 17N
CRS_METERS = "EPSG:32617"
gta_proj = gta.to_crs(CRS_METERS)

# Dissolve to single geometry for intersection tests
gta_boundary = gta_proj.dissolve()

# --- 2. Create bounding box for grid ---
minx, miny, maxx, maxy = gta_proj.total_bounds

# Set grid resolution
cell_size = 1000  # meters

# Generate grid coordinates
x_coords = np.arange(minx, maxx + cell_size, cell_size)
y_coords = np.arange(miny, maxy + cell_size, cell_size)

# Generate polygons for each 1 km square
grid_polys = []
for x in x_coords:
    for y in y_coords:
        grid_polys.append(
            Polygon([
                (x, y),
                (x + cell_size, y),
                (x + cell_size, y + cell_size),
                (x, y + cell_size)
            ])
        )

grid = gpd.GeoDataFrame({"geometry": grid_polys}, crs=CRS_METERS)

# --- 3. Keep only full squares that intersect the TMUN boundary ---
# Build a spatial index for performance
sindex = grid.sindex

possible_matches_idx = list(sindex.intersection(gta_boundary.geometry.total_bounds))
possible_matches = grid.iloc[possible_matches_idx]

intersects_mask = possible_matches.intersects(gta_boundary.geometry.iloc[0])

grid_intersecting = possible_matches[intersects_mask].copy()

# --- 4. Save as GeoPackage ---
OUT_PATH = "../data/geo/regions/TMUN_squares.gpkg"
grid_intersecting.to_file(OUT_PATH, driver="GPKG")

grid_intersecting.head()

Unnamed: 0,geometry
1288,"POLYGON ((610569.054 4822163.339, 611569.054 4..."
1349,"POLYGON ((611569.054 4822163.339, 612569.054 4..."
1411,"POLYGON ((612569.054 4823163.339, 613569.054 4..."
1228,"POLYGON ((609569.054 4823163.339, 610569.054 4..."
1289,"POLYGON ((610569.054 4823163.339, 611569.054 4..."


In [4]:
# ---------------------------
# Cell 4 — Areal interpolation (tract -> 1km squares)
# ---------------------------
# Where the squares live (created in Cell 3)
SQUARES_PATH = "../data/geo/regions/TMUN_squares.gpkg"

# Output file pattern (one file per year)
OUT_PATTERN = "../data/language/{year}/num_speakers_squares_{year}.gpkg"

# Ensure output directories exist when we write later
def _ensure_out_dir(path_str):
    p = Path(path_str).parent
    p.mkdir(parents=True, exist_ok=True)

# Load squares once (we will reuse for all years)
squares = gpd.read_file(SQUARES_PATH)

# Ensure squares are projected in meters for area calculations
CRS_METERS = "EPSG:32617"
if squares.crs is None:
    raise ValueError(f"{SQUARES_PATH} has no CRS; please ensure it is projected (EPSG:32617 recommended).")
squares = squares.to_crs(CRS_METERS)
squares = squares.reset_index(drop=True)  # ensure clean integer index

# Build a spatial index for squares for fast intersection queries
squares_sindex = squares.sindex

# Main loop over census years
for year in CENSUS_25:
    print(f"\nInterpolating year {year}...")

    # Load the tract-level counts for this year
    tract_path = f"../data/language/{year}/num_speakers_tmun_{year}.gpkg"
    tracts = gpd.read_file(tract_path)

    # Ensure projected CRS for area computations
    tracts = tracts.to_crs(CRS_METERS)

    # Identify the columns we want to interpolate:
    # all columns that start with "num_" (including num_tot and num_not_eng)
    num_cols = [c for c in tracts.columns if c.startswith("num_") and c != "geometry"]
    if "num_tot" not in num_cols:
        # Safety check, include if present
        if "num_tot" in tracts.columns:
            num_cols.append("num_tot")

    # Prepare result GeoDataFrame: copy squares geometry and zeroed numeric columns
    result = squares.copy()
    for c in num_cols:
        result[c] = 0.0

    # For performance, convert numeric tract columns to numpy (we will access them per-tract)
    # but keep them available in the tracts GeoDataFrame for clarity.
    tracts[num_cols] = tracts[num_cols].astype(float)

    # Iterate over tracts and distribute counts to intersecting squares
    # Using tqdm for progress visibility
    for idx, tract in tqdm(tracts.iterrows(), total=len(tracts), desc=f"year {year} tracts"):
        tract_geom = tract.geometry
        if tract_geom is None or tract_geom.is_empty:
            continue

        tract_area = tract_geom.area
        if tract_area == 0 or np.isnan(tract_area):
            continue

        # candidate square indices whose bounding boxes intersect the tract bbox
        possible_idx = list(squares_sindex.intersection(tract_geom.bounds))
        if not possible_idx:
            continue

        candidates = result.iloc[possible_idx]  # note: result has same index as squares
        # compute actual intersections (this returns a GeoSeries)
        intersections = candidates.geometry.intersection(tract_geom)

        # compute intersection areas
        inter_areas = intersections.area

        # mask where area > 0
        mask = inter_areas > 0
        if not mask.any():
            continue

        # indices (in result GeoDataFrame) that receive contributions
        recv_idx = candidates.index[mask]

        # compute ratios (proportion of tract area that falls in each square)
        ratios = inter_areas[mask] / tract_area  # pandas Series indexed by recv_idx

        # For each numeric column, add the apportioned count to the corresponding square rows
        # We vectorize per-column by multiplying ratios (aligned by index)
        for col in num_cols:
            tract_value = tract[col] if pd.notnull(tract[col]) else 0.0
            if tract_value == 0:
                # nothing to add for this column
                continue

            # apportioned counts per receiving square (Series aligned to recv_idx)
            apportioned = ratios * float(tract_value)

            # add apportioned values into result dataframe (aligning on index)
            # use .loc to write values for these indices
            result.loc[recv_idx, col] += apportioned.values

    # Optional: round counts to, say, 2 decimals (population counts can remain floats but you can round)
    # Here we'll keep floats but fill any tiny negatives (due to numerical issues) with 0
    for c in num_cols:
        result[c] = result[c].clip(lower=0.0)

    # Save the interpolated squares for this year
    out_path = OUT_PATTERN.format(year=year)
    _ensure_out_dir(out_path)
    result.to_file(out_path, driver="GPKG")
    print(f"Saved interpolated squares for {year} -> {out_path}")

print("\nAll years complete.")



Interpolating year 1971...


year 1971 tracts: 100%|██████████| 422/422 [00:04<00:00, 89.98it/s] 


Saved interpolated squares for 1971 -> ../data/language/1971/num_speakers_squares_1971.gpkg

Interpolating year 1996...


year 1996 tracts: 100%|██████████| 713/713 [00:09<00:00, 72.92it/s]


Saved interpolated squares for 1996 -> ../data/language/1996/num_speakers_squares_1996.gpkg

Interpolating year 2021...


year 2021 tracts: 100%|██████████| 1049/1049 [00:27<00:00, 38.79it/s]


Saved interpolated squares for 2021 -> ../data/language/2021/num_speakers_squares_2021.gpkg

All years complete.


In [5]:
# CELL 5 - compute pct files, round num files, create centroid versions
IN_PATTERN  = "../data/language/{year}/num_speakers_squares_{year}.gpkg"
PCT_PATTERN = "../data/language/{year}/pct_speakers_squares_{year}.gpkg"

CENT_NUM_PATTERN = "../data/language/{year}/num_speakers_centroid_{year}.gpkg"
CENT_PCT_PATTERN = "../data/language/{year}/pct_speakers_centroid_{year}.gpkg"

for year in CENSUS_25:
    print(f"Processing {year}…")

    # ------------------------------
    # Load NUM dataset
    # ------------------------------
    num_path = IN_PATTERN.format(year=year)
    gdf_num = gpd.read_file(num_path)

    # Identify all language count columns (start with 'num_' but exclude num_tot)
    lang_cols = [c for c in gdf_num.columns if c.startswith("num_") and c != "num_tot"]

    # ------------------------------
    # Compute percentages
    # ------------------------------
    gdf_pct = gdf_num.copy()

    for col in lang_cols:
        lang = col.replace("num_", "")
        gdf_pct[f"pct_{lang}"] = (gdf_pct[col] / gdf_pct["num_tot"] * 100).round(2)

    # Keep num_tot (rounded)
    gdf_pct["num_tot"] = gdf_pct["num_tot"].round(2)

    # Drop original num language columns for pct file
    gdf_pct = gdf_pct.drop(columns=lang_cols)

    # Save pct file
    pct_path = PCT_PATTERN.format(year=year)
    gdf_pct.to_file(pct_path, driver="GPKG")

    # ------------------------------
    # Round all numeric values in NUM file
    # ------------------------------
    for col in lang_cols + ["num_tot"]:
        gdf_num[col] = gdf_num[col].round(2)

    gdf_num.to_file(num_path, driver="GPKG")  # overwrite with rounded values

    # ------------------------------
    # Create centroid versions
    # ------------------------------
    gdf_num_cent = gdf_num.copy()
    gdf_num_cent["geometry"] = gdf_num_cent.centroid
    gdf_num_cent.to_file(CENT_NUM_PATTERN.format(year=year), driver="GPKG")

    gdf_pct_cent = gdf_pct.copy()
    gdf_pct_cent["geometry"] = gdf_pct_cent.centroid
    gdf_pct_cent.to_file(CENT_PCT_PATTERN.format(year=year), driver="GPKG")

    print(f"✓ Finished {year}")


Processing 1971…
✓ Finished 1971
Processing 1996…
✓ Finished 1996
Processing 2021…
✓ Finished 2021


In [4]:
# === Cell N: Convert centroid GPKGs into frontend-ready JSON ===
for year in CENSUS_25:
    in_path = f"../data/language/{year}/num_speakers_centroid_{year}.gpkg"
    out_path = f"../data/language/{year}/num_speakers_centroid_{year}.json"

    print(f"Processing year {year}...")

    # Load
    gdf = gpd.read_file(in_path)

    # Ensure CRS is WGS84
    if gdf.crs != "EPSG:4326":
        gdf = gdf.to_crs(4326)

    # Extract centroid coords
    # (geometry *should* already be a point but we recompute to be safe)
    gdf["x"] = gdf.geometry.x
    gdf["y"] = gdf.geometry.y

    # Keep numeric columns beginning with "num_"
    num_cols = [c for c in gdf.columns if c.startswith("num_")]

    # Build clean DataFrame for export
    df = gdf[["x", "y"] + num_cols].copy()

    # Convert to records for JSON output
    records = df.to_dict(orient="records")

    # Save JSON
    (
        pd.DataFrame(records)
        .to_json(out_path, orient="records", indent=2)
    )

    print(f"✓ Saved {out_path} ({len(df)} rows)")

Processing year 1971...
✓ Saved ../data/language/1971/num_speakers_centroid_1971.json (2255 rows)
Processing year 1996...
✓ Saved ../data/language/1996/num_speakers_centroid_1996.json (2255 rows)
Processing year 2021...
✓ Saved ../data/language/2021/num_speakers_centroid_2021.json (2255 rows)
