In [5]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd

from tqdm import tqdm

import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

from scipy.interpolate import griddata
from scipy.ndimage import gaussian_filter

In [6]:
# ---- Analysis scope ----
# Language-count columns to map. Other columns that have been toggled here:
# num_ara, num_chi, num_ita, num_por, num_ukr, num_pol, num_kor.
LANG_VARS = ["num_yid", "num_rus"]   # Modify as needed

# Years to process; each one is substituted into CENT_NUM_PATTERN.
YEARS = [1971, 1996, 2021]

# ---- Input data ----
# Per-year centroid file; "{year}" is filled in via str.format.
CENT_NUM_PATTERN = "../data/language/{year}/num_speakers_centroid_{year}.gpkg"

# ---- Output directory ----
OUT_DIR = "../data/viz"

# ---- Coordinate reference system ----
# Ensure a projected CRS for distance-based smoothing.
TARGET_CRS = "EPSG:32617"   # UTM Zone 17N — covers Toronto well

# ---- Toronto-region boundaries ----
# Read the boundary layer and reproject it straight away so that it overlays
# the reprojected centroid data.
BOUNDARY_PATH = "../data/geo/regions/TMUN_CSD_OldTO.gpkg"
BOUNDARY = gpd.read_file(BOUNDARY_PATH).to_crs(TARGET_CRS)

In [7]:
def load_centroid_data(year):
    """Read the centroid layer for ``year`` and reproject it to ``TARGET_CRS``.

    Parameters
    ----------
    year : int or str
        Value substituted for ``{year}`` in ``CENT_NUM_PATTERN``.

    Returns
    -------
    GeoDataFrame
        Contents of the file at the formatted path, in ``TARGET_CRS``.
    """
    centroid_path = CENT_NUM_PATTERN.format(year=year)
    return gpd.read_file(centroid_path).to_crs(TARGET_CRS)

def make_grid(gdf, cell_size=1000):
    """Build a regular mesh of grid nodes covering the bounding box of ``gdf``.

    Nodes start at the lower-left corner of ``gdf.total_bounds`` and are spaced
    ``cell_size`` apart; the last node on each axis is at or beyond the upper
    bound, so the whole bounding box is covered.

    Only ``gdf.total_bounds`` is read, so any geometry type is accepted.

    Parameters
    ----------
    gdf : GeoDataFrame-like
        Object exposing ``total_bounds`` as ``(xmin, ymin, xmax, ymax)``.
    cell_size : float, default 1000
        Node spacing, in the units of the layer's CRS (metres for a UTM CRS).

    Returns
    -------
    (xx, yy) : tuple of 2-D ndarrays
        ``np.meshgrid`` output: ``xx[i, j]`` / ``yy[i, j]`` are the coordinates
        of the node in row ``i`` (y direction) and column ``j`` (x direction).

    Raises
    ------
    ValueError
        If ``cell_size`` is not strictly positive.
    """
    if cell_size <= 0:
        raise ValueError("cell_size must be positive")

    xmin, ymin, xmax, ymax = gdf.total_bounds

    # Extending the stop value by one cell guarantees the upper bound is covered
    # even when the extent is not an exact multiple of cell_size.
    x_nodes = np.arange(xmin, xmax + cell_size, cell_size)
    y_nodes = np.arange(ymin, ymax + cell_size, cell_size)

    xx, yy = np.meshgrid(x_nodes, y_nodes)
    return xx, yy

def rasterize_and_smooth(gdf, lang_var, xx, yy, sigma=1.0):
    """Interpolate a point attribute onto a grid and blur it with a Gaussian.

    Parameters
    ----------
    gdf : GeoDataFrame-like
        Provides ``geometry.x`` / ``geometry.y`` sample locations and a
        ``lang_var`` column of values.
    lang_var : str
        Name of the column to interpolate; missing values are treated as 0.
    xx, yy : 2-D ndarrays
        Grid node coordinates at which the surface is evaluated.
    sigma : float, default 1.0
        Standard deviation passed to ``gaussian_filter`` (in grid cells).

    Returns
    -------
    ndarray
        Smoothed surface with the same shape as ``xx``.
    """
    geom = gdf.geometry
    sample_xy = np.column_stack((geom.x, geom.y))
    sample_vals = gdf[lang_var].fillna(0).to_numpy()

    # Linear interpolation yields NaN at nodes outside the convex hull of the
    # samples; those nodes are set to 0 before smoothing.
    interpolated = griddata(sample_xy, sample_vals, (xx, yy), method="linear")
    filled = np.nan_to_num(interpolated, nan=0.0)

    # Light smoothing to avoid losing detail.
    return gaussian_filter(filled, sigma=sigma)

def plot_heatmap(grid, xx, yy, boundary, lang_var, year, out_path):
    """Render ``grid`` as a raster heatmap with the boundary overlaid and save it.

    Parameters
    ----------
    grid : 2-D ndarray
        Values to draw; one value per grid node, same shape as ``xx`` / ``yy``.
    xx, yy : 2-D ndarrays
        Node coordinates of ``grid``. Nodes are assumed to be evenly spaced
        along each axis.
    boundary : GeoDataFrame-like
        Drawn on top as unfilled black outlines via ``boundary.plot(ax=...)``.
    lang_var : str
        Used in the title and as the colorbar label.
    year : int or str
        Used in the title.
    out_path : str or path-like
        Destination of the saved figure (300 dpi).

    Returns
    -------
    None
    """

    def _half_cell(coords, n_nodes):
        # Half the node spacing along one axis. With a single node there is no
        # spacing to measure, so fall back to 0.5 to keep the extent non-empty.
        if n_nodes > 1:
            return 0.5 * (coords.max() - coords.min()) / (n_nodes - 1)
        return 0.5

    fig, ax = plt.subplots(figsize=(10, 10))
    try:
        # imshow's ``extent`` describes the OUTER edges of the image, whereas
        # xx/yy hold node positions (pixel centres). Pad by half a cell on every
        # side so each pixel is centred on its node and the raster lines up with
        # the boundary overlay.
        half_dx = _half_cell(xx, xx.shape[1])
        half_dy = _half_cell(yy, yy.shape[0])

        im = ax.imshow(
            grid,
            extent=[
                xx.min() - half_dx,
                xx.max() + half_dx,
                yy.min() - half_dy,
                yy.max() + half_dy,
            ],
            origin="lower",
            cmap="viridis"
        )

        boundary.plot(ax=ax, facecolor="none", edgecolor="black", linewidth=0.5)

        ax.set_title(f"Heatmap — {lang_var}, {year}")
        fig.colorbar(im, ax=ax, label=lang_var)

        fig.tight_layout()
        fig.savefig(out_path, dpi=300)
    finally:
        # Always release the figure, even if drawing or saving failed; this
        # function is called in a loop and open figures accumulate in memory.
        plt.close(fig)

def plot_contour(grid, xx, yy, boundary, lang_var, year, out_path):
    """Render ``grid`` as a filled contour map with the boundary overlaid and save it.

    Twelve evenly spaced contour levels span the range of ``grid``. If ``grid``
    is constant, the upper end of the range is raised by 1 so that the levels
    remain strictly increasing and the (flat) map can still be drawn.

    Parameters
    ----------
    grid : 2-D ndarray
        Values to contour, same shape as ``xx`` / ``yy``.
    xx, yy : 2-D ndarrays
        Node coordinates of ``grid``.
    boundary : GeoDataFrame-like
        Drawn on top as unfilled black outlines via ``boundary.plot(ax=...)``.
    lang_var : str
        Used in the title and as the colorbar label.
    year : int or str
        Used in the title.
    out_path : str or path-like
        Destination of the saved figure (300 dpi).

    Returns
    -------
    None
    """
    level_min = float(grid.min())
    level_max = float(grid.max())
    if not level_max > level_min:
        # A flat surface would give 12 identical levels, which contourf rejects
        # ("Contour levels must be increasing"). Widen the range artificially.
        level_max = level_min + 1.0
    levels = np.linspace(level_min, level_max, 12)

    fig, ax = plt.subplots(figsize=(10, 10))
    try:
        cs = ax.contourf(
            xx, yy, grid,
            levels=levels,
            cmap="viridis"
        )

        boundary.plot(ax=ax, facecolor="none", edgecolor="black", linewidth=0.5)

        ax.set_title(f"Filled Contour Map — {lang_var}, {year}")
        fig.colorbar(cs, ax=ax, label=lang_var)

        fig.tight_layout()
        fig.savefig(out_path, dpi=300)
    finally:
        # Always release the figure, even if drawing or saving failed; this
        # function is called in a loop and open figures accumulate in memory.
        plt.close(fig)

In [8]:
# Render one heatmap and one filled-contour map per (language, year) pair.
#
# The centroid layer and its grid depend only on the year, so they are loaded
# lazily and cached: each year's file is read once, not once per language.
year_cache = {}

for lang_var in LANG_VARS:
    print(f"\n---- {lang_var} ----")

    # Output directory for this language (same for every year)
    lang_dir = os.path.join(OUT_DIR, lang_var)
    os.makedirs(lang_dir, exist_ok=True)

    for year in YEARS:
        print(f"  Year {year}")

        # Load data and make the grid only once per year
        if year not in year_cache:
            year_gdf = load_centroid_data(year)
            year_xx, year_yy = make_grid(year_gdf)
            year_cache[year] = (year_gdf, year_xx, year_yy)
        gdf, xx, yy = year_cache[year]

        # Rasterize and smooth
        grid = rasterize_and_smooth(gdf, lang_var, xx, yy, sigma=1)

        # Paths
        heat_path = os.path.join(lang_dir, f"heat_{year}.png")
        cont_path = os.path.join(lang_dir, f"contour_{year}.png")

        # Generate maps
        plot_heatmap(grid, xx, yy, BOUNDARY, lang_var, year, heat_path)
        plot_contour(grid, xx, yy, BOUNDARY, lang_var, year, cont_path)


---- num_yid ----
  Year 1971
  Year 1996
  Year 2021

---- num_rus ----
  Year 1971
  Year 1996
  Year 2021
