In [2]:
import os
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.dates as mdates

In [3]:
from constants import FEDERAL_LOCATION_IDS
# City names are the keys of the FEDERAL_LOCATION_IDS mapping, in its iteration order.
FEDERAL_CITIES = list(FEDERAL_LOCATION_IDS)

RQ1: For each city, how many days per year fall into each health-risk category, based on the daily maximum AQHI? Is that number increasing over time?

In [12]:
# Categories for AQHI and PM2.5
#
# Each entry maps a display label to a (low, high) range. compute_heatmap tests
# membership with `low <= value <= high` (both ends inclusive), so bins that are
# meant to be mutually exclusive must not share an endpoint; otherwise a day
# sitting exactly on a breakpoint is counted in two categories.


def _just_below(x):
    """Largest float strictly less than x (turns an inclusive bound into an exclusive one)."""
    return np.nextafter(float(x), -np.inf)


def _just_above(x):
    """Smallest float strictly greater than x (turns an inclusive bound into an exclusive one)."""
    return np.nextafter(float(x), np.inf)


aqhi_categories = {
    # NOTE(review): the integer gaps between bins (3->4, 6->7, 10->11) assume
    # max_aqhi only takes whole-number values -- confirm for every city.
    "Low (1-3)": (1, 3),
    "Moderate (4-6)": (4, 6),
    "High (7-10)": (7, 10),
    "Very High (11)": (11, 11),
    # Roll-up of the two bins above; it overlaps them on purpose.
    "High and Very High (7+)": (7, 11),
}

pm25_categories = {
    # A breakpoint value belongs to the bin that starts at it (35.5, 55.5,
    # 150.5, 250.5), except 12, which stays in "Good".
    "Good (0-12)": (0, 12),
    "Moderate (12-35.5)": (_just_above(12), _just_below(35.5)),
    "Unhealthy-1 (35.5-55.5)": (35.5, _just_below(55.5)),
    "Unhealthy-2 (55.5-150.5)": (55.5, _just_below(150.5)),
    "Unhealthy-3 (150.5-250.5)": (150.5, _just_below(250.5)),
    "Hazardous (250.5+)": (250.5, np.inf),
    # Roll-up of everything from 55.5 upward; it overlaps the bins above on purpose.
    "Unhealthy and Hazardous (55.5+)": (55.5, np.inf),
}

# Year axis (columns) of every heatmap matrix: 2005 through 2025 inclusive.
years = list(range(2005, 2026))

# Helper: compute heatmap matrix (cities × years)
def compute_heatmap(case, col, categories, raw_col=None):
    """Count, per city and year, the daily rows whose value falls in each category.

    For every city in ``FEDERAL_CITIES`` this reads
    ``../../data/processed/federal/daily-<case>/<city>.csv`` (the case
    ``"aqhi_plus"`` maps to the ``daily-aqhi-plus`` directory), parses the first
    column as a date index, drops missing values of ``col`` and counts the rows
    per calendar year that satisfy ``low <= value <= high`` for each category.

    Parameters
    ----------
    case : str
        Dataset name; selects the input directory and is echoed in the progress line.
    col : str
        Column of the daily CSV to classify.
    categories : dict[str, tuple[float, float]]
        Label -> ``(low, high)`` range, inclusive on both ends.
    raw_col : optional
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    dict[str, pandas.DataFrame]
        One matrix per category label, indexed by ``FEDERAL_CITIES`` with
        ``years`` as columns. Cells hold day counts. Cities whose file or
        column is missing keep a row of zeros. Years present in the data but
        absent from ``years`` are ignored, so every matrix keeps exactly the
        ``years`` columns in their original order.
    """
    results = {cat: pd.DataFrame(0, index=FEDERAL_CITIES, columns=years) for cat in categories}

    # The directory suffix depends only on `case`, so resolve it once.
    dir_end = "aqhi-plus" if case == "aqhi_plus" else case

    print(f"Processing {case}")
    for city in tqdm(FEDERAL_CITIES):
        daily_path = f"../../data/processed/federal/daily-{dir_end}/{city}.csv"
        if not os.path.exists(daily_path):
            continue

        df_daily = pd.read_csv(daily_path, index_col=0, parse_dates=True)
        if col not in df_daily.columns:
            continue

        series = df_daily[col].dropna()

        for cat_name, (low, high) in categories.items():
            mask = (series >= low) & (series <= high)
            filtered = series[mask]
            if filtered.empty:
                continue

            matrix = results[cat_name]
            yearly_counts = filtered.groupby(filtered.index.year).size()
            # Align to the fixed year axis before writing. Assigning by
            # `yearly_counts.index` directly would let a year outside `years`
            # add a stray column (or raise), breaking the cities x years shape.
            aligned = yearly_counts.reindex(matrix.columns, fill_value=0)
            matrix.loc[city, :] = aligned.to_numpy()

    return results

# Helper: plot heatmap grids
def plot_heatmaps(results, case, cmap="Reds"):
    """Write one heatmap PNG per category into ``../../data/plots/federal-<case>``.

    ``results`` maps a category label to a DataFrame; each frame is drawn with
    its index on the y-axis, its columns on the x-axis and the cell values as
    colour. The output directory is created if needed. Files are named
    ``<case>_<label>.png``, where spaces in the label become underscores and
    parentheses are removed.
    """
    out_dir = f"../../data/plots/federal-{case}"
    os.makedirs(out_dir, exist_ok=True)

    # Filename slug rule: " " -> "_", "(" and ")" dropped.
    slug_table = str.maketrans({" ": "_", "(": None, ")": None})

    for cat_name, df_matrix in results.items():
        fig, ax = plt.subplots(figsize=(15, 10))
        im = ax.imshow(df_matrix.values, aspect="auto", cmap=cmap, interpolation="nearest")

        # One tick per row / column, labelled with the frame's own labels.
        ax.set_yticks(range(len(df_matrix.index)))
        ax.set_yticklabels(df_matrix.index)
        ax.set_xticks(range(len(df_matrix.columns)))
        ax.set_xticklabels(df_matrix.columns, rotation=90)
        fig.colorbar(im, ax=ax, label="Number of Days")

        ax.set_title(f"{case.upper()} – {cat_name}")
        ax.set_xlabel("Year")
        ax.set_ylabel("City")

        fig.tight_layout()
        fig.savefig(f"{out_dir}/{case}_{cat_name.translate(slug_table)}.png", dpi=150)
        # Close explicitly so figures do not accumulate across categories.
        plt.close(fig)


# ---- Run for AQHI (max values) ----
# Both AQHI variants share the same category table and colour map.
for case, col in {"aqhi": "max_aqhi", "aqhi_plus": "max_aqhi_plus"}.items():
    results = compute_heatmap(case, col, aqhi_categories)
    plot_heatmaps(results, case, cmap="Reds")

# ---- Run for PM2.5 ----
results_pm25 = compute_heatmap(case="pm25", col="max_pm25", categories=pm25_categories)
plot_heatmaps(results=results_pm25, case="pm25", cmap="YlOrBr")

Processing aqhi


100%|██████████| 42/42 [00:00<00:00, 86.04it/s]


Processing aqhi_plus


100%|██████████| 42/42 [00:00<00:00, 83.43it/s]


Processing pm25


100%|██████████| 42/42 [00:00<00:00, 64.17it/s]


In [10]:
# Sanity check: how often each max_aqhi value occurs in the Toronto daily file.
df = pd.read_csv(
    "../../data/processed/federal/daily-aqhi/Toronto.csv",
    index_col=0,
    parse_dates=True,
)
print(df.loc[:, "max_aqhi"].value_counts())

max_aqhi
3.0     3131
4.0     2307
5.0      885
2.0      729
6.0      291
7.0       89
8.0       27
9.0        4
1.0        1
10.0       1
11.0       1
Name: count, dtype: int64
