In [1]:
import os
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
from constants import ONTARIO_CITIES

For each of the high and very high AQHI categories, produce one CSV in which the rows are the cities and the columns are the years from 2003 onward.

Each cell holds the number of days in that year on which the city's daily AQHI value fell within the category's range. We default to the daily maximum AQHI, but other daily statistics could be used instead.

In [11]:
# --- Configuration -----------------------------------------------------------
results_dir = "../../data/results/num_aqhi_days"
os.makedirs(results_dir, exist_ok=True)

# Input locations; one CSV per city in each directory.
DAILY_DIR = "../../data/processed/ontario/daily"
COVERAGE_DIR = "../../data/processed/ontario/metadata/coverage/yearly"

# A (city, year) cell is only reported when that year's AQHI coverage value is
# strictly greater than this threshold (the ">50%" rule).
MIN_COVERAGE = 50

# Define only the categories we care about, as inclusive (low, high) bounds.
# NOTE(review): a fractional value strictly between 10 and 11 would fall in
# neither category. That is harmless if max_aqhi is always a whole number --
# confirm against the processed daily files.
aqhi_categories = {
    "high": (7, 10),
    "very_high": (11, np.inf)
}


# --- Helpers -----------------------------------------------------------------
def _load_daily_max_aqhi(daily_path):
    """Read a city's daily CSV and return its ``max_aqhi`` column as numbers.

    The first CSV column is parsed as the (date) index and the rows are sorted
    by it. Values that cannot be parsed as numbers become NaN, so they never
    match any category.
    """
    daily = pd.read_csv(daily_path, index_col=0, parse_dates=True).sort_index()
    return pd.to_numeric(daily["max_aqhi"], errors="coerce")


def _load_yearly_coverage(coverage_path):
    """Read a city's yearly coverage CSV and return its ``aqhi`` column by year.

    The file's index is parsed as dates and replaced with the integer year so
    it lines up with the per-year day counts.
    """
    cov = pd.read_csv(coverage_path, index_col=0, parse_dates=True)
    # Convert index from dates (YYYY-01-01) -> year integer
    cov.index = cov.index.year
    return cov["aqhi"]


def _count_days_in_range(max_aqhi, low, high):
    """Count, per calendar year, the days with ``low <= max_aqhi <= high``.

    Years without any matching day are absent from the result.
    """
    in_range = max_aqhi[(max_aqhi >= low) & (max_aqhi <= high)]
    return in_range.groupby(in_range.index.year).size()


def _apply_coverage_rule(counts, coverage, min_coverage=MIN_COVERAGE):
    """Keep counts only for years whose coverage exceeds ``min_coverage``.

    The result spans every year present in either input. A sufficiently
    covered year with no matching days gets 0; a year whose coverage is
    missing, NaN, or not above the threshold gets NaN.
    """
    years = counts.index.union(coverage.index)
    day_counts = counts.reindex(years, fill_value=0)
    # Missing coverage reindexes to NaN, and NaN > threshold is False, which
    # matches treating an absent year as insufficiently covered.
    well_covered = coverage.reindex(years) > min_coverage
    return day_counts.where(well_covered).rename_axis(None)


# --- Per-city counts ---------------------------------------------------------
category_results = {cat: {} for cat in aqhi_categories}

for city in tqdm(sorted(ONTARIO_CITIES)):
    daily_path = f"{DAILY_DIR}/{city}.csv"
    coverage_path = f"{COVERAGE_DIR}/{city}.csv"

    # Both files are required; skip the city (leaving its row blank) otherwise.
    if not (os.path.exists(daily_path) and os.path.exists(coverage_path)):
        print(f"⚠️ Missing data for {city}")
        continue

    max_aqhi = _load_daily_max_aqhi(daily_path)
    coverage = _load_yearly_coverage(coverage_path)

    for cat, (low, high) in aqhi_categories.items():
        counts = _count_days_in_range(max_aqhi, low, high)
        category_results[cat][city] = _apply_coverage_rule(counts, coverage)

# --- Save one table per category ---------------------------------------------
for cat, city_data in category_results.items():
    df_cat = pd.DataFrame(city_data).T  # cities as rows
    # Reindexing on the full city list keeps skipped cities as empty rows.
    df_cat = df_cat.reindex(sorted(ONTARIO_CITIES))  # alphabetical
    df_cat = df_cat.reindex(sorted(df_cat.columns), axis=1)  # years sorted

    out_path = f"{results_dir}/{cat}.csv"
    # "%.0f" writes the float-typed counts without a decimal part.
    df_cat.to_csv(out_path, float_format="%.0f")
    print(f"Saved {out_path}")

100%|███████████████████████████████████████████| 38/38 [00:00<00:00, 76.59it/s]

Saved ../../data/results/num_aqhi_days/high.csv
Saved ../../data/results/num_aqhi_days/very_high.csv



