In [1]:
import os
import json
import csv
import requests
from datetime import datetime, timedelta
from dateutil import parser
from calendar import monthrange

import pandas as pd
import numpy as np
from tqdm import tqdm
from openaq import OpenAQ

In [2]:
from constants import FEDERAL_LOCATION_IDS

# City names are the keys of the FEDERAL_LOCATION_IDS mapping.
FEDERAL_CITIES = [*FEDERAL_LOCATION_IDS]
# Calendar years 2005 through 2025 inclusive.
years = [*range(2005, 2026)]

# Directory holding one yearly-coverage CSV per city.
COVERAGE_DIR = "../../data/processed/federal/metadata/coverage/yearly"
# Minimum yearly coverage required for a year to be reported.
COVERAGE_THRESHOLD = 50  # percent

# Only the ranges needed for the CSV outputs.
# Layout: {metric: {category_name: (lower_bound, upper_bound)}}.
categories_to_save = dict(
    aqhi={
        "7plus": (7, np.inf),
        "11plus": (11, np.inf),
    },
    aqhi_plus={
        "7plus": (7, np.inf),
        "11plus": (11, np.inf),
    },
    pm25={
        "55_5plus": (55.5, np.inf),
        "250_5plus": (250.5, np.inf),
    },
)

In [3]:
def compute_num_days(case, col, categories):
    """
    Count, per city and per year, the days whose value falls in each category.

    Parameters
    ----------
    case : str
        Metric name. Used as the column looked up in the city's coverage CSV
        and as the suffix of the daily-data directory ("aqhi_plus" is read
        from the "daily-aqhi-plus" directory).
    col : str
        Column of the daily CSV holding the value to classify.
    categories : dict
        Mapping {category_name: (low, high)}. A day belongs to a category when
        low <= value <= high; a non-finite `high` means "no upper bound".

    Returns
    -------
    dict
        {category_name: DataFrame(cities x years)} of float day counts.
        Years whose coverage for `case` is below COVERAGE_THRESHOLD are NaN
        for that city, including when the city has no days in the category.

    Notes
    -----
    Cities that are skipped (missing daily or coverage file, missing `col`,
    no numeric values, or a coverage-file error) keep 0 for every year.
    NOTE(review): NaN may be the more honest value for those cities - confirm
    with the consumers of the CSVs before changing it.
    """
    # Float frames from the start: NaN marks excluded years, and assigning NaN
    # into an int column relies on silent upcasting that pandas has deprecated.
    results = {
        cat: pd.DataFrame(0.0, index=FEDERAL_CITIES, columns=years)
        for cat in categories
    }

    print(f"Processing {case}")

    # Loop-invariant: the AQHI+ daily files live in a hyphenated directory.
    dir_end = "aqhi-plus" if case == "aqhi_plus" else case

    for city in tqdm(FEDERAL_CITIES):
        daily_path = f"../../data/processed/federal/daily-{dir_end}/{city}.csv"
        coverage_path = f"{COVERAGE_DIR}/{city}.csv"

        # Skip if either file missing
        if not os.path.exists(daily_path) or not os.path.exists(coverage_path):
            continue

        # Load daily data (first column is parsed as the date index)
        df_daily = pd.read_csv(daily_path, index_col=0, parse_dates=True)
        if col not in df_daily.columns:
            continue
        series = pd.to_numeric(df_daily[col], errors="coerce").dropna()
        if series.empty:
            continue

        # Load coverage metadata and collect the years meeting the threshold
        try:
            df_cov = pd.read_csv(coverage_path, index_col=0)
            meets_threshold = df_cov[case] >= COVERAGE_THRESHOLD
            valid_years = set(df_cov.index[meets_threshold].astype(int).tolist())
        except Exception as e:
            print(f"⚠️ Skipping {city} (coverage load error: {e})")
            continue

        invalid_years = [year for year in years if year not in valid_years]

        for cat_name, (low, high) in categories.items():
            mask = series >= low
            if np.isfinite(high):
                mask &= series <= high
            filtered = series[mask]

            # Reindex onto the report years: years with no matching days
            # become 0 and data years outside `years` are dropped instead of
            # growing the frame with extra columns.
            yearly_counts = (
                filtered.groupby(filtered.index.year)
                .size()
                .reindex(years, fill_value=0)
                .astype(float)
            )

            # Mask low-coverage years unconditionally. This must also happen
            # when no day matched the category; otherwise those years would
            # be reported as a genuine 0.
            yearly_counts.loc[invalid_years] = np.nan

            results[cat_name].loc[city, years] = yearly_counts.to_numpy()

    for cat in results:
        results[cat] = results[cat].rename(index={'Metro Van - Vancouver': 'Vancouver'})

    return results

In [4]:
# Output directory
out_dir = "../../data/results/federal_num_days"
os.makedirs(out_dir, exist_ok=True)

# (case, daily-file column) pairs. `case` is also the key into
# categories_to_save and the prefix of each output CSV filename.
metric_columns = [
    ("aqhi", "max_aqhi"),            # ---- AQHI ----
    ("aqhi_plus", "max_aqhi_plus"),  # ---- AQHI+ ----
    ("pm25", "max_pm25"),            # ---- PM2.5 ----
]

# One compute + save pass per metric, in the order listed above.
results_by_case = {}
n_files = 0
for case, col in metric_columns:
    results_by_case[case] = compute_num_days(case, col, categories_to_save[case])
    for cat, df in results_by_case[case].items():
        df.to_csv(f"{out_dir}/{case}_{cat}.csv")
        n_files += 1

# Keep the per-metric names available for any later cell that uses them.
aqhi_results = results_by_case["aqhi"]
aqhi_plus_results = results_by_case["aqhi_plus"]
pm25_results = results_by_case["pm25"]

# Report the number of files actually written rather than a hard-coded count.
print(f"✅ Finished generating all {n_files} CSV files!")

Processing aqhi


100%|██████████| 42/42 [00:00<00:00, 77.71it/s]


Processing aqhi_plus


100%|██████████| 42/42 [00:00<00:00, 73.36it/s]


Processing pm25


100%|██████████| 42/42 [00:00<00:00, 79.91it/s]


✅ Finished generating all 6 CSV files!


In [3]:
results_dir = "../../data/results/federal_num_days"
output_dir = results_dir

# Loop through each CSV file; sorted so the processing order (and the log
# below) is the same on every run, which os.listdir alone does not guarantee.
for csv_file in sorted(os.listdir(results_dir)):
    # Split off the real extension instead of str.replace, which would also
    # rewrite a ".csv" occurring earlier in the filename.
    stem, ext = os.path.splitext(csv_file)
    if ext != ".csv":
        continue

    csv_path = os.path.join(results_dir, csv_file)
    json_path = os.path.join(output_dir, stem + ".json")

    # Read CSV with city as index (first column)
    df = pd.read_csv(csv_path, index_col=0)

    # Replace missing values with None so they serialise as JSON null.
    # Casting to object first keeps None from being coerced back to NaN.
    df = df.astype(object).where(df.notna(), None)

    # Convert to dict of dicts - {city: {col1: val1, col2: val2, ...}}
    json_dict = df.to_dict(orient="index")

    # Serialise before opening the file so a failure cannot leave a truncated
    # JSON behind. allow_nan=False rejects NaN/Infinity, which are not valid JSON.
    payload = json.dumps(json_dict, indent=4, ensure_ascii=False, allow_nan=False)

    # Write to JSON
    with open(json_path, "w", encoding="utf-8") as f:
        f.write(payload)

    print(f"✅ Saved: {json_path}")

print("🎉 All summary JSONs generated successfully.")


✅ Saved: ../../data/results/federal_num_days/pm25_55_5plus.json
✅ Saved: ../../data/results/federal_num_days/pm25_250_5plus.json
✅ Saved: ../../data/results/federal_num_days/aqhi_7plus.json
✅ Saved: ../../data/results/federal_num_days/aqhi_plus_7plus.json
✅ Saved: ../../data/results/federal_num_days/aqhi_plus_11plus.json
✅ Saved: ../../data/results/federal_num_days/aqhi_11plus.json
🎉 All summary JSONs generated successfully.
