In [1]:
import os
import json
import csv
import requests
from datetime import datetime, timedelta
from dateutil import parser
from calendar import monthrange

import pandas as pd
import numpy as np
from tqdm import tqdm
from openaq import OpenAQ

In [2]:
from constants import FEDERAL_LOCATION_IDS

# Cities to process: one entry per key of the federal location-id mapping.
FEDERAL_CITIES = list(FEDERAL_LOCATION_IDS)
# Calendar years 2005 through 2025 (inclusive).
years = [*range(2005, 2026)]

# Directory holding one yearly-coverage CSV per city.
COVERAGE_DIR = "../../data/processed/federal/metadata/coverage/yearly"
# Minimum yearly coverage, in percent, for a year to be included (>= 50%).
COVERAGE_THRESHOLD = 50

# Result files are written here; create the directory up front if it is missing.
OUTPUT_DIR = "../../data/results/federal_pct_pollutant"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
def compute_aqhi(pm25_3h, no2_3h, o3_3h):
    """Return the federal AQHI for the given 3-hour average concentrations.

    Each pollutant contributes ``exp(coefficient * concentration) - 1``; the
    three terms are added, scaled by 1000 and divided by 10.4. Arguments may
    be scalars or NumPy/pandas objects; the result follows their shape.
    """
    no2_term = np.exp(0.000871 * no2_3h) - 1
    o3_term = np.exp(0.000537 * o3_3h) - 1
    pm25_term = np.exp(0.000487 * pm25_3h) - 1

    # Summed in the order NO2 + O3 + PM2.5
    combined = no2_term + o3_term + pm25_term
    return 1000 * combined / 10.4


In [4]:
def compute_pct_contrib(df):
    """
    Add each pollutant's hourly percentage share of AQHI to a copy of df.

    df must provide the columns pm25_3h_avg, no2_3h_avg, o3_3h_avg and
    aqhi_raw. The input is left untouched; the returned copy gains the
    columns pct_pm25, pct_no2 and pct_o3. Rows whose aqhi_raw is not
    strictly positive keep NaN in all three new columns.
    """
    out = df.copy()

    # AQHI each pollutant would yield on its own (the other two held at 0)
    single_pollutant_aqhi = {
        'pm25': compute_aqhi(out['pm25_3h_avg'], 0, 0),
        'no2': compute_aqhi(0, out['no2_3h_avg'], 0),
        'o3': compute_aqhi(0, 0, out['o3_3h_avg']),
    }

    # Only divide where the total AQHI is positive (avoids division by zero)
    positive = out['aqhi_raw'] > 0
    total_aqhi = out.loc[positive, 'aqhi_raw']

    # Start every share column as NaN, then fill the rows that can be computed
    for name in single_pollutant_aqhi:
        out[f'pct_{name}'] = np.nan
    for name, partial_aqhi in single_pollutant_aqhi.items():
        out.loc[positive, f'pct_{name}'] = partial_aqhi[positive] / total_aqhi * 100

    return out

In [5]:
def compute_yearly_pct_contrib(case='all'):
    """
    Compute average yearly percent contribution for each pollutant per city.

    For every city in FEDERAL_CITIES the hourly CSV and the yearly coverage
    CSV are read. A city is skipped (its row stays NaN) when either file is
    missing, the hourly table is empty or lacks an 'aqhi_raw' column, or the
    coverage file cannot be read/interpreted. Only years whose 'aqhi'
    coverage is >= COVERAGE_THRESHOLD are averaged.

    Parameters:
    - case: which hours enter the yearly mean
        'all'    -> every hour
        'medium' -> only hours with aqhi_raw >= 4
        'high'   -> only hours with aqhi_raw >= 7

    Returns:
    - dict: {pollutant: DataFrame(cities x years)}, values rounded to two
      decimals, with the 'Metro Van - Vancouver' row renamed to 'Vancouver'.

    Raises:
    - ValueError: if case is not 'all', 'medium' or 'high'.
    """
    # Minimum aqhi_raw an hour must reach to be kept; None keeps every hour.
    min_aqhi_by_case = {'all': None, 'medium': 4, 'high': 7}
    if case not in min_aqhi_by_case:
        # An unknown case used to fall through silently to 'all'.
        raise ValueError(
            f"Unknown case {case!r}; expected one of {sorted(min_aqhi_by_case)}"
        )
    min_aqhi = min_aqhi_by_case[case]

    pollutants = ['pm25', 'no2', 'o3']
    results = {poll: pd.DataFrame(np.nan, index=FEDERAL_CITIES, columns=years) for poll in pollutants}

    for city in tqdm(FEDERAL_CITIES):
        city_path = f"../../data/processed/federal/hourly/{city}.csv"
        coverage_path = f"{COVERAGE_DIR}/{city}.csv"

        if not os.path.exists(city_path) or not os.path.exists(coverage_path):
            continue

        # Load data
        df = pd.read_csv(city_path, index_col=0, parse_dates=True)
        if df.empty or 'aqhi_raw' not in df.columns:
            continue

        # Load coverage metadata
        try:
            df_cov = pd.read_csv(coverage_path, index_col=0)
            valid_years = set(
                df_cov.index[df_cov['aqhi'] >= COVERAGE_THRESHOLD].astype(int).tolist()
            )
        except Exception as exc:
            # Still best-effort (the city is skipped), but Ctrl-C is no longer
            # swallowed and the reason for the skip is reported.
            tqdm.write(f"Skipping {city}: could not use coverage file {coverage_path} ({exc!r})")
            continue

        # Compute hourly percent contributions
        df = compute_pct_contrib(df)

        if min_aqhi is not None:
            df = df[df['aqhi_raw'] >= min_aqhi]

        # Year of every remaining row, computed once instead of once per year
        row_years = df.index.year

        # Compute yearly averages
        for year in years:
            if year not in valid_years:
                continue
            mask = row_years == year
            if not mask.any():
                continue
            for poll in pollutants:
                results[poll].loc[city, year] = df.loc[mask, f'pct_{poll}'].mean()

    for poll in pollutants:
        results[poll] = results[poll].round(2)
        results[poll] = results[poll].rename(index={'Metro Van - Vancouver': 'Vancouver'})

    return results

In [6]:
# Cases: every hour ('all'), medium AQHI (>= 4) and high AQHI (>= 7).
# Each value is the tag appended to the output file name ('' means no tag).
cases = {'all': '', 'high': 'high', 'medium': 'medium'}

for case_name, suffix in cases.items():
    yearly_results = compute_yearly_pct_contrib(case=case_name)

    # '' for the untagged 'all' case, otherwise '_high' / '_medium'
    path_suffix = f'_{suffix}' if suffix else ''

    # One CSV (cities x years) per pollutant and case
    for poll, pct_table in yearly_results.items():
        csv_path = os.path.join(OUTPUT_DIR, f"{poll}_pct{path_suffix}.csv")
        pct_table.to_csv(csv_path)
        print(f"✅ Saved: {csv_path}")

100%|██████████| 42/42 [00:12<00:00,  3.46it/s]


✅ Saved: ../../data/results/federal_pct_pollutant/pm25_pct.csv
✅ Saved: ../../data/results/federal_pct_pollutant/no2_pct.csv
✅ Saved: ../../data/results/federal_pct_pollutant/o3_pct.csv


100%|██████████| 42/42 [00:09<00:00,  4.35it/s]


✅ Saved: ../../data/results/federal_pct_pollutant/pm25_pct_high.csv
✅ Saved: ../../data/results/federal_pct_pollutant/no2_pct_high.csv
✅ Saved: ../../data/results/federal_pct_pollutant/o3_pct_high.csv


100%|██████████| 42/42 [00:11<00:00,  3.77it/s]

✅ Saved: ../../data/results/federal_pct_pollutant/pm25_pct_medium.csv
✅ Saved: ../../data/results/federal_pct_pollutant/no2_pct_medium.csv
✅ Saved: ../../data/results/federal_pct_pollutant/o3_pct_medium.csv





In [7]:
# Convert every CSV in OUTPUT_DIR to a JSON file with the same base name.
# sorted() gives a reproducible processing order (os.listdir order is arbitrary).
for csv_file in sorted(os.listdir(OUTPUT_DIR)):
    if not csv_file.endswith(".csv"):
        continue

    csv_path = os.path.join(OUTPUT_DIR, csv_file)
    # Swap only the trailing extension; str.replace would also rewrite any
    # ".csv" occurring earlier in the file name.
    json_name = csv_file[:-len(".csv")] + ".json"
    json_path = os.path.join(OUTPUT_DIR, json_name)

    # Read CSV with first column as index (assumed to be city)
    df = pd.read_csv(csv_path, index_col=0)

    # Replace NaN with None so missing values are written as JSON null
    df = df.replace({np.nan: None})

    # Convert to dict of dicts — {city: {col1: val1, col2: val2, ...}}
    json_dict = df.to_dict(orient="index")

    # Write to JSON
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(json_dict, f, indent=4, ensure_ascii=False)

    print(f"✅ Saved: {json_path}")


✅ Saved: ../../data/results/federal_pct_pollutant/no2_pct_medium.json
✅ Saved: ../../data/results/federal_pct_pollutant/pm25_pct_medium.json
✅ Saved: ../../data/results/federal_pct_pollutant/o3_pct_high.json
✅ Saved: ../../data/results/federal_pct_pollutant/o3_pct.json
✅ Saved: ../../data/results/federal_pct_pollutant/pm25_pct.json
✅ Saved: ../../data/results/federal_pct_pollutant/o3_pct_medium.json
✅ Saved: ../../data/results/federal_pct_pollutant/no2_pct_high.json
✅ Saved: ../../data/results/federal_pct_pollutant/pm25_pct_high.json
✅ Saved: ../../data/results/federal_pct_pollutant/no2_pct.json
