In [2]:
import os
import csv
import requests
from datetime import datetime, timedelta
from calendar import monthrange

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
from constants import ONTARIO_CITIES

## Parse and organize the pollutant data

Create each of the output files with empty values

In [6]:
# Hourly index covering 2003-01-01 00:00 through 2024-12-31 23:00, used as the
# row index of an all-empty template frame.
date_range = pd.date_range(start='2003-01-01', end='2024-12-31 23:00:00', freq='h')

columns = ['pm25', 'no2', 'o3', 'pm25_3h_avg', 'no2_3h_avg', 'o3_3h_avg', 'aqhi_raw', 'aqhi_plus_raw', 'aqhi', 'aqhi_plus']
df_empty = pd.DataFrame(index=date_range, columns=columns)

# Write one copy of the empty template per city.
# NOTE: this overwrites any existing file of the same name, so re-running this
# cell resets the processed hourly data for every city.
output_dir = '../../data/processed/ontario/hourly/'
os.makedirs(output_dir, exist_ok=True)  # the folder may not exist on a fresh checkout
for city in tqdm(ONTARIO_CITIES):
    filename = f"{output_dir}{city}.csv"
    df_empty.to_csv(filename)

100%|███████████████████████████████████████████| 38/38 [00:15<00:00,  2.40it/s]


Parse data for each year and each pollutant from 2003–2024

In [7]:
def load_city_df(city_name):
    """Read the processed hourly CSV for ``city_name``.

    The first CSV column is used as the index and parsed as dates.

    Args:
        city_name: File stem of the city's CSV in the processed hourly folder.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_location = f'../../data/processed/ontario/hourly/{city_name}.csv'
    city_frame = pd.read_csv(csv_location, index_col=0, parse_dates=True)
    return city_frame


def parse_city_block(city_data_rows, header_row):
    """Reshape one city's wide rows (a date plus H01..H24) into a long hourly frame.

    Only the first 27 cells of the header and of each row are used. Rows whose
    ``Date`` cannot be parsed are dropped. Each ``Hnn`` reading is stamped at
    ``Date + nn hours + 1 hour`` (so H01 lands on 02:00 of that date and H24 on
    01:00 of the next day). Readings equal to -999 or 9999, and anything
    non-numeric, become NaN.

    NOTE(review): the extra +1 hour is a deliberate choice in this notebook;
    confirm it against the data provider's hour-labelling convention.

    Args:
        city_data_rows: List of raw CSV rows (lists of strings) for one city.
        header_row: The CSV header row naming the ``Date`` and ``Hnn`` columns.

    Returns:
        DataFrame indexed by ``datetime`` with columns ``Date``, ``hour`` and
        ``value``; or ``None`` (after printing the error) if anything fails.
    """
    width = 27
    hour_columns = [f'H{h:02d}' for h in range(1, 25)]

    try:
        header = header_row[:width]
        wide = pd.DataFrame([cells[:width] for cells in city_data_rows])
        wide.columns = header

        # Keep the date and the 24 hourly columns; drop rows with unparseable dates
        wide = (
            wide[['Date'] + hour_columns]
            .assign(Date=lambda frame: pd.to_datetime(frame['Date'], errors='coerce'))
            .dropna(subset=['Date'])
        )

        long = wide.melt(id_vars='Date', var_name='hour', value_name='value')
        long['hour'] = long['hour'].str.extract(r'H(\d{2})', expand=False).astype(int)

        # Timestamp = date + hour number + one extra hour (see docstring)
        hour_offset = pd.to_timedelta(long['hour'], unit='h') + pd.Timedelta(hours=1)
        long['datetime'] = long['Date'] + hour_offset
        long = long.set_index('datetime')

        # Numeric readings; sentinel codes are treated as missing
        long['value'] = pd.to_numeric(long['value'], errors='coerce')
        is_sentinel = long['value'].isin([-999, 9999])
        long.loc[is_sentinel, 'value'] = np.nan

        return long

    except Exception as e:
        print(f"Error parsing city block: {e}")
        return None

def update_city_df(city_df, melted, pollutant):
    """Copy parsed readings into the ``pollutant`` column of ``city_df``.

    Timestamps in ``melted`` that are not present in ``city_df``'s index are
    ignored. ``city_df`` is modified in place and also returned.

    Args:
        city_df: The city's hourly DataFrame (datetime index).
        melted: Long-format frame with a datetime index and a ``value`` column.
        pollutant: Name of the column in ``city_df`` to fill.

    Returns:
        ``city_df``; if any step raises, the error is printed and ``city_df``
        is returned as it stands.
    """
    try:
        # Restrict the readings to timestamps the city frame actually has
        known = melted.index.isin(city_df.index)
        usable = melted[known]

        city_df.loc[usable.index, pollutant] = usable['value']
    except Exception as e:
        print(f"Error updating city DataFrame: {e}")
    return city_df


def process_city_block(city_name, header_row, city_data_rows, pollutant):
    """Load a city's CSV, merge one parsed block into it, and write it back.

    Nothing is written when the block cannot be parsed. Any exception is
    caught and reported with a printed message; the function returns None.

    Args:
        city_name: File stem of the city's processed hourly CSV.
        header_row: Header row of the raw block.
        city_data_rows: Data rows of the raw block.
        pollutant: Column of the city CSV that receives the values.
    """
    try:
        existing = load_city_df(city_name)
        parsed = parse_city_block(city_data_rows, header_row)
        if parsed is None:
            return

        merged = update_city_df(existing, parsed, pollutant)
        merged.to_csv(f'../../data/processed/ontario/hourly/{city_name}.csv')
    except Exception as e:
        print(f"❌ Failed to process {city_name}: {e}")

In [8]:
pollutants = ['pm25', 'o3', 'no2']
years = list(range(2003, 2025))


def _flush_city_block(current_city, header_row, city_data_rows, pollutant, file_path, final=False):
    """Hand a completed city block to process_city_block, if there is one.

    A block is complete only when a city name, a header row and at least one
    data row have all been collected. ``final`` selects the wording of the
    failure message for the last block of a file.
    """
    if not (current_city and city_data_rows and header_row):
        return
    try:
        process_city_block(current_city, header_row, city_data_rows, pollutant)
    except Exception as e:
        if final:
            print(f"❌ Failed to process final block for {current_city} in {file_path}: {e}")
        else:
            print(f"❌ Failed to process {current_city} in {file_path}: {e}")


# Each raw file is scanned as a sequence of city blocks: a line whose first cell
# is a known city name opens a block, a line with 'date' in its third cell is
# the block's header, and later lines with at least 27 cells are its data rows.
for pollutant in pollutants:
    for year in tqdm(years):
        file_path = f'../../data/raw/ontario/{pollutant}/{year}_{pollutant}.csv'

        if not os.path.exists(file_path):
            print(f"⚠️ File not found: {file_path}")
            continue

        current_city = None
        header_row = []
        city_data_rows = []

        with open(file_path, newline='', encoding='utf-8') as csvfile:
            for row in csv.reader(csvfile):
                if not row:
                    continue

                first_cell = row[0].strip().lower()

                if first_cell in ONTARIO_CITIES:
                    # A new city starts: finish the previous block, then reset
                    _flush_city_block(current_city, header_row, city_data_rows, pollutant, file_path)
                    current_city, header_row, city_data_rows = first_cell, [], []
                elif current_city:
                    if len(row) > 2 and row[2].strip().lower() == 'date':
                        header_row = row
                    elif header_row and len(row) >= 27:
                        city_data_rows.append(row)

            # The file ended while a block was still open
            _flush_city_block(current_city, header_row, city_data_rows, pollutant, file_path, final=True)

100%|███████████████████████████████████████████| 22/22 [13:54<00:00, 37.92s/it]
100%|███████████████████████████████████████████| 22/22 [15:15<00:00, 41.61s/it]
100%|███████████████████████████████████████████| 22/22 [15:00<00:00, 40.95s/it]


Use the OpenAQ downloaded data to append data for 2025

In [9]:
POLLUTANTS = ['pm25', 'no2', 'o3']

# Hourly, timezone-naive index for the whole of 2025
date_range_2025 = pd.date_range(start='2025-01-01', end='2025-12-31 23:00:00', freq='h')

# Build a 2025 frame for every city and splice it into the city's processed file
for city in tqdm(ONTARIO_CITIES):
    city_dir = f'../../data/raw/ontario/2025/{city}'

    # Empty template for this city's 2025 rows
    columns = ['pm25', 'no2', 'o3', 'pm25_3h_avg', 'no2_3h_avg', 'o3_3h_avg', 'aqhi']
    df_city_2025 = pd.DataFrame(index=date_range_2025, columns=columns)

    # Raw files are named 2025-1.csv .. 2025-7.csv; cities without a raw
    # directory keep the empty template.
    if os.path.isdir(city_dir):
        candidate_files = [os.path.join(city_dir, f'2025-{i}.csv') for i in range(1, 8)]
    else:
        candidate_files = []

    for file_path in candidate_files:
        if not os.path.exists(file_path):
            continue

        try:
            df_raw = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue

        # Only the three pollutants of interest
        wanted = df_raw[df_raw['parameter'].isin(POLLUTANTS)]

        for _, record in wanted.iterrows():
            try:
                # Drop the timezone so the stamp matches the naive 2025 index
                ts = pd.to_datetime(record['datetimeLocal']).tz_localize(None)

                if ts.year == 2025:
                    pollutant = record['parameter'].lower()
                    value = record['value']

                    # NO2 and O3 readings are scaled by 1000 before storing
                    if pollutant in ['no2', 'o3']:
                        value *= 1000

                    df_city_2025.at[ts, pollutant] = value
            except Exception as e:
                print(f"Error parsing row: {e}")
                continue

    # Replace whatever 2025 rows the processed file already has with the new ones
    processed_path = f'../../data/processed/ontario/hourly/{city}.csv'

    if os.path.exists(processed_path):
        df_all = pd.read_csv(processed_path, index_col=0, parse_dates=True)
        in_2025 = (df_all.index >= '2025-01-01') & (df_all.index <= '2025-12-31 23:00:00')
        df_all = pd.concat([df_all[~in_2025], df_city_2025])
    else:
        # No historical file: the output holds 2025 only
        df_all = df_city_2025

    # Keep rows in chronological order, then save
    df_all = df_all.sort_index()
    df_all.to_csv(processed_path)

100%|███████████████████████████████████████████| 38/38 [05:11<00:00,  8.20s/it]


## Compute rolling averages and AQHI

Compute and update CSVs with 3-hour rolling averages

In [4]:
POLLUTANTS = ['pm25', 'no2', 'o3']

# For every city file, (re)compute the `<pollutant>_3h_avg` columns as the mean
# over the current row and the two rows before it, rounded to one decimal.
# min_periods=1 means a single available reading is enough to produce a value.
for city in tqdm(ONTARIO_CITIES):
    city_path = f'../../data/processed/ontario/hourly/{city}.csv'

    try:
        df = pd.read_csv(city_path, index_col=0, parse_dates=True)
    except Exception as e:
        print(f"Error reading {city_path}: {e}")
        continue

    rolling_means = {
        f"{pollutant}_3h_avg": df[pollutant].rolling(window=3, min_periods=1).mean().round(1)
        for pollutant in POLLUTANTS
    }
    for avg_col, averaged in rolling_means.items():
        df[avg_col] = averaged

    # Write the file back with the new columns
    df.to_csv(city_path)

100%|███████████████████████████████████████████| 38/38 [00:43<00:00,  1.14s/it]


Compute and update CSVs with AQHI per-hour using rolling averages

In [5]:
def compute_mAQI(no2, o3):
    """Compute Ontario mAQI from 1-hour NO2 (ppb) and O3 (ppb).

    Each pollutant is mapped to a sub-index with a piecewise-linear formula and
    the result is the row-wise maximum of the two sub-indices (a missing
    sub-index is ignored; the result is NaN only when both are missing).

    The segments are contiguous: a value belongs to the first segment whose
    upper bound it does not exceed. Fractional concentrations that fall between
    two integer breakpoints (e.g. 110.5 ppb NO2) and negative readings therefore
    use the neighbouring segment instead of being scored with the top-segment
    formula.

    Args:
        no2: pandas Series of 1-hour NO2 concentrations.
        o3: pandas Series of 1-hour O3 concentrations (same index as ``no2``).

    Returns:
        pandas Series of un-rounded mAQI values.
    """

    def sub_index_no2(val):
        if pd.isna(val):
            return np.nan
        if val <= 110:
            return 0.02264 * val + 1.000
        if val <= 200:
            return 0.03360 * val - 0.2291
        if val <= 524:
            return 0.01235 * val + 4.017
        return 0.01810 * val + 1.000  # > 524

    def sub_index_o3(val):
        if pd.isna(val):
            return np.nan
        if val <= 50:
            return 0.04980 * val + 1.000
        if val <= 80:
            return 0.1031 * val - 1.758
        # The same coefficients are used for 81-149 and for values above 149.
        # NOTE(review): confirm that no separate formula is intended above 149.
        return 0.05868 * val + 1.747

    no2_idx = no2.apply(sub_index_no2)
    o3_idx = o3.apply(sub_index_o3)

    return pd.concat([no2_idx, o3_idx], axis=1).max(axis=1)  # keep decimals for AQHI+ comparison

def compute_aqhi(pm25_3h, no2_3h, o3_3h):
    """Compute federal AQHI from 3-hour averages.

    Each pollutant contributes ``exp(coefficient * concentration) - 1``; the
    three contributions are added, multiplied by 1000 and divided by 10.4.

    Args:
        pm25_3h: 3-hour average PM2.5 (scalar or pandas Series).
        no2_3h: 3-hour average NO2 (scalar or pandas Series).
        o3_3h: 3-hour average O3 (scalar or pandas Series).

    Returns:
        The un-rounded AQHI, with the same shape as the inputs.
    """
    no2_term = np.exp(0.000871 * no2_3h) - 1
    o3_term = np.exp(0.000537 * o3_3h) - 1
    pm25_term = np.exp(0.000487 * pm25_3h) - 1

    combined = no2_term + o3_term + pm25_term
    return (1000 * combined) / 10.4

# For every city file: compute AQHI and AQHI Plus per hour and store both the
# uncapped ("*_raw") and the display-capped (max 11, i.e. "10+") versions.
for city in tqdm(ONTARIO_CITIES):
    city_path = f'../../data/processed/ontario/hourly/{city}.csv'

    try:
        df = pd.read_csv(city_path, index_col=0, parse_dates=True)
    except Exception as e:
        print(f"Error reading {city_path}: {e}")
        continue

    # mAQI uses the 1-hour NO2 and O3 readings
    maqi = compute_mAQI(df['no2'], df['o3'])

    # Federal AQHI uses the 3-hour averages; kept un-rounded for comparisons
    aqhi_unrounded = compute_aqhi(df['pm25_3h_avg'], df['no2_3h_avg'], df['o3_3h_avg'])

    # Whole-number AQHI (nullable integers so missing hours stay missing)
    aqhi_whole = aqhi_unrounded.round().astype('Int64')

    # AQHI Plus starts from the rounded AQHI ...
    plus = aqhi_whole.copy()

    # ... step 1: mAQI replaces it when mAQI exceeds 6 and is either larger
    # than the un-rounded AQHI or the AQHI is missing
    use_maqi = (maqi > 6) & ((maqi > aqhi_unrounded) | aqhi_unrounded.isna())
    plus[use_maqi] = maqi[use_maqi].round().astype('Int64')

    # ... step 2: PM2.5 trigger (April 2024 change) - the 1-hour PM2.5 value
    # divided by 10 and rounded up replaces the index when it is larger.
    # NOTE(review): hours whose index is still missing at this point are
    # presumably left untouched by this comparison - confirm that is intended.
    pm25_trigger = np.ceil(df['pm25'] / 10)
    use_pm25 = pm25_trigger > plus
    plus[use_pm25] = pm25_trigger[use_pm25].astype('Int64')

    # Store uncapped values and the versions capped at 11 for display
    df['aqhi_raw'] = aqhi_whole
    df['aqhi'] = aqhi_whole.clip(upper=11)
    df['aqhi_plus_raw'] = plus
    df['aqhi_plus'] = plus.clip(upper=11)

    # Write the file back
    df.to_csv(city_path)

100%|███████████████████████████████████████████| 38/38 [00:47<00:00,  1.26s/it]


## Compute daily AQHI values

In [6]:
# Days that every daily file must contain, even when no hourly data exists
daily_dates = pd.date_range(start="2005-01-01", end="2025-12-31", freq="D")

# Build one daily summary CSV per city from its hourly AQHI Plus values.
# Output columns:
#   4pm_aqhi  - capped AQHI Plus of the first 16:00 row of the day
#   min_aqhi / max_aqhi / med_aqhi - over the day's non-missing capped values
#   mean_aqhi - mean of the uncapped values, rounded to 1 decimal, capped at 11
for city in tqdm(ONTARIO_CITIES):
    hourly_path = f'../../data/processed/ontario/hourly/{city}.csv'
    daily_path = f'../../data/processed/ontario/daily/{city}.csv'

    if not os.path.exists(hourly_path):
        print(f"No hourly file for {city}, skipping.")
        continue

    try:
        df_hourly = pd.read_csv(hourly_path, index_col=0, parse_dates=True)
    except Exception as e:
        print(f"Error reading {hourly_path}: {e}")
        continue

    if 'aqhi_plus' not in df_hourly.columns:
        print(f"No 'aqhi_plus' column for {city}, skipping.")
        continue
    if 'aqhi_plus_raw' not in df_hourly.columns:
        # Needed for the daily mean; skip instead of failing with a KeyError
        print(f"No 'aqhi_plus_raw' column for {city}, skipping.")
        continue

    # Calendar day (midnight timestamp) of every hourly row
    day = df_hourly.index.normalize()
    plus_capped = df_hourly['aqhi_plus'].astype(float)
    plus_uncapped = df_hourly['aqhi_plus_raw'].astype(float)

    # Aggregations skip missing hours; a day with no values at all yields NaN
    day_stats = plus_capped.groupby(day).agg(['min', 'max', 'median'])
    day_mean = plus_uncapped.groupby(day).mean().round(1).clip(upper=11)

    # First 16:00 row of each day (kept even when its value is missing)
    at_4pm = plus_capped[df_hourly.index.hour == 16]
    at_4pm.index = at_4pm.index.normalize()
    at_4pm = at_4pm[~at_4pm.index.duplicated(keep='first')]

    df_daily = pd.DataFrame({
        '4pm_aqhi': at_4pm,
        'min_aqhi': day_stats['min'],
        'max_aqhi': day_stats['max'],
        # Only report a mean on days that have at least one capped value
        'mean_aqhi': day_mean.where(day_stats['min'].notna()),
        'med_aqhi': day_stats['median'],
    })

    # Cover the full daily range plus any other day found in the hourly file
    # (the hourly files start before 2005), always in chronological order.
    # Assigning those earlier days one by one would append them after the last
    # date instead.
    df_daily = df_daily.reindex(daily_dates.union(df_daily.index))

    # Save daily file
    os.makedirs(os.path.dirname(daily_path), exist_ok=True)
    df_daily.to_csv(daily_path)

100%|███████████████████████████████████████████| 38/38 [03:32<00:00,  5.60s/it]


## Retrieve metadata

Use 2024 PM2.5 data to retrieve metadata on location and other details for each station

Compute the coverage of the data - percent of each pollutant in each month

In [None]:
def compute_coverage_for_city(city_name):
    """Return, per calendar month, the percentage of hours that have a value.

    For every month from 2003-01 through 2025-12 and every tracked column, the
    coverage is ``100 * non-null hourly rows in the month / hours in the
    month``, rounded to 2 decimals. The whole month is counted, including all
    24 hours of its last day. Months with no rows in the file, and columns
    absent from the file, get 0.0.

    Args:
        city_name: File stem of the city's processed hourly CSV.

    Returns:
        DataFrame of floats indexed by month-start timestamps, one column per
        tracked column.
    """
    # Load processed city data
    path = f'../../data/processed/ontario/hourly/{city_name}.csv'
    df = pd.read_csv(path, index_col=0, parse_dates=True)

    # Columns absent from the file are treated as entirely missing
    required_cols = ['pm25', 'no2', 'o3',
                     'pm25_3h_avg', 'no2_3h_avg', 'o3_3h_avg',
                     'aqhi', 'aqhi_plus', 'aqhi_raw', 'aqhi_plus_raw']
    for col in required_cols:
        if col not in df.columns:
            df[col] = np.nan

    # One row per month from 2003 through 2025
    month_index = pd.date_range(start='2003-01-01', end='2025-12-31', freq='MS')

    # Hours a complete month should contain (24 per calendar day)
    expected_hours = pd.Series(month_index.days_in_month * 24, index=month_index)

    # Count non-null values per calendar month, keyed by the month's first day
    month_key = df.index.to_period('M').to_timestamp()
    observed = df[required_cols].notna().groupby(month_key).sum()
    observed = observed.reindex(month_index, fill_value=0)

    coverage = observed.div(expected_hours, axis=0).mul(100).round(2)
    return coverage


# Example: monthly coverage table for the 'toronto downtown' city file;
# only the first five months are displayed.
coverage_toronto = compute_coverage_for_city(city_name='toronto downtown')
coverage_toronto.head(5)
