In [3]:
import os
import csv
import requests
from datetime import datetime, timedelta
from calendar import monthrange

import pandas as pd
import numpy as np
from tqdm import tqdm
from openaq import OpenAQ

In [2]:
from constants import FEDERAL_CITIES
from constants import FEDERAL_LOCATION_IDS

API_KEY = "be9dde17e764a66a5352413ec62ab925e5b89a58d8fdc4711fc5ad460cdbd29c"

Parse pollutant data from 2005 to 2023

In [4]:
# Load station-city mapping (preferred stations per city-month)
df_station_map = pd.read_csv('../../data/raw/federal/metadata/station_cities.csv')

df_station_map = df_station_map.rename(columns={'Unnamed: 0': 'month'})
df_station_map['month'] = pd.to_datetime(df_station_map['month'])
df_station_map = df_station_map.set_index('month')

federal_cities = df_station_map.columns.tolist()

# Create full hourly date range from 2005-01-01 to 2025-12-31 23:00
date_range = pd.date_range(start="2005-01-01", end="2025-12-31 23:00:00", freq="h")
columns = ['pm25', 'no2', 'o3', 'pm25_3h_avg', 'no2_3h_avg', 'o3_3h_avg', 'aqhi_raw', 'aqhi_plus_raw', 'aqhi', 'aqhi_plus']
df_empty = pd.DataFrame(index=date_range, columns=columns, dtype=float)

output_dir = '../../data/processed/federal/hourly/'
os.makedirs(output_dir, exist_ok=True)

for city in tqdm(federal_cities):
    filename = f"{output_dir}{city}.csv"
    df_empty.to_csv(filename)
    # print(f"Initialized {filename}")

  0%|          | 0/42 [00:00<?, ?it/s]

100%|██████████| 42/42 [00:32<00:00,  1.31it/s]


In [19]:
pollutants = {'NO2': 'no2', 'O3': 'o3', 'PM25': 'pm25'}
years = range(2005, 2024)  # 2005–2023

for pollutant, colname in pollutants.items():
    print(f"Processing {pollutant} → {colname}")

    for year in tqdm(years):
        file_path = f'../../data/raw/federal/{pollutant}/{pollutant}_{year}.csv'
        if not os.path.exists(file_path):
            continue

        # Read CSV, headers start after 7 lines
        df = pd.read_csv(file_path, skiprows=7)

        # Extract hourly columns
        hour_cols = [c for c in df.columns if c.startswith('H')]

        # Melt into long format: one row per hour
        melted = df.melt(
            id_vars=['Date//Date', 'NAPS ID//Identifiant SNPA'],
            value_vars=hour_cols,
            var_name='hour',
            value_name=colname
        )

        # Clean datetime
        melted['Date//Date'] = pd.to_datetime(melted['Date//Date'])
        melted['hour'] = melted['hour'].str.extract(r'H(\d+)').astype(int)
        melted['datetime'] = (
            melted['Date//Date']
            + pd.to_timedelta(melted['hour'], unit='h')
            + pd.Timedelta(hours=1)
        )
        melted = melted.drop(columns=['hour', 'Date//Date'])

        # Filter invalid values
        melted[colname] = melted[colname].apply(
            lambda x: np.nan if pd.isna(x) or x in [-999, 9999] else x
        )

        # Group by station ID for fast lookup
        station_groups = dict(tuple(melted.groupby('NAPS ID//Identifiant SNPA')))

        # Loop through months in this year
        year_months = pd.date_range(start=f"{year}-01-01", end=f"{year}-12-31", freq="MS")
        for month in year_months:
            if month not in df_station_map.index:
                continue

            for city in federal_cities:
                station_id = df_station_map.loc[month, city]
                if pd.isna(station_id):
                    continue  # no station for this city this month
                
                print(month, city, station_id)

                station_id = int(station_id)  # normalize ID
                if station_id not in station_groups:
                    continue  # station not found in file

                sub = station_groups[station_id]
                mask = sub['datetime'].dt.to_period("M") == month.to_period("M")
                values = sub.loc[mask, ['datetime', colname]]
                if values.empty:
                    continue

                # Load this city's CSV (datetime index restored automatically)
                city_file = f"{output_dir}{city}.csv"
                df_city = pd.read_csv(city_file, index_col=0, parse_dates=True)

                # Assign new values
                df_city.loc[values['datetime'], colname] = values[colname].values

                # Save back to disk
                df_city.to_csv(city_file)


Processing NO2 → no2


  0%|          | 0/19 [00:00<?, ?it/s]

2005-01-01 00:00:00 St. John's 10102
2005-01-01 00:00:00 Halifax 30113
2005-01-01 00:00:00 Fredericton 40103
2005-01-01 00:00:00 Montréal 50103
2005-01-01 00:00:00 Quebec 50308
2005-01-01 00:00:00 Sherbrooke 50404
2005-01-01 00:00:00 Saguenay 50504
2005-01-01 00:00:00 Trois Rivières 50801
2005-01-01 00:00:00 Ottawa 60104
2005-01-01 00:00:00 Windsor 60204
2005-01-01 00:00:00 Toronto 60433
2005-01-01 00:00:00 Hamilton 60513
2005-01-01 00:00:00 Sudbury 60609
2005-01-01 00:00:00 Sault Ste. Marie 60709
2005-01-01 00:00:00 Thunder Bay 60809
2005-01-01 00:00:00 London 60903
2005-01-01 00:00:00 St. Catharines 61302
2005-01-01 00:00:00 North Bay 62001
2005-01-01 00:00:00 Barrie 65001
2005-01-01 00:00:00 Winnipeg 70119
2005-01-01 00:00:00 Brandon 70203
2005-01-01 00:00:00 Regina 80110
2005-01-01 00:00:00 Saskatoon 80211
2005-01-01 00:00:00 Prince Albert 80402
2005-01-01 00:00:00 Edmonton 90130
2005-01-01 00:00:00 Calgary 90222
2005-01-01 00:00:00 Lethbridge 90502
2005-01-01 00:00:00 Fort Mcmurra

  0%|          | 0/19 [00:39<?, ?it/s]


KeyboardInterrupt: 

Use OpenAQ to query for pollutant data over 2024 and 2025

In [11]:
client = OpenAQ(api_key=API_KEY)

sensor_records = []

for city, location_id in tqdm(FEDERAL_LOCATION_IDS.items()):
    if location_id is None:
        sensor_records.append([city, '', '', '', '', ''])
        continue

    sensors = client.locations.sensors(location_id).results

    for sensor in sensors:
        if sensor.parameter['name'] in ['pm25', 'o3', 'no2']:
            sensor_records.append([
                city, 
                location_id, 
                sensor.id,
                sensor.parameter['name'],
                sensor.datetime_first['local'],
                sensor.datetime_last['local'],
            ])

df = pd.DataFrame.from_records(sensor_records, columns=['city', 'location_id', 'sensor_id', 'pollutant', 'date_first', 'date_last'])
df.to_csv('../../data/raw/federal/metadata/sensors.csv', index=False)

100%|██████████| 42/42 [00:06<00:00,  6.24it/s]


In [12]:
for city, location_id, sensor_id, pollutant, date_start, date_end in sensor_records:
    if not sensor_id:
        continue

    

St. John's


In [None]:
client = OpenAQ(api_key=API_KEY)
response = client.measurements.list(
    sensors_id=25148, 
    limit=1000,
    page=13,
    datetime_from='2024-01-01',  # need to do 2024-2025, then 2025-2026 
    datetime_to='2026-01-01',
)

In [24]:
response.results

[]

In [12]:
from openaq import OpenAQ
import pandas as pd
from datetime import datetime

# Initialize client with API key
API_KEY = "0d33157cc065493f21b56c03c868fead2ba85358f4c6d1cffa7e1e91dd560f50"
client = OpenAQ(api_key=API_KEY)

# Station ID
location_id = 774

# Date range
start_date = '2024-01-01T00:00:00Z'
end_date = datetime.utcnow().isoformat() + "Z"

# Function to fetch all measurements with pagination
def fetch_all_measurements(location_id, start_date, end_date, limit=1000):
    page = 1
    all_results = []

    while True:
        response = client.measurements.list(
            location_id=location_id,  # Corrected parameter name
            date_from=start_date,
            date_to=end_date,
            limit=limit,
            page=page
        )
        results = response['results']
        if not results:
            break
        
        all_results.extend(results)
        print(f"Fetched page {page} with {len(results)} records")
        page += 1

    return all_results

# Fetch data
data = fetch_all_measurements(location_id, start_date, end_date)

# Convert to DataFrame
df = pd.DataFrame(data)

# Optional: show first few rows
print(df.head())

# Save to CSV
df.to_csv("station_774_measurements.csv", index=False)


  end_date = datetime.utcnow().isoformat() + "Z"


TypeError: Measurements.list() got an unexpected keyword argument 'location_id'