In [20]:
import os
import csv
import requests
from datetime import datetime, timedelta
from calendar import monthrange

import pandas as pd
import numpy as np
from tqdm import tqdm
from openaq import OpenAQ

In [21]:
from constants import FEDERAL_CITIES
from constants import FEDERAL_LOCATION_IDS

API_KEY = "be9dde17e764a66a5352413ec62ab925e5b89a58d8fdc4711fc5ad460cdbd29c"

Parse pollutant data from 2005 to 2023

In [22]:
# Load station-city mapping (preferred stations per city-month)
df_station_map = pd.read_csv('../../data/raw/federal/metadata/station_cities.csv')
df_station_map = df_station_map.rename(columns={'Unnamed: 0': 'month'})
df_station_map['month'] = pd.to_datetime(df_station_map['month'])
df_station_map = df_station_map.set_index('month')

federal_cities = df_station_map.columns.tolist()

# Create full hourly date range from 2005-01-01 to 2025-12-31 23:00
date_range = pd.date_range(start="2005-01-01", end="2025-12-31 23:00:00", freq="h")
columns = ['pm25', 'no2', 'o3', 'pm25_3h_avg', 'no2_3h_avg', 'o3_3h_avg',
           'aqhi_raw', 'aqhi_plus_raw', 'aqhi', 'aqhi_plus']

df_empty = pd.DataFrame(index=date_range, columns=columns, dtype=float)

output_dir = '../../data/processed/federal/hourly/'
os.makedirs(output_dir, exist_ok=True)

# Initialize each city file
for city in tqdm(federal_cities):
    filename = f"{output_dir}{city}.csv"
    df_empty.to_csv(filename)

100%|██████████| 42/42 [00:30<00:00,  1.36it/s]


In [23]:
pollutants = {'NO2': 'no2', 'O3': 'o3', 'PM25': 'pm25'}
years = range(2005, 2024)  # 2005–2023

for pollutant, colname in pollutants.items():
    print(f"\n=== Processing {pollutant} → {colname} ===")

    for year in tqdm(years):
        file_path = f'../../data/raw/federal/{pollutant}/{pollutant}_{year}.csv'
        if not os.path.exists(file_path):
            continue

        # Read raw pollutant-year file
        df = pd.read_csv(file_path, skiprows=7)

        # Extract hourly columns
        hour_cols = [c for c in df.columns if c.startswith('H')]

        # Melt to long format
        melted = df.melt(
            id_vars=['Date//Date', 'NAPS ID//Identifiant SNPA'],
            value_vars=hour_cols,
            var_name='hour',
            value_name=colname
        )

        # Parse datetime
        melted['Date//Date'] = pd.to_datetime(melted['Date//Date'])
        melted['hour'] = melted['hour'].str.extract(r'H(\d+)').astype(int)
        melted['datetime'] = (
            melted['Date//Date'] +
            pd.to_timedelta(melted['hour'], unit='h') +
            pd.Timedelta(hours=1)
        )
        melted = melted.drop(columns=['hour', 'Date//Date'])

        # Filter invalids
        melted[colname] = pd.to_numeric(melted[colname], errors='coerce')
        melted.loc[melted[colname].isin([-999, 9999]), colname] = np.nan

        # Add month for join with station map
        melted['month'] = melted['datetime'].dt.to_period("M").dt.to_timestamp()
        melted['station_id'] = melted['NAPS ID//Identifiant SNPA'].astype(int)

        # Now we need: (station_id, month) → city
        # Reshape df_station_map for a merge
        df_long = df_station_map.stack().reset_index()
        df_long.columns = ['month', 'city', 'station_id']
        df_long['station_id'] = df_long['station_id'].dropna().astype(int)

        # Merge pollutant values with city mapping
        merged = melted.merge(df_long, on=['station_id', 'month'], how='inner')

        # Now merged has datetime, pollutant col, city
        # Group by city, then write once
        for city, sub in merged.groupby('city'):
            city_file = f"{output_dir}{city}.csv"
            df_city = pd.read_csv(city_file, index_col=0, parse_dates=True)

            df_city.loc[sub['datetime'], colname] = sub[colname].values

            df_city.to_csv(city_file)



=== Processing NO2 → no2 ===


100%|██████████| 19/19 [09:50<00:00, 31.06s/it]



=== Processing O3 → o3 ===


100%|██████████| 19/19 [11:19<00:00, 35.76s/it]



=== Processing PM25 → pm25 ===


100%|██████████| 19/19 [12:18<00:00, 38.87s/it]


Use OpenAQ to query for pollutant data over 2024 and 2025

In [11]:
client = OpenAQ(api_key=API_KEY)

sensor_records = []

for city, location_id in tqdm(FEDERAL_LOCATION_IDS.items()):
    if location_id is None:
        sensor_records.append([city, '', '', '', '', ''])
        continue

    sensors = client.locations.sensors(location_id).results

    for sensor in sensors:
        if sensor.parameter['name'] in ['pm25', 'o3', 'no2']:
            sensor_records.append([
                city, 
                location_id, 
                sensor.id,
                sensor.parameter['name'],
                sensor.datetime_first['local'],
                sensor.datetime_last['local'],
            ])

df = pd.DataFrame.from_records(sensor_records, columns=['city', 'location_id', 'sensor_id', 'pollutant', 'date_first', 'date_last'])
df.to_csv('../../data/raw/federal/metadata/sensors.csv', index=False)

100%|██████████| 42/42 [00:06<00:00,  6.24it/s]


In [12]:
for city, location_id, sensor_id, pollutant, date_start, date_end in sensor_records:
    if not sensor_id:
        continue

    

St. John's


In [None]:
client = OpenAQ(api_key=API_KEY)
response = client.measurements.list(
    sensors_id=25148, 
    limit=1000,
    page=13,
    datetime_from='2024-01-01',  # need to do 2024-2025, then 2025-2026 
    datetime_to='2026-01-01',
)

In [24]:
response.results

[]

In [12]:
from openaq import OpenAQ
import pandas as pd
from datetime import datetime

# Initialize client with API key
API_KEY = "0d33157cc065493f21b56c03c868fead2ba85358f4c6d1cffa7e1e91dd560f50"
client = OpenAQ(api_key=API_KEY)

# Station ID
location_id = 774

# Date range
start_date = '2024-01-01T00:00:00Z'
end_date = datetime.utcnow().isoformat() + "Z"

# Function to fetch all measurements with pagination
def fetch_all_measurements(location_id, start_date, end_date, limit=1000):
    page = 1
    all_results = []

    while True:
        response = client.measurements.list(
            location_id=location_id,  # Corrected parameter name
            date_from=start_date,
            date_to=end_date,
            limit=limit,
            page=page
        )
        results = response['results']
        if not results:
            break
        
        all_results.extend(results)
        print(f"Fetched page {page} with {len(results)} records")
        page += 1

    return all_results

# Fetch data
data = fetch_all_measurements(location_id, start_date, end_date)

# Convert to DataFrame
df = pd.DataFrame(data)

# Optional: show first few rows
print(df.head())

# Save to CSV
df.to_csv("station_774_measurements.csv", index=False)


  end_date = datetime.utcnow().isoformat() + "Z"


TypeError: Measurements.list() got an unexpected keyword argument 'location_id'