In [2]:
import os
import csv
import requests
from datetime import datetime, timedelta
from calendar import monthrange

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
from constants import FEDERAL_CITIES

In [3]:
df_stations = pd.read_csv('../../data/raw/federal/metadata/StationsNAPS-StationsSNPA.csv')
df_stations.columns

Index(['NAPS_ID', 'Station_Name', 'Status', 'Location_Address', 'City', 'P/T',
       'Postal_Code', 'Timezone', 'Latitude', 'Longitude', 'Elevation',
       'Start_Year', 'End_Year', 'Combined_Stations', 'Inlet_Height',
       'Network', 'SO2', 'CO', 'NO2', 'NO', 'NOX', 'O3', 'PM_25_Continuous',
       'PM_10_Continuous', 'PM_2.5_RM', 'PM10-2.5', 'PM2.5_Speciation', 'VOC',
       'Carbonyl', 'PAH', 'Site_Type', 'Urbanization', 'Neighbourhood',
       'Land_Use', 'Scale', 'PC', 'CD', 'CSD', 'CMA/CA', 'AQMS_Airzone',
       'Core_Site'],
      dtype='object')

Remove stations which have old data only (before 2000) or aren't in our list

In [4]:
# Filter 1: Keep only stations in federal cities
# Create a set of (city, province) tuples for faster lookup
federal_cities_set = set(FEDERAL_CITIES)

# Filter stations to keep only those in federal cities
df_filtered = df_stations[
    df_stations.apply(lambda row: (row['City'], row['P/T']) in federal_cities_set, axis=1)
].copy()  # Create a copy to avoid SettingWithCopyWarning

print(f"Stations after federal cities filter: {len(df_filtered)}")

# Filter 2: Remove stations with data exclusively before 2000
# Convert date strings to datetime with explicit format to avoid parsing warnings
df_filtered.loc[:, 'Start_Year_dt'] = pd.to_datetime(
    df_filtered['Start_Year'], 
    format='%Y-%m-%d %I:%M:%S %p', 
    errors='coerce'
)
df_filtered.loc[:, 'End_Year_dt'] = pd.to_datetime(
    df_filtered['End_Year'], 
    format='%Y-%m-%d %I:%M:%S %p', 
    errors='coerce'
)

# Define cutoff date (January 1st, 2000)
cutoff_date = pd.Timestamp('2000-01-01')

# Keep stations if:
# 1. End year is NaN (ongoing stations) AND start year is not NaN, OR
# 2. End year exists and is >= 2000-01-01 (stations that operated after 2000)
# This removes stations that have both start and end dates exclusively before 2000
df_filtered = df_filtered[
    ((df_filtered['End_Year_dt'].isna()) & (df_filtered['Start_Year_dt'].notna())) |  # Ongoing stations with valid start
    (df_filtered['End_Year_dt'] >= cutoff_date)  # Stations ending on or after 2000
]

print(f"Stations after date filter: {len(df_filtered)}")

# Clean up temporary columns
df_filtered = df_filtered.drop(['Start_Year_dt', 'End_Year_dt'], axis=1)

df_filtered.to_csv('../../data/raw/federal/metadata/stations_2.csv', index=False)

Stations after federal cities filter: 320
Stations after date filter: 174


In [11]:
# Load station metadata
df_stations2 = pd.read_csv('../../data/raw/federal/metadata/stations_2.csv')
valid_stations = set(df_stations2['NAPS_ID'].astype(str))  # fast lookup

# Define pollutants and years
pollutants = ['NO2', 'O3', 'PM25']
years = range(2005, 2024)

In [12]:
coverage_data = {}

for pollutant in pollutants:
    print(f'Processing {pollutant}...')
    
    # Dictionary to hold monthly coverage
    monthly_coverage = {} # keys: month string YYYY-MM, values: dict of station-city coverage

    for year in tqdm(years):
        file_path = f'../../data/raw/federal/{pollutant}/{pollutant}_{year}.csv'
        if not os.path.exists(file_path):
            continue
        
        # Read CSV skipping first 7 rows, headers are on 8th line
        df = pd.read_csv(file_path, skiprows=7)
        
        # Filter rows to only valid stations
        df = df[df['NAPS ID//Identifiant SNPA'].astype(str).isin(valid_stations)]
        
        # Identify hour columns
        hour_cols = [col for col in df.columns if col.startswith('H')]
        
        # Create station-city pair column (station ID first)
        df['station_city'] = df['NAPS ID//Identifiant SNPA'].astype(str) + '-' + df['City//Ville']
        
        # Iterate over each row (day)
        for idx, row in df.iterrows(): # tqdm(df.iterrows(), total=len(df), desc=f'{pollutant} {year} Rows'):
            date = pd.to_datetime(row['Date//Date'])
            month_str = date.strftime('%Y-%m')
            
            if month_str not in monthly_coverage:
                monthly_coverage[month_str] = {}
            
            station_city = row['station_city']
            if station_city not in monthly_coverage[month_str]:
                monthly_coverage[month_str][station_city] = {'count': 0, 'total': 0}
            
            # Count valid hours
            valid_hours = row[hour_cols].apply(lambda x: 1 if pd.notna(x) and x not in [-999, 9999] else 0).sum()
            monthly_coverage[month_str][station_city]['count'] += valid_hours
            monthly_coverage[month_str][station_city]['total'] += len(hour_cols)
    
    # Build final DataFrame
    all_station_cities = sorted({sc for month in monthly_coverage.values() for sc in month.keys()},
                                key=lambda x: int(x.split('-')[0]))  # sort by station ID numerically
    all_months = pd.date_range(start='2005-01-01', end='2023-12-31', freq='MS').strftime('%Y-%m')
    
    coverage_df = pd.DataFrame(index=all_months, columns=all_station_cities)
    
    for month in tqdm(all_months):
        for sc in all_station_cities:
            if month in monthly_coverage and sc in monthly_coverage[month]:
                counts = monthly_coverage[month][sc]
                coverage_pct = round(100 * counts['count'] / counts['total'], 2)
                coverage_df.loc[month, sc] = coverage_pct
            else:
                coverage_df.loc[month, sc] = np.nan # No data
    
    # Save CSV
    coverage_df.to_csv(f'../../data/raw/federal/metadata/coverage_{pollutant}.csv')


Processing NO2...


100%|██████████| 19/19 [04:20<00:00, 13.69s/it]
100%|██████████| 228/228 [00:00<00:00, 279.41it/s]


Processing O3...


100%|██████████| 19/19 [05:22<00:00, 16.95s/it]
100%|██████████| 228/228 [00:00<00:00, 246.42it/s]


Processing PM25...


100%|██████████| 19/19 [06:12<00:00, 19.62s/it]
100%|██████████| 228/228 [00:01<00:00, 220.36it/s]


In [15]:
# Choose which pollutant CSV to open
pollutant = 'PM25'  # change to O3 or PM25 if needed
input_path = f'../../data/raw/federal/metadata/coverage_{pollutant}.csv'
output_path = f'../../data/raw/federal/metadata/station_cities.csv'

# Load the coverage CSV
df = pd.read_csv(input_path, index_col=0)

# Extract cities from columns (remove station IDs)
cities = [col.split('-', 1)[1] for col in df.columns]

# Remove duplicates while preserving order
seen = set()
unique_cities = []
for city in cities:
    if city not in seen:
        seen.add(city)
        unique_cities.append(city)

# Build new DataFrame
df_cities = pd.DataFrame(index=df.index)  # months as index
for city in unique_cities:
    df_cities[city] = np.nan  # initialize columns

# Reset index so month becomes first column
df_cities.reset_index(inplace=True)
df_cities.rename(columns={'index': 'Month'}, inplace=True)

# Save to CSV
df_cities.to_csv(output_path, index=False)
print(f'City-only CSV saved to {output_path}')


City-only CSV saved to ../../data/raw/federal/metadata/station_cities.csv
