# In [2]:
import requests
import pandas as pd
from datetime import datetime, date
from aqi_api import open_weather_api_key
import os

# HTTPS keeps the API key (sent as a query parameter) off the wire in clear text.
_OPENWEATHER_BASE_URL = "https://api.openweathermap.org/data/2.5"
# Upper bound, in seconds, on how long a single HTTP request may block.
_REQUEST_TIMEOUT_SECONDS = 30
# Pollutant concentrations copied from each sample's 'components' mapping.
_POLLUTANT_KEYS = ('co', 'no', 'no2', 'o3', 'so2', 'pm2_5', 'pm10', 'nh3')


def _request_json(url, params, failure_prefix):
    """GET *url* and return the decoded JSON body, or None after printing why it failed."""
    try:
        response = requests.get(url, params=params, timeout=_REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Only the exception type is printed: the full message can embed the
        # request URL, which contains the API key.
        print(f"{failure_prefix} Error: {type(exc).__name__}")
        return None

    if response.status_code != 200:
        print(f"{failure_prefix} Error: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError:
        print(f"{failure_prefix} Error: response body is not valid JSON")
        return None


def get_historical_aqi_data(api_key, city, start_date, end_date):
    """Fetch historical air quality data for a given city and date range.

    The city is resolved to coordinates through the current-weather endpoint,
    then the air-pollution history endpoint is queried for that location.

    Args:
        api_key: OpenWeather API key.
        city: City name, sent as the ``q`` query parameter.
        start_date: Start of the range as ``YYYY-MM-DD``; interpreted as
            00:00 UTC on that day.
        end_date: End of the range as ``YYYY-MM-DD``; interpreted as
            00:00 UTC on that day.

    Returns:
        A list with one dict per sample in the response (keys ``timestamp``,
        ``city``, ``aqi`` and the eight pollutant names), or ``None`` when a
        request fails or the response carries no ``list`` entry. A message is
        printed in every ``None`` case.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not ``YYYY-MM-DD``.
    """
    # Function-scope import: the file's top-level import only brings in
    # ``datetime`` and ``date``.
    from datetime import timezone

    def to_utc_timestamp(day):
        # Dates are anchored to UTC so the requested range lines up with the
        # UTC day labels written into each record below, independent of the
        # time zone of the machine running the script.
        parsed = datetime.strptime(day, '%Y-%m-%d').replace(tzinfo=timezone.utc)
        return int(parsed.timestamp())

    # Validate the dates before spending any network round trips.
    start_timestamp = to_utc_timestamp(start_date)
    end_timestamp = to_utc_timestamp(end_date)

    # Geocode the city to get latitude and longitude. Passing ``params`` lets
    # requests URL-encode the city name (spaces, '&', non-ASCII characters).
    geocode_data = _request_json(
        f"{_OPENWEATHER_BASE_URL}/weather",
        {'q': city, 'appid': api_key},
        f"Failed to get coordinates for {city}.",
    )
    if geocode_data is None:
        return None

    try:
        lat = geocode_data['coord']['lat']
        lon = geocode_data['coord']['lon']
    except (KeyError, TypeError):
        print(f"Failed to get coordinates for {city}. Error: response has no coordinates")
        return None

    # Fetch historical air quality data
    aqi_data = _request_json(
        f"{_OPENWEATHER_BASE_URL}/air_pollution/history",
        {
            'lat': lat,
            'lon': lon,
            'start': start_timestamp,
            'end': end_timestamp,
            'appid': api_key,
        },
        f"Failed to get historical air quality data for {city}.",
    )
    if aqi_data is None:
        return None

    if 'list' not in aqi_data:
        print(f"No air quality data available for {city} in the specified date range.")
        return None

    records = []
    for item in aqi_data['list']:
        components = item['components']
        record = {
            # UTC calendar day of the sample; the time of day is dropped.
            'timestamp': datetime.fromtimestamp(item['dt'], tz=timezone.utc).strftime('%Y-%m-%d'),
            'city': city,
            # Linear rescale of the reported index: 1 -> 0, 2 -> 100, ...
            # NOTE(review): OpenWeather documents main.aqi as 1-5, which makes
            # this a 0-400 scale rather than 0-500 - confirm the intent.
            'aqi': round((item['main']['aqi'] - 1) * 100, 1),
        }
        # Missing pollutants default to 0; values are rounded to one decimal.
        for key in _POLLUTANT_KEYS:
            record[key] = round(components.get(key, 0), 1)
        records.append(record)
    return records

def save_to_csv(data, filename, folder="datasets/"):
    """Aggregate the collected records per timestamp and write them to a CSV file.

    Rows sharing the same ``timestamp`` are collapsed into one: ``aqi`` keeps
    the maximum, every pollutant column keeps the median. All values are
    rounded to one decimal place before writing. The ``city`` column is not
    part of the output.

    Args:
        data: Records accepted by ``pandas.DataFrame`` (e.g. a list of dicts)
            containing ``timestamp``, ``aqi`` and the pollutant columns.
        filename: Name of the CSV file to create inside ``folder``.
        folder: Destination directory; created if it does not exist.

    Returns:
        None. When ``data`` is empty nothing is written and a message is
        printed instead.
    """
    df = pd.DataFrame(data)

    # An empty frame has no 'timestamp' column, so the steps below would fail
    # with an opaque KeyError; bail out before touching the filesystem.
    if df.empty:
        print(f"No data to save for {filename}.")
        return

    # Ensure the folder exists
    os.makedirs(folder, exist_ok=True)

    # Define full path
    file_path = os.path.join(folder, filename)

    # Ensure timestamp is in datetime format
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Coerce measurement columns to numbers; unparseable values become NaN
    pollutant_columns = ['co', 'no', 'no2', 'o3', 'so2', 'pm2_5', 'pm10', 'nh3']
    for col in ['aqi', *pollutant_columns]:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Worst (max) AQI per timestamp, median concentration for each pollutant
    aggregations = {'aqi': 'max'}
    aggregations.update({col: 'median' for col in pollutant_columns})
    df_grouped = df.groupby('timestamp').agg(aggregations).reset_index()

    # Round all values to 1 decimal place
    df_grouped = df_grouped.round(1)

    # Save to CSV
    df_grouped.to_csv(file_path, index=False)
    print(f"Data {filename} saved to {file_path}.")

if __name__ == "__main__":
    # The OpenWeather API key is the imported open_weather_api_key value.
    api_key = open_weather_api_key

    # Other cities that have been listed for this script:
    # 'Bhaktapur', 'Lalitpur', 'Pokhara', 'Biratnagar', 'Birgunj', 'Butwal',
    # 'Dharan', 'Hetauda', 'Janakpur', 'Nepalgunj', 'Panauti', 'Tulsipur'
    cities = ['Kathmandu']

    # Fixed start date; the range ends on the day the script is run.
    start_date = '2023-01-01'
    end_date = date.today().isoformat()

    # Directory that receives one CSV file per city
    folder_path = "datasets/"

    for city in cities:
        data = get_historical_aqi_data(api_key, city, start_date, end_date)
        if not data:
            # Nothing was fetched for this city; move on to the next one.
            continue
        filename = f"{city}_historical_aqi_{start_date}_to_{end_date}.csv"
        save_to_csv(data, filename, folder=folder_path)


# Output: Data Kathmandu_historical_aqi_2023-01-01_to_2025-03-06.csv saved to datasets/Kathmandu_historical_aqi_2023-01-01_to_2025-03-06.csv.
