License
Code Attribution
Snippets of the code were taken from a code example developed by Dr. David W. McDonald for use in DATA 512, a course in the UW MS Data Science degree program. This code is provided under the Creative Commons CC-BY license.

The rest of the code lies under the standard MIT licence

In [2]:
import json, time
#    The 'requests' module is a distribution module for making web requests. If you do not have it already, you'll need to install it
import requests

import pandas as pd

import warnings

from pyproj import Transformer, Geod

warnings.filterwarnings("ignore")

In [None]:
API_REQUEST_URL = 'https://aqs.epa.gov/data/api'

#
#    These are some of the 'actions' we can ask the API to take or requests that we can make of the API
#
#    Sign-up request - generally only performed once - unless you lose your key
API_ACTION_SIGNUP = '/signup?email={email}'
#
#    List actions provide information on API parameter values that are required by some other actions/requests
API_ACTION_LIST_CLASSES = '/list/classes?email={email}&key={key}'
API_ACTION_LIST_PARAMS = '/list/parametersByClass?email={email}&key={key}&pc={pclass}'
API_ACTION_LIST_SITES = '/list/sitesByCounty?email={email}&key={key}&state={state}&county={county}'
#
#    Monitor actions are requests for monitoring stations that meet specific criteria
API_ACTION_MONITORS_COUNTY = '/monitors/byCounty?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&state={state}&county={county}'
API_ACTION_MONITORS_BOX = '/monitors/byBox?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&minlat={minlat}&maxlat={maxlat}&minlon={minlon}&maxlon={maxlon}'
#
#    Summary actions are requests for summary data. These are for daily summaries
API_ACTION_DAILY_SUMMARY_COUNTY = '/dailyData/byCounty?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&state={state}&county={county}'
API_ACTION_DAILY_SUMMARY_BOX = '/dailyData/byBox?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&minlat={minlat}&maxlat={maxlat}&minlon={minlon}&maxlon={maxlon}'
#
#    It is always nice to be respectful of a free data resource.
#    We're going to observe a 100 requests per minute limit - which is fairly nice
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED
#
#
#    This is a template that covers most of the parameters for the actions we might take, from the set of actions
#    above. In the examples below, most of the time parameters can either be supplied as individual values to a
#    function - or they can be set in a copy of the template and passed in with the template.
#
AQS_REQUEST_TEMPLATE = {
    "email":      "smohan5@uw.edu",
    "key":        "",
    "state":      "06",     # the two digit state FIPS # as a string
    "county":     "071",     # the three digit county FIPS # as a string
    "begin_date": "19640501",     # the start of a time window in YYYYMMDD format
    "end_date":   "20211031",     # the end of a time window in YYYYMMDD format, begin_date and end_date must be in the same year
    "minlat":    0.0,
    "maxlat":    0.0,
    "minlon":    0.0,
    "maxlon":    0.0,
    "param":     "",     # a list of comma separated 5 digit codes, max 5 codes requested
    "pclass":    ""      # parameter class is only used by the List calls
}

STARTYEAR = 1964
ENDYEAR = 2024

In [None]:
#This implements the sign-up request. The parameters are standardized so that this function definition matches
# #    all of the others. However, the easiest way to call this is to simply call this function with your preferred
# #    email address.
# #
def request_signup(email_address = None,
                   endpoint_url = API_REQUEST_URL,
                   endpoint_action = API_ACTION_SIGNUP,
                   request_template = AQS_REQUEST_TEMPLATE,
                   headers = None):

    # Make sure we have a string - if you don't have access to this email addres, things might go badly for you
    if email_address:
        request_template['email'] = email_address

    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_signup()'")

    if '@' not in request_template['email']:
        raise Exception(f"Must supply an email address to call 'request_signup()'. The string '{request_template['email']}' does not look like an email address.")

    # Compose the signup url - create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_action.format(**request_template)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

print("Requesting SIGNUP ...")
response = request_signup("smohan5@uw.edu")
print(json.dumps(response,indent=4))

Requesting SIGNUP ...
{
    "Header": [
        {
            "status": "Success",
            "request_time": "2024-10-29T16:10:16-04:00",
            "url": "https://aqs.epa.gov/data/api/signup?email=smohan5@uw.edu"
        }
    ],
    "Data": [
        "You should receive a registration confirmation email with a link for confirming your email shortly."
    ]
}


In [None]:
def request_list_info(email_address = None, key = None,
                      endpoint_url = API_REQUEST_URL,
                      endpoint_action = API_ACTION_LIST_CLASSES,
                      request_template = AQS_REQUEST_TEMPLATE,
                      headers = None):

    #  Make sure we have email and key - at least
    #  This prioritizes the info from the call parameters - not what's already in the template
    if email_address:
        request_template['email'] = email_address
    if key:
        request_template['key'] = key

    # For the basic request we need an email address and a key
    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_list_info()'")
    if not request_template['key']:
        raise Exception("Must supply a key to call 'request_list_info()'")

    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_template)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [None]:
#
#   The default should get us a list of the various groups or classes of sensors. These classes are user defined names for clustors of
#   sensors that might be part of a package or default air quality sensing station. We need a class name to start getting down to the
#   a sensor ID. Each sensor type has an ID number. We'll eventually need those ID numbers to be able to request values that come from
#   that specific sensor.
# username and key to be removed !!!!!!!!!!!!!!!!!!!!!!!
#
USERNAME = "smohan5@uw.edu"
APIKEY = "bluegazelle23"
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY

response = request_list_info(request_template=request_data)

if response["Header"][0]['status'] == "Success":
    print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))

[
    {
        "code": "AIRNOW MAPS",
        "value_represented": "The parameters represented on AirNow maps (88101, 88502, and 44201)"
    },
    {
        "code": "ALL",
        "value_represented": "Select all Parameters Available"
    },
    {
        "code": "AQI POLLUTANTS",
        "value_represented": "Pollutants that have an AQI Defined"
    },
    {
        "code": "CORE_HAPS",
        "value_represented": "Urban Air Toxic Pollutants"
    },
    {
        "code": "CRITERIA",
        "value_represented": "Criteria Pollutants"
    },
    {
        "code": "CSN DART",
        "value_represented": "List of CSN speciation parameters to populate the STI DART tool"
    },
    {
        "code": "FORECAST",
        "value_represented": "Parameters routinely extracted by AirNow (STI)"
    },
    {
        "code": "HAPS",
        "value_represented": "Hazardous Air Pollutants"
    },
    {
        "code": "IMPROVE CARBON",
        "value_represented": "IMPROVE Carbon Parameters"
    }

In [None]:
#
#   Once we have a list of the classes or groups of possible sensors, we can find the sensor IDs that make up that class (group)
#   The one that looks to be associated with the Air Quality Index is "AQI POLLUTANTS"
#   We'll use that to make another list request.
#
AQI_PARAM_CLASS = "AQI POLLUTANTS"

In [None]:
#
#   Structure a request to get the sensor IDs associated with the AQI
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['pclass'] = AQI_PARAM_CLASS  # here we specify that we want this 'pclass' or parameter classs

response = request_list_info(request_template=request_data, endpoint_action=API_ACTION_LIST_PARAMS)

if response["Header"][0]['status'] == "Success":
    print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))

[
    {
        "code": "42101",
        "value_represented": "Carbon monoxide"
    },
    {
        "code": "42401",
        "value_represented": "Sulfur dioxide"
    },
    {
        "code": "42602",
        "value_represented": "Nitrogen dioxide (NO2)"
    },
    {
        "code": "44201",
        "value_represented": "Ozone"
    },
    {
        "code": "81102",
        "value_represented": "PM10 Total 0-10um STP"
    },
    {
        "code": "88101",
        "value_represented": "PM2.5 - Local Conditions"
    },
    {
        "code": "88502",
        "value_represented": "Acceptable PM2.5 AQI & Speciation Mass"
    }
]


We now have a response containing a set of sensor ID numbers. The list includes the sensor numbers as well as a description or name for each sensor.

The EPA AQS API has limits on some call parameters. Specifically, when we request data for sensors we can only specify a maximum of 5 different sensor values to return. This means we cannot get all of the Air Quality Index parameters in one request for data. We have to break it up. So we break the request into two logical groups, the AQI sensors that sample gasses and the AQI sensors that sample particles in the air.

In [None]:
#
#   Given the set of sensor codes, now we can create a parameter list or 'param' value as defined by the AQS API spec.
#   It turns out that we want all of these measures for AQI, but we need to have two different param constants to get
#   all seven of the code types. We can only have a max of 5 sensors/values request per param.
#
#   Gaseous AQI pollutants CO, SO2, NO2, and O2
AQI_PARAMS_GASEOUS = "42101,42401,42602,44201"
#
#   Particulate AQI pollutants PM10, PM2.5, and Acceptable PM2.5
AQI_PARAMS_PARTICULATES = "81102,88101,88502"
#
#

Air quality monitoring stations are located all over the US at different locations. To have AQI data relevant to Rialto, we must focus on monitoring stations in and around Rialto. To do so, we must supply the FIPS number for the state and county as a 5 digit string.

In [9]:
CITY_LOCATIONS = {
    'rialto' :       {'city'   : 'Rialto',
                       'county' : 'San Bernardino',
                       'state'  : 'California',
                       'fips'   : '06071',
                       'latlon' : [34.106117, -117.372093] }
}

In [None]:
#  This list request should give us a list of all the monitoring stations in the county specified by the
#  given city
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['state'] = CITY_LOCATIONS['rialto']['fips'][:2]   # the first two digits (characters) of FIPS is the state code
request_data['county'] = CITY_LOCATIONS['rialto']['fips'][2:]  # the last three digits (characters) of FIPS is the county code

response = request_list_info(request_template=request_data, endpoint_action=API_ACTION_LIST_SITES)

if response["Header"][0]['status'] == "Success":
    print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))

[
    {
        "code": "0001",
        "value_represented": "Barstow"
    },
    {
        "code": "0003",
        "value_represented": null
    },
    {
        "code": "0004",
        "value_represented": null
    },
    {
        "code": "0005",
        "value_represented": "Crestline"
    },
    {
        "code": "0006",
        "value_represented": null
    },
    {
        "code": "0008",
        "value_represented": null
    },
    {
        "code": "0009",
        "value_represented": null
    },
    {
        "code": "0012",
        "value_represented": "Phelan-Beekley Road & Phelan Road"
    },
    {
        "code": "0013",
        "value_represented": "Lucerne Valley-Middle School"
    },
    {
        "code": "0014",
        "value_represented": "Victorville-Amargosa Road"
    },
    {
        "code": "0015",
        "value_represented": "Trona-Athol"
    },
    {
        "code": "0017",
        "value_represented": "Twenty Nine Palms-Adobe"
    },
    {
        "code": "0

The above response gives us a list of monitoring stations. Each monitoring station has a unique "code" which is a string number, and, sometimes, a description. The description seems to be something about where the monitoring station is located. Since we have many monitoring stations in Dane county, we can skip using bounding box approach.

The function below is designed to encapsulate requests to the EPA AQS API. When calling the function we should create/copy a parameter template, then initialize that template with values that won't change with each call. Then on each call simply pass in the parameters that need to change, like date ranges.

In [None]:
#    This implements the daily summary request. Daily summary provides a daily summary value for each sensor being requested
#    from the start date to the end date.
#
#    Like the two other functions, this can be called with a mixture of a defined parameter dictionary, or with function
#    parameters. If function parameters are provided, those take precedence over any parameters from the request template.
#
def request_daily_summary(email_address = None, key = None, param=None,
                          begin_date = None, end_date = None, fips = None,
                          endpoint_url = API_REQUEST_URL,
                          endpoint_action = API_ACTION_DAILY_SUMMARY_COUNTY,
                          request_template = AQS_REQUEST_TEMPLATE,
                          headers = None):

    #  This prioritizes the info from the call parameters - not what's already in the template
    if email_address:
        request_template['email'] = email_address
    if key:
        request_template['key'] = key
    if param:
        request_template['param'] = param
    if begin_date:
        request_template['begin_date'] = begin_date
    if end_date:
        request_template['end_date'] = end_date
    if fips and len(fips)==5:
        request_template['state'] = fips[:2]
        request_template['county'] = fips[2:]

    # Make sure there are values that allow us to make a call - these are always required
    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_daily_summary()'")
    if not request_template['key']:
        raise Exception("Must supply a key to call 'request_daily_summary()'")
    if not request_template['param']:
        raise Exception("Must supply param values to call 'request_daily_summary()'")
    if not request_template['begin_date']:
        raise Exception("Must supply a begin_date to call 'request_daily_summary()'")
    if not request_template['end_date']:
        raise Exception("Must supply an end_date to call 'request_daily_summary()'")
    # Note we're not validating FIPS fields because not all of the daily summary actions require the FIPS numbers

    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_template)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

The below function takes an existing dataframe and appends new AQI records from a provided dictionary containing AQI data. It extracts relevant fields creating a list of new rows which is then converted into a dataframe. Finally, the function concatenates this new dataframe with the original one and returns the updated dataframe, ensuring that any empty columns are removed in the process. This is to append all our API call responses into one dataframe.

In [None]:
# Function to append the collected AQI data
def concat_aqi_data(df, aqi_data):
    # Create a list to store new rows
    new_rows = []

    # Loop through each record in the 'Data' part of the response
    for i in aqi_data['Data']:
        # Create a dictionary for the new row and add it to the list
        new_rows.append({
            'state_code': i['state_code'],
            'county_code': i['county_code'],
            'site_number': i['site_number'],
            'latitude': i['latitude'],
            'longitude': i['longitude'],
            'parameter_code': i['parameter_code'],
            'parameter': i['parameter'],
            'sample_duration': i['sample_duration'],
            'arithmetic_mean': i['arithmetic_mean'],
            'units_of_measure': i['units_of_measure'],
            'date_local': i['date_local'],
            'aqi': i['aqi']
        })

    # Convert the list of new rows to a DataFrame and concatenate it with the existing DataFrame
    new_df = pd.DataFrame(new_rows)
    new_df = new_df.dropna(axis=1, how='all')
    df = pd.concat([df, new_df], ignore_index=True)

    return df

We now call API to collect AQI data for both gaseous and particulate pollutants over the date range specified in the beginning of this notebook. It initializes an empty DataFrame for each pollutant type and loops through the years to make API calls for daily summaries. If the requests are successful, the corresponding AQI data is appended to the respective DataFrames using the above function. It also prints relevent status information.

In [None]:
# attribute to Manasa

request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_GASEOUS
request_data['state'] = CITY_LOCATIONS['rialto']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['rialto']['fips'][2:]

gaseous_responses = []
particulate_responses = []

# Initialize an empty DataFrame for the AQI data
gaseous_aqi_df = pd.DataFrame(columns=['state_code', 'county_code','site_number', 'latitude', 'longitude', 'parameter_code',
                                       'parameter','sample_duration','arithmetic_mean', 'units_of_measure','date_local', 'aqi'])
particulate_aqi_df = pd.DataFrame(columns=['state_code', 'county_code','site_number', 'latitude', 'longitude', 'parameter_code',
                                           'parameter','sample_duration','arithmetic_mean', 'units_of_measure', 'date_local', 'aqi'])

# Loop through the years and request data
for year in range(STARTYEAR, ENDYEAR):
    begin_date = f"{year}0501"  # May 1st of the given year
    end_date = f"{year}1031"    # October 31st of the given year

    # Request gaseous data
    request_data['param'] = AQI_PARAMS_GASEOUS
    gaseous_responses = request_daily_summary(request_template=request_data, begin_date=begin_date, end_date=end_date)
    if gaseous_responses and gaseous_responses["Header"][0]['status'] == "Success":
        print(f"Processing gaseous data for {year}")
        gaseous_aqi_df = concat_aqi_data(gaseous_aqi_df, gaseous_responses)
    else:
        print(f"No gaseous data available for {year}")

    # Request particulate data
    request_data['param'] = AQI_PARAMS_PARTICULATES
    particulate_responses = request_daily_summary(request_template=request_data, begin_date=begin_date, end_date=end_date)
    if particulate_responses and particulate_responses["Header"][0]['status'] == "Success":
        print(f"Processing particulate data for {year}")
        particulate_aqi_df = concat_aqi_data(particulate_aqi_df, particulate_responses)
    else:
        print(f"No particulate data available for {year}")

Processing gaseous data for 1964
No particulate data available for 1964
Processing gaseous data for 1965
No particulate data available for 1965
Processing gaseous data for 1966
No particulate data available for 1966
Processing gaseous data for 1967
No particulate data available for 1967
Processing gaseous data for 1968
No particulate data available for 1968
Processing gaseous data for 1969
No particulate data available for 1969
Processing gaseous data for 1970
No particulate data available for 1970
Processing gaseous data for 1971
No particulate data available for 1971
Processing gaseous data for 1972
No particulate data available for 1972
Processing gaseous data for 1973
No particulate data available for 1973
Processing gaseous data for 1974
No particulate data available for 1974
Processing gaseous data for 1975
No particulate data available for 1975
Processing gaseous data for 1976
No particulate data available for 1976
Processing gaseous data for 1977
No particulate data available f

Below I am storing the gaseous and particulate data into csv files.

In [None]:
gaseous_aqi_df = gaseous_aqi_df.drop_duplicates()
gaseous_aqi_df.to_csv("gaseous_aqi_1964_2024.csv", index=False)
print("Gaseous AQI Data saved to gaseous_aqi_1964_2024.csv")

particulate_aqi_df = particulate_aqi_df.drop_duplicates()
particulate_aqi_df.to_csv("particulate_aqi_1964_2024.csv", index=False)
print("Particulate AQI Data saved to particulate_aqi_1964_2024.csv")

Gaseous AQI Data saved to gaseous_aqi_1964_2024.csv
Particulate AQI Data saved to particulate_aqi_1964_2024.csv


In [3]:
gaseous_aqi = pd.read_csv("gaseous_aqi_1964_2024.csv")
particulate_aqi = pd.read_csv("particulate_aqi_1964_2024.csv")

In [None]:
gaseous_aqi.head()

Unnamed: 0,state_code,county_code,site_number,latitude,longitude,parameter_code,parameter,sample_duration,arithmetic_mean,units_of_measure,date_local,aqi
0,6,71,9001,34.102234,-117.286709,42101,Carbon monoxide,1 HOUR,2.291667,Parts per million,1964-05-01,
1,6,71,9001,34.102234,-117.286709,42101,Carbon monoxide,1 HOUR,2.416667,Parts per million,1964-05-02,
2,6,71,9001,34.102234,-117.286709,42101,Carbon monoxide,1 HOUR,2.75,Parts per million,1964-05-03,
3,6,71,9001,34.102234,-117.286709,42101,Carbon monoxide,1 HOUR,2.583333,Parts per million,1964-05-04,
4,6,71,9001,34.102234,-117.286709,42101,Carbon monoxide,1 HOUR,2.583333,Parts per million,1964-05-05,


In [None]:
particulate_aqi.head()

Unnamed: 0,state_code,county_code,site_number,latitude,longitude,parameter_code,parameter,sample_duration,arithmetic_mean,units_of_measure,date_local,aqi
0,6,71,1,34.89405,-117.02471,81102,PM10 Total 0-10um STP,24 HOUR,32.0,Micrograms/cubic meter (25 C),1987-05-03,30.0
1,6,71,1,34.89405,-117.02471,81102,PM10 Total 0-10um STP,24 HOUR,25.0,Micrograms/cubic meter (25 C),1987-05-09,23.0
2,6,71,1,34.89405,-117.02471,81102,PM10 Total 0-10um STP,24 HOUR,25.0,Micrograms/cubic meter (25 C),1987-05-15,23.0
3,6,71,1,34.89405,-117.02471,81102,PM10 Total 0-10um STP,24 HOUR,24.0,Micrograms/cubic meter (25 C),1987-05-21,22.0
4,6,71,1,34.89405,-117.02471,81102,PM10 Total 0-10um STP,24 HOUR,27.0,Micrograms/cubic meter (25 C),1987-05-27,25.0


I can see a lot of NaN values in the AQI dataframes.


In [4]:
# count the number of nan values in both the datarames
print(gaseous_aqi.isna().sum().sum())
print(particulate_aqi.isna().sum().sum())

# print total number of rows in both the dfs
print(len(gaseous_aqi))
print(len(particulate_aqi))

200138
29551
549855
78067


Now I am going to drop the rows where the AQI is missing.

In [5]:
#drop rows where AQI is Nan
gaseous_aqi = gaseous_aqi.dropna(subset=['aqi'])
particulate_aqi = particulate_aqi.dropna(subset=['aqi'])

In [6]:
#print rows of both the dfs
print(len(gaseous_aqi))
print(len(particulate_aqi))

349717
48516


In [7]:
#concatenate dfs one below the other
aqi_df = pd.concat([gaseous_aqi, particulate_aqi], axis=0)

In [10]:
site_distance = aqi_df.value_counts(subset=['site_number', 'latitude', 'longitude']).reset_index()
geodcalc = Geod(ellps='WGS84')


rialto = CITY_LOCATIONS['rialto']['latlon']

# Calculate distance in miles from Madison for each site
site_distance['distance'] = site_distance.apply(
    lambda x: geodcalc.inv(rialto[1], rialto[0], x['longitude'], x['latitude'])[2] * 0.00062137,
    axis=1
)

# Check if all distances are within 650 miles and print accordingly
if (site_distance['distance'] <= 650).all():
    print("All are within the city (within 650 miles).")
else:
    print("Some sites are more than 650 miles away:")
    print(site_distance[site_distance['distance'] > 650])

All are within the city (within 650 miles).


Next, I am going to calculate the AQI for each year by aggregating the data.

In [11]:
# Convert date to datetime and extract year
aqi_df['date_local'] = pd.to_datetime(aqi_df['date_local'])
aqi_df['year'] = aqi_df['date_local'].dt.year

# save aqi df to csv
aqi_df.to_csv('aqi.csv', index=False)



In [12]:
# Calculate the mean AQI per year based on the daily maximum values
aqi_per_year = aqi_df.groupby('year')['aqi'].median().reset_index()
aqi_per_year.to_csv('aqi_1964_2024-2.csv', index=False)

The resulting csv will be used for further smoke analysis.