In [1]:
# 
#    These are standard python modules
#
#import json, time, urllib.parse
import json, time
#
#    The 'requests' module is a distribution module for making web requests.
#
import requests
import pandas as pd

#########
#
#    CONSTANTS
#

#
#    This is the root of all AQS API URLs
#
API_REQUEST_URL = 'https://aqs.epa.gov/data/api'

#
#    These are 'actions' we can ask the API to take or requests that we can make of the API
#
#    Sign-up request - generally only performed once - unless you lose your key
API_ACTION_SIGNUP = '/signup?email={email}'
#
#    List actions provide information on API parameter values that are required by some other actions/requests
API_ACTION_LIST_CLASSES = '/list/classes?email={email}&key={key}'
API_ACTION_LIST_PARAMS = '/list/parametersByClass?email={email}&key={key}&pc={pclass}'
API_ACTION_LIST_SITES = '/list/sitesByCounty?email={email}&key={key}&state={state}&county={county}'
#
#    Monitor actions are requests for monitoring stations that meet specific criteria
API_ACTION_MONITORS_COUNTY = '/monitors/byCounty?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&state={state}&county={county}'
API_ACTION_MONITORS_BOX = '/monitors/byBox?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&minlat={minlat}&maxlat={maxlat}&minlon={minlon}&maxlon={maxlon}'
#
#    Summary actions are requests for summary data. These are for daily summaries
API_ACTION_DAILY_SUMMARY_COUNTY = '/dailyData/byCounty?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&state={state}&county={county}'
API_ACTION_DAILY_SUMMARY_BOX = '/dailyData/byBox?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&minlat={minlat}&maxlat={maxlat}&minlon={minlon}&maxlon={maxlon}'
#
#    It is always nice to be respectful of a free data resource.
#    We're going to observe a 100 requests per minute limit - which is fairly nice
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED
#
#
#    This is a template that covers most of the parameters for the actions we might take, from the set of actions
#    above. In the examples below, most of the time parameters can either be supplied as individual values to a
#    function - or they can be set in a copy of the template and passed in with the template.
# 
AQS_REQUEST_TEMPLATE = {
    "email":      "",     
    "key":        "",      
    "state":      "",     # the two digit state FIPS # as a string
    "county":     "",     # the three digit county FIPS # as a string
    "begin_date": "",     # the start of a time window in YYYYMMDD format
    "end_date":   "",     # the end of a time window in YYYYMMDD format, begin_date and end_date must be in the same year
    "minlat":    0.0,
    "maxlat":    0.0,
    "minlon":    0.0,
    "maxlon":    0.0,
    "param":     "",     # a list of comma separated 5 digit codes, max 5 codes requested
    "pclass":    ""      # parameter class is only used by the List calls
}


USERNAME = "prerit16@uw.edu"
APIKEY= 'baygoose33' 

In [2]:
#
#    This implements the list request. There are several versions of the list request that only require email and key.
#    This code sets the default action/requests to list the groups or parameter class descriptors. Having those descriptors 
#    allows one to request the individual (proprietary) 5 digit codes for individual air quality measures by using the
#    param request. Some code in later cells will illustrate those requests.
#
def request_list_info(email_address = None, key = None,
                      endpoint_url = API_REQUEST_URL, 
                      endpoint_action = API_ACTION_LIST_CLASSES, 
                      request_template = AQS_REQUEST_TEMPLATE,
                      headers = None):
    
    #  Make sure we have email and key - at least
    #  This prioritizes the info from the call parameters - not what's already in the template
    if email_address:
        request_template['email'] = email_address
    if key:
        request_template['key'] = key
    
    # For the basic request we need an email address and a key
    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_list_info()'")
    if not request_template['key']: 
        raise Exception("Must supply a key to call 'request_list_info()'")

    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_template)
        
    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


#
#   The default should get us a list of the various groups or classes of sensors. These classes are user defined names for clustors of
#   sensors that might be part of a package or default air quality sensing station. We need a class name to start getting down to the
#   a sensor ID. Each sensor type has an ID number. We'll eventually need those ID numbers to be able to request values that come from
#   that specific sensor.
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY

response = request_list_info(request_template=request_data)

if response["Header"][0]['status'] == "Success":
    print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))


[
    {
        "code": "AIRNOW MAPS",
        "value_represented": "The parameters represented on AirNow maps (88101, 88502, and 44201)"
    },
    {
        "code": "ALL",
        "value_represented": "Select all Parameters Available"
    },
    {
        "code": "AQI POLLUTANTS",
        "value_represented": "Pollutants that have an AQI Defined"
    },
    {
        "code": "CORE_HAPS",
        "value_represented": "Urban Air Toxic Pollutants"
    },
    {
        "code": "CRITERIA",
        "value_represented": "Criteria Pollutants"
    },
    {
        "code": "CSN DART",
        "value_represented": "List of CSN speciation parameters to populate the STI DART tool"
    },
    {
        "code": "FORECAST",
        "value_represented": "Parameters routinely extracted by AirNow (STI)"
    },
    {
        "code": "HAPS",
        "value_represented": "Hazardous Air Pollutants"
    },
    {
        "code": "IMPROVE CARBON",
        "value_represented": "IMPROVE Carbon Parameters"
    }

In [3]:
#
#   Once we have a list of the classes or groups of possible sensors, we can find the sensor IDs that make up that class (group)
#   The one that looks to be associated with the Air Quality Index is "AQI POLLUTANTS"
#   We'll use that to make another list request.
#
AQI_PARAM_CLASS = "AQI POLLUTANTS"


In [4]:
#
#   Structure a request to get the sensor IDs associated with the AQI
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['pclass'] = AQI_PARAM_CLASS  # here we specify that we want this 'pclass' or parameter classs

response = request_list_info(request_template=request_data, endpoint_action=API_ACTION_LIST_PARAMS)

if response["Header"][0]['status'] == "Success":
    print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))


[
    {
        "code": "42101",
        "value_represented": "Carbon monoxide"
    },
    {
        "code": "42401",
        "value_represented": "Sulfur dioxide"
    },
    {
        "code": "42602",
        "value_represented": "Nitrogen dioxide (NO2)"
    },
    {
        "code": "44201",
        "value_represented": "Ozone"
    },
    {
        "code": "81102",
        "value_represented": "PM10 Total 0-10um STP"
    },
    {
        "code": "88101",
        "value_represented": "PM2.5 - Local Conditions"
    },
    {
        "code": "88502",
        "value_represented": "Acceptable PM2.5 AQI & Speciation Mass"
    }
]


In [5]:
#
#   Given the set of sensor codes, now we can create a parameter list or 'param' value as defined by the AQS API spec.
#   It turns out that we want all of these measures for AQI, but we need to have two different param constants to get
#   all seven of the code types. We can only have a max of 5 sensors/values request per param.
#
#   Gaseous AQI pollutants CO, SO2, NO2, and O2
AQI_PARAMS_GASEOUS = "42101,42401,42602,44201"
#
#   Particulate AQI pollutants PM10, PM2.5, and Acceptable PM2.5
AQI_PARAMS_PARTICULATES = "81102,88101,88502"
#   
#

In [6]:
#
#   We'll use these two city locations in the examples below.
#
CITY_LOCATIONS = {
    'Derby' :       {'city'   : 'Derby',
                       'county' : 'Sedgwick',
                       'state'  : 'Kansas',
                       'fips'   : '20173',
                       'latlon' : [37.552407, -97.261492] }
}


In [7]:
#
#  This list request should give us a list of all the monitoring stations in the county specified by the
#  given city selected from the CITY_LOCATIONS dictionary
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['state'] = CITY_LOCATIONS['Derby']['fips'][:2]   # the first two digits (characters) of FIPS is the state code
request_data['county'] = CITY_LOCATIONS['Derby']['fips'][2:]  # the last three digits (characters) of FIPS is the county code

response = request_list_info(request_template=request_data, endpoint_action=API_ACTION_LIST_SITES)

if response["Header"][0]['status'] == "Success":
    print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))


[
    {
        "code": "0001",
        "value_represented": "PARK CITY"
    },
    {
        "code": "0002",
        "value_represented": null
    },
    {
        "code": "0003",
        "value_represented": null
    },
    {
        "code": "0004",
        "value_represented": null
    },
    {
        "code": "0005",
        "value_represented": null
    },
    {
        "code": "0006",
        "value_represented": null
    },
    {
        "code": "0007",
        "value_represented": "13TH & ST PAUL"
    },
    {
        "code": "0008",
        "value_represented": "WASH & SKINNER"
    },
    {
        "code": "0009",
        "value_represented": "PAWNEE & GLENN"
    },
    {
        "code": "0010",
        "value_represented": "WICHITA HD"
    },
    {
        "code": "0011",
        "value_represented": null
    },
    {
        "code": "0012",
        "value_represented": null
    },
    {
        "code": "0013",
        "value_represented": null
    },
    {
        "code": "0

In [8]:
#
#    This implements the daily summary request. Daily summary provides a daily summary value for each sensor being requested
#    from the start date to the end date. 
#
#    Like the two other functions, this can be called with a mixture of a defined parameter dictionary, or with function
#    parameters. If function parameters are provided, those take precedence over any parameters from the request template.
#
def request_daily_summary(email_address = None, key = None, param=None,
                          begin_date = None, end_date = None, fips = None,
                          endpoint_url = API_REQUEST_URL, 
                          endpoint_action = API_ACTION_DAILY_SUMMARY_COUNTY, 
                          request_template = AQS_REQUEST_TEMPLATE,
                          headers = None):
    
    #  This prioritizes the info from the call parameters - not what's already in the template
    if email_address:
        request_template['email'] = email_address
    if key:
        request_template['key'] = key
    if param:
        request_template['param'] = param
    if begin_date:
        request_template['begin_date'] = begin_date
    if end_date:
        request_template['end_date'] = end_date
    if fips and len(fips)==5:
        request_template['state'] = fips[:2]
        request_template['county'] = fips[2:]            

    # Make sure there are values that allow us to make a call - these are always required
    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_daily_summary()'")
    if not request_template['key']: 
        raise Exception("Must supply a key to call 'request_daily_summary()'")
    if not request_template['param']: 
        raise Exception("Must supply param values to call 'request_daily_summary()'")
    if not request_template['begin_date']: 
        raise Exception("Must supply a begin_date to call 'request_daily_summary()'")
    if not request_template['end_date']: 
        raise Exception("Must supply an end_date to call 'request_daily_summary()'")
    # Note we're not validating FIPS fields because not all of the daily summary actions require the FIPS numbers
        
    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_template)
        
    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response



In [9]:
#
#    This is a list of field names - data - that will be extracted from each record
#
EXTRACTION_FIELDS = ['sample_duration','observation_count','arithmetic_mean','aqi']

#
#    The function creates a summary record
def extract_summary_from_response(r=None, fields=EXTRACTION_FIELDS):
    ## the result will be structured around monitoring site, parameter, and then date
    result = dict()
    data = r["Data"]
    for record in data:
        # make sure the record is set up
        site = record['site_number']
        param = record['parameter_code']
        #date = record['date_local']    # this version keeps the respnse value YYYY-
        date = record['date_local'].replace('-','') # this puts it in YYYYMMDD format
        if site not in result:
            result[site] = dict()
            result[site]['local_site_name'] = record['local_site_name']
            result[site]['site_address'] = record['site_address']
            result[site]['state'] = record['state']
            result[site]['county'] = record['county']
            result[site]['city'] = record['city']
            result[site]['pollutant_type'] = dict()
        if param not in result[site]['pollutant_type']:
            result[site]['pollutant_type'][param] = dict()
            result[site]['pollutant_type'][param]['parameter_name'] = record['parameter']
            result[site]['pollutant_type'][param]['units_of_measure'] = record['units_of_measure']
            result[site]['pollutant_type'][param]['method'] = record['method']
            result[site]['pollutant_type'][param]['data'] = dict()
        if date not in result[site]['pollutant_type'][param]['data']:
            result[site]['pollutant_type'][param]['data'][date] = list()
        
        # now extract the specified fields
        extract = dict()
        for k in fields:
            if str(k) in record:
                extract[str(k)] = record[k]
            else:
                # this makes sure we always have the requested fields, even if
                # we have a missing value for a given day/month
                extract[str(k)] = None
        
        # add this extraction to the list for the day
        result[site]['pollutant_type'][param]['data'][date].append(extract)
    
    return result


In [10]:
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_PARTICULATES
request_data['state'] = CITY_LOCATIONS['Derby']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['Derby']['fips'][2:]

# Initialize an empty list to store the results
all_particulate_data = []

# Define the request template and base URL
request_template = "YOUR_REQUEST_TEMPLATE_HERE"
base_url = "YOUR_BASE_URL_HERE"

# Define the years you want to retrieve data for
start_year = 1963
end_year = 2023

# Loop through the years and request daily summary data
for year in range(start_year, end_year + 1):
    begin_date = f"{year}0101"
    end_date = f"{year}1231"

    # Make the request for the current year
    particulate_aqi = request_daily_summary(request_template=request_data, begin_date=begin_date, end_date=end_date)
    
    # Check the response and append data to the list
    if particulate_aqi["Header"][0]['status'] == "Success":
        #all_gaseous_data.extend(gaseous_aqi['Data'])
        extract_particulate = extract_summary_from_response(particulate_aqi)
        #print("Summary of particulate extraction ...")
        #print(json.dumps(extract_gaseous,indent=4))
        all_particulate_data.append(extract_particulate)
        
    elif particulate_aqi["Header"][0]['status'].startswith("No data "):
        print(f"No data available for {year}.")
        
    else:
        print(f"Error in retrieving data for {year}.")

# Print or process the collected data
print("Response for the particulate pollutants ...")
print(json.dumps(all_particulate_data, indent=4))

# Now, the all_gaseous_data list contains data for each year from 1963 to 2023.


No data available for 1963.
No data available for 1964.
No data available for 1965.
No data available for 1966.
No data available for 1967.
No data available for 1968.
No data available for 1969.
No data available for 1970.
No data available for 1971.
No data available for 1972.
No data available for 1973.
No data available for 1974.
No data available for 1975.
No data available for 1976.
No data available for 1977.
No data available for 1978.
No data available for 1979.
No data available for 1980.
No data available for 1981.
No data available for 1982.
No data available for 1983.
No data available for 1984.
No data available for 1985.
Response for the particulate pollutants ...


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [59]:
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_PARTICULATES
request_data['state'] = CITY_LOCATIONS['Derby']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['Derby']['fips'][2:]

## Final List where all the data will be stored
particulate_data = []
# Years under consideration
start_year = 1963
end_year = 2023

# Fetching Data for Every Year
for year in range(start_year, end_year + 1):
    begin_date = str(year) + "0101"
    end_date = str(year) + "1231"

    # Creating template for the for the current year in the loop to fetch data
    particulate_aqi = request_daily_summary(request_template=request_data, begin_date=begin_date, end_date=end_date)
    
    # Appending data to the list if status is Success
    if particulate_aqi["Header"][0]['status'] == "Success":
        extract_particulate = extract_summary_from_response(particulate_aqi)
        particulate_data.append(extract_particulate)
    elif particulate_aqi["Header"][0]['status'].startswith("No data "):
        print(f"No data for the year : {year}.")
    else:
        print(f"Error in retrieving data for the year : {year}.")

## Printing all data fetched
print("Fetched Data")
print(json.dumps(particulate_data, indent=4))

# Storing the result in a JSON
with open("../Data/particulate_data.json", 'w') as file:
    json.dump(particulate_data, file, indent=4)
print("Data Stored as JSON")

No data for the year : 1963.
No data for the year : 1964.
No data for the year : 1965.
No data for the year : 1966.
No data for the year : 1967.
No data for the year : 1968.
No data for the year : 1969.
No data for the year : 1970.
No data for the year : 1971.
No data for the year : 1972.
No data for the year : 1973.
No data for the year : 1974.
No data for the year : 1975.
No data for the year : 1976.
No data for the year : 1977.
No data for the year : 1978.
No data for the year : 1979.
No data for the year : 1980.
No data for the year : 1981.
No data for the year : 1982.
No data for the year : 1983.
No data for the year : 1984.
No data for the year : 1985.
Fetched Data


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Data Stored as JSON


In [58]:
# Storing the result in a JSON
with open("../Data/particulate_data.json", 'w') as file:
    json.dump(all_particulate_data, file, indent=4)

In [12]:
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_GASEOUS
request_data['state'] = CITY_LOCATIONS['Derby']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['Derby']['fips'][2:]

# Initialize an empty list to store the results
all_gaseous_data = []

# Define the request template and base URL
request_template = "YOUR_REQUEST_TEMPLATE_HERE"
base_url = "YOUR_BASE_URL_HERE"

# Define the years you want to retrieve data for
start_year = 1963
end_year = 2023

# Loop through the years and request daily summary data
for year in range(start_year, end_year + 1):
    begin_date = f"{year}0101"
    end_date = f"{year}1231"

    # Make the request for the current year
    gaseous_aqi = request_daily_summary(request_template=request_data, begin_date=begin_date, end_date=end_date)
    
    # Check the response and append data to the list
    if gaseous_aqi["Header"][0]['status'] == "Success":
        #all_gaseous_data.extend(gaseous_aqi['Data'])
        extract_gaseous = extract_summary_from_response(gaseous_aqi)
        #print("Summary of particulate extraction ...")
        #print(json.dumps(extract_gaseous,indent=4))
        all_gaseous_data.append(extract_gaseous)
        
    elif gaseous_aqi["Header"][0]['status'].startswith("No data "):
        print(f"No data available for {year}.")
        
    else:
        print(f"Error in retrieving data for {year}.")

# Print or process the collected data
print("Response for the gaseous pollutants ...")
print(json.dumps(all_gaseous_data, indent=4))

# Now, the all_gaseous_data list contains data for each year from 1963 to 2023.


No data available for 1964.
No data available for 1965.
No data available for 1966.
No data available for 1967.
Response for the gaseous pollutants ...


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [60]:
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_GASEOUS
request_data['state'] = CITY_LOCATIONS['Derby']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['Derby']['fips'][2:]

## Final List where all the data will be stored
gaseous_data = []

# Years under consideration
start_year = 1963
end_year = 2023

# Fetching Data for Every Year
for year in range(start_year, end_year + 1):
    begin_date = str(year) + "0101"
    end_date = str(year) + "1231"
    
    # Creating template for the for the current year in the loop to fetch data
    gaseous_aqi = request_daily_summary(request_template=request_data, begin_date=begin_date, end_date=end_date)
    
    # Appending data to the list if status is Success
    if gaseous_aqi["Header"][0]['status'] == "Success":
        extract_gaseous = extract_summary_from_response(gaseous_aqi)
        gaseous_data.append(extract_gaseous)
        
    elif gaseous_aqi["Header"][0]['status'].startswith("No data "):
        print(f"No data for the year : {year}.")
    else:
        print(f"Error in retrieving data for the year : {year}.")

## Printing all data fetched
print("Fetched Data")
print(json.dumps(gaseous_data, indent=4))

# Storing the result in a JSON
with open("../Data/gaseous_data.json", 'w') as file:
    json.dump(gaseous_data, file, indent=4)
print("Data Stored as JSON")

No data for the year : 1964.
No data for the year : 1965.
No data for the year : 1966.
No data for the year : 1967.
Fetched Data


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Data Stored as JSON


In [13]:
# Define the output file name
output_file = "../Data/gaseous_data.json"

# Write the JSON data to the text file with an indentation of 4
with open(output_file, 'w') as file:
    json.dump(all_gaseous_data, file, indent=4)

In [61]:
def extract_aqi(data, dynamic_keys=None):
    if dynamic_keys is None:
        dynamic_keys = []

    extracted_data = []

    for key, value in data.items():
        current_keys = dynamic_keys + [key]

        if isinstance(value, dict):
            extracted_data.extend(extract_aqi(value, current_keys))
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    aqi = item.get("aqi")
                    sample_duration = item.get("sample_duration")
                    if aqi is not None:
                        extracted_data.append({"keys": current_keys,"sample_duration": sample_duration, "aqi": aqi})

    return extracted_data

In [15]:
# Initialize a list to store the extracted data
extracted_data_gaseous = []

# Iterate through the list of datasets
for dataset in all_gaseous_data:
    extracted_data_gaseous.extend(extract_aqi(dataset))

# Initialize a list to store the extracted data
extracted_data_particulate = []

# Iterate through the list of datasets
for dataset in all_particulate_data:
    extracted_data_particulate.extend(extract_aqi(dataset))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [68]:
def process_gasesous_particulate(input_df):
    processed_data = []
    for entry in input_df:
        keys = entry['keys']
        date = keys[-1]
        aqi = entry['aqi']
        sample_duration = entry['sample_duration']
        pollutant_type = keys[-3]
        processed_data.append({'Date': date, 'AQI': aqi, 'Sample_Duration': sample_duration, 'Pollutant_Type': pollutant_type})
    return processed_data

In [62]:
# Extract relevant information and create a new list of dictionaries
formatted_data = []

for entry in extracted_data_particulate:
    keys = entry['keys']
    date = keys[-1]
    aqi = entry['aqi']
    sample_duration = entry['sample_duration']
    pollutant_type = keys[-3]

    formatted_data.append({'Date': date, 'AQI': aqi, 'Sample_Duration': sample_duration, 'Pollutant_Type': pollutant_type})

# Create a DataFrame from the formatted data
particulate_df = pd.DataFrame(formatted_data)

# Convert the 'Date' column to a datetime format
particulate_df['Date'] = pd.to_datetime(particulate_df['Date'], format='%Y%m%d')
particulate_df

Unnamed: 0,Date,AQI,Sample_Duration,Pollutant_Type
0,1986-08-12,48,24 HOUR,81102
1,1986-08-18,25,24 HOUR,81102
2,1986-08-24,30,24 HOUR,81102
3,1986-08-30,28,24 HOUR,81102
4,1986-09-05,30,24 HOUR,81102
...,...,...,...,...
89105,2023-06-30,47,24-HR BLK AVG,88101
89106,2023-06-30,47,24-HR BLK AVG,88101
89107,2023-06-30,47,24-HR BLK AVG,88101
89108,2023-06-30,47,24-HR BLK AVG,88101


In [81]:
def extract_aqi2(data, dynamic_keys=None):
    if dynamic_keys is None:
        dynamic_keys = []

    extracted_data = []

    for key in data.keys():
        current_keys = dynamic_keys + [key]
        if isinstance(data[key], dict):
            extracted_data.extend(extract_aqi2(data[key], current_keys))
        elif isinstance(data[key], list):
            for item in data[key]:
                if isinstance(item, dict):
                    aqi = item["aqi"]
                    sample_duration = item["sample_duration"]
                    if aqi is not None:
                        extracted_data.append({"keys": current_keys,"sample_duration": sample_duration, "aqi": aqi})

    return extracted_data

In [82]:
# Initialize a list to store the extracted data
extracted_data_gaseous = []

# Iterate through the list of datasets
for dataset in all_gaseous_data:
    extracted_data_gaseous.extend(extract_aqi2(dataset))

# Initialize a list to store the extracted data
extracted_data_particulate = []

# Iterate through the list of datasets
for dataset in all_particulate_data:
    extracted_data_particulate.extend(extract_aqi2(dataset))

In [83]:
# Extract relevant information and create a new list of dictionaries
formatted_data = []

for entry in extracted_data_particulate:
    keys = entry['keys']
    date = keys[-1]
    aqi = entry['aqi']
    sample_duration = entry['sample_duration']
    pollutant_type = keys[-3]

    formatted_data.append({'Date': date, 'AQI': aqi, 'Sample_Duration': sample_duration, 'Pollutant_Type': pollutant_type})

# Create a DataFrame from the formatted data
particulate_df = pd.DataFrame(formatted_data)

# Convert the 'Date' column to a datetime format
particulate_df['Date'] = pd.to_datetime(particulate_df['Date'], format='%Y%m%d')
particulate_df

Unnamed: 0,Date,AQI,Sample_Duration,Pollutant_Type
0,1986-08-12,48,24 HOUR,81102
1,1986-08-18,25,24 HOUR,81102
2,1986-08-24,30,24 HOUR,81102
3,1986-08-30,28,24 HOUR,81102
4,1986-09-05,30,24 HOUR,81102
...,...,...,...,...
89105,2023-06-30,47,24-HR BLK AVG,88101
89106,2023-06-30,47,24-HR BLK AVG,88101
89107,2023-06-30,47,24-HR BLK AVG,88101
89108,2023-06-30,47,24-HR BLK AVG,88101


In [69]:
particulate_df = pd.DataFrame(process_gasesous_particulate(extracted_data_particulate))

# Convert the 'Date' column to a datetime format
particulate_df['Date'] = pd.to_datetime(particulate_df['Date'], format='%Y%m%d')
particulate_df

gaseous_df = pd.DataFrame(process_gasesous_particulate(extracted_data_gaseous))

# Convert the 'Date' column to a datetime format
gaseous_df['Date'] = pd.to_datetime(gaseous_df['Date'], format='%Y%m%d')
gaseous_df.head()

Unnamed: 0,Date,AQI,Sample_Duration,Pollutant_Type
0,1986-08-12,48,24 HOUR,81102
1,1986-08-18,25,24 HOUR,81102
2,1986-08-24,30,24 HOUR,81102
3,1986-08-30,28,24 HOUR,81102
4,1986-09-05,30,24 HOUR,81102
...,...,...,...,...
89105,2023-06-30,47,24-HR BLK AVG,88101
89106,2023-06-30,47,24-HR BLK AVG,88101
89107,2023-06-30,47,24-HR BLK AVG,88101
89108,2023-06-30,47,24-HR BLK AVG,88101


In [71]:
# Extract relevant information and create a new list of dictionaries
formatted_data = []

for entry in extracted_data_gaseous:
    keys = entry['keys']
    date = keys[-1]
    aqi = entry['aqi']
    sample_duration = entry['sample_duration']
    pollutant_type = keys[-3]

    formatted_data.append({'Date': date, 'AQI': aqi, 'Sample_Duration': sample_duration, 'Pollutant_Type': pollutant_type})

# Create a DataFrame from the formatted data
gaseous_df = pd.DataFrame(formatted_data)

# Convert the 'Date' column to a datetime format
gaseous_df['Date'] = pd.to_datetime(gaseous_df['Date'], format='%Y%m%d')
gaseous_df

Unnamed: 0,Date,AQI,Sample_Duration,Pollutant_Type
0,1971-09-20,22,8-HR RUN AVG END HOUR,42101
1,1971-09-22,26,8-HR RUN AVG END HOUR,42101
2,1971-09-23,24,8-HR RUN AVG END HOUR,42101
3,1971-09-24,35,8-HR RUN AVG END HOUR,42101
4,1971-09-25,33,8-HR RUN AVG END HOUR,42101
...,...,...,...,...
168290,2023-06-29,61,8-HR RUN AVG BEGIN HOUR,44201
168291,2023-06-29,61,8-HR RUN AVG BEGIN HOUR,44201
168292,2023-06-30,44,8-HR RUN AVG BEGIN HOUR,44201
168293,2023-06-30,44,8-HR RUN AVG BEGIN HOUR,44201


In [72]:
gaseous_df = pd.DataFrame(process_gasesous_particulate(extracted_data_gaseous))

# Convert the 'Date' column to a datetime format
gaseous_df['Date'] = pd.to_datetime(gaseous_df['Date'], format='%Y%m%d')
gaseous_df

Unnamed: 0,Date,AQI,Sample_Duration,Pollutant_Type
0,1971-09-20,22,8-HR RUN AVG END HOUR,42101
1,1971-09-22,26,8-HR RUN AVG END HOUR,42101
2,1971-09-23,24,8-HR RUN AVG END HOUR,42101
3,1971-09-24,35,8-HR RUN AVG END HOUR,42101
4,1971-09-25,33,8-HR RUN AVG END HOUR,42101
...,...,...,...,...
168290,2023-06-29,61,8-HR RUN AVG BEGIN HOUR,44201
168291,2023-06-29,61,8-HR RUN AVG BEGIN HOUR,44201
168292,2023-06-30,44,8-HR RUN AVG BEGIN HOUR,44201
168293,2023-06-30,44,8-HR RUN AVG BEGIN HOUR,44201


In [19]:
gaseous_df.to_csv("../Data/gaseous_aqi_data_processed.csv")
particulate_df.to_csv("../Data/particulate_aqi_data_processed.csv")

In [52]:
particulate_df_copy = particulate_df.copy()
gaseous_df_copy = gaseous_df.copy()
particulate_df_copy.drop(['Sample_Duration', 'Pollutant_Type'], axis=1, inplace=True)
gaseous_df_copy.drop(['Sample_Duration', 'Pollutant_Type'], axis=1, inplace=True)

In [53]:
# Group by 'Date' and calculate the mean for each group
part_avg_df = particulate_df_copy.groupby('Date').mean().reset_index(drop = False)
gas_avg_df = gaseous_df_copy.groupby('Date').mean().reset_index(drop = False)

In [54]:
# # Extract the year from the 'Date' column
# gaseous_avg_df['Year'] = gaseous_avg_df['Date'].dt.year
# particulate_avg_df['Year'] = particulate_avg_df['Date'].dt.year

# Merge the DataFrames on 'Date' and take the maximum AQI or fill with values from either DataFrame
merged_df = pd.merge(gas_avg_df, part_avg_df, on='Date', how='outer', suffixes=('_g', '_p'))
merged_df['AQI'] = merged_df[['AQI_g', 'AQI_p']].max(axis=1)
merged_df.drop(columns=['AQI_g', 'AQI_p'], inplace=True)
# Extract the year from the 'Date' column
merged_df['Year'] = merged_df['Date'].dt.year

In [84]:
# Group by 'Year' and calculate the mean AQI for each year
aqi_df = merged_df.groupby('Year')['AQI'].mean().reset_index()
aqi_df.head()

Unnamed: 0,Year,AQI
0,1971,29.473214
1,1972,25.860806
2,1973,26.41482
3,1974,47.145204
4,1975,42.049516


In [56]:
aqi_df.to_csv("../Data/final_aqi_each_year.csv")