# Air Quality Systems (AQS) Data
## Part 1: Compiling Data using the AQS api

In [2]:
import requests
import json
import time
import pandas as pd

In [2]:
# make a folder to save the data
! mkdir aqs

A subdirectory or file aqs already exists.


In [25]:
def save_file(filename, df):
    folder = 'aqs'
    extension = 'csv'
    path = '{}\{}.{}'.format(folder, filename, extension)
    df.to_csv(path)
    

In [3]:
# the AQS api: https://aqs.epa.gov/aqsweb/documents/data_api.html
# the table explains the meaning of variables used: https://aqs.epa.gov/aqsweb/documents/data_api.html#variables
# you have to sign up for the service - its super simple

email = 'taylordisom@gmail.com'
key = 'coppercrane22'
creds = 'email={}&key={}'.format(email, key)

In [4]:
# these are daily records, so only one date is needed
def get_aqs_byBox_url(date):
    ''' from the CRITERIA param class:
    42101 - Carbon monoxide
    42401 - Sulfur dioxide
    42602 - Nitrogen dioxide (NO2)
    44201 - Ozone
    81102 - PM10 Total 0-10um STP
    88101 - PM2.5 - Local Conditions
    ''' # api query limits: 5 params at a time
    params = '44201,42401,42602,44201,88101'
    parameters = 'param={}'.format(params)

    # the start and end dates for the request in the format yyyymmdd
    start_date = date
    end_date = date
    date_range = 'bdate={}&edate={}'.format(start_date, end_date)

    # make a box (lat and long boundaries) to confine results,  (commented after 2 decimal places)
    min_lat = 36.47 #4307 #the most south
    max_lat = 38.52 #2384 # the most north
    min_long = -123.41 #3342 # the most west / least east
    max_long = -121.33 #1310 #the most east
    box_bounds = 'minlat={}&maxlat={}&minlon={}&maxlon={}'.format(min_lat, max_lat, min_long, max_long)

    byBox_query_base = 'https://aqs.epa.gov/data/api/dailyData/byBox'

    # this is a sample query
    # https://aqs.epa.gov/data/api/dailyData/byBox?email=test@aqs.api&key=test&param=44201&bdate=20150501&edate=20150502&minlat=33.3&maxlat=33.6&minlon=-87.0&maxlon=-86.7

    # modularized the query
    url = '{}?{}&{}&{}&{}'.format(byBox_query_base, creds, parameters, date_range, box_bounds)
    return url

In [5]:
# 
def request_aqs_data(date):
    url = get_aqs_byBox_url(date)
    response = requests.get(url, stream=True)
    if response.status_code == requests.codes.ok:
        return response
    return None

In [6]:
def get_dates_for_range(start_date, end_date):
    date_range = pd.date_range(start=start_date, end=end_date)
    dates = [str(timestamp.date()).replace('-','') for timestamp in date_range]
    return dates

In [7]:
# pull the data from 1980 til now every 5 seconds (request usage limits)
start_date = '19800101'
# start_date = '20160505' # for testing
end_date = '20201203'
# end_date = '20160505' # for testing
dates = get_dates_for_range(start_date, end_date)
for date in dates:
    response = request_aqs_data(date)
    if response:
        json_response = response.json()
        if json_response['Data']:
            data_dict = json_response['Data']
            df = pd.DataFrame.from_dict(data = data_dict)
            save_file(date, df)
    time.sleep(5)


NameError: name 'save_file' is not defined

In [None]:
# the next step is to compile these daily files into a single file for each year
# the same can be done for each decade

In [None]:
# there are pregenereated data files: https://aqs.epa.gov/aqsweb/airdata/download_files.html
# you can use the county code to filter the data


## Part 2: Dataset Details
https://aqs.epa.gov/aqsweb/airdata/FileFormats.html

1. index: a unique identifier for the record / reading
2. state_code: The FIPS code of the state in which the monitor resides. The numeric code for the state where the reading was observed (alphabetically, California is '06')
3. county_code: The FIPS code of the county in which the monitor resides.The numeric code for the county where the reading was observed
4. site_number: A unique number within the county identifying the site.
5. parameter_code: The AQS code corresponding to the parameter measured by the monitor.
6. poc: This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.
7. latitude: The monitoring site’s angular distance north of the equator measured in decimal degrees.
8. longitude: The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.
9. datum: The Datum associated with the Latitude and Longitude measures.
10. parameter: The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.
11. sample_duration: The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).
12. pollutant_standard: A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)
13. date_local: The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.
14. units_of_measure: The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.
15. event_type: Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.
16. observation_count: The number of observations (samples) taken during the day.
17. observation_percent: The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters).
18. validity_indicator: An indication of whether the regulatory data completeness criteria for valid summary data have been met by the monitor for the year. Y means yes, N means no or that there are no regulatory completeness criteria for the parameter.
19. arithmetic_mean: The average (arithmetic mean) value for the day.
20. first_max_value: The highest value for the day.
21. first_max_hour: The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.
22. aqi: The Air Quality Index for the day for the pollutant, if applicable.
23. method_code: An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.
24. method: A short description of the processes, equipment, and protocols used in gathering and measuring the sample.
25. local_site_name: The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.
26. site_address: The approximate street address of the monitoring site.
27. state: The name of the state where the monitoring site is located.
28. county: The name of the county where the monitoring site is located.
29. city: The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.
30. cbsa_code: The code (ZIP Code) of the core bases statistical area (metropolitan area) where the monitoring site is located.
31. cbsa: The name of the core bases statistical area (metropolitan area) where the monitoring site is located.
32. date_of_last_change: The date the last time any numeric values in this record were updated in the AQS data system.


### Data

There is 40 years worth of data in the form of daily data that you can request from AQS. You can also find more granular data (hourly), but it seems like it would be overkill since our other data cannot also have such granularity. The daily data is about 70 KB per file/day. with 40 years of data, that's about 14,610 days and 1 GB of data (1 million KB). This is only pulling data within the set bounds of a defined lat/long box. we could also search by county, but I believe you cannot specify multiple counties in a single request. since there are usage limits for the size and frequency of requests, we decided to request data with the bounds of a box such that all of the counties we are interested in lie within the box.

In [11]:
# TODO add a small request to show what the data looks like

day = '20180405'
response = request_aqs_data(day)
if response:
    json_response = response.json()
    if json_response['Data']:
        data_dict = json_response['Data']
        df = pd.DataFrame.from_dict(data = data_dict)
df

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration,...,method_code,method,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,06,085,0005,88101,1,37.348497,-121.894898,WGS84,PM2.5 - Local Conditions,24 HOUR,...,145,R & P Model 2025 PM-2.5 Sequential Air Sampler...,San Jose - Jackson,158B JACKSON ST,California,Santa Clara,San Jose,41940,"San Jose-Sunnyvale-Santa Clara, CA",2019-04-03
1,06,085,0005,88101,1,37.348497,-121.894898,WGS84,PM2.5 - Local Conditions,24 HOUR,...,145,R & P Model 2025 PM-2.5 Sequential Air Sampler...,San Jose - Jackson,158B JACKSON ST,California,Santa Clara,San Jose,41940,"San Jose-Sunnyvale-Santa Clara, CA",2019-04-03
2,06,085,0005,88101,1,37.348497,-121.894898,WGS84,PM2.5 - Local Conditions,24 HOUR,...,145,R & P Model 2025 PM-2.5 Sequential Air Sampler...,San Jose - Jackson,158B JACKSON ST,California,Santa Clara,San Jose,41940,"San Jose-Sunnyvale-Santa Clara, CA",2019-04-03
3,06,085,0005,88101,1,37.348497,-121.894898,WGS84,PM2.5 - Local Conditions,24 HOUR,...,145,R & P Model 2025 PM-2.5 Sequential Air Sampler...,San Jose - Jackson,158B JACKSON ST,California,Santa Clara,San Jose,41940,"San Jose-Sunnyvale-Santa Clara, CA",2019-04-03
4,06,013,1002,44201,1,38.006311,-121.641918,WGS84,Ozone,8-HR RUN AVG BEGIN HOUR,...,047,INSTRUMENTAL - ULTRA VIOLET,Bethel Island,5551 BETHEL ISLAND RD,California,Contra Costa,Bethel Island,41860,"San Francisco-Oakland-Hayward, CA",2019-01-28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,06,053,0002,88101,3,36.481870,-121.733330,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Carmel Valley,35 Ford Road,California,Monterey,Carmel Valley Village,41500,"Salinas, CA",2019-02-08
301,06,001,0013,88101,3,37.864767,-122.302741,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Berkeley Aquatic Park,1 Bolivar Dr,California,Alameda,Not in a city,41860,"San Francisco-Oakland-Hayward, CA",2019-01-28
302,06,087,1005,88101,3,37.063150,-122.083092,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,San Lorenzo Valley Middle School,"7179 Hacienda Way, Felton CA 95018",California,Santa Cruz,Not in a city,42100,"Santa Cruz-Watsonville, CA",2019-02-08
303,06,067,0011,42602,1,38.302591,-121.420838,WGS84,Nitrogen dioxide (NO2),1 HOUR,...,200,Teledyne-API Model 200EUP or T200UP - Photolyt...,Elk Grove-Bruceville,"12490 BRUCEVILLE RD, ELK GROVE, CA",California,Sacramento,Not in a city,40900,"Sacramento--Roseville--Arden-Arcade, CA",2019-04-03
