# Air Quality Systems (AQS) Data
## Part 1: Compiling Data using the AQS api

In [2]:
import requests
import json
import time
import pandas as pd

In [2]:
# make a folder to save the data
! mkdir aqs

A subdirectory or file aqs already exists.


In [25]:
def save_file(filename, df):
    folder = 'aqs'
    extension = 'csv'
    path = '{}\{}.{}'.format(folder, filename, extension)
    df.to_csv(path)
    

In [3]:
# the AQS api: https://aqs.epa.gov/aqsweb/documents/data_api.html
# the table explains the meaning of variables used: https://aqs.epa.gov/aqsweb/documents/data_api.html#variables
# you have to sign up for the service - its super simple

email = 'taylordisom@gmail.com'
key = 'coppercrane22'
creds = 'email={}&key={}'.format(email, key)

In [213]:
# these are daily records, so only one date is needed
def get_aqs_byBox_url(date):
    ''' from the CRITERIA param class:
    42101 - Carbon monoxide
    42401 - Sulfur dioxide
    42602 - Nitrogen dioxide (NO2)
    44201 - Ozone
    81102 - PM10 Total 0-10um STP
    88101 - PM2.5 - Local Conditions ***(for some reason, the code is 88502 for Point Reyes)
    ''' # api query limits: 5 params at a time
    params = '44201,42401,42602,44201,88101'
    parameters = 'param={}'.format(params)

    # the start and end dates for the request in the format yyyymmdd
    start_date = date
    end_date = date
    date_range = 'bdate={}&edate={}'.format(start_date, end_date)

    # make a box (lat and long boundaries) to confine results,  (commented after 2 decimal places)
    min_lat = 36.47 #4307 #the most south
    max_lat = 38.52 #2384 # the most north
    min_long = -123.41 #3342 # the most west / least east
    max_long = -121.33 #1310 #the most east
    box_bounds = 'minlat={}&maxlat={}&minlon={}&maxlon={}'.format(min_lat, max_lat, min_long, max_long)

    byBox_query_base = 'https://aqs.epa.gov/data/api/dailyData/byBox'

    # this is a sample query
    # https://aqs.epa.gov/data/api/dailyData/byBox?email=test@aqs.api&key=test&param=44201&bdate=20150501&edate=20150502&minlat=33.3&maxlat=33.6&minlon=-87.0&maxlon=-86.7

    # modularized the query
    url = '{}?{}&{}&{}&{}'.format(byBox_query_base, creds, parameters, date_range, box_bounds)
    return url


# You can make larger requests (may be penalized, but it works) - the start and end dates don't need to be the same day
# https://aqs.epa.gov/data/api/dailyData/byState?email=test@aqs.api&key=test&param=88101,88502&bdate=20150101&edate=20150201&state=06

In [5]:
# 
def request_aqs_data(date):
    url = get_aqs_byBox_url(date)
    response = requests.get(url, stream=True)
    if response.status_code == requests.codes.ok:
        return response
    return None

In [6]:
def get_dates_for_range(start_date, end_date):
    date_range = pd.date_range(start=start_date, end=end_date)
    dates = [str(timestamp.date()).replace('-','') for timestamp in date_range]
    return dates

In [30]:
# This is unnecessary, you can pull multiple days worth of daily data in a single request, you just have to have the year be the same in the start and end date
# TODO - change the queries to pull a years' worth of data instead of a day.
# pull the data from 2015 til now every 5 seconds (request usage limits)
# start_date = '19800101' 
start_date = '20150101' # Got aconnection reset error, starting over from the last date
# start_date = '20160505' # for testing
end_date = '20201203'
# end_date = '20160505' # for testing
dates = get_dates_for_range(start_date, end_date)
for date in dates:
    response = request_aqs_data(date)
    if response:
        json_response = response.json()
        if json_response['Data']:
            data_dict = json_response['Data']
            df = pd.DataFrame.from_dict(data = data_dict)
            save_file(date, df)
    time.sleep(5)


In [None]:
# the next step is to compile these daily files into a single file for each year
# the same can be done for each decade

In [None]:
# there are pregenereated data files: https://aqs.epa.gov/aqsweb/airdata/download_files.html
# you can use the county code to filter the data


In [5]:
import os

path = r'C:\Users\taylo\Documents\GitHub\DS4A\eda_notebooks\aqs'
files = os.listdir(path)
# print(files)

In [11]:
# 
with open('aqs_data.csv', 'a') as comp_file:
    
    # add the header
    header = ',state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration,pollutant_standard,date_local,units_of_measure,event_type,observation_count,observation_percent,validity_indicator,arithmetic_mean,first_max_value,first_max_hour,aqi,method_code,method,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change\n'
    comp_file.write(header)
    
    for file in files:
        path = 'aqs\\'
        with open(path+file) as data_file:
            data = data_file.read()
        comp_file.write(data)
    

## Part 2: Dataset Details
https://aqs.epa.gov/aqsweb/airdata/FileFormats.html

1. index: a unique identifier for the record / reading
2. state_code: The FIPS code of the state in which the monitor resides. The numeric code for the state where the reading was observed (alphabetically, California is '06')
3. county_code: The FIPS code of the county in which the monitor resides.The numeric code for the county where the reading was observed
4. site_number: A unique number within the county identifying the site.
5. parameter_code: The AQS code corresponding to the parameter measured by the monitor.
6. poc: This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.
7. latitude: The monitoring site’s angular distance north of the equator measured in decimal degrees.
8. longitude: The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.
9. datum: The Datum associated with the Latitude and Longitude measures.
10. parameter: The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.
11. sample_duration: The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).
12. pollutant_standard: A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)
13. date_local: The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.
14. units_of_measure: The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.
15. event_type: Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.
16. observation_count: The number of observations (samples) taken during the day.
17. observation_percent: The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters).
18. validity_indicator: An indication of whether the regulatory data completeness criteria for valid summary data have been met by the monitor for the year. Y means yes, N means no or that there are no regulatory completeness criteria for the parameter.
19. arithmetic_mean: The average (arithmetic mean) value for the day.
20. first_max_value: The highest value for the day.
21. first_max_hour: The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.
22. aqi: The Air Quality Index for the day for the pollutant, if applicable.
23. method_code: An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.
24. method: A short description of the processes, equipment, and protocols used in gathering and measuring the sample.
25. local_site_name: The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.
26. site_address: The approximate street address of the monitoring site.
27. state: The name of the state where the monitoring site is located.
28. county: The name of the county where the monitoring site is located.
29. city: The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.
30. cbsa_code: The code (ZIP Code) of the core bases statistical area (metropolitan area) where the monitoring site is located.
31. cbsa: The name of the core bases statistical area (metropolitan area) where the monitoring site is located.
32. date_of_last_change: The date the last time any numeric values in this record were updated in the AQS data system.


### Data

There is 40 years worth of data in the form of daily data that you can request from AQS. You can also find more granular data (hourly), but it seems like it would be overkill since our other data cannot also have such granularity. The daily data is about 70 KB per file/day. with 40 years of data, that's about 14,610 days and 1 GB of data (1 million KB). This is only pulling data within the set bounds of a defined lat/long box. we could also search by county, but I believe you cannot specify multiple counties in a single request. since there are usage limits for the size and frequency of requests, we decided to request data with the bounds of a box such that all of the counties we are interested in lie within the box.

In [11]:
# TODO add a small request to show what the data looks like

day = '20180405'
response = request_aqs_data(day)
if response:
    json_response = response.json()
    if json_response['Data']:
        data_dict = json_response['Data']
        df = pd.DataFrame.from_dict(data = data_dict)
df

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration,...,method_code,method,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,06,085,0005,88101,1,37.348497,-121.894898,WGS84,PM2.5 - Local Conditions,24 HOUR,...,145,R & P Model 2025 PM-2.5 Sequential Air Sampler...,San Jose - Jackson,158B JACKSON ST,California,Santa Clara,San Jose,41940,"San Jose-Sunnyvale-Santa Clara, CA",2019-04-03
1,06,085,0005,88101,1,37.348497,-121.894898,WGS84,PM2.5 - Local Conditions,24 HOUR,...,145,R & P Model 2025 PM-2.5 Sequential Air Sampler...,San Jose - Jackson,158B JACKSON ST,California,Santa Clara,San Jose,41940,"San Jose-Sunnyvale-Santa Clara, CA",2019-04-03
2,06,085,0005,88101,1,37.348497,-121.894898,WGS84,PM2.5 - Local Conditions,24 HOUR,...,145,R & P Model 2025 PM-2.5 Sequential Air Sampler...,San Jose - Jackson,158B JACKSON ST,California,Santa Clara,San Jose,41940,"San Jose-Sunnyvale-Santa Clara, CA",2019-04-03
3,06,085,0005,88101,1,37.348497,-121.894898,WGS84,PM2.5 - Local Conditions,24 HOUR,...,145,R & P Model 2025 PM-2.5 Sequential Air Sampler...,San Jose - Jackson,158B JACKSON ST,California,Santa Clara,San Jose,41940,"San Jose-Sunnyvale-Santa Clara, CA",2019-04-03
4,06,013,1002,44201,1,38.006311,-121.641918,WGS84,Ozone,8-HR RUN AVG BEGIN HOUR,...,047,INSTRUMENTAL - ULTRA VIOLET,Bethel Island,5551 BETHEL ISLAND RD,California,Contra Costa,Bethel Island,41860,"San Francisco-Oakland-Hayward, CA",2019-01-28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,06,053,0002,88101,3,36.481870,-121.733330,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Carmel Valley,35 Ford Road,California,Monterey,Carmel Valley Village,41500,"Salinas, CA",2019-02-08
301,06,001,0013,88101,3,37.864767,-122.302741,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Berkeley Aquatic Park,1 Bolivar Dr,California,Alameda,Not in a city,41860,"San Francisco-Oakland-Hayward, CA",2019-01-28
302,06,087,1005,88101,3,37.063150,-122.083092,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,San Lorenzo Valley Middle School,"7179 Hacienda Way, Felton CA 95018",California,Santa Cruz,Not in a city,42100,"Santa Cruz-Watsonville, CA",2019-02-08
303,06,067,0011,42602,1,38.302591,-121.420838,WGS84,Nitrogen dioxide (NO2),1 HOUR,...,200,Teledyne-API Model 200EUP or T200UP - Photolyt...,Elk Grove-Bruceville,"12490 BRUCEVILLE RD, ELK GROVE, CA",California,Sacramento,Not in a city,40900,"Sacramento--Roseville--Arden-Arcade, CA",2019-04-03


In [1]:
import pandas as pd

In [222]:

filename = 'aqs_data.csv'
df = pd.read_csv(filename)
# df

In [223]:
# remove the headers that are rows
false_header_indices = df.where(df['state_code'] == 'state_code').dropna(subset=['state_code']).index
false_header_indices
df = df.drop(false_header_indices)
# df = df.drop_duplicates() #there are no dupes
# df

In [224]:
df.columns

Index(['Unnamed: 0', 'state_code', 'county_code', 'site_number',
       'parameter_code', 'poc', 'latitude', 'longitude', 'datum', 'parameter',
       'sample_duration', 'pollutant_standard', 'date_local',
       'units_of_measure', 'event_type', 'observation_count',
       'observation_percent', 'validity_indicator', 'arithmetic_mean',
       'first_max_value', 'first_max_hour', 'aqi', 'method_code', 'method',
       'local_site_name', 'site_address', 'state', 'county', 'city',
       'cbsa_code', 'cbsa', 'date_of_last_change'],
      dtype='object')

In [20]:
# df['county'].unique()
# df['cbsa'].unique()
# df['city'].unique()

array(['Vallejo-Fairfield, CA', 'San Francisco-Oakland-Hayward, CA',
       'San Jose-Sunnyvale-Santa Clara, CA', 'Napa, CA', 'Santa Rosa, CA',
       'Salinas, CA', 'Stockton-Lodi, CA', 'Santa Cruz-Watsonville, CA',
       'Sacramento--Roseville--Arden-Arcade, CA'], dtype=object)

In [35]:
col_name = 'site_number'
# df[col_name].describe()
len(df[col_name].unique())


33

In [None]:
# add a zip code 
# ***not doing this at this time

In [229]:
# remove entries before 2015

str_dates = df['date_local']
df['date_local'] = pd.to_datetime(str_dates)

filter_date = '2014-12-31'
str(df['date_local'][1]) > filter_date
df_15_20 = df.where(df['date_local'] > filter_date).dropna(subset=['date_local'])


In [19]:
# Monterey, San Joaquin, Santa Cruz, San Benito, Sacramento not in the Bay Area csv, will remove those entries with the county as the previously listed values

In [226]:
# remove columns unnamed (previous index)
# columns_to_drop = ['Unnamed: 0', 'state_code', 'poc', 'datum', 'sample_duration', 'pollutant_standard', 'units_of_measure', 'event_type', 'observation_count', 'observation_percent', 'validity_indicator', 'method_code', 'method', 'site_address', 'state', 'cbsa_code', 'cbsa', 'date_of_last_change']
# columns_to_drop = ['Unnamed: 0', 'state_code',  'datum', 'pollutant_standard', 'units_of_measure',  'observation_count', 'event_type', 'observation_percent', 'validity_indicator', 'method_code', 'method',  'site_address', 'state', 'cbsa_code', 'cbsa', 'date_of_last_change']
columns_to_drop = ['Unnamed: 0', 'state_code', 'county_code', 'site_number', 'parameter_code', 'datum', 'event_type', 'units_of_measure', 'event_type', 'observation_count', 'observation_percent', 'validity_indicator', 'method_code', 'method',  'site_address', 'state', 'cbsa_code', 'cbsa', 'date_of_last_change'] #  - testing what is the difference in the rows with the same date, county, site, sample duration, pollutant, and poc

# df_clean = df_15_20.drop(columns=columns_to_drop)
# df_clean

In [230]:
df_15_20['date_local']

2613232   2015-01-01
2613233   2015-01-01
2613234   2015-01-01
2613235   2015-01-01
2613236   2015-01-01
             ...    
3201066   2020-09-29
3201068   2020-09-30
3201069   2020-09-30
3201070   2020-09-30
3201071   2020-09-30
Name: date_local, Length: 585741, dtype: datetime64[ns]

In [231]:
# check which sites I am getting

max_sites = df_15_20['local_site_name'].unique()

In [232]:
print(sorted(max_sites))
print(sorted(data_sites)) # will need to run the code at the bottom of the notebook

# the max data is missing data from only this site: Point Reyes NS Ranger Station



['Berkeley Aquatic Park', 'Bethel Island', 'Carmel Valley', 'Concord', 'Crockett - Kendall Ave', 'Elk Grove-Bruceville', 'Fairfield', 'Gilroy', 'Hayward', 'Hollister', 'Laney College', 'Livermore', 'Los Gatos', 'Martinez - Jones St.', 'Napa', 'Napa Valley College', 'Oakland', 'Oakland West', 'Patterson Pass', 'Pleasanton - Owens Ct', 'Redwood City', 'Richmond - 7th St.', 'Salinas 3', 'San Francisco', 'San Jose - Jackson', 'San Jose - Knox Avenue', 'San Lorenzo Valley Middle School', 'San Martin', 'San Pablo', 'San Rafael', 'San Ramon', 'Santa Cruz', 'Sebastopol', 'Tracy-Airport', 'Vacaville-Ulatis Drive', 'Vallejo']
['Berkeley Aquatic Park', 'Concord', 'Gilroy', 'Laney College', 'Livermore', 'Napa', 'Napa Valley College', 'Oakland', 'Oakland West', 'Pleasanton - Owens Ct', 'Point Reyes NS Ranger Station', 'Redwood City', 'San Francisco', 'San Jose - Jackson', 'San Jose - Knox Avenue', 'San Pablo', 'San Rafael', 'Sebastopol', 'Vallejo']


In [233]:
# can I use the api to pull data from the site: Point Reyes (from the map, it seems like it should be within the bounds of the box)

# the lat long is 38.122979 -122.90944

#     min_lat = 36.47 #4307 #the most south
#     max_lat = 38.52 #2384 # the most north
#     min_long = -123.41 #3342 # the most west / least east
#     max_long = -121.33 #1310 #the most east

# https://aqs.epa.gov/data/api/list/sitesByCounty?email=test@aqs.api&key=test&state=06&county=041
# the site lies within the bounds of the box, so we'll try to pull from that site specifically site number for Point Reyes is 0002

# from the other dataset, it seems that the earliest reading from that site is 01/09/2015
# the reason that Point Reyes was not pulled in the original requests was because it uses a different parameter code for pm 2.5, 88502.
# it is from the parameter class 'SPECIATION' (https://aqs.epa.gov/data/api/list/parametersByClass?email=test@aqs.api&key=test&pc=SPECIATION)
# this request pulls the data from that site: https://aqs.epa.gov/data/api/monitors/bySite?email=test@aqs.api&key=test&param=88502&bdate=20150501&edate=20150501&state=06&county=041&site=0002
# will pull that data and concat it later

# filtering out the sites that aren't used in the other dataset (will need the code at the bottom of the notebook) - just Bay Area sites
df_15_20_BA = df_15_20.where(df_15_20['local_site_name'].isin(data_sites)).dropna(subset=['local_site_name'])
# df_15_20_BA

In [234]:
# pull the point reyes data
# https://aqs.epa.gov/data/api/dailyData/bySite?email=test@aqs.api&key=test&param=88502&bdate=20150101&edate=20151231&state=06&county=041&site=0002
# The site point reyes does not have a 1 hour sample duration, thus they do not have the max value for each day
# this site will not be included in this dataset


In [235]:
# exclude the parameters (pollutants) that are not pm 2.5
# df_15_20_BA['parameter'].unique()
df_15_20_BA = df_15_20_BA.where(df_15_20_BA['parameter'] == 'PM2.5 - Local Conditions').dropna(subset=['parameter'])

In [236]:

df_clean = df_15_20_BA.drop(columns=columns_to_drop)


In [237]:
# need to remove the samples that are not '24-HR BLK AVG'
# only the '1 Hour' sample durations have a 'first max hour' that is non-zero, but their aqi is NaN

In [238]:
# df_clean.set_index(['date_local', 'county', 'city']).head(60)
# df_clean.set_index(['date_local', 'county', 'city']).shape

In [239]:
# the other dataset has this value:
# ,Date,Source,Site ID,POC,Daily Mean PM2.5 Concentration,UNITS,DAILY_AQI_VALUE,Site Name,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,AQS_PARAMETER_DESC,CBSA_CODE,CBSA_NAME,STATE_CODE,STATE,COUNTY_CODE,COUNTY,SITE_LATITUDE,SITE_LONGITUDE
# 41221,01/02/2015,AQS,60950004.0,3.0,30.2,ug/m3 LC,89.0,Vallejo,1.0,100.0,88101.0,PM2.5 - Local Conditions,46700.0,"Vallejo-Fairfield, CA",6.0,California,95.0,Solano,38.102507,-122.237976

# on this same day, there are readings of 30.9 and 30.2 for poc 4 and 3, respectively
# the other dataset uses poc 3 and has a mean of 30.2

# not sure if they use the lower poc value for the daily data

# the '1 hour' sample duration data seems to have the working max values and the hour when it first occurs, but it does not have the aqi - however, the other dataset does have it.
# we will have to be careful to use the same poc as the oter dataset


# LOOK AT EVENT TYPE FOR WILDFIRE
# df_clean['event_type'].unique()
# df_clean.where(df_clean['event_type'] != 'None').dropna(subset=['event_type'])

# Gilroy is the only site in the Bay Area that seems to use the 'event type', and only for 2 days: 2015-06-30 & 2015-07-01
# seemed promising, but it is not worth keeping this column

In [240]:
# there seems to be duplicates in the data, Let's add all of the columns back to see if there are diffs in the dupes
# calling drop_duplicates changes the df shape from 585484 to 457675
# going to add a dropdupes() call earlier in the notebook (before I drop the unwanted columns)
# dropping the duplicates before dropping the columns does not change the shape (thus, we have lost distinguishing data from the dropped columns)

In [241]:
# df_clean.set_index(['date_local', 'county', 'city']).drop_duplicates().shape

In [242]:
# the difference is in the value of the pollutant standard: There is an aggregated value (mean) for each standard - but the aggregation ends up being the same
# in other words, we can drop that column and remove the duplicates
df_clean['pollutant_standard'].unique()

array(['PM25 Annual 2012', 'PM25 24-hour 2012', nan, 'PM25 Annual 2006',
       'PM25 24-hour 2006'], dtype=object)

In [243]:
# drop the polluntant standard, parameter, aqi (they will all be NaN), and remove the duplicates
df_cleanest = df_clean.drop(columns=['pollutant_standard', 'parameter', 'aqi']).drop_duplicates()
df_cleanest.head(16)

Unnamed: 0,poc,latitude,longitude,sample_duration,date_local,arithmetic_mean,first_max_value,first_max_hour,local_site_name,county,city
2613384,3,37.936013,-122.026154,24-HR BLK AVG,2015-01-01,23.1,23.1,0,Concord,Contra Costa,Concord
2613386,4,38.102507,-122.237976,24-HR BLK AVG,2015-01-01,25.5,25.5,0,Vallejo,Solano,Vallejo
2613387,3,38.102507,-122.237976,24-HR BLK AVG,2015-01-01,24.5,24.5,0,Vallejo,Solano,Vallejo
2613388,3,37.793624,-122.263376,24-HR BLK AVG,2015-01-01,10.7,10.7,0,Laney College,Alameda,Oakland
2613389,3,37.814781,-122.282347,24-HR BLK AVG,2015-01-01,13.0,13.0,0,Oakland West,Alameda,Oakland
2613390,3,37.743065,-122.169935,24-HR BLK AVG,2015-01-01,13.4,13.4,0,Oakland,Alameda,Oakland
2613391,3,37.482934,-122.20337,24-HR BLK AVG,2015-01-01,14.7,14.7,0,Redwood City,San Mateo,Redwood City
2613392,3,37.765946,-122.399044,24-HR BLK AVG,2015-01-01,13.9,13.9,0,San Francisco,San Francisco,San Francisco
2613393,3,37.97231,-122.520004,24-HR BLK AVG,2015-01-01,23.9,23.9,0,San Rafael,Marin,San Rafael
2613394,3,36.999571,-121.574684,24-HR BLK AVG,2015-01-01,18.5,18.5,0,Gilroy,Santa Clara,Gilroy


In [244]:
# filter out the 24 hour data and keep the 1 hour data that has the max values of the day
df_cleanest = df_cleanest.where(df_cleanest['sample_duration'] == '1 HOUR').dropna(subset=['sample_duration'])

# drop the sample_duration since, it is all the same
df_cleanest.drop(columns=['sample_duration'], inplace=True)
# df_cleanest.head(16)

In [245]:
df_cleanest.set_index(['date_local', 'county', 'city']).head(16)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,poc,latitude,longitude,arithmetic_mean,first_max_value,first_max_hour,local_site_name
date_local,county,city,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-01,Contra Costa,Concord,3,37.936013,-122.026154,23.125,42.0,22,Concord
2015-01-01,Solano,Vallejo,4,38.102507,-122.237976,25.583333,59.0,21,Vallejo
2015-01-01,Solano,Vallejo,3,38.102507,-122.237976,24.583333,62.0,21,Vallejo
2015-01-01,Alameda,Oakland,3,37.793624,-122.263376,10.75,42.0,21,Laney College
2015-01-01,Alameda,Oakland,3,37.814781,-122.282347,13.0,35.0,21,Oakland West
2015-01-01,Alameda,Oakland,3,37.743065,-122.169935,13.458333,28.0,1,Oakland
2015-01-01,San Mateo,Redwood City,3,37.482934,-122.20337,14.708333,40.0,0,Redwood City
2015-01-01,San Francisco,San Francisco,3,37.765946,-122.399044,13.916667,32.0,20,San Francisco
2015-01-01,Marin,San Rafael,3,37.97231,-122.520004,23.916667,55.0,21,San Rafael
2015-01-01,Santa Clara,Gilroy,3,36.999571,-121.574684,18.5,38.0,0,Gilroy


In [246]:
# should we do something about the poc column? 
df_cleanest['poc'].unique()

array(['3', '4'], dtype=object)

In [247]:
# the other dataset keeps the differing poc values, so Iwill leave it in for now
# could append the 4 from poc to the site name and drop the column (need to look into the 1 value from point reyes)

In [248]:
# I'm going to drop lat and long since we can use a dict later if we need the lat long for a site
# the city is in the site name, so it could be dropped later - I will keep it until the zip code or another identifier replaces it
df_cleanest.drop(columns=['latitude', 'longitude'], inplace=True)

In [249]:
df_cleanest.set_index(['date_local', 'county', 'local_site_name']).to_csv("Bay Area Max PM 2.5 Data.csv")

In [None]:
# ##################################################################

In [None]:
# cleaner approach below

In [63]:
# https://www.epa.gov/outdoor-air-quality-data/download-daily-data using PM 2.5, years 2015-2020, California, and All-Sites 
df15 = pd.read_csv('cali2015.csv')
df16 = pd.read_csv('cali2016.csv')
df17 = pd.read_csv('cali2017.csv')
df18 = pd.read_csv('cali2018.csv')
df19 = pd.read_csv('cali2019.csv')
df20 = pd.read_csv('cali2020.csv')

In [71]:
all_cali_data = pd.concat([df15,df16, df17, df18, df19, df20])

In [74]:
counties_wanted = ['Alameda', 'Contra Costa', 'Marin', 'Napa', 'San Francisco', 'San Mateo', 'Santa Clara', 'Solano', 'Sonoma']
counties_wanted

['Alameda',
 'Contra Costa',
 'Marin',
 'Napa',
 'San Francisco',
 'San Mateo',
 'Santa Clara',
 'Solano',
 'Sonoma']

In [83]:
wanted_data = all_cali_data.where(all_cali_data['COUNTY'].isin(counties_wanted))
wanted_data['COUNTY'].unique()

array(['Alameda', nan, 'Contra Costa', 'Marin', 'Napa', 'San Francisco',
       'San Mateo', 'Santa Clara', 'Solano', 'Sonoma'], dtype=object)

In [86]:
data = wanted_data.dropna(subset=['COUNTY'])

In [88]:
data.to_csv('cali_data_2015-2020.csv')

In [69]:


# look at counties and get the max value of the maxes for each county

In [98]:
data_sites = data['Site Name'].unique()
data_sites

array(['Livermore', 'Oakland', 'Oakland West', 'Laney College', 'Concord',
       'San Pablo', 'San Rafael', 'Point Reyes NS Ranger Station', 'Napa',
       'San Francisco', 'Redwood City', 'Gilroy', 'San Jose - Jackson',
       'San Jose - Knox Avenue', 'Vallejo', 'Sebastopol',
       'Berkeley Aquatic Park', 'Pleasanton - Owens Ct',
       'Napa Valley College'], dtype=object)

In [None]:
data.where(data['Site Name'] == 'Point Reyes NS Ranger Station').dropna(subset=['Site Name'])

In [217]:
data['POC'].unique()
data.where(data['POC'] == 4.).dropna(subset=['POC']).head()

Unnamed: 0,Date,Source,Site ID,POC,Daily Mean PM2.5 Concentration,UNITS,DAILY_AQI_VALUE,Site Name,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,AQS_PARAMETER_DESC,CBSA_CODE,CBSA_NAME,STATE_CODE,STATE,COUNTY_CODE,COUNTY,SITE_LATITUDE,SITE_LONGITUDE
41557,01/01/2015,AQS,60950004.0,4.0,25.5,ug/m3 LC,79.0,Vallejo,1.0,100.0,88101.0,PM2.5 - Local Conditions,46700.0,"Vallejo-Fairfield, CA",6.0,California,95.0,Solano,38.102507,-122.237976
41558,01/02/2015,AQS,60950004.0,4.0,30.9,ug/m3 LC,91.0,Vallejo,1.0,100.0,88101.0,PM2.5 - Local Conditions,46700.0,"Vallejo-Fairfield, CA",6.0,California,95.0,Solano,38.102507,-122.237976
41559,01/03/2015,AQS,60950004.0,4.0,38.7,ug/m3 LC,109.0,Vallejo,1.0,100.0,88101.0,PM2.5 - Local Conditions,46700.0,"Vallejo-Fairfield, CA",6.0,California,95.0,Solano,38.102507,-122.237976
41560,01/04/2015,AQS,60950004.0,4.0,33.0,ug/m3 LC,95.0,Vallejo,1.0,100.0,88101.0,PM2.5 - Local Conditions,46700.0,"Vallejo-Fairfield, CA",6.0,California,95.0,Solano,38.102507,-122.237976
41561,01/05/2015,AQS,60950004.0,4.0,28.1,ug/m3 LC,85.0,Vallejo,1.0,100.0,88101.0,PM2.5 - Local Conditions,46700.0,"Vallejo-Fairfield, CA",6.0,California,95.0,Solano,38.102507,-122.237976


In [212]:
# After finding out that Point Reyes uses parameter code 88502 instead of 88101, I want to see all of the parameter codes in the dataset
data['AQS_PARAMETER_CODE'].unique()

array([88101., 88502.])