# Air Quality Systems (AQS) Data
## Part 1: Compiling Data using the AQS api

In [1]:
import requests
import json
import time
import pandas as pd

In [2]:
# the AQS api: https://aqs.epa.gov/aqsweb/documents/data_api.html
# the table explains the meaning of variables used: https://aqs.epa.gov/aqsweb/documents/data_api.html#variables
# you have to sign up for the service - its super simple

email = 'taylordisom@gmail.com'
key = 'coppercrane22'
creds = 'email={}&key={}'.format(email, key)

In [3]:
# We only care about PM 2.5
def get_aqs_byBox_url(start_date, end_date):
    ''' from the CRITERIA param class:
    42101 - Carbon monoxide
    42401 - Sulfur dioxide
    42602 - Nitrogen dioxide (NO2)
    44201 - Ozone
    81102 - PM10 Total 0-10um STP
    88101 - PM2.5 - Local Conditions ***(for some reason, the code is 88502 for Point Reyes)
    ''' # api query limits: 5 params at a time
    params = '88101'
    parameters = 'param={}'.format(params)

    # the start and end dates for the request in the format yyyymmdd
    date_range = 'bdate={}&edate={}'.format(start_date, end_date)

    # make a box (lat and long boundaries) to confine results,  (commented after 2 decimal places)
    min_lat = 36.47 #4307 #the most south
    max_lat = 38.52 #2384 # the most north
    min_long = -123.41 #3342 # the most west / least east
    max_long = -121.33 #1310 #the most east
    box_bounds = 'minlat={}&maxlat={}&minlon={}&maxlon={}'.format(min_lat, max_lat, min_long, max_long)

    byBox_query_base = 'https://aqs.epa.gov/data/api/dailyData/byBox'

    # this is a sample query
    # https://aqs.epa.gov/data/api/dailyData/byBox?email=test@aqs.api&key=test&param=44201&bdate=20150501&edate=20150502&minlat=33.3&maxlat=33.6&minlon=-87.0&maxlon=-86.7

    # modularized the query
    url = '{}?{}&{}&{}&{}'.format(byBox_query_base, creds, parameters, date_range, box_bounds)
    return url


# You can make larger requests (may be penalized, but it works) - the start and end dates don't need to be the same day
# https://aqs.epa.gov/data/api/dailyData/byState?email=test@aqs.api&key=test&param=88101,88502&bdate=20150101&edate=20150201&state=06

In [4]:
# make the request and return the respone on a success or None otherwise
def request_aqs_data(start_date, end_date):
    url = get_aqs_byBox_url(start_date, end_date)
    print(url)
    response = requests.get(url, stream=True)
    if response.status_code == requests.codes.ok:
        return response
    return None

In [6]:
# for the given year, pull all of the data available. make it a json and convert the json to a df and return the df or None
def get_df_for_year_request(year):
    start_date = year + '0101'
    end_date = year + '1231'
    
    response = request_aqs_data(start_date, end_date)
    if response:
        json_response = response.json()
        if json_response['Data']:
            data_dict = json_response['Data']
            df = pd.DataFrame.from_dict(data = data_dict)
            return df
    return None


In [None]:
# create an array of df's for each year listed to concat 
years = ['2015', '2016', '2017', '2018', '2019', '2020']

arr_data = []
for year in years:
    df = get_df_for_year_request(year)
    arr_data.append(df)
    time.sleep(5)


In [10]:
df = pd.concat(arr_data, ignore_index=True)
df

Unnamed: 0,state_code,county_code,site_number,parameter_code,poc,latitude,longitude,datum,parameter,sample_duration,...,method_code,method,local_site_name,site_address,state,county,city,cbsa_code,cbsa,date_of_last_change
0,06,075,0005,88101,3,37.765946,-122.399044,WGS84,PM2.5 - Local Conditions,1 HOUR,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,San Francisco,10 ARKANSAS ST.,California,San Francisco,San Francisco,41860,"San Francisco-Oakland-Hayward, CA",2016-02-04
1,06,075,0005,88101,3,37.765946,-122.399044,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,San Francisco,10 ARKANSAS ST.,California,San Francisco,San Francisco,41860,"San Francisco-Oakland-Hayward, CA",2016-02-04
2,06,075,0005,88101,3,37.765946,-122.399044,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,San Francisco,10 ARKANSAS ST.,California,San Francisco,San Francisco,41860,"San Francisco-Oakland-Hayward, CA",2016-02-04
3,06,075,0005,88101,3,37.765946,-122.399044,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,San Francisco,10 ARKANSAS ST.,California,San Francisco,San Francisco,41860,"San Francisco-Oakland-Hayward, CA",2016-02-04
4,06,075,0005,88101,3,37.765946,-122.399044,WGS84,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,San Francisco,10 ARKANSAS ST.,California,San Francisco,San Francisco,41860,"San Francisco-Oakland-Hayward, CA",2016-02-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222599,06,001,0013,88101,3,37.864767,-122.302741,NAD83,PM2.5 - Local Conditions,1 HOUR,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Berkeley Aquatic Park,1 Bolivar Dr,California,Alameda,Not in a city,41860,"San Francisco-Oakland-Hayward, CA",2020-10-15
222600,06,001,0013,88101,3,37.864767,-122.302741,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Berkeley Aquatic Park,1 Bolivar Dr,California,Alameda,Not in a city,41860,"San Francisco-Oakland-Hayward, CA",2020-10-15
222601,06,001,0013,88101,3,37.864767,-122.302741,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Berkeley Aquatic Park,1 Bolivar Dr,California,Alameda,Not in a city,41860,"San Francisco-Oakland-Hayward, CA",2020-10-15
222602,06,001,0013,88101,3,37.864767,-122.302741,NAD83,PM2.5 - Local Conditions,24-HR BLK AVG,...,170,Met One BAM-1020 Mass Monitor w/VSCC - Beta At...,Berkeley Aquatic Park,1 Bolivar Dr,California,Alameda,Not in a city,41860,"San Francisco-Oakland-Hayward, CA",2020-10-15


## Part 2: Dataset Details
https://aqs.epa.gov/aqsweb/airdata/FileFormats.html

1. index: a unique identifier for the record / reading
2. state_code: The FIPS code of the state in which the monitor resides. The numeric code for the state where the reading was observed (alphabetically, California is '06')
3. county_code: The FIPS code of the county in which the monitor resides.The numeric code for the county where the reading was observed
4. site_number: A unique number within the county identifying the site.
5. parameter_code: The AQS code corresponding to the parameter measured by the monitor.
6. poc: This is the “Parameter Occurrence Code” used to distinguish different instruments that measure the same parameter at the same site.
7. latitude: The monitoring site’s angular distance north of the equator measured in decimal degrees.
8. longitude: The monitoring site’s angular distance east of the prime meridian measured in decimal degrees.
9. datum: The Datum associated with the Latitude and Longitude measures.
10. parameter: The name or description assigned in AQS to the parameter measured by the monitor. Parameters may be pollutants or non-pollutants.
11. sample_duration: The length of time that air passes through the monitoring device before it is analyzed (measured). So, it represents an averaging period in the atmosphere (for example, a 24-hour sample duration draws ambient air over a collection filter for 24 straight hours). For continuous monitors, it can represent an averaging time of many samples (for example, a 1-hour value may be the average of four one-minute samples collected during each quarter of the hour).
12. pollutant_standard: A description of the ambient air quality standard rules used to aggregate statistics. (See description at beginning of document.)
13. date_local: The calendar date for the summary. All daily summaries are for the local standard day (midnight to midnight) at the monitor.
14. units_of_measure: The unit of measure for the parameter. QAD always returns data in the standard units for the parameter. Submitters are allowed to report data in any unit and EPA converts to a standard unit so that we may use the data in calculations.
15. event_type: Indicates whether data measured during exceptional events are included in the summary. A wildfire is an example of an exceptional event; it is something that affects air quality, but the local agency has no control over. No Events means no events occurred. Events Included means events occurred and the data from them is included in the summary. Events Excluded means that events occurred but data form them is excluded from the summary. Concurred Events Excluded means that events occurred but only EPA concurred exclusions are removed from the summary. If an event occurred for the parameter in question, the data will have multiple records for each monitor.
16. observation_count: The number of observations (samples) taken during the day.
17. observation_percent: The percent representing the number of observations taken with respect to the number scheduled to be taken during the day. This is only calculated for monitors where measurements are required (e.g., only certain parameters).
18. validity_indicator: An indication of whether the regulatory data completeness criteria for valid summary data have been met by the monitor for the year. Y means yes, N means no or that there are no regulatory completeness criteria for the parameter.
19. arithmetic_mean: The average (arithmetic mean) value for the day.
20. first_max_value: The highest value for the day.
21. first_max_hour: The hour (on a 24-hour clock) when the highest value for the day (the previous field) was taken.
22. aqi: The Air Quality Index for the day for the pollutant, if applicable.
23. method_code: An internal system code indicating the method (processes, equipment, and protocols) used in gathering and measuring the sample. The method name is in the next column.
24. method: A short description of the processes, equipment, and protocols used in gathering and measuring the sample.
25. local_site_name: The name of the site (if any) given by the State, local, or tribal air pollution control agency that operates it.
26. site_address: The approximate street address of the monitoring site.
27. state: The name of the state where the monitoring site is located.
28. county: The name of the county where the monitoring site is located.
29. city: The name of the city where the monitoring site is located. This represents the legal incorporated boundaries of cities and not urban areas.
30. cbsa_code: The code (ZIP Code) of the core bases statistical area (metropolitan area) where the monitoring site is located.
31. cbsa: The name of the core bases statistical area (metropolitan area) where the monitoring site is located.
32. date_of_last_change: The date the last time any numeric values in this record were updated in the AQS data system.


### Data

There is 40 years worth of data in the form of daily data that you can request from AQS. You can also find more granular data (hourly), but it seems like it would be overkill since our other data cannot also have such granularity. The daily data is about 70 KB per file/day. with 40 years of data, that's about 14,610 days and 1 GB of data (1 million KB). This is only pulling data within the set bounds of a defined lat/long box. we could also search by county, but I believe you cannot specify multiple counties in a single request. since there are usage limits for the size and frequency of requests, we decided to request data with the bounds of a box such that all of the counties we are interested in lie within the box.

In [11]:
df.columns

Index(['state_code', 'county_code', 'site_number', 'parameter_code', 'poc',
       'latitude', 'longitude', 'datum', 'parameter', 'sample_duration',
       'pollutant_standard', 'date_local', 'units_of_measure', 'event_type',
       'observation_count', 'observation_percent', 'validity_indicator',
       'arithmetic_mean', 'first_max_value', 'first_max_hour', 'aqi',
       'method_code', 'method', 'local_site_name', 'site_address', 'state',
       'county', 'city', 'cbsa_code', 'cbsa', 'date_of_last_change'],
      dtype='object')

In [35]:
col_name = 'site_number'
# df[col_name].describe()
len(df[col_name].unique())


33

In [19]:
# Monterey, San Joaquin, Santa Cruz, San Benito, Sacramento not in the Bay Area csv, will remove those entries with the county as the previously listed values

In [40]:
# - testing what is the difference in the rows with the same date, county, site, sample duration, pollutant, and poc
columns_to_drop = ['state_code', 'parameter_code', 'datum', 'event_type', 'units_of_measure', 'event_type', 'observation_count', 'observation_percent', 'validity_indicator', 'method_code', 'method',  'site_address', 'state', 'cbsa_code', 'cbsa', 'date_of_last_change'] 


In [33]:
print(len(df['local_site_name'].unique()), df['local_site_name'].unique())
# print(sorted(data_sites)) 
# will need to run the code at the bottom of the notebook

# the max data is missing data from only this site: Point Reyes NS Ranger Station



23 ['San Francisco' 'Carmel Valley' 'Concord' 'Salinas 3' 'Redwood City'
 'Napa' 'Hollister' 'Livermore' 'Santa Cruz' 'San Rafael' 'Vallejo'
 'San Pablo' 'San Jose - Jackson' 'Oakland West' 'Oakland' 'Gilroy'
 'Laney College' 'San Lorenzo Valley Middle School' 'Sebastopol'
 'San Jose - Knox Avenue' 'Berkeley Aquatic Park' 'Pleasanton - Owens Ct'
 'Napa Valley College']


In [233]:
# can I use the api to pull data from the site: Point Reyes (from the map, it seems like it should be within the bounds of the box)

# the lat long is 38.122979 -122.90944 for point reyes

#     min_lat = 36.47 #4307 #the most south
#     max_lat = 38.52 #2384 # the most north
#     min_long = -123.41 #3342 # the most west / least east
#     max_long = -121.33 #1310 #the most east

# https://aqs.epa.gov/data/api/list/sitesByCounty?email=test@aqs.api&key=test&state=06&county=041
# the site lies within the bounds of the box, so we'll try to pull from that site specifically site number for Point Reyes is 0002

# from the other dataset, it seems that the earliest reading from that site is 01/09/2015
# the reason that Point Reyes was not pulled in the original requests was because it uses a different parameter code for pm 2.5, 88502.
# it is from the parameter class 'SPECIATION' (https://aqs.epa.gov/data/api/list/parametersByClass?email=test@aqs.api&key=test&pc=SPECIATION)
# this request pulls the data from that site: https://aqs.epa.gov/data/api/monitors/bySite?email=test@aqs.api&key=test&param=88502&bdate=20150501&edate=20150501&state=06&county=041&site=0002
# will pull that data and concat it later



In [None]:
# filtering out the sites that aren't used in the other dataset (will need the code at the bottom of the notebook) - just Bay Area sites
# the sites that we are considering has changed and it now seems that we are looking at more sites than are contained in this data (possible solution: the bounding box could be expanded)
# df = df.where(df['local_site_name'].isin(data_sites)).dropna(subset=['local_site_name'])

In [234]:
# pull the point reyes data
# https://aqs.epa.gov/data/api/dailyData/bySite?email=test@aqs.api&key=test&param=88502&bdate=20150101&edate=20151231&state=06&county=041&site=0002
# The site point reyes does not have a 1 hour sample duration, thus they do not have the max value for each day
# this site will not be included in this dataset


In [38]:
# df.isna().value_counts()

In [41]:

df_clean = df.drop(columns=columns_to_drop)


In [43]:
df_clean.isna().value_counts()

county_code  site_number  poc    latitude  longitude  parameter  sample_duration  pollutant_standard  date_local  arithmetic_mean  first_max_value  first_max_hour  aqi    local_site_name  county  city 
False        False        False  False     False      False      False            False               False       False            False            False           False  False            False   False    178328
                                                                                  True                False       False            False            False           True   False            False   False     44276
dtype: int64

In [237]:
# need to remove the samples that are not '24-HR BLK AVG'
# only the '1 Hour' sample durations have a 'first max hour' that is non-zero, but their aqi is NaN

In [238]:
# df_clean.set_index(['date_local', 'county', 'city']).head(60)
# df_clean.set_index(['date_local', 'county', 'city']).shape

In [239]:
# the other dataset has this value:
# ,Date,Source,Site ID,POC,Daily Mean PM2.5 Concentration,UNITS,DAILY_AQI_VALUE,Site Name,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,AQS_PARAMETER_DESC,CBSA_CODE,CBSA_NAME,STATE_CODE,STATE,COUNTY_CODE,COUNTY,SITE_LATITUDE,SITE_LONGITUDE
# 41221,01/02/2015,AQS,60950004.0,3.0,30.2,ug/m3 LC,89.0,Vallejo,1.0,100.0,88101.0,PM2.5 - Local Conditions,46700.0,"Vallejo-Fairfield, CA",6.0,California,95.0,Solano,38.102507,-122.237976

# on this same day, there are readings of 30.9 and 30.2 for poc 4 and 3, respectively
# the other dataset uses poc 3 and has a mean of 30.2

# not sure if they use the lower poc value for the daily data

# the '1 hour' sample duration data seems to have the working max values and the hour when it first occurs, but it does not have the aqi - however, the other dataset does have it.
# we will have to be careful to use the same poc as the oter dataset


# LOOK AT EVENT TYPE FOR WILDFIRE
# df_clean['event_type'].unique()
# df_clean.where(df_clean['event_type'] != 'None').dropna(subset=['event_type'])

# Gilroy is the only site in the Bay Area that seems to use the 'event type', and only for 2 days: 2015-06-30 & 2015-07-01
# seemed promising, but it is not worth keeping this column

In [240]:
# there seems to be duplicates in the data, Let's add all of the columns back to see if there are diffs in the dupes
# calling drop_duplicates changes the df shape from 585484 to 457675
# going to add a dropdupes() call earlier in the notebook (before I drop the unwanted columns)
# dropping the duplicates before dropping the columns does not change the shape (thus, we have lost distinguishing data from the dropped columns)

In [241]:
# df_clean.set_index(['date_local', 'county', 'city']).drop_duplicates().shape

In [242]:
# the difference is in the value of the pollutant standard: There is an aggregated value (mean) for each standard - but the aggregation ends up being the same
# in other words, we can drop that column and remove the duplicates
df_clean['pollutant_standard'].unique()

array(['PM25 Annual 2012', 'PM25 24-hour 2012', nan, 'PM25 Annual 2006',
       'PM25 24-hour 2006'], dtype=object)

In [44]:
# drop the polluntant standard, parameter, aqi (they will all be NaN), and remove the duplicates
df_cleanest = df_clean.drop(columns=['pollutant_standard', 'parameter', 'aqi']).drop_duplicates()
df_cleanest.head(16)

Unnamed: 0,county_code,site_number,poc,latitude,longitude,sample_duration,date_local,arithmetic_mean,first_max_value,first_max_hour,local_site_name,county,city
0,75,5,3,37.765946,-122.399044,1 HOUR,2015-01-01,13.916667,32.0,20,San Francisco,San Francisco,San Francisco
1,75,5,3,37.765946,-122.399044,24-HR BLK AVG,2015-01-01,13.9,13.9,0,San Francisco,San Francisco,San Francisco
5,75,5,3,37.765946,-122.399044,1 HOUR,2015-01-02,22.666667,31.0,22,San Francisco,San Francisco,San Francisco
6,75,5,3,37.765946,-122.399044,24-HR BLK AVG,2015-01-02,22.6,22.6,0,San Francisco,San Francisco,San Francisco
10,75,5,3,37.765946,-122.399044,1 HOUR,2015-01-03,27.958333,47.0,22,San Francisco,San Francisco,San Francisco
11,75,5,3,37.765946,-122.399044,24-HR BLK AVG,2015-01-03,27.9,27.9,0,San Francisco,San Francisco,San Francisco
15,75,5,3,37.765946,-122.399044,1 HOUR,2015-01-04,28.833333,38.0,1,San Francisco,San Francisco,San Francisco
16,75,5,3,37.765946,-122.399044,24-HR BLK AVG,2015-01-04,28.8,28.8,0,San Francisco,San Francisco,San Francisco
20,75,5,3,37.765946,-122.399044,1 HOUR,2015-01-05,20.625,34.0,6,San Francisco,San Francisco,San Francisco
21,75,5,3,37.765946,-122.399044,24-HR BLK AVG,2015-01-05,20.6,20.6,0,San Francisco,San Francisco,San Francisco


In [45]:
# filter out the 24 hour data and keep the 1 hour data that has the max values of the day
df_cleanest = df_cleanest.where(df_cleanest['sample_duration'] == '1 HOUR').dropna(subset=['sample_duration'])

# drop the sample_duration since, it is all the same
df_cleanest.drop(columns=['sample_duration'], inplace=True)
# df_cleanest.head(16)

In [46]:
df_cleanest.set_index(['date_local', 'county', 'city']).head(16)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,county_code,site_number,poc,latitude,longitude,arithmetic_mean,first_max_value,first_max_hour,local_site_name
date_local,county,city,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-01-01,San Francisco,San Francisco,75,5,3.0,37.765946,-122.399044,13.916667,32.0,20.0,San Francisco
2015-01-02,San Francisco,San Francisco,75,5,3.0,37.765946,-122.399044,22.666667,31.0,22.0,San Francisco
2015-01-03,San Francisco,San Francisco,75,5,3.0,37.765946,-122.399044,27.958333,47.0,22.0,San Francisco
2015-01-04,San Francisco,San Francisco,75,5,3.0,37.765946,-122.399044,28.833333,38.0,1.0,San Francisco
2015-01-05,San Francisco,San Francisco,75,5,3.0,37.765946,-122.399044,20.625,34.0,6.0,San Francisco
2015-01-06,San Francisco,San Francisco,75,5,3.0,37.765946,-122.399044,30.5,46.0,5.0,San Francisco
2015-01-07,San Francisco,San Francisco,75,5,3.0,37.765946,-122.399044,21.958333,32.0,16.0,San Francisco
2015-01-08,San Francisco,San Francisco,75,5,3.0,37.765946,-122.399044,34.375,55.0,17.0,San Francisco
2015-01-09,San Francisco,San Francisco,75,5,3.0,37.765946,-122.399044,25.916667,39.0,1.0,San Francisco
2015-01-10,San Francisco,San Francisco,75,5,3.0,37.765946,-122.399044,14.5,21.0,23.0,San Francisco


In [47]:
# should we do something about the poc column? yes - we should aggregate (average) where needed
df_cleanest['poc'].unique()

array([3., 4.])

In [247]:
# the other dataset keeps the differing poc values, so Iwill leave it in for now
# could append the 4 from poc to the site name and drop the column (need to look into the 1 value from point reyes)

In [48]:
# I'm going to drop lat and long since we can use a dict later if we need the lat long for a site
# the city is in the site name, so it could be dropped later - I will keep it until the zip code or another identifier replaces it
df_cleanest.drop(columns=['latitude', 'longitude'], inplace=True)

In [49]:
df_cleanest.set_index(['date_local', 'county', 'local_site_name']).to_csv("Bay Area Max PM 2.5 Data.csv")

In [None]:
# ##################################################################

In [None]:
# cleaner approach below

In [26]:
# https://www.epa.gov/outdoor-air-quality-data/download-daily-data using PM 2.5, years 2015-2020, California, and All-Sites 
df_sites = pd.read_csv('data\BA_AQI_STATIONS.csv')

In [27]:
df_sites.columns

Index(['NAME', 'COUNTY', 'LATITUDE', 'LONGITUDE', 'START_DATE', 'END_DATE'], dtype='object')

In [28]:
# from our google doc
counties_wanted = ['Alameda', 'Contra Costa', 'Marin', 'Napa', 'San Francisco', 'San Mateo', 'Santa Clara', 'Solano', 'Sonoma']


In [29]:
wanted_data = df_sites.where(df_sites['COUNTY'].isin(counties_wanted))
wanted_data['COUNTY'].unique()

array(['San Mateo', 'San Francisco', nan, 'Santa Clara', 'Alameda',
       'Solano', 'Contra Costa', 'Napa', 'Sonoma', 'Marin'], dtype=object)

In [86]:
data = wanted_data.dropna(subset=['COUNTY'])

In [34]:
data_sites = df_sites['NAME'].unique()
print(len(data_sites), data_sites)

34 ['Redwood City' 'San Francisco' 'San Lorenzo Valley Middle School'
 'San Jose - Jackson' 'Laney College' 'Vallejo' 'Livermore'
 'Pleasanton - Owens Ct' 'Concord' 'Napa' 'Stockton-Hazelton' 'Sebastopol'
 'Berkeley Aquatic Park' 'Napa Valley College' 'Oakland' 'San Pablo'
 'Cortina Indian Rancheria' 'Oakland West' 'San Rafael'
 'San Jose - Knox Avenue' 'Gilroy' 'Hollister' 'Ukiah-Library'
 'Lakeport-S. Main Street' 'Sacramento-1309 T Street'
 'Woodland-Gibson Road' 'Santa Cruz' 'Salinas 3' 'Manteca'
 'Sacramento-Bercut Drive' 'Sacramento-Del Paso Manor'
 'Modesto-14th Street' 'Sloughhouse' 'Willits-125 East Commercial Street']


In [None]:
# data.where(data['Site Name'] == 'Point Reyes NS Ranger Station').dropna(subset=['Site Name'])

In [217]:
# data['POC'].unique()
# data.where(data['POC'] == 4.).dropna(subset=['POC']).head()

Unnamed: 0,Date,Source,Site ID,POC,Daily Mean PM2.5 Concentration,UNITS,DAILY_AQI_VALUE,Site Name,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,AQS_PARAMETER_DESC,CBSA_CODE,CBSA_NAME,STATE_CODE,STATE,COUNTY_CODE,COUNTY,SITE_LATITUDE,SITE_LONGITUDE
41557,01/01/2015,AQS,60950004.0,4.0,25.5,ug/m3 LC,79.0,Vallejo,1.0,100.0,88101.0,PM2.5 - Local Conditions,46700.0,"Vallejo-Fairfield, CA",6.0,California,95.0,Solano,38.102507,-122.237976
41558,01/02/2015,AQS,60950004.0,4.0,30.9,ug/m3 LC,91.0,Vallejo,1.0,100.0,88101.0,PM2.5 - Local Conditions,46700.0,"Vallejo-Fairfield, CA",6.0,California,95.0,Solano,38.102507,-122.237976
41559,01/03/2015,AQS,60950004.0,4.0,38.7,ug/m3 LC,109.0,Vallejo,1.0,100.0,88101.0,PM2.5 - Local Conditions,46700.0,"Vallejo-Fairfield, CA",6.0,California,95.0,Solano,38.102507,-122.237976
41560,01/04/2015,AQS,60950004.0,4.0,33.0,ug/m3 LC,95.0,Vallejo,1.0,100.0,88101.0,PM2.5 - Local Conditions,46700.0,"Vallejo-Fairfield, CA",6.0,California,95.0,Solano,38.102507,-122.237976
41561,01/05/2015,AQS,60950004.0,4.0,28.1,ug/m3 LC,85.0,Vallejo,1.0,100.0,88101.0,PM2.5 - Local Conditions,46700.0,"Vallejo-Fairfield, CA",6.0,California,95.0,Solano,38.102507,-122.237976


In [212]:
# After finding out that Point Reyes uses parameter code 88502 instead of 88101, I want to see all of the parameter codes in the dataset
# data['AQS_PARAMETER_CODE'].unique()

array([88101., 88502.])