# Example of a query using the EPA API
will only query a subset of data

We found several APIs from the EPA (a couple), Center for disease control, and Census data, but here we show two APIs used to generate the main data and to track greenhouse producing facilities tracked by the EPA

In [15]:
#import dependencies
import pandas as pd
import requests as req
import csv
#create separate file with individual log-in credentials to be imported
from tokens import (user_id_aqs, password_aqs)

In [16]:
#parameter codes to loop through in API:

#Only will loop through california
state_code = '06'
#Will loop through all counties within California. Note it is commented out for this example. Will only use two county
#county_list = ["{0:03}".format(i) for i in range(116) if i%2 != 0]
#Loop through years 
years_list = ['2011','2012','2013','2014','2015','2016','2017']
#Loop though a particular parameter e.g. Ozone levels, CO2, PM2.5, etc.
parameter_code = '88101'
#Establish two lists, one that will collect the dataframes generated into  list titled 'frames'
#The other list is 
frames = []
missing = []

#Will only loop through two county for years 11-17 as an example so we dont generate too much data in this example
county_list = ['001','003']

In [17]:
#header used to create dataframes when looping through URL
header = ['Latitude', 'Longitude', 'Datum', 'Horizontal Accuracy', 'State Code',
       'County Code', 'Site Num', 'Parameter Code', 'POC',
       'AQS Parameter Desc', 'Date Local', '24 Hour Local', 'Date GMT',
       '24 Hour GMT', 'Year GMT', 'Day In Year GMT', 'Sample Measurement',
       'Units of Measure', 'Sample Duration', 'Sample Frequency',
       'Detection Limit', 'Measurement Uncertainty', 'Qualifier Description',
       'Method Type', 'Method Description']

for year in years_list:
    #Should loop through variable county_list but will only loop through two county for example
    for county_code in county_list:
        url = f"https://aqs.epa.gov/api/rawData?user={user_id_aqs}&pw={password_aqs}&format=DMCSV&param={parameter_code}&bdate={year}0101&edate={year}1231&state={state_code}&county={county_code}"
        response = req.get(url)
        #making sure request worked
        if response.status_code == 200:
            
            print(f"succesfully requested for year:{year}, county:{county_code}")
            
            empty_list = []
            #Convert ext to CSV because the file is returned as a large text(Thanks Jake!)
            #csv.reader actual stores every line of the text as a list
            wrapper = csv.reader(response.text.strip().split('\n'))
            #Skip the first list in this CSV file which is the header
            next(wrapper)   
            #Store each of the following lists in a big list
            for record in wrapper:
                empty_list.append(record) 
            #Some text files are empty (only have header), so for those files that do not have API data, skip because they
            #cannot generate a dataframe
            try:  
                df = pd.DataFrame(empty_list, columns = header)
                frames.append(df)
                print(f"Added {len(df)} rows")
            except AssertionError:
                print(f"Empty")
                #maintain a record of the empty data from counties/years
                missing.append([year,county_code])
            
master_df = pd.concat(frames)
print(len(master_df))
#print missing data: (year, county)
print(missing)

succesfully requested for year:2011, county:001
Added 17410 rows
succesfully requested for year:2011, county:003
Empty
succesfully requested for year:2012, county:001
Added 17560 rows
succesfully requested for year:2012, county:003
Empty
succesfully requested for year:2013, county:001
Added 25077 rows
succesfully requested for year:2013, county:003
Empty
succesfully requested for year:2014, county:001
Added 33403 rows
succesfully requested for year:2014, county:003
Empty
succesfully requested for year:2015, county:001
Added 34091 rows
succesfully requested for year:2015, county:003
Empty
succesfully requested for year:2016, county:001
Added 37692 rows
succesfully requested for year:2016, county:003
Empty
succesfully requested for year:2017, county:001
Added 42851 rows
succesfully requested for year:2017, county:003
Empty
208084
[['2011', '003'], ['2012', '003'], ['2013', '003'], ['2014', '003'], ['2015', '003'], ['2016', '003'], ['2017', '003']]


In [18]:
print(len(master_df))
master_df.head()

208084


Unnamed: 0,Latitude,Longitude,Datum,Horizontal Accuracy,State Code,County Code,Site Num,Parameter Code,POC,AQS Parameter Desc,...,Day In Year GMT,Sample Measurement,Units of Measure,Sample Duration,Sample Frequency,Detection Limit,Measurement Uncertainty,Qualifier Description,Method Type,Method Description
0,37.687526,-121.784217,WGS84,109305.56,6,1,7,88101,1,PM2.5 - Local Conditions,...,18,6.6,Micrograms/cubic meter (LC),24 HOUR,EVERY DAY,2,,,Reference,R & P Model 2025 PM-2.5 Sequential Air Sampler...
1,37.687526,-121.784217,WGS84,109305.56,6,1,7,88101,1,PM2.5 - Local Conditions,...,21,23.6,Micrograms/cubic meter (LC),24 HOUR,EVERY DAY,2,,,Reference,R & P Model 2025 PM-2.5 Sequential Air Sampler...
2,37.687526,-121.784217,WGS84,109305.56,6,1,7,88101,1,PM2.5 - Local Conditions,...,1,10.5,Micrograms/cubic meter (LC),24 HOUR,EVERY DAY,2,,,Reference,R & P Model 2025 PM-2.5 Sequential Air Sampler...
3,37.687526,-121.784217,WGS84,109305.56,6,1,7,88101,1,PM2.5 - Local Conditions,...,31,8.5,Micrograms/cubic meter (LC),24 HOUR,EVERY DAY,2,,,Reference,R & P Model 2025 PM-2.5 Sequential Air Sampler...
4,37.687526,-121.784217,WGS84,109305.56,6,1,7,88101,1,PM2.5 - Local Conditions,...,45,3.0,Micrograms/cubic meter (LC),24 HOUR,EVERY DAY,2,,,Reference,R & P Model 2025 PM-2.5 Sequential Air Sampler...


In [20]:
#Copy dataframe to have an editable version
copy_master = master_df

#Create dictionary to create a column with actual names
equiv = { "001": "Alameda", "003": "Apline", "005": "Amador", "007":"Butte", "009":"Calaveras", "011": "Colusa", "013":"Contra Costa", "015":"Del Norte", "017":"El Dorado", "019": "Fresno", "021": "Glenn", "023": "Humboldt", "025": "Imperial", "027": "Inyo", "029": "Kern", "031": "Kings", "033":"Lake", "035": "Lassen", "037": "Los Angeles", "039": "Madera", "041": "Marin", "043": "Mariposa", "045": "Mendicino", "047": "Merced", "049": "Modoc", "051": "Mono", "053": "Monterey", "055": "Napa", "057": "Nevada", "059": "Orange", "061": "Placer", "063": "Plumas", "065": "Riverside", "067": "Sacramento", "069": "San Benito", "071": "San Bernardino", "073": "San Diego", "075": "San Francisco", "077": "San Joaquin", "079": "San Luis Obispo", "081": "San Mateo", "083": "Santa Barbara", "085": "Santa Clara", "087": "Santa Cruz", "089": "Shasta", "091": "Sierra", "093": "Siskiyou", "095": "Solano", "097": "Sonoma", "099": "Stanislaus", "101": "Sutter", "103": "Tehama", "105": "Trinity", "107": "Tulare", "109": "Tuolumne", "111": "Ventura", "113": "Yolo", "115": "Yuba"}

df_trim1 = copy_master[['County Code','Date Local','24 Hour Local','Date GMT','Sample Measurement', "Sample Frequency",'Units of Measure','Sample Duration']]

#column for months
df_trim1['Date Local'] = pd.to_datetime(df_trim1['Date Local'])
#line for larger datasets
#df_trim1.drop(df_trim1.index[318396], inplace=True)
df_trim1['Month'] = pd.DatetimeIndex(df_trim1['Date Local']).month
df_trim1["Month"].fillna(0)
df_trim1["Month"].astype(float)
df_trim1.fillna(0)


#columns for years
df_trim1['Date Local'] = pd.to_datetime(df_trim1['Date Local'])
#line for larger datasets
#df_trim1.drop(df_trim1.index[318396], inplace=True)
df_trim1['Year'] = pd.DatetimeIndex(df_trim1['Date Local']).year
df_trim1["Year"].fillna(0)
df_trim1["Year"].astype(float)
df_trim1.fillna(0)

#column for county names
df_trim1["County Names"] = df_trim1["County Code"].map(equiv)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pan

In [21]:
df_trim1.head()

Unnamed: 0,County Code,Date Local,24 Hour Local,Date GMT,Sample Measurement,Sample Frequency,Units of Measure,Sample Duration,Month,Year,County Names
0,1,2011-01-18,00:00,2011-01-18,6.6,EVERY DAY,Micrograms/cubic meter (LC),24 HOUR,1.0,2011.0,Alameda
1,1,2011-01-21,00:00,2011-01-21,23.6,EVERY DAY,Micrograms/cubic meter (LC),24 HOUR,1.0,2011.0,Alameda
2,1,2011-01-01,00:00,2011-01-01,10.5,EVERY DAY,Micrograms/cubic meter (LC),24 HOUR,1.0,2011.0,Alameda
3,1,2011-01-31,00:00,2011-01-31,8.5,EVERY DAY,Micrograms/cubic meter (LC),24 HOUR,1.0,2011.0,Alameda
4,1,2011-02-14,00:00,2011-02-14,3.0,EVERY DAY,Micrograms/cubic meter (LC),24 HOUR,2.0,2011.0,Alameda


# Code for plotting facilities, Use of another API to retireve facilities tracked by the EPA
This will show facilities that produce greenhouse gases and that are tracked by the EPA

In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import requests
import json

#up to row 14 is the last available for california
url = "http://iaspub.epa.gov/enviro/efservice/PUB_DIM_FACILITY/STATE/=/CA/rows/0:114/json"
response = requests.get(url).json()

import gmaps
# Google developer API key
from tokens import goog
# Access maps with unique API key
gmaps.configure(api_key=goog)

In [26]:
df = pd.DataFrame(response).drop_duplicates('FACILITY_ID')
df_loc = df[['LATITUDE','LONGITUDE']].reset_index(drop=True)

figure_layout = {
    'width': '400px',
    'height': '300px',
    'border': '1px solid black',
    'padding': '1px',
    'margin': '0 auto 0 auto'
}
fig = gmaps.figure(layout=figure_layout)

In [27]:
# Assign the marker layer to a variable
markers = gmaps.marker_layer(df_loc)
# Add the layer to the map
fig.add_layer(markers)
fig