In [7]:
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy

In [25]:
# getting a list of US cities available from OpenAQ
cities = requests.get('https://api.openaq.org/v1/cities?country=US&limit=1000')
print(cities.status_code)
cities.json()

200


{'meta': {'name': 'openaq-api',
  'license': 'CC BY 4.0',
  'website': 'https://docs.openaq.org/',
  'page': 1,
  'limit': 1000,
  'found': 812},
 'results': [{'country': 'US',
   'name': '007',
   'city': '007',
   'count': 1565,
   'locations': 4},
  {'country': 'US',
   'name': '019',
   'city': '019',
   'count': 1168,
   'locations': 1},
  {'country': 'US', 'name': '023', 'city': '023', 'count': 23, 'locations': 1},
  {'country': 'US',
   'name': '037',
   'city': '037',
   'count': 3049,
   'locations': 3},
  {'country': 'US',
   'name': '039',
   'city': '039',
   'count': 1540,
   'locations': 2},
  {'country': 'US',
   'name': '047',
   'city': '047',
   'count': 101,
   'locations': 1},
  {'country': 'US',
   'name': '051',
   'city': '051',
   'count': 335,
   'locations': 4},
  {'country': 'US',
   'name': '077',
   'city': '077',
   'count': 2475,
   'locations': 1},
  {'country': 'US',
   'name': 'ABBEVILLE',
   'city': 'ABBEVILLE',
   'count': 4619,
   'locations': 1},
 

In [35]:
# getting information for the "San Francisco-Oakland-Fremont" location
sf_parameters = {
    'city[]': 'San Francisco-Oakland-Fremont'
}

sf_response = requests.get('https://api.openaq.org/v1/locations', sf_parameters)

print(sf_response.status_code)

In [61]:
# generalizing to all locations of interest
areas_of_interest = [
    'San Francisco-Oakland-Fremont',
    'San Jose-Sunnyvale-Santa Clara',
    'Vallejo-Fairfield',
    'Napa',
    'Sonoma'
]

bay_area_locations = []

# getting info for each area and adding the locations that it contains to a list
for area in areas_of_interest:
    parameters = { 'city[]': area }
    response = requests.get('https://api.openaq.org/v1/locations', parameters)
    
    for result in response.json()['results']:
        bay_area_locations.append(result['location'])


In [62]:
bay_area_locations

['Berkeley Aquatic Par',
 'Bethel Island',
 'Concord',
 'Hayward',
 'Laney College',
 'Livermore - Rincon',
 'Oakland',
 'Oakland West',
 'Patterson Pass',
 'Pleasanton - Owens C',
 'Redwood City',
 'Richmond - 7th St',
 'San Francisco',
 'San Pablo - Rumrill',
 'San Rafael',
 'San Ramon',
 'Gilory - 9th Street',
 'Hollister AMS',
 'Hollister AMS',
 'Los Gatos',
 'Pinnacles NM',
 'San Jose - Jackson S',
 'San Jose - Knox Ave',
 'San Martin',
 'Fairfield',
 'Rio Vista',
 'Rio Vista',
 'Vacaville',
 'Vallejo',
 'Napa - Jefferson St',
 'Napa - Napa Valley C',
 'Sonoma Technology Mo']

In [88]:
# getting a sense of what the data for each location will look like... 
# testing the process on data for Alameda/Berkeley Aquatic Par location
# focusing on PM2.5 because that's the main pollutant used to gauge wildfires' effects on air quality

test_params = {
    'city': 'ALAMEDA',
    'location': 'Berkeley Aquatic Par',
    'parameter': 'pm25',
    'date_from': '2020-01-01'
}

alameda_resp = requests.get('https://api.openaq.org/v1/measurements', test_params)
print(alameda_resp.status_code)

200


In [89]:
alameda_resp.json()

{'meta': {'name': 'openaq-api',
  'license': 'CC BY 4.0',
  'website': 'https://docs.openaq.org/',
  'page': 1,
  'limit': 100,
  'found': 6410},
 'results': [{'location': 'Berkeley Aquatic Par',
   'parameter': 'pm25',
   'date': {'utc': '2020-12-01T04:00:00Z',
    'local': '2020-11-30T20:00:00-08:00'},
   'value': 13,
   'unit': 'µg/m³',
   'coordinates': {'latitude': 37.864767, 'longitude': -122.302741},
   'country': 'US',
   'city': 'ALAMEDA'},
  {'location': 'Berkeley Aquatic Par',
   'parameter': 'pm25',
   'date': {'utc': '2020-12-01T03:00:00Z',
    'local': '2020-11-30T19:00:00-08:00'},
   'value': 10,
   'unit': 'µg/m³',
   'coordinates': {'latitude': 37.864767, 'longitude': -122.302741},
   'country': 'US',
   'city': 'ALAMEDA'},
  {'location': 'Berkeley Aquatic Par',
   'parameter': 'pm25',
   'date': {'utc': '2020-12-01T02:00:00Z',
    'local': '2020-11-30T18:00:00-08:00'},
   'value': 13,
   'unit': 'µg/m³',
   'coordinates': {'latitude': 37.864767, 'longitude': -122.3027

In [70]:
#turning the alameda_resp dictionary into a df

alameda_df = pd.DataFrame.from_dict(alameda_resp.json()['results'])
alameda_df.head(10)

Unnamed: 0,city,coordinates,country,date,location,parameter,unit,value
0,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T01:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,9
1,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T00:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,7
2,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T23:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,10
3,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T22:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,14
4,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T21:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,11
5,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T20:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,20
6,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T18:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,10
7,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T17:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,12
8,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T16:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,14
9,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T15:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,10


In [91]:
# get json of all PM2.5 data for the locations of interest; convert to df and combine into one df (pm25_df)
# print the number of records found for PM2.5 measurements in 2020 for each location

pm25_df = pd.DataFrame(columns=['city', 'coordinates', 'country', 'date', 'location', 'parameter', 'unit', 'value'])

for location in bay_area_locations:
    loc_params = {
        'location': location,
        'parameter': 'pm25',
        'limit': 10000,
        'date_from': '2020-01-01'
    }
    
    loc_resp = requests.get('https://api.openaq.org/v1/measurements', loc_params)
    
    print(location, ':', loc_resp.json()['meta']['found'])
    
    loc_df = pd.DataFrame.from_dict(loc_resp.json()['results'])
    
    pm25_df = pd.concat([pm25_df, loc_df])


Berkeley Aquatic Par : 6411
Bethel Island : 0
Concord : 6587
Hayward : 0
Laney College : 6786
Livermore - Rincon : 6812
Oakland : 6761
Oakland West : 6284
Patterson Pass : 0
Pleasanton - Owens C : 6632
Redwood City : 6616
Richmond - 7th St : 0
San Francisco : 6442
San Pablo - Rumrill : 6433
San Rafael : 6619
San Ramon : 0
Gilory - 9th Street : 6449
Hollister AMS : 2129
Hollister AMS : 2129
Los Gatos : 0
Pinnacles NM : 0
San Jose - Jackson S : 6611
San Jose - Knox Ave : 6647
San Martin : 0
Fairfield : 0
Rio Vista : 5468
Rio Vista : 5468
Vacaville : 6543
Vallejo : 6666
Napa - Jefferson St : 0
Napa - Napa Valley C : 6448
Sonoma Technology Mo : 0


In [92]:
pm25_df

Unnamed: 0,city,coordinates,country,date,location,parameter,unit,value
0,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T05:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,15
1,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T04:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,13
2,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T03:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,10
3,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T02:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,13
4,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T01:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,9
5,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T00:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,7
6,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T23:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,10
7,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T22:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,14
8,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T21:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,11
9,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T20:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,20


In [93]:
pm25_df.reset_index(inplace=True)

Unnamed: 0,index,city,coordinates,country,date,location,parameter,unit,value
126931,6438,Napa,"{'latitude': 38.278849, 'longitude': -122.275024}",US,"{'utc': '2020-01-01T10:00:00Z', 'local': '2020...",Napa - Napa Valley C,pm25,µg/m³,18
126932,6439,Napa,"{'latitude': 38.278849, 'longitude': -122.275024}",US,"{'utc': '2020-01-01T08:00:00Z', 'local': '2020...",Napa - Napa Valley C,pm25,µg/m³,38
126933,6440,Napa,"{'latitude': 38.278849, 'longitude': -122.275024}",US,"{'utc': '2020-01-01T07:00:00Z', 'local': '2019...",Napa - Napa Valley C,pm25,µg/m³,32
126934,6441,Napa,"{'latitude': 38.278849, 'longitude': -122.275024}",US,"{'utc': '2020-01-01T06:00:00Z', 'local': '2019...",Napa - Napa Valley C,pm25,µg/m³,30
126935,6442,Napa,"{'latitude': 38.278849, 'longitude': -122.275024}",US,"{'utc': '2020-01-01T05:00:00Z', 'local': '2019...",Napa - Napa Valley C,pm25,µg/m³,28
126936,6443,Napa,"{'latitude': 38.278849, 'longitude': -122.275024}",US,"{'utc': '2020-01-01T04:00:00Z', 'local': '2019...",Napa - Napa Valley C,pm25,µg/m³,18
126937,6444,Napa,"{'latitude': 38.278849, 'longitude': -122.275024}",US,"{'utc': '2020-01-01T03:00:00Z', 'local': '2019...",Napa - Napa Valley C,pm25,µg/m³,22
126938,6445,Napa,"{'latitude': 38.278849, 'longitude': -122.275024}",US,"{'utc': '2020-01-01T02:00:00Z', 'local': '2019...",Napa - Napa Valley C,pm25,µg/m³,10
126939,6446,Napa,"{'latitude': 38.278849, 'longitude': -122.275024}",US,"{'utc': '2020-01-01T01:00:00Z', 'local': '2019...",Napa - Napa Valley C,pm25,µg/m³,7
126940,6447,Napa,"{'latitude': 38.278849, 'longitude': -122.275024}",US,"{'utc': '2020-01-01T00:00:00Z', 'local': '2019...",Napa - Napa Valley C,pm25,µg/m³,9


This df encompasses the PM2.5 measurements for all of our locations of interest for 2020 (up to the present date). There are around 127,000 rows, so it is reasonable to assume that each year's data will be on the same order of magnitude. OpenAQ provides two years of data via their open API, so we will use the same process as above to acquire the previous two years' data. For data prior to that, we will need to query their S3 buckets, which can be done through a distributed query tool like Amazon Athena, Apache Spark, or Google BigQuery.

In [98]:
pm25_df.isnull().any()

index          False
city           False
coordinates    False
country        False
date           False
location       False
parameter      False
unit           False
value          False
dtype: bool

For the 2020 PM2.5 data, there are no null values (yay).

In [106]:
# we see that the dates are given in a dict, with values for UTC and local time
# NOTE: the date_from field in the API uses UTC, so we will need to make sure that we standardize our time references...

pm25_df['date'][0]

{'utc': '2020-12-01T05:00:00Z', 'local': '2020-11-30T21:00:00-08:00'}

In [118]:
# pulling out the times into their own df for easier handling

df_times = pd.DataFrame(pm25_df['date'].values.tolist(), index=pm25_df.index)

In [126]:
# adding the local times back into the main pm25_df and converting to datetime format

pm25_df['local_datetime'] = pd.to_datetime(df_times['local'])
pm25_df.head(10)

Unnamed: 0,index,city,coordinates,country,date,location,parameter,unit,value,local_datetime
0,0,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T05:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,15,2020-12-01 05:00:00
1,1,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T04:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,13,2020-12-01 04:00:00
2,2,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T03:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,10,2020-12-01 03:00:00
3,3,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T02:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,13,2020-12-01 02:00:00
4,4,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T01:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,9,2020-12-01 01:00:00
5,5,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-12-01T00:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,7,2020-12-01 00:00:00
6,6,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T23:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,10,2020-11-30 23:00:00
7,7,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T22:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,14,2020-11-30 22:00:00
8,8,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T21:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,11,2020-11-30 21:00:00
9,9,ALAMEDA,"{'latitude': 37.864767, 'longitude': -122.302741}",US,"{'utc': '2020-11-30T20:00:00Z', 'local': '2020...",Berkeley Aquatic Par,pm25,µg/m³,20,2020-11-30 20:00:00
