# Collect PurpleAir

Collects PurpleAir 2-minute data and averages it to hourly values (keeping only hours with 90% or more data coverage), then writes raw hourly files of PM2.5 values, temperature and humidity. The data are collected from the old Thingspeak web API location, which was available at the time of analysis but is no longer available.

### Load python packages

In [10]:
import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
import matplotlib.pyplot as plt
import datetime
from pathlib import Path                   # System agnostic paths

import requests # for url requests
from statistics import median # for median of array
import json # for json reading/writing
import time # for epoch timestamp
import csv # for writing csv files

### Query webpage for station IDs

https://api.purpleair.com/

In [11]:
# Query parameters for the PurpleAir "sensors" endpoint: a bounding box given
# by its north-west and south-east corners, restricted to outdoor sensors.
keys = {
        'api_key': '${user-access-code}', #for access to the api
        'nwlat': '38.9', #latmax
        'selat': '38.05', #latmin
        'nwlng': '-123.58', #lonmin
        'selng': '-122.24', #lonmax
        'location_type': '0', #outside
        # Thingspeak channel ids and keys, used later to download the archived data
        'fields': 'latitude,longitude,name,primary_id_a,primary_key_a,secondary_id_a,secondary_key_a,primary_id_b,primary_key_b,secondary_id_b,secondary_key_b,firmware_version'
        }
url = 'https://api.purpleair.com/v1/sensors?'
# Pass the parameters explicitly as the query string, bound the wait with a
# timeout so a stalled connection cannot hang the notebook, and fail loudly on
# an HTTP error instead of parsing an error body as if it were sensor data.
query_PurpleAir = requests.get(url, params=keys, timeout=60)
query_PurpleAir.raise_for_status()
# NOTE: an older, commented-out variant of this request embedded a literal API
# key in the URL. It was removed on purpose: never commit access keys, supply
# them through the 'api_key' placeholder above.

In [12]:
# Decode the JSON body of the sensors query. Despite its name, the result is
# used below as a mapping (via .get('data') and .get('fields')), not as a list.
response_list = query_PurpleAir.json()
#response_list

In [13]:
# Collect the sensor table into a pandas DataFrame.
# The column names are handed to the constructor (rather than assigned to
# data.columns afterwards) so that a query returning no rows still yields an
# empty, correctly labelled frame instead of a length-mismatch error.
data = pd.DataFrame(response_list.get('data'), columns=response_list.get('fields'))
# Index the rows by sensor id but keep 'sensor_index' as a regular column:
# later cells use both data.loc[sensor, ...] and data['sensor_index'].
data = data.set_index('sensor_index', drop=False)
data

Unnamed: 0_level_0,sensor_index,latitude,longitude,name,primary_id_a,primary_key_a,secondary_id_a,secondary_key_a,primary_id_b,primary_key_b,secondary_id_b,secondary_key_b,firmware_version
sensor_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
131701,131701,38.315340,-122.507950,West Valley,1528318,XB4WQ0GCMI0XZA5K,1528319,BPFA95I51WN5UGQJ,1528320,AC8WO1XSCNYM5JD8,1528321,KLA1VN2T34LVPIIO,7.00
132665,132665,38.540863,-123.096634,Mohrhardt,1535288,4HQV93HTJQ4JOEWE,1535289,TKC1EHCKDTU4ZCSK,1535290,ITR8P78QB8070AW0,1535291,TUGPSQS9791BUGGV,7.00
132747,132747,38.556640,-122.723070,MWSprings,1535541,BNJELGHG1M9TFUWE,1535542,XY71SAI77GKPEC13,1535543,KK0J6UNWB33FAZSW,1535545,Q1EUG2O79ACOAL2R,7.00
133351,133351,38.680504,-123.429200,Sea Ranch Lodge,1538454,VACLTRPDB9KU8ELH,1538456,RJCDS3BM8HQW7D04,1538458,MM7XQS1EUOORIUFV,1538460,2BG4M7USX06ETQNY,7.00
3184,3184,38.305100,-122.288994,Imola6,328118,EBTHGX7UOPRJZFKO,328119,TKZ3GCDS6SKMEO9H,328120,D79LMZE0FPIAAD1D,328121,C7CIZG8VK024HP42,6.06a
...,...,...,...,...,...,...,...,...,...,...,...,...,...
125929,125929,38.296890,-122.660760,Uncle Andy,1499092,6VNAQX1EYIOLA62Z,1499093,IG4NWUAVKSI4Q8JD,1499094,5CDEDLQ5LJY8KLSU,1499095,EGBSIQD4RN861O6E,7.00
126435,126435,38.108265,-122.242744,Morningside Addition,1500815,6O4AU6LOQTTFQJ2Y,1500816,V1O9YX5J4IDRZETO,1500817,CLWU130VHVHCW4YS,1500818,TT2IO4GYOFDUB0CD,7.00
126443,126443,38.397870,-122.821210,Petaluma Ave,1500831,SCDJU1N0361H2D52,1500832,BKJ0G70OKYJM7J2N,1500833,QFOW3NG574UXQ4GW,1500834,BDS86QQ33NCPG787,7.00
127483,127483,38.295532,-122.246700,Stone Bridge School,1505904,63HOLVZSBWTCLTIZ,1505905,LR3MAA1WIH6V5KBD,1505906,BUOJL2L8F79729MQ,1505907,RCZNYFY01L6A62O6,7.00


### Download and add PM2.5 data

Collect primary data for sensor A and B

https://api.thingspeak.com

In [5]:
def extract_pm(df, header, key, min_count=27):
    """Return hourly means of the selected Thingspeak fields of one channel.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feed records. Must contain a 'created_at' column and the field
        columns selected by ``key``.
    header : mapping
        Channel description mapping field ids ('field2', ...) to the column
        names used in the output.
    key : {'key_a', 'key_b'}
        'key_a' selects field6, field7, field2 and field8; 'key_b' selects
        field2 and field8. field2 and field8 are PM2.5; field6 and field7 are
        presumably temperature and humidity (confirm against the channel
        header).
    min_count : int, optional
        Minimum number of raw samples an hour needs to be kept. The default
        of 27 is 90% of the 30 samples per hour expected from 2-minute data.

    Returns
    -------
    pandas.DataFrame
        Hourly means rounded to 4 decimals, indexed by 'date (UTC)'. Hours
        with fewer than ``min_count`` samples are NaN.

    Raises
    ------
    ValueError
        If ``key`` is not one of the supported values.
    KeyError
        If a required column is missing from ``df`` (e.g. an empty download)
        or a field id is missing from ``header``.
    """
    fields_by_key = {
        'key_a': ['field6', 'field7', 'field2', 'field8'],
        'key_b': ['field2', 'field8'],
    }
    if key not in fields_by_key:
        raise ValueError(
            "key must be 'key_a' or 'key_b', got {!r}".format(key))
    fields = fields_by_key[key]

    # Selecting the columns raises KeyError when the download is empty, which
    # callers rely on. to_numeric returns a new frame (so nothing below writes
    # into a slice of df) and turns string or missing values into floats/NaN.
    collect_subset = df.loc[:, fields].apply(pd.to_numeric, errors='coerce')
    collect_subset.index = pd.to_datetime(df['created_at'])
    collect_subset.index.name = 'date (UTC)'
    collect_subset = collect_subset.rename(
        columns={field: header[field] for field in fields})

    # Purple air raw data is 2-minute temporal resolution; convert to hourly
    # averages. A Timedelta is used as the bin width because the 'H' string
    # alias is deprecated in recent pandas releases.
    hourly = collect_subset.resample(pd.Timedelta(hours=1))
    subset_hour = hourly.mean()
    # Data coverage check: blank out hours with too few raw samples.
    subset_hour = subset_hour.where(hourly.count() >= min_count)

    return subset_hour.round(decimals=4)

In [6]:
def _fetch_thingspeak_feed(channel_id, api_key, starttime, endtime, timeout=60):
    """Download one Thingspeak channel feed as (records DataFrame, channel header).

    Returns an empty DataFrame and ``None`` when the response body is not a
    JSON object. Network errors raised by ``requests`` (including a timeout)
    propagate to the caller.
    """
    url = 'https://api.thingspeak.com/channels/{}/feeds.json'.format(channel_id)
    params = {
        'api_key': api_key, # sensor API key
        'start': starttime, # start date YYYY-MM-DD HH:NN:SS
        'end': endtime, # end date YYYY-MM-DD HH:NN:SS
    }
    r = requests.get(url, params=params, timeout=timeout)

    try:
        data_raw = r.json()
    except ValueError:
        # Body was not valid JSON; treat it like an empty download.
        data_raw = None
    if not isinstance(data_raw, dict):
        return pd.DataFrame(), None

    df = pd.DataFrame(data_raw.get('feeds'))
    # Convert only the measurement columns to numbers. Forcing dtype=float on
    # the whole frame is fragile because 'created_at' holds timestamp strings.
    field_cols = [col for col in df.columns if str(col).startswith('field')]
    if field_cols:
        df[field_cols] = df[field_cols].apply(pd.to_numeric, errors='coerce')
    return df, data_raw.get('channel')


def collect_station(data,sensor,starttime,endtime):
    """Download and hourly-average the primary A and B channels of one sensor.

    Parameters
    ----------
    data : pandas.DataFrame
        Sensor table indexed by sensor id, with the columns 'primary_id_a',
        'primary_key_a', 'primary_id_b' and 'primary_key_b'.
    sensor : hashable
        Index label of the sensor in ``data``.
    starttime, endtime : str
        Start and end of the download window.

    Returns
    -------
    pandas.DataFrame
        Hourly data of channel A and channel B side by side. The channel B
        PM2.5 columns are renamed with a '_b' marker. A channel without usable
        data contributes no columns.
    """
    #----------------------------------------
    # Need to download archived data from Thingspeak.
    #----------------------------------------
    print(sensor)

    # (label, extract_pm key, id column, key column, column renames)
    channels = (
        ('A', 'key_a', 'primary_id_a', 'primary_key_a', None),
        ('B', 'key_b', 'primary_id_b', 'primary_key_b',
         {"PM2.5 (ATM)": "PM2.5_b (ATM)", "PM2.5 (CF=1)": "PM2.5_b (CF=1)"}),
    )

    station_frames = []
    for label, key, id_col, key_col, renames in channels:
        df, header = _fetch_thingspeak_feed(
            data.loc[sensor, id_col], data.loc[sensor, key_col],
            starttime, endtime)
        try:
            station_data = extract_pm(df, header, key)
            if renames:
                station_data = station_data.rename(columns=renames, errors="raise")
        except KeyError:
            # Raised when the expected columns are missing, e.g. no records
            # were returned for this window.
            print('DataFrame is empty for sensor ' + label
                  + ' in this date! Skipping this sensor: ' + str(sensor))
            station_data = pd.DataFrame()
        station_frames.append(station_data)

    final_station_data = pd.concat(station_frames, axis=1)

    return(final_station_data)

    

### Collect the sensor data and write to csv files

Set up an array of start dates so that the data can be collected in consecutive 7-day windows.


In [7]:
# Daily timestamps spanning the collection period.
date_array = pd.date_range(
    start=pd.Timestamp("2020-08-01 00:00:00"),
    end=pd.Timestamp("2020-11-1 00:00:00"),
    freq="D",
)
# Keep every 7th day: each entry is the start of one 7-day download window.
date_list = date_array[slice(None, None, 7)]

Process each sensor location separately

In [9]:
from datetime import timedelta

# Make sure the output folder exists before the first file is written.
output_dir = Path('raw_download')
output_dir.mkdir(parents=True, exist_ok=True)

for sensor in data['sensor_index']:
    station_meta = data.loc[sensor, ['sensor_index','name','latitude','longitude']]

    # Loop through 7-day windows because of download limit
    weekly_frames = [
        collect_station(data, sensor,
                        str(window_start),
                        str(window_start + timedelta(days=7)))
        for window_start in date_list
    ]
    # Concatenate once instead of growing the frame window by window.
    final_data = pd.concat(weekly_frames, axis=0)
    # Consecutive windows share their boundary timestamp, so the boundary hour
    # can appear in both downloads; keep the entry from the later window,
    # which covers that hour from its start.
    final_data = final_data[~final_data.index.duplicated(keep='last')]

    print(final_data)

    # Access the metadata by label (positional access on a label-indexed
    # Series is deprecated in pandas) and build the file-safe station name:
    # spaces are removed and forward slashes replaced to avoid saving issues.
    station_name = str(station_meta['name']).replace(' ', '').replace('/', '_')

    csv_name = output_dir / 'PurpleAir_{}_{}_{}_{}.csv'.format(
        station_meta['sensor_index'], station_name,
        station_meta['latitude'], station_meta['longitude'])
    final_data.to_csv(csv_name, index=True, na_rep='NaN')

131701
DataFrame is empty for sensor A in this date! Skipping this sensor: 131701
DataFrame is empty empty for sensor B in this date! Skipping this sensor: 131701
131701
DataFrame is empty for sensor A in this date! Skipping this sensor: 131701
DataFrame is empty empty for sensor B in this date! Skipping this sensor: 131701
131701
DataFrame is empty for sensor A in this date! Skipping this sensor: 131701
DataFrame is empty empty for sensor B in this date! Skipping this sensor: 131701
131701
DataFrame is empty for sensor A in this date! Skipping this sensor: 131701
DataFrame is empty empty for sensor B in this date! Skipping this sensor: 131701
131701
DataFrame is empty for sensor A in this date! Skipping this sensor: 131701
DataFrame is empty empty for sensor B in this date! Skipping this sensor: 131701
131701
DataFrame is empty for sensor A in this date! Skipping this sensor: 131701
DataFrame is empty empty for sensor B in this date! Skipping this sensor: 131701
131701
DataFrame is em