In [1]:
import requests
import os
import pandas as pd
import geopandas as gpd
import time
import json
from datetime import timedelta, date
import math

# Here we store our API read key in a string variable that we can reference later.
my_api_read_key = os.environ['PURPLEAIR_KEY']



Some useful links about the PurpleAir API and about air quality measureing:
- [What is the Difference Between CF=1, ATM, and ALT?](https://community.purpleair.com/t/what-is-the-difference-between-cf-1-atm-and-alt/6442)
- [Loop API Calls for Historical Data](https://community.purpleair.com/t/loop-api-calls-for-historical-data/4623)
- [API field descriptions](https://community.purpleair.com/t/api-fields-descriptions/4652)
- [How to Make Efficient API Calls](https://community.purpleair.com/t/how-to-make-efficient-api-calls/6906)
- [Python script for downloading and organizing historical API data](https://community.purpleair.com/t/python-script-for-downloading-and-organizing-historical-api-data/3726)
- [Calibration of PurpleAir monitors](https://community.purpleair.com/t/calibration-of-purpleair-monitors/482/3)
- [Python script: Get Historical Data](https://github.com/alamp326/PA_DataScripts/blob/main/pa_get_historicaldata_bygroup.py)
- I'm starting to wonder if some of the advice in these community posts are outdated... seems like maybe the ALT field is what does the calibration because the [Sensors - Get Sensor Data > Sensor data fields >  pm2.5_alt](https://api.purpleair.com/) seems to suggest that the calculation is baked in... but I think Lance recommends 3.4 now not just 3 which is maybe the root of [his May 1 comment here](https://community.purpleair.com/t/what-is-the-difference-between-cf-1-atm-and-alt/6442/14?u=akanik)... I'm going to need to create a new community post or email someone.

Also, if you have any questions about PM2.5 and the science behind it... [Lance Arthur Wallace](https://community.purpleair.com/u/lance/summary) is probably your guy. He has helped PurpleAir reverse engineer some really nerding sounding shit having to do with how the monitors detect different types of PM2.5.

It seems like the PM2.5 ALT is what I want to pull, but according to the comments on [this article](https://community.purpleair.com/t/what-is-the-difference-between-cf-1-atm-and-alt/6442/14), it also seems like I'll need to do some maths to correct the values produced here. Namely, I need to multiply the values by 3.4/3 (1.333333) to get the most accurate reading.

**Update to methodology 2024.12.31:**

I will message Lance/PurpleAir to make sure i have this right but... I think this is the way forward after hours of reading the docs, specifically the current API docs for PM2.5 ALT found here [Sensors - Get Sensor Data > Sensor data fields >  pm2.5_alt](https://api.purpleair.com/) and this May 1, 2024 comment from Lance found [here](https://community.purpleair.com/t/what-is-the-difference-between-cf-1-atm-and-alt/6442/14?u=akanik): 
- use PM2.5 ALT
- multiply the PM2.5 ALT value by 3.4/3.0 to implement the most current correction value. 

In [2]:
#all houmetro regardless of recency of data or sensor age: '../GIS/purpleair/houmetro-pa-sensors.csv'
#houmetro with data from last 30 days and sensors older than 1 year: '../GIS/purpleair/houmetro-pa-sensors-atleast_1yr.csv'
houmetro_sensors = pd.read_csv('../GIS/purpleair/houmetro-pa-sensors-atleast_1yr.csv')
#for testing purposes we're gonna limit to like 5 of these
# houmetro_sensors = houmetro_sensors.sample(2)
# display(houmetro_sensors)
print(len(houmetro_sensors))

75


In [3]:
def getHistSensorData(sensor_index,my_fields,other_params):
    
    api_url = f'https://api.purpleair.com/v1/sensors/{sensor_index}/history'

    # my_headers is assigned the context of our request we want to make. In this case
    # we will pass through our API read key using the variable created above.
    my_headers = {'X-API-Key':my_api_read_key}

    # my_params is assigned a list of fields of data we are requesting. Excluding the
    # fields parameter will collect all available fields.
    # example input structure: 'temperature,pm2.5_atm'
    field_param = {'fields':my_fields}
    
    #other_params should be a dict, see the following for options:
    # https://api.purpleair.com/#api-sensors-get-sensor-history
    my_params = {**field_param,**other_params}

    # This line creates and sends the request and then assigns its response to the
    # variable, r.
    r = requests.get(api_url, headers=my_headers, params=my_params)

    # We then return the response we received.
    return r

In [4]:
#############################################
#SETTING UP SOME VARIABLES
#############################################

fields = ['pm2.5_alt_a','pm2.5_alt_b']
fields_str = ','.join(fields)

#the average time period for the data
#if you change this, all of the shit below needs to change too cause you were too
#lazy to turn this into a function. asshole.
data_avg = 1440

#be kind, don't overload the server
sleep_sec = 1

#establish start/end dates
start_timestamp = '2022-01-01T00:00:00'
end_timestamp = '2024-12-31T00:00:00'

#i'm also going to record which sensor and timeframe we got data for before
#inevitable issues popup and i need to fucking start over and waste a bunch of
#PA credits
successful_pulls = {}


#############################################
#WORKING WITH TIME
#############################################
#there are limits on how much data can be pulled based on the data_avg parameter. 
#for 1440 (day) the limit is 2 years worth of data. We'll need to adjust our 
#request to pull data in chunks if we want more than that.

#i'm not going to waste time and create a condition for all of these right now, 
#but once i have to do this enough times, I can create a function to handle this
#Time period    |   data_avg value  |   historical limit
#_______________________________________________________
#Real-time      |   0               |   30 days
#10-minute	    |   10              |   60 days
#30-minute	    |   30              |   90 days
#1-hour	        |   60              |   180 days
#6-hour	        |   360             |   1 year
#1-day          |   1440            |   2 years
#1-week	        |   10080           |   5 years
#1-month        |   43200           |   20 years
#1-year	        |   525600          |   100 years

#get timestamp from date
start_timestamp = pd.to_datetime(start_timestamp)
end_timestamp = pd.to_datetime(end_timestamp)

timeframe_pulls = [[start_timestamp,end_timestamp]]

historical_limit = timedelta(days=365*2).total_seconds()/3600
curr_timeframe = (timeframe_pulls[0][1] - timeframe_pulls[0][0]).total_seconds()/3600
# print('historical_limit',historical_limit)
# print('curr_timeframe',curr_timeframe)

new_timeframe_pulls = []

if data_avg == 1440 and curr_timeframe > historical_limit:
    print('Data range exceeds 2 years, engaging in multiple pulls')
    
    #how many pulls will we need to do? we're rounding up cause we want to make sure
    #we get all the data
    pulls = math.ceil(curr_timeframe / historical_limit)
    print('Pulls necessary:',pulls)
    
    og_start = timeframe_pulls[0][0]
    for i in range(0,int(pulls)):
        if i == 0:
            new_start = og_start
        else:
            new_start = new_timeframe_pulls[i-1][1]
        new_end = new_start + timedelta(days=365*2)
        if new_end > pd.Timestamp(date.today()):
            new_end = pd.Timestamp(date.today() - timedelta(days=1))
        new_timeframe_pulls.append([new_start,new_end])
    
    print('New timeframe pulls:',new_timeframe_pulls)
    
if len(new_timeframe_pulls) == 0:
    new_timeframe_pulls = timeframe_pulls
 
   
#############################################
#DO THE LOOPDY LOOP
#############################################

dfs = []
for timeframe in new_timeframe_pulls:
    print('Pulling data for timeframe:',timeframe)
    
    timeframe_str = str(int(timeframe[0].timestamp()))+'_'+str(int(timeframe[1].timestamp()))
    
    successful_pulls[timeframe_str] = []
    
    other_params = {'average':int(data_avg),
                    'start_timestamp':int(timeframe[0].timestamp()),
                    'end_timestamp':int(timeframe[1].timestamp())}
    
    for index, row in houmetro_sensors.iterrows():
    #for sensor_index in houmetro_sensors['sensor_index']:
        sensor_index = row['sensor_index']
        sensor_created = pd.to_datetime(row['date'])
        
        #if the sensor was created after the end of the timeframe, we don't want to pull data
        if sensor_created > pd.to_datetime(int(timeframe[1].timestamp()),unit='s'):
            print('Skipping sensor',sensor_index,'because it was created after the end of the timeframe')
            continue
        
        sensor_filename = f'../data/analyzed/purpleair/sensor-data/pa-sensor-{sensor_index}-hist-2022-2024.csv'
        sensor_data = getHistSensorData(int(sensor_index),fields_str,other_params)
        sensor_data_json = sensor_data.json()
        print(sensor_data_json)
        if 'data' in sensor_data_json and len(sensor_data_json['data']) > 0:
            if len(sensor_data_json['data']) > 0:
                hist_df = pd.DataFrame(sensor_data_json['data'],columns=sensor_data_json['fields'])
                hist_df['sensor_index'] = sensor_index
                
                successful_pulls[timeframe_str].append(sensor_index)
                
                if os.path.exists(sensor_filename):
                    prev_data = pd.read_csv(sensor_filename)
                    hist_df = pd.concat([prev_data,hist_df])
                
                hist_df.to_csv(sensor_filename,index=False)
                
                dfs.append(hist_df)
        time.sleep(sleep_sec)
    

Data range exceeds 2 years, engaging in multiple pulls
Pulls necessary: 2
New timeframe pulls: [[Timestamp('2022-01-01 00:00:00'), Timestamp('2024-01-01 00:00:00')], [Timestamp('2024-01-01 00:00:00'), Timestamp('2025-01-01 00:00:00')]]
Pulling data for timeframe: [Timestamp('2022-01-01 00:00:00'), Timestamp('2024-01-01 00:00:00')]
{'api_version': 'V1.0.14-0.0.58', 'time_stamp': 1735859150, 'sensor_index': 2386, 'start_timestamp': 1640995200, 'end_timestamp': 1704067200, 'average': 1440, 'private': 0, 'fields': ['time_stamp', 'pm2.5_alt_a', 'pm2.5_alt_b'], 'data': [[1651104000, 4.5, None], [1647648000, 0.9, None], [1661644800, 2.2, None], [1644105600, 9.9, None], [1691020800, 5.6, None], [1676851200, 5.3, None], [1696809600, 4.6, None], [1697155200, 3.6, None], [1645660800, 13.4, None], [1702166400, 1.7, None], [1659571200, 5.0, None], [1677888000, 6.9, None], [1692662400, 3.1, None], [1656460800, 10.9, None], [1674086400, 2.8, None], [1670284800, 2.8, None], [1671667200, 15.5, None], [

  hist_df = pd.concat([prev_data,hist_df])


{'api_version': 'V1.0.14-0.0.58', 'time_stamp': 1735859573, 'sensor_index': 6752, 'start_timestamp': 1704067200, 'end_timestamp': 1735689600, 'average': 1440, 'private': 0, 'fields': ['time_stamp', 'pm2.5_alt_a', 'pm2.5_alt_b'], 'data': [[1712275200, 3.7, 3.9], [1724371200, 6.6, 7.9], [1715990400, 18.3, 18.1], [1727568000, 6.9, 8.3], [1705795200, 5.9, 5.7], [1724457600, 8.4, 10.0], [1717977600, 7.0, 6.9], [1708387200, 7.2, 7.1], [1734393600, 2.9, 3.9], [1723852800, 3.8, 4.3], [1721692800, 2.2, 2.2], [1716163200, 9.7, 9.4], [1732838400, 4.3, 4.6], [1718755200, 6.1, 6.2], [1713312000, 9.1, 9.7], [1730332800, 4.2, 4.7], [1708819200, 4.5, 4.6], [1717200000, 9.3, 9.3], [1727481600, 5.7, 6.8], [1713139200, 3.4, 3.4], [1726617600, 4.4, 5.2], [1733270400, 6.2, 6.8], [1720396800, 2.5, 2.4], [1730160000, 4.9, 5.6], [1722211200, 4.2, 4.3], [1735344000, 9.4, 12.2], [1728000000, 7.1, 8.1], [1709942400, 5.7, 5.9], [1704585600, 10.3, 10.0], [1715558400, 11.6, 11.8], [1711238400, 13.9, 15.0], [1714521

In [5]:
#concat and save real quick. Then we can process in the next script
houmetro_sensor_data = pd.concat(dfs)
houmetro_sensor_data.to_csv('../data/analyzed/purpleair/houmetro-pa-2022-2024-pm25.csv',index=False)

  houmetro_sensor_data = pd.concat(dfs)


In [None]:
#just a little testy-poo to make sure things are lining up the way we thought they would
sensor_index = str(161015)
test_file = f'../data/analyzed/purpleair/sensor-data/pa-sensor-{sensor_index}-hist-2022-2024.csv'

test_df = pd.read_csv(test_file)
test_df['date'] = pd.to_datetime(test_df['time_stamp'],unit='s')

print(len(test_df))
print(test_df['date'].min())
print(test_df['date'].max())
display(test_df.head())