In [1]:
# Excellent API call code by 
## https://github.com/zfarooqui/py_purpleair_aqi/tree/main

import requests
import pandas as pd
import json

def get_sensors_list(nwlng, nwlat, selng, selat, location, key_read):
    # PurpleAir API URL
    root_url = 'https://api.purpleair.com/v1/sensors/'

    # Constructing lat_lon parameters
    lat_lon = {
        'nwlng': nwlng,
        'nwlat': nwlat,
        'selng': selng,
        'selat': selat
    }
    ll_api_url = ''.join([f'&{key}={value}' for key, value in lat_lon.items()])

    # Fields to retrieve
    fields_list = ['sensor_index', 'name', 'latitude', 'longitude', 'location_type']
    fields_api_url = '&fields=' + ','.join(fields_list)

    # Indoor, outdoor, or all
    if location == 'indoor':
        loc_api = '&location_type=1'
    elif location == 'outdoor':
        loc_api = '&location_type=0'
    else:
        loc_api = ''

    # Final API URL
    api_url = f"{root_url}?api_key={key_read}{fields_api_url}{ll_api_url}{loc_api}"

    # Getting data
    response = requests.get(api_url)

    if response.status_code == 200:
        json_data = response.json().get('data', [])
        if json_data:
            df = pd.DataFrame.from_records(json_data, columns=fields_list)
        else:
            df = pd.DataFrame(columns=fields_list)
    else:
        raise requests.exceptions.RequestException(f"Failed to fetch data from {api_url}")

    # Saving to PostgreSQL (optional)
    # df.to_sql('tablename', con=engine, if_exists='append', index=False)

    # Saving to CSV file
    df.to_csv("../datasets/sensor_data/sensor_list", index=False, header=True)

    # Creating a list of sensor indices
    sensors_list = df['sensor_index'].tolist()

    return sensors_list

In [2]:
with open('../keys.json') as fi:
    credentials = json.load(fi)

In [3]:
credentials

{'api_key': 'F743EBF7-F1C6-11EE-B9F7-42010A80000D'}

In [4]:
# 35.193264659685866, -101.92062397763742
# 35.16868927923588, -101.87431391404589
nwlng = -101.94  # Northwest longitude of the bounding box
nwlat = 35.20   # Northwest latitude of the bounding box
selng = -101.87  # Southeast longitude of the bounding box
selat = 35.16   # Southeast latitude of the bounding box
location = 'outdoor'  # You can specify 'indoor', 'outdoor', or 'all'
keys_file_path = '../keys.json'  # Path to your keys.json file

sensors_list = get_sensors_list(nwlng, nwlat, selng, selat, location, credentials['api_key'])
print(sensors_list)

[133634]


In [5]:
df = pd.read_csv('../datasets/sensor_data/sensor_list')
df

Unnamed: 0,sensor_index,name,latitude,longitude,location_type
0,133634,AMS Allergy Amarillo,0,35.173115,-101.933075


In [10]:
from datetime import datetime
import time
from io import StringIO
# from sqlalchemy import create_engine

# Starting engine for postgresql
# engine = create_engine('postgresql://postgres:password@location:port/database')

# API Keys provided by PurpleAir(c)
key_read  = credentials['api_key']

# Sleep Seconds
sleep_seconds = 3 # wait sleep_seconds after each query


def get_historicaldata(sensors_list,bdate,edate,average_time,key_read):
    # Historical API URL
    root_api_url = 'https://api.purpleair.com/v1/sensors/'
    
    # Average time: The desired average in minutes, one of the following:0 (real-time),10 (default if not specified),30,60
    average_api = f'&average={average_time}'

    # Creating fields api url from fields list to download the data: Note: Sensor ID/Index will not be downloaded as default
    fields_list = ['pm2.5_atm_a', 'pm2.5_atm_b', 'pm2.5_cf_1_a', 'pm2.5_cf_1_b', 'humidity_a', 'humidity_b', 
               'temperature_a', 'temperature_b', 'pressure_a', 'pressure_b']
    for i,f in enumerate(fields_list):
        if (i == 0):
            fields_api_url = f'&fields={f}'
        else:
            fields_api_url += f'%2C{f}'

    # Dates of Historical Data period
    begindate = datetime.fromisoformat(bdate)
    enddate   = datetime.fromisoformat(edate)
    
    # Downlaod days based on average
    if (average_time == 60):
        datelist = pd.date_range(begindate,enddate,freq='14d') # for 14 days of data
    else:
        datelist = pd.date_range(begindate,enddate,freq='2d') # for 2 days of data
        
    # Reversing to get data from end date to start date
    datelist = datelist.tolist()
    datelist.reverse()
    
    # Converting to PA required format
    date_list=[]
    for dt in datelist:
        dd = dt.strftime('%Y-%m-%d') + 'T' + dt.strftime('%H:%M:%S') +'Z'
        date_list.append(dd)

    # to get data from end date to start date
    len_datelist = len(date_list) - 1
        
    # Getting 2-data for one sensor at a time
    for s in sensors_list:
        # Adding sensor_index & API Key
        hist_api_url = root_api_url + f'{s}/history/csv?api_key={key_read}'

        # Creating start and end date api url
        for i,d in enumerate(date_list):
            # Wait time 
            time.sleep(sleep_seconds)
            
            if (i < len_datelist):
                print('Downloading for Amarillo: %s for Dates: %s and %s.' %(s,date_list[i+1],d))
                dates_api_url = f'&start_timestamp={date_list[i+1]}&end_timestamp={d}'
            
                # Final API URL
                api_url = hist_api_url + dates_api_url + average_api + fields_api_url
                            
                #
                try:
                    response = requests.get(api_url)
                except:
                    print(api_url)
                #
                try:
                    assert response.status_code == requests.codes.ok
                
                    # Creating a Pandas DataFrame
                    df = pd.read_csv(StringIO(response.text), sep=",", header=0)
                
                except AssertionError:
                    df = pd.DataFrame()
                    print('Bad URL!')
            
                if df.empty:
                    print('------------- No Data Available -------------')
                else:
                    # Dropping duplicate rows
                    df = df.drop_duplicates(subset=None, keep='first', inplace=False)
                    
                    # writing to csv file
                    filename = ('../datasets/sensor_data/Amarillo/6hrs_avg/\sensorsID_%s_%s_%s.csv' % (s,date_list[i+1],d))
                    df.to_csv(filename, index=False, header=True)

In [11]:
# Data download period
bdate = '2024-02-01T00:00:00+00:00' 
edate = '2024-04-01T00:00:00+00:00'

df = pd.read_csv("../datasets/sensor_data/sensor_list")

# Creating a list of sensor indices
sensors_list = df['sensor_index'].tolist()

# Average_time. The desired average in minutes, one of the following: 0 (real-time), 
#                  10 (default if not specified), 30, 60, 360 (6 hour), 1440 (1 day)
average_time=360  # or 10  or 0 (Current script is set only for real-time, 10, or 60 minutes data)

# Getting Nashville_area data
get_historicaldata(sensors_list, bdate, edate, average_time, key_read)

Downloading for Amarillo: 133634 for Dates: 2024-03-30T00:00:00Z and 2024-04-01T00:00:00Z.
Downloading for Amarillo: 133634 for Dates: 2024-03-28T00:00:00Z and 2024-03-30T00:00:00Z.
Downloading for Amarillo: 133634 for Dates: 2024-03-26T00:00:00Z and 2024-03-28T00:00:00Z.
Downloading for Amarillo: 133634 for Dates: 2024-03-24T00:00:00Z and 2024-03-26T00:00:00Z.
Downloading for Amarillo: 133634 for Dates: 2024-03-22T00:00:00Z and 2024-03-24T00:00:00Z.
Downloading for Amarillo: 133634 for Dates: 2024-03-20T00:00:00Z and 2024-03-22T00:00:00Z.
Downloading for Amarillo: 133634 for Dates: 2024-03-18T00:00:00Z and 2024-03-20T00:00:00Z.
Downloading for Amarillo: 133634 for Dates: 2024-03-16T00:00:00Z and 2024-03-18T00:00:00Z.
Downloading for Amarillo: 133634 for Dates: 2024-03-14T00:00:00Z and 2024-03-16T00:00:00Z.
Downloading for Amarillo: 133634 for Dates: 2024-03-12T00:00:00Z and 2024-03-14T00:00:00Z.
Downloading for Amarillo: 133634 for Dates: 2024-03-10T00:00:00Z and 2024-03-12T00:00:00Z.