In [1]:
# Excellent API call code by 
## https://github.com/zfarooqui/py_purpleair_aqi/tree/main

import requests
import pandas as pd
import json
import sqlite3

def get_sensors_list(nwlng, nwlat, selng, selat, location, key_read):
    # PurpleAir API URL
    root_url = 'https://api.purpleair.com/v1/sensors/'

    # Constructing lat_lon parameters
    lat_lon = {
        'nwlng': nwlng,
        'nwlat': nwlat,
        'selng': selng,
        'selat': selat
    }
    ll_api_url = ''.join([f'&{key}={value}' for key, value in lat_lon.items()])

    # Fields to retrieve
    fields_list = ['sensor_index', 'name', 'latitude', 'longitude']
    fields_api_url = '&fields=' + ','.join(fields_list)

    # Indoor, outdoor, or all
    if location == 'indoor':
        loc_api = '&location_type=1'
    elif location == 'outdoor':
        loc_api = '&location_type=0'
    else:
        loc_api = ''

    # Final API URL
    api_url = f"{root_url}?api_key={key_read}{fields_api_url}{ll_api_url}{loc_api}"

    # Getting data
    response = requests.get(api_url)

    if response.status_code == 200:
        json_data = response.json().get('data', [])
        if json_data:
            df = pd.DataFrame.from_records(json_data, columns=fields_list)
        else:
            df = pd.DataFrame(columns=fields_list)
    else:
        raise requests.exceptions.RequestException(f"Failed to fetch data from {api_url}")

    # Saving to sqlite (optional)
    db = sqlite3.connect('../datasets/dallas.sqlite')
    df.to_sql('sensor_table',
              db, 
              if_exists='append', 
              index=False)
    
    db.execute('CREATE INDEX sensor_index ON sensor_table(sensor_index)')
    db.close()

    # Saving to CSV file
    df.to_csv("../datasets/sensor_data/sensor_list", index=False, header=True)

    # Creating a list of sensor indices
    sensors_list = df['sensor_index'].tolist()

    return sensors_list

In [2]:
with open('../keys.json') as fi:
    credentials = json.load(fi)

In [4]:
# # 35.193264659685866, -101.92062397763742
# # 35.16868927923588, -101.87431391404589
# nwlng = -101.94  # Northwest longitude of the bounding box
# nwlat = 35.20   # Northwest latitude of the bounding box
# selng = -101.87  # Southeast longitude of the bounding box
# selat = 35.16   # Southeast latitude of the bounding box
# location = 'outdoor'  # You can specify 'indoor', 'outdoor', or 'all'
# keys_file_path = '../keys.json'  # Path to your keys.json file

# sensors_list = get_sensors_list(nwlng, nwlat, selng, selat, location, credentials['api_key'])
# print(sensors_list)

[133634]


In [4]:
# # Bounding Box for Dallas
# # 33.308403956633406, -97.42744748978863
# # 32.36533339607889, -96.27246459337103
# nwlng = -97.43  # Northwest longitude of the bounding box
# nwlat = 33.31   # Northwest latitude of the bounding box
# selng = -96.28  # Southeast longitude of the bounding box
# selat = 32.37  # Southeast latitude of the bounding box
# location = 'outdoor'  # You can specify 'indoor', 'outdoor', or 'all'
# keys_file_path = '../keys.json'  # Path to your keys.json file

# sensors_list = get_sensors_list(nwlng, nwlat, selng, selat, location, credentials['api_key'])
# print(sensors_list)

[2644, 9504, 12969, 13013, 16271, 46221, 51821, 53365, 53389, 59801, 59903, 59907, 59953, 72071, 80533, 87019, 87485, 87721, 90785, 95481, 97395, 99159, 99163, 99187, 99279, 99309, 99585, 99593, 99595, 104402, 109566, 112984, 113642, 113648, 113708, 113833, 113857, 113969, 113975, 114119, 114329, 118993, 120681, 122927, 123021, 123409, 123453, 127045, 127049, 127059, 127067, 127075, 128645, 133820, 135002, 142154, 144032, 144468, 147038, 151486, 164335, 164965, 165171, 182041, 184053, 196323, 196421, 221467]


In [2]:
df = pd.read_csv('../datasets/sensor_data/sensor_list')
df

Unnamed: 0,sensor_index,name,latitude,longitude
0,2644,Sachse Farms,32.990840,-96.599300
1,9504,Rosebud,32.419700,-97.008156
2,12969,Meadow Glen,32.991665,-96.859150
3,13013,GPTX,32.766370,-97.037926
4,16271,Arbormont Estates,32.882004,-97.084130
...,...,...,...,...
65,182041,CleanAIRE NC Hickory Creek,33.126100,-97.059390
66,184053,Creekwood Estates,32.909042,-97.124160
67,196323,Duck Creek Air Quality,32.960114,-96.692200
68,196421,Kilmichael Lane - PAII,32.977444,-96.776110


In [18]:
from datetime import datetime
average_time = 720
bdate = '2018-04-01T00:00:00+00:00' 
edate = '2024-04-01T00:00:00+00:00'
begindate = datetime.fromisoformat(bdate)
enddate   = datetime.fromisoformat(edate)

# Downlaod days based on average
if (average_time == 60):
    datelist = pd.date_range(begindate,enddate,freq='14d') # for 14 days of data
else:
    datelist = pd.date_range(begindate,enddate,freq='2d') # for 2 days of data
    
# Reversing to get data from end date to start date
datelist = datelist.tolist()
datelist.reverse()


In [19]:
datelist

[Timestamp('2024-04-01 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-30 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-28 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-26 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-24 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-22 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-20 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-18 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-16 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-14 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-12 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-10 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-08 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-06 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-04 00:00:00+0000', tz='UTC'),
 Timestamp('2024-03-02 00:00:00+0000', tz='UTC'),
 Timestamp('2024-02-29 00:00:00+0000', tz='UTC'),
 Timestamp('2024-02-27 00:00:00+0000', tz='UTC'),
 Timestamp('2024-02-25 00:00:00+0000', tz='UTC'),
 Timestamp('2024-02-23 00:00:00+0000', tz='UTC'),


In [20]:
date_list=[]
for dt in datelist:
    dd = dt.strftime('%Y-%m-%d') + 'T' + dt.strftime('%H:%M:%S') +'Z'
    date_list.append(dd)


In [21]:
date_list

['2024-04-01T00:00:00Z',
 '2024-03-30T00:00:00Z',
 '2024-03-28T00:00:00Z',
 '2024-03-26T00:00:00Z',
 '2024-03-24T00:00:00Z',
 '2024-03-22T00:00:00Z',
 '2024-03-20T00:00:00Z',
 '2024-03-18T00:00:00Z',
 '2024-03-16T00:00:00Z',
 '2024-03-14T00:00:00Z',
 '2024-03-12T00:00:00Z',
 '2024-03-10T00:00:00Z',
 '2024-03-08T00:00:00Z',
 '2024-03-06T00:00:00Z',
 '2024-03-04T00:00:00Z',
 '2024-03-02T00:00:00Z',
 '2024-02-29T00:00:00Z',
 '2024-02-27T00:00:00Z',
 '2024-02-25T00:00:00Z',
 '2024-02-23T00:00:00Z',
 '2024-02-21T00:00:00Z',
 '2024-02-19T00:00:00Z',
 '2024-02-17T00:00:00Z',
 '2024-02-15T00:00:00Z',
 '2024-02-13T00:00:00Z',
 '2024-02-11T00:00:00Z',
 '2024-02-09T00:00:00Z',
 '2024-02-07T00:00:00Z',
 '2024-02-05T00:00:00Z',
 '2024-02-03T00:00:00Z',
 '2024-02-01T00:00:00Z',
 '2024-01-30T00:00:00Z',
 '2024-01-28T00:00:00Z',
 '2024-01-26T00:00:00Z',
 '2024-01-24T00:00:00Z',
 '2024-01-22T00:00:00Z',
 '2024-01-20T00:00:00Z',
 '2024-01-18T00:00:00Z',
 '2024-01-16T00:00:00Z',
 '2024-01-14T00:00:00Z',


In [5]:
from datetime import datetime
import time
from io import StringIO
# from sqlalchemy import create_engine

# Starting engine for postgresql
# engine = create_engine('postgresql://postgres:password@location:port/database')

# API Keys provided by PurpleAir(c)
key_read  = credentials['api_key']

# Sleep Seconds
sleep_seconds = 3 # wait sleep_seconds after each query


def get_historicaldata(sensors_list,bdate,edate,average_time,key_read):
    # Historical API URL
    root_api_url = 'https://api.purpleair.com/v1/sensors/'
    
    # Average time: The desired average in minutes, one of the following:0 (real-time),10 (default if not specified),30,60
    average_api = f'&average={average_time}'

    # Creating fields api url from fields list to download the data: Note: Sensor ID/Index will not be downloaded as default
    fields_list = ['pm2.5_atm_a', 'pm2.5_atm_b', 'pm2.5_cf_1_a', 'pm2.5_cf_1_b', 'humidity_a', 'humidity_b', 
               'temperature_a', 'temperature_b', 'pressure_a', 'pressure_b']
    for i,f in enumerate(fields_list):
        if (i == 0):
            fields_api_url = f'&fields={f}'
        else:
            fields_api_url += f'%2C{f}'

    # Dates of Historical Data period
    begindate = datetime.fromisoformat(bdate)
    enddate   = datetime.fromisoformat(edate)
    
    # Downlaod days based on average
    if (average_time == 60):
        datelist = pd.date_range(begindate,enddate,freq='14d') # for 14 days of data
    else:
        datelist = pd.date_range(begindate,enddate,freq='2d') # for 2 days of data
        
    # Reversing to get data from end date to start date
    datelist = datelist.tolist()
    datelist.reverse()
    
    # Converting to PA required format
    date_list=[]
    for dt in datelist:
        dd = dt.strftime('%Y-%m-%d') + 'T' + dt.strftime('%H:%M:%S') +'Z'
        date_list.append(dd)

    # to get data from end date to start date
    len_datelist = len(date_list) - 1
    db = sqlite3.connect('../datasets/dallas.sqlite')    
    # Getting 2-data for one sensor at a time
    for s in sensors_list:
        # Adding sensor_index & API Key
        hist_api_url = root_api_url + f'{s}/history/csv?api_key={key_read}'

        # Creating start and end date api url
        for i,d in enumerate(date_list):
            # Wait time 
            time.sleep(sleep_seconds)
            
            if (i < len_datelist):
                print('Downloading for Dallas: %s for Dates: %s and %s.' %(s,date_list[i+1],d))
                dates_api_url = f'&start_timestamp={date_list[i+1]}&end_timestamp={d}'
            
                # Final API URL
                api_url = hist_api_url + dates_api_url + average_api + fields_api_url
                            
                #
                try:
                    response = requests.get(api_url)
                except:
                    print(api_url)
                #
                try:
                    assert response.status_code == requests.codes.ok
                
                    # Creating a Pandas DataFrame
                    df = pd.read_csv(StringIO(response.text), sep=",", header=0)
                
                except AssertionError:
                    df = pd.DataFrame()
                    print('Bad URL!')
            
                if df.empty:
                    print('------------- No Data Available -------------')
                else:
                    # Dropping duplicate rows
                    df = df.drop_duplicates(subset=None, keep='first', inplace=False)
                    # adding to sqlite
                    df.to_sql('data_table',
                        db, 
                        if_exists='append', 
                        index=False)
                    
                    # writing to csv file
                    filename = ('../datasets/sensor_data/Dallas/\sensorsID_%s_%s_%s.csv' % (s,date_list[i+1],d))
                    df.to_csv(filename, index=False, header=True)
    db.execute('CREATE INDEX IF NOT EXISTS sensor_index ON data_table(sensor_index)')
    db.close()

In [6]:
df = pd.read_csv("../datasets/sensor_data/sensor_list")

In [8]:
sensor_list = df['sensor_index'].to_list()

In [12]:
sensor_list_copy = sensor_list[3::]

In [13]:
sensor_list_copy

[13013,
 16271,
 46221,
 51821,
 53365,
 53389,
 59801,
 59903,
 59907,
 59953,
 72071,
 80533,
 87019,
 87485,
 87721,
 90785,
 95481,
 97395,
 99159,
 99163,
 99187,
 99279,
 99309,
 99585,
 99593,
 99595,
 104402,
 109566,
 109638,
 112984,
 113642,
 113648,
 113708,
 113833,
 113857,
 113969,
 113975,
 114119,
 114329,
 118993,
 120681,
 122927,
 123021,
 123409,
 123453,
 127045,
 127049,
 127059,
 127067,
 127075,
 128645,
 133363,
 133820,
 135002,
 142154,
 144032,
 144468,
 147038,
 151486,
 164335,
 164965,
 165171,
 182041,
 184053,
 196323,
 196421,
 221467]

In [14]:
# # Data download period for the first round of data 
# bdate = '2022-04-01T00:00:00+00:00' 
# edate = '2024-04-01T00:00:00+00:00'

# df = pd.read_csv("../datasets/sensor_data/sensor_list")

# # Creating a list of sensor indices
# # sensors_list = df['sensor_index'].tolist()

# # Average_time. The desired average in minutes, one of the following: 0 (real-time), 
# #                  10 (default if not specified), 30, 60, 360 (6 hour), 1440 (1 day)
# average_time= 1440  # or 10  or 0 (Current script is set only for real-time, 10, or 60 minutes data)

# # Getting Nashville_area data
# get_historicaldata(sensor_list_copy, bdate, edate, average_time, key_read)

Downloading for Dallas: 13013 for Dates: 2024-03-29T00:00:00Z and 2024-03-31T00:00:00Z.
Downloading for Dallas: 13013 for Dates: 2024-03-27T00:00:00Z and 2024-03-29T00:00:00Z.
Downloading for Dallas: 13013 for Dates: 2024-03-25T00:00:00Z and 2024-03-27T00:00:00Z.
Downloading for Dallas: 13013 for Dates: 2024-03-23T00:00:00Z and 2024-03-25T00:00:00Z.
Downloading for Dallas: 13013 for Dates: 2024-03-21T00:00:00Z and 2024-03-23T00:00:00Z.
Downloading for Dallas: 13013 for Dates: 2024-03-19T00:00:00Z and 2024-03-21T00:00:00Z.
Downloading for Dallas: 13013 for Dates: 2024-03-17T00:00:00Z and 2024-03-19T00:00:00Z.
Downloading for Dallas: 13013 for Dates: 2024-03-15T00:00:00Z and 2024-03-17T00:00:00Z.
Downloading for Dallas: 13013 for Dates: 2024-03-13T00:00:00Z and 2024-03-15T00:00:00Z.
Downloading for Dallas: 13013 for Dates: 2024-03-11T00:00:00Z and 2024-03-13T00:00:00Z.
Downloading for Dallas: 13013 for Dates: 2024-03-09T00:00:00Z and 2024-03-11T00:00:00Z.
Downloading for Dallas: 13013 fo