In [10]:
import requests
import os
import pandas as pd
import geopandas as gpd
import time
import json
from datetime import timedelta, date
import math

# Lift the display caps so DataFrames render with all of their columns and rows
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [16]:
# Load the PurpleAir data that get-historical-purple-air.ipynb fetched
pa_df = pd.read_csv('../data/analyzed/purpleair/houmetro-pa-2022-2024-pm25.csv')

# The pull contains duplicate rows (to be sorted out in the pull code later);
# for now drop exact duplicates and print the row count on either side.
print('pre dedupe:', pa_df.shape[0])
pa_df = pa_df.drop_duplicates()
print('post dedupe:', pa_df.shape[0])

# Readable date parsed from time_stamp (interpreted as epoch seconds), plus its year
pa_df['date'] = pd.to_datetime(pa_df['time_stamp'], unit='s')
pa_df['year'] = pa_df['date'].dt.year

# PM2.5 conversion per Lance Wallace's comments: rescale each channel by 3.4/3.0,
# then average the two adjusted channels row-wise (mean() skips a missing channel).
alt_scale = 3.4/3.0
adjusted_cols = []
for channel in ('a', 'b'):
    adjusted_name = f'pm2.5_alt_{channel}_ADJ'
    pa_df[adjusted_name] = pa_df[f'pm2.5_alt_{channel}'] * alt_scale
    adjusted_cols.append(adjusted_name)
pa_df['pm2.5_alt_ADJ'] = pa_df[adjusted_cols].mean(axis=1)

pre dedupe: 80575
post dedupe: 52128


In [17]:
# Row count after cleaning, then a preview of the first rows
print(pa_df.shape[0])
display(pa_df.iloc[:5])

52128


Unnamed: 0,time_stamp,pm2.5_alt_a,pm2.5_alt_b,sensor_index,date,year,pm2.5_alt_a_ADJ,pm2.5_alt_b_ADJ,pm2.5_alt_ADJ
0,1651104000,4.5,,2386,2022-04-28,2022,5.1,,5.1
1,1647648000,0.9,,2386,2022-03-19,2022,1.02,,1.02
2,1661644800,2.2,,2386,2022-08-28,2022,2.493333,,2.493333
3,1644105600,9.9,,2386,2022-02-06,2022,11.22,,11.22
4,1691020800,5.6,,2386,2023-08-03,2023,6.346667,,6.346667


In [18]:
# One row per sensor per year: how many rows it has and its mean adjusted PM2.5
sensor_year_groups = pa_df.groupby(['sensor_index', 'year'])
by_sensor_yr = sensor_year_groups.agg(
    value_cnt=pd.NamedAgg(column='time_stamp', aggfunc='count'),
    pm25_avg=pd.NamedAgg(column='pm2.5_alt_ADJ', aggfunc='mean'),
)
by_sensor_yr = by_sensor_yr.reset_index()

In [21]:
# Number of sensor/year rows, then the first nine of them
print(by_sensor_yr.shape[0])
display(by_sensor_yr.iloc[:9])

Unnamed: 0_level_0,Unnamed: 1_level_0,value_cnt,pm25_avg
sensor_index,year,Unnamed: 2_level_1,Unnamed: 3_level_1
2386,2022,365,6.577059
2386,2023,365,7.769388
2386,2024,364,7.174872
3033,2022,359,7.212451
3033,2023,365,9.021799
3033,2024,365,8.065763
3298,2022,365,8.527324
3298,2023,365,9.923808
3298,2024,366,9.398925


In [40]:
#get rid of rows where the sensor didn't report at least 75% of the time
#the cutoff is 75% of the days in each row's own calendar year (366 for leap
#years such as 2024) instead of a flat 365
#NOTE(review): assumes value_cnt is a count of daily rows -- confirm against the pull
MIN_COVERAGE = 0.75
yr = by_sensor_yr['year']
is_leap = (yr % 4 == 0) & ((yr % 100 != 0) | (yr % 400 == 0))
days_in_year = 365 + is_leap.astype(int)
by_sensor_yr_min = by_sensor_yr.loc[by_sensor_yr['value_cnt'] >= MIN_COVERAGE*days_in_year]

#one row per sensor, one column per year
#each sensor/year pair appears once, so 'mean' just carries pm25_avg through;
#'sum' would turn a missing (NaN) average into a misleading 0.0
by_sensor = pd.pivot_table(by_sensor_yr_min,index='sensor_index',
                           columns='year',values='pm25_avg',aggfunc='mean').reset_index()

#lets also join with the sensor features we have
#build the trimmed/renamed frame in one chain (no inplace rename on a column subset)
houmetro_sensors = (
    pd.read_csv('../GIS/purpleair/houmetro-pa-sensors-atleast_1yr.csv')
    .loc[:, ['sensor_index','date_created','latitude','longitude','date']]
    .rename(columns={'date':'sensor_created_date'})
)
#left join keeps every sensor that passed the coverage filter, even without features
by_sensor = by_sensor.merge(houmetro_sensors,on='sensor_index',how='left')

#and export for visualization
by_sensor.to_csv('../data/analyzed/purpleair/houmetro-pa-pm25-data-2022-2024.csv',index=False)

In [39]:
# First 20 sensors when ranked by the 2024 column, highest value first
by_sensor.sort_values(by=2024, ascending=False).iloc[:20]

Unnamed: 0,sensor_index,2022,2023,2024,date_created,latitude,longitude,sensor_created_date
22,97279,10.178205,12.022458,12.317067,1610556764,29.743322,-95.23209,2021-01-13 16:52:44
47,166421,,,12.312743,1665505153,29.82794,-95.283745,2022-10-11 16:19:13
53,186271,,,12.146453,1688080370,29.777805,-95.13781,2023-06-29 23:12:50
52,181765,,,12.125066,1684783932,29.822308,-95.30618,2023-05-22 19:32:12
6,25999,8.386046,9.775699,11.341991,1549304400,30.053808,-95.494644,2019-02-04 18:20:00
48,166435,,,11.23358,1665505507,29.74531,-95.2333,2022-10-11 16:25:07
54,186277,,,11.193858,1688080385,29.81704,-95.0665,2023-06-29 23:13:05
49,166441,,,11.190428,1665505510,29.776548,-95.153625,2022-10-11 16:25:10
25,99171,,11.641817,11.16507,1612542619,29.84179,-95.28896,2021-02-05 16:30:19
57,186327,,,11.093067,1688080973,29.79457,-95.14891,2023-06-29 23:22:53
