In [None]:
import requests
import os
import pandas as pd
import geopandas as gpd
import time
import json
from datetime import timedelta, date
import math

#show every column and every row when a frame is displayed (no truncation)
for display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_opt, None)

In [None]:
#For reference: the sensor_index of every indoor monitor (location_type == 1)
site_info = pd.read_csv('../GIS/purpleair/houmetro-pa-sensors-atleast_1yr.csv')
is_indoor = site_info['location_type'] == 1
indoor_sensor_list = site_info.loc[is_indoor, 'sensor_index'].tolist()
indoor_sensor_list

In [None]:
#load the PM2.5 readings that get-historical-purple-air.ipynb fetched
pa_pm25 = pd.read_csv('../data/analyzed/purpleair/houmetro-pa-2022-2024-pm25.csv')

#the pull code currently leaves exact duplicate rows behind; drop them here
#until that is fixed upstream, and report the row count before and after
print('pre dedupe:',len(pa_pm25))
pa_pm25 = pa_pm25.drop_duplicates()
print('post dedupe:',len(pa_pm25))

#time_stamp is parsed as epoch seconds; keep a readable datetime and its year
reading_time = pd.to_datetime(pa_pm25['time_stamp'],unit='s')
pa_pm25['date'] = reading_time
pa_pm25['year'] = reading_time.dt.year

#PM2.5 conversion per Lance Wallace's comments: scale each channel by 3.4/3.0,
#then average the two adjusted channels row by row
alt_scale = 3.4/3.0
adjusted_cols = {'pm2.5_alt_a':'pm2.5_alt_a_ADJ','pm2.5_alt_b':'pm2.5_alt_b_ADJ'}
for raw_col, adj_col in adjusted_cols.items():
    pa_pm25[adj_col] = pa_pm25[raw_col]*alt_scale
pa_pm25['pm2.5_alt_ADJ'] = pa_pm25[list(adjusted_cols.values())].mean(axis=1)

In [None]:
#Remove days known to have high pollution not from industry: NYE, 4th of July
#(New Year's Day, July 4th and New Year's Eve for every year 2018-2024,
# kept as 'YYYY-MM-DD' strings in chronological order)
holidays = [f'{yr}-{month_day}'
            for yr in range(2018, 2025)
            for month_day in ('01-01', '07-04', '12-31')]

#Compare calendar days, not full timestamps: normalize() zeroes the time-of-day so a
#reading stamped at any hour of a holiday is still dropped, and the holiday strings
#are parsed to datetimes explicitly instead of relying on isin() to coerce them.
holiday_dates = pd.to_datetime(holidays)
is_holiday = pa_pm25['date'].dt.normalize().isin(holiday_dates)

pa_pm25 = pa_pm25.loc[~is_holiday]
print('post holiday remove:',len(pa_pm25))

In [5]:
#--- annual average per sensor ---------------------------------------------------
#one row per sensor per year: how many readings it reported and its mean adjusted PM2.5
by_site = pa_pm25.groupby(['sensor_index','year']).agg(value_cnt=('time_stamp','count'),
                                                       pm25_avg=('pm2.5_alt_ADJ','mean')
                                                       ).reset_index()

#get rid of rows where the sensor didn't report at least 50% of the time
#(0.5*365 = 182.5, so a sensor-year needs 183 or more readings to be kept)
min_readings_per_year = 0.5*365
by_site_full = by_site.loc[by_site['value_cnt'] >= min_readings_per_year]

#wide table: one row per sensor, one column per year holding the annual average
over_cols = [2022,2023,2024]
by_yr = pd.pivot_table(by_site_full,index='sensor_index',
                       columns='year',values='pm25_avg',aggfunc='mean').reset_index()
#skipna=False: the 3-year average is NaN unless all three years have a value
by_yr['avg2022_24'] = by_yr[over_cols].mean(axis=1,skipna=False)

#calculate how many of the past years the site has been higher than current and past limits
def over_limit(x, limit):
    """Return how many values in ``x`` are strictly greater than ``limit``.

    ``x`` is anything that supports ``x > limit`` followed by ``.sum()``
    (here: a pandas Series). NaN values compare False and are not counted.
    """
    return (x > limit).sum()

#vectorized equivalent of applying over_limit to every row
by_yr['yrs_over12'] = (by_yr[over_cols] > 12).sum(axis=1)
by_yr['yrs_over9'] = (by_yr[over_cols] > 9).sum(axis=1)

#per sensor, over ALL readings in pa_pm25 (not per year, and not limited to the
#sensor-years that passed the 50% filter): number of readings above 35, plus max and min
daily_over35 = pa_pm25.groupby('sensor_index').agg(days_over35=('pm2.5_alt_ADJ', lambda x: over_limit(x, 35)),
                                                   max_value=('pm2.5_alt_ADJ', 'max'),
                                                   min_value=('pm2.5_alt_ADJ', 'min')
                                                   ).reset_index()

pm25_site_summary = by_yr.merge(daily_over35, on='sensor_index', how='left')

#--- attach sensor metadata ------------------------------------------------------
#lets also join with the sensor features we have
#(re-read here so this cell does not depend on the indoor-sensor cell having run)
site_info = (pd.read_csv('../GIS/purpleair/houmetro-pa-sensors-atleast_1yr.csv')
             .rename(columns={'date':'sensor_created_date'}))
site_cols = ['sensor_index','cnty_fips','cnty_nm',
             'latitude','longitude','location_type',
             'sensor_created_date']
#outer join is kept on purpose: the two filters below discard the unmatched rows,
#and switching join type could change column dtypes in the exported CSV
pm25_site_summary = pm25_site_summary.merge(site_info[site_cols],on='sensor_index',how='outer')

#remove indoor monitors for this analysis (keep location_type == 0)
#and remove rows that don't have any data
#i checked to make sure empty max_value meant no other data in the row
is_outdoor = pm25_site_summary['location_type'] == 0
has_data = pm25_site_summary['max_value'].notna()
#.copy() so the renames / new column below act on an independent frame
#rather than a slice (avoids SettingWithCopyWarning)
pm25_site_summary = pm25_site_summary.loc[is_outdoor & has_data].copy()

#let's make the columns agree with the EPA monitors
#('Site Latitude'/'Site Longitude' are not among the columns selected above, so those
# two entries are no-ops here; they are kept so the mapping is unchanged)
rename_cols = {'sensor_index':'site_id','Site Latitude':'latitude','Site Longitude':'longitude'}
pm25_site_summary = pm25_site_summary.rename(columns=rename_cols)
#year columns are ints after the pivot; make every column name a string
pm25_site_summary.columns = [ str(x) for x in pm25_site_summary.columns ]


#export purpleair
pm25_site_summary['monitor_type'] = 'PurpleAir'
pm25_site_summary.to_csv('../data/analyzed/houmetro-purpleair-pm25-site-summary.csv',index=False)

#concat with EPA data and save too
epa_pm25 = pd.read_csv('../data/analyzed/houmetro-epa-pm25-site-summary.csv')
#The combined export selects a 'county' column, but the PurpleAir frame carries the
#county name as 'cnty_nm', so PurpleAir county names were being lost in the combined file.
#Rename only for the concat so the PurpleAir-only CSV above keeps its existing schema.
#NOTE(review): assumes the EPA summary's 'county' column holds county names - confirm.
pa_for_combo = pm25_site_summary.rename(columns={'cnty_nm':'county'})
combo_pm25 = pd.concat([pa_for_combo,epa_pm25])
reorder_cols = ['longitude','latitude','site_id', '2018','2019', '2020', '2021','2022', '2023', '2024', 
                'avg2018_19', 'avg2019_21', 'avg2020_22','avg2021_23','avg2022_24', 
                'yrs_over12','yrs_over9', 'days_over35', 'max_value', 'min_value', 
                'cnty_fips','county','monitor_type','sensor_created_date']
combo_pm25[reorder_cols].to_csv('../data/analyzed/houmetro-epa-purpleair-pm25-site-summary.csv',index=False)

## Just a couple of integrity checks

In [None]:
#not all the points we pulled data for made it into the final data summary
#NOTE(review): the summary cell already drops rows whose max_value is empty before
#exporting, so when the cells are run in order this count is 0
print(len(pm25_site_summary.loc[pm25_site_summary['max_value'].isna()]))

In [None]:
#show the summary rows that have no max_value (empty once the summary cell has filtered them out)
pm25_site_summary.loc[pm25_site_summary['max_value'].isna()]

In [None]:
#that's cause I ditched sensor-years that reported less than 50% of the year
#(the summary keeps value_cnt >= 0.5*365, so anything under 182.5 readings is out)
#here's an example of not enough data to calculate an annual avg.

#use the same cutoff as the summary filter so a sensor-year with exactly 182 readings shows up here too
too_few = by_site.loc[by_site['value_cnt'] < 0.5*365]
#sample() raises if asked for more rows than exist, so cap the sample size
display(too_few.sample(min(3, len(too_few))))

In [None]:
#headline counts: sensors with any data, sensors that survived the filters, EPA sites, combined rows
pa_with_data = by_site['sensor_index'].nunique()
pa_with_enough = pm25_site_summary['max_value'].notna().sum()
print('PA monitors with data:',pa_with_data)
print('PA monitors with enough data:',pa_with_enough)
print('EPA monitors:',epa_pm25.shape[0])
print('EPA + PA monitors for map:',combo_pm25.shape[0])

In [None]:
#column names first, then the first five rows of the PurpleAir summary
summary_cols = pm25_site_summary.columns
print(summary_cols)
display(pm25_site_summary.head(5))

In [None]:
#same preview for the combined EPA + PurpleAir table, in export column order
combo_export = combo_pm25[reorder_cols]
print(combo_export.columns)
display(combo_export.head())