In [1]:
import requests
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import math

import json
from datetime import timedelta, date, datetime
import math

# show every column and every row when a DataFrame is displayed (no truncation)
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
#calculate how many of the past years the site has been higher than current and past limits
def over_limit(x, limit):
    """Count the entries of ``x`` that are strictly greater than ``limit``.

    ``x`` is compared element-wise with ``>`` and the resulting boolean mask
    is summed, so values equal to ``limit`` are not counted.
    """
    exceeds = x > limit
    return exceeds.sum()

def return_max_min(x,limit_type):
    """Return the latest or earliest date in ``x`` formatted as 'Month YYYY'.

    Parameters
    ----------
    x : object exposing ``.max()`` / ``.min()`` that return a value with
        ``.strftime`` (in this notebook, a pandas Series of datetimes).
    limit_type : str
        'max' for the latest date, 'min' for the earliest.

    Returns
    -------
    str
        The selected date rendered with the ``'%B %Y'`` format,
        e.g. 'December 2024'.

    Raises
    ------
    ValueError
        If ``limit_type`` is neither 'max' nor 'min'.
    """
    if limit_type == 'max':
        measure = x.max()
    elif limit_type == 'min':
        measure = x.min()
    else:
        #fail loudly instead of falling through to a placeholder value
        raise ValueError("limit_type must be 'max' or 'min', got {!r}".format(limit_type))
    return measure.strftime('%B %Y')

#we've turned this into a function so we can test the adjusted and unadjusted values
def compile_purpleair_site_summary(pa_pm25, pm_field,
                                   site_info_path='../GIS/purpleair/houmetro-pa-sensors-atleast_1yr.csv'):
    """Build a one-row-per-sensor PM2.5 summary from PurpleAir readings.

    Parameters
    ----------
    pa_pm25 : pandas.DataFrame
        Readings with at least the columns ``sensor_index``, ``year``,
        ``time_stamp``, ``date`` (datetime) and the column named by
        ``pm_field``. Presumably one row per sensor per day -- the 50%
        threshold below is expressed as a fraction of 365; confirm against
        the cleaning notebook.
    pm_field : str
        Name of the PM2.5 column to summarise (adjusted or unadjusted).
    site_info_path : str, optional
        CSV of sensor attributes. It must contain ``sensor_index``,
        ``cnty_fips``, ``cnty_nm``, ``latitude``, ``longitude``,
        ``location_type`` and ``date``. Defaults to the file this notebook
        has always used.

    Returns
    -------
    pandas.DataFrame
        One row per outdoor sensor that has readings, with yearly averages
        (``avg_2022`` .. ``avg_2024``), the three-year average, counts of
        years over 12 and 9, the count of readings over 35, max/min readings
        and dates, sensor attributes, ``compliance_2024`` and
        ``monitor_type`` set to 'PurpleAir'.

    Side effects: prints the number of outdoor sensors without readings and
    displays up to five of them.
    """
    years = [2022, 2023, 2024]

    #average and number of reported values per sensor and year
    by_site = pa_pm25.groupby(['sensor_index','year']).agg(value_cnt=('time_stamp','count'),
                                                        pm25_avg=(pm_field,'mean')
                                                        ).reset_index()

    #get rid of rows where the sensor didn't report at least 50% of the time
    by_site_full = by_site.loc[by_site['value_cnt'] >= 0.5*365]

    by_yr = pd.pivot_table(by_site_full,index='sensor_index',
                        columns='year',values='pm25_avg',aggfunc='mean').reset_index()
    #skipna=False: the three-year average is left empty unless all three years are present
    by_yr['avg2022_24'] = by_yr[years].mean(axis=1,skipna=False)

    by_yr['yrs_over12'] = by_yr[years].apply(lambda x: over_limit(x,12), axis=1)
    by_yr['yrs_over9'] = by_yr[years].apply(lambda x: over_limit(x,9), axis=1)

    #calculate how many readings per site (across all years) were higher than 35,
    #along with the extreme readings and the first/last month with data
    daily_over35 = pa_pm25.groupby('sensor_index').agg(over35=(pm_field, lambda x: over_limit(x, 35)),
                                                    max_read=(pm_field, 'max'),
                                                    min_read=(pm_field, 'min'),
                                                    min_date=('date',lambda x: return_max_min(x,'min')),
                                                    max_date=('date',lambda x: return_max_min(x,'max')),
                                                    daily_cnt=(pm_field,'count')
                                                    ).reset_index()

    pm25_site_summary = by_yr.merge(daily_over35, on='sensor_index', how='left')

    #lets also join with the sensor features we have
    site_info = pd.read_csv(site_info_path)
    site_info = site_info.rename(columns={'date':'sensor_created_date'})
    #outer join: sensors listed in the site file but absent from the readings
    #come through as all-NaN rows and are reported/removed below
    pm25_site_summary = pm25_site_summary.merge(site_info[['sensor_index','cnty_fips','cnty_nm',
                                                        'latitude','longitude','location_type',
                                                        'sensor_created_date']],on='sensor_index',how='outer')

    pm25_site_summary['county'] = pm25_site_summary['cnty_nm']

    #remove indoor monitors for this analysis
    pm25_site_summary = pm25_site_summary.loc[pm25_site_summary['location_type'] == 0]

    #remove rows that don't have any data
    #i checked to make sure empty max_value meant no other data in the row
    na_rows = pm25_site_summary.loc[pm25_site_summary['max_read'].isna()]
    print('NA rows:',len(na_rows))
    #sample() raises if asked for more rows than exist, so cap the sample size
    if len(na_rows) > 0:
        display(na_rows.sample(min(5, len(na_rows))))
    #copy so the column edits below act on an independent frame, not a slice
    pm25_site_summary = pm25_site_summary.loc[~pm25_site_summary['max_read'].isna()].copy()

    #let's make the columns agree with the EPA monitors
    rename_cols = {'sensor_index':'site_id','Site Latitude':'latitude','Site Longitude':'longitude'}
    pm25_site_summary = pm25_site_summary.rename(columns=rename_cols)
    #year columns are ints after the pivot; make every label a string
    pm25_site_summary.columns = [ str(x) for x in pm25_site_summary.columns ]

    #if the 2024 values is greater than 9, list "out" for compliance else "in"
    #NOTE(review): a missing (NaN) 2024 average also ends up as 'in' -- confirm that is intended
    pm25_site_summary['compliance_2024'] = pm25_site_summary['2024'].apply(lambda x: 'out' if x > 9 else 'in')

    #label as PurpleAir because we'll concat with EPA data later and rename some cols
    pm25_site_summary['monitor_type'] = 'PurpleAir'
    pm25_site_summary = pm25_site_summary.rename(columns={'2022':'avg_2022','2023':'avg_2023','2024':'avg_2024'})

    return pm25_site_summary

In [3]:
#EPA monitor summary, loaded so it can be stacked with the PurpleAir summary
epa_pm25 = pd.read_csv('../data/analyzed/houmetro-epa-pm25-site-summary.csv')

#PurpleAir readings cleaned in 04a_clean-purpleair-pm25.ipynb; turn the date strings into datetimes
pa_pm25 = pd.read_csv('../data/analyzed/purpleair/houmetro-pa-2022-2024-pm25-cleaned.csv')
pa_pm25['date'] = pd.to_datetime(pa_pm25['date'])

#site summary built from the Wallace-adjusted PM2.5_alt values.
#we experimented with the adjusted and unadjusted values (see below) and found that
#the adjusted values more closely matched EPA data for co-located monitors.
pm25adj_site_summary = compile_purpleair_site_summary(pa_pm25, 'pm2.5_alt_ADJ')

#write the PurpleAir-only summary
pm25adj_site_summary.to_csv('../data/analyzed/houmetro-purpleair-pm25adj-site-summary.csv',
                            index=False)

#stack the adjusted PurpleAir summary on top of the EPA summary and write the shared columns
combo_pm25adj = pd.concat([pm25adj_site_summary, epa_pm25])
reorder_cols = [
    'longitude', 'latitude', 'site_id',
    'avg_2022', 'avg_2023', 'avg_2024', 'avg2022_24',
    'yrs_over12', 'yrs_over9', 'over35', 'max_read', 'min_read',
    'cnty_fips', 'county', 'monitor_type', 'compliance_2024',
    'min_date', 'max_date', 'daily_cnt',
]
combo_pm25adj.loc[:, reorder_cols].to_csv(
    '../data/analyzed/houmetro-epa-purpleair-pm25adj-site-summary.csv',
    index=False)

NA rows: 11


Unnamed: 0,sensor_index,2022,2023,2024,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,min_date,max_date,daily_cnt,cnty_fips,cnty_nm,latitude,longitude,location_type,sensor_created_date,county
50,161765,,,,,,,,,,,,,48201.0,Harris,29.716173,-95.34164,0.0,2022-09-09 16:11:29,Harris
39,143862,,,,,,,,,,,,,48157.0,Fort Bend,29.574083,-95.529106,0.0,2022-01-31 22:08:40,Fort Bend
8,27821,,,,,,,,,,,,,48201.0,Harris,29.77074,-95.370056,0.0,2019-03-01 22:54:37,Harris
29,118895,,,,,,,,,,,,,48201.0,Harris,29.816628,-95.32461,0.0,2021-08-09 19:59:32,Harris
46,161003,,,,,,,,,,,,,48157.0,Fort Bend,29.585194,-95.45126,0.0,2022-08-31 21:53:23,Fort Bend


In [4]:
#first five rows of the stacked PurpleAir + EPA summary
combo_pm25adj.head(5)

Unnamed: 0,site_id,avg_2022,avg_2023,avg_2024,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,min_date,max_date,daily_cnt,cnty_fips,cnty_nm,latitude,longitude,location_type,sensor_created_date,county,compliance_2024,monitor_type
0,2386,6.55768,7.728895,7.111431,7.132669,0.0,0.0,2.0,37.966667,0.906667,January 2022,December 2024,1085.0,48167.0,Galveston,29.532282,-95.07647,0.0,2017-07-31 20:17:54,Galveston,in,PurpleAir
1,3033,7.232625,9.185048,9.415743,8.611139,0.0,2.0,1.0,38.646667,1.643333,January 2022,November 2024,932.0,48201.0,Harris,29.955063,-95.73739,0.0,2017-08-23 17:30:17,Harris,out,PurpleAir
2,3298,8.502818,9.881602,9.340946,9.241789,0.0,2.0,4.0,46.353333,1.303333,January 2022,December 2024,1087.0,48201.0,Harris,29.58335,-95.12009,0.0,2017-09-13 22:31:13,Harris,out,PurpleAir
4,3777,6.679303,9.097058,8.383,8.05312,0.0,1.0,4.0,49.81,0.793333,January 2022,December 2024,1052.0,48201.0,Harris,29.762896,-95.70764,0.0,2017-10-06 19:15:54,Harris,in,PurpleAir
5,6752,7.963083,8.481898,7.498726,7.981236,0.0,0.0,2.0,41.536667,1.19,January 2022,December 2024,1068.0,48201.0,Harris,29.71706,-95.30876,0.0,2018-01-30 20:02:52,Harris,in,PurpleAir


## Look into some findings

In [7]:
#split the combined summary by monitor network
pa = combo_pm25adj[combo_pm25adj['monitor_type'].eq('PurpleAir')]
epa = combo_pm25adj[combo_pm25adj['monitor_type'].eq('EPA')]

#mean of the yearly averages across EPA monitors in each county
epa.pivot_table(
    index='county',
    columns='monitor_type',
    values=['avg_2022', 'avg_2023', 'avg_2024'],
    aggfunc='mean',
).reset_index()

Unnamed: 0_level_0,county,avg_2022,avg_2023,avg_2024
monitor_type,Unnamed: 1_level_1,EPA,EPA,EPA
0,Brazoria,,7.5125,7.948333
1,Galveston,7.995087,9.771338,7.031707
2,Harris,9.875487,10.795599,10.392295
3,Montgomery,9.688252,10.187778,12.048451


## Look at ADJ and UNADJ side-by-side to see the differences

In [None]:
#exporting the site summary that uses the unadjusted PM2.5_alt values
#the readings frame is the first argument and the PM2.5 column name the second
pm25unadj_site_summary = compile_purpleair_site_summary(pa_pm25,'pm2.5_alt_UNADJ')
pm25unadj_site_summary.to_csv('../data/analyzed/houmetro-purpleair-pm25unadj-site-summary.csv',index=False)

#concat unadjusted with EPA data and save too
combo_pm25unadj = pd.concat([pm25unadj_site_summary,epa_pm25])
reorder_cols = ['longitude','latitude','site_id', 'avg_2022', 'avg_2023', 'avg_2024', 
                'avg2022_24', 'yrs_over12','yrs_over9', 'over35', 'max_read', 'min_read', 
                'cnty_fips','county','monitor_type','compliance_2024','min_date',
                'max_date','daily_cnt']
combo_pm25unadj[reorder_cols].to_csv('../data/analyzed/houmetro-epa-purpleair-pm25unadj-site-summary.csv',index=False)

In [None]:
#columns to carry into the adjusted vs. unadjusted comparison
keep_cols = [
    'site_id', 'avg_2022', 'avg_2023', 'avg_2024', 'avg2022_24',
    'yrs_over12', 'yrs_over9', 'over35', 'max_read', 'min_read',
    'min_date', 'max_date', 'daily_cnt', 'cnty_nm',
]

#left join on site_id: every adjusted row is kept, with the unadjusted values beside it
adj_compare = pd.merge(
    combo_pm25adj[keep_cols],
    combo_pm25unadj[keep_cols],
    how='left',
    on='site_id',
    suffixes=('_adj','_unadj'),
)

#interleave the adjusted/unadjusted pairs so they read side-by-side
reorder_cols = [
    'site_id',
    'avg_2022_adj', 'avg_2022_unadj',
    'avg_2023_adj', 'avg_2023_unadj',
    'avg_2024_adj', 'avg_2024_unadj',
    'avg2022_24_adj', 'avg2022_24_unadj',
    'yrs_over12_adj', 'yrs_over12_unadj',
    'yrs_over9_adj', 'yrs_over9_unadj',
    'over35_adj', 'over35_unadj',
    'max_read_adj', 'max_read_unadj',
    'min_read_adj', 'min_read_unadj',
    'min_date_adj', 'max_date_adj', 'daily_cnt_adj', 'cnty_nm_adj',
]
adj_compare.loc[:, reorder_cols].to_csv(
    '../data/analyzed/houmetro-purpleair-pm25adj-unadj-comparison.csv',
    index=False)

In [None]:
#first five rows of the side-by-side comparison
adj_compare.loc[:, reorder_cols].head(5)

In [None]:
#looking at the two co-located EPA and PurpleAir sensors
pa_co = 166421
epa_co = 482010046

adj_compare.loc[adj_compare['site_id'].isin([pa_co,epa_co])][reorder_cols].head()

In [None]:
#second pair of site ids to compare
pa2_co = 99797
epa2_co = 482011035

#comparison rows for that pair
adj_compare.loc[adj_compare['site_id'].isin([pa2_co, epa2_co]), reorder_cols].head(5)

## Just a couple of integrity checks

In [None]:
#eyeball the first five rows of the combined adjusted summary
combo_pm25adj.head(5)

In [None]:
#look up a single site by id
combo_pm25adj[combo_pm25adj['site_id'].eq(161015)]

In [None]:
#rows of the unadjusted summary with no max reading
pm25unadj_site_summary[pm25unadj_site_summary['max_read'].isna()]