In [1]:
import requests
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import math

import json
from datetime import timedelta, date, datetime
import math

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Make sure that the difference between pm2.5_alt_a and pm2.5_alt_b is no more 
# than 70% of either a or b
def get_ab_divergence(a,b):
    if a == 0 or b == 0:
        if a == 0 and b == 0:
            return False
        if a == 0 and b > .7:
            return True
        if b == 0 and a > .7:
            return True
    elif math.isnan(a) or math.isnan(b):
        return False
    else:
        div_a = round((abs(a-b)/a),4)
        div_b = round((abs(a-b)/b),4)
        if div_a >= .7 or div_b >= .7:
            return True
        else:
            return False

In [None]:
#a little test to make sure i'm not insane
a = np.nan
b = 12

#these two are th same
#print('the difference between a and b is', round((abs(a-b)/a),4), 'percent of a')
#print('the difference between a and b is', round((abs(a-b)/b),4), 'percent of b')
print(get_ab_divergence(b,a))

In [3]:
#read in the data we fetched from get-historical-purple-air.ipynb
pa_pm25 = pd.read_csv('../data/analyzed/purpleair/houmetro-pa-2022-2024-pm25.csv')

#there are dups in the data... i'll figure it out in the pull code but for now lets remove
print('pre dedupe:',len(pa_pm25))
pa_pm25 = pa_pm25.drop_duplicates()
print('post dedupe:',len(pa_pm25))

#add a readable date
pa_pm25['date'] = pd.to_datetime(pa_pm25['time_stamp'],unit='s')
pa_pm25['year'] = pa_pm25['date'].dt.year

#assess the divergence issue
pa_pm25['divergency_issue'] = pa_pm25.apply(lambda x: get_ab_divergence(x['pm2.5_alt_a'],x['pm2.5_alt_b']), axis=1)

#need to do the PM2.5 conversion per Lance Wallace's comments
pa_pm25['pm2.5_alt_a_ADJ'] = pa_pm25['pm2.5_alt_a']*(3.4/3.0)
pa_pm25['pm2.5_alt_b_ADJ'] = pa_pm25['pm2.5_alt_b']*(3.4/3.0)
pa_pm25['pm2.5_alt_ADJ'] = pa_pm25[['pm2.5_alt_a_ADJ','pm2.5_alt_b_ADJ']].mean(axis=1)

#we're also calculating the unadjusted alt channels because PurpleAir might have already implemented the adjustment
pa_pm25['pm2.5_alt_UNADJ'] = pa_pm25[['pm2.5_alt_a','pm2.5_alt_b']].mean(axis=1)

pre dedupe: 80575
post dedupe: 52128


In [17]:
#how many records are we ditching with our divergence issue?
display(pa_pm25.groupby('divergency_issue').size())
print((len(pa_pm25.loc[pa_pm25['divergency_issue'] == True])/len(pa_pm25))*100)

divergency_issue
False    48965
True      3156
dtype: int64

6.054327808471455


In [None]:
#are some sensors more affected than others by the divergence issue?
by_site_divergence = pd.pivot_table(pa_pm25,index='sensor_index',
                                    columns='divergency_issue',
                                    values='pm2.5_alt_a',
                                    aggfunc='count').reset_index().sort_values(True,ascending=False)
#by_site_divergence
by_site_divergence.columns = ['sensor_index','divergent_FALSE','divergent_TRUE']
by_site_divergence['divergent_TRUE'] = by_site_divergence['divergent_TRUE'].fillna(0)
by_site_divergence['divergent_TRUE'] = by_site_divergence['divergent_TRUE'].astype(int)
by_site_divergence['divergent_FALSE'] = by_site_divergence['divergent_FALSE'].fillna(0)
by_site_divergence['divergent_FALSE'] = by_site_divergence['divergent_FALSE'].astype(int)

by_site_divergence['share_divergent'] = by_site_divergence['divergent_TRUE'] / (by_site_divergence['divergent_TRUE'] + by_site_divergence['divergent_FALSE'])
by_site_divergence['share_divergent'] = by_site_divergence['share_divergent'].fillna(0)

In [13]:
by_site_divergence.sort_values('share_divergent',ascending=False)

Unnamed: 0,sensor_index,divergent_FALSE,divergent_TRUE,share_divergent
8,27821,0,1076,1.0
15,46237,0,384,1.0
44,161003,43,163,0.791262
60,180145,86,317,0.7866
11,30593,901,195,0.17792
12,31163,824,152,0.155738
41,154769,195,34,0.148472
1,3033,939,150,0.137741
37,135062,818,108,0.116631
47,161159,329,43,0.115591


In [None]:
#i have two things i need to do... average the channels together and apply the calibration
#is order of operations important here? I should know but i forget so let's test!

a = 1233543
b = 95832
#avg then multiply
print('avg then mult:',((a+b)/2)*(3.4/3.0))
#multiply then avg
print('mult then avg:',((a*(3.4/3.0))+(b*(3.4/3.0)))/2)

In [None]:
#Remove days known to have high pollution not from industry: NYE, 4th of July
holidays = ['2018-01-01','2018-07-04','2018-12-31',
            '2019-01-01','2019-07-04','2019-12-31',
            '2020-01-01','2020-07-04','2020-12-31',
            '2021-01-01','2021-07-04','2021-12-31',
            '2022-01-01','2022-07-04','2022-12-31',
            '2023-01-01','2023-07-04','2023-12-31',
            '2024-01-01','2024-07-04','2024-12-31',]

pa_pm25 = pa_pm25.loc[~pa_pm25['date'].isin(holidays)]
print('post holiday remove:',len(pa_pm25))

In [None]:
#calculate how many of the past years the site has been higher than current and past limits
def over_limit(x, limit):
    return (x > limit).sum()

def return_max_min(x,limit_type):
    measure = 'max'
    if limit_type == 'max':
        measure = x.max()
    elif limit_type == 'min':
        measure = x.min()
    max_fmt = measure.strftime('%B %Y')
    #print(limit_type,':',max_fmt)
    return max_fmt

def compile_purpleair_site_summary(pm_field):
    by_site = pa_pm25.groupby(['sensor_index','year']).agg(value_cnt=('time_stamp','count'),
                                                        pm25_avg=(pm_field,'mean')
                                                        ).reset_index()
    
    #get rid of rows where the sensor didn't report at least 50% of the time
    by_site_full = by_site.loc[by_site['value_cnt'] >= 0.5*365]

    by_yr = pd.pivot_table(by_site_full,index='sensor_index',
                        columns='year',values='pm25_avg',aggfunc='mean').reset_index()
    by_yr['avg2022_24'] = by_yr[[2022, 2023, 2024]].mean(axis=1,skipna=False)

    over_cols = [2022,2023,2024]
    by_yr['yrs_over12'] = by_yr[over_cols].apply(lambda x: over_limit(x,12), axis=1)
    by_yr['yrs_over9'] = by_yr[over_cols].apply(lambda x: over_limit(x,9), axis=1)

    #calculate how many days per year each site had daily averages higher than 35
    daily_over35 = pa_pm25.groupby('sensor_index').agg(over35=(pm_field, lambda x: over_limit(x, 35)),
                                                    max_read=(pm_field, 'max'),
                                                    min_read=(pm_field, 'min'),
                                                    min_date=('date',lambda x: return_max_min(x,'min')),
                                                    max_date=('date',lambda x: return_max_min(x,'max')),
                                                    daily_cnt=(pm_field,'count')
                                                    ).reset_index()

    pm25_site_summary = by_yr.merge(daily_over35, on='sensor_index', how='left')

    #lets also join with the sensor features we have
    site_info = pd.read_csv('../GIS/purpleair/houmetro-pa-sensors-atleast_1yr.csv')
    site_info.rename(columns={'date':'sensor_created_date'},inplace=True)
    pm25_site_summary = pm25_site_summary.merge(site_info[['sensor_index','cnty_fips','cnty_nm',
                                                        'latitude','longitude','location_type',
                                                        'sensor_created_date']],on='sensor_index',how='outer')

    #remove indoor monitors for this analysis
    pm25_site_summary = pm25_site_summary.loc[pm25_site_summary['location_type'] == 0]

    #remove rows that don't have any data
    #i checked to make sure empty max_value meant no other data in the row
    print('NA rows:',len(pm25_site_summary.loc[pm25_site_summary['max_read'].isna()]))
    display(pm25_site_summary.loc[pm25_site_summary['max_read'].isna()].sample(5))
    pm25_site_summary = pm25_site_summary.loc[~pm25_site_summary['max_read'].isna()]

    #let's make the columns agree with the EPA monitors
    rename_cols = {'sensor_index':'site_id','Site Latitude':'latitude','Site Longitude':'longitude'}
    pm25_site_summary.rename(columns=rename_cols, inplace=True)
    pm25_site_summary.columns = [ str(x) for x in pm25_site_summary.columns ]

    #if the 2024 values is greater than 9, list "out" for compliance else "in"
    pm25_site_summary['compliance_2024'] = pm25_site_summary['2024'].apply(lambda x: 'out' if x > 9 else 'in')


    #label as purpleaair because we'll concat with EPA data later and rename some cols
    pm25_site_summary['monitor_type'] = 'PurpleAir'
    pm25_site_summary.rename(columns={'2022':'avg_2022','2023':'avg_2023','2024':'avg_2024'},inplace=True)
    
    return pm25_site_summary

In [None]:
#load in EPA data so we can concat
epa_pm25 = pd.read_csv('../data/analyzed/houmetro-epa-pm25-site-summary.csv')

In [None]:
#exporting the site summary that uses the Wallace adjusted PM2.5_alt values
pm25adj_site_summary = compile_purpleair_site_summary('pm2.5_alt_ADJ')

pm25adj_site_summary.to_csv('../data/analyzed/houmetro-purpleair-pm25adj-site-summary.csv',index=False)

#concat adjusted with EPA data and save too
combo_pm25adj = pd.concat([pm25adj_site_summary,epa_pm25])
reorder_cols = ['longitude','latitude','site_id', 'avg_2022', 'avg_2023', 'avg_2024', 
                'avg2022_24', 'yrs_over12','yrs_over9', 'over35', 'max_read', 'min_read', 
                'cnty_fips','county','monitor_type','compliance_2024','min_date',
                'max_date','daily_cnt']
combo_pm25adj[reorder_cols].to_csv('../data/analyzed/houmetro-epa-purpleair-pm25adj-site-summary.csv',index=False)

In [None]:
#exporting the site summary that uses the unadjusted PM2.5_alt values
pm25unadj_site_summary = compile_purpleair_site_summary('pm2.5_alt_UNADJ')
pm25unadj_site_summary.to_csv('../data/analyzed/houmetro-purpleair-pm25unadj-site-summary.csv',index=False)

#concat adjusted with EPA data and save too
combo_pm25unadj = pd.concat([pm25unadj_site_summary,epa_pm25])
reorder_cols = ['longitude','latitude','site_id', 'avg_2022', 'avg_2023', 'avg_2024', 
                'avg2022_24', 'yrs_over12','yrs_over9', 'over35', 'max_read', 'min_read', 
                'cnty_fips','county','monitor_type','compliance_2024','min_date',
                'max_date','daily_cnt']
combo_pm25unadj[reorder_cols].to_csv('../data/analyzed/houmetro-epa-purpleair-pm25unadj-site-summary.csv',index=False)

## Look at ADJ and UNADJ side-by-side to see the differences

In [None]:
keep_cols = ['site_id', 'avg_2022', 'avg_2023', 'avg_2024', 'avg2022_24',
       'yrs_over12', 'yrs_over9', 'over35', 'max_read', 'min_read', 'min_date',
       'max_date', 'daily_cnt', 'cnty_nm']
adj_compare = combo_pm25adj[keep_cols].merge(combo_pm25unadj[keep_cols],on='site_id',how='left',suffixes=('_adj','_unadj'))

reorder_cols = ['site_id', 'avg_2022_adj','avg_2022_unadj', 'avg_2023_adj','avg_2023_unadj', 
                'avg_2024_adj','avg_2024_unadj', 'avg2022_24_adj','avg2022_24_unadj',
                'yrs_over12_adj', 'yrs_over12_unadj', 
                'yrs_over9_adj','yrs_over9_unadj', 'over35_adj','over35_unadj', 
                'max_read_adj','max_read_unadj', 'min_read_adj','min_read_unadj', 
                'min_date_adj','max_date_adj', 'daily_cnt_adj', 'cnty_nm_adj']
adj_compare[reorder_cols].to_csv('../data/analyzed/houmetro-purpleair-pm25adj-unadj-comparison.csv',index=False)

In [None]:
adj_compare[reorder_cols].head()

In [None]:
#looking at the two co-located EPA and PurpleAir sensors
pa_co = 166421
epa_co = 482010046

adj_compare.loc[adj_compare['site_id'].isin([pa_co,epa_co])][reorder_cols].head()

In [None]:
pa2_co = 99797
epa2_co = 482011035

adj_compare.loc[adj_compare['site_id'].isin([pa2_co,epa2_co])][reorder_cols].head()

## Just a couple of integrity checks

In [None]:
combo_pm25adj.head()

In [None]:
pm25unadj_site_summary.loc[pm25unadj_site_summary['max_read'].isna()]