In [1]:
import pandas as pd

## Story-specific analysis
As we get closer to publishing the first PM2.5 story package we need to clarify a few things, look deeper into others and run a few data updates that are outside of the wider scope of this analysis. We'll do that here.

In [2]:
#following the steps and procedures from 01_analyze-epa-pm25.ipynb to load the data:
#one CSV export per year, stacked into a single frame
base_path = '../data/source/EPA/'
file_suffix = '_houmetro_epa_ad_viz_plotval_data'

years = range(2018, 2026)
pm25_dfs = []
for year in years:
    csv_path = f'{base_path}{year}{file_suffix}.csv'
    df = pd.read_csv(csv_path)
    #parse the text `Date` column once, then derive the calendar year from it
    parsed_dates = pd.to_datetime(df['Date'])
    df['datetime'] = parsed_dates
    df['year'] = parsed_dates.dt.year
    pm25_dfs.append(df)

#each yearly frame keeps its own row index, so index values repeat in `pm25`
pm25 = pd.concat(pm25_dfs)

In [3]:
#Remove days known to have high pollution not from industry: NYE, 4th of July
holidays = ['2018-01-01','2018-07-04','2018-12-31',
            '2019-01-01','2019-07-04','2019-12-31',
            '2020-01-01','2020-07-04','2020-12-31',
            '2021-01-01','2021-07-04','2021-12-31',
            '2022-01-01','2022-07-04','2022-12-31',
            '2023-01-01','2023-07-04','2023-12-31',
            '2024-01-01','2024-07-04','2024-12-31',
            '2025-01-01','2025-07-04','2025-12-31']

#Cast the holiday strings to datetimes before matching. Letting `isin` coerce
#strings against a datetime64 column is deprecated in pandas; once that coercion
#is removed the strings would no longer match and the holidays would silently
#stay in the data.
holiday_dates = pd.to_datetime(holidays)
pm25 = pm25.loc[~pm25['datetime'].isin(holiday_dates)]

#Group by `Site ID` and `year` to get annual average `Daily Mean PM2.5 Concentration` for each site, for each year
by_site = pm25.groupby(['Site ID', 'year'])['Daily Mean PM2.5 Concentration'].mean().reset_index()

#Pivot to one row per `Site ID` with one column per year (column labels are the integer years)
by_yr = pd.pivot_table(by_site, index='Site ID', columns='year',
                       values='Daily Mean PM2.5 Concentration', aggfunc='mean').reset_index()

#3-year averages for the timespans 2018-20, 2019-21, 2020-22, 2021-23, 2022-24.
#skipna=False: a site missing any year of a window gets NaN rather than a partial average
for first_yr in range(2018, 2023):
    window_cols = [first_yr, first_yr + 1, first_yr + 2]
    window_name = f'avg{first_yr}_{(first_yr + 2) % 100}'
    by_yr[window_name] = by_yr[window_cols].mean(axis=1, skipna=False)

#calculate how many of the past years the site has been higher than current and past limits
def over_limit(x, limit):
    """Count the entries of ``x`` that are strictly greater than ``limit``.

    Missing values compare as not-greater, so they are never counted.
    """
    exceeds = x > limit
    return exceeds.sum()

def return_max_min(x,limit_type):
    """Format the largest or smallest value of ``x`` as ``'<Month name> <year>'``.

    Parameters
    ----------
    x : object exposing ``.max()`` / ``.min()`` whose result supports
        ``strftime`` (used in this notebook on the ``datetime`` column).
    limit_type : str
        ``'max'`` for the latest value, ``'min'`` for the earliest.

    Returns
    -------
    str
        The selected value formatted with ``'%B %Y'``, e.g. ``'May 2021'``.

    Raises
    ------
    ValueError
        If ``limit_type`` is neither ``'max'`` nor ``'min'``. Previously this
        fell through to calling ``strftime`` on a placeholder string and
        failed with a misleading ``AttributeError``.
    """
    if limit_type == 'max':
        measure = x.max()
    elif limit_type == 'min':
        measure = x.min()
    else:
        raise ValueError(f"limit_type must be 'max' or 'min', got {limit_type!r}")
    return measure.strftime('%B %Y')

#count, per site, how many annual averages are above the 12 and 9 limits
#(years with no data are NaN and never count as over)
over_cols = list(range(2018,2026))
by_yr['yrs_over12'] = by_yr[over_cols].gt(12).sum(axis=1)
by_yr['yrs_over9'] = by_yr[over_cols].gt(9).sum(axis=1)

#per-site daily stats across ALL years in `pm25` (the groupby is by site only, not by year):
#  over35    - number of daily means above 35
#  max_read  - highest daily mean;  min_read - lowest daily mean
#  min_date  - month/year of the site's earliest reading
#  max_date  - month/year of the site's latest reading
#  daily_cnt - number of non-null daily means
daily_col = 'Daily Mean PM2.5 Concentration'
daily_over35 = pm25.groupby('Site ID').agg(over35=(daily_col, lambda x: over_limit(x, 35)),
                                           max_read=(daily_col, 'max'),
                                           min_read=(daily_col, 'min'),
                                           min_date=('datetime',lambda x: return_max_min(x,'min')),
                                           max_date=('datetime',lambda x: return_max_min(x,'max')),
                                           daily_cnt=(daily_col,'count')
                                           ).reset_index()

pm25_site_summary = by_yr.merge(daily_over35, on='Site ID', how='left')

#lets also join with the sensor features we have
#NOTE(review): if a site ever appears with more than one county/lat/long combination,
#drop_duplicates keeps each combination and this merge would duplicate that site's row
site_info = pm25[['Site ID','County FIPS Code', 'County', 'Site Latitude', 'Site Longitude']].drop_duplicates()
pm25_site_summary = pm25_site_summary.merge(site_info, on='Site ID', how='left')
#the year columns are integers after the pivot; make every label a string
pm25_site_summary.columns = pm25_site_summary.columns.astype(str)

display(pm25_site_summary.columns)

#if the 2024 value is greater than 9, list "out" for compliance, else "in"
#NOTE(review): a site with no 2024 average (NaN) is also labelled "in" because NaN > 9 is False
pm25_site_summary['compliance_2024'] = pm25_site_summary['2024'].apply(lambda x: 'out' if x > 9 else 'in')

#rename to short snake_case column names, and prefix the annual averages with `avg_`
rename_cols = {'Site ID':'site_id','County FIPS Code':'cnty_fips','County':'county',
               'Site Latitude':'latitude','Site Longitude':'longitude'}
year_cols = {str(yr): f'avg_{yr}' for yr in over_cols}
pm25_site_summary = pm25_site_summary.rename(columns={**rename_cols, **year_cols})

#export for graphics and whatnot
pm25_site_summary['monitor_type'] = 'EPA'

reorder_cols = ['longitude','latitude','site_id', 'avg_2022', 'avg_2023', 'avg_2024', 'avg_2025',
                'avg2022_24', 'yrs_over12','yrs_over9', 'over35', 'max_read', 'min_read',
                'cnty_fips','county','monitor_type','compliance_2024','min_date',
                'max_date','daily_cnt']
#pm25_site_summary[reorder_cols].to_csv('../data/analyzed/houmetro-epa-pm25-site-summary-WITH2025.csv',index=False)

  pm25 = pm25.loc[~pm25['datetime'].isin(holidays)]


Index(['Site ID', '2018', '2019', '2020', '2021', '2022', '2023', '2024',
       '2025', 'avg2018_20', 'avg2019_21', 'avg2020_22', 'avg2021_23',
       'avg2022_24', 'yrs_over12', 'yrs_over9', 'over35', 'max_read',
       'min_read', 'min_date', 'max_date', 'daily_cnt', 'County FIPS Code',
       'County', 'Site Latitude', 'Site Longitude'],
      dtype='object')

In [4]:
#keep only the columns listed in `reorder_cols`, in that order, for easier reading
pm25_site_summary = pm25_site_summary.loc[:, reorder_cols]

In [5]:
#preview the first rows of the per-site summary
pm25_site_summary.head()

Unnamed: 0,longitude,latitude,site_id,avg_2022,avg_2023,avg_2024,avg_2025,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,cnty_fips,county,monitor_type,compliance_2024,min_date,max_date,daily_cnt
0,-95.354974,28.964394,480391012,,7.5125,7.948333,,,0,0,0,31.8,0.8,39,Brazoria,EPA,in,January 2023,December 2024,116
1,-94.861289,29.254474,481671034,7.995087,9.771338,7.031707,6.747619,8.266044,0,1,3,50.9,0.1,167,Galveston,EPA,in,January 2018,April 2025,2406
2,-95.326137,29.901036,482010024,10.186234,10.380679,10.857105,10.241905,10.474672,0,8,4,42.9,0.0,201,Harris,EPA,out,January 2018,April 2025,2805
3,-95.284096,29.828086,482010046,11.772159,13.032551,12.824465,11.057843,12.543058,3,5,3,44.7,-0.8,201,Harris,EPA,out,May 2021,April 2025,1350
4,-95.499219,29.695729,482010055,9.458482,10.837709,10.653353,9.286316,10.316515,0,4,2,40.7,2.6,201,Harris,EPA,out,April 2022,April 2025,1020


In [6]:
#Site ID used for the Settegast monitor (its Local Site Name in the EPA data is "Houston North Wayside")
settegast_id = 482010046

### "For two years in a row, it has collected the highest average readings of any official monitor in Texas"

In [10]:
#three sites with the highest 2022 annual average (sites with no 2022 value sort last)
pm25_site_summary.sort_values('avg_2022', ascending=False).head(3)

Unnamed: 0,longitude,latitude,site_id,avg_2022,avg_2023,avg_2024,avg_2025,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,cnty_fips,county,monitor_type,compliance_2024,min_date,max_date,daily_cnt
3,-95.284096,29.828086,482010046,11.772159,13.032551,12.824465,11.057843,12.543058,3,5,3,44.7,-0.8,201,Harris,EPA,out,May 2021,April 2025,1350
11,-95.38769,29.81453,482011052,11.252023,12.416519,11.906471,11.713,11.858338,1,8,5,44.2,0.9,201,Harris,EPA,out,January 2018,April 2025,1734
8,-95.257593,29.733726,482011035,10.622436,11.320558,10.740649,,10.894548,0,7,13,46.7,1.6,201,Harris,EPA,out,January 2018,December 2024,5255


In [9]:
#three sites with the highest 2023 annual average (sites with no 2023 value sort last)
pm25_site_summary.sort_values('avg_2023', ascending=False).head(3)

Unnamed: 0,longitude,latitude,site_id,avg_2022,avg_2023,avg_2024,avg_2025,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,cnty_fips,county,monitor_type,compliance_2024,min_date,max_date,daily_cnt
3,-95.284096,29.828086,482010046,11.772159,13.032551,12.824465,11.057843,12.543058,3,5,3,44.7,-0.8,201,Harris,EPA,out,May 2021,April 2025,1350
11,-95.38769,29.81453,482011052,11.252023,12.416519,11.906471,11.713,11.858338,1,8,5,44.2,0.9,201,Harris,EPA,out,January 2018,April 2025,1734
5,-95.031232,29.770698,482010058,10.496078,11.510029,10.396364,8.8375,10.800824,0,6,4,50.2,-0.1,201,Harris,EPA,out,January 2018,April 2025,2526


In [11]:
#three sites with the highest 2024 annual average (sites with no 2024 value sort last)
pm25_site_summary.sort_values('avg_2024', ascending=False).head(3)

Unnamed: 0,longitude,latitude,site_id,avg_2022,avg_2023,avg_2024,avg_2025,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,cnty_fips,county,monitor_type,compliance_2024,min_date,max_date,daily_cnt
3,-95.284096,29.828086,482010046,11.772159,13.032551,12.824465,11.057843,12.543058,3,5,3,44.7,-0.8,201,Harris,EPA,out,May 2021,April 2025,1350
12,-95.425128,30.350302,483390078,9.688252,10.187778,12.048451,10.349524,10.641494,1,5,5,39.7,-0.1,339,Montgomery,EPA,out,January 2018,April 2025,2468
11,-95.38769,29.81453,482011052,11.252023,12.416519,11.906471,11.713,11.858338,1,8,5,44.2,0.9,201,Harris,EPA,out,January 2018,April 2025,1734


In [12]:
#three sites with the highest 2025 average so far (sites with no 2025 value sort last)
pm25_site_summary.sort_values('avg_2025', ascending=False).head(3)

Unnamed: 0,longitude,latitude,site_id,avg_2022,avg_2023,avg_2024,avg_2025,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,cnty_fips,county,monitor_type,compliance_2024,min_date,max_date,daily_cnt
11,-95.38769,29.81453,482011052,11.252023,12.416519,11.906471,11.713,11.858338,1,8,5,44.2,0.9,201,Harris,EPA,out,January 2018,April 2025,1734
3,-95.284096,29.828086,482010046,11.772159,13.032551,12.824465,11.057843,12.543058,3,5,3,44.7,-0.8,201,Harris,EPA,out,May 2021,April 2025,1350
12,-95.425128,30.350302,483390078,9.688252,10.187778,12.048451,10.349524,10.641494,1,5,5,39.7,-0.1,339,Montgomery,EPA,out,January 2018,April 2025,2468


### "Two months from that date, a Houston Chronicle analysis of PM2.5 data show that the Settegast monitor will likely…"

According to the official schedule, I think they're looking at June-to-June windows, so let's also query Settegast that way to cross all the t's and dot the i's.

In [15]:
#keep only the daily readings recorded at the Settegast monitor
is_settegast = pm25['Site ID'] == settegast_id
settegast_df = pm25.loc[is_settegast]

In [31]:
#preview the first Settegast rows to confirm the filter picked up the right site
settegast_df.head()

Unnamed: 0,Date,Source,Site ID,POC,Daily Mean PM2.5 Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude,datetime,year
730,05/05/2021,AQS,482010046,1,8.3,ug/m3 LC,46,Houston North Wayside,1,100.0,...,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,201,Harris,29.828086,-95.284096,2021-05-05,2021
731,05/06/2021,AQS,482010046,1,13.6,ug/m3 LC,59,Houston North Wayside,1,100.0,...,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,201,Harris,29.828086,-95.284096,2021-05-06,2021
732,05/07/2021,AQS,482010046,1,16.1,ug/m3 LC,64,Houston North Wayside,1,100.0,...,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,201,Harris,29.828086,-95.284096,2021-05-07,2021
733,05/08/2021,AQS,482010046,1,8.8,ug/m3 LC,49,Houston North Wayside,1,100.0,...,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,201,Harris,29.828086,-95.284096,2021-05-08,2021
734,05/09/2021,AQS,482010046,1,10.5,ug/m3 LC,54,Houston North Wayside,1,100.0,...,26420,"Houston-The Woodlands-Sugar Land, TX",48,Texas,201,Harris,29.828086,-95.284096,2021-05-09,2021


In [29]:
#timeframes for TCEQ are June 1 to May 31 so let's look at averages for those
#dates over the years
def get_annual_tceq_avg(df,year,start_month=6):
    """Average daily PM2.5 over a 12-month window starting in ``year``.

    With the default ``start_month=6`` the window runs from June 1 of ``year``
    through May 31 of ``year + 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``datetime`` column and a
        ``Daily Mean PM2.5 Concentration`` column.
    year : int
        Calendar year in which the window starts.
    start_month : int, default 6
        Month (1-12) whose first day opens the window.

    Returns
    -------
    float
        Mean of ``Daily Mean PM2.5 Concentration`` inside the window; NaN when
        no rows fall in it. The function does not check that the window is
        fully covered by data, so a partial year is averaged as-is.
    """
    window_start = pd.Timestamp(year=year, month=start_month, day=1)
    window_end = pd.Timestamp(year=year + 1, month=start_month, day=1)
    #half-open window [start, end): avoids hard-coding the last day of the closing
    #month and keeps readings timestamped later than midnight on that final day
    in_window = (df['datetime'] >= window_start) & (df['datetime'] < window_end)
    return df.loc[in_window, 'Daily Mean PM2.5 Concentration'].mean()

In [30]:
#June-to-May averages for the windows starting in 2022, 2023 and 2024.
#The site summary shows Settegast's latest reading in April 2025, so the
#2024 window is only partially covered by data.
for tceq_year in (2022, 2023, 2024):
    print(get_annual_tceq_avg(settegast_df, tceq_year))

12.724137931034482
12.746504559270518
12.345762711864406
