In [1]:
import pandas as pd

# Lift pandas' display truncation so every row and every column is rendered
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

## Story-specific analysis
As we get closer to publishing the first PM2.5 story package we need to clarify a few things, look deeper into others and run a few data updates that are outside of the wider scope of this analysis. We'll do that here.

FIRST, I'm going to load our custom analysis of the raw, non-design-value readings straight from the EPA.

In [2]:
# Same loading steps and procedures as 01_analyze-epa-pm25.ipynb:
# one EPA export per year, stacked into a single frame of daily readings.
base_path = '../data/source/EPA/tx/'
file_suffix = '_tx_epa_ad_viz_plotval_data'

years = range(2018, 2026)
frames_by_year = {}
for year in years:
    # Site ID is read as text so it plays nicely with the design value tables
    df = pd.read_csv(f'{base_path}{year}{file_suffix}.csv', dtype={'Site ID': str})
    df = df.assign(datetime=pd.to_datetime(df['Date']),
                   year=lambda frame: frame['datetime'].dt.year)
    frames_by_year[year] = df

pm25_dfs = list(frames_by_year.values())

# Site ID -> CBSA crosswalk, taken from the 2025 file only
site_cbsa_2025 = frames_by_year[2025][['Site ID', 'CBSA Name']].drop_duplicates()

pm25 = pd.concat(pm25_dfs)

# Remove days known to have high pollution that is not from industry:
# New Year's Day, the Fourth of July and New Year's Eve of every loaded year.
holidays = [f'{holiday_year}-{month_day}'
            for holiday_year in years
            for month_day in ('01-01', '07-04', '12-31')]

# Compare datetimes with datetimes: matching a datetime64 column against plain
# strings in isin() is deprecated in pandas and is slated to stop matching.
holiday_dates = pd.to_datetime(holidays)
pm25 = pm25.loc[~pm25['datetime'].isin(holiday_dates)]

# Annual average of `Daily Mean PM2.5 Concentration` for each site, for each year
by_site = (
    pm25.groupby(['Site ID', 'year'])['Daily Mean PM2.5 Concentration']
        .mean()
        .reset_index()
)

# One row per site with a column per year, then rolling 3-year averages for
# 2018-20, 2019-21, 2020-22, 2021-23 and 2022-24.
by_yr = by_site.pivot_table(index='Site ID', columns='year',
                            values='Daily Mean PM2.5 Concentration',
                            aggfunc='mean').reset_index()
for first_yr in range(2018, 2023):
    window = [first_yr, first_yr + 1, first_yr + 2]
    # skipna=False: the 3-year average stays NaN unless all three years exist
    by_yr[f'avg{first_yr}_{(first_yr + 2) % 100}'] = by_yr[window].mean(axis=1, skipna=False)

#calculate how many of the past years the site has been higher than current and past limits
def over_limit(x, limit):
    """Count the entries of ``x`` that are strictly greater than ``limit``."""
    exceeds = x > limit
    return exceeds.sum()

def return_max_min(x, limit_type):
    """Format the latest or earliest timestamp in ``x`` as "Month YYYY".

    Parameters
    ----------
    x : Series of datetimes
        Values to search; ``x.max()`` / ``x.min()`` must return an object
        with a ``strftime`` method.
    limit_type : str
        ``'max'`` for the latest timestamp, ``'min'`` for the earliest.

    Returns
    -------
    str
        The selected timestamp formatted with ``'%B %Y'`` (e.g. "May 2021").

    Raises
    ------
    ValueError
        If ``limit_type`` is neither ``'max'`` nor ``'min'``.
    """
    if limit_type == 'max':
        measure = x.max()
    elif limit_type == 'min':
        measure = x.min()
    else:
        # Fail loudly instead of calling strftime on a placeholder string
        raise ValueError(f"limit_type must be 'max' or 'min', got {limit_type!r}")
    return measure.strftime('%B %Y')

# For every site, count how many annual averages (2018-2025) were above the
# former (12) and the current (9) annual limits.
over_cols = list(range(2018,2026))
by_yr['yrs_over12'] = by_yr[over_cols].apply(lambda x: over_limit(x,12), axis=1)
by_yr['yrs_over9'] = by_yr[over_cols].apply(lambda x: over_limit(x,9), axis=1)

# Per-site daily statistics across the whole loaded period (not per year):
# days with a daily mean above 35, highest/lowest reading, first/last month
# with data and the number of daily readings.
# The aggregation is computed once; an earlier duplicate that was immediately
# overwritten has been removed.
daily_col = 'Daily Mean PM2.5 Concentration'
daily_over35 = pm25.groupby('Site ID').agg(
    over35=(daily_col, lambda x: over_limit(x, 35)),
    max_read=(daily_col, 'max'),
    min_read=(daily_col, 'min'),
    min_date=('datetime', lambda x: return_max_min(x, 'min')),
    max_date=('datetime', lambda x: return_max_min(x, 'max')),
    daily_cnt=(daily_col, 'count'),
).reset_index()

# Attach the per-site daily statistics to the yearly table
pm25_site_summary = by_yr.merge(daily_over35, on='Site ID', how='left')

# Also join the sensor features we have for each site
site_info = pm25[['Site ID','County FIPS Code', 'County', 'Site Latitude', 'Site Longitude']].drop_duplicates()
pm25_site_summary = pm25_site_summary.merge(site_info, on='Site ID', how='left')
# Year columns come out of the pivot as integers; make every label a string
pm25_site_summary.columns = pm25_site_summary.columns.astype(str)

# 'out' of compliance when the 2024 value is greater than 9, otherwise 'in'
# NOTE(review): a site with no 2024 average (NaN) is also labelled 'in' - confirm that is intended
pm25_site_summary['compliance_2024'] = (
    pm25_site_summary['2024'].gt(9).map({True: 'out', False: 'in'})
)

# Tidier column names, plus an avg_ prefix on the yearly averages
rename_cols = {'Site ID':'site_id','County FIPS Code':'cnty_fips','County':'county',
               'Site Latitude':'latitude','Site Longitude':'longitude'}
year_renames = {str(yr): f'avg_{yr}' for yr in range(2018, 2026)}
pm25_site_summary = pm25_site_summary.rename(columns={**rename_cols, **year_renames})

# Label the source so the rows can be told apart from other monitor types in exports
pm25_site_summary['monitor_type'] = 'EPA'

# Keep and order only the columns needed downstream, for easier reading
year_avg_cols = [f'avg_{yr}' for yr in range(2018, 2026)]
reorder_cols = ['longitude','latitude','site_id',
                *year_avg_cols,
                'avg2022_24', 'yrs_over12','yrs_over9', 'over35', 'max_read', 'min_read',
                'cnty_fips','county','monitor_type','compliance_2024','min_date',
                'max_date','daily_cnt']
pm25_site_summary = pm25_site_summary[reorder_cols]

# Export if we need
#pm25_site_summary[reorder_cols].to_csv('../data/analyzed/houmetro-epa-pm25-site-summary-WITH2025.csv',index=False)

# Annual ranks: rank 1 is the highest annual average of that year
for rank_year in range(2022, 2026):
    pm25_site_summary[f'{rank_year}_rank'] = pm25_site_summary[f'avg_{rank_year}'].rank(ascending=False)

settegast_id = '482010046'

  pm25 = pm25.loc[~pm25['datetime'].isin(holidays)]


NEXT, I'm going to load the design values that have been approved or whatever by EPA and TCEQ... I think. This is what we'll use for most of the stories I think.

In [3]:
pm25_dv_path = '../data/source/EPA/pm25_designvalues_2022_2024_final_05_28_25.xlsx'

# List the sheets in the design value workbook. The context manager closes the
# workbook handle as soon as the sheet names have been read.
with pd.ExcelFile(pm25_dv_path) as pm25_dv_file:
    sheet_names = pm25_dv_file.sheet_names

for sheet_name in sheet_names:
    print(sheet_name)

Table1a. NAA Status Annual 2012
Table1b. NAA Status 24-hr 2006
Table1c. NAA Status Annual 1997
Table2a. Other Violators Annual
Table2b. Other Violators 24-hr
Table3a. NAA Trends Annual 2012
Table3b. NAA Trends 24-hr 2006
Table3c. NAA Trends Annual 1997
Table4a. County Status Annual
Table4b. County Status 24-hr
Table5a. Site Status Annual
Table5b. Site Status 24-hr
Table6a. Site Trends Annual
Table6b. Site Trends 24-hr
Appendix. Monitor Information


In [4]:
# Identifier columns are read as text so leading zeros and joins are preserved
dtypes = {'State FIPS':str,'County FIPS':str,'AQS Site ID':str}

####COUNTY STUFF######
# Read in the county-level annual design values
county_dv = pd.read_excel(pm25_dv_path, sheet_name='Table4a. County Status Annual',
                          skiprows=3,skipfooter=7, dtype=dtypes)

# Rename the unwieldy design value column
rename_cols = {'2022-2024 Annual   Design Value (µg/m3) [1,2]':'dv_2022_24',}
county_dv = county_dv.rename(columns=rename_cols)

# Subset for TX. .copy() gives an independent frame, so adding the rank column
# below no longer raises SettingWithCopyWarning.
tx_county_dv = county_dv[county_dv['State Name'] == 'Texas'].copy()

# Create ranking: rank 1 is the highest design value
tx_county_dv['rank_2022_24'] = tx_county_dv['dv_2022_24'].rank(ascending=False)


####SITE STUFF######
# Read in the site-level annual design value trends
site_dv = pd.read_excel(pm25_dv_path, sheet_name='Table6a. Site Trends Annual',
                        skiprows=3,skipfooter=7, dtype=dtypes)

# Rename the unwieldy design value columns
rename_cols = {'2013-2015     Annual      Design Value (µg/m3) [1,2]':'dv_2013_15',
                '2014-2016     Annual      Design Value (µg/m3) [1,2]':'dv_2014_16',
                '2015-2017     Annual      Design Value (µg/m3) [1,2]':'dv_2015_17',
                '2016-2018     Annual      Design Value (µg/m3) [1,2]':'dv_2016_18',
                '2017-2019     Annual      Design Value (µg/m3) [1,2]':'dv_2017_19',
                '2018-2020     Annual      Design Value (µg/m3) [1,2]':'dv_2018_20',
                '2019-2021     Annual      Design Value (µg/m3) [1,2]':'dv_2019_21',
                '2020-2022     Annual      Design Value (µg/m3) [1,2]':'dv_2020_22',
                '2021-2023     Annual      Design Value (µg/m3) [1,2]':'dv_2021_23',
                '2022-2024     Annual      Design Value (µg/m3) [1,2]':'dv_2022_24'}
site_dv = site_dv.rename(columns=rename_cols)

# Subset for TX (independent copy, see the county subset)
tx_site_dv = site_dv[site_dv['State Name'] == 'Texas'].copy()

# Create rankings for the five most recent 3-year windows, newest first so the
# rank columns keep their original order
for window in ('2022_24', '2021_23', '2020_22', '2019_21', '2018_20'):
    tx_site_dv[f'rank_{window}'] = tx_site_dv[f'dv_{window}'].rank(ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tx_county_dv['rank_2022_24'] = tx_county_dv['dv_2022_24'].rank(ascending=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tx_site_dv['rank_2022_24'] = tx_site_dv['dv_2022_24'].rank(ascending=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tx_site_dv['rank_2021_23'] = tx_site_dv['dv_2021_

Ok, lastly we're bringing in the PurpleAir data we've analyzed and processed.

In [20]:
# Load the already-processed PurpleAir PM2.5 site summary (one row per PurpleAir sensor)
pa_pm25 = pd.read_csv('../data/analyzed/houmetro-purpleair-pm25adj-site-summary.csv')

## Fact-check
Here are some data sentences we're fact checking!

--------------------------------
FACT: "For three years in a row, the TCEQ’s air monitor near Hogue’s home has collected some of the highest average readings for particulate matter of any official monitor in Texas, according to an analysis of pollution data by the Houston Chronicle."

LOGIC: Since this is talking about annual avg readings by site across 3 discrete years, we'll use the raw PM2.5 readings from EPA, not the design value stuff that comes pre-processed.

OUTCOME: TRUE 🎉

In [5]:
# Raw-data summary row for the Settegast monitor
is_settegast = pm25_site_summary['site_id'].eq(settegast_id)
pm25_site_summary[is_settegast]

Unnamed: 0,longitude,latitude,site_id,avg_2018,avg_2019,avg_2020,avg_2021,avg_2022,avg_2023,avg_2024,avg_2025,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,cnty_fips,county,monitor_type,compliance_2024,min_date,max_date,daily_cnt,2022_rank,2023_rank,2024_rank,2025_rank
29,-95.284096,29.828086,482010046,,,,12.652193,11.772159,13.032551,12.824465,12.224464,12.543058,4,5,3,44.7,-1.5,201,Harris,EPA,out,May 2021,September 2025,1481,2.0,2.0,4.0,5.0


In [6]:
# Certified design values (and statewide ranks) for the Settegast monitor
settegast_dv_mask = tx_site_dv['AQS Site ID'].eq(str(settegast_id))
tx_site_dv[settegast_dv_mask]

Unnamed: 0,State Name,County Name,CBSA Name,CSA Name,2012 Annual PM2.5 NAAQS Designated Area,1997 Annual PM2.5 NAAQS Designated Area,EPA Region,AQS Site ID,Local Site Name,Street Address,Site Latitude,Site Longitude,dv_2013_15,dv_2014_16,dv_2015_17,dv_2016_18,dv_2017_19,dv_2018_20,dv_2019_21,dv_2020_22,dv_2021_23,dv_2022_24,rank_2022_24,rank_2021_23,rank_2020_22,rank_2019_21,rank_2018_20
929,Texas,Harris,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,482010046,Houston North Wayside,7330 1/2 North Wayside,29.828086,-95.284096,,,,,,,,,12.5,12.7,2.0,1.0,,,


-------------------------
FACT: "Meanwhile, the Chronicle’s analysis showed people are living and working in an area where air quality has grown worse since the monitor was installed."

LOGIC: If we look only at the annual values, the monitor actually goes up and down each year. But if we look at the design values that average things out over 3 years, the Settegast monitor goes from 12.5 in 2021-23 to 12.7 in 2022-24, so I think this is defensible.

OUTCOME: TRUE 🎉

In [7]:
# Design value trend for the Settegast monitor: compare dv_2021_23 with dv_2022_24
settegast_dv_mask = tx_site_dv['AQS Site ID'].eq(str(settegast_id))
tx_site_dv[settegast_dv_mask]

Unnamed: 0,State Name,County Name,CBSA Name,CSA Name,2012 Annual PM2.5 NAAQS Designated Area,1997 Annual PM2.5 NAAQS Designated Area,EPA Region,AQS Site ID,Local Site Name,Street Address,Site Latitude,Site Longitude,dv_2013_15,dv_2014_16,dv_2015_17,dv_2016_18,dv_2017_19,dv_2018_20,dv_2019_21,dv_2020_22,dv_2021_23,dv_2022_24,rank_2022_24,rank_2021_23,rank_2020_22,rank_2019_21,rank_2018_20
929,Texas,Harris,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,482010046,Houston North Wayside,7330 1/2 North Wayside,29.828086,-95.284096,,,,,,,,,12.5,12.7,2.0,1.0,,,


-------------------------
FACT: "The EPA’s new rule tightened annual average pollution limits. By the numbers, that change would have thrown many existing monitors across Texas into the red."

LOGIC: We can use the raw stuff from EPA here and count, for each year, whether more sites are over the 9 limit than over the 12 limit.

OUTCOME: TRUE 🎉

In [8]:
# For each year, count the sites whose annual average is over the former (12)
# and the current (9) limit. "Over" is a strict comparison, matching
# over_limit() and the compliance_2024 flag; a value exactly on the limit is
# not counted as over it.
for year in range(2018, 2026):
    yearly_avg = pm25_site_summary[f'avg_{year}']
    over9 = int((yearly_avg > 9).sum())
    over12 = int((yearly_avg > 12).sum())
    print(year)
    print('Sites over 12 / 9 µg/m3')
    print(over12, '/', over9)

2018
Sites over 12 / 9 µg/m3
0 / 18
2019
Sites over 12 / 9 µg/m3
0 / 18
2020
Sites over 12 / 9 µg/m3
1 / 18
2021
Sites over 12 / 9 µg/m3
1 / 23
2022
Sites over 12 / 9 µg/m3
0 / 21
2023
Sites over 12 / 9 µg/m3
3 / 33
2024
Sites over 12 / 9 µg/m3
6 / 39
2025
Sites over 12 / 9 µg/m3
6 / 31


-------------------------
FACT: "A Houston Chronicle analysis of PM2.5 data first certified by the EPA this spring shows the Settegast monitor has failed to meet federal health standards. Its annual average exceeds the new EPA standard of 9 micrograms per cubic meter of air, and even surpasses the former, uncontested standard of 12 micrograms every year."

LOGIC: We need to use the certified design value data here.

OUTCOME: TRUE 🎉

In [9]:
# Certified design values for the Settegast monitor, to compare against the 9 and 12 limits
settegast_dv_mask = tx_site_dv['AQS Site ID'].eq(str(settegast_id))
tx_site_dv[settegast_dv_mask]

Unnamed: 0,State Name,County Name,CBSA Name,CSA Name,2012 Annual PM2.5 NAAQS Designated Area,1997 Annual PM2.5 NAAQS Designated Area,EPA Region,AQS Site ID,Local Site Name,Street Address,Site Latitude,Site Longitude,dv_2013_15,dv_2014_16,dv_2015_17,dv_2016_18,dv_2017_19,dv_2018_20,dv_2019_21,dv_2020_22,dv_2021_23,dv_2022_24,rank_2022_24,rank_2021_23,rank_2020_22,rank_2019_21,rank_2018_20
929,Texas,Harris,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,482010046,Houston North Wayside,7330 1/2 North Wayside,29.828086,-95.284096,,,,,,,,,12.5,12.7,2.0,1.0,,,


-------------------------
FACT: "Ten of 12 official measurement stations near homes in the industry-packed Greater Houston area are poised to break the EPA’s contested new health standard, an annual average of 9 micrograms per cubic meter. Only the Wayside monitor seems likely to break the previous limit of 12."

LOGIC: Since we're using the "poised" language I think we wanna look at raw data for 2025? But let's just look at both the design values stuff for Houston area and the raw stuff.

OUTCOME: ISSUES. Suggesting edit.

In [10]:
hou_label = 'Houston-The Woodlands-Sugar Land, TX'

# Houston-area rows of the certified design values
hou_dv = tx_site_dv[tx_site_dv['CBSA Name'].eq(hou_label)]

# Tag every raw-data site with the CBSA listed in the 2025 file, then keep Houston
cbsa_by_site = site_cbsa_2025.set_index('Site ID')['CBSA Name']
pm25_site_summary['metro_2025'] = pm25_site_summary['site_id'].map(cbsa_by_site)
hou_raw = pm25_site_summary[pm25_site_summary['metro_2025'].eq(hou_label)]

In [15]:
# The raw data and the design values list different numbers of Houston sites;
# report which site IDs appear in only one of the two sources.
hou_dv_sites = hou_dv['AQS Site ID'].values
hou_raw_sites = hou_raw['site_id'].values

dv_lookup = set(hou_dv_sites)
raw_lookup = set(hou_raw_sites)

for site in hou_raw_sites:
    if site in dv_lookup:
        continue
    print(f"Site {site} is missing from design values.")

for site in hou_dv_sites:
    if site in raw_lookup:
        continue
    print(f"Site {site} is missing from raw data.")

Site 480391012 is missing from design values.


In [16]:
# Raw-data row for the site that is missing from the design values
missing_site_mask = hou_raw['site_id'].eq('480391012')
hou_raw[missing_site_mask]

Unnamed: 0,longitude,latitude,site_id,avg_2018,avg_2019,avg_2020,avg_2021,avg_2022,avg_2023,avg_2024,avg_2025,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,cnty_fips,county,monitor_type,compliance_2024,min_date,max_date,daily_cnt,2022_rank,2023_rank,2024_rank,2025_rank,metro_2025
9,-95.354974,28.964394,480391012,,,,,,7.5125,7.948333,7.669231,,0,0,0,31.8,0.8,39,Brazoria,EPA,in,January 2023,March 2025,129,,48.0,46.0,51.0,"Houston-The Woodlands-Sugar Land, TX"


In [17]:
# Raw-data row for site 481671034
site_mask = hou_raw['site_id'].eq('481671034')
hou_raw[site_mask]

Unnamed: 0,longitude,latitude,site_id,avg_2018,avg_2019,avg_2020,avg_2021,avg_2022,avg_2023,avg_2024,avg_2025,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,cnty_fips,county,monitor_type,compliance_2024,min_date,max_date,daily_cnt,2022_rank,2023_rank,2024_rank,2025_rank,metro_2025
27,-94.861289,29.254474,481671034,4.617549,7.519178,8.345625,7.190643,7.995087,9.771338,7.031707,7.990795,8.266044,0,1,3,50.9,0.1,167,Galveston,EPA,in,January 2018,September 2025,2540,39.0,23.0,52.0,45.0,"Houston-The Woodlands-Sugar Land, TX"


In [11]:
# Number of Houston-area sites in the raw data, then the sites ordered by 2024 average (highest first)
hou_raw_by_2024 = hou_raw.sort_values(by='avg_2024', ascending=False)
print(hou_raw.shape[0])
display(hou_raw_by_2024)

13


Unnamed: 0,longitude,latitude,site_id,avg_2018,avg_2019,avg_2020,avg_2021,avg_2022,avg_2023,avg_2024,avg_2025,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,cnty_fips,county,monitor_type,compliance_2024,min_date,max_date,daily_cnt,2022_rank,2023_rank,2024_rank,2025_rank,metro_2025
29,-95.284096,29.828086,482010046,,,,12.652193,11.772159,13.032551,12.824465,12.224464,12.543058,4,5,3,44.7,-1.5,201,Harris,EPA,out,May 2021,September 2025,1481,2.0,2.0,4.0,5.0,"Houston-The Woodlands-Sugar Land, TX"
48,-95.425128,30.350302,483390078,8.268023,7.282746,8.513056,9.082635,9.688252,10.187778,12.048451,11.067672,10.641494,1,5,5,39.7,-0.1,339,Montgomery,EPA,out,January 2018,September 2025,2595,15.0,17.0,5.0,9.0,"Houston-The Woodlands-Sugar Land, TX"
37,-95.38769,29.81453,482011052,9.689655,10.309009,11.242735,11.417736,11.252023,12.416519,11.906471,12.210638,11.858338,2,8,5,44.2,0.9,201,Harris,EPA,out,January 2018,September 2025,1869,3.0,3.0,7.0,6.0,"Houston-The Woodlands-Sugar Land, TX"
28,-95.326137,29.901036,482010024,9.083733,9.241451,9.830769,9.937958,10.186234,10.380679,10.857105,10.266529,10.474672,0,8,4,42.9,0.0,201,Harris,EPA,out,January 2018,September 2025,2942,8.0,14.0,10.0,16.0,"Houston-The Woodlands-Sugar Land, TX"
34,-95.257593,29.733726,482011035,10.067232,10.083812,10.171237,10.654712,10.622436,11.320558,10.740649,11.276667,10.894548,0,8,14,46.7,1.6,201,Harris,EPA,out,January 2018,September 2025,5495,5.0,6.0,12.0,8.0,"Houston-The Woodlands-Sugar Land, TX"
30,-95.499219,29.695729,482010055,,,,,9.458482,10.837709,10.653353,9.454222,10.316515,0,4,2,40.7,2.6,201,Harris,EPA,out,April 2022,September 2025,1150,18.0,7.0,15.0,24.0,"Houston-The Woodlands-Sugar Land, TX"
31,-95.031232,29.770698,482010058,9.15132,8.65641,9.997429,9.63267,10.496078,11.510029,10.396364,10.021586,10.800824,0,7,4,50.2,-0.1,201,Harris,EPA,out,January 2018,September 2025,2657,6.0,5.0,16.0,17.0,"Houston-The Woodlands-Sugar Land, TX"
33,-95.220582,29.767997,482011034,10.962757,9.626496,9.945665,10.528783,9.786167,10.613636,10.171856,10.583117,10.190553,0,8,6,47.4,0.0,201,Harris,EPA,out,January 2018,September 2025,2639,14.0,9.0,21.0,13.0,"Houston-The Woodlands-Sugar Land, TX"
35,-95.128508,29.670025,482011039,8.247229,7.469241,8.771526,8.818121,8.709417,9.787113,9.569102,7.928571,9.355211,0,2,8,47.7,-0.8,201,Harris,EPA,out,January 2018,April 2025,4072,25.0,22.0,32.0,47.0,"Houston-The Woodlands-Sugar Land, TX"
32,-95.635833,29.723333,482010066,,,,8.220229,8.035484,9.416082,9.060119,9.409205,8.837228,0,3,3,43.6,-0.3,201,Harris,EPA,out,January 2021,September 2025,1520,38.0,27.0,39.0,26.0,"Houston-The Woodlands-Sugar Land, TX"


In [12]:
# Number of Houston-area sites with design values, then the sites ordered by 2022-24 design value (highest first)
hou_dv_by_latest = hou_dv.sort_values(by='dv_2022_24', ascending=False)
print(hou_dv.shape[0])
display(hou_dv_by_latest)

12


Unnamed: 0,State Name,County Name,CBSA Name,CSA Name,2012 Annual PM2.5 NAAQS Designated Area,1997 Annual PM2.5 NAAQS Designated Area,EPA Region,AQS Site ID,Local Site Name,Street Address,Site Latitude,Site Longitude,dv_2013_15,dv_2014_16,dv_2015_17,dv_2016_18,dv_2017_19,dv_2018_20,dv_2019_21,dv_2020_22,dv_2021_23,dv_2022_24,rank_2022_24,rank_2021_23,rank_2020_22,rank_2019_21,rank_2018_20
929,Texas,Harris,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,482010046,Houston North Wayside,7330 1/2 North Wayside,29.828086,-95.284096,,,,,,,,,12.5,12.7,2.0,1.0,,,
937,Texas,Harris,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,482011052,Houston North Loop,822 North Loop,29.81453,-95.38769,,,10.8,9.9,9.9,10.6,11.1,11.4,11.7,11.9,3.0,2.0,1.0,1.0,2.0
934,Texas,Harris,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,482011035,Clinton,9525 1/2 Clinton Dr,29.733726,-95.257593,11.6,11.2,10.7,10.2,10.3,10.4,10.4,10.4,10.7,10.8,4.0,4.0,3.0,5.0,3.0
946,Texas,Montgomery,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,483390078,Conroe Relocated,9472a Hwy 1484,30.350302,-95.425128,,,,,,,,,,10.7,5.0,,,,
931,Texas,Harris,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,482010058,Baytown,7210 1/2 Bayway Drive,29.770698,-95.031232,9.9,9.7,9.4,9.3,9.2,9.3,9.5,10.1,10.5,10.6,6.0,5.0,7.5,12.5,12.5
928,Texas,Harris,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,482010024,Houston Aldine,4510 1/2 Aldine Mail Rd,29.901036,-95.326137,11.1,10.8,10.0,9.4,9.4,9.8,9.9,10.0,10.2,10.5,7.0,8.0,10.5,8.0,9.0
930,Texas,Harris,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,482010055,Houston Bayland Park,6400 Bissonnet Street,29.695729,-95.499219,,,,,,,,,,10.4,8.0,,,,
933,Texas,Harris,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,482011034,Houston East,1262 1/2 Mae Drive,29.767997,-95.220582,,,,,10.5,10.3,10.2,10.2,10.4,10.3,9.5,6.0,5.0,7.0,4.0
935,Texas,Harris,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,482011039,Houston Deer Park #2,4514 1/2 Durant Street,29.670025,-95.128508,9.6,9.2,8.6,8.2,8.0,8.4,8.4,8.7,8.9,9.2,24.5,20.5,20.0,22.5,24.0
932,Texas,Harris,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,482010066,Houston Westhollow,3333 1/2 Hwy 6 South,29.723333,-95.635833,,,,,,,,,,8.9,30.5,,,,


-------------------------
FACT: "Despite the county’s sharp population growth, Fort Bend has no official air pollution monitors set up by the Texas Commission on Environmental Quality."

LOGIC: I know the county has had sharp population growth. As for the claim of no official monitors, let's just check out the counties in the EPA raw data.

OUTCOME: TRUE 🎉

In [19]:
# Houston-area regulatory sites per county (rows with a missing county kept as their own group)
sites_by_county = hou_raw.groupby('county', dropna=False)
sites_by_county.size()

county
Brazoria       1
Galveston      1
Harris        10
Montgomery     1
dtype: int64

-------------------------
FACT: "But a new Houston Chronicle analysis of air quality data from PurpleAir – a company that provides lower-cost, community-installed air monitors – shows worrisome pollution levels in Fort Bend County near where Wallace lives."

LOGIC: Identify and check the PA monitor readings.

OUTCOME: So... all of these Fort Bend ones are sort of in the area of W.A. Parish Generating Station. One of them showed increasing levels before readings stopped. And two of them are above the new levels. I think that fits "worrisome".

In [23]:
# PurpleAir sensors located in Fort Bend County
in_fort_bend = pa_pm25['cnty_nm'].eq('Fort Bend')
pa_pm25[in_fort_bend]

Unnamed: 0,site_id,avg_2022,avg_2023,avg_2024,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,min_date,max_date,daily_cnt,cnty_fips,cnty_nm,latitude,longitude,location_type,sensor_created_date,county,compliance_2024,monitor_type
8,30593,6.555175,7.673677,,,0.0,0.0,2.0,39.326667,0.906667,January 2022,July 2024,895.0,48157.0,Fort Bend,29.557081,-95.74033,0.0,2019-04-17 15:02:37,Fort Bend,in,PurpleAir
31,161015,,,9.576921,,0.0,1.0,3.0,46.466667,1.53,May 2024,December 2024,223.0,48157.0,Fort Bend,29.574024,-95.52899,0.0,2022-08-31 21:59:09,Fort Bend,out,PurpleAir
32,161141,,,8.179703,,0.0,0.0,1.0,49.866667,1.246667,December 2023,December 2024,344.0,48157.0,Fort Bend,29.55004,-95.68345,0.0,2022-09-01 19:38:55,Fort Bend,in,PurpleAir
33,161159,,,9.679103,,0.0,1.0,3.0,51.056667,1.133333,November 2023,December 2024,327.0,48157.0,Fort Bend,29.57438,-95.584496,0.0,2022-09-01 19:39:10,Fort Bend,out,PurpleAir


-------------------------
FACT: "Within the same boundaries, more than 50 PurpleAir monitors take similar measurements, filling in gaps in communities like Wallace’s that lack the costly regulatory versions."

LOGIC: Count PurpleAir monitors

OUTCOME: TRUE 🎉

In [None]:
# Number of PurpleAir sensors in the summary (one row per sensor)
pa_pm25.shape[0]

54

-------------------------
FACT: "Fort Bend is not alone. Four of the five Houston-area counties with the largest recent uptick in population – Fort Bend, Chambers, Waller and Liberty Counties – feature no regulatory air monitors at all."

LOGIC: We're cool on the population change. But let's check the raw data from EPA again to confirm these counties don't have reg monitors.

OUTCOME: TRUE 🎉

In [None]:
# Houston-area regulatory sites per county; counties absent from this list have no site in the raw data
sites_by_county = hou_raw.groupby('county', dropna=False)
sites_by_county.size()

county
Brazoria       1
Galveston      1
Harris        10
Montgomery     1
dtype: int64

-------------------------
FACT: "The fifth, Montgomery County, has only one regulatory monitor that the state is not currently using since it did not collect enough data in 2023."

LOGIC: Check the raw EPA data again to see what's up with Montgomery.

OUTCOME: FALSE!!! Suggested change in the text

In [25]:
# Raw-data rows for Montgomery County sites
in_montgomery = hou_raw['county'].eq('Montgomery')
hou_raw[in_montgomery]

Unnamed: 0,longitude,latitude,site_id,avg_2018,avg_2019,avg_2020,avg_2021,avg_2022,avg_2023,avg_2024,avg_2025,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,cnty_fips,county,monitor_type,compliance_2024,min_date,max_date,daily_cnt,2022_rank,2023_rank,2024_rank,2025_rank,metro_2025
48,-95.425128,30.350302,483390078,8.268023,7.282746,8.513056,9.082635,9.688252,10.187778,12.048451,11.067672,10.641494,1,5,5,39.7,-0.1,339,Montgomery,EPA,out,January 2018,September 2025,2595,15.0,17.0,5.0,9.0,"Houston-The Woodlands-Sugar Land, TX"


In [27]:
# Design value rows for Montgomery County sites
in_montgomery_dv = hou_dv['County Name'].eq('Montgomery')
hou_dv[in_montgomery_dv]

Unnamed: 0,State Name,County Name,CBSA Name,CSA Name,2012 Annual PM2.5 NAAQS Designated Area,1997 Annual PM2.5 NAAQS Designated Area,EPA Region,AQS Site ID,Local Site Name,Street Address,Site Latitude,Site Longitude,dv_2013_15,dv_2014_16,dv_2015_17,dv_2016_18,dv_2017_19,dv_2018_20,dv_2019_21,dv_2020_22,dv_2021_23,dv_2022_24,rank_2022_24,rank_2021_23,rank_2020_22,rank_2019_21,rank_2018_20
946,Texas,Montgomery,"Houston-The Woodlands-Sugar Land, TX","Houston-The Woodlands, TX",,,6,483390078,Conroe Relocated,9472a Hwy 1484,30.350302,-95.425128,,,,,,,,,,10.7,5.0,,,,


-------------------------
FACT: "In its most recent air monitoring network plan, TCEQ authors said the agency already has 71 monitors across the state that measure fine particulate pollution..."

LOGIC: Ok I don't have the plan she's referencing but I do have the statewide EPA data that should mirror what TCEQ has across the state.

OUTCOME: FALSE!!! Trying to work out what to do here cause our data suggest TX only has like 50-63 monitors across the state not 71.

In [28]:
# Number of Texas rows in the site-level design value table
tx_site_dv.shape[0]

50

In [31]:
# Number of distinct Texas site IDs in the design value table
tx_site_dv['AQS Site ID'].nunique(dropna=False)

50

In [33]:
# Number of distinct site IDs in the statewide raw-data summary
pm25_site_summary['site_id'].nunique(dropna=False)

63

-------------------------
FACT: "Texas has 254 counties, the most of any American state. Only 12% monitor PM2.5."

LOGIC: Count the distinct counties that appear in the statewide EPA raw data and divide by Texas' 254 counties.

OUTCOME: FALSE!!! Suggested change in the text

In [38]:
# Counties with at least one site in the raw data, and their share of Texas' 254 counties
n_counties_monitored = pm25_site_summary.groupby('county').ngroups
print(n_counties_monitored)
print(n_counties_monitored / 254)

33
0.12992125984251968


In [41]:
# Sites per county (largest first); the count lands in a column labelled 0
sites_per_county = pm25_site_summary.groupby('county').size().reset_index()
counties_with_monitors = sites_per_county.sort_values(0, ascending=False)

# Counties with any site, then counties with more than one site
print(counties_with_monitors.shape[0])
print((counties_with_monitors[0] > 1).sum())

33
11


-------------------------
FACT: "According to the Chronicle’s analysis of PM2.5 values from PurpleAir monitors in Fort Bend, two monitors near Parish Coal showed annual average readings above the new EPA air pollution standard."

LOGIC: Just check out the PurpleAir data.

OUTCOME: TRUE 🎉

In [None]:
# PurpleAir sensors in Fort Bend County; compare avg_2024 with the 9 µg/m3 standard
in_fort_bend = pa_pm25['cnty_nm'].eq('Fort Bend')
pa_pm25[in_fort_bend]

Unnamed: 0,site_id,avg_2022,avg_2023,avg_2024,avg2022_24,yrs_over12,yrs_over9,over35,max_read,min_read,min_date,max_date,daily_cnt,cnty_fips,cnty_nm,latitude,longitude,location_type,sensor_created_date,county,compliance_2024,monitor_type
8,30593,6.555175,7.673677,,,0.0,0.0,2.0,39.326667,0.906667,January 2022,July 2024,895.0,48157.0,Fort Bend,29.557081,-95.74033,0.0,2019-04-17 15:02:37,Fort Bend,in,PurpleAir
31,161015,,,9.576921,,0.0,1.0,3.0,46.466667,1.53,May 2024,December 2024,223.0,48157.0,Fort Bend,29.574024,-95.52899,0.0,2022-08-31 21:59:09,Fort Bend,out,PurpleAir
32,161141,,,8.179703,,0.0,0.0,1.0,49.866667,1.246667,December 2023,December 2024,344.0,48157.0,Fort Bend,29.55004,-95.68345,0.0,2022-09-01 19:38:55,Fort Bend,in,PurpleAir
33,161159,,,9.679103,,0.0,1.0,3.0,51.056667,1.133333,November 2023,December 2024,327.0,48157.0,Fort Bend,29.57438,-95.584496,0.0,2022-09-01 19:39:10,Fort Bend,out,PurpleAir


### SCROLLY CHECKS
-------------------------
FACT: "The majority of Texas' 254 counties don't have a regulatory monitor that reads PM2.5 levels in the air..."

LOGIC: https://drive.google.com/file/d/1ZqV5qLzn5uhv012B9VQWXP-Pu3cylnah/view

OUTCOME: TRUE 🎉

-------------------------
FACT: "In June 2024, TCEQ researchers told Houston communities that among the 30 counties that do have PM2.5 monitors, 18 had readings that would put them at or below the revised 9µg/m3 annual average health threshold for the latest 3-year period."

LOGIC: https://drive.google.com/file/d/1ZqV5qLzn5uhv012B9VQWXP-Pu3cylnah/view

OUTCOME: Will wanna change tense but otherwise true based on the report... "In June 2024, TCEQ researchers told Houston communities that among the 30 counties that had PM2.5 monitors, 18 had readings that would put them at or below the revised 9µg/m3 annual average health threshold for the latest 3-year period."

-------------------------
FACT: "TCEQ researchers also flagged 12 counties where data showed levels of PM2.5 above the new federal health limit, high enough to put them out of compliance unless the agency argued they faced individual situations that ruled them out. Scholars and air quality testers expected them to work towards improving many of the twelve."

LOGIC: https://drive.google.com/file/d/1ZqV5qLzn5uhv012B9VQWXP-Pu3cylnah/view

OUTCOME: TRUE 🎉