In [1]:
import pandas as pd
import geopandas as gpd

# Show every column and every row when a frame is displayed (no truncation).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

We're trying to make a searchable map with one static layer and 3 toggle layers. 

Static layer = air quality monitors (both regulatory and PurpleAir), styled by 3-yr avg. PM2.5 reading

Toggle layers:
- life expectancy by tract
- asthma by tract
- POC or poverty by tract

Steps:
- pull 2020 and 2010 tract shapefiles
- restrict to just Houston metro
- pull asthma data from [CDC PLACES](https://data.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-Census-Tract-D/cwsq-ngmh/about_data)
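- pull most recent POC/poverty from Census API (note: the income layer in this notebook currently reads a downloaded ACS S1901 table from `../data/source/` rather than calling the API)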
- pull life expectancy [from CDC](https://data.cdc.gov/NCHS/U-S-Life-Expectancy-at-Birth-by-State-and-Census-T/5h56-n989/about_data) - note, this uses the 2010 tracts
- join data files to shapes and export
- simplify geojson files in mapshaper by 20%
- upload manually to sfc fileserver so you don't need to change filepath in searchable map every damn time you update the data (`tx-data/hc-pm25`)

In [2]:
########################################
# load geos
########################################

# Houston metro counties: display name -> 5-character county FIPS string.
houmetro_cntys = {
    'Fort Bend, TX': '48157',
    'San Jacinto, TX': '48407',
    'Waller, TX': '48473',
    'Galveston, TX': '48167',
    'Harris, TX': '48201',
    'Liberty, TX': '48291',
    'Chambers, TX': '48071',
    'Brazoria, TX': '48039',
    'Austin, TX': '48015',
    'Montgomery, TX': '48339',
}

# Lookups derived from the dict above: FIPS -> name, plus the FIPS codes as
# ints and strs and the county names as a plain list.
houmetro_cntys_swap = dict(zip(houmetro_cntys.values(), houmetro_cntys.keys()))
houmetro_fips_ints = list(map(int, houmetro_cntys.values()))
houmetro_fips_strs = list(map(str, houmetro_cntys.values()))
houmetro_nms = list(map(str, houmetro_cntys))


# --- 2010 tracts (TIGER 2010 field names carry a "10" suffix) ---
tracts10_shp = gpd.read_file('https://www2.census.gov/geo/tiger/TIGER2010/TRACT/2010/tl_2010_48_tract10.zip')
# county GEOID = state FIPS followed by county FIPS
tracts10_shp['cnty_geoid'] = tracts10_shp['STATEFP10'].astype(str).str.cat(tracts10_shp['COUNTYFP10'].astype(str))
# county name is only filled in for metro counties; every other county maps to NaN
tracts10_shp['cnty_nm'] = tracts10_shp['cnty_geoid'].map(houmetro_cntys_swap)
tracts10_shp = tracts10_shp.rename(columns={'GEOID10': 'tract_geoid'})
# keep only metro tracts and the four fields the map needs
houtracts10_shp = tracts10_shp.loc[
    tracts10_shp['cnty_geoid'].isin(houmetro_fips_strs),
    ['cnty_geoid', 'cnty_nm', 'tract_geoid', 'geometry'],
]

# --- 2020 tracts (TIGER 2024 vintage, unsuffixed field names) ---
tracts20_shp = gpd.read_file('https://www2.census.gov/geo/tiger/TIGER2024/TRACT/tl_2024_48_tract.zip')
tracts20_shp['cnty_geoid'] = tracts20_shp['STATEFP'].astype(str).str.cat(tracts20_shp['COUNTYFP'].astype(str))
tracts20_shp['cnty_nm'] = tracts20_shp['cnty_geoid'].map(houmetro_cntys_swap)
tracts20_shp = tracts20_shp.rename(columns={'GEOID': 'tract_geoid'})
houtracts20_shp = tracts20_shp.loc[
    tracts20_shp['cnty_geoid'].isin(houmetro_fips_strs),
    ['cnty_geoid', 'cnty_nm', 'tract_geoid', 'geometry'],
]

In [None]:
# Quick look at both metro tract layers: row count, column dtypes, first row.
for _tract_layer in (houtracts10_shp, houtracts20_shp):
    print(len(_tract_layer))
    print(_tract_layer.dtypes)
    display(_tract_layer.head(1))

In [None]:
########################################
# CDC asthma data
########################################

# Load the national CDC PLACES tract-level file.
cdc = pd.read_csv('../data/source/CDC/PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2024_release_20250103.csv')

# Restrict to Houston metro counties. CountyFIPS is matched against the
# integer FIPS list, so this relies on pandas parsing that column as integers.
hou_cdc = cdc.loc[cdc['CountyFIPS'].isin(houmetro_fips_ints)]

# The national frame is huge and no longer needed.
del cdc

# Source column -> short name used in the exported GeoJSON.
rename_cols = {'Data_Value':'dv','TotalPopulation':'p','TotalPop18plus':'py',
               'tract_geoid':'tract_geoid'}

# Keep only the asthma measure, derive the string tract GEOID from
# LocationName, then trim and rename the columns. This is built as one
# non-mutating chain so nothing is assigned onto a slice of `hou_cdc`
# (the original in-place edits triggered pandas' SettingWithCopyWarning).
hou_asthma = (
    hou_cdc.loc[hou_cdc['MeasureId'] == 'CASTHMA']
    .assign(tract_geoid=lambda df: df['LocationName'].astype(str))
    .rename(columns=rename_cols)
    .loc[:, list(rename_cols.values())]
)

# Attach to the 2020 tract shapes; the left join keeps every metro tract so
# the check cells can see which ones have no asthma value.
hou_asthma_geo = houtracts20_shp.merge(hou_asthma,on='tract_geoid',how='left')

# Drop tracts without a value and save the layer for the map.
hou_asthma_geo_export = hou_asthma_geo.loc[hou_asthma_geo['dv'].notna()]
hou_asthma_geo_export.to_file('../GIS/for-map/cdc-asthma-houmetro.geojson')

In [None]:
# Value range for the searchable-map colour scale. The first number counts
# nulls left in the export, which was already filtered to non-null rows.
_asthma_vals = hou_asthma_geo_export['dv']
print(int(_asthma_vals.isna().sum()))
print(_asthma_vals.min())
print(_asthma_vals.max())

In [None]:
# Eyeball the joined asthma layer: row count, column names, first rows.
print(hou_asthma_geo.shape[0])
print(hou_asthma_geo.columns)
display(hou_asthma_geo.head(5))

In [None]:
# Check for missing values: tracts that did not pick up an asthma reading.
_asthma_missing = hou_asthma_geo[hou_asthma_geo['dv'].isna()]
print(len(_asthma_missing))
display(_asthma_missing)

In [6]:
########################################
# CDC life expectancy data
########################################

# Load the national tract-level life expectancy file.
life_ex_us = pd.read_csv('../data/source/CDC/U.S._Life_Expectancy_at_Birth_by_State_and_Census_Tract_-_2010-2015.csv')

# Strip " County" so the county labels line up with the keys of `houmetro_cntys`.
life_ex_us['County'] = life_ex_us['County'].str.replace(' County','',regex=False)

# Restrict to Houston metro counties and drop rows without a tract number.
# .copy() gives an independent frame so the column assignments below do not
# write onto a slice of `life_ex_us` (SettingWithCopyWarning).
hou_life = life_ex_us.loc[
    (life_ex_us['State'] == 'Texas')
    & (life_ex_us['County'].isin(houmetro_nms))
    & (life_ex_us['Census Tract Number'].notna())
].copy()

# The national frame is no longer needed.
del life_ex_us

# Build the tract GEOID as county FIPS + 6-digit tract code.
# The tract number is formatted with two decimals and the decimal point is
# removed as a literal character (regex=False keeps '.' from ever being read
# as a regex wildcard). zfill(6) left-pads tract numbers below 1000 with
# zeros; without it those rows produce a short GEOID that never joins.
hou_life['tract_fips'] = (
    hou_life['Census Tract Number']
    .apply(lambda x: f'{x:.2f}')
    .astype(str)
    .str.replace('.','',regex=False)
    .str.zfill(6)
)
hou_life['cnty_fips'] = hou_life['County'].map(houmetro_cntys)
hou_life['tract_geoid'] = hou_life['cnty_fips'] + hou_life['tract_fips']

# Source column -> short name used in the exported GeoJSON.
rename_cols = {'Life Expectancy':'ex','Life Expectancy Range':'exr', 
               'Life Expectancy Standard Error':'exe','tract_geoid':'tract_geoid'}
hou_life = hou_life.rename(columns=rename_cols)[list(rename_cols.values())]

# Attach to the 2010 tract shapes. This is the default inner join (unlike the
# left joins used for the other layers), so only tracts present in both
# frames are kept.
hou_life_geo = houtracts10_shp.merge(hou_life,on='tract_geoid')

# Drop tracts without a value and save the layer for the map.
hou_life_geo_export = hou_life_geo.loc[hou_life_geo['ex'].notna()]
hou_life_geo_export.to_file('../GIS/for-map/cdc-lifeexpectancy-houmetro.geojson')

In [None]:
# Value range for the searchable-map colour scale. The first number counts
# nulls left in the export, which was already filtered to non-null rows.
_life_vals = hou_life_geo_export['ex']
print(int(_life_vals.isna().sum()))
print(_life_vals.min())
print(_life_vals.max())

In [None]:
# Confirm every life-expectancy row found a tract shape: compare row counts,
# then list any rows whose GEOID is absent from the joined layer.
print('hou_life:',hou_life.shape[0])
print('hou_life_geo:',hou_life_geo.shape[0])
print(hou_life.columns)
_joined_geoids = list(hou_life_geo['tract_geoid'].unique())
_unjoined_life = hou_life[~hou_life['tract_geoid'].isin(_joined_geoids)]
display(_unjoined_life.head())

In [None]:
########################################
# Poverty data
# (the file loaded here is ACS table S1901 and the output is written to
#  census-hh-income-houmetro.geojson, so this cell builds the household
#  income layer)
########################################

# Load the downloaded ACS S1901 table.
pov = pd.read_csv('../data/source/ACSST5Y2023.S1901_2025-01-03T110020/ACSST5Y2023.S1901-Data.csv')

# Drop the first data row, which holds the human-readable column labels.
# .copy() gives an independent frame so the column assignments below do not
# write onto a slice (SettingWithCopyWarning).
pov = pov.iloc[1:].copy()

# Build join keys and numeric fields.
pov['tract_geoid'] = pov['GEO_ID'].str.replace('1400000US','',regex=False)
pov['cnty_geoid'] = pov['tract_geoid'].str[:5]
# NOTE(review): errors='coerce' turns any non-numeric entry into NaN, and
# those tracts are then dropped from the export below. Confirm the source has
# no annotated values (e.g. top-coded medians) that should be kept.
pov['S1901_C01_012E'] = pd.to_numeric(pov['S1901_C01_012E'],errors='coerce')
pov['S1901_C01_012M'] = pd.to_numeric(pov['S1901_C01_012M'],errors='coerce')

# Restrict to Houston metro counties.
hou_pov = pov.loc[pov['cnty_geoid'].isin(houmetro_fips_strs)]

# The full ACS table is no longer needed.
del pov

# Source column -> short name used in the exported GeoJSON.
# NOTE(review): 'hh' (S1901_C01_001E) is not converted to numeric above, so
# it is exported with whatever dtype read_csv gave it; confirm the map
# expects that.
keep_cols = {'tract_geoid':'tract_geoid','S1901_C01_001E':'hh',
             'S1901_C01_012E':'mi','S1901_C01_012M':'mimoe'}
# Select and rename without inplace so `hou_pov` ends up as an independent
# frame; a later cell adds a column to it.
hou_pov = hou_pov[list(keep_cols.keys())].rename(columns=keep_cols)

# Attach to the 2020 tract shapes; the left join keeps every metro tract.
hou_pov_geo = houtracts20_shp.merge(hou_pov,on='tract_geoid',how='left')

# Drop tracts without an 'mi' value and save the layer for the map.
hou_pov_geo_export = hou_pov_geo.loc[hou_pov_geo['mi'].notna()]
hou_pov_geo_export.to_file('../GIS/for-map/census-hh-income-houmetro.geojson')

In [None]:
# Value range for the searchable-map colour scale. The first number counts
# nulls left in the export, which was already filtered to non-null rows.
_income_vals = hou_pov_geo_export['mi']
print(int(_income_vals.isna().sum()))
print(_income_vals.min())
print(_income_vals.max())

In [None]:
# Row counts before and after joining to the tract shapes.
for _income_frame in (hou_pov, hou_pov_geo):
    print(_income_frame.shape[0])

In [None]:
# How many tracts have an outsized margin of error?
# moe_share = margin of error as a percentage of the estimate (added to hou_pov).
hou_pov['moe_share'] = hou_pov['mimoe'].div(hou_pov['mi']).mul(100)

# Share of tracts above the cutoff (29.99 is the threshold the code uses).
_n_high_moe = int((hou_pov['moe_share'] > 29.99).sum())
print('Share with MOE over 30%:',(_n_high_moe/len(hou_pov))*100)

# Top 20 by MOE share, then bottom 20.
for _ascending in (False, True):
    display(hou_pov.sort_values('moe_share',ascending=_ascending).head(20))