## Installs

In [None]:
!pip3 install -qU geopandas

## Imports

In [None]:
# imports
from pathlib import Path
import pandas as pd
import geopandas as gpd

In [None]:
# Input locations, resolved relative to the notebook's working directory.
# ROOT is defined once here (it used to be assigned in two consecutive cells).
ROOT = Path(".")

# NOTE(review): the tract file name suggests a 2018 vintage while the ACS table is the
# 2023 5-year release. Tract GEOIDs can differ between vintages, which would show up
# as tracts without population after the join — confirm the two vintages agree.
TRACTS_ZIP = ROOT / "data/raw/tracts_tx_2018/tl_rd18_48_tract.zip"  # or your 2023 zip if you switched
ACS_CSV = ROOT / "data/raw/ACSDT5Y2023.B01003-Data.csv"

In [None]:
# Load the tract polygons from the zipped shapefile into a GeoDataFrame.
gdf = gpd.read_file(filename=TRACTS_ZIP)

In [None]:
# Find/construct GEOID (11-digit tract FIPS: 2-digit state + 3-digit county + 6-digit tract).
# Every part is zero-padded to its fixed width so leading zeros survive even if the
# source stored the codes as numbers instead of text.
if "GEOID" in gdf.columns:
    gdf["GEOID"] = gdf["GEOID"].astype(str).str.zfill(11)
else:
    # Construct from parts (common TIGER fields)
    for need in ["STATEFP", "COUNTYFP", "TRACTCE"]:
        assert need in gdf.columns, f"Missing expected column {need} in tract shapefile."
    gdf["GEOID"] = (
        gdf["STATEFP"].astype(str).str.zfill(2)
        + gdf["COUNTYFP"].astype(str).str.zfill(3)
        + gdf["TRACTCE"].astype(str).str.zfill(6)
    )

# The ACS join later matches on exactly 11 digits, so catch malformed keys here.
assert gdf["GEOID"].str.fullmatch(r"\d{11}").all(), "GEOID must be an 11-digit tract FIPS code."

In [None]:
# Reduce the frame to the join key plus geometry, then reproject to WGS84 (EPSG:4326).
keep_cols = ["GEOID", "geometry"]
tracts_subset = gdf[keep_cols]
gdf = tracts_subset.to_crs(epsg=4326)

In [None]:
# Report how many tract polygons were loaded and preview the first two rows.
tract_count = len(gdf)
print(f"Tracts read: {tract_count:,}")
display(gdf.head(n=2))

In [None]:
# --- read ACS total population (B01003_001E) ---
# Read every column as text so GEO_ID keeps its exact characters (leading zeros included).
acs_raw = pd.read_csv(ACS_CSV, dtype=str)

# Fail early, with a clear message, if any column used below is absent.
assert "GEO_ID" in acs_raw.columns, "Expected GEO_ID in ACS file."
# Some downloads name the estimate exactly 'B01003_001E'; keep as is
estimate_col = "B01003_001E"
assert estimate_col in acs_raw.columns, f"Expected {estimate_col} in ACS file."
assert "NAME" in acs_raw.columns, "Expected NAME in ACS file."

# Build the tidy table without mutating the raw frame:
#   GEOID     = last 11 characters of GEO_ID (the tract FIPS code)
#   total_pop = numeric estimate; non-numeric entries become NaN
acs = acs_raw.assign(
    GEOID=acs_raw["GEO_ID"].str[-11:],
    total_pop=pd.to_numeric(acs_raw[estimate_col], errors="coerce"),
)[["GEOID", "total_pop", "NAME"]]

# Census "-Data.csv" exports usually carry a second header row of descriptive labels,
# which would otherwise remain as a bogus record. Keep only rows whose key is a real
# 11-digit tract code (na=False also drops rows with a missing GEO_ID).
is_tract = acs["GEOID"].str.fullmatch(r"\d{11}", na=False)
acs = acs[is_tract].reset_index(drop=True)

print(f"ACS rows: {len(acs):,}")
display(acs.head(2))

# --- merge & checks ---
# Left join: every tract polygon is kept whether or not ACS has a matching GEOID.
merged = gdf.merge(acs, on="GEOID", how="left")

# A left join can only gain rows when a GEOID repeats on the ACS side; enforce the
# one-row-per-tract invariant instead of merely printing a reminder about it.
assert len(merged) == len(gdf), (
    f"Merge produced {len(merged):,} rows for {len(gdf):,} tracts; "
    "ACS table has duplicate GEOIDs."
)

# NaN here means either no ACS match or an estimate that was not numeric.
missing = merged["total_pop"].isna().sum()

print(f"Merged rows: {len(merged):,} (should equal tract count)")
print(f"Tracts missing ACS population: {missing:,}")

# Quick sanity: total population distribution
print(merged["total_pop"].describe())

# Save a clean, aligned geodataframe for later steps
OUT = ROOT/"data/processed/tracts_tx_with_pop.parquet"
# Create the output directory first; writing fails if it does not exist yet.
OUT.parent.mkdir(parents=True, exist_ok=True)
merged.to_parquet(OUT, index=False)
print(f"Saved: {OUT}")