In [1]:
import pandas as pd
from pathlib import Path

RAW = Path("../data_raw")
CLEAN = Path("../data_clean")
CLEAN.mkdir(exist_ok=True)

def clean_census_state(path: Path):
    df = pd.read_csv(path, low_memory=False)
    df = df.query("AGEGRP == 0").copy()   # only total population
    df["county_fips"] = (
        df["STATE"].astype(str).str.zfill(2) +
        df["COUNTY"].astype(str).str.zfill(3)
    )
    df["county_name"] = df["CTYNAME"].str.replace(" County", "", regex=False)
    df["black_pop"] = df["BA_MALE"] + df["BA_FEMALE"]
    df["total_pop"] = df["TOT_POP"]
    df["black_pct"] = (df["black_pop"] / df["total_pop"] * 100).round(2)
    return df[["county_fips", "STNAME", "county_name", "total_pop", "black_pop", "black_pct"]]

all_states = []
for p in sorted(RAW.glob("*est2024-alldata-*.csv")):
    try:
        temp = clean_census_state(p)
        all_states.append(temp)
        print("✓", p.name, "→", len(temp), "rows")
    except Exception as e:
        print("⚠️", p.name, e)

county_demo = pd.concat(all_states, ignore_index=True)
county_demo.to_csv(CLEAN / "county_demographics_2024.csv", index=False)
print("Saved county_demographics_2024.csv with", len(county_demo), "rows")
county_demo.head()


✓ Florida-est2024-alldata-12.csv → 402 rows
✓ Georgia-est2024-alldata-13.csv → 954 rows
✓ Maryland-est2024-alldata-24.csv → 144 rows
✓ Michigan-est2024-alldata-26.csv → 498 rows
✓ Mississippi-est2024-alldata-28.csv → 492 rows
✓ Tennessee-est2024-alldata-47.csv → 570 rows
✓ alabama-est2024-alldata-01.csv → 402 rows
Saved county_demographics_2024.csv with 3462 rows


Unnamed: 0,county_fips,STNAME,county_name,total_pop,black_pop,black_pct
0,12001,Florida,Alachua,278474,56838,20.41
1,12001,Florida,Alachua,279765,57025,20.38
2,12001,Florida,Alachua,281710,57230,20.32
3,12001,Florida,Alachua,285241,57679,20.22
4,12001,Florida,Alachua,288962,58143,20.12
