# In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Locate the project root: the parent of this file's directory when run as a
# script; otherwise (e.g. in a notebook, where __file__ is undefined) the
# parent of the current working directory.
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[1]
else:
    ROOT = Path.cwd().parents[0]

# Project folders, all resolved relative to ROOT.
CLEAN = ROOT.joinpath("data_clean")
RAW = ROOT.joinpath("data_raw")
GEO = ROOT.joinpath("data", "geo")
VIS = ROOT.joinpath("visuals")
VIS.mkdir(parents=True, exist_ok=True)  # make sure the figure folder exists

# Cleaned inputs. The YoY deltas file is optional; it is computed if missing.
merged_path = CLEAN.joinpath("baltimore_igs_merged.csv")
deltas_path = CLEAN.joinpath("baltimore_igs_yoy_deltas.csv")
print("ROOT:", ROOT)
print("Looking for cleaned files at:", CLEAN)

# GeoPandas is a hard requirement here: fail immediately with install
# instructions rather than with a bare import error.
try:
    import geopandas as gpd
except ModuleNotFoundError as missing:
    install_hint = (
        "GeoPandas is required for this notebook.\n"
        "Install in your venv:\n\n"
        "  python -m pip install geopandas pyproj shapely fiona\n"
    )
    raise ModuleNotFoundError(install_hint) from missing

# Candidate sources for the tract geometry, all kept under the GEO folder.
GEO.mkdir(parents=True, exist_ok=True)
local_geojson = GEO.joinpath("baltimore_tracts.geojson")
tiger_dir = GEO.joinpath("tl_2023_24510_tract")
# The zip sits next to the extraction folder and shares its stem.
tiger_zip = tiger_dir.with_suffix(".zip")
tiger_url = "https://www2.census.gov/geo/tiger/TIGER2023/TRACT/tl_2023_24510_tract.zip"

def load_tracts():
    """Load the tract geometry, trying local sources before downloading.

    Sources are tried in this order:

    1. the GeoJSON file at ``local_geojson``;
    2. the first ``*.shp`` file found inside ``tiger_dir``;
    3. the TIGER zip at ``tiger_zip`` (downloaded from ``tiger_url`` first if
       it is not already on disk), extracted into ``tiger_dir``.

    Returns:
        Whatever ``gpd.read_file`` returns for the first usable source.

    Raises:
        FileNotFoundError: if the download/extract fallback fails for any
            reason (network error, bad zip, no shapefile in the archive).
            The underlying exception is chained and summarised in the message.
    """
    # Preferred: local geojson
    if local_geojson.exists():
        return gpd.read_file(local_geojson)

    # Second: a local TIGER shapefile folder. The folder existing does not
    # guarantee a shapefile is inside it (e.g. an interrupted extraction), so
    # fall through to the download path instead of raising StopIteration.
    shp = next(tiger_dir.glob("*.shp"), None) if tiger_dir.exists() else None
    if shp is not None:
        return gpd.read_file(shp)

    # Fallback: try to download TIGER and read
    try:
        # Imported lazily so these are only needed when the fallback is used.
        import zipfile

        import requests

        if not tiger_zip.exists():
            r = requests.get(tiger_url, timeout=60)
            r.raise_for_status()
            tiger_zip.write_bytes(r.content)
        with zipfile.ZipFile(tiger_zip) as z:
            z.extractall(tiger_dir)
        shp = next(tiger_dir.glob("*.shp"), None)
        if shp is None:
            # Give the wrapper below something meaningful to report.
            raise FileNotFoundError(
                f"no .shp file found in {tiger_dir} after extracting {tiger_zip}"
            )
        return gpd.read_file(shp)
    except Exception as e:
        raise FileNotFoundError(
            "Could not find Baltimore tract geometry.\n"
            "Options:\n"
            "  A) Put a GeoJSON at data/geo/baltimore_tracts.geojson\n"
            "  B) Download & unzip TIGER to data/geo/tl_2023_24510_tract/\n"
            f"Details: {e}"
        ) from e

tracts = load_tracts()

# Normalize the GEOID column name to 'geoid'.
# Exactly ONE column is renamed: renaming every match would create duplicate
# 'geoid' columns when a file carries several variants (e.g. GEOID and
# GEOID10), after which tracts["geoid"] is a DataFrame and .str fails.
# NOTE(review): the preference order below (GEOID, then GEOID20, then GEOID10)
# is a judgment call; confirm it matches the tract vintage of the cleaned data.
_alias_rank = {"geoid": 0, "geoid20": 1, "geoid10": 2}
_geoid_cols = [c for c in tracts.columns if str(c).lower() in _alias_rank]
if not _geoid_cols:
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    raise ValueError("Could not find GEOID column in tract geometry.")
# A column already named exactly 'geoid' wins, so the rename can never collide.
_geoid_col = min(
    _geoid_cols,
    key=lambda c: (c != "geoid", _alias_rank[str(c).lower()]),
)
tracts = tracts.rename(columns={_geoid_col: "geoid"})

# Zero-pad to 11 characters so keys compare equal as strings in later merges.
tracts["geoid"] = tracts["geoid"].astype(str).str.zfill(11)
tracts = tracts.to_crs(4326)  # safe default
tracts.head()

# Load the cleaned, merged dataset (required).
merged = pd.read_csv(merged_path)
print("Merged rows:", len(merged))

# The deltas file is optional; None signals that YoY must be derived from
# `merged` further down.
deltas = pd.read_csv(deltas_path) if deltas_path.exists() else None
if deltas is None:
    print("No deltas file found; will compute YoY from merged.")
else:
    print("Deltas rows:", len(deltas))

# Infer the primary score column: the first column (in file order) whose
# lower-cased name is one of the known aliases.
score_aliases = frozenset(
    ("score", "overall_score", "igs_score", "inclusive_growth_score")
)
cands = [name for name in merged.columns if name.lower() in score_aliases]
primary = next(iter(cands), None)
if primary is None:
    raise ValueError(
        f"Could not infer a primary score column. Columns available:\n{list(merged.columns)[:50]}"
    )
print("Primary score column:", primary)

# Normalize keys used for merges and filters.
# Validate with explicit raises rather than assert (asserts are stripped under
# `python -O`), and check 'geoid' as well so a missing key column gives a
# readable message instead of a bare KeyError.
for _required in ("geoid", "year"):
    if _required not in merged.columns:
        raise ValueError(f"The cleaned dataset must have a '{_required}' column.")

# Zero-pad to 11 characters so keys compare equal, as strings, to the tract
# geometry keys.
merged["geoid"] = merged["geoid"].astype(str).str.zfill(11)
latest_year = int(merged["year"].max())
print("Latest year:", latest_year)

# Choropleth of the primary score for the latest year.
latest = merged.query("year == @latest_year").copy()
# Left merge keeps every tract, including those without a score row.
g_latest = tracts.merge(latest, on="geoid", how="left")

# Simple quantile bins with labels for a clean legend
labels = ["Q1 (low)", "Q2", "Q3", "Q4 (high)"]
_scores = g_latest[primary]
if not _scores.notna().any():
    raise ValueError(
        f"No tract has a '{primary}' value for year {latest_year}; "
        "check that the 'geoid' keys in the data match the tract geometry."
    )

# Compute the quartile edges explicitly and de-duplicate them. Calling
# pd.qcut(..., duplicates="drop", labels=labels) directly raises
# "Bin labels must be one fewer than the number of bin edges" whenever tied
# values collapse two edges, because the label list keeps its fixed length.
_edges = np.unique(_scores.quantile(np.linspace(0, 1, 5)).to_numpy())
_n_bins = len(_edges) - 1
if _n_bins < 1:
    raise ValueError(
        f"All '{primary}' values for year {latest_year} are identical; "
        "cannot build quantile bins."
    )
if _n_bins == len(labels):
    _bin_labels = labels
else:
    # Fewer distinct bins than quartiles: relabel so the legend stays honest.
    _bin_labels = [f"Q{i}" for i in range(1, _n_bins + 1)]
    if _n_bins > 1:
        _bin_labels[0] += " (low)"
        _bin_labels[-1] += " (high)"
# include_lowest=True puts the minimum value in the first bin, as qcut does.
g_latest["score_bin"] = pd.cut(
    _scores, bins=_edges, labels=_bin_labels, include_lowest=True
)

fig, ax = plt.subplots(figsize=(8, 8))
g_latest.plot(column="score_bin", ax=ax, legend=True, edgecolor="white", linewidth=0.2)
ax.set_title(f"{primary} — Baltimore Census Tracts (Year {latest_year})")
ax.axis("off")
out1 = VIS / "igs_primary_latest_year.png"
plt.tight_layout()
plt.savefig(out1, dpi=180)
plt.show()
print("Saved:", out1)

# Choropleth of the year-over-year change in the primary score.
if deltas is None:
    # Compute YoY from merged if a deltas file wasn't produced by Notebook 01.
    # Rows are sorted by year within each tract, so diff() is the change from
    # that tract's previous row.
    tmp = merged.sort_values(["geoid", "year"]).copy()
    tmp["primary_score_yoy"] = tmp.groupby("geoid")[primary].diff()
    deltas = tmp[["geoid", "year", "primary_score_yoy"]].dropna()

# Normalize the merge key on BOTH paths. A deltas file read from CSV never had
# its 'geoid' zero-padded or cast to string, so merging it against the
# string-keyed tract geometry would fail on mismatched key dtypes or match
# nothing. assign() returns a new frame, so no slice of `tmp` is mutated.
deltas = deltas.assign(geoid=deltas["geoid"].astype(str).str.zfill(11))

# For a clean snapshot, use YoY of latest year
d_last = deltas.query("year == @latest_year").copy()
# Left merge keeps every tract, including those without a YoY value.
g_dlast = tracts.merge(d_last, on="geoid", how="left")
g_dlast["yoy_bin"] = pd.qcut(g_dlast["primary_score_yoy"], q=4, duplicates="drop")

fig, ax = plt.subplots(figsize=(8, 8))
g_dlast.plot(column="yoy_bin", ax=ax, legend=True, edgecolor="white", linewidth=0.2)
ax.set_title(f"YoY change in {primary} — Baltimore (Year {latest_year})")
ax.axis("off")
out2 = VIS / "igs_primary_yoy_latest_year.png"
plt.tight_layout()
plt.savefig(out2, dpi=180)
plt.show()
print("Saved:", out2)

# Rank tracts by latest-year YoY change and show the ten largest increases
# and the ten largest decreases.
yoy_col = "primary_score_yoy"
top10 = d_last.sort_values(by=yoy_col, ascending=False).iloc[:10]
bot10 = d_last.sort_values(by=yoy_col, ascending=True).iloc[:10]

for heading, table in (
    ("\nTop 10 YoY increases:", top10),
    ("\nBottom 10 YoY decreases:", bot10),
):
    print(heading)
    display(table)
