In [2]:
# --- IGS Baltimore: ingest + quick EDA ---------------------------------------
# Works whether your CSVs are in repo root or in data_raw/.
# Produces: data/clean/igs_baltimore.parquet and basic summary prints.

from pathlib import Path
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ---------- Paths
ROOT = Path(__file__).resolve().parents[1] if "__file__" in globals() else Path.cwd()
RAW = (ROOT / "data_raw") if (ROOT / "data_raw").exists() else ROOT
OUT = ROOT / "data" / "clean"
OUT.mkdir(parents=True, exist_ok=True)

# ---------- Find IGS CSVs
PATTERN = "Inclusive_Growth_Score_Data_Export_*_BaltimoreCity_CensusTractLevel.csv"
csvs = sorted(RAW.glob(PATTERN))
assert csvs, f"No CSVs found matching {PATTERN} in {RAW}"

print("Found CSVs:")
for p in csvs: print(" •", p.relative_to(ROOT))

# ---------- Helpers
def to_snake(name: str) -> str:
    name = re.sub(r"[^\w]+", "_", name.strip())
    name = re.sub(r"__+", "_", name).strip("_")
    return name.lower()

def extract_year_from_df_or_name(df: pd.DataFrame, path: Path) -> int | None:
    # Common IGS year columns—adjust if needed
    for col in df.columns:
        if col.lower() in {"year","fy","report_year","reporting_year"}:
            try: return int(pd.to_numeric(df[col]).iloc[0])
            except Exception: pass
    # Fallback: look for a 4-digit year in filename
    m = re.search(r"(20\d{2})", path.name)
    return int(m.group(1)) if m else None

# ---------- Load + normalize
frames = []
for p in csvs:
    df = pd.read_csv(p, dtype=str)  # keep IDs exact
    # Standardize columns
    df.columns = [to_snake(c) for c in df.columns]

    # Try to create a consistent census tract key and year
    # Adjust these mappings to match your file’s actual column names
    # (print(df.columns) on first run if unsure)
    col_map = {
        "geoid": ["geoid", "census_tract_geoid", "tract_geoid", "census_tract"],
        "tract_name": ["name", "tract_name"],
        "county": ["county", "county_name"],
        "state": ["state", "state_name"],
        "city": ["city", "city_name", "municipality"],
        "igs_score": ["inclusive_growth_score", "igs_score", "score"],
    }

    norm = {}
    for std, candidates in col_map.items():
        for c in candidates:
            if c in df.columns:
                norm[std] = df[c]
                break

    # Attach normalized columns (only where found)
    for k, v in norm.items():
        df[k] = v

    # Strip whitespace, keep geoid as string
    if "geoid" in df:
        df["geoid"] = df["geoid"].str.strip()

    # --- Numeric coercion (safe + Pylance friendly)
cols = [str(c) for c in df.columns]
num_candidates = [c for c in cols if any(sub in c for sub in ("score", "index", "rate", "pct", "share"))]

for c in num_candidates:
    # force string, drop thousands commas, then coerce to numeric
    s = df[c].astype(str).str.replace(",", "", regex=False)
    df[c] = pd.to_numeric(s, errors="coerce")


    # Year
    yr = extract_year_from_df_or_name(df, p)
    if yr: df["year"] = yr

    # Source filename for traceability
    df["source_file"] = p.name
    frames.append(df)

raw = pd.concat(frames, ignore_index=True)

# ---------- Minimal cleaning
# Deduplicate exact rows
raw = raw.drop_duplicates()

# If both city and tracts exist, keep tract granularity (drop citywide rows)
if "geoid" in raw and raw["geoid"].notna().any():
    # heuristics: GEOIDs are 11 or more digits for tracts; filter out null/short
    raw = raw[raw["geoid"].str.len().fillna(0) >= 11]

# ---------- Save clean parquet
clean_path = OUT / "igs_baltimore.parquet"
raw.to_parquet(clean_path, index=False)
print(f"\nSaved clean file → {clean_path.relative_to(ROOT)}  ({len(raw):,} rows)")

# ---------- Quick EDA
print("\n=== Columns ===")
print(sorted(raw.columns))

print("\n=== Missing values (top 20) ===")
print(raw.isna().sum().sort_values(ascending=False).head(20))

num_cols = [c for c in raw.columns if pd.api.types.is_numeric_dtype(raw[c])]
print("\n=== Numeric summary (head) ===")
print(raw[num_cols].describe().T.head(10))

# Top/bottom tracts by IGS (if available)
score_col = next((c for c in ["igs_score","inclusive_growth_score"] if c in raw.columns), None)
if score_col:
    print("\n=== Top 10 tracts by IGS ===")
    print(raw[["geoid", "tract_name", score_col, "year"]]
          .sort_values(score_col, ascending=False).head(10))

    print("\n=== Bottom 10 tracts by IGS ===")
    print(raw[["geoid", "tract_name", score_col, "year"]]
          .sort_values(score_col, ascending=True).head(10))

    # Simple histogram
    plt.figure(figsize=(6,4))
    raw[score_col].dropna().plot(kind="hist", bins=30, edgecolor="black")
    plt.title("Distribution of IGS (tract level)")
    plt.xlabel(score_col)
    plt.ylabel("count")
    plt.tight_layout()
    plt.show()

# Trend over time (if year present)
if "year" in raw.columns and score_col:
    trend = (raw.groupby("year")[score_col]
             .mean().reset_index().sort_values("year"))
    print("\n=== Citywide mean IGS by year ===")
    print(trend)
    plt.figure(figsize=(6,4))
    plt.plot(trend["year"], trend[score_col], marker="o")
    plt.title("Citywide Mean IGS by Year")
    plt.xlabel("year")
    plt.ylabel(score_col)
    plt.tight_layout()
    plt.show()


AssertionError: No CSVs found matching Inclusive_Growth_Score_Data_Export_*_BaltimoreCity_CensusTractLevel.csv in /Users/warrenjones/Dev/igs-analysis-baltimore/notebooks