In [1]:
# --- Paths ---
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

ROOT = Path.cwd().parents[0] if (Path.cwd().parents[0] / "data_raw").exists() else Path.cwd()
RAW  = ROOT / "data_raw"
OUT  = ROOT / "data_clean"
OUT.mkdir(parents=True, exist_ok=True)

print("RAW dir:", RAW.resolve())

# --- Find CSVs (robust glob) ---
# Your export looks like Inclusive_Growth_Score_Data_Export_03-11-2025_014111.csv
PATTERN = "Inclusive_Growth_Score_Data_Export_*.csv"
csvs = sorted(RAW.glob(PATTERN))
if not csvs:
    # fallbacks: try any csv, or tell us what exists
    csvs = sorted(RAW.glob("*.csv"))
    print("No files matched PATTERN. Fallback to all CSVs in data_raw.")
    print("Found:", [p.name for p in csvs])

assert csvs, f"No CSVs found in {RAW.resolve()}"

print("Using CSVs:")
for p in csvs:
    print(" -", p.name)


RAW dir: /Users/warrenjones/Dev/igs-analysis-baltimore/data_raw
Using CSVs:
 - Inclusive_Growth_Score_Data_Export_03-11-2025_014111.csv


In [4]:
from pathlib import Path
import pandas as pd

def read_igs_csv_autohdr(path: Path) -> pd.DataFrame:
    """Read IGS CSVs that include META/SUMMARY lines above the real header."""
    # find the row that contains the real header
    header_idx = 0
    with open(path, 'r', encoding='utf-8-sig', errors='ignore') as f:
        probe = [next(f, '') for _ in range(40)]
    for i, line in enumerate(probe):
        if ('Census Tract FIPS code' in line) and ('Year' in line):
            header_idx = i
            break

    df = pd.read_csv(path, header=header_idx, engine='python', dtype=str)
    # drop unnamed filler columns
    df = df.loc[:, ~df.columns.str.match(r'^\s*Unnamed')]
    df.columns = df.columns.str.strip()
    return df

def normalize_cols(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out.columns = (out.columns
                   .str.strip()
                   .str.lower()
                   .str.replace(r'[\s\-/]+', '_', regex=True))
    return out


In [5]:
df_list = []
for p in csvs:
    print(f"\nðŸ“„ Loading: {p.name}")
    raw = read_igs_csv_autohdr(p)
    print("  Raw cols:", list(raw.columns)[:12], "...")
    df_list.append(raw)

df = pd.concat(df_list, ignore_index=True)
df = normalize_cols(df)

print("\nâœ… Loaded & normalized. Columns:", list(df.columns)[:12], "...")
print("Years:", sorted(pd.to_numeric(df.get('year'), errors='coerce').dropna().unique().tolist()))
df.head()



ðŸ“„ Loading: Inclusive_Growth_Score_Data_Export_03-11-2025_014111.csv
  Raw cols: ['N/A', 'Is an Opportunity Zone', 'Census Tract FIPS code', 'County', 'State', 'Year', 'Inclusive Growth Score', 'Growth', 'Inclusion'] ...

âœ… Loaded & normalized. Columns: ['n_a', 'is_an_opportunity_zone', 'census_tract_fips_code', 'county', 'state', 'year', 'inclusive_growth_score', 'growth', 'inclusion'] ...
Years: [2020.0, 2021.0, 2022.0, 2023.0, 2024.0]


Unnamed: 0,n_a,is_an_opportunity_zone,census_tract_fips_code,county,state,year,inclusive_growth_score,growth,inclusion
0,,,,,,,,,
1,0.0,,24510150800.0,Baltimore city,Maryland,2020.0,36.0,43.0,29.0
2,1.0,,24510150800.0,Baltimore city,Maryland,2021.0,40.0,48.0,31.0
3,2.0,,24510150800.0,Baltimore city,Maryland,2022.0,34.0,36.0,33.0
4,3.0,,24510150800.0,Baltimore city,Maryland,2023.0,38.0,42.0,33.0


In [6]:
print("Columns:", list(df.columns))
print("Years:", df['year'].unique())
print("Counties:", df['county'].unique())

# Missingness
df.isna().sum().sort_values(ascending=False).head(10)

# Simple stats for the score column if present
score_col = next((c for c in ['inclusive_growth_score','igs_score','score'] if c in df.columns), None)
if score_col:
    print(df[score_col].astype(float).describe())


Columns: ['n_a', 'is_an_opportunity_zone', 'census_tract_fips_code', 'county', 'state', 'year', 'inclusive_growth_score', 'growth', 'inclusion']
Years: [nan '2020' '2021' '2022' '2023' '2024']
Counties: [nan 'Baltimore city' 'Baltimore County' 'Anne Arundel County']
count    1203.000000
mean       43.035744
std        10.025735
min        18.000000
25%        36.000000
50%        42.000000
75%        50.000000
max        71.000000
Name: inclusive_growth_score, dtype: float64
