# 01 · Data Cleaning & Merge — Baltimore IGS

This notebook loads three years of Mastercard **Inclusive Growth Score (IGS)** data (tract‑level, Baltimore City), standardizes identifiers, merges into one tidy table, and computes **year‑over‑year deltas** for all numeric metrics.

> **Expected repo layout (run from `notebooks/`):**
>
> - `../data_raw/`  → raw .csv or .xlsx files (2022–2024)
> - `../data_clean/`  → cleaned outputs written by this notebook


In [1]:
# --- Imports & Paths
from pathlib import Path
import pandas as pd
import numpy as np

RAW_DIR = Path('../data_raw')
CLEAN_DIR = Path('../data_clean')
CLEAN_DIR.mkdir(parents=True, exist_ok=True)

RAW_DIR, CLEAN_DIR

(PosixPath('../data_raw'), PosixPath('../data_clean'))

In [2]:
# --- Helper functions

def read_any(path: Path) -> pd.DataFrame:
    """Read CSV or Excel by file extension. Returns DataFrame with raw columns."""
    ext = path.suffix.lower()
    if ext in {'.csv', '.txt'}:
        return pd.read_csv(path, low_memory=False)
    elif ext in {'.xlsx', '.xls'}:
        return pd.read_excel(path, engine='openpyxl')
    else:
        raise ValueError(f'Unsupported file type: {ext} for {path}')

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Lowercase + snake_case columns and strip whitespace."""
    df = df.copy()
    df.columns = (
        df.columns
          .str.strip()
          .str.replace('[\s\-/]+', '_', regex=True)
          .str.replace('[^0-9a-zA-Z_]', '', regex=True)
          .str.lower()
    )
    return df

def ensure_geoid(df: pd.DataFrame) -> pd.DataFrame:
    """Try to coerce a tract identifier (geoid/fips/tract) to 11-char string."""
    df = df.copy()
    candidates = [c for c in df.columns if c in {'geoid','tract_geoid','census_tract_fips_code','tract_fips','census_tract_fips','fips','census_tract'}]
    if not candidates:
        for c in df.columns:
            if 'geoid' in c or ('tract' in c and 'fips' in c):
                candidates.append(c)
    if not candidates:
        raise KeyError('Could not find a tract identifier column (e.g., GEOID / FIPS).')
    geo_col = candidates[0]
    df.rename(columns={geo_col: 'geoid'}, inplace=True)
    df['geoid'] = (df['geoid'].astype(str)
                   .str.replace('\\.0$', '', regex=True)
                   .str.replace('[^0-9]', '', regex=True)
                   .str.zfill(11))
    return df

def infer_year(df: pd.DataFrame, filename: str) -> int:
    """Infer year from a column (e.g., 'year') or the filename."""
    for c in df.columns:
        if c == 'year' or c.endswith('_year'):
            try:
                year = int(pd.to_numeric(df[c], errors='coerce').mode().iat[0])
                return year
            except Exception:
                pass
    import re
    m = re.search(r'(20\d{2})', filename)
    if m:
        return int(m.group(1))
    raise ValueError('Could not infer year from data or filename: ' + filename)

def select_numeric_metrics(df: pd.DataFrame) -> list:
    """Return list of numeric metric columns (exclude id/label columns)."""
    exclude = {'geoid','year','state','county','census_tract_designation','place','tract_name','name'}
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    return [c for c in num_cols if c not in exclude]


  .str.replace('[\s\-/]+', '_', regex=True)


## Load & Merge

In [3]:
# Gather files
files = sorted([p for p in RAW_DIR.glob('*') if p.suffix.lower() in {'.csv','.xlsx','.xls','.txt'}])
assert files, f'No data files found in {RAW_DIR.resolve()}.'
files

AssertionError: No data files found in /Users/warrenjones/Dev/igs-analysis-baltimore/data_raw.

In [None]:
# Read, normalize, and stack
frames = []
for p in files:
    df = read_any(p)
    df = normalize_columns(df)
    year = infer_year(df, p.name)
    df = ensure_geoid(df)
    df['year'] = year
    frames.append(df)

merged = pd.concat(frames, ignore_index=True, sort=False)

print('Years found:', sorted(merged['year'].unique().tolist()))
print('Rows:', len(merged), '| Columns:', len(merged.columns))
merged.head()

## Save merged clean table

In [None]:
merged_out = CLEAN_DIR / 'baltimore_igs_merged.csv'
merged.to_csv(merged_out, index=False)
merged_out.resolve()

## Compute YoY deltas (per tract, all numeric metrics)

In [None]:
# Identify numeric metrics to diff
metric_cols = select_numeric_metrics(merged)
print('Metric columns (sample):', metric_cols[:10], '...')

merged_sorted = merged.sort_values(['geoid','year'])

def diff_group(g):
    g = g.sort_values('year')
    diffs = g[metric_cols].diff()
    diffs.columns = [f'{c}_yoy' for c in diffs.columns]
    return pd.concat([g[['geoid','year']], diffs], axis=1)

deltas = (merged_sorted
          .groupby('geoid', group_keys=False)
          .apply(diff_group)
          .dropna()
          .reset_index(drop=True))

deltas_out = CLEAN_DIR / 'baltimore_igs_yoy_deltas.csv'
deltas.to_csv(deltas_out, index=False)
deltas.head()

## Quick summary: Top/Bottom movers by overall score (if present)

In [None]:
score_candidates = [c for c in merged.columns if c.lower() in {'score','overall_score','inclusive_growth_score'} or 'score' in c.lower()]
score_candidates

In [None]:
if score_candidates:
    primary = score_candidates[0]
    merged_primary = merged.sort_values(['geoid','year']).copy()
    merged_primary['primary_score_yoy'] = merged_primary.groupby('geoid')[primary].diff()
    last_year = merged_primary['year'].max()
    last_yoy = merged_primary[merged_primary['year']==last_year][['geoid','primary_score_yoy']].dropna()
    top10 = last_yoy.sort_values('primary_score_yoy', ascending=False).head(10)
    bottom10 = last_yoy.sort_values('primary_score_yoy', ascending=True).head(10)
else:
    primary = None
    top10 = bottom10 = pd.DataFrame()
primary, top10, bottom10

## Save quick reports

In [None]:
reports = {}
if score_candidates:
    reports['top10_primary_yoy'] = CLEAN_DIR / 'top10_primary_yoy.csv'
    reports['bottom10_primary_yoy'] = CLEAN_DIR / 'bottom10_primary_yoy.csv'
    top10.to_csv(reports['top10_primary_yoy'], index=False)
    bottom10.to_csv(reports['bottom10_primary_yoy'], index=False)
reports

### Next steps
- Join tract GEOIDs to a Baltimore **tract shapefile** (GeoPandas) for mapping.
- Blend in contextual variables (e.g., investment programs, demolition permits, etc.).
- Validate which metric column best reflects the IGS "overall score" for your study.
