# 02 · Trend Analysis — Baltimore IGS

This notebook loads cleaned outputs from `01_data_cleaning.ipynb` and produces quick insights:
- Top / bottom movers by primary score (latest YoY)
- Summary tables (by year)
- Distribution plots for YoY changes
- Simple tract spotlight helper

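**Inputs** (expected):
- `../data_clean/igs_baltimore.parquet` (or `../data/clean/igs_baltimore.parquet`) — read by the first code cell; requires `pyarrow` or `fastparquet`
- `../data_clean/baltimore_igs_merged.csv`
- `../data_clean/baltimore_igs_yoy_deltas.csv`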

**Outputs**:
- `../visuals/igs_yoy_histogram.png`
- `../visuals/igs_top10_primary_yoy.png`
- `../visuals/igs_bottom10_primary_yoy.png`


In [1]:
# --- Load clean data
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Locate the cleaned parquet file: prefer ../data_clean, otherwise use ../data/clean.
_parquet_primary = Path("../data_clean/igs_baltimore.parquet")
_parquet_alternate = Path("../data/clean/igs_baltimore.parquet")
CLEAN = _parquet_primary if _parquet_primary.exists() else _parquet_alternate

try:
    df = pd.read_parquet(CLEAN)
except ImportError:
    # pandas raises ImportError when neither pyarrow nor fastparquet is installed.
    # Fall back to the merged CSV listed in the notebook header when it is present;
    # otherwise re-raise so the missing dependency stays visible.
    # NOTE(review): assumes the merged CSV carries the same tract/year/score
    # columns as the parquet file — confirm against 01_data_cleaning.ipynb.
    _csv_fallback = Path("../data_clean/baltimore_igs_merged.csv")
    if not _csv_fallback.exists():
        raise
    CLEAN = _csv_fallback
    df = pd.read_csv(CLEAN)
print(f"Loaded {len(df):,} rows from {CLEAN}")

# --- Canonicalize score/keys
# The first known score-column name that is present in the frame wins.
score_col = None
for _candidate in ("igs_score", "inclusive_growth_score", "score"):
    if _candidate in df.columns:
        score_col = _candidate
        break

if score_col is None:
    raise ValueError("Couldn't find an IGS score column. Show me df.columns and I'll map it.")

# GEOIDs are identifiers, not quantities: store them as whitespace-free strings.
if "geoid" in df.columns:
    geoid_text = df["geoid"].astype(str)
    df["geoid"] = geoid_text.str.strip()

# --- Citywide mean by year
if "year" in df.columns:
    # One row per year holding the mean score across all rows of that year.
    city_trend = (
        df.groupby("year", as_index=False)[score_col]
          .mean()
          .sort_values("year")
    )
    display(city_trend.head())

    # Line chart of the yearly means.
    fig, ax = plt.subplots(figsize=(7, 4))
    ax.plot(city_trend["year"], city_trend[score_col], marker="o")
    ax.set_title("Citywide Mean IGS by Year")
    ax.set_xlabel("Year")
    ax.set_ylabel(score_col)
    fig.tight_layout()
    plt.show()

# --- Tract YoY change (latest year vs previous)
if {"geoid", "year", score_col}.issubset(df.columns):
    # Work from the DISTINCT years. The raw column repeats each year once per
    # tract, so Series.nlargest(2) would return the latest year twice, making
    # "previous" equal to "latest" and every YoY change exactly 0.
    year_values = np.sort(df["year"].dropna().unique())
    latest = int(year_values[-1]) if len(year_values) >= 1 else None
    prev = int(year_values[-2]) if len(year_values) >= 2 else None

    # Explicit None check: a truthiness test would also skip a legitimate year 0.
    if prev is not None:
        # Use the original year values (not the int labels) as column keys so
        # the lookup matches whatever dtype the year column has.
        prev_key, latest_key = year_values[-2], year_values[-1]

        # One row per tract, one column per year; duplicate rows are averaged.
        wide = df.pivot_table(index="geoid", columns="year", values=score_col, aggfunc="mean")

        # pivot_table omits columns that are entirely NaN; reindex restores such
        # a year as an all-NaN column instead of failing on a missing key.
        year_pair = wide.reindex(columns=[prev_key, latest_key])
        wide["yoy_change"] = year_pair[latest_key] - year_pair[prev_key]

        # Tracts with a score in both years, largest gain first.
        movers = (wide[["yoy_change"]]
                  .dropna()
                  .sort_values("yoy_change", ascending=False))
        top10 = movers.head(10).reset_index()
        bottom10 = movers.tail(10).reset_index()

        print(f"\nTop 10 movers ({prev}→{latest}):")
        display(top10)

        print(f"\nBottom 10 movers ({prev}→{latest}):")
        display(bottom10)

        fig, ax = plt.subplots(figsize=(7, 4))
        ax.hist(movers["yoy_change"], bins=30, edgecolor="black")
        ax.set_title(f"Distribution of YoY Change ({prev}→{latest})")
        ax.set_xlabel("YoY change")
        ax.set_ylabel("Count")
        fig.tight_layout()
        plt.show()
    else:
        print("Only one year present—skipping YoY movers.")

# --- Correlation heatmap of numeric columns
# A correlation matrix needs at least two numeric columns to be meaningful.
num = df.select_dtypes(include="number")
if num.shape[1] < 2:
    print("Not enough numeric columns for a correlation heatmap.")
else:
    corr = num.corr(numeric_only=True)
    fig, ax = plt.subplots(figsize=(8, 6))
    # Diverging palette centred on 0 so positive and negative correlations differ in hue.
    sns.heatmap(corr, annot=False, cmap="coolwarm", center=0, ax=ax)
    ax.set_title("Correlation heatmap (numeric columns)")
    fig.tight_layout()
    plt.show()


ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

In [None]:
# Load data (raises a clear error if an input file is not present).
# The input paths and the figure output directory are defined here so the cell
# runs on a fresh kernel; they follow the Inputs/Outputs listed in the header.
merged_path = Path("../data_clean/baltimore_igs_merged.csv")
deltas_path = Path("../data_clean/baltimore_igs_yoy_deltas.csv")
VIS_DIR = Path("../visuals")
VIS_DIR.mkdir(parents=True, exist_ok=True)  # savefig will not create it

for _required in (merged_path, deltas_path):
    if not _required.exists():
        raise FileNotFoundError(
            f"Missing input file: {_required}. Run 01_data_cleaning.ipynb first."
        )

# Read GEOIDs as text: they are identifiers, and later cells compare them with
# strings and use them as categorical bar labels.
merged = pd.read_csv(merged_path, dtype={"geoid": str})
deltas = pd.read_csv(deltas_path, dtype={"geoid": str})
display(merged.head(), deltas.head())

## Identify primary score column

In [None]:
# Rank candidate score columns: well-known names first (same order of
# preference as the first code cell, plus 'overall_score'), then any other
# column whose name mentions "score", in file order. Without an explicit
# priority, whichever "score"-like column happens to come first would win.
_preferred_names = ('igs_score', 'inclusive_growth_score', 'overall_score', 'score')
_columns_by_lower = {c.lower(): c for c in merged.columns}

score_candidates = [_columns_by_lower[n] for n in _preferred_names if n in _columns_by_lower]
score_candidates += [
    c for c in merged.columns
    if 'score' in c.lower() and c not in score_candidates
]

# None signals "could not infer"; the next cell raises on it.
primary = score_candidates[0] if score_candidates else None
primary

## Top / Bottom movers in latest year (primary score YoY)

In [None]:
# Stop early when no primary score column could be inferred.
if primary is None:
    raise ValueError('Could not infer a primary score column. Please confirm in 01 notebook.')

# Recompute the per-tract change from the merged table itself: order rows by
# tract then year, and take each row's difference from the tract's previous row.
tmp = merged.sort_values(by=['geoid', 'year']).copy()
tmp['primary_score_yoy'] = tmp.groupby('geoid')[primary].diff()

# Keep only the most recent year and drop rows with a missing value.
last_year = int(tmp['year'].max())
_is_last_year = tmp['year'] == last_year
latest = tmp.loc[_is_last_year, ['geoid', 'year', 'primary_score_yoy']].dropna()

# Ten largest gains, then ten largest declines (most negative first).
top10 = latest.sort_values('primary_score_yoy', ascending=False).head(10)
bottom10 = latest.sort_values('primary_score_yoy', ascending=True).head(10)
top10, bottom10

## Plot: Distribution of YoY changes (primary score)

In [None]:
# Histogram of the latest-year YoY changes, saved under VIS_DIR.
# NOTE(review): assumes VIS_DIR is a pathlib.Path (it is combined with `/` below).
VIS_DIR.mkdir(parents=True, exist_ok=True)  # savefig does not create missing directories

fig, ax = plt.subplots()
latest['primary_score_yoy'].hist(bins=30, ax=ax)
ax.set_title('Distribution of YoY change in primary score (latest year)')
ax.set_xlabel('YoY change')
ax.set_ylabel('Count')

hist_path = VIS_DIR / 'igs_yoy_histogram.png'
fig.savefig(hist_path, bbox_inches='tight')
hist_path

## Plot: Top 10 & Bottom 10 Movers (bar charts)

In [None]:
# Horizontal bar chart of the ten largest YoY gains, saved under VIS_DIR.
# NOTE(review): assumes VIS_DIR is a pathlib.Path (it is combined with `/` below).
VIS_DIR.mkdir(parents=True, exist_ok=True)  # savefig does not create missing directories

fig, ax = plt.subplots()
# Ascending order puts the largest gain at the top of a horizontal bar chart.
top10_sorted = top10.sort_values('primary_score_yoy', ascending=True)
# Cast GEOIDs to text so each tract is a categorical label; integer GEOIDs
# would be treated as numeric y positions and the bars would be unreadable.
ax.barh(top10_sorted['geoid'].astype(str), top10_sorted['primary_score_yoy'])
ax.set_title('Top 10 Tracts by YoY (primary score) — latest year')
ax.set_xlabel('YoY change')

top_path = VIS_DIR / 'igs_top10_primary_yoy.png'
fig.tight_layout()
fig.savefig(top_path, bbox_inches='tight')
top_path

In [None]:
# Horizontal bar chart of the ten largest YoY declines, saved under VIS_DIR.
# NOTE(review): assumes VIS_DIR is a pathlib.Path (it is combined with `/` below).
VIS_DIR.mkdir(parents=True, exist_ok=True)  # savefig does not create missing directories

fig, ax = plt.subplots()
# Ascending order draws the most negative change first, i.e. at the bottom.
bottom10_sorted = bottom10.sort_values('primary_score_yoy', ascending=True)
# Cast GEOIDs to text so each tract is a categorical label; integer GEOIDs
# would be treated as numeric y positions and the bars would be unreadable.
ax.barh(bottom10_sorted['geoid'].astype(str), bottom10_sorted['primary_score_yoy'])
ax.set_title('Bottom 10 Tracts by YoY (primary score) — latest year')
ax.set_xlabel('YoY change')

bot_path = VIS_DIR / 'igs_bottom10_primary_yoy.png'
fig.tight_layout()
fig.savefig(bot_path, bbox_inches='tight')
bot_path

## Yearly summary table

In [None]:
# Descriptive statistics of the primary score for each year, one row per year.
_summary_stats = ['count', 'mean', 'median', 'std', 'min', 'max']
_primary_by_year = merged.groupby('year')[primary]
summary = _primary_by_year.agg(_summary_stats).reset_index()
summary

## Tract spotlight helper (enter a GEOID)

In [None]:
def tract_spotlight(geoid: str):
    """Display the score history and year-over-year changes for one tract.

    Parameters
    ----------
    geoid : str
        Tract identifier. It is converted to text, stripped of surrounding
        whitespace and left-padded with zeros to 11 characters.

    Notes
    -----
    Reads the notebook-level ``merged``, ``tmp`` and ``primary`` variables.
    Shows two tables with ``display`` and returns ``None``:

    * the tract's rows of ``merged`` (``year`` and the primary score), sorted by year;
    * the tract's rows of ``tmp`` (``year`` and ``primary_score_yoy``).
    """
    def _as_key(values):
        # Normalise the column exactly like the argument so the match does not
        # depend on whether GEOIDs were loaded as integers or as strings.
        return values.astype(str).str.strip().str.zfill(11)

    geoid = str(geoid).strip().zfill(11)
    hist = merged[_as_key(merged['geoid']) == geoid].sort_values('year')[['year', primary]]
    yoy = tmp[_as_key(tmp['geoid']) == geoid][['year', 'primary_score_yoy']]
    display(hist)
    display(yoy)

# Example usage (replace with a tract of interest):
# tract_spotlight('24510080100')