# Exploratory Analysis – Chicago Crash Inequality

Goals:
- Inspect data health (counts, match rate, missingness).
- Check hotspot label balance and key feature distributions.
- Peek at community-area coverage.


In [25]:
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 160)

DATA_DIR = Path("../data/processed")
FEATS_PATH = DATA_DIR / "intersection_features_enriched.parquet"
CRASH_PATH = DATA_DIR / "crashes_with_nodes.parquet"

print("Paths:")
print(FEATS_PATH.resolve())
print(CRASH_PATH.resolve())


Paths:
/Users/siddarthbandi/Desktop/SID/VT/intro to urban comp/project/data/processed/intersection_features_enriched.parquet
/Users/siddarthbandi/Desktop/SID/VT/intro to urban comp/project/data/processed/crashes_with_nodes.parquet


In [26]:
# Load data
feats = pd.read_parquet(FEATS_PATH)
crashes = pd.read_parquet(CRASH_PATH, columns=["intersection_id"])

print(f"Intersections: {len(feats):,}")
print(f"Hotspots (label=1): {feats['label_hotspot'].sum():,}")
match_rate = crashes["intersection_id"].notna().mean()
print(f"Crash→intersection match rate: {match_rate*100:.2f}%")


Intersections: 19,200
Hotspots (label=1): 2,292
Crash→intersection match rate: 87.96%


In [27]:
# Label balance
value_counts = feats['label_hotspot'].value_counts().rename_axis('label').to_frame('count')
value_counts['pct'] = value_counts['count'] / len(feats) * 100
value_counts


Unnamed: 0_level_0,count,pct
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,16908,88.0625
1,2292,11.9375


In [28]:
# Missingness (top 20)
missing = feats.isna().mean().sort_values(ascending=False)
missing.head(20)


acs_median_income                          0.013854
community_name                             0.005469
community_id                               0.005469
acs_pop                                    0.001354
acs_vehicle_access_rate                    0.001354
acs_poverty_universe                       0.000156
acs_households_with_vehicle                0.000156
centrality_degree                          0.000000
centrality_closeness                       0.000000
hist_severity                              0.000000
centrality_betweenness                     0.000000
recent90_people_injuries_nonincap          0.000000
recent90_people_injuries_incapacitating    0.000000
recent90_people_injuries_fatal             0.000000
recent90_people_injuries_total             0.000000
hist_people_injuries_nonincap              0.000000
hist_people_injuries_incapacitating        0.000000
hist_people_injuries_fatal                 0.000000
hist_people_injuries_total                 0.000000
recent90_cra

In [29]:
# Numeric feature summary (selected)
num_cols = [
    'hist_crashes','recent90_crashes','future_crashes',
    'hist_severity','recent90_severity','future_severity',
    'hist_people_injuries_total','recent90_people_injuries_total',
    'centrality_degree','centrality_closeness','centrality_betweenness',
    'acs_pop','acs_median_income','acs_vehicle_access_rate'
]
feats[num_cols].describe(percentiles=[0.5,0.9,0.99]).T


Unnamed: 0,count,mean,std,min,50%,90%,99%,max
hist_crashes,19200.0,5.005104,6.708229,1.0,3.0,12.0,35.0,92.0
recent90_crashes,19200.0,1.260677,1.971281,0.0,1.0,3.0,9.0,28.0
future_crashes,19200.0,2.346458,3.648235,0.0,1.0,6.0,18.0,48.0
hist_severity,19200.0,6.466302,8.87117,1.0,3.0,16.0,45.0,131.0
recent90_severity,19200.0,1.613125,2.632402,0.0,1.0,5.0,12.0,32.0
future_severity,19200.0,3.084271,4.901879,0.0,1.0,8.0,24.0,65.0
hist_people_injuries_total,19200.0,8.747031,14.449314,0.0,4.0,23.0,74.0,218.0
recent90_people_injuries_total,19200.0,2.045625,4.039559,0.0,0.0,6.0,19.0,47.0
centrality_degree,19200.0,0.000188,5.3e-05,3.4e-05,0.000204,0.000272,0.000272,0.000409
centrality_closeness,19200.0,0.015944,0.002798,0.0,0.015965,0.019594,0.021888,0.023858


In [30]:
# Community-area coverage: top/bottom by hotspot share
comm = feats.groupby('community_name').agg(
    intersections=('intersection_id','count'),
    hotspots=('label_hotspot','sum')
).reset_index()
comm['hotspot_rate'] = comm['hotspots'] / comm['intersections']
comm.sort_values('hotspot_rate', ascending=False).head(10)


Unnamed: 0,community_name,intersections,hotspots,hotspot_rate
41,LOOP,328,126,0.384146
47,NEAR NORTH SIDE,468,152,0.324786
2,ARMOUR SQUARE,131,23,0.175573
49,NEAR WEST SIDE,797,138,0.173149
71,WEST GARFIELD PARK,191,33,0.172775
48,NEAR SOUTH SIDE,178,29,0.162921
29,GREATER GRAND CROSSING,356,55,0.154494
1,ARCHER HEIGHTS,137,21,0.153285
66,UPTOWN,229,35,0.152838
8,BELMONT CRAGIN,423,64,0.1513


## Next checks
- Visuals (maps) if needed: current vs. predicted hotspots, by community area.
- Spatial autocorrelation (Moran’s I/LISA) on residuals after modeling.
- Calibration/PR curves post-modeling.
