# 03 - Exploratory Spatial Data Analysis (ESDA)

This notebook performs spatial analysis to identify patterns and clusters in the relationship between pain/distress metrics and Trump voting patterns.

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Spatial analysis libraries
import libpysal
from libpysal.weights import Queen, Rook, KNN
import esda
from esda.moran import Moran, Moran_Local, Moran_BV, Moran_Local_BV
import splot
from splot.esda import moran_scatterplot, lisa_cluster, plot_local_autocorrelation
import mapclassify

# Project layout: the project root is the parent of the current working directory
project_root = Path.cwd().parent
data_processed = project_root.joinpath('data', 'processed')
reports = project_root.joinpath('reports')

# Plot defaults shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## 1. Load Processed Data

In [2]:
# Load the merged county dataset
gdf = gpd.read_file(data_processed / 'counties_analysis.geojson')

print(f"Dataset shape: {gdf.shape}")
print(f"\nFirst 10 columns: {gdf.columns.tolist()[:10]}")
print(f"Geometry CRS: {gdf.crs}")

# Ensure we're using a projected CRS for accurate spatial calculations.
# Every geographic (lat/lon) CRS is reprojected, not only EPSG:4326, and a
# missing CRS is reported explicitly instead of failing on `None.to_epsg()`.
if gdf.crs is None:
    raise ValueError("counties_analysis.geojson has no CRS; cannot reproject to EPSG:5070")
if gdf.crs.is_geographic:
    gdf = gdf.to_crs('EPSG:5070')  # Albers Equal Area for continental US
    print(f"Reprojected to: {gdf.crs}")

# Quick data summary: non-null coverage of the variables used later on
print("\nKey variables available:")
key_vars = ['trump_share_2016', 'trump_share_2020', 'od_1316_rate', 'od_1720_rate',
            'freq_phys_distress_pct', 'depression_pct']
for var in key_vars:
    if var in gdf.columns:
        non_null = gdf[var].notna().sum()
        print(f"  {var}: {non_null}/{len(gdf)} ({non_null/len(gdf)*100:.1f}%) non-null")
    else:
        # Make a missing column visible rather than silently skipping it
        print(f"  {var}: column not found")

Dataset shape: (3109, 61)

First 10 columns: ['fips', 'county_name', 'state_fips', 'trump_votes_2016', 'opponent_votes_2016', 'two_party_votes_2016', 'total_votes_2016', 'trump_share_2016', 'trump_margin_2016', 'trump_votes_2020']
Geometry CRS: EPSG:4326
Reprojected to: EPSG:5070

Key variables available:
  trump_share_2016: 3100/3109 (99.7%) non-null
  trump_share_2020: 3100/3109 (99.7%) non-null
  od_1316_rate: 0/3109 (0.0%) non-null
  od_1720_rate: 0/3109 (0.0%) non-null
  freq_phys_distress_pct: 3109/3109 (100.0%) non-null
  depression_pct: 3109/3109 (100.0%) non-null


## 2. Spatial Weights Matrix

In [3]:
def create_spatial_weights(gdf, weight_type='queen'):
    """Build a row-standardized spatial weights object and print a summary.

    Parameters
    ----------
    gdf : GeoDataFrame
        Passed to the libpysal ``from_dataframe`` constructor.
    weight_type : str
        'queen' or 'rook' (built with ``use_index=True``) or 'knn' (``k=8``).

    Returns
    -------
    The weights object, with ``transform`` set to 'r'.

    Raises
    ------
    ValueError
        If ``weight_type`` is not one of the three supported names.
    """
    # Reject unsupported names up front so the construction code below
    # only has to tell the three valid cases apart.
    if weight_type not in ('queen', 'rook', 'knn'):
        raise ValueError(f"Unknown weight type: {weight_type}")

    if weight_type == 'knn':
        weights = KNN.from_dataframe(gdf, k=8)
    else:
        contiguity_cls = Queen if weight_type == 'queen' else Rook
        weights = contiguity_cls.from_dataframe(gdf, use_index=True)

    # Row-standardize the weights
    weights.transform = 'r'

    summary_lines = (
        f"Spatial weights summary:",
        f"  - Type: {weight_type}",
        f"  - Number of observations: {weights.n}",
        f"  - Average number of neighbors: {weights.mean_neighbors:.2f}",
        f"  - Min neighbors: {weights.min_neighbors}",
        f"  - Max neighbors: {weights.max_neighbors}",
        f"  - Islands: {weights.islands}",
    )
    for line in summary_lines:
        print(line)

    return weights

# Queen contiguity weights: neighbors are counties that share a border
w_queen = create_spatial_weights(gdf, weight_type='queen')

# Report islands (listed in w_queen.islands), if there are any
if w_queen.islands:
    print(f"\nWarning: {len(w_queen.islands)} islands detected")
    print(f"Island indices: {w_queen.islands}")

Spatial weights summary:
  - Type: queen
  - Number of observations: 3109
  - Average number of neighbors: 5.94
  - Min neighbors: 1
  - Max neighbors: 14
  - Islands: []


## 3. Global Spatial Autocorrelation

In [None]:
def global_moran_analysis(gdf, w, variables):
    """Calculate Global Moran's I for multiple variables.

    For every name in ``variables`` that is a column of ``gdf`` with at least
    30 non-null values, computes ``Moran`` with 999 permutations, saves a Moran
    scatterplot to ``reports/figures/moran_scatter_<var>.png`` and records the
    summary statistics.

    Missing values are NOT handled by ``Moran`` itself (a single NaN turns the
    statistic into NaN), so rows with a missing value are dropped and the
    weights are subset to the remaining observations before the computation.

    Parameters
    ----------
    gdf : GeoDataFrame
        Assumed to have the same row order as ``w.id_order`` (the original
        positional call ``Moran(gdf[var].values, w)`` relied on this too).
    w : libpysal weights object
        Used as-is when a variable has no missing values. Otherwise a subset is
        built with ``libpysal.weights.w_subset``, which keeps the neighbor
        structure only - NOTE(review): non-binary input weights would lose
        their values; confirm if such weights are ever passed.
    variables : iterable of str
        Column names to analyse; names not present in ``gdf`` are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per analysed variable. The columns are always present, so an
        empty result can still be sorted or filtered by the caller.
    """
    result_columns = ['variable', 'moran_i', 'expected_i', 'variance', 'z_score',
                      'p_value', 'p_value_sim', 'significant', 'n_obs']
    results = []

    for var in variables:
        if var not in gdf.columns:
            continue

        # Check for sufficient non-null values
        valid_mask = gdf[var].notna().to_numpy()
        valid_count = int(valid_mask.sum())
        if valid_count < 30:
            print(f"Skipping {var}: insufficient non-null values ({valid_count})")
            continue

        fig = None
        try:
            if valid_mask.all():
                w_var = w
            else:
                # Keep only observations with data; ids are taken by position
                keep_ids = [obs_id for obs_id, keep in zip(w.id_order, valid_mask) if keep]
                w_var = libpysal.weights.w_subset(w, keep_ids)
                print(f"Note: {var} has {len(valid_mask) - valid_count} missing values; "
                      f"computing Moran's I on {valid_count} observations")
            y = gdf.loc[valid_mask, var].to_numpy()

            mi = Moran(y, w_var, permutations=999)

            # Get variance (handle different esda versions)
            variance = getattr(mi, 'VI_norm', None)
            if variance is None:
                variance = getattr(mi, 'VI_rand', None)
            if variance is None:
                variance = mi.seI_norm**2 if hasattr(mi, 'seI_norm') else np.nan

            results.append({
                'variable': var,
                'moran_i': mi.I,
                'expected_i': mi.EI,
                'variance': variance,
                'z_score': mi.z_norm,
                'p_value': mi.p_norm,
                'p_value_sim': mi.p_sim,
                'significant': mi.p_norm < 0.05,
                'n_obs': valid_count,
            })

            # Create Moran scatterplot
            fig, ax = plt.subplots(1, 1, figsize=(8, 6))
            moran_scatterplot(mi, ax=ax)
            ax.set_title(f"Moran's I Scatterplot: {var}\nI = {mi.I:.4f}, p = {mi.p_norm:.4f}")
            plt.tight_layout()
            figures_dir = reports / 'figures'
            figures_dir.mkdir(parents=True, exist_ok=True)  # savefig does not create directories
            plt.savefig(figures_dir / f'moran_scatter_{var}.png', dpi=150, bbox_inches='tight')
            plt.show()

        except Exception as e:
            # Best effort: report the failure and continue with the next variable
            print(f"Error analyzing {var}: {e}")
            continue
        finally:
            # One figure per variable; close it so figures do not pile up in memory
            if fig is not None:
                plt.close(fig)

    return pd.DataFrame(results, columns=result_columns)

# Candidate variables for the global autocorrelation analysis
spatial_vars = [
    'trump_share_2016', 'trump_share_2020', 'trump_shift_16_20',
    'od_1316_rate', 'od_1720_rate', 'od_rate_change',
    'freq_phys_distress_pct', 'arthritis_pct', 'depression_pct',
]

# Keep only columns that exist and have more than 100 non-null values
available_vars = list(filter(
    lambda name: name in gdf.columns and gdf[name].count() > 100,
    spatial_vars,
))
print(f"Analyzing {len(available_vars)} variables: {available_vars}\n")

# Run the analysis and show the strongest autocorrelation first
moran_results = global_moran_analysis(gdf, w_queen, available_vars)
display(moran_results.sort_values(by='moran_i', ascending=False))

Analyzing 6 variables: ['trump_share_2016', 'trump_share_2020', 'trump_shift_16_20', 'freq_phys_distress_pct', 'arthritis_pct', 'depression_pct']



AttributeError: 'Moran' object has no attribute 'VI'

## 4. Local Spatial Autocorrelation (LISA)

In [None]:
def local_moran_analysis(gdf, w, variable, alpha=0.05):
    """Calculate Local Moran's I (LISA) for one variable and map the clusters.

    Adds four columns to ``gdf`` in place: ``<variable>_lisa_i``,
    ``<variable>_lisa_p``, ``<variable>_lisa_q`` and ``<variable>_lisa_cluster``.
    It also saves a two-panel figure (raw values, LISA clusters) to
    ``reports/figures/lisa_cluster_<variable>.png`` and prints the cluster counts.

    Rows where ``variable`` is missing are excluded from the computation,
    because a single NaN would otherwise turn every local statistic into NaN.
    Those rows get NaN statistics and the cluster label 'No Data'.

    Parameters
    ----------
    gdf : GeoDataFrame
        Assumed to have the same row order as ``w.id_order``.
    w : libpysal weights object
        Used as-is when there are no missing values; otherwise subset with
        ``libpysal.weights.w_subset`` (neighbor structure only).
    variable : str
        Column to analyse.
    alpha : float
        Pseudo p-value threshold below which a quadrant is labelled.

    Returns
    -------
    (lisa, gdf)
        The ``Moran_Local`` result, computed on non-missing rows only, and the
        same ``gdf`` object with the new columns.
    """
    i_col = f'{variable}_lisa_i'
    p_col = f'{variable}_lisa_p'
    q_col = f'{variable}_lisa_q'
    cluster_col = f'{variable}_lisa_cluster'

    valid_mask = gdf[variable].notna().to_numpy()
    has_missing = not valid_mask.all()
    if has_missing:
        keep_ids = [obs_id for obs_id, keep in zip(w.id_order, valid_mask) if keep]
        w_var = libpysal.weights.w_subset(w, keep_ids)
        print(f"Note: {int((~valid_mask).sum())} rows with missing {variable} excluded from LISA")
    else:
        w_var = w

    # Calculate Local Moran's I
    lisa = Moran_Local(gdf.loc[valid_mask, variable].to_numpy(), w_var, permutations=999)

    def _on_full_index(values):
        """Align per-observation results to every row of gdf (NaN where input was missing)."""
        if not has_missing:
            return values
        full = np.full(len(gdf), np.nan)
        full[valid_mask] = values
        return full

    # Add LISA statistics to geodataframe
    gdf[i_col] = _on_full_index(lisa.Is)
    gdf[p_col] = _on_full_index(lisa.p_sim)
    gdf[q_col] = _on_full_index(lisa.q)

    # Identify significant clusters (NaN p-values compare False, so they stay unlabelled)
    sig = gdf[p_col] < alpha
    quadrant_labels = {1: 'HH', 2: 'LH', 3: 'LL', 4: 'HL'}
    gdf[cluster_col] = 'Not Significant'
    for quadrant, label in quadrant_labels.items():
        gdf.loc[sig & (gdf[q_col] == quadrant), cluster_col] = label
    if has_missing:
        gdf.loc[~valid_mask, cluster_col] = 'No Data'

    # Create LISA cluster map
    fig, axes = plt.subplots(1, 2, figsize=(16, 8))

    # Raw values map
    gdf.plot(column=variable, scheme='quantiles', k=5, cmap='RdBu_r',
             edgecolor='white', linewidth=0.1, ax=axes[0], legend=True)
    axes[0].set_title(f'{variable} - Raw Values')
    axes[0].axis('off')

    # LISA cluster map; colors are kept in a local Series so no scratch
    # column is left behind in the returned frame
    colors = {'HH': '#d62728', 'HL': '#ff9896', 'LH': '#9edae5', 'LL': '#1f77b4', 'Not Significant': '#7f7f7f'}
    if has_missing:
        colors['No Data'] = '#dddddd'
    cluster_colors = gdf[cluster_col].map(colors)
    gdf.plot(color=cluster_colors, edgecolor='white', linewidth=0.1, ax=axes[1])
    axes[1].set_title(f'{variable} - LISA Clusters')
    axes[1].axis('off')

    # Add legend
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=color, label=label) for label, color in colors.items()]
    axes[1].legend(handles=legend_elements, loc='lower left')

    plt.tight_layout()
    figures_dir = reports / 'figures'
    figures_dir.mkdir(parents=True, exist_ok=True)  # savefig does not create directories
    plt.savefig(figures_dir / f'lisa_cluster_{variable}.png', dpi=150, bbox_inches='tight')
    plt.show()
    plt.close(fig)

    # Summary statistics
    cluster_counts = gdf[cluster_col].value_counts()
    print(f"\nLISA Cluster Summary for {variable}:")
    print(cluster_counts)

    return lisa, gdf

# Trump 2016 vote share is analysed unconditionally
print("Analyzing spatial clusters of Trump 2016 support...")
lisa_trump_2016, gdf = local_moran_analysis(gdf, w_queen, variable='trump_share_2016')

# Overdose rates: only when the column exists with more than 100 non-null values
if 'od_1316_rate' in gdf and gdf['od_1316_rate'].count() > 100:
    print("\nAnalyzing spatial clusters of overdose rates (2013-2016)...")
    lisa_overdose, gdf = local_moran_analysis(gdf, w_queen, variable='od_1316_rate')

# Physical distress: same availability rule
if 'freq_phys_distress_pct' in gdf and gdf['freq_phys_distress_pct'].count() > 100:
    print("\nAnalyzing spatial clusters of physical distress...")
    lisa_distress, gdf = local_moran_analysis(gdf, w_queen, variable='freq_phys_distress_pct')

## 5. Bivariate Spatial Analysis

In [None]:
def bivariate_moran_analysis(gdf, w, var1, var2):
    """Bivariate Moran's I to test spatial correlation between two variables.

    Computes the global ``Moran_BV`` and the local ``Moran_Local_BV`` (both
    with 999 permutations) and adds ``bv_<var1>_<var2>_lisa_i``, ``..._lisa_p``,
    ``..._lisa_q`` and ``..._lisa_cluster`` to ``gdf`` in place. It also saves a
    three-panel figure to ``reports/figures/bivariate_lisa_<var1>_<var2>.png``.
    The generic ``bv_cluster`` column is still written for backward
    compatibility, but it is overwritten by every call; use the pair-specific
    ``..._lisa_cluster`` column instead.

    ``Moran_BV`` only provides permutation-based inference, so the reported
    p-value is ``p_sim`` (there is no ``p_norm`` attribute on that class).

    Rows where either variable is missing are excluded from the computation
    (NaN would otherwise propagate into every statistic). They receive NaN
    statistics and the cluster label 'No Data'.

    Parameters
    ----------
    gdf : GeoDataFrame
        Assumed to have the same row order as ``w.id_order``.
    w : libpysal weights object
        Used as-is when nothing is missing; otherwise subset with
        ``libpysal.weights.w_subset`` (neighbor structure only).
    var1, var2 : str
        Column names, passed as the first and second variable respectively to
        the esda bivariate classes.

    Returns
    -------
    (moran_bv, lisa_bv, gdf)
    """
    prefix = f'bv_{var1}_{var2}_lisa'
    i_col, p_col, q_col, cluster_col = (f'{prefix}_i', f'{prefix}_p', f'{prefix}_q', f'{prefix}_cluster')

    valid_mask = gdf[[var1, var2]].notna().all(axis=1).to_numpy()
    has_missing = not valid_mask.all()
    if has_missing:
        keep_ids = [obs_id for obs_id, keep in zip(w.id_order, valid_mask) if keep]
        w_pair = libpysal.weights.w_subset(w, keep_ids)
        print(f"Note: {int((~valid_mask).sum())} rows with missing {var1} or {var2} excluded")
    else:
        w_pair = w
    x = gdf.loc[valid_mask, var1].to_numpy()
    y = gdf.loc[valid_mask, var2].to_numpy()

    # Global bivariate Moran's I
    moran_bv = Moran_BV(x, y, w_pair, permutations=999)

    print(f"\nBivariate Moran's I: {var1} vs {var2}")
    print(f"  I = {moran_bv.I:.4f}")
    # Pseudo p-value from the permutations
    print(f"  p-value = {moran_bv.p_sim:.4f}")
    print(f"  Significant: {moran_bv.p_sim < 0.05}")

    # Local bivariate Moran's I
    lisa_bv = Moran_Local_BV(x, y, w_pair, permutations=999)

    def _on_full_index(values):
        """Align per-observation results to every row of gdf (NaN where input was missing)."""
        if not has_missing:
            return values
        full = np.full(len(gdf), np.nan)
        full[valid_mask] = values
        return full

    # Add to geodataframe
    gdf[i_col] = _on_full_index(lisa_bv.Is)
    gdf[p_col] = _on_full_index(lisa_bv.p_sim)
    gdf[q_col] = _on_full_index(lisa_bv.q)

    # Create bivariate LISA cluster map
    fig, axes = plt.subplots(1, 3, figsize=(20, 6))

    # Variable 1 map
    gdf.plot(column=var1, scheme='quantiles', k=5, cmap='Blues',
             edgecolor='white', linewidth=0.1, ax=axes[0], legend=True)
    axes[0].set_title(f'{var1}')
    axes[0].axis('off')

    # Variable 2 map
    gdf.plot(column=var2, scheme='quantiles', k=5, cmap='Reds',
             edgecolor='white', linewidth=0.1, ax=axes[1], legend=True)
    axes[1].set_title(f'{var2}')
    axes[1].axis('off')

    # Bivariate LISA clusters (NaN p-values compare False, so they stay unlabelled)
    sig = gdf[p_col] < 0.05
    quadrant_labels = {1: 'HH', 2: 'LH', 3: 'LL', 4: 'HL'}
    gdf[cluster_col] = 'Not Significant'
    for quadrant, label in quadrant_labels.items():
        gdf.loc[sig & (gdf[q_col] == quadrant), cluster_col] = label
    if has_missing:
        gdf.loc[~valid_mask, cluster_col] = 'No Data'
    gdf['bv_cluster'] = gdf[cluster_col]  # legacy column name, overwritten on every call

    colors = {'HH': '#8b0000', 'HL': '#ff6347', 'LH': '#4169e1', 'LL': '#87ceeb', 'Not Significant': '#d3d3d3'}
    if has_missing:
        colors['No Data'] = '#696969'
    # Colors stay in a local Series so no scratch column is left in the frame
    cluster_colors = gdf[cluster_col].map(colors)
    gdf.plot(color=cluster_colors, edgecolor='white', linewidth=0.1, ax=axes[2])
    axes[2].set_title(f'Bivariate LISA: {var1} vs {var2}')
    axes[2].axis('off')

    # Legend
    from matplotlib.patches import Patch
    legend_elements = [
        Patch(facecolor=colors['HH'], label='HH: High-High'),
        Patch(facecolor=colors['HL'], label='HL: High-Low'),
        Patch(facecolor=colors['LH'], label='LH: Low-High'),
        Patch(facecolor=colors['LL'], label='LL: Low-Low'),
        Patch(facecolor=colors['Not Significant'], label='Not Significant')
    ]
    if has_missing:
        legend_elements.append(Patch(facecolor=colors['No Data'], label='No Data'))
    axes[2].legend(handles=legend_elements, loc='lower left')

    plt.tight_layout()
    figures_dir = reports / 'figures'
    figures_dir.mkdir(parents=True, exist_ok=True)  # savefig does not create directories
    plt.savefig(figures_dir / f'bivariate_lisa_{var1}_{var2}.png', dpi=150, bbox_inches='tight')
    plt.show()
    plt.close(fig)

    # Summary
    print("\nBivariate LISA Cluster Counts:")
    print(gdf[cluster_col].value_counts())

    return moran_bv, lisa_bv, gdf

# Overdose rates (2013-2016) against Trump 2016 vote share, when the overdose
# column exists with more than 100 non-null values
if 'od_1316_rate' in gdf and gdf['od_1316_rate'].count() > 100:
    print("\nBivariate analysis: Overdose rates vs Trump 2016 support...")
    moran_bv, lisa_bv, gdf = bivariate_moran_analysis(
        gdf, w_queen, var1='od_1316_rate', var2='trump_share_2016'
    )

# Physical distress against Trump 2016 vote share, same availability rule
if 'freq_phys_distress_pct' in gdf and gdf['freq_phys_distress_pct'].count() > 100:
    print("\nBivariate analysis: Physical distress vs Trump 2016 support...")
    moran_bv2, lisa_bv2, gdf = bivariate_moran_analysis(
        gdf, w_queen, var1='freq_phys_distress_pct', var2='trump_share_2016'
    )

## 6. Hot Spot Analysis (Getis-Ord Gi*)

In [None]:
from esda.getisord import G_Local

def hotspot_analysis(gdf, w, variable):
    """Getis-Ord Gi* hot spot analysis for one variable.

    Runs ``G_Local`` with ``star=True`` and 999 permutations, then adds
    ``<variable>_gi``, ``<variable>_gi_p``, ``<variable>_gi_z``,
    ``<variable>_hotspot`` and ``<variable>_hotspot_conf`` to ``gdf`` in place.
    It also saves a map to ``reports/figures/hotspot_<variable>.png`` and
    prints the counts per confidence class.

    Rows where ``variable`` is missing are excluded from the computation
    (a NaN would otherwise propagate into every statistic). They receive NaN
    statistics and the label 'No Data' in both classification columns.

    Parameters
    ----------
    gdf : GeoDataFrame
        Assumed to have the same row order as ``w.id_order``.
    w : libpysal weights object
        Used as-is when nothing is missing; otherwise subset with
        ``libpysal.weights.w_subset`` (neighbor structure only).
    variable : str
        Column to analyse.

    Returns
    -------
    (g, gdf)
        The ``G_Local`` result, computed on non-missing rows only, and the
        same ``gdf`` object with the new columns.
    """
    gi_col = f'{variable}_gi'
    p_col = f'{variable}_gi_p'
    z_col = f'{variable}_gi_z'
    hotspot_col = f'{variable}_hotspot'
    conf_col = f'{variable}_hotspot_conf'

    valid_mask = gdf[variable].notna().to_numpy()
    has_missing = not valid_mask.all()
    if has_missing:
        keep_ids = [obs_id for obs_id, keep in zip(w.id_order, valid_mask) if keep]
        w_var = libpysal.weights.w_subset(w, keep_ids)
        print(f"Note: {int((~valid_mask).sum())} rows with missing {variable} excluded from Gi*")
    else:
        w_var = w

    # Calculate Gi*
    g = G_Local(gdf.loc[valid_mask, variable].to_numpy(), w_var, permutations=999, star=True)

    def _on_full_index(values):
        """Align per-observation results to every row of gdf (NaN where input was missing)."""
        if not has_missing:
            return values
        full = np.full(len(gdf), np.nan)
        full[valid_mask] = values
        return full

    # Add to geodataframe
    gdf[gi_col] = _on_full_index(g.Gs)
    gdf[p_col] = _on_full_index(g.p_sim)
    gdf[z_col] = _on_full_index(g.Zs)

    p_values = gdf[p_col]
    is_hot = gdf[z_col] > 0
    is_cold = gdf[z_col] < 0

    # Classify hot/cold spots based on z-scores and p-values
    gdf[hotspot_col] = 'Not Significant'
    sig = p_values < 0.05
    gdf.loc[sig & is_hot, hotspot_col] = 'Hot Spot'
    gdf.loc[sig & is_cold, hotspot_col] = 'Cold Spot'

    # Further classify by confidence level. Thresholds are applied from the
    # loosest to the strictest so each tighter tier overwrites the looser one,
    # which yields the bands [0.05, 0.10), [0.01, 0.05) and [0, 0.01).
    gdf[conf_col] = 'Not Significant'
    for threshold, level in ((0.10, '90%'), (0.05, '95%'), (0.01, '99%')):
        in_tier = p_values < threshold
        gdf.loc[in_tier & is_hot, conf_col] = f'Hot Spot - {level} Conf'
        gdf.loc[in_tier & is_cold, conf_col] = f'Cold Spot - {level} Conf'

    if has_missing:
        gdf.loc[~valid_mask, [hotspot_col, conf_col]] = 'No Data'

    # Create hot spot map
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))

    colors = {
        'Hot Spot - 99% Conf': '#b30000',
        'Hot Spot - 95% Conf': '#e34a33',
        'Hot Spot - 90% Conf': '#fc8d59',
        'Not Significant': '#ffffcc',
        'Cold Spot - 90% Conf': '#91bfdb',
        'Cold Spot - 95% Conf': '#4575b4',
        'Cold Spot - 99% Conf': '#253494'
    }
    if has_missing:
        colors['No Data'] = '#bdbdbd'

    # Colors stay in a local Series so no scratch column is left in the frame
    class_colors = gdf[conf_col].map(colors)
    gdf.plot(color=class_colors, edgecolor='white', linewidth=0.1, ax=ax)
    ax.set_title(f'Hot Spot Analysis (Getis-Ord Gi*): {variable}')
    ax.axis('off')

    # Legend
    from matplotlib.patches import Patch
    legend_elements = [Patch(facecolor=color, label=label) for label, color in colors.items()]
    ax.legend(handles=legend_elements, loc='lower left')

    plt.tight_layout()
    figures_dir = reports / 'figures'
    figures_dir.mkdir(parents=True, exist_ok=True)  # savefig does not create directories
    plt.savefig(figures_dir / f'hotspot_{variable}.png', dpi=150, bbox_inches='tight')
    plt.show()
    plt.close(fig)

    # Summary
    print(f"\nHot Spot Analysis Summary for {variable}:")
    print(gdf[conf_col].value_counts())

    return g, gdf

# Trump 2016 vote share is analysed unconditionally
print("\nHot spot analysis: Trump 2016 support...")
g_trump, gdf = hotspot_analysis(gdf, w_queen, variable='trump_share_2016')

# Overdose rates: only when the column exists with more than 100 non-null values
if 'od_1316_rate' in gdf and gdf['od_1316_rate'].count() > 100:
    print("\nHot spot analysis: Overdose rates (2013-2016)...")
    g_overdose, gdf = hotspot_analysis(gdf, w_queen, variable='od_1316_rate')

# Physical distress: same availability rule
if 'freq_phys_distress_pct' in gdf and gdf['freq_phys_distress_pct'].count() > 100:
    print("\nHot spot analysis: Physical distress...")
    g_distress, gdf = hotspot_analysis(gdf, w_queen, variable='freq_phys_distress_pct')

## 7. Export Results for Web Visualization

In [None]:
def prepare_web_export(gdf):
    """Prepare data for web visualization and write it as GeoJSON.

    Keeps the subset of ``web_columns`` that exists in ``gdf``, reprojects to
    EPSG:4326 if needed, simplifies geometries with a tolerance of 0.01 (in
    the units of the EPSG:4326 frame), rounds numeric columns to 2 decimals
    and writes ``web/assets/counties_esda.geojson`` under ``project_root``.

    Parameters
    ----------
    gdf : GeoDataFrame
        Must contain a 'geometry' column and have a CRS set.

    Returns
    -------
    GeoDataFrame
        The exported frame (a copy; ``gdf`` itself is not modified).

    Raises
    ------
    ValueError
        If the frame has no CRS, since it could not be converted to WGS84.
    """
    # Select columns for web export (filter to what's actually available)
    web_columns = [
        'fips', 'county_name', 'state_fips', 'geometry',
        # Electoral
        'trump_share_2016', 'trump_share_2020', 'trump_shift_16_20',
        'trump_margin_2016', 'trump_margin_2020',
        # Pain/distress metrics
        'od_1316_rate', 'od_1720_rate', 'od_rate_change',
        'freq_phys_distress_pct', 'freq_mental_distress_pct',
        'arthritis_pct', 'depression_pct', 'diabetes_pct',
        # County Health Rankings
        'chr_drug_overdose_deaths_per_100k', 'chr_poor_physical_health_days',
        'chr_poor_mental_health_days',
        # LISA clusters (if computed)
        'trump_share_2016_lisa_cluster',
        'od_1316_rate_lisa_cluster',
        'freq_phys_distress_pct_lisa_cluster',
        # Hot spots (if computed)
        'trump_share_2016_hotspot_conf',
        'od_1316_rate_hotspot_conf',
        'freq_phys_distress_pct_hotspot_conf',
        # Controls
        'rucc', 'rural', 'rucc_category', 'ba_plus_pct', 'median_income'
    ]

    # Filter to available columns
    available_cols = [c for c in web_columns if c in gdf.columns]
    print(f"Exporting {len(available_cols)} columns: {available_cols[:10]}...")

    web_gdf = gdf[available_cols].copy()

    # Convert back to WGS84 for web mapping. Without a CRS the frame cannot be
    # reprojected, so fail with a clear message instead of on `None.to_epsg()`.
    if web_gdf.crs is None:
        raise ValueError("Cannot export: the GeoDataFrame has no CRS to convert to EPSG:4326")
    if web_gdf.crs.to_epsg() != 4326:
        web_gdf = web_gdf.to_crs('EPSG:4326')

    # Simplify geometry to reduce file size
    web_gdf['geometry'] = web_gdf['geometry'].simplify(0.01, preserve_topology=True)

    # Round numeric columns
    numeric_cols = web_gdf.select_dtypes(include=[np.number]).columns
    web_gdf[numeric_cols] = web_gdf[numeric_cols].round(2)

    # Export as GeoJSON
    output_path = project_root / 'web' / 'assets' / 'counties_esda.geojson'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    web_gdf.to_file(output_path, driver='GeoJSON')

    # The check mark was mis-encoded (UTF-8 bytes shown as Latin-1); restored here
    print(f"✅ Exported {len(web_gdf)} counties to {output_path}")
    print(f"   File size: {output_path.stat().st_size / 1024 / 1024:.2f} MB")

    return web_gdf

# Export the enriched dataset with spatial analysis results.
# prepare_web_export only keeps columns that exist in gdf, so LISA / hot spot
# columns are included only if the corresponding analysis cells were run.
web_data = prepare_web_export(gdf)

## 8. Summary of Findings

Key spatial patterns identified:

### Global Spatial Autocorrelation
- Review the Moran's I table above to see which variables show significant spatial clustering
- Variables with high positive Moran's I indicate strong spatial autocorrelation (similar values cluster together)
- Variables with negative Moran's I show spatial dispersion (dissimilar values are neighbors)

### Local Clustering (LISA)
- **HH (High-High)**: Counties with high values surrounded by high-value neighbors
- **LL (Low-Low)**: Counties with low values surrounded by low-value neighbors  
- **HL (High-Low)**: High-value outliers surrounded by low-value neighbors
- **LH (Low-High)**: Low-value outliers surrounded by high-value neighbors

### Bivariate Relationships
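- Bivariate LISA maps show where the value of one variable in a county is associated with the values of a second variable in its neighboring counties
- HH clusters indicate counties where the first variable is high AND the second variable is high among the county's neighbors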
- These are the key "hotspots" for the pain-politics relationship

### Hot Spot Analysis (Getis-Ord Gi*)
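- Classifies each county as part of a hot spot (high values concentrated nearby) or a cold spot (low values concentrated nearby); unlike LISA, it does not flag HL/LH outliers
- Results are reported at multiple confidence levels (90%, 95%, 99%)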
- Useful for identifying statistically significant geographic concentrations

### Next Steps
1. Review the generated maps in `reports/figures/`
2. Examine specific high-value clusters for qualitative interpretation
3. Use these spatial patterns as inputs for regression models (Notebook 04)
4. Explore interactive visualization in the web interface
