## C4. Hotspot Regional Analysis

**Description**  
This section identifies statistically significant hotspots and coldspots of hospital AI adoption across the United States. Using Getis-Ord Gi* and Local Moran's I statistics, the analysis pinpoints specific geographic locations where AI adoption is significantly higher (hotspots) or lower (coldspots) than would be expected if adoption were spatially random.

**Purpose**  
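To identify where hospital AI adoption clusters geographically, as hotspots (significantly high adoption) and coldspots (significantly low adoption), so that regional patterns can be compared.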



### 1 load necessary libraries, functions and preprocessed data 

In [45]:
# Import necessary libraries
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
import warnings
from scipy import stats
from scipy.spatial import distance_matrix
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as mpatches
from matplotlib.gridspec import GridSpec
from sklearn.neighbors import NearestNeighbors
import os


In [None]:
# Read the AHA master file, keep only the rows with a non-null `id_it`,
# and pass them through apply_ai_scores_to_dataframe (defined outside this
# section; presumably it adds the AI score columns analysed below).
AHA_master = pd.read_csv('./data/AHA_master_external_data.csv', low_memory=False)
AHA_IT = AHA_master[AHA_master['id_it'].notna()]
AHA_master2 = apply_ai_scores_to_dataframe(AHA_IT)

In [48]:
import os
# SHAPE_RESTORE_SHX is a GDAL Shapefile-driver option: with 'YES' a missing
# .shx index is rebuilt when the .shp is opened instead of failing.
os.environ['SHAPE_RESTORE_SHX'] = 'YES'
# State boundary layer; the path is relative to the current working directory.
states = gpd.read_file('../../../data/map_data/state_boundary.shp')

#### 2 data engineering 

In [None]:
# Remove rows with missing coordinates
AHA_master2 = AHA_master2.dropna(subset=['lat_as', 'long_as'])

# Filter out invalid coordinates.
# The mask is built from AHA_master2 itself (the frame being filtered) so its
# index always lines up with the rows it selects. Building it from the
# unfiltered AHA_master relied on implicit index re-alignment.
# (0, 0) is treated as a placeholder for "unknown location".
valid_coords = (
    (AHA_master2['lat_as'] != 0)
    & (AHA_master2['long_as'] != 0)
    & AHA_master2['lat_as'].between(-90, 90)
    & AHA_master2['long_as'].between(-180, 180)
)
AHA_master2 = AHA_master2[valid_coords]

print(f"Number of hospitals with valid coordinates: {len(AHA_master2)}")

# Create GeoDataFrame (lon/lat points in WGS84)
hospitals = gpd.GeoDataFrame(
    AHA_master2,
    geometry=gpd.points_from_xy(AHA_master2.long_as, AHA_master2.lat_as),
    crs="EPSG:4326"
)
print(f"Successfully created GeoDataFrame with {len(hospitals)} hospitals")

In [None]:

# Two working subsets of `hospitals`:
#   valid_geo_hospitals - rows with both coordinates present
#   valid_hospitals     - the same rows, additionally requiring `aipred_it`
valid_geo_hospitals = hospitals.dropna(subset=['long_as', 'lat_as'])
valid_hospitals = valid_geo_hospitals.dropna(subset=['aipred_it'])

# Point layer for hospitals that also have `aipred_it` (WGS84 lon/lat)
hospitals_gdf = gpd.GeoDataFrame(
    valid_hospitals,
    geometry=gpd.points_from_xy(
        x=valid_hospitals['long_as'],
        y=valid_hospitals['lat_as'],
    ),
    crs="EPSG:4326",
)

# Point layer for every hospital with coordinates (WGS84 lon/lat)
geo_hospitals_gdf = gpd.GeoDataFrame(
    valid_geo_hospitals,
    geometry=gpd.points_from_xy(
        x=valid_geo_hospitals['long_as'],
        y=valid_geo_hospitals['lat_as'],
    ),
    crs="EPSG:4326",
)


In [51]:
# Reproject both point layers from geographic lon/lat (EPSG:4326) to the planar
# Web Mercator CRS (EPSG:3857) so neighbour searches work on x/y coordinates.
# NOTE(review): calculate_gi_star_all's docstring prefers EPSG:5070; Web Mercator
# scale varies with latitude - confirm 3857 is intended for distance-based k-NN.
hospitals_gdf_projected = hospitals_gdf.to_crs("EPSG:3857")
geo_hospitals_gdf_projected = geo_hospitals_gdf.to_crs("EPSG:3857")



In [52]:

# Add census division column to both projected layers by mapping the state
# column `mstate_it` through `state_to_division` (defined outside this section;
# states missing from that mapping end up as NaN).
hospitals_gdf_projected['division'] = hospitals_gdf_projected['mstate_it'].map(state_to_division)
geo_hospitals_gdf_projected['division'] = geo_hospitals_gdf_projected['mstate_it'].map(state_to_division)

# Names of the nine census divisions. Nothing in this cell iterates over them;
# presumably a later per-division step uses the list - TODO confirm.
divisions = [
    'New England', 'Mid Atlantic', 'South Atlantic', 
    'East North Central', 'East South Central', 'West North Central',
    'West South Central', 'Mountain', 'Pacific'
]


#### 3 Run Hotspot analysis 

In [None]:

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy import stats
from statsmodels.stats.multitest import multipletests

# ========= Storey q-value helpers =========
def storey_qvalue_python(pvals, lambdas=None):
    """
    Pure-Python/NumPy Storey q-values.

    pi0 (the null proportion) is estimated as the smallest value of
    mean(p > lambda) / (1 - lambda) over the lambda grid, capped at 1.
    q-values are then pi0 * m * p / rank, made monotone in p and clipped
    to [0, 1].

    Inputs
    - pvals: array-like of p-values
    - lambdas: optional iterable of tuning values; defaults to
      np.arange(0.05, 0.95, 0.05). Entries with 1 - lambda <= 0 are ignored.

    Outputs
    - float array of q-values in the same order as pvals (empty in, empty out)
    """
    pv = np.asarray(pvals, dtype=float)
    n_tests = pv.size
    if n_tests == 0:
        return pv

    grid = np.arange(0.05, 0.95, 0.05) if lambdas is None else lambdas

    # One pi0 estimate per usable lambda
    estimates = [
        (pv > lam).mean() / (1.0 - lam)
        for lam in grid
        if not (1.0 - lam <= 0)
    ]
    pi0 = min(1.0, np.min(estimates)) if len(estimates) else 1.0

    # Rank-scaled values in ascending p order
    ascending = np.argsort(pv)
    scaled = pi0 * n_tests * pv[ascending] / np.arange(1, n_tests + 1)

    # Running minimum from the largest p downwards enforces monotonicity
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]

    # Scatter back to the caller's ordering
    qvals = np.empty_like(monotone)
    qvals[ascending] = monotone
    return np.clip(qvals, 0, 1)


# ========= Classification =========
def _classify_levels(sign, p_like):
    """
    Map sign and p-like values to hotspot classes at 99, 95, 90 percent levels.
    p_like can be unadjusted p, Bonferroni p, BH-FDR p, or Storey q.
    """
    lab = np.array(["Not Significant"] * len(p_like), dtype=object)
    lab[(sign > 0) & (p_like <= 0.01)] = "Hotspot (99%)"
    lab[(sign < 0) & (p_like <= 0.01)] = "Coldspot (99%)"
    lab[(sign > 0) & (p_like > 0.01) & (p_like <= 0.05)] = "Hotspot (95%)"
    lab[(sign < 0) & (p_like > 0.01) & (p_like <= 0.05)] = "Coldspot (95%)"
    lab[(sign > 0) & (p_like > 0.05) & (p_like <= 0.1)] = "Hotspot (90%)"
    lab[(sign < 0) & (p_like > 0.05) & (p_like <= 0.1)] = "Coldspot (90%)"
    return lab

# ========= Core Gi* function =========
def calculate_gi_star_all(gdf, value_column, k=6):
    """
    Compute local Getis-Ord Gi* with k-NN (including self) and multiple testing corrections.

    Inputs
    - gdf: GeoDataFrame with Point geometry (prefer EPSG:5070)
    - value_column: column to analyze (numeric)
    - k: number of nearest neighbors (self included via k+1)

    Outputs
    Returns a copy of gdf (rows with non-null value) with these columns:
    - gi_star: raw Gi* sum over neighbors
    - z_score: standardized Gi* Z
    - p_unadj: unadjusted two-sided p
    - p_bonf: Bonferroni adjusted p
    - p_bh: BH-FDR adjusted p
    - q_storey: Storey q-value
    - hotspot_unadj: class from p_unadj
    - hotspot_bonf: class from p_bonf
    - hotspot_bh: class from p_bh
    - hotspot_storey: class from q_storey

    If the number of non-null observations is not greater than k, an error
    message is printed and the input gdf is returned unchanged, i.e. WITHOUT
    the result columns listed above.

    Notes
    - Weights are binary over the k+1 nearest points, so the weight sums are the
      same for every location and the variance term is computed once.
    - When the Gi* variance is not positive (e.g. constant values, or n == k+1)
      every location gets z_score 0 and p 1.
    - A short summary of hot/cold counts per correction is printed.
    """
    # coords and values, restricted to rows with a usable value
    coords = np.vstack((gdf.geometry.x, gdf.geometry.y)).T
    values = gdf[value_column].to_numpy(dtype=float)
    valid = ~np.isnan(values)
    coords, values = coords[valid], values[valid]
    n = len(values)
    if n <= k:
        print(f"Error: Not enough observations ({n}) for k={k}")
        return gdf
    out = gdf.loc[valid].copy()

    # kNN including self (k+1)
    # NOTE(review): with more than k+1 points at identical coordinates the
    # neighbour set may not contain the point itself - confirm if that matters.
    nn = NearestNeighbors(n_neighbors=k + 1)
    nn.fit(coords)
    _, indices = nn.kneighbors(coords)  # shape (n, k+1)

    # global stats (Ord & Getis, 1995)
    x_bar = values.mean()
    s_squared = (np.sum(values**2) / n) - x_bar**2

    # binary weights: sum w_ij == sum w_ij^2 == k+1 for every location
    wi_sum = float(indices.shape[1])
    wi2_sum = wi_sum

    # Gi*: observed neighbourhood sums, all rows at once
    Gs = values[indices].sum(axis=1)
    EGi = x_bar * wi_sum
    varGi = s_squared * ((n * wi2_sum - wi_sum**2) / (n - 1)) if n > 1 else 0.0

    if varGi > 0:  # also False when varGi is NaN
        Zs = (Gs - EGi) / np.sqrt(varGi)
        # sf(|z|) instead of 1 - cdf(|z|): the latter rounds to exactly 0 for
        # large |z|, which would erase the ordering of the strongest clusters.
        Ps = 2.0 * stats.norm.sf(np.abs(Zs))
    else:
        Zs = np.zeros(n)
        Ps = np.ones(n)

    # multiple testing corrections
    m = n
    p_unadj = Ps
    p_bonf = np.clip(p_unadj * m, 0, 1)
    _, p_bh, _, _ = multipletests(p_unadj, alpha=0.05, method='fdr_bh')
    q_storey = storey_qvalue_python(p_unadj)

    # labels
    sign = np.sign(Zs)
    out['gi_star'] = Gs
    out['z_score'] = Zs
    out['p_unadj'] = p_unadj
    out['p_bonf'] = p_bonf
    out['p_bh'] = p_bh
    out['q_storey'] = q_storey
    out['hotspot_unadj'] = _classify_levels(sign, p_unadj)
    out['hotspot_bonf'] = _classify_levels(sign, p_bonf)
    out['hotspot_bh'] = _classify_levels(sign, p_bh)
    out['hotspot_storey'] = _classify_levels(sign, q_storey)

    # column annotations for reference
    out.attrs["columns_doc"] = {
        "gi_star": "raw Gi* statistic (Ord & Getis, 1995)",
        "z_score": "standardized Z-score for Gi*",
        "p_unadj": "unadjusted two-sided p-value",
        "p_bonf": "Bonferroni-adjusted p-value",
        "p_bh": "Benjamini-Hochberg FDR-adjusted p-value",
        "q_storey": "Storey q-value (FDR with pi0 estimated)",
        "hotspot_unadj": "hotspot class from unadjusted p",
        "hotspot_bonf": "hotspot class from Bonferroni p",
        "hotspot_bh": "hotspot class from BH-FDR p",
        "hotspot_storey": "hotspot class from Storey q-value"
    }

    # quick summary: number of labels containing the given key per correction
    def _cnt(col, key):
        return int(out[col].str.contains(key).sum())

    print(f"Total locations: {n}")
    print(f"Unadj  Hot:{_cnt('hotspot_unadj','Hotspot')}  Cold:{_cnt('hotspot_unadj','Coldspot')}")
    print(f"BH-FDR Hot:{_cnt('hotspot_bh','Hotspot')}  Cold:{_cnt('hotspot_bh','Coldspot')}")
    print(f"Storey Hot:{_cnt('hotspot_storey','Hotspot')}  Cold:{_cnt('hotspot_storey','Coldspot')}")
    print(f"Bonf  Hot:{_cnt('hotspot_bonf','Hotspot')}  Cold:{_cnt('hotspot_bonf','Coldspot')}")

    return out




### C4_3 run hotspot 

In [None]:
# 1. Conduct hotspot analysis for the entire US
print("Performing hotspot analysis for the entire US...")
base_hotspot_results = calculate_gi_star_all(hospitals_gdf_projected, 'ai_base_score_imputed', k=6)
breadth_hotspot_results = calculate_gi_star_all(hospitals_gdf_projected, 'ai_base_breadth_score_imputed', k=6)
dev_hotspot_results = calculate_gi_star_all(hospitals_gdf_projected, 'ai_base_dev_score_imputed', k=6)
eval2023_hotspot_results = calculate_gi_star_all(hospitals_gdf_projected, 'ai_base_eval_score_2023_imputed', k=6)
eval2024_hotspot_results = calculate_gi_star_all(hospitals_gdf_projected, 'ai_base_eval_score_2024_imputed', k=6)
llm_hotspot_results = calculate_gi_star_all(hospitals_gdf_projected, 'llm_readiness_score', k=6)


#### 4 Hotspot visualization 

In [None]:
# Hotspot Bubble Chart Visualizations


# Seven-stop colormap running dark grey -> light grey -> light blue -> dark blue.
# The bubble charts below colour by mean Z-score, so greys are the low end and
# blues the high end.
hotspot_cmap = LinearSegmentedColormap.from_list(
    name='hotspot_cmap',
    colors=[
        '#333333', '#737373', '#bfbfbf',
        '#f2f2f2',
        '#4d94ff', '#0050b3', '#001f4d',
    ],
)

# 1. State-Level Analysis
def create_state_bubble_chart(hotspot_results, hotspot_col='hotspot_type'):
    """
    Bubble chart of hotspot share vs. coldspot share per state.

    Inputs
    - hotspot_results: DataFrame with columns 'mstate_it', 'z_score' and the
      label column named by hotspot_col (labels containing 'Hotspot' /
      'Coldspot')
    - hotspot_col: name of the label column to summarise. Defaults to
      'hotspot_type'; for calculate_gi_star_all output pass one of
      'hotspot_unadj', 'hotspot_bonf', 'hotspot_bh', 'hotspot_storey'.

    Outputs
    - (fig, state_summary): the matplotlib figure and the per-state table with
      'Hotspot %', 'Coldspot %', 'Total' and 'Mean Z'. Only states with at
      least 20 rows are kept.

    Raises
    - KeyError if hotspot_col is not a column of hotspot_results.
    """
    if hotspot_col not in hotspot_results.columns:
        available = [c for c in hotspot_results.columns if str(c).startswith('hotspot_')]
        raise KeyError(
            f"Column '{hotspot_col}' not found in hotspot_results; "
            f"pass hotspot_col= one of {available}"
        )

    # Group by state
    def _summarise(group):
        labels = group[hotspot_col]
        return pd.Series({
            'Hotspot %': 100 * labels.str.contains('Hotspot', na=False).sum() / len(group),
            'Coldspot %': 100 * labels.str.contains('Coldspot', na=False).sum() / len(group),
            'Total': len(group),
            'Mean Z': group['z_score'].mean()
        })

    state_summary = hotspot_results.groupby('mstate_it').apply(_summarise)

    # Filter states with sufficient data
    state_summary = state_summary[state_summary['Total'] >= 20]

    fig, ax = plt.subplots(figsize=(12, 9))

    # Create scatter plot: bubble area scales with hospital count, colour with mean Z
    scatter = ax.scatter(
        state_summary['Hotspot %'],
        state_summary['Coldspot %'],
        s=state_summary['Total'] * 3.5,
        c=state_summary['Mean Z'],
        cmap=hotspot_cmap,
        alpha=0.85,
        edgecolors='black',
        linewidths=0.5,
        vmin=-2.5,
        vmax=2.5
    )

    # Add state labels
    for state, row in state_summary.iterrows():
        ax.annotate(
            state,
            (row['Hotspot %'], row['Coldspot %']),
            xytext=(3, 3),
            textcoords='offset points',
            fontsize=9,
            fontweight='bold',
            bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.85)
        )

    # Add reference lines
    ax.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
    ax.axvline(x=0, color='gray', linestyle='-', alpha=0.3)

    # Add colorbar
    cbar = fig.colorbar(scatter, ax=ax, shrink=0.9)
    cbar.set_label('Mean Z-Score', fontsize=12)

    # Formatting
    ax.set_title('Spatial Clustering Patterns by State', fontsize=16)
    ax.set_xlabel('Hotspot Percentage', fontsize=13)
    ax.set_ylabel('Coldspot Percentage', fontsize=13)
    ax.set_xlim(-5, 105)
    ax.set_ylim(-5, 105)
    ax.grid(True, alpha=0.2)

    # was plt.tight_lat(), which does not exist in matplotlib
    fig.tight_layout()
    return fig, state_summary

# 2. Division-Level Analysis
def create_division_bubble_chart(hotspot_results, hotspot_col='hotspot_type'):
    """
    Bubble chart of hotspot share vs. coldspot share per census division.

    Inputs
    - hotspot_results: DataFrame with columns 'division', 'z_score' and the
      label column named by hotspot_col (labels containing 'Hotspot' /
      'Coldspot')
    - hotspot_col: name of the label column to summarise. Defaults to
      'hotspot_type'; for calculate_gi_star_all output pass one of
      'hotspot_unadj', 'hotspot_bonf', 'hotspot_bh', 'hotspot_storey'.

    Outputs
    - (fig, division_summary): the matplotlib figure and the per-division table
      with 'Hotspot %', 'Coldspot %', 'Total' and 'Mean Z'.

    Raises
    - KeyError if hotspot_col is not a column of hotspot_results.
    """
    if hotspot_col not in hotspot_results.columns:
        available = [c for c in hotspot_results.columns if str(c).startswith('hotspot_')]
        raise KeyError(
            f"Column '{hotspot_col}' not found in hotspot_results; "
            f"pass hotspot_col= one of {available}"
        )

    # Group by division
    def _summarise(group):
        labels = group[hotspot_col]
        return pd.Series({
            'Hotspot %': 100 * labels.str.contains('Hotspot', na=False).sum() / len(group),
            'Coldspot %': 100 * labels.str.contains('Coldspot', na=False).sum() / len(group),
            'Total': len(group),
            'Mean Z': group['z_score'].mean()
        })

    division_summary = hotspot_results.groupby('division').apply(_summarise)

    fig, ax = plt.subplots(figsize=(12, 9))

    # Create scatter plot: bubble area scales with hospital count, colour with mean Z
    scatter = ax.scatter(
        division_summary['Hotspot %'],
        division_summary['Coldspot %'],
        s=division_summary['Total'] * 1.2,
        c=division_summary['Mean Z'],
        cmap=hotspot_cmap,
        alpha=0.85,
        edgecolors='black',
        linewidths=0.7,
        vmin=-2.5,
        vmax=2.5
    )

    # Add division labels
    for division, row in division_summary.iterrows():
        ax.annotate(
            division,
            (row['Hotspot %'], row['Coldspot %']),
            xytext=(5, 5),
            textcoords='offset points',
            fontsize=11,
            fontweight='bold',
            bbox=dict(boxstyle="round,pad=0.4", fc="white", ec="gray", alpha=0.9)
        )

    # Add reference lines
    ax.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
    ax.axvline(x=0, color='gray', linestyle='-', alpha=0.3)

    # Add colorbar
    cbar = fig.colorbar(scatter, ax=ax, shrink=0.9)
    cbar.set_label('Mean Z-Score', fontsize=12)

    # Formatting
    ax.set_title('Spatial Clustering Patterns by Census Division', fontsize=16)
    ax.set_xlabel('Hotspot Percentage', fontsize=13)
    ax.set_ylabel('Coldspot Percentage', fontsize=13)
    ax.set_xlim(-5, 105)
    ax.set_ylim(-5, 105)
    ax.grid(True, alpha=0.2)

    # was plt.tight_lat(), which does not exist in matplotlib
    fig.tight_layout()
    return fig, division_summary

