## C4. Hotspot Regional Analysis

**Description**  
This section identifies statistically significant hotspots and coldspots of hospital AI adoption across the United States. Using Getis-Ord Gi* and Local Moran's I statistics, this analysis pinpoints specific geographic locations where AI adoption is significantly higher (hotspots) or lower (coldspots) than would be expected by chance.

**Purpose**  
To identify statistically significant hotspots and coldspots of hospital AI adoption.

**Disclaimer**  
- AHA data is subscription-based and not publicly shareable. All reported results are aggregated at the state or census division level.
- All publicly available data should also be independently downloaded from the source.


### C4_0 load necessary libraries, functions and preprocessed data 

#### C4_0_1 load libraries 

In [45]:
# Import necessary libraries
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
import warnings
from scipy import stats
from scipy.spatial import distance_matrix
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as mpatches
from matplotlib.gridspec import GridSpec
from sklearn.neighbors import NearestNeighbors
import os


#### C4_0_2 load custom functions 

In [46]:
def calculate_base_ai_implementation_row(row):
    """
    Map a hospital's 'aipred_it' answer to the base AI implementation score.

    Args:
        row: Mapping-like record (e.g. a pandas Series) with an 'aipred_it' entry

    Returns:
        None when 'aipred_it' is missing, 2 when it equals 1 (machine learning),
        1 when it equals 2 (other non-ML predictive models), otherwise 0
        (neither / do not know).
    """
    answer = row['aipred_it']

    # A missing answer stays missing rather than being scored as "no AI".
    if pd.isna(answer):
        return None

    if answer == 1:  # Machine Learning
        return 2

    # 2 = other non-ML predictive models; anything else (3, 4, ...) scores zero.
    return 1 if answer == 2 else 0

def calculate_ai_implementation_breadth_row(row):
    """
    Calculate AI implementation breadth score for a single row (hospital).

    The score starts from the base implementation score and adds 0.25 times
    the value of each use-case column. Missing use-case values (None or NaN)
    contribute nothing, so a single blank cell no longer turns the whole
    score into NaN.

    Args:
        row: A pandas Series representing a single hospital row

    Returns:
        float: AI implementation breadth score, None when the base score is
        missing, or 0 when the base score is 0

    Raises:
        KeyError: If one of the use-case columns is absent from the row
    """
    # Start with base score
    base_score = calculate_base_ai_implementation_row(row)
    if base_score is None:
        return None
    if base_score == 0:
        return 0

    # Implementation Breadth Score - count use cases
    use_case_cols = ('aitraj_it', 'airfol_it', 'aimhea_it', 'airect_it',
                     'aibill_it', 'aische_it', 'aipoth_it', 'aicloth_it')

    breadth_score = base_score
    for col in use_case_cols:
        use_case_value = row[col]
        # pd.notna catches both None and the NaN that read_csv produces for
        # blank cells; an `is None` test would let NaN poison the sum.
        if pd.notna(use_case_value):
            breadth_score += use_case_value * 0.25  # 0.25 points per use case
    return breadth_score

def calculate_ai_development_row(row):
    """
    Score how a hospital sources its AI models, on top of the base score.

    Args:
        row: A pandas Series representing a single hospital row

    Returns:
        float: Base score plus the weighted developer-source columns that are
        present and non-missing; None when the base score is missing, 0 when
        the base score is 0
    """
    base_score = calculate_base_ai_implementation_row(row)
    if base_score is None:
        return None
    if base_score == 0:
        return 0

    # (column, weight) pairs, applied in this order.
    source_weights = (
        ('mlsed_it', 2),     # Self-developed
        ('mldev_it', 1),     # EHR developer
        ('mlthd_it', 1),     # Third-party
        ('mlpubd_it', 0.5),  # Public domain
    )

    dev_score = base_score
    for column, weight in source_weights:
        # Absent columns and missing values are skipped.
        if column in row and pd.notna(row[column]):
            dev_score += row[column] * weight
    return dev_score

def calculate_ai_evaluation_row(row):
    """
    Score how widely a hospital evaluates its models, on top of the base score.

    Two answers are scored the same way: 'mlaccu_it' (model accuracy) and
    'mlbias_it' (model bias).

    Args:
        row: A pandas Series representing a single hospital row

    Returns:
        float: Base score plus the points earned for each of the two answers;
        None when the base score is missing, 0 when the base score is 0
    """
    def _coverage_points(answer):
        # 1 = All models, 2 = Most, 3 = Some, 4 = Few.
        # None, 5 (None) and 6 (Do not know) earn no points.
        if answer is None:
            return 0
        if answer == 1:
            return 1
        if answer == 2:
            return 0.75
        if answer == 3:
            return 0.5
        if answer == 4:
            return 0.25
        return 0

    base_score = calculate_base_ai_implementation_row(row)
    if base_score is None:
        return None
    if base_score == 0:
        return 0

    eval_score = base_score
    eval_score += _coverage_points(row['mlaccu_it'])  # model accuracy (MLACCU)
    eval_score += _coverage_points(row['mlbias_it'])  # model bias (MLBIAS)
    return eval_score

def calculate_all_ai_scores_row(row):
    """
    Compute the four AI/ML scores for one hospital row.

    Args:
        row: A pandas Series representing a single hospital row

    Returns:
        dict: Keys 'ai_base_score', 'ai_base_breadth_score',
        'ai_base_dev_score' and 'ai_base_eval_score', each mapped to the
        result of the corresponding per-row scoring function
    """
    # The dict literal evaluates its values top to bottom: base, breadth,
    # development, evaluation.
    return {
        'ai_base_score': calculate_base_ai_implementation_row(row),
        'ai_base_breadth_score': calculate_ai_implementation_breadth_row(row),
        'ai_base_dev_score': calculate_ai_development_row(row),
        'ai_base_eval_score': calculate_ai_evaluation_row(row),
    }

def apply_ai_scores_to_dataframe(df):
    """
    Apply all AI score calculations row by row to a dataframe.

    The four score columns are added to (or overwritten on) ``df`` itself and
    the same object is returned. Missing scores (None) are stored as NaN in
    float64 columns.

    Scores are assigned positionally as whole columns rather than cell by cell
    through ``df.at[label, column]``. Label-based writes would overwrite each
    other when the index contains duplicate labels.

    Args:
        df: A pandas DataFrame with hospital data

    Returns:
        pandas.DataFrame: The input DataFrame with added AI score columns
    """
    score_columns = (
        'ai_base_score',
        'ai_base_breadth_score',
        'ai_base_dev_score',
        'ai_base_eval_score',
    )

    # One scoring pass per row; results stay in row order.
    per_row_scores = [calculate_all_ai_scores_row(row) for _, row in df.iterrows()]

    for score_name in score_columns:
        column_values = pd.Series(
            [scores[score_name] for scores in per_row_scores],
            dtype='float64',  # converts None to NaN
        )
        # to_numpy() drops the helper Series' index so assignment is purely
        # positional and cannot be misaligned by df's own index labels.
        df[score_name] = column_values.to_numpy()

    return df


#### C4_0_3 load processed dataframe

In [None]:
# Load the merged AHA master table (subscription data, not shipped with the notebook).
AHA_master = pd.read_csv('./data/AHA_master_external_data.csv', low_memory=False)

# Keep only hospitals that have an IT-survey record (non-missing id_it).
# The explicit copy matters: apply_ai_scores_to_dataframe adds columns to the
# frame it receives, and writing into a filtered slice of AHA_master would
# trigger pandas' chained-assignment warning.
AHA_IT = AHA_master[AHA_master['id_it'].notna()].copy()

# Adds ai_base_score, ai_base_breadth_score, ai_base_dev_score and ai_base_eval_score.
AHA_master2 = apply_ai_scores_to_dataframe(AHA_IT)

In [48]:
import os
# SHAPE_RESTORE_SHX is read by the shapefile driver behind gpd.read_file;
# presumably 'YES' lets it rebuild a missing .shx index - NOTE(review): confirm
# the .shx is actually absent from the map data folder.
os.environ['SHAPE_RESTORE_SHX'] = 'YES'
# State boundary polygons.
# NOTE(review): this path is relative to a different root ('../../../data')
# than the CSV loaded from './data' - verify against the repository layout.
states = gpd.read_file('../../../data/map_data/state_boundary.shp')

#### C4_1 prepare dataframe for analysis 

In [None]:
# Remove rows with missing coordinates
AHA_master2 = AHA_master2.dropna(subset=['latitude_address', 'longitude_address'])

# Filter out invalid coordinates: (0, 0)-style placeholders and values outside
# the valid latitude/longitude ranges.
# The mask is built from AHA_master2 - the frame it filters - so its index
# matches exactly. Building it from the unfiltered AHA_master made pandas
# re-align the boolean key to the filtered frame.
valid_coords = (
    (AHA_master2['latitude_address'] != 0) &
    (AHA_master2['longitude_address'] != 0) &
    AHA_master2['latitude_address'].between(-90, 90) &      # inclusive bounds
    AHA_master2['longitude_address'].between(-180, 180)     # inclusive bounds
)
# Copy so later column assignments do not write into a slice.
AHA_master2 = AHA_master2[valid_coords].copy()

print(f"Number of hospitals with valid coordinates: {len(AHA_master2)}")

# Create GeoDataFrame of hospital points in geographic coordinates (lon/lat)
hospitals = gpd.GeoDataFrame(
    AHA_master2,
    geometry=gpd.points_from_xy(
        AHA_master2['longitude_address'], AHA_master2['latitude_address']
    ),
    crs="EPSG:4326"
)
print(f"Successfully created GeoDataFrame with {len(hospitals)} hospitals")

In [50]:

# Two analysis subsets of `hospitals`:
#   valid_geo_hospitals - every hospital that has coordinates
#   valid_hospitals     - hospitals that also have an 'aipred_it' answer
valid_geo_hospitals = hospitals.dropna(subset=['longitude_address', 'latitude_address'])
valid_hospitals = hospitals.dropna(subset=['longitude_address', 'latitude_address', 'aipred_it'])

# Point GeoDataFrame for hospitals with an AI answer.
# EPSG:4326 is the geographic coordinate system using latitude and longitude.
hospitals_gdf = gpd.GeoDataFrame(
    valid_hospitals,
    geometry=gpd.points_from_xy(
        valid_hospitals['longitude_address'],
        valid_hospitals['latitude_address'],
    ),
    crs="EPSG:4326",
)

# Point GeoDataFrame for all hospitals with coordinates.
geo_hospitals_gdf = gpd.GeoDataFrame(
    valid_geo_hospitals,
    geometry=gpd.points_from_xy(
        valid_geo_hospitals['longitude_address'],
        valid_geo_hospitals['latitude_address'],
    ),
    crs="EPSG:4326",
)


In [51]:
# Reproject both frames from geographic lon/lat (EPSG:4326) to EPSG:3857, a
# projected coordinate system that represents the Earth's surface on a flat 2D
# plane, so neighbour searches work on planar x/y coordinates.
# NOTE(review): EPSG:3857 (Web Mercator) is not distance-preserving - distances
# are stretched increasingly with latitude. Confirm this is acceptable for the
# k-nearest-neighbour step, or consider an equal-area/equidistant CRS.
hospitals_gdf_projected = hospitals_gdf.to_crs(epsg=3857)
geo_hospitals_gdf_projected = geo_hospitals_gdf.to_crs(epsg=3857)



In [52]:

# Add census division column to both projected frames by mapping the state
# column 'mstate_it' through `state_to_division`.
# NOTE(review): `state_to_division` is not defined in the visible part of this
# notebook - it must be created (or imported) before this cell runs. States
# missing from the mapping will get NaN in 'division'.
hospitals_gdf_projected['division'] = hospitals_gdf_projected['mstate_it'].map(state_to_division)
geo_hospitals_gdf_projected['division'] = geo_hospitals_gdf_projected['mstate_it'].map(state_to_division)

# Names of the nine census divisions (this cell only defines the list; no loop
# over divisions is performed here).
divisions = [
    'New England', 'Mid Atlantic', 'South Atlantic', 
    'East North Central', 'East South Central', 'West North Central',
    'West South Central', 'Mountain', 'Pacific'
]


#### C4_2 load hotspot function

In [53]:
def calculate_gi_star(gdf, value_column, k=5):
    """
    Compute a Getis-Ord Gi*-style hotspot statistic with k-nearest-neighbour weights.

    For every point, the neighbourhood is the set of ``k + 1`` points returned
    by a nearest-neighbour query of the fitted coordinates against themselves
    (so the point itself is normally part of its own neighbourhood). Weights
    are binary. The local sum of ``value_column`` over that neighbourhood is
    compared with its expectation under the global mean and converted to a
    z-score and a two-sided normal p-value.

    Rows whose value is NaN are dropped before the analysis.

    Args:
        gdf: GeoDataFrame of point geometries (``geometry.x`` / ``geometry.y``
            are used as coordinates)
        value_column: Name of the numeric column to analyse
        k: Number of neighbours requested in addition to the point itself

    Returns:
        GeoDataFrame: Copy of ``gdf`` (without NaN-valued rows) with the
        columns 'gi_star', 'z_score', 'p_value' and 'hotspot_type'. When there
        are ``k`` or fewer usable rows the statistic cannot be computed; the
        numeric columns are then NaN and 'hotspot_type' is 'Not Significant',
        so callers always see the same columns.
    """
    # Extract coordinates and values; coercing to float lets the NaN check also
    # handle integer or object-typed columns.
    coords = np.vstack((gdf.geometry.x, gdf.geometry.y)).T
    values = np.asarray(gdf[value_column], dtype=float)

    # Handle missing values
    valid_mask = ~np.isnan(values)
    if not valid_mask.all():
        coords = coords[valid_mask]
        values = values[valid_mask]
        gdf = gdf[valid_mask]
    n = len(values)

    result_gdf = gdf.copy()

    if n <= k:
        # Not enough points to form a (k + 1)-point neighbourhood.
        result_gdf['gi_star'] = np.nan
        result_gdf['z_score'] = np.nan
        result_gdf['p_value'] = np.nan
        result_gdf['hotspot_type'] = 'Not Significant'
        return result_gdf

    # k-nearest-neighbour search; row i of `indices` lists the neighbourhood of
    # point i. Gathering values through these indices is equivalent to
    # multiplying by a dense binary n x n weight matrix, without the O(n^2)
    # memory and time.
    nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm='auto').fit(coords)
    _, indices = nbrs.kneighbors(coords)

    # Global statistics
    # NOTE(review): the sample variance (ddof=1) is kept from the original
    # implementation; the Ord & Getis formulation uses the population variance
    # (ddof=0). The difference is a factor of (n - 1) / n - confirm which is intended.
    x_bar = np.mean(values)
    s_squared = np.var(values, ddof=1)

    # Local sums and their moments. With binary weights every row has the same
    # weight sum (the neighbourhood size), so expectation and variance are scalars.
    gi_star = values[indices].sum(axis=1)
    wi_sum = indices.shape[1]
    expected_gi = wi_sum * x_bar
    variance_gi = (wi_sum * s_squared * (n - wi_sum)) / (n - 1)

    if variance_gi > 0:
        z_scores = (gi_star - expected_gi) / np.sqrt(variance_gi)
        # sf(z) is the upper tail 1 - cdf(z), computed without cancellation,
        # so very small p-values do not collapse to exactly 0.
        p_values = 2 * stats.norm.sf(np.abs(z_scores))
    else:
        # Degenerate case (constant values, or the neighbourhood covers every
        # point): nothing can be significant.
        z_scores = np.zeros(n)
        p_values = np.ones(n)

    # Create results
    result_gdf['gi_star'] = gi_star
    result_gdf['z_score'] = z_scores
    result_gdf['p_value'] = p_values
    result_gdf['hotspot_type'] = 'Not Significant'

    # Classify significance levels, weakest first so that stronger levels
    # overwrite weaker ones (p <= 0.01 ends up labelled 99%, and so on).
    for alpha, confidence in ((0.1, '90%'), (0.05, '95%'), (0.01, '99%')):
        significant = p_values <= alpha
        result_gdf.loc[significant & (z_scores > 0), 'hotspot_type'] = f'Hotspot ({confidence})'
        result_gdf.loc[significant & (z_scores < 0), 'hotspot_type'] = f'Coldspot ({confidence})'

    return result_gdf



### C4_3 run hotspot 

In [None]:
# Conduct hotspot analysis for the entire US: one Gi* run per AI score, each
# using the hospitals that have an 'aipred_it' answer and k=5 nearest
# neighbours. Each result is a copy of the projected frame with 'gi_star',
# 'z_score', 'p_value' and 'hotspot_type' columns added.
print("Performing hotspot analysis for the entire US...")
base_hotspot_results = calculate_gi_star(hospitals_gdf_projected, 'ai_base_score', k=5)
breadth_hotspot_results = calculate_gi_star(hospitals_gdf_projected, 'ai_base_breadth_score', k=5)
dev_hotspot_results = calculate_gi_star(hospitals_gdf_projected, 'ai_base_dev_score', k=5)
eval_hotspot_results = calculate_gi_star(hospitals_gdf_projected, 'ai_base_eval_score', k=5)

### C4_4 get visualization and results  

In [None]:
# Hotspot Bubble Chart Visualizations


# Create colormap: a diverging palette running from dark grey through light
# grey to dark blue. The bubble charts below map mean z-scores onto it with
# vmin=-2.5 / vmax=2.5, so grey shades mark negative values (coldspot-leaning)
# and blue shades mark positive values (hotspot-leaning).
hotspot_cmap = LinearSegmentedColormap.from_list(
    'hotspot_cmap', 
    ['#333333', '#737373', '#bfbfbf', '#f2f2f2', '#4d94ff', '#0050b3', '#001f4d']
)

# 1. State-Level Analysis
def create_state_bubble_chart(hotspot_results, min_hospitals=20):
    """
    Plot hotspot share against coldspot share for each state as a bubble chart.

    Bubble size is proportional to the number of hospitals in the state and
    bubble colour encodes the state's mean z-score (clipped to [-2.5, 2.5] on
    the colour scale).

    Args:
        hotspot_results: Frame with the columns 'mstate_it', 'hotspot_type'
            and 'z_score' (e.g. the output of calculate_gi_star)
        min_hospitals: States with fewer hospitals than this are left out of
            both the chart and the returned summary (default 20)

    Returns:
        tuple: (matplotlib Figure, state summary DataFrame indexed by state
        with the float columns 'Hotspot %', 'Coldspot %', 'Total', 'Mean Z')
    """
    # Flag hotspot / coldspot rows once, then aggregate per state. This avoids
    # groupby.apply with a Series-returning lambda, which operates on the
    # grouping column (deprecated since pandas 2.2) and fails on empty input.
    labels = hotspot_results['hotspot_type']
    flagged = pd.DataFrame(hotspot_results[['mstate_it', 'z_score']]).assign(
        is_hotspot=labels.str.contains('Hotspot', regex=False, na=False),
        is_coldspot=labels.str.contains('Coldspot', regex=False, na=False),
    )
    by_state = flagged.groupby('mstate_it')
    totals = by_state.size()

    state_summary = pd.DataFrame({
        'Hotspot %': 100 * by_state['is_hotspot'].sum() / totals,
        'Coldspot %': 100 * by_state['is_coldspot'].sum() / totals,
        'Total': totals.astype(float),
        'Mean Z': by_state['z_score'].mean(),
    })

    # Filter states with sufficient data
    state_summary = state_summary[state_summary['Total'] >= min_hospitals]

    fig, ax = plt.subplots(figsize=(12, 9))

    # Create scatter plot
    scatter = ax.scatter(
        state_summary['Hotspot %'],
        state_summary['Coldspot %'],
        s=state_summary['Total'] * 3.5,
        c=state_summary['Mean Z'],
        cmap=hotspot_cmap,
        alpha=0.85,
        edgecolors='black',
        linewidths=0.5,
        vmin=-2.5,
        vmax=2.5
    )

    # Add state labels
    for state, row in state_summary.iterrows():
        ax.annotate(
            state,
            (row['Hotspot %'], row['Coldspot %']),
            xytext=(3, 3),
            textcoords='offset points',
            fontsize=9,
            fontweight='bold',
            bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.85)
        )

    # Add reference lines
    ax.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
    ax.axvline(x=0, color='gray', linestyle='-', alpha=0.3)

    # Add colorbar
    cbar = fig.colorbar(scatter, ax=ax, shrink=0.9)
    cbar.set_label('Mean Z-Score', fontsize=12)

    # Formatting
    ax.set_title('Spatial Clustering Patterns by State', fontsize=16)
    ax.set_xlabel('Hotspot Percentage', fontsize=13)
    ax.set_ylabel('Coldspot Percentage', fontsize=13)
    ax.set_xlim(-5, 105)
    ax.set_ylim(-5, 105)
    ax.grid(True, alpha=0.2)

    plt.tight_layout()
    return fig, state_summary

# 2. Division-Level Analysis
def create_division_bubble_chart(hotspot_results):
    """
    Plot hotspot share against coldspot share for each census division.

    Bubble size is proportional to the number of hospitals in the division and
    bubble colour encodes the division's mean z-score (clipped to [-2.5, 2.5]
    on the colour scale).

    Args:
        hotspot_results: Frame with the columns 'division', 'hotspot_type'
            and 'z_score' (e.g. the output of calculate_gi_star)

    Returns:
        tuple: (matplotlib Figure, division summary DataFrame indexed by
        division with the float columns 'Hotspot %', 'Coldspot %', 'Total',
        'Mean Z')
    """
    # Flag hotspot / coldspot rows once, then aggregate per division. This
    # avoids groupby.apply with a Series-returning lambda, which operates on
    # the grouping column (deprecated since pandas 2.2) and fails on empty input.
    labels = hotspot_results['hotspot_type']
    flagged = pd.DataFrame(hotspot_results[['division', 'z_score']]).assign(
        is_hotspot=labels.str.contains('Hotspot', regex=False, na=False),
        is_coldspot=labels.str.contains('Coldspot', regex=False, na=False),
    )
    by_division = flagged.groupby('division')
    totals = by_division.size()

    division_summary = pd.DataFrame({
        'Hotspot %': 100 * by_division['is_hotspot'].sum() / totals,
        'Coldspot %': 100 * by_division['is_coldspot'].sum() / totals,
        'Total': totals.astype(float),
        'Mean Z': by_division['z_score'].mean(),
    })

    fig, ax = plt.subplots(figsize=(12, 9))

    # Create scatter plot
    scatter = ax.scatter(
        division_summary['Hotspot %'],
        division_summary['Coldspot %'],
        s=division_summary['Total'] * 1.2,
        c=division_summary['Mean Z'],
        cmap=hotspot_cmap,
        alpha=0.85,
        edgecolors='black',
        linewidths=0.7,
        vmin=-2.5,
        vmax=2.5
    )

    # Add division labels
    for division, row in division_summary.iterrows():
        ax.annotate(
            division,
            (row['Hotspot %'], row['Coldspot %']),
            xytext=(5, 5),
            textcoords='offset points',
            fontsize=11,
            fontweight='bold',
            bbox=dict(boxstyle="round,pad=0.4", fc="white", ec="gray", alpha=0.9)
        )

    # Add reference lines
    ax.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
    ax.axvline(x=0, color='gray', linestyle='-', alpha=0.3)

    # Add colorbar
    cbar = fig.colorbar(scatter, ax=ax, shrink=0.9)
    cbar.set_label('Mean Z-Score', fontsize=12)

    # Formatting
    ax.set_title('Spatial Clustering Patterns by Census Division', fontsize=16)
    ax.set_xlabel('Hotspot Percentage', fontsize=13)
    ax.set_ylabel('Coldspot Percentage', fontsize=13)
    ax.set_xlim(-5, 105)
    ax.set_ylim(-5, 105)
    ax.grid(True, alpha=0.2)

    plt.tight_layout()
    return fig, division_summary

