## C2. Spatial Cluster Analysis

**Description**  
This section conducts comprehensive spatial analysis to identify geographic patterns and clustering in hospital-level AI adoption across the United States. The analysis employs multiple complementary approaches to characterize spatial autocorrelation, identify statistically significant clusters, and examine regional disparities in AI implementation.

**Purpose**  
To explore patterns in hospital-level AI adoption across geographic regions (state or census division), aiding interpretation of implementation disparities.


### 1 Load necessary libraries, functions, and preprocessed data

In [90]:

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import contextily as ctx
import scipy.spatial as spatial
from scipy.stats import zscore
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import os


In [91]:
# load custom functions to calculate AI scores 
# 1. Primary variable of interest: ai_base_score 
# 2. Secondary variables of interest: ai_base_breadth_score, ai_base_dev_score, ai_base_eval_score 
# 2.1 ai_base_breadth_score : This score reflects the breadth of the use_cases 
# 2.2 ai_base_dev_score : This score reflects the degree of model development 
# 2.3 ai_base_eval_score : This score reflects the degree of model evaluation in bias and accuracy 

def calculate_base_ai_implementation_row(row):
    """
    Score a hospital's predictive-model adoption from its ``aipred_it`` answer.

    Args:
        row: A pandas Series (or any mapping) holding one hospital's survey
            answers; only ``row['aipred_it']`` is read.

    Returns:
        None when ``aipred_it`` is missing; otherwise 2 for machine learning
        (code 1), 1 for other non-ML predictive models (code 2), and 0 for
        any other code (neither = 3, do not know = 4).
    """
    answer = row['aipred_it']

    # A missing answer is reported as None rather than being scored as 0.
    if pd.isna(answer):
        return None

    if answer == 1:  # Machine Learning
        return 2
    if answer == 2:  # Other Non-Machine Learning Predictive Models
        return 1
    return 0  # Neither (3) or Do not know (4)

def calculate_ai_implementation_breadth_row(row):
    """
    Calculate AI implementation breadth score for a single row (hospital).

    The score starts from the base implementation score and adds 0.25 times
    the value of each use-case column. Missing use-case answers contribute
    nothing.

    Args:
        row: A pandas Series representing a single hospital row. Must hold
            ``aipred_it`` and the eight use-case columns listed below.

    Returns:
        None when the base score is unavailable, 0 when the base score is 0,
        otherwise the base score plus 0.25 per unit of each use-case value.
    """
    base_score = calculate_base_ai_implementation_row(row)
    if base_score is None:
        return None
    if base_score == 0:
        return 0

    # Implementation Breadth Score - count use cases
    use_case_cols = ['aitraj_it', 'airfol_it', 'aimhea_it', 'airect_it',
                     'aibill_it', 'aische_it', 'aipoth_it', 'aicloth_it']

    breadth_score = base_score
    for col in use_case_cols:
        value = row[col]
        # pandas stores a missing answer as NaN, which `is None` does not
        # catch; adding NaN would turn the whole score into NaN.
        if pd.isna(value):
            continue
        breadth_score += value * 0.25  # 0.25 points per use case
    return breadth_score

def calculate_ai_development_row(row):
    """
    Calculate AI development score for a single row (hospital).

    Starting from the base implementation score, each model-source column
    that is present and non-missing adds its value times a fixed weight.

    Args:
        row: A pandas Series representing a single hospital row

    Returns:
        None when the base score is unavailable, 0 when the base score is 0,
        otherwise the base score plus the weighted source contributions.
    """
    base_score = calculate_base_ai_implementation_row(row)
    if base_score is None:
        return None
    if base_score == 0:
        return 0

    # (column, weight), listed in the order the contributions are added
    source_weights = (
        ('mlsed_it', 2),     # Self-developed
        ('mldev_it', 1),     # EHR developer
        ('mlthd_it', 1),     # Third-party
        ('mlpubd_it', 0.5),  # Public domain
    )

    dev_score = base_score
    for col, weight in source_weights:
        if col in row and pd.notna(row[col]):
            dev_score += row[col] * weight
    return dev_score

def calculate_ai_evaluation_row(row):
    """
    Calculate AI evaluation score for a single row (hospital).

    Starting from the base implementation score, the accuracy answer
    (``mlaccu_it``) and the bias answer (``mlbias_it``) each add points
    according to how many models are evaluated.

    Args:
        row: A pandas Series representing a single hospital row

    Returns:
        None when the base score is unavailable, 0 when the base score is 0,
        otherwise the base score plus the accuracy and bias points.
    """
    base_score = calculate_base_ai_implementation_row(row)
    if base_score is None:
        return None
    if base_score == 0:
        return 0

    # Points per answer code; any other answer (None (5), Do not know (6),
    # or a missing value) earns nothing.
    coverage_points = {
        1: 1,     # All models
        2: 0.75,  # Most models
        3: 0.5,   # Some models
        4: 0.25,  # Few models
    }

    eval_score = base_score
    for col in ('mlaccu_it', 'mlbias_it'):  # model accuracy, then model bias
        eval_score += coverage_points.get(row[col], 0)
    return eval_score

def calculate_all_ai_scores_row(row):
    """
    Compute the four AI/ML implementation scores for one hospital row.

    Args:
        row: A pandas Series representing a single hospital row

    Returns:
        dict: Maps 'ai_base_score', 'ai_base_breadth_score',
        'ai_base_dev_score' and 'ai_base_eval_score' to the value returned
        by the matching per-row scoring function.
    """
    return {
        'ai_base_score': calculate_base_ai_implementation_row(row),
        'ai_base_breadth_score': calculate_ai_implementation_breadth_row(row),
        'ai_base_dev_score': calculate_ai_development_row(row),
        'ai_base_eval_score': calculate_ai_evaluation_row(row),
    }

def apply_ai_scores_to_dataframe(df):
    """
    Add the four AI score columns to a hospital dataframe.

    The scores are computed row by row and then written back one whole
    column at a time, by position. Positional writes avoid per-cell
    label-based assignment, which is slow and would touch every row sharing
    a duplicated index label.

    Args:
        df: A pandas DataFrame with hospital data. It is modified in place.

    Returns:
        pandas.DataFrame: The same ``df`` object, now carrying float columns
        'ai_base_score', 'ai_base_breadth_score', 'ai_base_dev_score' and
        'ai_base_eval_score' (NaN where a score is unavailable).
    """
    score_columns = [
        'ai_base_score',
        'ai_base_breadth_score',
        'ai_base_dev_score',
        'ai_base_eval_score',
    ]

    # One dict of scores per row, in row order.
    records = [calculate_all_ai_scores_row(row) for _, row in df.iterrows()]

    # None (score unavailable) becomes NaN once the columns are cast to float.
    scores = pd.DataFrame(records, columns=score_columns).astype(float)

    for name in score_columns:
        # to_numpy() makes the assignment positional, bypassing index alignment.
        df[name] = scores[name].to_numpy()

    return df


In [None]:
# Load the preprocessed AHA dataframe produced by the A2 notebook.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
AHA_master = pd.read_csv('./data/AHA_master_external_data.csv', low_memory=False)


In [None]:
# Drop hospitals that did not respond to the IT supplement (no id_it).
# .copy() gives AHA_IT its own data: the scoring step below adds columns to
# it, and writing into a bare slice of AHA_master triggers pandas'
# SettingWithCopyWarning.
AHA_IT = AHA_master[~AHA_master.id_it.isnull()].copy()


### 2 Data engineering 

In [None]:
# Score every hospital. apply_ai_scores_to_dataframe adds the four ai_* score
# columns to AHA_IT itself and returns that same frame, so AHA_IT2 and AHA_IT
# refer to one object; the cells below keep working with AHA_IT.
AHA_IT2 = apply_ai_scores_to_dataframe(AHA_IT)

In [96]:
# Load the US state boundary shapefile.
# SHAPE_RESTORE_SHX=YES is presumably the GDAL option that lets the shapefile
# driver rebuild a missing .shx index file — TODO confirm against GDAL docs.
# NOTE(review): the name `states` is rebound to an array of state codes in the
# state-level Moran's I cell further down.
os.environ['SHAPE_RESTORE_SHX'] = 'YES'
states = gpd.read_file('../temp_shp/cb_2018_us_state_500k.shp')

In [None]:
# Discard hospitals without a latitude/longitude pair
AHA_IT = AHA_IT.dropna(subset=['lat_as', 'long_as'])

# Keep only coordinates that are non-zero and inside the valid
# latitude [-90, 90] / longitude [-180, 180] ranges (bounds inclusive)
latitude = AHA_IT['lat_as']
longitude = AHA_IT['long_as']
valid_coords = (
    latitude.ne(0)
    & longitude.ne(0)
    & latitude.between(-90, 90)
    & longitude.between(-180, 180)
)
AHA_IT = AHA_IT[valid_coords]

# Point GeoDataFrame in geographic coordinates (x = longitude, y = latitude)
hospital_points = gpd.points_from_xy(AHA_IT.long_as, AHA_IT.lat_as)
hospitals = gpd.GeoDataFrame(AHA_IT, geometry=hospital_points, crs="EPSG:4326")


In [None]:
# Hospitals with coordinates AND an answer to the predictive-model question
# (aipred_it); used for the analyses of the AI score variables.
valid_hospitals = hospitals.dropna(subset=['long_as', 'lat_as', 'aipred_it'])
# Hospitals with coordinates, whether or not aipred_it was answered.
valid_geo_hospitals = hospitals.dropna(subset=['long_as', 'lat_as'])
# GeoDataFrame of the aipred_it responders (point geometry rebuilt from long/lat)
hospitals_gdf = gpd.GeoDataFrame(
    valid_hospitals, 
    geometry=gpd.points_from_xy(valid_hospitals.long_as, valid_hospitals.lat_as),
    crs="EPSG:4326" #geographic coordinate system using latitude and longitude
)

# GeoDataFrame of all hospitals that have coordinates
geo_hospitals_gdf = gpd.GeoDataFrame(
    valid_geo_hospitals, 
    geometry=gpd.points_from_xy(valid_geo_hospitals.long_as, valid_geo_hospitals.lat_as),
    crs="EPSG:4326" #geographic coordinate system using latitude and longitude
)


In [None]:
# Convert to a projected CRS so coordinates are planar (metres) for the
# distance-based steps below.
# NOTE(review): EPSG:3857 is Web Mercator, whose distance distortion grows with
# latitude; an equal-area or equidistant CRS may suit distance and area
# calculations better — confirm before interpreting distances.
hospitals_gdf_projected = hospitals_gdf.to_crs(epsg=3857) # projected coordinate system using flat, 2D plane to represent Earth's surface 
geo_hospitals_gdf_projected = geo_hospitals_gdf.to_crs(epsg=3857) # projected coordinate system using flat, 2D plane to represent Earth's surface 


# Recode aipred_it into an ordinal plotting score (3 = ML ... 1 = neither/unknown).
# Codes absent from the mapping become NaN. Whether the `None` key also catches
# NaN answers depends on pandas' map semantics — TODO confirm; the counting
# cell below treats NaN and 0 alike as "No Response".
geo_hospitals_gdf_projected['ML_implementation_score'] = geo_hospitals_gdf_projected['aipred_it'].map({
    1: 3,  # ML gets highest score
    2: 2,  # Non-ML gets middle score
    3: 1,  # Neither gets lowest score
    4: 1,   # Don't know gets lowest score
    None: 0,
    0: 0 
})
# Notebook display: (rows, columns) of the projected frame
geo_hospitals_gdf_projected.shape


### 3 AI/ML barplot (across census division)

In [100]:
# Map two-letter state codes to census division names (division numbering in
# the comments follows the source image the mapping was built from).
# US territories are grouped under 'Territories'.
state_to_division = {
    # Division 1: New England
    'ME': 'New England', 'NH': 'New England', 'VT': 'New England', 
    'MA': 'New England', 'RI': 'New England', 'CT': 'New England',
    
    # Division 2: Mid Atlantic
    'NY': 'Mid Atlantic', 'NJ': 'Mid Atlantic', 'PA': 'Mid Atlantic',
    
    # Division 3: South Atlantic
    'DE': 'South Atlantic', 'MD': 'South Atlantic', 'DC': 'South Atlantic',
    'VA': 'South Atlantic', 'WV': 'South Atlantic', 'NC': 'South Atlantic',
    'SC': 'South Atlantic', 'GA': 'South Atlantic', 'FL': 'South Atlantic',
    
    # Division 4: East North Central
    'OH': 'East North Central', 'IN': 'East North Central', 'IL': 'East North Central',
    'MI': 'East North Central', 'WI': 'East North Central',
    
    # Division 5: East South Central
    'KY': 'East South Central', 'TN': 'East South Central', 
    'AL': 'East South Central', 'MS': 'East South Central',
    
    # Division 6: West North Central
    'MN': 'West North Central', 'IA': 'West North Central', 'MO': 'West North Central',
    'ND': 'West North Central', 'SD': 'West North Central', 'NE': 'West North Central',
    'KS': 'West North Central',
    
    # Division 7: West South Central
    'AR': 'West South Central', 'LA': 'West South Central', 
    'OK': 'West South Central', 'TX': 'West South Central',
    
    # Division 8: Mountain
    'MT': 'Mountain', 'ID': 'Mountain', 'WY': 'Mountain', 'CO': 'Mountain',
    'NM': 'Mountain', 'AZ': 'Mountain', 'UT': 'Mountain', 'NV': 'Mountain',
    
    # Division 9: Pacific
    'WA': 'Pacific', 'OR': 'Pacific', 'CA': 'Pacific', 
    'AK': 'Pacific', 'HI': 'Pacific',
    
    # Territories
    'PR': 'Territories', 'GU': 'Territories', 'VI': 'Territories', 
    'AS': 'Territories', 'MP': 'Territories'
}

# Add census division column to both projected frames; state codes missing
# from the mapping yield NaN.
hospitals_gdf_projected['division'] = hospitals_gdf_projected['mstate_it'].map(state_to_division)
geo_hospitals_gdf_projected['division'] = geo_hospitals_gdf_projected['mstate_it'].map(state_to_division)

# The nine census divisions iterated over by the cells below
# ('Territories' is not included in this list).
divisions = [
    'New England', 'Mid Atlantic', 'South Atlantic', 
    'East North Central', 'East South Central', 'West North Central',
    'West South Central', 'Mountain', 'Pacific'
]


In [None]:
# Score codes to tabulate, highest adoption first
ai_types = [3, 2, 1]

# Map the numeric AI types to labels for better readability
ai_type_labels = {
    3: 'ML',
    2: 'Non-ML Predictive Model',
    1: 'Do not know/Neither',
    0: 'No Response'  # Assuming 0 or NaN is used for no response
}

# Count table: one row per division, one column per score code (0 = no response).
# Pre-filled with integer zeros so the frame is int-typed rather than the
# object dtype an empty DataFrame starts with.
division_ai_counts = pd.DataFrame(0, index=divisions, columns=ai_types + [0], dtype=int)

for division in divisions:
    geo_division_data = geo_hospitals_gdf_projected[geo_hospitals_gdf_projected['division'] == division]
    division_scores = geo_division_data['ML_implementation_score']

    # Count no responses (either 0 or NaN)
    no_response_count = int((division_scores.isna() | (division_scores == 0)).sum())
    division_ai_counts.loc[division, 0] = no_response_count

    # Count other AI types
    for ai_type in ai_types:
        division_ai_counts.loc[division, ai_type] = int((division_scores == ai_type).sum())

# Calculate total hospitals per division
division_totals = division_ai_counts.sum(axis=1)

# Calculate percentages (share of each score code within its division)
division_ai_dist = division_ai_counts.div(division_totals, axis=0) * 100

# Calculate grid dimensions
n_divisions = len(divisions)
n_cols = 3  # Number of columns in the grid
n_rows = 3

In [None]:


# Map the numeric AI types to labels for better readability
ai_type_labels = {
    3: 'AI/ML',
    2: 'Non-AI/ML Predictive Model',
    1: 'Do not know/Neither',
    0: 'No Response'
}

# Rename columns for better readability
division_ai_counts = division_ai_counts.rename(columns=ai_type_labels)

# Calculate percentages (share of each category within its division)
percentages = division_ai_counts.div(division_ai_counts.sum(axis=1), axis=0) * 100

# Sort divisions by total number of hospitals, largest first
total_hospitals = division_ai_counts.sum(axis=1)
division_order = total_hospitals.sort_values(ascending=False).index
sorted_divisions = division_ai_counts.loc[division_order]
sorted_percentages = percentages.loc[division_order]

# Define publication-quality colors (one per category column, in column order)
colors = ['#3366CC', '#66CCEE', '#EEEEFF', '#CCCCCC']

# Create figure with a higher DPI for better quality
fig, ax = plt.subplots(figsize=(10, 7), dpi=300)

# Create stacked bar chart
bars = sorted_divisions.plot(kind='bar', stacked=True, color=colors, ax=ax, width=0.7, edgecolor='white', linewidth=0.5)

# Add a percentage label at the vertical centre of each segment
for i, division in enumerate(sorted_divisions.index):
    cumulative_height = 0
    for col in sorted_divisions.columns:
        height = sorted_divisions.loc[division, col]
        pct = sorted_percentages.loc[division, col]

        # Label only non-empty segments that make up at least 1%
        if height > 0 and pct >= 1:
            ax.text(i, cumulative_height + (height * 0.5), f'{pct:.1f}%',
                    ha='center', va='center', fontsize=9,
                    color='black', fontweight='bold')

        cumulative_height += height

ax.set_xlabel('Census Division', fontsize=14, labelpad=10, color='black')
ax.set_ylabel('Number of Hospitals', fontsize=14, labelpad=10, color='black')
ax.tick_params(axis='both', which='major', labelsize=12, colors='black')

plt.xticks(rotation=45, ha='right', color='black')
plt.yticks(color='black')

# Calculate combined percentage of ML and Non-ML Predictive Model
combined_percentage = sorted_percentages['AI/ML'] + sorted_percentages['Non-AI/ML Predictive Model']

# Add combined percentage labels just above the top of each bar
for i, division in enumerate(sorted_divisions.index):
    total = total_hospitals[division]
    combined_pct = combined_percentage[division]
    plt.text(i, total + 20, f'Model: {combined_pct:.1f}%',
             ha='center', fontsize=8,  # Smaller font size
             fontweight='normal',      # Normal weight instead of bold
             color='#666666')          # Gray color


# pyplot has no `tight_lat`; `tight_layout` is the call that fits the
# rotated tick labels inside the figure.
plt.tight_layout()
plt.show()


### 4 Hospital clustering - nearest neighbor 

In [None]:

# Nearest neighbor analysis implemented directly (no pointpats dependency).
# Compares the observed mean nearest-neighbor distance with the value expected
# for a random pattern of the same density (Clark-Evans style ratio).
print("Performing Custom Nearest Neighbor Analysis...")

# (n, 2) array of projected x/y coordinates
coords = np.vstack((geo_hospitals_gdf_projected.geometry.x, geo_hospitals_gdf_projected.geometry.y)).T
n = len(coords)

# Query the two closest points for every hospital: column 0 is the point
# itself (distance 0), column 1 is its nearest neighbor.
kdtree = spatial.KDTree(coords)
distances, indices = kdtree.query(coords, k=2)
nearest_distances = distances[:, 1]
mean_observed_nn_distance = np.mean(nearest_distances)

# Study area approximated by the bounding box of all points
x_min, y_min, x_max, y_max = geo_hospitals_gdf_projected.total_bounds
width = x_max - x_min
height = y_max - y_min
area = width * height

# Points per unit area
density = n / area

# Expected mean nearest-neighbor distance and its standard error under a
# random distribution
mean_expected_nn_distance = 0.5 / np.sqrt(density)
se = 0.26136 / np.sqrt(n * density)

# Ratio < 1 means closer together than random, > 1 means further apart
nn_ratio = mean_observed_nn_distance / mean_expected_nn_distance

# Standardised difference and its two-tailed normal p-value
z_score = (mean_observed_nn_distance - mean_expected_nn_distance) / se
from scipy.stats import norm
p_value = 2 * (1 - norm.cdf(abs(z_score)))

for report_line in (
    f"Nearest Neighbor Ratio: {nn_ratio:.3f}",
    f"z-score: {z_score:.3f}",
    f"p-value: {p_value:.3f}",
):
    print(report_line)

# Interpret at the 5% level using the +/-1.96 critical value
if z_score < -1.96:
    nn_verdict = "Hospitals show significant clustering (p < 0.05)"
elif z_score > 1.96:
    nn_verdict = "Hospitals show significant dispersion (p < 0.05)"
else:
    nn_verdict = "Hospitals show random spatial pattern"
print(nn_verdict)




### 5 Moran's I spatial autocorrelation - AI and model implementation measures 

#### 5.1 moran's I function

In [None]:
def calculate_morans_i(values, coords, k=5):
    """
    Calculate global Moran's I on a k-nearest-neighbor weights graph.

    Observations whose value is missing (NaN or None) are dropped together
    with their coordinates before the statistic is computed.

    NOTE(review): this function uses the names ``KNN`` and ``Moran``, which
    are not imported in the visible part of this notebook. Presumably they
    are PySAL's ``libpysal.weights.KNN`` and ``esda.moran.Moran`` — confirm
    that they are imported before this cell runs. If they are not, every
    call ends in the NaN result described below (now with a warning).

    Parameters:
    -----------
    values : array-like
        The numeric values to test for spatial autocorrelation
    coords : array-like
        Coordinates as (n, 2) array, one row per value
    k : int
        Number of nearest neighbors

    Returns:
    --------
    dict with keys:
        'moran_i': Moran's I statistic
        'expected_i': Expected value of I under the null hypothesis
        'z_score': z-score under the normality assumption
        'p_value': p-value under the normality assumption
        'n': Number of observations used

        The four statistics are NaN when fewer than two observations remain,
        when the values have zero variance, or when the computation fails
        (a RuntimeWarning describing the failure is emitted in that case).
    """
    import warnings

    def _nan_result(n_obs):
        # Result returned whenever the statistic cannot be computed.
        return {'moran_i': np.nan, 'expected_i': np.nan, 'z_score': np.nan,
                'p_value': np.nan, 'n': n_obs}

    # dtype=float turns None into NaN, so object-typed input is handled by
    # the same missing-value filter instead of crashing in np.isnan.
    values = np.asarray(values, dtype=float).flatten()
    coords = np.asarray(coords)

    # Handle missing values
    valid_idx = ~np.isnan(values)
    if not np.all(valid_idx):
        values = values[valid_idx]
        coords = coords[valid_idx]

    n = len(values)
    if n <= 1 or np.var(values) == 0:
        return _nan_result(n)

    try:
        # k-nearest-neighbor spatial weights, then the global statistic
        w = KNN.from_array(coords, k=k)
        moran = Moran(values, w)
    except Exception as exc:
        # Stay best-effort (callers expect a NaN result), but make the cause
        # visible instead of silently discarding it.
        warnings.warn(
            f"Moran's I could not be computed (n={n}, k={k}): {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return _nan_result(n)

    return {
        'moran_i': moran.I,
        'expected_i': moran.EI,
        'z_score': moran.z_norm,
        'p_value': moran.p_norm,
        'n': n
    }



#### 5.2 US moran's I 

In [None]:
# Global Moran's I Spatial Autocorrelation Analysis

# Extract projected x/y coordinates as an (n, 2) array and the primary score
# for hospitals that answered aipred_it.
# NOTE(review): the next cell reassigns both `coords` and `values` from
# geo_hospitals_gdf_projected before either is used.
coords = np.vstack((hospitals_gdf_projected.geometry.x, hospitals_gdf_projected.geometry.y)).T
values = hospitals_gdf_projected['ai_base_score'].values


In [None]:
# Projected x/y coordinates of every hospital with valid coordinates, (n, 2)
coords = np.vstack((geo_hospitals_gdf_projected.geometry.x, geo_hospitals_gdf_projected.geometry.y)).T

# Score columns to test for spatial autocorrelation
variables = [
    'ai_base_score',
    'ai_base_breadth_score',
    'ai_base_dev_score',
    'ai_base_eval_score'
]

# One result row per variable
results_list = []

for var in variables:
    try:
        values = geo_hospitals_gdf_projected[var].values
        result = calculate_morans_i(values, coords, k=5)

        # Classify the pattern at the 5% level; a NaN p-value means the
        # statistic could not be computed.
        p_val = result['p_value']
        if np.isnan(p_val):
            pattern = "Invalid data"
        elif not p_val < 0.05:
            pattern = "Random distribution"
        elif result['moran_i'] > result['expected_i']:
            pattern = "Significant clustering"
        else:
            pattern = "Significant dispersion"

        result_row = {
            'Variable': var,
            'Moran I': result['moran_i'],
            'Expected I': result['expected_i'],
            'z-score': result['z_score'],
            'p-value': result['p_value'],
            'Pattern': pattern,
            'n': result['n']
        }

    except Exception as e:
        # Keep a placeholder row so every variable appears in the table
        result_row = {
            'Variable': var,
            'Moran I': np.nan,
            'Expected I': np.nan,
            'z-score': np.nan,
            'p-value': np.nan,
            'Pattern': f"Error: {str(e)}",
            'n': np.nan
        }

    results_list.append(result_row)

# Assemble the results table
moran_results_df = pd.DataFrame(results_list)

# Round the numeric columns for display (p-values keep 10 decimals)
for column_name, digits in (('Moran I', 4), ('Expected I', 4), ('z-score', 4), ('p-value', 10)):
    moran_results_df[column_name] = moran_results_df[column_name].round(digits)

# Display the DataFrame
print("\nMoran's I Analysis Results (k=5 neighbors):")
moran_results_df

#### 5.2 moran's I  US Census division 

In [None]:
# Calculate Moran's I for each division and variable

# Score columns to analyse. Defined here so this cell runs top-to-bottom; the
# state-level cell below defines the same list.
ai_variables = [
    'ai_base_score',
    'ai_base_breadth_score',
    'ai_base_dev_score',
    'ai_base_eval_score'
]

# NOTE(review): `mainland_divisions` is not defined in the visible part of the
# notebook. If it exists it is used as is; otherwise the nine-division list
# `divisions` is assumed to be what was meant — confirm.
mainland_divisions = globals().get('mainland_divisions', divisions)

all_results = []

for ai_var in ai_variables:
    for division in mainland_divisions:
        division_data = hospitals_gdf_projected[hospitals_gdf_projected['division'] == division]

        # Skip if insufficient data
        if len(division_data) < 10:
            continue

        # Adjust k for smaller divisions (between 2 and 5 neighbors)
        k = min(5, max(2, len(division_data) // 5))

        try:
            coords = np.vstack((division_data.geometry.x, division_data.geometry.y)).T
            values = division_data[ai_var].values

            result = calculate_morans_i(values, coords, k)

            # NaN means the statistic could not be computed; leave it out
            if not np.isnan(result['moran_i']):
                all_results.append({
                    'AI_Variable': ai_var,
                    'Division': division,
                    'Morans_I': result['moran_i'],
                    'p_value': result['p_value'],
                    'z_score': result['z_score'],
                    'expected_I': result['expected_i'],
                    'n': result['n']
                })

        except Exception as e:
            # Report the failure instead of skipping the division silently
            print(f"  {division} / {ai_var}: Error calculating Moran's I - {str(e)}")
            continue

# Process results.
# The original cell ended its branches with `return`, which is a SyntaxError
# outside a function; the result is left in `comprehensive_df` (None when
# nothing could be computed) and shown as the cell's last expression.
if all_results:
    comprehensive_df = pd.DataFrame(all_results)

    # Add pattern classification (5% level, relative to the expected I)
    comprehensive_df['Pattern'] = comprehensive_df.apply(lambda row:
        'Clustering' if row['p_value'] < 0.05 and row['Morans_I'] > row['expected_I'] else
        'Dispersion' if row['p_value'] < 0.05 and row['Morans_I'] < row['expected_I'] else
        'Random', axis=1)

    # Create pivot table for Moran's I values
    morans_pivot = comprehensive_df.pivot_table(
        index='Division',
        columns='AI_Variable',
        values='Morans_I',
        aggfunc='first'
    ).round(4)

    # Create pivot table for significance
    pvalue_pivot = comprehensive_df.pivot_table(
        index='Division',
        columns='AI_Variable',
        values='p_value',
        aggfunc='first'
    ).round(4)

    print("Moran's I by Division and AI Variable:")
    print(morans_pivot)

    print("\nP-values:")
    print(pvalue_pivot)

    # Summary statistics
    for ai_var in ai_variables:
        var_data = comprehensive_df[comprehensive_df['AI_Variable'] == ai_var]
        clustering_count = len(var_data[var_data['Pattern'] == 'Clustering'])
        total_divisions = len(var_data)

        print(f"\n{ai_var}: {clustering_count}/{total_divisions} divisions show significant clustering")
else:
    comprehensive_df = None
    print("No valid results calculated")

comprehensive_df

#### 5.3 moran's I  US state

In [None]:
# List of AI variables to analyze
ai_variables = [
    'ai_base_score', 
    'ai_base_breadth_score', 
    'ai_base_dev_score', 
    'ai_base_eval_score'
]

# Calculate Moran's I for each state and each AI variable
print("\n=== Moran's I by State for All AI Variables ===")

# Store results for all combinations
all_state_results = []

# Get list of states with hospitals
# NOTE(review): this rebinds `states`, which held the state-boundary
# GeoDataFrame loaded earlier; kept as is because cells outside this view may
# depend on either meaning — confirm and consider a distinct name.
states = hospitals_gdf_projected['mstate_it'].dropna().unique()
print(f"Found {len(states)} states with hospital data")

for ai_var in ai_variables:
    print(f"\n--- Analyzing {ai_var} ---")

    for state in states:
        state_data = hospitals_gdf_projected[hospitals_gdf_projected['mstate_it'] == state]

        # Need enough data points to calculate
        if len(state_data) < 20:
            continue

        # Adjust k for smaller states
        k = min(5, max(2, len(state_data) // 5))  # Ensure k is at least 2

        print(f"Processing {state}: {len(state_data)} hospitals, k={k}")

        try:
            # Extract coordinates and values first
            coords = np.vstack((state_data.geometry.x, state_data.geometry.y)).T
            values = state_data[ai_var].values

            # calculate_morans_i returns a dictionary of statistics
            result = calculate_morans_i(values, coords, k)

            # Check if we got valid results
            if not np.isnan(result['moran_i']):
                # Prefer the full state name when a 'mstate' column provides one
                state_name = state
                if 'mstate' in state_data.columns:
                    state_names = state_data['mstate'].dropna().unique()
                    if len(state_names) > 0:
                        state_name = state_names[0]

                # Store results
                all_state_results.append({
                    'AI_Variable': ai_var,
                    'State_Code': state,
                    'State': state_name,
                    'Morans_I': result['moran_i'],
                    'p_value': result['p_value'],
                    'z_score': result['z_score'],
                    'expected_I': result['expected_i'],
                    'n': result['n']
                })

                print(f"  Results: I={result['moran_i']:.4f}, p={result['p_value']:.4f}")
            else:
                print(f"  {state}: Failed to calculate valid Moran's I")

        except Exception as e:
            print(f"  {state}: Error calculating Moran's I - {str(e)}")

# Check if we have any results
if not all_state_results:
    print("No valid Moran's I results calculated for any state. Check  data and function.")
else:
    # Create a comprehensive dataframe
    comprehensive_state_df = pd.DataFrame(all_state_results)

    # Classify BEFORE rounding. Rounding p to 4 decimals first moves the
    # thresholds (e.g. p = 0.04996 becomes 0.05 and is no longer "< 0.05",
    # and "< 0.0001" would only be met by p < 0.00005).
    comprehensive_state_df['Significance'] = comprehensive_state_df['p_value'].apply(lambda x: 
        '*****' if x < 0.0001 else 
        '****' if x < 0.001 else 
        '***' if x < 0.01 else 
        '**' if x < 0.05 else 
        '*' if x < 0.1 else 
        'ns')

    comprehensive_state_df['Pattern'] = comprehensive_state_df.apply(lambda row: 
        'Significant clustering' if row['p_value'] < 0.05 and row['Morans_I'] > row['expected_I'] else
        'Significant dispersion' if row['p_value'] < 0.05 and row['Morans_I'] < row['expected_I'] else
        'Random distribution', axis=1)

    # Format the numeric columns for display
    for numeric_col in ['Morans_I', 'p_value', 'z_score', 'expected_I']:
        comprehensive_state_df[numeric_col] = comprehensive_state_df[numeric_col].round(4)

    # Display results by AI variable (abbreviated version)
    for ai_var in ai_variables:
        var_df = comprehensive_state_df[comprehensive_state_df['AI_Variable'] == ai_var].copy()

        if len(var_df) > 0:
            # Sort by Moran's I value
            var_df = var_df.sort_values('Morans_I', ascending=False)

            print(f"\n{'='*80}")
            print(f"Top 10 States for {ai_var.upper()} (by Moran's I)")
            print(f"{'='*80}")

            # Show top 10 only for space
            top_10 = var_df.head(10)
            display_df = top_10[['State_Code', 'State', 'Morans_I', 'z_score', 'p_value', 'Significance', 'Pattern']].copy()
            display_df = display_df.rename(columns={
                'State_Code': 'Code',
                'Morans_I': 'Moran\'s I',
                'z_score': 'Z-Score',
                'p_value': 'P-Value'
            })
            print(display_df.to_string(index=False))

            # Summary for this variable
            clustering = len(var_df[var_df['Pattern'] == 'Significant clustering'])
            dispersion = len(var_df[var_df['Pattern'] == 'Significant dispersion'])
            random_count = len(var_df[var_df['Pattern'] == 'Random distribution'])

            print(f"\nSummary for {ai_var}:")
            print(f"  Significant clustering: {clustering} states")
            print(f"  Significant dispersion: {dispersion} states")
            print(f"  Random distribution: {random_count} states")

    # Overall summary across all variables
    print(f"\n{'='*120}")
    print("COMPREHENSIVE STATE COMPARISON ACROSS ALL AI VARIABLES")
    print(f"{'='*120}")

    # Create comprehensive pivot tables (SAME AS DIVISION VERSION)
    print("COMPREHENSIVE COMPARISON TABLE:")
    print("-" * 120)

    # Create pivot tables for different metrics (rows: state, columns: AI variable)
    state_index = ['State_Code', 'State']

    morans_pivot = comprehensive_state_df.pivot_table(
        index=state_index, 
        columns='AI_Variable', 
        values='Morans_I', 
        aggfunc='first'
    ).round(4)

    zscore_pivot = comprehensive_state_df.pivot_table(
        index=state_index, 
        columns='AI_Variable', 
        values='z_score', 
        aggfunc='first'
    ).round(2)

    pvalue_pivot = comprehensive_state_df.pivot_table(
        index=state_index, 
        columns='AI_Variable', 
        values='p_value', 
        aggfunc='first'
    )

    significance_pivot = comprehensive_state_df.pivot_table(
        index=state_index, 
        columns='AI_Variable', 
        values='Significance', 
        aggfunc='first'
    )

    # Create combined display tables
    print("\n1. Moran's I Values:")
    print(morans_pivot.to_string())

    print("\n2. Z-Scores:")
    print(zscore_pivot.to_string())

    print("\n3. P-Values with Significance Stars:")
    # Combine p-values with significance stars. The copy is cast to object
    # first because formatted strings are written into what were float columns.
    pvalue_with_stars = pvalue_pivot.astype(object)
    for col in pvalue_with_stars.columns:
        for idx in pvalue_with_stars.index:
            if pd.notna(pvalue_pivot.loc[idx, col]):
                p_val = pvalue_pivot.loc[idx, col]
                stars = significance_pivot.loc[idx, col]
                pvalue_with_stars.loc[idx, col] = f"{p_val:.4f} {stars}"

    print(pvalue_with_stars.to_string())

    # Create an ultra-compact summary table
    print("\n4. COMPACT SUMMARY (Moran's I [Z-score] Significance):")
    print("-" * 120)

    compact_table = pd.DataFrame(index=morans_pivot.index, columns=morans_pivot.columns)

    for state_info in morans_pivot.index:
        for ai_var in morans_pivot.columns:
            if pd.notna(morans_pivot.loc[state_info, ai_var]):
                morans_val = morans_pivot.loc[state_info, ai_var]
                z_val = zscore_pivot.loc[state_info, ai_var]
                sig = significance_pivot.loc[state_info, ai_var]
                # Using 2 decimal places for z-score as requested
                compact_table.loc[state_info, ai_var] = f"{morans_val:.3f} [{z_val:.2f}] {sig}"
            else:
                compact_table.loc[state_info, ai_var] = "No data"

    print(compact_table.to_string())

    # Legend for significance stars
    print(f"\nSignificance Legend:")
    print(f"***** p < 0.0001 (highly significant)")
    print(f"****  p < 0.001  (very significant)")
    print(f"***   p < 0.01   (significant)")
    print(f"**    p < 0.05   (significant)")
    print(f"*     p < 0.1    (marginally significant)")
    print(f"ns    p ≥ 0.1    (not significant)")

    print(f"\nTable Format: Moran's I [Z-score] Significance")
    print(f"Higher Moran's I values indicate stronger spatial clustering")

    # Overall patterns
    pattern_col = comprehensive_state_df['Pattern']
    total_clustering = len(comprehensive_state_df[pattern_col == 'Significant clustering'])
    total_dispersion = len(comprehensive_state_df[pattern_col == 'Significant dispersion'])
    total_random = len(comprehensive_state_df[pattern_col == 'Random distribution'])
    total_analyses = len(comprehensive_state_df)

    print(f"\nOverall Pattern Distribution:")
    print(f"  Total state analyses: {total_analyses}")
    print(f"  Significant clustering: {total_clustering} ({total_clustering/total_analyses*100:.1f}%)")
    print(f"  Significant dispersion: {total_dispersion} ({total_dispersion/total_analyses*100:.1f}%)")
    print(f"  Random distribution: {total_random} ({total_random/total_analyses*100:.1f}%)")

    # Find which AI variable shows most clustering at state level
    clustered_rows = comprehensive_state_df[pattern_col == 'Significant clustering']
    clustering_by_var = clustered_rows.groupby('AI_Variable').size()
    if len(clustering_by_var) > 0:
        most_clustered_var = clustering_by_var.idxmax()
        print(f"\nAI variable with most spatial clustering at state level: {most_clustered_var} ({clustering_by_var[most_clustered_var]} states)")

    # Find states that consistently show clustering
    clustering_by_state = clustered_rows.groupby(['State_Code', 'State']).size()
    if len(clustering_by_state) > 0:
        print(f"\nStates showing clustering across multiple AI variables:")
        for (state_code, state_name), count in clustering_by_state.items():
            print(f"  {state_name} ({state_code}): {count} out of {len(ai_variables)} AI variables")

print(f"\nAnalysis complete for {len(ai_variables)} AI variables across states with ≥20 hospitals each.")

### 6. DBSCAN - cluster identification 

In [109]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd



#### 6.1 DBSCAN function 

In [None]:
def _summarize_dbscan_clusters(clustered, division, output_col):
    """
    Build one summary record per DBSCAN cluster, skipping the noise label (-1).

    Parameters:
    - clustered: DataFrame holding 'cluster' and 'ml_score_weight' columns plus output_col
    - division: Division name, used to label each cluster
    - output_col: Column holding the score being summarized

    Returns:
    - List of dicts (one per cluster); empty when every point was labelled noise
    """
    n_total = len(clustered)
    summaries = []

    # Every non-noise label is a real cluster, including the case where all
    # points share a single label and there is no noise at all.
    for cluster_id in sorted(set(clustered['cluster']) - {-1}):
        members = clustered[clustered['cluster'] == cluster_id]
        n_members = len(members)
        scores = members[output_col]
        member_weights = members['ml_score_weight']

        summaries.append({
            'division': division,
            'cluster_id': f"{division}-{cluster_id}",
            'n_hospitals': n_members,
            'pct_of_division': n_members / n_total * 100,
            'ml_mean': scores.mean(),
            # Sample std is undefined for a single member; report 0 instead of NaN
            'ml_std': scores.std() if n_members > 1 else 0,
            'ml_min': scores.min(),
            'ml_max': scores.max(),
            'avg_density_weight': member_weights.mean(),
            'density_weight_std': member_weights.std() if n_members > 1 else 0
        })

    return summaries


def adaptive_dbscan_clustering(hospitals_gdf, division, output_col, eps=0.25, min_samples=2, k=5):
    """
    Perform DBSCAN clustering with adaptive feature weights based on local density.

    Each hospital is described by three features: its x and y coordinates and its
    score multiplied by a density weight. The weight is derived from the mean
    distance to the k nearest neighbours, normalized between the 5th and 95th
    percentile of the division, clipped, and scaled to [0.5, 2.0]. The features
    are standardized before DBSCAN is run.

    Parameters:
    - hospitals_gdf: GeoDataFrame with hospital data (needs 'division' and 'geometry' columns)
    - division: Census division to analyze
    - output_col: Column name for the ML score to use
    - eps: DBSCAN epsilon parameter (applied to the standardized features)
    - min_samples: DBSCAN min_samples parameter
    - k: Number of neighbors to consider for density estimation

    Returns:
    - (division_data, clusters_summary): the division's rows with added
      'ml_score_weight' and 'cluster' columns, and a list with one summary dict
      per cluster (noise excluded). If the column is missing, too few valid
      hospitals remain, or clustering fails, returns (empty DataFrame, []).
    """
    # Filter data for this division
    division_data = hospitals_gdf[hospitals_gdf['division'] == division].copy()

    # Check if we have the target column
    if output_col not in division_data.columns:
        print(f"Error: Column '{output_col}' not found for division '{division}'")
        return pd.DataFrame(), []

    # Handle missing values - drop rows with NaN in the target column or coordinates.
    # After this step the score column cannot contain NaN any more.
    division_data = division_data.dropna(subset=[output_col, 'geometry'])

    # A neighbour distance needs at least two hospitals, whatever min_samples is;
    # with a single point the density weight would be NaN.
    min_required = max(min_samples, 2)
    if len(division_data) < min_required:
        print(f"Skipping {division}: too few hospitals with valid data ({len(division_data)})")
        return pd.DataFrame(), []

    # Print diagnostic info
    print(f"  Processing {division} with {len(division_data)} hospitals (valid data)")

    coords = np.column_stack([division_data.geometry.x, division_data.geometry.y])

    # A non-null geometry can still yield NaN coordinates, so filter on the values too
    valid_rows = ~np.isnan(coords).any(axis=1)
    if not valid_rows.all():
        print(f"  Warning: Some coordinates contain NaN values. Removing affected points.")
        division_data = division_data[valid_rows].copy()
        coords = coords[valid_rows]

        if len(division_data) < min_required:
            print(f"  Skipping {division}: too few hospitals after removing NaN coordinates")
            return pd.DataFrame(), []

    # Plain float array so object/nullable score columns work with the numpy maths below
    scores = division_data[output_col].to_numpy(dtype=float)

    # Adjust k if necessary
    actual_k = min(k, len(division_data) - 1)
    if actual_k < k:
        print(f"  Note: Adjusted k from {k} to {actual_k} due to small sample size")

    # Fit nearest neighbors
    nn = NearestNeighbors(n_neighbors=actual_k + 1)  # +1 because point is its own neighbor
    nn.fit(coords)
    distances, _ = nn.kneighbors(coords)

    # Average distance to k nearest neighbors (excluding self)
    avg_distances = distances[:, 1:].mean(axis=1)

    # Convert to density: smaller distances = higher density
    density = 1 / (avg_distances + 1e-10)  # Add small constant to avoid division by zero

    # Higher density areas get HIGHER weights
    min_density, max_density = np.percentile(density, [5, 95])  # Using percentiles to avoid outliers
    normalized_density = (density - min_density) / (max_density - min_density + 1e-10)
    normalized_density = np.clip(normalized_density, 0, 1)
    weights = 0.5 + 1.5 * normalized_density  # Scale to [0.5, 2.0]

    # Store the weights for later analysis
    division_data['ml_score_weight'] = weights

    # Create weighted features: higher weight amplifies ML scores in dense areas
    X = np.column_stack([coords, scores * weights])

    # Final check for NaNs
    if np.isnan(X).any():
        print(f"  Error: Feature matrix still contains NaN values after processing")
        print(f"  NaN counts: {np.isnan(X).sum(axis=0)}")
        return pd.DataFrame(), []

    # Standardize the weighted features
    X_scaled = StandardScaler().fit_transform(X)

    # Run DBSCAN; any failure is reported and turned into an empty result
    try:
        cluster_labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_scaled)

        # Store labels in the dataframe
        division_data['cluster'] = cluster_labels

        clusters_summary = _summarize_dbscan_clusters(division_data, division, output_col)

        # Count results (label -1 marks noise)
        n_clusters = len(clusters_summary)
        n_noise = int((cluster_labels == -1).sum())

        print(f"  Results: {n_clusters} clusters, {n_noise} noise points ({n_noise/len(cluster_labels)*100:.1f}%)")
        print(f"  ML score weights - min: {weights.min():.2f}, max: {weights.max():.2f}, avg: {weights.mean():.2f}")
        print(f"  Weight interpretation: Higher values = denser areas get amplified ML scores")

        return division_data, clusters_summary

    except Exception as e:
        print(f"  Error during DBSCAN: {str(e)}")
        return pd.DataFrame(), []

In [53]:
# Primary AI score for every hospital, plus an (n, 2) array pairing each
# hospital's x and y coordinate taken from the frame's geometry.
ml_scores = hospitals_gdf_projected['ai_base_score'].values
coords = np.column_stack((
    hospitals_gdf_projected.geometry.x,
    hospitals_gdf_projected.geometry.y,
))

In [None]:
# Setup for the regional DBSCAN clustering analysis
# (hospital data is clustered separately within each census division)

# Score column the analysis is based on
target_metric = 'ai_base_score'

# Census divisions to analyze
regions = [
    'New England',
    'Mid Atlantic',
    'South Atlantic',
    'East North Central',
    'East South Central',
    'West North Central',
    'West South Central',
    'Mountain',
    'Pacific',
]

# Empty containers that the analysis fills in
all_results, all_clusters_summary = [], []
modified_data = pd.DataFrame()



#### 6.2 DBSCAN ai base score 

In [None]:
# DBSCAN Clustering Analysis by Census Division
# Runs adaptive_dbscan_clustering on 'ai_base_score' for each census division and
# collects the labelled hospitals (modified_data) and the per-cluster summaries
# (all_clusters_summary).

# Initialize storage
all_clusters_summary = []
division_frames = []  # per-division results, concatenated once after the loop

target_column = 'ai_base_score'

# Look up the divisions present in the data once instead of on every iteration
available_divisions = set(hospitals_gdf_projected['division'].unique())

# Apply clustering to each division.
# `regions` is the census-division list defined in the setup cell directly above;
# it replaces `divisions`, which is only assigned in a later cell (section 6.3).
for division in regions:
    if division not in available_divisions:
        continue

    # Run adaptive DBSCAN
    division_result, clusters_summary = adaptive_dbscan_clustering(
        hospitals_gdf_projected,
        division,
        output_col=target_column,
        eps=0.25,
        min_samples=2,
        k=5
    )

    # An empty frame means the division was skipped or clustering failed
    if not division_result.empty:
        division_frames.append(division_result)
        all_clusters_summary.extend(clusters_summary)

# A single concat avoids re-copying the accumulated frame on every iteration
modified_data = pd.concat(division_frames) if division_frames else pd.DataFrame()


#### 6.3 DBSCAN secondary model implementation measures

In [None]:
# Runs the adaptive DBSCAN clustering for every census division and prints
# division-level and overall summary tables of the clusters found.
# NOTE(review): the section heading mentions secondary measures, but this cell
# analyzes target_column = 'ai_base_score' - confirm the intended column.

# Initialize collection variables
all_results = []
all_clusters_summary = []
division_frames = []  # per-division results, concatenated once after the loop

# Define census divisions
divisions = [
    'New England', 'Mid Atlantic', 'South Atlantic',
    'East North Central', 'East South Central', 'West North Central',
    'West South Central', 'Mountain', 'Pacific'
]

# Define the target column
target_column = 'ai_base_score'

print(f"DBSCAN Clustering Analysis for {target_column} by Census Division")
print("=" * 60)

# Look up the divisions present in the data once instead of on every iteration
available_divisions = set(hospitals_gdf_projected['division'].unique())

# Apply clustering to each division
for division in divisions:
    print(f"\nAnalyzing {division} division...")

    # Check if this division exists in the data
    if division not in available_divisions:
        print(f"  Warning: No hospitals found in '{division}' division")
        continue

    # Apply clustering
    division_result, clusters_summary = adaptive_dbscan_clustering(
        hospitals_gdf_projected,
        division,
        output_col=target_column,
        eps=0.25,
        min_samples=2,
        k=5
    )

    # Only keep divisions that produced results
    if not division_result.empty:
        division_frames.append(division_result)
        all_clusters_summary.extend(clusters_summary)

# A single concat avoids re-copying the accumulated frame on every iteration
modified_hospital_data = pd.concat(division_frames) if division_frames else pd.DataFrame()

# Convert cluster summaries to DataFrame for analysis
if all_clusters_summary:
    clusters_df = pd.DataFrame(all_clusters_summary)

    # Summary statistics by division (one input row per cluster, so Avg_Score
    # is the unweighted mean of the cluster means)
    division_summary = clusters_df.groupby('division').agg(
        Number_of_Clusters=('cluster_id', 'count'),
        Total_Hospitals=('n_hospitals', 'sum'),
        Avg_Cluster_Size=('n_hospitals', 'mean'),
        Avg_Score=('ml_mean', 'mean'),
        Min_Score=('ml_min', 'min'),
        Max_Score=('ml_max', 'max')
    ).reset_index()

    # Format the numeric columns
    division_summary = division_summary.round({
        'Avg_Cluster_Size': 1,
        'Avg_Score': 2,
        'Min_Score': 2,
        'Max_Score': 2,
    })

    # Overall summary as a single row DataFrame
    overall_summary = pd.DataFrame([{
        'Division': 'TOTAL',
        'Number_of_Clusters': len(clusters_df),
        'Total_Hospitals': clusters_df['n_hospitals'].sum(),
        'Avg_Cluster_Size': round(clusters_df['n_hospitals'].mean(), 1),
        'Avg_Score': round(clusters_df['ml_mean'].mean(), 2),
        'Min_Score': round(clusters_df['ml_min'].min(), 2),
        'Max_Score': round(clusters_df['ml_max'].max(), 2)
    }])

    # Display summary tables
    print("\nDivision Summary:")
    display(division_summary)

    print("\nOverall Summary:")
    display(overall_summary)
else:
    print("No clusters were identified across any division")
