## C2. Spatial Cluster Analysis

**Description**  
This section conducts comprehensive spatial analysis to identify geographic patterns and clustering in hospital-level AI adoption across the United States. The analysis employs multiple complementary approaches to characterize spatial autocorrelation, identify statistically significant clusters, and examine regional disparities in AI implementation.

**Purpose**  
To explore patterns in hospital-level AI adoption across geographic regions (state or census division), aiding interpretation of implementation disparities.

**Disclaimer**  
- This codebase was partially cleaned and annotated using OpenAI’s ChatGPT-4o. Please review and validate before using for critical purposes.  
- AHA data is subscription-based and not publicly shareable. All reported results are aggregated at the state or census division level.
- All publicly available data should also be independently downloaded from the source.

**Notebook Workflow**  

0. Load necessary libraries, functions, and pre-processed data 
1. Prepare the data to conduct spatial analysis 
2. Analysis 
3. Neighbor Analysis - Hospital clustering 
4. Moran's I score autocorrelation 
5. DBSCAN cluster analysis 
6. Moran's I score on census division and states 

### C2_0 load necessary libraries, functions and preprocessed data 

#### C2_0_1 load libraries 

In [90]:

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import contextily as ctx
import scipy.spatial as spatial
from scipy.stats import zscore
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import os


#### C2_0_2 load custom functions 

In [91]:
# load custom functions to calculate AI scores 
# 1. Primary variable of interest: ai_base_score 
# 2. Secondary variables of interest: ai_base_breadth_score, ai_base_dev_score, ai_base_eval_score 
# 2.1 ai_base_breadth_score : This score reflects the breadth of the use_cases 
# 2.2 ai_base_dev_score : This score reflects the degree of model development 
# 2.3 ai_base_eval_score : This score reflects the degree of model evaluation in bias and accuracy 

def calculate_base_ai_implementation_row(row):
    """
    Score one hospital's predictive-model adoption from its ``aipred_it`` answer.

    Args:
        row: Mapping-like record (e.g. a pandas Series) holding ``aipred_it``

    Returns:
        int or None: 2 when ``aipred_it`` is 1 (machine learning), 1 when it
        is 2 (other non-ML predictive models), 0 for any other non-missing
        code (neither / do not know), and None when the answer is missing.
    """
    answer = row['aipred_it']

    # A missing answer stays unscored rather than being counted as "no adoption"
    if pd.isna(answer):
        return None
    if answer == 1:
        return 2
    return 1 if answer == 2 else 0

def calculate_ai_implementation_breadth_row(row):
    """
    Calculate AI implementation breadth score for a single row (hospital).

    The score starts from the base implementation score and adds 0.25 times
    the value of each use-case column. Missing use-case values contribute
    nothing.

    Args:
        row: A pandas Series representing a single hospital row

    Returns:
        float or None: None when the base score is missing, 0 when the base
        score is 0, otherwise the base score plus the use-case contributions
    """
    base_score = calculate_base_ai_implementation_row(row)
    if base_score is None:
        return None
    if base_score == 0:
        return 0

    use_case_cols = ['aitraj_it', 'airfol_it', 'aimhea_it', 'airect_it',
                     'aibill_it', 'aische_it', 'aipoth_it', 'aicloth_it']

    breadth_score = base_score
    for col in use_case_cols:
        value = row[col]
        # Missing answers loaded from CSV are NaN, not None. Testing with
        # pd.notna keeps a single unanswered use case from turning the whole
        # score into NaN.
        if pd.notna(value):
            breadth_score += value * 0.25  # 0.25 points per use case
    return breadth_score

def calculate_ai_development_row(row):
    """
    Score how a hospital's models were developed, on top of its base score.

    Args:
        row: A pandas Series representing a single hospital row

    Returns:
        float or None: None when the base score is missing, 0 when the base
        score is 0, otherwise the base score plus the weighted value of each
        development-source column that is present and non-missing
    """
    base_score = calculate_base_ai_implementation_row(row)
    if base_score is None:
        return None
    if base_score == 0:
        return 0

    # (column, weight): self-developed counts double, public domain counts half
    source_weights = (
        ('mlsed_it', 2),     # Self-developed
        ('mldev_it', 1),     # EHR developer
        ('mlthd_it', 1),     # Third-party
        ('mlpubd_it', 0.5),  # Public domain
    )

    dev_score = base_score
    for column, weight in source_weights:
        if column in row and pd.notna(row[column]):
            dev_score += row[column] * weight
    return dev_score

def calculate_ai_evaluation_row(row):
    """
    Score how thoroughly a hospital evaluates its models, on top of its base score.

    Args:
        row: A pandas Series representing a single hospital row

    Returns:
        float or None: None when the base score is missing, 0 when the base
        score is 0, otherwise the base score plus the points earned for the
        accuracy (``mlaccu_it``) and bias (``mlbias_it``) answers
    """
    base_score = calculate_base_ai_implementation_row(row)
    if base_score is None:
        return None
    if base_score == 0:
        return 0

    def coverage_points(answer):
        # Codes 1-4 mean all / most / some / few models were evaluated.
        # Every other answer (None, 5 = none, 6 = do not know) earns nothing.
        if answer is None:
            return 0
        for code, points in ((1, 1), (2, 0.75), (3, 0.5), (4, 0.25)):
            if answer == code:
                return points
        return 0

    accuracy_points = coverage_points(row['mlaccu_it'])
    bias_points = coverage_points(row['mlbias_it'])
    return base_score + accuracy_points + bias_points

def calculate_all_ai_scores_row(row):
    """
    Compute the four AI/ML implementation scores for one hospital row.

    Args:
        row: A pandas Series representing a single hospital row

    Returns:
        dict: Score name -> value for ``ai_base_score``,
        ``ai_base_breadth_score``, ``ai_base_dev_score`` and
        ``ai_base_eval_score``
    """
    # Each entry is produced by its dedicated row-level scorer
    return {
        'ai_base_score': calculate_base_ai_implementation_row(row),
        'ai_base_breadth_score': calculate_ai_implementation_breadth_row(row),
        'ai_base_dev_score': calculate_ai_development_row(row),
        'ai_base_eval_score': calculate_ai_evaluation_row(row),
    }

def apply_ai_scores_to_dataframe(df):
    """
    Add the four AI score columns to ``df``, computed one row at a time.

    The frame is modified in place and the same object is returned.

    Args:
        df: A pandas DataFrame with hospital data

    Returns:
        pandas.DataFrame: ``df`` itself, with the AI score columns filled in
    """
    score_columns = (
        'ai_base_score',
        'ai_base_breadth_score',
        'ai_base_dev_score',
        'ai_base_eval_score',
    )

    # Every score starts as NaN so the columns exist before rows are filled
    for column in score_columns:
        df[column] = float('nan')

    # Score each hospital and write its values back by index label
    for label, record in df.iterrows():
        row_scores = calculate_all_ai_scores_row(record)
        for column, value in row_scores.items():
            df.at[label, column] = value

    return df


#### C2_0_3 load custom dataframes

In [None]:
# Load the preprocessed AHA dataframe produced by the A2 notebook.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
AHA_master = pd.read_csv('./data/AHA_master_external_data.csv', low_memory=False)


In [None]:
# Keep only hospitals that responded to the IT supplement (non-missing id_it).
# .copy() gives AHA_IT its own data: the score columns are later written into
# it in place, and writing into a slice of AHA_master would trigger pandas'
# SettingWithCopyWarning.
AHA_IT = AHA_master[AHA_master['id_it'].notna()].copy()


### C2_1 Prepare the data to conduct analysis 

In [None]:
# Compute the four AI score columns for every IT-supplement respondent.
# apply_ai_scores_to_dataframe writes the columns into AHA_IT in place and
# returns that same object, so AHA_IT2 and AHA_IT refer to the same frame.
AHA_IT2 = apply_ai_scores_to_dataframe(AHA_IT)

In [96]:
# Load the US state boundary shapefile.
# NOTE(review): SHAPE_RESTORE_SHX=YES presumably asks GDAL to rebuild a missing
# .shx index next to the .shp file -- confirm against the GDAL Shapefile driver docs.
os.environ['SHAPE_RESTORE_SHX'] = 'YES'
states = gpd.read_file('../temp_shp/cb_2018_us_state_500k.shp')

#### C2_1_2 Clean dataframe 
1. drop missing or invalid coordinate rows 
2. create geodataframe 
3. prepare the dataframe for analysis 

In [None]:
# Keep only hospitals that have both coordinates recorded
AHA_IT = AHA_IT.dropna(subset=['latitude_address', 'longitude_address'])

# A coordinate pair is usable when neither value is the 0 placeholder and both
# fall inside the valid latitude / longitude ranges (bounds inclusive)
valid_coords = (
    AHA_IT['latitude_address'].ne(0)
    & AHA_IT['longitude_address'].ne(0)
    & AHA_IT['latitude_address'].between(-90, 90)
    & AHA_IT['longitude_address'].between(-180, 180)
)
AHA_IT = AHA_IT[valid_coords]

# Point geometry per hospital, in longitude/latitude (EPSG:4326)
hospitals = gpd.GeoDataFrame(
    AHA_IT,
    geometry=gpd.points_from_xy(
        x=AHA_IT['longitude_address'],
        y=AHA_IT['latitude_address'],
    ),
    crs="EPSG:4326",
)


In [98]:
# 1. Build the two analysis samples from the geocoded hospitals

# All hospitals with coordinates, whether or not they answered aipred_it
valid_geo_hospitals = hospitals.dropna(subset=['longitude_address', 'latitude_address'])
# Hospitals with coordinates and a recorded aipred_it answer
valid_hospitals = hospitals.dropna(subset=['longitude_address', 'latitude_address', 'aipred_it'])

# GeoDataFrame of hospitals that answered aipred_it (longitude/latitude, EPSG:4326)
hospitals_gdf = gpd.GeoDataFrame(
    valid_hospitals,
    geometry=gpd.points_from_xy(
        x=valid_hospitals['longitude_address'],
        y=valid_hospitals['latitude_address'],
    ),
    crs="EPSG:4326",
)

# GeoDataFrame of every geocoded hospital (longitude/latitude, EPSG:4326)
geo_hospitals_gdf = gpd.GeoDataFrame(
    valid_geo_hospitals,
    geometry=gpd.points_from_xy(
        x=valid_geo_hospitals['longitude_address'],
        y=valid_geo_hospitals['latitude_address'],
    ),
    crs="EPSG:4326",
)


In [None]:
# Reproject both samples from longitude/latitude to EPSG:3857 so coordinates
# are planar values suitable for distance-based methods.
# NOTE(review): EPSG:3857 (Web Mercator) stretches distances as latitude
# increases; confirm it is accurate enough for the distance analyses below.
hospitals_gdf_projected = hospitals_gdf.to_crs(epsg=3857) # respondents with an aipred_it answer
geo_hospitals_gdf_projected = geo_hospitals_gdf.to_crs(epsg=3857) # every geocoded hospital


# Recode aipred_it into an ordinal score used for the bar plot.
# This 3/2/1/0 scale is separate from ai_base_score, which uses 2/1/0.
geo_hospitals_gdf_projected['ML_implementation_score'] = geo_hospitals_gdf_projected['aipred_it'].map({
    1: 3,  # ML gets highest score
    2: 2,  # Non-ML gets middle score
    3: 1,  # Neither gets lowest score
    4: 1,   # Don't know gets lowest score
    None: 0,  # intended for missing answers; codes absent from this dict map to NaN
    0: 0 
})
# Display (rows, columns) of the full geocoded sample as the cell output
geo_hospitals_gdf_projected.shape


### C2_2 AI/ML barplot 

In [100]:
# Census divisions and the postal abbreviations of their member states;
# the US territories are grouped under their own label
_division_members = {
    'New England': ['ME', 'NH', 'VT', 'MA', 'RI', 'CT'],
    'Mid Atlantic': ['NY', 'NJ', 'PA'],
    'South Atlantic': ['DE', 'MD', 'DC', 'VA', 'WV', 'NC', 'SC', 'GA', 'FL'],
    'East North Central': ['OH', 'IN', 'IL', 'MI', 'WI'],
    'East South Central': ['KY', 'TN', 'AL', 'MS'],
    'West North Central': ['MN', 'IA', 'MO', 'ND', 'SD', 'NE', 'KS'],
    'West South Central': ['AR', 'LA', 'OK', 'TX'],
    'Mountain': ['MT', 'ID', 'WY', 'CO', 'NM', 'AZ', 'UT', 'NV'],
    'Pacific': ['WA', 'OR', 'CA', 'AK', 'HI'],
    'Territories': ['PR', 'GU', 'VI', 'AS', 'MP'],
}

# Invert to the state -> division lookup used by the rest of the notebook
state_to_division = {
    state: division_name
    for division_name, members in _division_members.items()
    for state in members
}

# Tag every hospital with its census division (NaN for unmapped state codes)
hospitals_gdf_projected['division'] = hospitals_gdf_projected['mstate_it'].map(state_to_division)
geo_hospitals_gdf_projected['division'] = geo_hospitals_gdf_projected['mstate_it'].map(state_to_division)

# The nine census divisions analysed below; territories are left out
divisions = [name for name in _division_members if name != 'Territories']


In [101]:
# ML_implementation_score categories with a response, highest first
ai_types = [3, 2, 1]

# Map the numeric AI types to labels for better readability
ai_type_labels = {
    3: 'ML',
    2: 'Non-ML Predictive Model',
    1: 'Do not know/Neither',
    0: 'No Response'  # score is 0 or missing
}

# Hospital counts per division (rows) and score category (columns).
# The table starts as integer zeros rather than an empty object-dtype frame,
# so the sums, percentages and plots below all run on numeric data.
division_ai_counts = pd.DataFrame(0, index=divisions, columns=ai_types + [0])

for division in divisions:
    geo_division_data = geo_hospitals_gdf_projected[geo_hospitals_gdf_projected['division'] == division]
    division_scores = geo_division_data['ML_implementation_score']

    # No response: score missing or explicitly 0
    no_response_count = int((division_scores.isna() | (division_scores == 0)).sum())
    division_ai_counts.loc[division, 0] = no_response_count

    # One count per responding category
    for ai_type in ai_types:
        division_ai_counts.loc[division, ai_type] = int((division_scores == ai_type).sum())

# Calculate total hospitals per division
division_totals = division_ai_counts.sum(axis=1)

# Share of each category within its division, in percent
division_ai_dist = division_ai_counts.div(division_totals, axis=0) * 100

# Grid dimensions (3 x 3 covers the nine divisions)
n_divisions = len(divisions)
n_cols = 3  # Number of columns in the grid
n_rows = 3

In [None]:


# Map the numeric AI types to labels for better readability
ai_type_labels = {
    3: 'AI/ML',
    2: 'Non-AI/ML Predictive Model',
    1: 'Do not know/Neither',
    0: 'No Response'
}

# Rename columns for better readability
division_ai_counts = division_ai_counts.rename(columns=ai_type_labels)

# Totals and within-division percentages
total_hospitals = division_ai_counts.sum(axis=1)
percentages = division_ai_counts.div(total_hospitals, axis=0) * 100

# Order divisions from most to fewest hospitals; the order is computed once
# and reused for both the counts and the percentages
division_order = total_hospitals.sort_values(ascending=False).index
sorted_divisions = division_ai_counts.loc[division_order]
sorted_percentages = percentages.loc[division_order]

# Define publication-quality colors (one per category column)
colors = ['#3366CC', '#66CCEE', '#EEEEFF', '#CCCCCC']

# Create figure with a higher DPI for better quality
fig, ax = plt.subplots(figsize=(10, 7), dpi=300)

# Create stacked bar chart of hospital counts
bars = sorted_divisions.plot(kind='bar', stacked=True, color=colors, ax=ax, width=0.7, edgecolor='white', linewidth=0.5)

# Write each segment's percentage at the vertical centre of the segment
for i, division in enumerate(sorted_divisions.index):
    cumulative_height = 0
    for col in sorted_divisions.columns:
        height = sorted_divisions.loc[division, col]
        pct = sorted_percentages.loc[division, col]

        # Skip empty segments and those under 1%, where a label would not fit
        if height > 0 and pct >= 1:
            ax.text(i, cumulative_height + (height * 0.5), f'{pct:.1f}%',
                    ha='center', va='center', fontsize=9,
                    color='black', fontweight='bold')

        cumulative_height += height

ax.set_xlabel('Census Division', fontsize=14, labelpad=10, color='black')
ax.set_ylabel('Number of Hospitals', fontsize=14, labelpad=10, color='black')
ax.tick_params(axis='both', which='major', labelsize=12, colors='black')

plt.xticks(rotation=45, ha='right', color='black')
plt.yticks(color='black')

# Combined share of hospitals using any predictive model (AI/ML or non-AI/ML)
combined_percentage = sorted_percentages['AI/ML'] + sorted_percentages['Non-AI/ML Predictive Model']

# Print the combined share just above each bar (20 hospitals above the top)
for i, division in enumerate(sorted_divisions.index):
    total = total_hospitals[division]
    combined_pct = combined_percentage[division]
    plt.text(i, total + 20, f'Model: {combined_pct:.1f}%',
             ha='center', fontsize=8,  # Smaller font size
             fontweight='normal',      # Normal weight instead of bold
             color='#666666')          # Gray color


plt.tight_layout()
plt.show()


### C2_3 Hospital nearest neighbor analysis 

In [None]:

# `norm` is needed for the p-value below and is not imported anywhere else
from scipy.stats import norm

# 2. Custom nearest neighbor analysis (without relying on pointpats)
# Compares the observed mean nearest-neighbour distance with the value expected
# for a random point pattern of the same density (Clark-Evans form).
print("Performing Custom Nearest Neighbor Analysis...")

# Extract projected x/y coordinates, one row per hospital
coords = np.vstack((geo_hospitals_gdf_projected.geometry.x, geo_hospitals_gdf_projected.geometry.y)).T

# Nearest neighbour of every point via a KD-tree
kdtree = spatial.KDTree(coords)
distances, indices = kdtree.query(coords, k=2)  # k=2 to get the nearest neighbor (first one is self)
mean_observed_nn_distance = np.mean(distances[:, 1])

# Calculate the area - use the bounding box as an approximation
x_min, y_min, x_max, y_max = geo_hospitals_gdf_projected.total_bounds
area = (x_max - x_min) * (y_max - y_min)

# Calculate point density
n = len(coords)
density = n / area

# Expected mean distance for random distribution
mean_expected_nn_distance = 0.5 / np.sqrt(density)

# Nearest neighbor ratio: below 1 means closer together than random, above 1 more spread out
nn_ratio = mean_observed_nn_distance / mean_expected_nn_distance

# Standard error of the mean nearest-neighbour distance under randomness
se = 0.26136 / np.sqrt(n * density)

# Z-score
z_score = (mean_observed_nn_distance - mean_expected_nn_distance) / se
p_value = 2 * (1 - norm.cdf(abs(z_score)))  # two-tailed test


#### C2_4 AI/ML Implementation spatial autocorrelation using Moran's I 

In [None]:
def calculate_morans_i(values, coords, k=5):
    """Calculate Moran's I spatial autocorrelation using PySAL.

    Args:
        values: 1-D array-like of observations. Missing entries (None or NaN)
            are dropped together with their coordinates.
        coords: Array-like with one coordinate row per observation
        k: Number of nearest neighbours used to build the spatial weights

    Returns:
        dict: Keys ``moran_i``, ``expected_i``, ``z_score``, ``p_value`` and
        ``n`` (observations used). The four statistics are NaN when fewer
        than two valid observations remain, when the values have zero
        variance, or when the PySAL computation fails (a RuntimeWarning is
        emitted in that last case).
    """
    import warnings

    def nan_result(count):
        return {'moran_i': np.nan, 'expected_i': np.nan, 'z_score': np.nan,
                'p_value': np.nan, 'n': count}

    # dtype=float turns None into NaN, so object-dtype input cannot break np.isnan
    values = np.asarray(values, dtype=float).flatten()
    coords = np.asarray(coords)

    # Drop observations with a missing value, keeping coords aligned
    valid_idx = ~np.isnan(values)
    if not np.all(valid_idx):
        values = values[valid_idx]
        coords = coords[valid_idx]

    n = len(values)
    if n <= 1 or np.var(values) == 0:
        return nan_result(n)

    try:
        # NOTE(review): KNN and Moran are not imported in this cell; presumably
        # libpysal.weights.KNN and esda.moran.Moran. If they are not in scope,
        # the resulting NameError is caught below and reported as a warning.
        w = KNN.from_array(coords, k=k)
        moran = Moran(values, w)
    except Exception as exc:
        # Best effort: keep returning NaN, but make the failure visible
        warnings.warn(
            f"Moran's I could not be computed ({type(exc).__name__}: {exc}); returning NaN",
            RuntimeWarning,
            stacklevel=2,
        )
        return nan_result(n)

    return {
        'moran_i': moran.I,
        'expected_i': moran.EI,
        'z_score': moran.z_norm,
        'p_value': moran.p_norm,
        'n': n
    }

# Projected x/y coordinates of every geocoded hospital, one row per hospital
coords = np.vstack([geo_hospitals_gdf_projected.geometry.x, geo_hospitals_gdf_projected.geometry.y]).T

# Score columns to test for spatial autocorrelation
variables = [
    'ai_base_score',
    'ai_base_breadth_score',
    'ai_base_dev_score',
    'ai_base_eval_score'
]

# One summary record per variable
results_list = []

for var in variables:
    try:
        values = geo_hospitals_gdf_projected[var].values
        result = calculate_morans_i(values, coords, k=5)

        # Classify the pattern at the 5% level, relative to the expected I
        if np.isnan(result['p_value']):
            pattern = "Invalid data"
        elif result['p_value'] >= 0.05:
            pattern = "Random distribution"
        elif result['moran_i'] > result['expected_i']:
            pattern = "Significant clustering"
        else:
            pattern = "Significant dispersion"

        record = {
            'Variable': var,
            'Moran I': result['moran_i'],
            'Expected I': result['expected_i'],
            'z-score': result['z_score'],
            'p-value': result['p_value'],
            'Pattern': pattern,
            'n': result['n']
        }
    except Exception as e:
        # Keep a row for the variable so the failure shows up in the table
        record = {
            'Variable': var,
            'Moran I': np.nan,
            'Expected I': np.nan,
            'z-score': np.nan,
            'p-value': np.nan,
            'Pattern': f"Error: {str(e)}",
            'n': np.nan
        }
    results_list.append(record)

moran_results_df = pd.DataFrame(results_list)

# Round the statistics for display; p-values keep more digits
for column, digits in (('Moran I', 4), ('Expected I', 4), ('z-score', 4), ('p-value', 10)):
    moran_results_df[column] = moran_results_df[column].round(digits)

# Show the summary table as the cell output
print("\nMoran's I Analysis Results (k=5 neighbors):")
moran_results_df

### C2_5 AI/ML Implementation adaptive clustering analysis 

#### C2_5_0 DBSCAN parameter tuning 

In [109]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd



In [53]:
# Projected x/y coordinates (one row per hospital) and the matching base AI
# scores for hospitals_gdf_projected, i.e. hospitals with an aipred_it answer
coords = np.column_stack([hospitals_gdf_projected.geometry.x, hospitals_gdf_projected.geometry.y])
ml_scores = hospitals_gdf_projected['ai_base_score'].values

In [None]:
# Regional DBSCAN Clustering Analysis
# Applies spatial clustering to hospital data by census division and prints
# per-division and overall cluster summaries for ai_base_score.
# NOTE(review): adaptive_dbscan_clustering is not defined in the visible part
# of this notebook; it must be in scope before this cell runs.

# Initialize storage for results
all_results = []  # not used further in this cell
all_clusters_summary = []  # cluster records collected across divisions
modified_data = pd.DataFrame()  # per-division results, concatenated

# Define analysis regions
regions = [
    'New England', 'Mid Atlantic', 'South Atlantic', 
    'East North Central', 'East South Central', 'West North Central',
    'West South Central', 'Mountain', 'Pacific'
]

target_metric = 'ai_base_score'

# Apply clustering by region
for region in regions:
    print(f"Analyzing {region}...")
    
    # Skip if region has no data
    if region not in hospitals_gdf_projected['division'].unique():
        continue
    
    # Apply adaptive DBSCAN clustering
    # NOTE(review): unlike the later DBSCAN cells, this call does not pass k
    region_result, cluster_stats = adaptive_dbscan_clustering(
        hospitals_gdf_projected, 
        region, 
        output_col=target_metric,
        eps=0.25,  # Distance threshold (units depend on the helper -- TODO confirm)
        min_samples=2  # Minimum cluster size
    )
    
    # Collect results
    if not region_result.empty:
        modified_data = pd.concat([modified_data, region_result])
        all_clusters_summary.extend(cluster_stats)

# Generate summary statistics
if all_clusters_summary:
    clusters_df = pd.DataFrame(all_clusters_summary)
    
    # Regional summary: one row per division.
    # Score_Range is formatted as "<lowest ml_min>-<highest ml_max>" over the
    # division's clusters; ml_min is looked up through the group's row index.
    regional_summary = clusters_df.groupby('division').agg(
        Number_of_Clusters=('cluster_id', 'count'),
        Total_Points=('n_hospitals', 'sum'),
        Avg_Cluster_Size=('n_hospitals', 'mean'),
        Mean_Score=('ml_mean', 'mean'),
        Score_Range=('ml_max', lambda x: f"{clusters_df.loc[x.index, 'ml_min'].min():.2f}-{x.max():.2f}")
    ).round(2)
    
    # Overall summary
    overall_stats = {
        'Total_Clusters': len(clusters_df),
        'Total_Points': clusters_df['n_hospitals'].sum(),
        'Overall_Mean': clusters_df['ml_mean'].mean()  # unweighted mean of the cluster means
    }
    
    print("Regional Summary:")
    print(regional_summary)
    print(f"\nOverall: {overall_stats}")
else:
    print("No clusters identified")


#### C2_5_1 DBSCAN cluster identification for ai_base_score 

In [None]:
# DBSCAN Clustering Analysis by Census Division
# Applies adaptive clustering to each geographic region for ai_base_score.
# NOTE(review): adaptive_dbscan_clustering is not defined in the visible part
# of this notebook; it must be in scope before this cell runs.

# Initialize storage (reset so earlier runs do not leak into these results)
all_clusters_summary = []
modified_data = pd.DataFrame()


target_column = 'ai_base_score'

# Apply clustering to each division
for division in divisions:
    # Skip divisions with no hospitals in the sample
    if division not in hospitals_gdf_projected['division'].unique():
        continue
    
    # Run adaptive DBSCAN
    division_result, clusters_summary = adaptive_dbscan_clustering(
        hospitals_gdf_projected, 
        division, 
        output_col=target_column,
        eps=0.25, 
        min_samples=2,
        k=5
    )
    
    # Collect the division's rows and cluster records when any were returned
    if not division_result.empty:
        modified_data = pd.concat([modified_data, division_result])
        all_clusters_summary.extend(clusters_summary)


#### C2_5_2 DBSCAN cluster identification for ai_base_breadth_score

In [None]:
# DBSCAN clustering by census division for ai_base_breadth_score

# Start from empty collections. modified_data is the frame the loop appends
# to, so it is reset here; otherwise rows from the previous metric's cell
# would be carried over into this metric's results.
all_results = []
all_clusters_summary = []
modified_data = pd.DataFrame()

# Define the target column
target_column = 'ai_base_breadth_score'

# Divisions that actually have hospitals in the sample (computed once)
present_divisions = set(hospitals_gdf_projected['division'].unique())

# Apply clustering to each division
for division in divisions:
    if division not in present_divisions:
        continue

    # Run adaptive DBSCAN on the same hospital frame the other metrics use
    # (the original referenced an undefined name `data`)
    division_result, clusters_summary = adaptive_dbscan_clustering(
        hospitals_gdf_projected,
        division,
        output_col=target_column,
        eps=0.25,
        min_samples=2,
        k=5
    )

    if not division_result.empty:
        modified_data = pd.concat([modified_data, division_result])
        all_clusters_summary.extend(clusters_summary)

# Keep the name this cell originally initialised pointing at the same results
modified_hospital_data = modified_data


#### C2_5_3 DBSCAN cluster identification for ai_base_dev_score

In [None]:
# DBSCAN clustering by census division for ai_base_dev_score

# Start from empty collections. modified_data is the frame the loop appends
# to, so it is reset here; otherwise rows from the previous metric's cell
# would be carried over into this metric's results.
all_results = []
all_clusters_summary = []
modified_data = pd.DataFrame()

# Define the target column
target_column = 'ai_base_dev_score'

# Divisions that actually have hospitals in the sample (computed once)
present_divisions = set(hospitals_gdf_projected['division'].unique())

# Apply clustering to each division
for division in divisions:
    if division not in present_divisions:
        continue

    # Run adaptive DBSCAN
    division_result, clusters_summary = adaptive_dbscan_clustering(
        hospitals_gdf_projected,
        division,
        output_col=target_column,
        eps=0.25,
        min_samples=2,
        k=5
    )

    if not division_result.empty:
        modified_data = pd.concat([modified_data, division_result])
        all_clusters_summary.extend(clusters_summary)

# Keep the name this cell originally initialised pointing at the same results
modified_hospital_data = modified_data

#### C2_5_4 DBSCAN cluster identification for ai_base_eval_score

In [None]:
# DBSCAN clustering by census division for ai_base_eval_score

# Start from empty collections. modified_data is the frame the loop appends
# to, so it is reset here; otherwise rows from the previous metric's cell
# would be carried over into this metric's results.
all_results = []
all_clusters_summary = []
modified_data = pd.DataFrame()

# Define the target column
target_column = 'ai_base_eval_score'

# Divisions that actually have hospitals in the sample (computed once)
present_divisions = set(hospitals_gdf_projected['division'].unique())

# Apply clustering to each division
for division in divisions:
    if division not in present_divisions:
        continue

    # Run adaptive DBSCAN
    division_result, clusters_summary = adaptive_dbscan_clustering(
        hospitals_gdf_projected,
        division,
        output_col=target_column,
        eps=0.25,
        min_samples=2,
        k=5
    )

    if not division_result.empty:
        modified_data = pd.concat([modified_data, division_result])
        all_clusters_summary.extend(clusters_summary)

# Keep the name this cell originally initialised pointing at the same results
modified_hospital_data = modified_data

### C2_6 AI/ML Implementation Moran's I spatial autocorrelation 

In [None]:
# Global Moran's I Spatial Autocorrelation Analysis
# Runs on hospitals_gdf_projected, i.e. hospitals with an aipred_it answer.

# Extract coordinates and target variable
coords = np.vstack((hospitals_gdf_projected.geometry.x, hospitals_gdf_projected.geometry.y)).T
values = hospitals_gdf_projected['ai_base_score'].values

# Calculate Moran's I
result = calculate_morans_i(values, coords, k=5)

if not np.isnan(result['moran_i']):
    I = result['moran_i']
    EI = result['expected_i']
    z_score = result['z_score']
    p_value = result['p_value']

    # Results summary. It stays available as the `results` variable; the
    # original `return results` was removed because `return` outside a
    # function is a SyntaxError that stops the whole cell from running.
    results = {
        'Moran_I': round(I, 4),
        'Expected_I': round(EI, 4),
        'Z_Score': round(z_score, 2),
        'P_Value': round(p_value, 6),
        'Significant': p_value < 0.05,
        'Pattern': 'Clustering' if (p_value < 0.05 and I > EI) else
                  'Dispersion' if (p_value < 0.05 and I < EI) else
                  'Random'
    }

    print(f"Moran's I: {results['Moran_I']}")
    print(f"p-value: {results['P_Value']}")
    print(f"Pattern: {results['Pattern']}")

# Compare multiple variables on the same coordinates
ai_variables = ['ai_base_score', 'ai_base_breadth_score', 'ai_base_dev_score', 'ai_base_eval_score']
comparison_results = []

for var in ai_variables:
    var_values = hospitals_gdf_projected[var].values
    var_result = calculate_morans_i(var_values, coords, k=5)

    # Variables whose statistic could not be computed are left out of the table
    if not np.isnan(var_result['moran_i']):
        comparison_results.append({
            'Variable': var,
            'Moran_I': round(var_result['moran_i'], 4),
            'P_Value': round(var_result['p_value'], 6),
            'Pattern': 'Clustering' if (var_result['p_value'] < 0.05 and
                                     var_result['moran_i'] > var_result['expected_i']) else
                      'Dispersion' if (var_result['p_value'] < 0.05 and
                                     var_result['moran_i'] < var_result['expected_i']) else
                      'Random'
        })

comparison_df = pd.DataFrame(comparison_results)
print("\nComparison across AI variables:")
print(comparison_df)

In [None]:

# Calculate Moran's I for each division and variable
all_results = []

for ai_var in ai_variables:
    # NOTE(review): the original looped over `mainland_divisions`, which is not
    # defined anywhere in this notebook. `divisions` (the nine census divisions,
    # territories excluded) is what the other per-division cells use -- confirm
    # whether Pacific (includes AK/HI) should be kept.
    for division in divisions:
        division_data = hospitals_gdf_projected[hospitals_gdf_projected['division'] == division]

        # Skip if insufficient data
        if len(division_data) < 10:
            continue

        # Adjust k for smaller divisions: between 2 and 5 neighbours
        k = min(5, max(2, len(division_data) // 5))

        try:
            coords = np.vstack((division_data.geometry.x, division_data.geometry.y)).T
            values = division_data[ai_var].values

            result = calculate_morans_i(values, coords, k)
        except Exception as e:
            # Best effort: report the combination that failed and move on
            print(f"Skipping {division} / {ai_var}: {e}")
            continue

        if not np.isnan(result['moran_i']):
            all_results.append({
                'AI_Variable': ai_var,
                'Division': division,
                'Morans_I': result['moran_i'],
                'p_value': result['p_value'],
                'z_score': result['z_score'],
                'expected_I': result['expected_i'],
                'n': result['n']
            })

# Process results
if all_results:
    comprehensive_df = pd.DataFrame(all_results)

    # Add pattern classification (5% level, relative to the expected I)
    comprehensive_df['Pattern'] = comprehensive_df.apply(lambda row:
        'Clustering' if row['p_value'] < 0.05 and row['Morans_I'] > row['expected_I'] else
        'Dispersion' if row['p_value'] < 0.05 and row['Morans_I'] < row['expected_I'] else
        'Random', axis=1)

    # Create pivot table for Moran's I values
    morans_pivot = comprehensive_df.pivot_table(
        index='Division',
        columns='AI_Variable',
        values='Morans_I',
        aggfunc='first'
    ).round(4)

    # Create pivot table for significance
    pvalue_pivot = comprehensive_df.pivot_table(
        index='Division',
        columns='AI_Variable',
        values='p_value',
        aggfunc='first'
    ).round(4)

    print("Moran's I by Division and AI Variable:")
    print(morans_pivot)

    print("\nP-values:")
    print(pvalue_pivot)

    # Summary statistics
    for ai_var in ai_variables:
        var_data = comprehensive_df[comprehensive_df['AI_Variable'] == ai_var]
        clustering_count = len(var_data[var_data['Pattern'] == 'Clustering'])
        total_divisions = len(var_data)

        print(f"\n{ai_var}: {clustering_count}/{total_divisions} divisions show significant clustering")
else:
    print("No valid results calculated")
    comprehensive_df = None

# `return` is a SyntaxError outside a function; ending the cell with the
# variable displays the table (or nothing when it is None)
comprehensive_df

In [None]:
# State-Level Moran's I Analysis

ai_variables = [
    'ai_base_score', 'ai_base_breadth_score',
    'ai_base_dev_score', 'ai_base_eval_score'
]

# State codes present in the sample. They are stored under their own name so
# the `states` GeoDataFrame loaded from the shapefile is not overwritten.
state_codes = hospitals_gdf_projected['mstate_it'].dropna().unique()
all_state_results = []

for ai_var in ai_variables:
    for state in state_codes:
        state_data = hospitals_gdf_projected[hospitals_gdf_projected['mstate_it'] == state]

        # Require minimum 20 hospitals for reliable analysis
        if len(state_data) < 20:
            continue

        # Adaptive k parameter: between 2 and 5 neighbours
        k = min(5, max(2, len(state_data) // 5))

        try:
            coords = np.vstack((state_data.geometry.x, state_data.geometry.y)).T
            values = state_data[ai_var].values

            result = calculate_morans_i(values, coords, k)
        except Exception as e:
            # Best effort: report the combination that failed and move on
            print(f"Skipping {state} / {ai_var}: {e}")
            continue

        if np.isnan(result['moran_i']):
            continue

        # Prefer the name from the `mstate` column when it exists; otherwise
        # fall back to the state code
        state_name = state
        if 'mstate' in state_data.columns:
            state_names = state_data['mstate'].dropna().unique()
            if len(state_names) > 0:
                state_name = state_names[0]

        all_state_results.append({
            'AI_Variable': ai_var,
            'State_Code': state,
            'State': state_name,
            'Morans_I': result['moran_i'],
            'p_value': result['p_value'],
            'z_score': result['z_score'],
            'expected_I': result['expected_i'],
            'n': result['n']
        })

# Process results
if all_state_results:
    comprehensive_state_df = pd.DataFrame(all_state_results)

    # Add pattern classification (5% level, relative to the expected I)
    comprehensive_state_df['Pattern'] = comprehensive_state_df.apply(lambda row:
        'Clustering' if row['p_value'] < 0.05 and row['Morans_I'] > row['expected_I'] else
        'Dispersion' if row['p_value'] < 0.05 and row['Morans_I'] < row['expected_I'] else
        'Random', axis=1)

    # Create pivot table for Moran's I values
    morans_pivot = comprehensive_state_df.pivot_table(
        index=['State_Code', 'State'],
        columns='AI_Variable',
        values='Morans_I',
        aggfunc='first'
    ).round(4)

    # Overall summary: state/variable combinations with significant clustering
    total_clustering = len(comprehensive_state_df[comprehensive_state_df['Pattern'] == 'Clustering'])
    total_analyses = len(comprehensive_state_df)
else:
    print("No valid results calculated")
    comprehensive_state_df = None

# `return` is a SyntaxError outside a function; ending the cell with the
# variable displays the table (or nothing when it is None)
comprehensive_state_df