## B2. Alignment Analysis – Need vs AI Implementation

**Description**  
This section evaluates the alignment between healthcare need and AI implementation levels across hospitals. Tertiles are computed for both variables to assess patterns of alignment. 

**Purpose**  
To examine whether AI implementation corresponds with areas of greatest need.

**Method Summary**  
- Rank-based tertiles were created for HPSA, MUA, ADI, and SVI scores.  
- AI implementation scores were already categorized into three categories (Low, Medium, High).  
- Cross-tabulations were generated and visualized using heatmaps.  


### 1 Load necessary libraries, functions, and pre-processed data 

In [17]:

# load necessary libraries 
import geopandas as gpd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Names of the AI exposure score columns: the five base scores, the LLM
# readiness score, then the "_imputed" counterpart of each base score.
_base_ai_scores = [
    "ai_base_score",
    "ai_base_breadth_score",
    "ai_base_dev_score",
    "ai_base_eval_score_2023",
    "ai_base_eval_score_2024",
]
ai_exposures = (
    _base_ai_scores
    + ["llm_readiness_score"]
    + [f"{score}_imputed" for score in _base_ai_scores]
)

In [19]:
# Read the master table from disk, then keep only the rows whose `id_it`
# value is not missing.
AHA_master = pd.read_csv("./data/AHA_master_external_data.csv", low_memory=False)
has_it_id = AHA_master["id_it"].notna()
AHA_IT = AHA_master[has_it_id]

### 2 Data engineering 

In [None]:
# Run the project's AI scoring helper on the IT subset; presumably this adds the
# AI score columns (e.g. `ai_base_score`) that later cells read — TODO confirm.
# NOTE(review): `calculate_ai_scores` is not imported in any cell visible here;
# make sure it is imported before this cell runs.
AHA_master2 = calculate_ai_scores.apply_ai_scores_to_dataframe(AHA_IT)

In [None]:
# Keep every row whose `division` is not 'Territories'.
# The explicit copy makes AHA_IT_US an independent frame, so adding a column to
# it afterwards does not raise pandas' SettingWithCopyWarning.
AHA_IT_US = AHA_master2[AHA_master2['division'] != 'Territories'].copy()
AHA_IT_US.shape

In [None]:
# Translate the numeric AI implementation score (0 / 1 / 2) into its text label;
# any score outside the lookup becomes missing.
ai_level_labels = {0: 'Low', 1: 'Medium', 2: 'High'}
AHA_IT_US['AI_implementation_tertile'] = AHA_IT_US['ai_base_score'].map(ai_level_labels)

### 3 Alignment analysis 

In [None]:
# =========================
# TABLE 1: Tertile Ranges for Need Indicators
# =========================

def get_tertile_ranges(series, measure_name):
    """Summarise the value range and size of each tertile of ``series``.

    Args:
        series: Numeric pandas Series. Missing values fall into no tertile.
        measure_name: Label stored under the ``'Measure'`` key of the result.

    Returns:
        A dict holding, for the Low / Medium / High Need tertiles, the observed
        min-max range (two decimals) and the row count, plus the two inner
        quantile cutpoints. ``None`` when the series cannot be cut into three
        quantile bins (for example when tied values make the bin edges
        non-unique); the reason is printed.
    """
    labels = ['Low Need', 'Medium Need', 'High Need']
    try:
        # A single call yields both the per-row tertile label and the bin edges.
        tertiles, bins = pd.qcut(series, 3, labels=labels, retbins=True)
    except (ValueError, TypeError, IndexError) as e:
        # Report why the measure is skipped instead of swallowing every error.
        print(f"Error processing {measure_name}: {e}")
        return None

    # Actual values observed inside each tertile
    low_values, med_values, high_values = (
        series[tertiles == label] for label in labels
    )

    return {
        'Measure': measure_name,
        'Low Need Range': f"{low_values.min():.2f} - {low_values.max():.2f}",
        'Medium Need Range': f"{med_values.min():.2f} - {med_values.max():.2f}",
        'High Need Range': f"{high_values.min():.2f} - {high_values.max():.2f}",
        'Low Need N': len(low_values),
        'Medium Need N': len(med_values),
        'High Need N': len(high_values),
        'Cutpoint 1': f"{bins[1]:.2f}",
        'Cutpoint 2': f"{bins[2]:.2f}"
    }

# Socioeconomic measures (using standard tertiles): display name -> source column
socio_measures = {
    'Area Deprivation Index': 'national_adi_median',
    'Social Vulnerability Index': 'svi_themes_median',
}
# The four SVI themes share one naming pattern, so their entries are generated.
svi_theme_topics = ['Socioeconomic', 'Household', 'Minority', 'Housing/Transport']
for theme_no, topic in enumerate(svi_theme_topics, start=1):
    socio_measures[f'SVI Theme {theme_no} ({topic})'] = f'svi_theme{theme_no}_median'

# One summary row per measure; measures whose column is absent or whose
# tertiles could not be computed are left out.
tertile_ranges = []
for measure_name, column in socio_measures.items():
    if column not in AHA_master2.columns:
        continue
    range_info = get_tertile_ranges(AHA_master2[column], measure_name)
    if not range_info:
        continue
    tertile_ranges.append(range_info)

tertile_df = pd.DataFrame(tertile_ranges)

# =========================
# TABLE 2: Designation Distribution for HPSA/MUA Indicators
# =========================

def get_designation_distribution(series, measure_name):
    """Summarise how many rows are designated (score > 0) versus not (score == 0).

    Args:
        series: Numeric pandas Series of designation scores.
        measure_name: Label stored under the ``'Measure'`` key of the result.

    Returns:
        A dict of pre-formatted strings: count and share of non-designated and
        designated rows, the min-max range and median of the positive scores
        (``"N/A"`` when there are none) and the total row count. ``None`` when
        the scores cannot be compared with zero (e.g. non-numeric data); the
        reason is printed.

    Note:
        Shares use ``len(series)`` as the denominator, so rows with a missing
        score count towards the total but towards neither group.
    """
    try:
        is_designated = series > 0
        is_not_designated = series == 0
    except TypeError as e:
        # Report why the measure is skipped instead of swallowing every error.
        print(f"Error processing {measure_name}: {e}")
        return None

    designated = int(is_designated.sum())
    not_designated = int(is_not_designated.sum())
    total = len(series)

    # Scores of the designated rows only (used for range and median)
    positive_values = series[is_designated]
    has_positive = len(positive_values) > 0

    def _share(count):
        # An empty series has no meaningful share; avoid dividing by zero.
        return 100 * count / total if total else float("nan")

    return {
        'Measure': measure_name,
        'Not Designated (Score = 0)': f"{not_designated:,} ({_share(not_designated):.1f}%)",
        'Designated (Score > 0)': f"{designated:,} ({_share(designated):.1f}%)",
        'Designated Score Range': f"{positive_values.min():.2f} - {positive_values.max():.2f}" if has_positive else "N/A",
        'Designated Score Median': f"{positive_values.median():.2f}" if has_positive else "N/A",
        'Total Hospitals': f"{total:,}"
    }

# HPSA/MUA measures (using binary designation): display name -> source column
provider_measures = dict([
    ('Primary HPSA Score', 'mean_primary_hpss'),
    ('Mental HPSA Score', 'mean_mental_hpss'),
    ('Dental HPSA Score', 'mean_dental_hpss'),
    ('MUA Overall Score', 'mean_mua_score'),
    ('MUA Elder Score', 'mean_mua_elders_score'),
    ('MUA Infant Score', 'mean_mua_infant_score'),
])

# One summary row per measure; measures whose column is absent or whose
# summary could not be computed are left out.
designation_distributions = []
for measure_name, column in provider_measures.items():
    if column not in AHA_master2.columns:
        continue
    dist_info = get_designation_distribution(AHA_master2[column], measure_name)
    if not dist_info:
        continue
    designation_distributions.append(dist_info)

designation_df = pd.DataFrame(designation_distributions)

# =========================
# RANK-BASED TERTILE RANGES FOR HPSA/MUA MEASURES
# =========================

def get_rank_based_tertile_ranges(series, measure_name):
    """Summarise tertiles built on the ranks of ``series`` rather than its raw values.

    Ties are ranked in order of appearance (``method='first'``), so every rank
    is distinct and the split into three groups works even with many equal scores.

    Args:
        series: pandas Series of scores.
        measure_name: Label stored under the ``'Measure'`` key of the result.

    Returns:
        A dict with each tertile's observed score range (three decimals) and
        row count, the score found at each inner rank cutpoint, and the rank
        cutpoints themselves. ``None`` if anything fails; the error is printed.
    """
    group_names = ['Low Need', 'Medium Need', 'High Need']
    try:
        # Rank first, then cut the ranks; one call returns labels and edges.
        ranks = series.rank(method='first')
        groups, rank_edges = pd.qcut(ranks, 3, labels=group_names, retbins=True)

        # Original scores that landed in each rank tertile
        low_values = series[groups == 'Low Need']
        med_values = series[groups == 'Medium Need']
        high_values = series[groups == 'High Need']

        # Look up the score sitting at each inner rank cutpoint; ranks are
        # 1-based, positions 0-based, and the position is capped at the end.
        ordered = series.sort_values()
        last_pos = len(ordered) - 1
        first_cut, second_cut = rank_edges[1], rank_edges[2]
        cutpoint1_value = ordered.iloc[min(int(first_cut) - 1, last_pos)]
        cutpoint2_value = ordered.iloc[min(int(second_cut) - 1, last_pos)]

        summary = {
            'Measure': measure_name,
            'Low Need Range': f"{low_values.min():.3f} - {low_values.max():.3f}",
            'Medium Need Range': f"{med_values.min():.3f} - {med_values.max():.3f}",
            'High Need Range': f"{high_values.min():.3f} - {high_values.max():.3f}",
            'Low Need N': len(low_values),
            'Medium Need N': len(med_values),
            'High Need N': len(high_values),
            'Score Cutpoint 1': f"{cutpoint1_value:.3f}",
            'Score Cutpoint 2': f"{cutpoint2_value:.3f}",
            'Rank Cutpoint 1': f"{first_cut:.0f}",
            'Rank Cutpoint 2': f"{second_cut:.0f}"
        }
    except Exception as e:
        print(f"Error processing {measure_name}: {e}")
        return None
    return summary

# HPSA/MUA measures using rank-based tertiles: display name -> source column
hpsa_mua_measures = dict([
    ('Primary HPSA Score', 'mean_primary_hpss'),
    ('Mental HPSA Score', 'mean_mental_hpss'),
    ('Dental HPSA Score', 'mean_dental_hpss'),
    ('MUA Overall Score', 'mean_mua_score'),
    ('MUA Elder Score', 'mean_mua_elders_score'),
    ('MUA Infant Score', 'mean_mua_infant_score'),
])

# One summary row per measure; measures whose column is absent or whose
# tertiles could not be computed are left out.
rank_tertiles = []
for measure_name, column in hpsa_mua_measures.items():
    if column not in AHA_master2.columns:
        continue
    range_info = get_rank_based_tertile_ranges(AHA_master2[column], measure_name)
    if not range_info:
        continue
    rank_tertiles.append(range_info)

rank_tertiles_df = pd.DataFrame(rank_tertiles)


In [None]:
# Model type categorization: imputed score 0 / 1 / 2 -> ordered category label
model_type_levels = ['No Models', 'Non-AI Predictive Models', 'AI Predictive Models']
model_type_text = AHA_master2['ai_base_score_imputed'].map(dict(enumerate(model_type_levels)))
AHA_master2['model_type'] = pd.Categorical(
    model_type_text,
    categories=model_type_levels,
    ordered=True,
)

# SEPARATE the need measures into two groups as requested by reviewer
# (display name -> tertile column).
# NOTE(review): none of the tertile columns named below is created in the cells
# visible here — confirm they exist on AHA_master2 before they are used.
provider_shortage_measures = dict([
    ('Primary HPSA', 'primary_hpss_tertile_new'),
    ('Mental HPSA', 'mental_hpss_tertile_new'),
    ('Dental HPSA', 'dental_hpss_tertile_new'),
    ('MUA Overall', 'mua_score_tertile_new'),
])

socioeconomic_measures = {
    'Area Deprivation Index': 'adi_tertile',
    'Social Vulnerability Index Overall': 'svi_tertile',
}
# The four SVI themes share one naming pattern, so their entries are generated.
socioeconomic_measures.update({
    f'Social Vulnerability Index Theme {theme_no}': f'svi{theme_no}_tertile'
    for theme_no in range(1, 5)
})


In [None]:
def calculate_tables(need_measures_dict, df=None):
    """Cross-tabulate each need measure against ``model_type`` as % of all rows.

    Args:
        need_measures_dict: Mapping of display name -> need-tertile column.
        df: DataFrame holding those columns and a ``model_type`` column.
            Defaults to the notebook-level ``AHA_master2`` when omitted, which
            is what the function always used before this parameter existed.

    Returns:
        Dict of display name -> DataFrame of joint percentages (all cells of a
        table sum to 100), with rows ordered High Need, Medium Need, Low Need.
        A need level that does not occur in the data appears as a row of NaN.
    """
    if df is None:
        # Backward-compatible default: the notebook's shared hospital frame.
        df = AHA_master2

    # High Need at TOP, Low Need at BOTTOM
    row_order = ['High Need', 'Medium Need', 'Low Need']

    cross_tabs = {}
    for name, column in need_measures_dict.items():
        # normalize=True divides by the grand total, i.e. joint proportions.
        joint_pct = pd.crosstab(df[column], df['model_type'], normalize=True) * 100
        cross_tabs[name] = joint_pct.reindex(row_order)

    return cross_tabs

# Calculate metrics for both groups
# NOTE: `provider_tabs` is reassigned further down, where the binary
# designation tables are built, so the tertile-based tables stored here are
# replaced at that point; `socioeconomic_tabs` is what the first figure reads.
provider_tabs = calculate_tables(provider_shortage_measures)
socioeconomic_tabs = calculate_tables(socioeconomic_measures)


In [None]:
# Label of the AI bucket, and the three model buckets in display order.
AI_LABEL = "AI Predictive Models"
MODEL_LABELS = ["No Models", "Non-AI Predictive Models", AI_LABEL]


def pick_high_low_rows(tbl_index):
    """Pick the 'high' and 'low' row labels of a table index.

    The first label whose text contains "high" (case-insensitive) is the high
    row, and likewise for "low". If no label matches, the last label stands in
    for high and the first label for low.

    Returns:
        Tuple ``(high_label, low_label)``.
    """
    labels = list(tbl_index)

    def _first_containing(keyword, fallback):
        for label in labels:
            if keyword in str(label).lower():
                return label
        return fallback

    high_label = _first_containing("high", labels[-1])
    low_label = _first_containing("low", labels[0])
    return high_label, low_label

def rr_from_joint_table(tbl, ai_label=AI_LABEL, hi_label=None, lo_label=None):
    """Derive the High-vs-Low risk ratio for the AI column of a joint-% table.

    tbl entries are global % of total (not row-normalized), so each row is
    first turned into a conditional share:
    RR = P(AI|High)/P(AI|Low) = [p(High,AI)/p(High,*)] / [p(Low,AI)/p(Low,*)]

    Args:
        tbl: DataFrame of joint percentages (rows = groups, columns = model buckets).
        ai_label: Column holding the AI bucket.
        hi_label, lo_label: Row labels to compare; when either is missing both
            are chosen by ``pick_high_low_rows``.

    Returns:
        Tuple ``(pH, pL, RR, rel, hi_label, lo_label)`` where ``rel`` is the
        relative difference in percent, ``100 * (RR - 1)``. Quantities that
        cannot be computed (empty row, zero low-group share) are NaN.
    """
    if hi_label is None or lo_label is None:
        hi_label, lo_label = pick_high_low_rows(tbl.index)

    def _ai_share_within(row_label):
        # Joint AI cell divided by the row's marginal total
        joint = float(tbl.loc[row_label, ai_label])
        marginal = float(tbl.loc[row_label, :].sum())
        if marginal > 0:
            return joint / marginal
        return np.nan

    pH = _ai_share_within(hi_label)
    pL = _ai_share_within(lo_label)

    if pL == 0 or np.isnan(pL):
        RR = np.nan
    else:
        RR = pH / pL

    if np.isfinite(RR):
        rel = 100 * (RR - 1)
    else:
        rel = np.nan
    return pH, pL, RR, rel, hi_label, lo_label

# =========================
# Figure 1: Socioeconomic Disadvantage (SVI/ADI) 
# These tables already represent % of total, so all 9 cells sum to ~100.
# RR is derived correctly from those tables (High vs Low).
# =========================

# Names you already use (keys of socioeconomic_tabs, in panel order)
socioeconomic_names = [
    "Area Deprivation Index",
    "Social Vulnerability Index Overall",
    "Social Vulnerability Index Theme 1",
    "Social Vulnerability Index Theme 2",
    "Social Vulnerability Index Theme 3",
    "Social Vulnerability Index Theme 4",
]

# Color scale across all socioeconomic tabs, so every panel shares one range
all_socio_vals = []
for t in socioeconomic_tabs.values():
    all_socio_vals.extend(t.values.flatten())
vmin_socio = float(pd.Series(all_socio_vals).dropna().min())
vmax_socio = float(pd.Series(all_socio_vals).dropna().max())

# One heatmap per measure on a 2 x 3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
for i, name in enumerate(socioeconomic_names):
    r, c = divmod(i, 3)
    ax = axes[r, c]
    tbl = socioeconomic_tabs[name]  # rows = High/Medium/Low; cols = 3 model buckets; entries are global %

    sns.heatmap(tbl, annot=True, fmt=".1f", cmap="YlOrRd", ax=ax,
                vmin=vmin_socio, vmax=vmax_socio, cbar=False,
                annot_kws={"size": 12, "weight": "bold"},
                linewidths=1, linecolor="white")

    # Conditional AI shares and their ratio go into the panel title
    pH, pL, RR, rel, hi_label, lo_label = rr_from_joint_table(tbl, ai_label=AI_LABEL)

    ax.set_title(
        f"{name}\nHigh vs Low: {100*pH:.1f}% vs {100*pL:.1f}%\nRR = {RR:.2f} ({rel:+.0f}% rel.)",
        fontsize=12, fontweight="bold", pad=28
    )
    ax.set_xlabel("", fontsize=11)
    ax.set_ylabel("", fontsize=11)
    ax.set_xticklabels(["none", "non-AI", "AI"], fontsize=10)
    ax.set_yticklabels([str(x) for x in tbl.index], fontsize=10, rotation=0)

# Leave room on the right for a single shared colorbar
plt.tight_layout(rect=[0, 0, 0.9, 0.92])
cax = fig.add_axes([0.92, 0.18, 0.02, 0.66])
sm = plt.cm.ScalarMappable(cmap="YlOrRd"); sm.set_array([vmin_socio, vmax_socio])
cb = plt.colorbar(sm, cax=cax)
cb.set_label("Percentage (%)", fontsize=12, fontweight="bold", rotation=270, labelpad=18)
cb.ax.tick_params(labelsize=10)

plt.suptitle("Socioeconomic Disadvantage vs AI Implementation (Global %; RR from P(AI|group))",
             fontsize=16, y=0.99)
# savefig does not create missing folders, so make sure the target exists first.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/socioeconomic_analysis_global_rr.pdf", bbox_inches="tight", dpi=300)
plt.show()

# =========================
# FIGURE 2: HRSA Designations (binary) – build GLOBAL-% table so 6 cells sum to 100
# and compute crude RR from the same 3-bucket mapping.
# =========================

# Build designation flags (your variables): 1 when the mean score is above 0,
# otherwise 0.
# NOTE(review): a missing score compares as False and therefore also becomes 0
# ("not designated") — confirm that is intended.
_flag_source = {
    "primary_hpsa_desig": "mean_primary_hpss",
    "mental_hpsa_desig": "mean_mental_hpss",
    "dental_hpsa_desig": "mean_dental_hpss",
    "mua_overall_desig": "mean_mua_score",
    "mua_elder_desig": "mean_mua_elders_score",
    "mua_infant_desig": "mean_mua_infant_score",
}
for _flag_name, _score_name in _flag_source.items():
    AHA_master2[_flag_name] = (AHA_master2[_score_name] > 0).astype(int)

# Panel title -> flag column; the panel order is the insertion order of the dict.
flag_col = dict([
    ("Primary HPSA", "primary_hpsa_desig"),
    ("Mental HPSA", "mental_hpsa_desig"),
    ("Dental HPSA", "dental_hpsa_desig"),
    ("MUA Overall", "mua_overall_desig"),
    ("MUA Elder", "mua_elder_desig"),
    ("MUA Infant", "mua_infant_desig"),
])
provider_names = list(flag_col)

def hpsa_global_table_and_counts(df, desig_flag):
    """Build the 2x3 global-% table and the crude risk ratio for one designation flag.

    Rows with a missing ``_mt3`` value are dropped first; every percentage uses
    the remaining row count as its denominator.
    NOTE(review): the ``_mt3`` column is not created in the cells visible here —
    confirm it exists on ``df`` and holds the labels listed in MODEL_LABELS.

    Args:
        df: DataFrame with the ``_mt3`` model-bucket column and the flag column.
        desig_flag: Name of a 0/1 designation column.

    Returns:
        Tuple ``(tbl, nD, nN, pD, pN, RR, rel)``: the table (rows "Designated" /
        "Not designated", columns MODEL_LABELS), the size of each group, the AI
        share within each group, their ratio, and the relative difference in
        percent. Quantities that cannot be computed are NaN.
    """
    valid = df.loc[df["_mt3"].notna()].copy()
    n_valid = len(valid)

    # Split the model buckets by designation status
    is_designated = valid[desig_flag] == 1
    models_D = valid.loc[is_designated, "_mt3"]
    models_N = valid.loc[~is_designated, "_mt3"]

    def _global_pct(models):
        # Bucket counts in MODEL_LABELS order, expressed as % of all valid rows
        counts = models.value_counts().reindex(MODEL_LABELS, fill_value=0)
        return (counts / n_valid * 100.0).values

    tbl = pd.DataFrame(
        [_global_pct(models_D), _global_pct(models_N)],
        index=["Designated", "Not designated"],
        columns=MODEL_LABELS,
    )

    # Group sizes (RR denominators, also shown in the title) and AI counts
    nD = int(is_designated.sum())
    nN = int((~is_designated).sum())
    aD = int((models_D == AI_LABEL).sum())
    aN = int((models_N == AI_LABEL).sum())

    # Crude RR straight from the counts
    pD = aD / nD if nD > 0 else np.nan
    pN = aN / nN if nN > 0 else np.nan
    if pN == 0 or np.isnan(pN):
        RR = np.nan
    else:
        RR = pD / pN
    if np.isfinite(RR):
        rel = 100 * (RR - 1)
    else:
        rel = np.nan

    return tbl, nD, nN, pD, pN, RR, rel

# Build all provider tables first (for color scale)
# Note: this replaces any earlier `provider_tabs` with the binary designation tables.
provider_tabs = {}
provider_stats = {}
all_vals = []
for nm in provider_names:
    tbl, nD, nN, pD, pN, RR, rel = hpsa_global_table_and_counts(AHA_master2, flag_col[nm])
    provider_tabs[nm] = tbl
    provider_stats[nm] = dict(nD=nD, nN=nN, pD=pD, pN=pN, RR=RR, rel=rel)
    all_vals.extend(tbl.values.flatten())

# Combine with socioeconomic scale if you want one shared colorbar across figs:
# all_vals += all_socio_vals
vmin_hrsa = float(pd.Series(all_vals).dropna().min())
vmax_hrsa = float(pd.Series(all_vals).dropna().max())

# Plot: one heatmap per designation on a 2 x 3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
for i, nm in enumerate(provider_names):
    r, c = divmod(i, 3)
    ax = axes[r, c]
    tbl = provider_tabs[nm]
    st  = provider_stats[nm]

    sns.heatmap(tbl, annot=True, fmt=".1f", cmap="copper_r", ax=ax,
                vmin=vmin_hrsa, vmax=vmax_hrsa, cbar=False,
                annot_kws={"size": 12, "weight": "bold"},
                linewidths=1, linecolor="white")

    # Title carries the AI share and size of each group plus the crude RR
    ax.set_title(
        f"{nm}\nDesignated vs Not: {100*st['pD']:.1f}% (n={st['nD']}) vs "
        f"{100*st['pN']:.1f}% (n={st['nN']})\nRR = {st['RR']:.2f} ({st['rel']:+.0f}% rel.)",
        fontsize=12, fontweight="bold", pad=28
    )
    ax.set_xlabel("Model Type", fontsize=11, fontweight="bold")
    ax.set_xticklabels(["No\nModels","Non-AI\nModels","AI\nModels"], fontsize=10)
    ax.set_ylabel("Designation Status", fontsize=11, fontweight="bold")
    ax.set_yticklabels(["Designated","Not designated"], fontsize=10, rotation=0)

plt.suptitle("HRSA Designations vs AI Implementation (Cells are GLOBAL %; RR is crude)",
             fontsize=16, y=0.99)
# Leave room on the right for a single shared colorbar
plt.tight_layout(rect=[0, 0, 0.9, 0.92])
cax = fig.add_axes([0.92, 0.18, 0.02, 0.66])
sm = plt.cm.ScalarMappable(cmap="copper_r"); sm.set_array([vmin_hrsa, vmax_hrsa])
cb = plt.colorbar(sm, cax=cax)
cb.set_label("Percentage (%)", fontsize=12, fontweight="bold", rotation=270, labelpad=18)
cb.ax.tick_params(labelsize=10)

# savefig does not create missing folders, so make sure the target exists first.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/hrsa_designations_global_rr.pdf", bbox_inches="tight", dpi=300)
plt.show()
