# Goal
## Original Questions
- Can we use Chronic Health Conditions to accurately predict Health Care Access?
- Are there Demographic clusters that are disproportionately affected by Chronic Health Conditions?
- Can unsupervised learning methods reveal distinct clusters that account for the bulk of Chronic Health Conditions?

### Questions:
- I have gotten a bit hung up bc as worded 2 and 3 seem to be asking same thing?
- were there established chronic health conditions?
- established demographic features?
- what are the features used in RQ1, and RQ3?

#### Rough Plan:
Question: Refinement:
Are there diagnostically useful demographic clusters that indicate chronic health conditions?
- Are there demographic clusters that strongly indicate certain chronic health conditions?
- Can we predict chronic health conditions from demographics, and how does a ML model compare with simpler cluster membership?

##### Part 1: Clustering
- Cluster the demographic features of the BRFSS data
- Visualize clusters and prevalence of chronic health conditions within each cluster
    - what chronic health conditions to use?
    - VISUAL: Clustering results
    - VISUAL:  Heatmaps of cluster membership vs chronic health conditions
- Run statistical tests to determine if certain clusters are significantly more affected by chronic health conditions
    - Translation - test the strength of correlation between cluster membership and chronic health conditions
    - Does being a member of a cluster correlate with having a chronic health condition?
#### Part 2: Prediction of Chronic Health Conditions From Demographics via DL Model. Inversion of Question 3
- use cluster labels as features?
- compare performance of a deep learning model vs simpler clustering membership



### Misc
- potential 'linchpin' variables given we are clustering on demographics (and ran random forest on demographics)


In [9]:
import os
from IPython import get_ipython
import logging

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from kmodes.kmodes import KModes
from sklearn.preprocessing import LabelEncoder
import subprocess
import sys

try:
    import kmodes
    print("kmodes already installed")
except ImportError:
    print("Installing kmodes...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "kmodes"])
    import kmodes
    print("kmodes installed successfully")

logger = logging.getLogger()
logger.setLevel(logging.INFO)

def is_colab():
    return 'google.colab' in str(get_ipython())

# Set up environment and paths
if is_colab():
    print("Running in Google Colab")

    # Clone the repository if not already cloned
    if not os.path.exists('dat490'):
        import subprocess
        print("Cloning repository...")
        subprocess.run(['git', 'clone', 'https://github.com/sksizer/dat490.git'], check=True)
        print("Repository cloned successfully")

    # Add the repository to Python path for imports
    sys.path.insert(0, '/content/dat490')

    # Set paths to use data from the cloned repository
    BFRSS_DATA_PATH = 'dat490/data/LLCP2023.parquet'
    BFRSS_CODEBOOK_PATH = 'dat490/data/codebook_USCODE23_LLCP_021924.HTML'
    BFRSS_DESC_PATH = 'dat490/data/LLCP2023_desc.parquet'  # Additional metadata file if needed
else:
    print("Running in local environment")

    # Add parent directory to path for dat490 module imports
    sys.path.insert(0, os.path.abspath('..'))

    # Use local data paths
    BFRSS_DATA_PATH = '../data/LLCP2023.parquet'
    BFRSS_CODEBOOK_PATH = '../data/codebook_USCODE23_LLCP_021924.HTML'
    BFRSS_DESC_PATH = '../data/LLCP2023_desc.parquet'  # Additional metadata file if needed

# Verify files exist
print(f"\\nData path: {BFRSS_DATA_PATH}")
print(f"Codebook path: {BFRSS_CODEBOOK_PATH}")

if not os.path.exists(BFRSS_DATA_PATH):
    raise FileNotFoundError(f"Data file not found at {BFRSS_DATA_PATH}")

if not os.path.exists(BFRSS_CODEBOOK_PATH):
    raise FileNotFoundError(f"Codebook file not found at {BFRSS_CODEBOOK_PATH}")

print("\\nAll required files found!")
logger.info('Environment setup complete')

##################################
# Load BFRSS data and metadata using the new wrapper
from dat490 import load_bfrss

# Single function call to load everything
# exclude_desc_columns=True will exclude _DESC columns from metadata generation
bfrss = load_bfrss(exclude_desc_columns=True)

# Get a copy of the raw DataFrame
bfrss_raw_df = bfrss.cloneDF()
bfrss_raw_df.info()

DEMOGRAPHIC_FEATURE_COLUMNS = [
    # Demographics section columns (13 total)
    # Demographics section columns (13 total)
    'MARITAL',    # https://singular-eclair-6a5a16.netlify.app/columns/MARITAL
    'EDUCA',      # https://singular-eclair-6a5a16.netlify.app/columns/EDUCA
    'RENTHOM1',   # https://singular-eclair-6a5a16.netlify.app/columns/RENTHOM1
    # 'NUMHHOL4',   # https://singular-eclair-6a5a16.netlify.app/columns/NUMHHOL4
    # 'NUMPHON4',   # https://singular-eclair-6a5a16.netlify.app/columns/NUMPHON4
    # 'CPDEMO1C',   # https://singular-eclair-6a5a16.netlify.app/columns/CPDEMO1C
    'VETERAN3',   # https://singular-eclair-6a5a16.netlify.app/columns/VETERAN3
    'EMPLOY1',    # https://singular-eclair-6a5a16.netlify.app/columns/EMPLOY1
    # 'CHILDREN',   # https://singular-eclair-6a5a16.netlify.app/columns/CHILDREN
    'INCOME3',    # https://singular-eclair-6a5a16.netlify.app/columns/INCOME3
    # 'PREGNANT',   # https://singular-eclair-6a5a16.netlify.app/columns/PREGNANT
    'SEXVAR',    # https://singular-eclair-6a5a16.netlify.app/columns/SEXVAR
    '_HISPANC', # https://singular-eclair-6a5a16.netlify.app/columns/_HISPANC # Calculated but not sure from what
    # '_CRACE1',    # https://singular-eclair-6a5a16.netlify.app/columns/_CRACE1 # Child race
    '_IMPRACE',   # https://singular-eclair-6a5a16.netlify.app/columns/_IMPRACE
    # '_AGE80',     # https://singular-eclair-6a5a16.netlify.app/columns/_AGE80
]




kmodes already installed
Running in local environment
\nData path: ../data/LLCP2023.parquet
Codebook path: ../data/codebook_USCODE23_LLCP_021924.HTML
\nAll required files found!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433323 entries, 0 to 433322
Columns: 694 entries, _STATE to X_DRNKDRV_DESC
dtypes: category(342), float64(340), int32(5), object(7)
memory usage: 1.3+ GB


# K-Modes Analysis of BRFSS Data

K-Modes clustering is an extension of K-Means designed for categorical data. Instead of using means to define cluster centers, K-Modes uses modes (most frequent values) and measures dissimilarity using the number of mismatches between data points.

In [10]:
# Subsampling and Stability Testing for K-Modes

import numpy as np
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

def subsample_kmode_stability(df: pd.DataFrame, 
                            feature_cols: list, 
                            n_clusters: int = 2,
                            n_iterations: int = 10,
                            subsample_ratio: float = 0.8,
                            random_state: int = 42):
    """
    Perform stability testing of k-modes clustering through subsampling.
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe
    feature_cols : list
        List of feature columns to use for clustering
    n_clusters : int
        Number of clusters for k-modes
    n_iterations : int
        Number of subsampling iterations
    subsample_ratio : float
        Proportion of data to sample in each iteration
    random_state : int
        Random seed for reproducibility
        
    Returns:
    --------
    dict : Dictionary containing stability metrics and results
    """
    
    np.random.seed(random_state)
    
    # Prepare data
    data = df[feature_cols].copy()
    
    # Encode categorical variables
    encoders = {}
    for col in data.columns:
        if data[col].dtype == 'object' or data[col].dtype.name == 'category':
            le = LabelEncoder()
            data[col] = data[col].fillna(data[col].mode()[0] if len(data[col].mode()) > 0 else 'missing')
            data[col] = le.fit_transform(data[col])
            encoders[col] = le
        else:
            data[col] = data[col].fillna(data[col].mean())
    
    # Storage for results
    results = {
        'cluster_assignments': [],
        'centroids': [],
        'costs': [],
        'ari_scores': [],
        'nmi_scores': [],
        'stability_scores': []
    }
    
    # Reference clustering on full dataset
    km_ref = KModes(n_clusters=n_clusters, init='Huang', n_init=5, verbose=0)
    ref_clusters = km_ref.fit_predict(data)
    ref_centroids = km_ref.cluster_centroids_
    
    print(f"Running {n_iterations} iterations with {subsample_ratio*100:.0f}% subsampling...")
    
    for i in range(n_iterations):
        # Subsample data
        n_samples = int(len(data) * subsample_ratio)
        sample_idx = np.random.choice(len(data), n_samples, replace=False)
        data_sample = data.iloc[sample_idx]
        
        # Run k-modes on subsample
        km = KModes(n_clusters=n_clusters, init='Huang', n_init=5, verbose=0)
        clusters = km.fit_predict(data_sample)
        
        # Store results
        results['cluster_assignments'].append(clusters)
        results['centroids'].append(km.cluster_centroids_)
        results['costs'].append(km.cost_)
        
        # Calculate stability metrics against reference (for overlapping samples)
        ref_clusters_sample = ref_clusters[sample_idx]
        ari = adjusted_rand_score(ref_clusters_sample, clusters)
        nmi = normalized_mutual_info_score(ref_clusters_sample, clusters)
        
        results['ari_scores'].append(ari)
        results['nmi_scores'].append(nmi)
        
        if (i + 1) % 5 == 0:
            print(f"  Completed iteration {i+1}/{n_iterations}")
    
    # Calculate pairwise stability between iterations
    print("\\nCalculating pairwise stability...")
    for i in range(n_iterations):
        for j in range(i+1, n_iterations):
            # Find common indices between two subsamples
            # Since we're using random sampling, we need to track indices
            # For simplicity, we'll calculate stability on the full dataset predictions
            pass
    
    # Summary statistics
    results['summary'] = {
        'mean_cost': np.mean(results['costs']),
        'std_cost': np.std(results['costs']),
        'mean_ari': np.mean(results['ari_scores']),
        'std_ari': np.std(results['ari_scores']),
        'mean_nmi': np.mean(results['nmi_scores']),
        'std_nmi': np.std(results['nmi_scores']),
        'reference_centroids': ref_centroids,
        'reference_clusters': ref_clusters
    }
    
    return results

In [ ]:
# Advanced Stability Metrics with Pairwise Comparisons

def calculate_pairwise_stability(df: pd.DataFrame,
                               feature_cols: list,
                               n_clusters: int = 2,
                               n_iterations: int = 10,
                               subsample_ratio: float = 0.8,
                               random_state: int = 42):
    """
    Calculate pairwise stability between multiple k-modes runs with bootstrap confidence intervals.
    """
    
    np.random.seed(random_state)
    
    # Prepare data
    data = df[feature_cols].copy()
    
    # Encode categorical variables
    for col in data.columns:
        if data[col].dtype == 'object' or data[col].dtype.name == 'category':
            le = LabelEncoder()
            data[col] = data[col].fillna(data[col].mode()[0] if len(data[col].mode()) > 0 else 'missing')
            data[col] = le.fit_transform(data[col])
        else:
            data[col] = data[col].fillna(data[col].mean())
    
    # Store sample indices and cluster assignments
    sample_indices = []
    cluster_assignments = []
    centroids = []
    
    print(f"Running {n_iterations} k-modes iterations...")
    
    for i in range(n_iterations):
        # Subsample data
        n_samples = int(len(data) * subsample_ratio)
        sample_idx = np.random.choice(len(data), n_samples, replace=False)
        # Sort indices to use searchsorted later
        sample_idx = np.sort(sample_idx)
        sample_indices.append(sample_idx)
        
        data_sample = data.iloc[sample_idx]
        
        # Run k-modes
        km = KModes(n_clusters=n_clusters, init='Huang', n_init=5, verbose=0)
        clusters = km.fit_predict(data_sample)
        
        cluster_assignments.append(clusters)
        centroids.append(km.cluster_centroids_)
        
        if (i + 1) % 5 == 0:
            print(f"  Completed iteration {i+1}/{n_iterations}")
    
    # Calculate pairwise stability
    print("\\nCalculating pairwise stability metrics...")
    pairwise_ari = np.zeros((n_iterations, n_iterations))
    pairwise_nmi = np.zeros((n_iterations, n_iterations))
    
    for i in range(n_iterations):
        for j in range(i, n_iterations):
            if i == j:
                pairwise_ari[i, j] = 1.0
                pairwise_nmi[i, j] = 1.0
            else:
                # Find common indices
                common_idx = np.intersect1d(sample_indices[i], sample_indices[j])
                
                if len(common_idx) > 0:
                    # Get positions in respective samples
                    pos_i = np.searchsorted(sample_indices[i], common_idx)
                    pos_j = np.searchsorted(sample_indices[j], common_idx)
                    
                    # Compare cluster assignments for common samples
                    clusters_i = cluster_assignments[i][pos_i]
                    clusters_j = cluster_assignments[j][pos_j]
                    
                    ari = adjusted_rand_score(clusters_i, clusters_j)
                    nmi = normalized_mutual_info_score(clusters_i, clusters_j)
                    
                    pairwise_ari[i, j] = pairwise_ari[j, i] = ari
                    pairwise_nmi[i, j] = pairwise_nmi[j, i] = nmi
    
    # Calculate centroid stability
    centroid_distances = np.zeros((n_iterations, n_iterations))
    for i in range(n_iterations):
        for j in range(i, n_iterations):
            if i == j:
                centroid_distances[i, j] = 0.0
            else:
                # Calculate Hamming distance between centroids
                dist = 0
                for k in range(n_clusters):
                    dist += np.sum(centroids[i][k] != centroids[j][k])
                centroid_distances[i, j] = centroid_distances[j, i] = dist / (n_clusters * len(feature_cols))
    
    return {
        'pairwise_ari': pairwise_ari,
        'pairwise_nmi': pairwise_nmi,
        'centroid_distances': centroid_distances,
        'mean_ari': np.mean(pairwise_ari[np.triu_indices(n_iterations, k=1)]),
        'std_ari': np.std(pairwise_ari[np.triu_indices(n_iterations, k=1)]),
        'mean_nmi': np.mean(pairwise_nmi[np.triu_indices(n_iterations, k=1)]),
        'std_nmi': np.std(pairwise_nmi[np.triu_indices(n_iterations, k=1)]),
        'mean_centroid_dist': np.mean(centroid_distances[np.triu_indices(n_iterations, k=1)]),
        'std_centroid_dist': np.std(centroid_distances[np.triu_indices(n_iterations, k=1)])
    }

In [12]:
# Visualization Functions for Stability Results

def visualize_stability_results(results, title_prefix=""):
    """
    Create comprehensive visualizations for k-modes stability analysis.
    """
    
    # Set up the figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle(f'{title_prefix}K-Modes Clustering Stability Analysis', fontsize=16)
    
    # 1. Stability Metrics Over Iterations (if from subsample_kmode_stability)
    if 'ari_scores' in results:
        ax = axes[0, 0]
        iterations = range(1, len(results['ari_scores']) + 1)
        
        ax.plot(iterations, results['ari_scores'], 'o-', label='ARI', markersize=8)
        ax.plot(iterations, results['nmi_scores'], 's-', label='NMI', markersize=8)
        
        # Add mean lines
        ax.axhline(y=results['summary']['mean_ari'], color='blue', linestyle='--', alpha=0.5)
        ax.axhline(y=results['summary']['mean_nmi'], color='orange', linestyle='--', alpha=0.5)
        
        ax.set_xlabel('Iteration')
        ax.set_ylabel('Score')
        ax.set_title('Stability Metrics vs Reference Clustering')
        ax.legend()
        ax.grid(True, alpha=0.3)
        ax.set_ylim(0, 1.05)
    
    # 2. Pairwise Stability Heatmap (if from calculate_pairwise_stability)
    if 'pairwise_ari' in results:
        ax = axes[0, 1]
        sns.heatmap(results['pairwise_ari'], annot=True, fmt='.2f', cmap='YlOrRd', 
                   vmin=0, vmax=1, square=True, ax=ax, cbar_kws={'label': 'ARI'})
        ax.set_title('Pairwise ARI Between Iterations')
        ax.set_xlabel('Iteration')
        ax.set_ylabel('Iteration')
    
    # 3. Distribution of Stability Scores
    if 'pairwise_ari' in results:
        ax = axes[1, 0]
        
        # Extract upper triangle values (excluding diagonal)
        n_iter = results['pairwise_ari'].shape[0]
        ari_values = results['pairwise_ari'][np.triu_indices(n_iter, k=1)]
        nmi_values = results['pairwise_nmi'][np.triu_indices(n_iter, k=1)]
        
        # Create violin plots
        data_to_plot = [ari_values, nmi_values]
        positions = [1, 2]
        
        parts = ax.violinplot(data_to_plot, positions=positions, showmeans=True, showmedians=True)
        
        # Customize colors
        colors = ['#3498db', '#e74c3c']
        for pc, color in zip(parts['bodies'], colors):
            pc.set_facecolor(color)
            pc.set_alpha(0.7)
        
        ax.set_xticks(positions)
        ax.set_xticklabels(['ARI', 'NMI'])
        ax.set_ylabel('Score')
        ax.set_title('Distribution of Pairwise Stability Scores')
        ax.set_ylim(0, 1.05)
        ax.grid(True, axis='y', alpha=0.3)
        
        # Add summary statistics
        ax.text(1, 0.05, f'μ={np.mean(ari_values):.3f}\\nσ={np.std(ari_values):.3f}', 
                ha='center', fontsize=10, bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
        ax.text(2, 0.05, f'μ={np.mean(nmi_values):.3f}\\nσ={np.std(nmi_values):.3f}', 
                ha='center', fontsize=10, bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    
    # 4. Centroid Stability
    if 'centroid_distances' in results:
        ax = axes[1, 1]
        
        # Create a histogram of centroid distances
        distances = results['centroid_distances'][np.triu_indices(results['centroid_distances'].shape[0], k=1)]
        
        ax.hist(distances, bins=20, edgecolor='black', alpha=0.7)
        ax.set_xlabel('Normalized Hamming Distance')
        ax.set_ylabel('Frequency')
        ax.set_title('Distribution of Centroid Distances')
        ax.grid(True, axis='y', alpha=0.3)
        
        # Add mean line
        mean_dist = np.mean(distances)
        ax.axvline(x=mean_dist, color='red', linestyle='--', linewidth=2, 
                  label=f'Mean = {mean_dist:.3f}')
        ax.legend()
    
    # Adjust layout
    plt.tight_layout()
    
    # Print summary statistics
    print("\\n=== Stability Analysis Summary ===")
    if 'summary' in results:
        print(f"Mean ARI: {results['summary']['mean_ari']:.3f} ± {results['summary']['std_ari']:.3f}")
        print(f"Mean NMI: {results['summary']['mean_nmi']:.3f} ± {results['summary']['std_nmi']:.3f}")
    elif 'mean_ari' in results:
        print(f"Mean Pairwise ARI: {results['mean_ari']:.3f} ± {results['std_ari']:.3f}")
        print(f"Mean Pairwise NMI: {results['mean_nmi']:.3f} ± {results['std_nmi']:.3f}")
        print(f"Mean Centroid Distance: {results['mean_centroid_dist']:.3f} ± {results['std_centroid_dist']:.3f}")
    
    return fig

In [13]:
# Helper Functions for Quick Testing and Multi-Scale Analysis

def quick_stability_test(df: pd.DataFrame,
                        feature_cols: list,
                        n_clusters: int = 2,
                        n_iterations: int = 5,
                        subsample_size: int = 1000,  # Absolute sample size instead of ratio
                        random_state: int = 42):
    """
    Quick stability test with small absolute sample sizes for proof of concept.
    
    Parameters:
    -----------
    subsample_size : int
        Absolute number of samples to use (default 1000 for quick testing)
    """
    
    # Calculate the ratio based on absolute size
    total_rows = len(df)
    subsample_ratio = min(subsample_size / total_rows, 1.0)
    
    print(f"\\n=== Quick Stability Test ===")
    print(f"Total rows: {total_rows:,}")
    print(f"Sample size: {subsample_size:,} ({subsample_ratio*100:.1f}% of data)")
    print(f"Iterations: {n_iterations}")
    print(f"Clusters: {n_clusters}\\n")
    
    # Run the pairwise stability analysis with small samples
    results = calculate_pairwise_stability(
        df,
        feature_cols,
        n_clusters=n_clusters,
        n_iterations=n_iterations,
        subsample_ratio=subsample_ratio,
        random_state=random_state
    )
    
    return results


def multi_scale_stability_analysis(df: pd.DataFrame,
                                 feature_cols: list,
                                 n_clusters: int = 2,
                                 sample_sizes: list = [500, 1000, 5000, 10000],
                                 n_iterations: int = 10,
                                 random_state: int = 42):
    """
    Run stability analysis across multiple sample sizes to understand scaling behavior.
    """
    
    results_by_size = {}
    
    for sample_size in sample_sizes:
        print(f"\\n{'='*50}")
        print(f"Testing with sample size: {sample_size:,}")
        
        results = quick_stability_test(
            df,
            feature_cols,
            n_clusters=n_clusters,
            n_iterations=n_iterations,
            subsample_size=sample_size,
            random_state=random_state
        )
        
        results_by_size[sample_size] = results
    
    # Create summary plot
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    
    # Plot 1: Stability metrics vs sample size
    sizes = list(results_by_size.keys())
    mean_aris = [results_by_size[s]['mean_ari'] for s in sizes]
    std_aris = [results_by_size[s]['std_ari'] for s in sizes]
    mean_nmis = [results_by_size[s]['mean_nmi'] for s in sizes]
    std_nmis = [results_by_size[s]['std_nmi'] for s in sizes]
    
    ax1.errorbar(sizes, mean_aris, yerr=std_aris, marker='o', label='ARI', capsize=5, markersize=8)
    ax1.errorbar(sizes, mean_nmis, yerr=std_nmis, marker='s', label='NMI', capsize=5, markersize=8)
    
    ax1.set_xlabel('Sample Size')
    ax1.set_ylabel('Mean Stability Score')
    ax1.set_title('Stability vs Sample Size')
    ax1.set_xscale('log')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    ax1.set_ylim(0, 1.05)
    
    # Plot 2: Centroid distance vs sample size
    mean_dists = [results_by_size[s]['mean_centroid_dist'] for s in sizes]
    std_dists = [results_by_size[s]['std_centroid_dist'] for s in sizes]
    
    ax2.errorbar(sizes, mean_dists, yerr=std_dists, marker='d', color='green', capsize=5, markersize=8)
    ax2.set_xlabel('Sample Size')
    ax2.set_ylabel('Mean Centroid Distance')
    ax2.set_title('Centroid Stability vs Sample Size')
    ax2.set_xscale('log')
    ax2.grid(True, alpha=0.3)
    
    plt.suptitle('K-Modes Stability Analysis Across Sample Sizes', fontsize=16)
    plt.tight_layout()
    
    return results_by_size

In [14]:
## Quick Test Run (Small Sample)

# This cell runs a quick stability test with small sample sizes
# Run this for rapid prototyping and testing

print("="*60)
print("QUICK TEST RUN - Small Sample Sizes")
print("="*60)

# Single quick test
quick_results = quick_stability_test(
    bfrss_raw_df,
    DEMOGRAPHIC_FEATURE_COLUMNS,
    n_clusters=2,
    n_iterations=5,
    subsample_size=1000  # Only 1000 samples
)

# Visualize results
visualize_stability_results(quick_results, "Quick Test - ")

# Multi-scale analysis with small samples
print("\\n" + "="*60)
print("Multi-Scale Analysis with Small Samples")
print("="*60)

multi_results_quick = multi_scale_stability_analysis(
    bfrss_raw_df,
    DEMOGRAPHIC_FEATURE_COLUMNS,
    n_clusters=2,
    sample_sizes=[500, 1000, 2000],  # Very small sizes
    n_iterations=5  # Fewer iterations
)

QUICK TEST RUN - Small Sample Sizes
\n=== Quick Stability Test ===
Total rows: 433,323
Sample size: 1,000 (0.2% of data)
Iterations: 5
Clusters: 2\n
Running 5 k-modes iterations...
  Completed iteration 5/5
\nCalculating pairwise stability metrics...


IndexError: index 999 is out of bounds for axis 0 with size 999

In [None]:
## Full Analysis Run (Large Sample)

# This cell runs a comprehensive stability analysis with larger sample sizes
# WARNING: This will take significantly longer to run

print("="*60)
print("FULL ANALYSIS RUN - Large Sample Sizes")
print("="*60)
print("WARNING: This analysis will take several minutes to complete")
print("="*60)

# Comprehensive stability test using subsample approach
full_subsample_results = subsample_kmode_stability(
    bfrss_raw_df,
    DEMOGRAPHIC_FEATURE_COLUMNS,
    n_clusters=2,
    n_iterations=20,  # More iterations
    subsample_ratio=0.8  # 80% of data
)

# Visualize subsample results
visualize_stability_results(full_subsample_results, "Full Subsample Analysis - ")

# Comprehensive pairwise stability analysis
print("\\n" + "="*60)
print("Pairwise Stability Analysis")
print("="*60)

full_pairwise_results = calculate_pairwise_stability(
    bfrss_raw_df,
    DEMOGRAPHIC_FEATURE_COLUMNS,
    n_clusters=2,
    n_iterations=15,
    subsample_ratio=0.8
)

# Visualize pairwise results
visualize_stability_results(full_pairwise_results, "Full Pairwise Analysis - ")

# Multi-scale analysis with larger samples
print("\\n" + "="*60)
print("Multi-Scale Analysis with Large Samples")
print("="*60)

multi_results_full = multi_scale_stability_analysis(
    bfrss_raw_df,
    DEMOGRAPHIC_FEATURE_COLUMNS,
    n_clusters=2,
    sample_sizes=[5000, 10000, 20000, 50000],  # Larger sizes
    n_iterations=10  # More iterations
)