## A3. Data Preprocessing – health outcome 

**Description**  
This section preprocesses longitudinal health outcome data from CMS. It additionally adds a feature for the hospital-level interoperability score.  

**Data Sources**  
- https://www.cms.gov/medicare/quality/initiatives/hospital-quality-initiative/hospital-compare


**Purpose**  
To prepare longitudinal health outcome data for hospital-level analysis.
   

**Disclaimer**  
This codebase was partially cleaned and annotated using OpenAI’s ChatGPT-4o.  
Please review and validate before use in critical workflows.

**Notebook workflow**  
0. Load necessary libraries 
1. Load longitudinal health outcome data 
2. Merge longitudinal health outcome data 


### A3_0 Load necessary libraries 

In [11]:
# Import necessary libraries
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
import warnings
from scipy import stats
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as mpatches
import os
import getpass  
import re 
import json 
import sys 


### A3_1 Load longitudinal health outcome data 

In [12]:
# Snapshot labels (MM_YYYY) of the longitudinal health outcome data, oldest first.
# 2022-2024 each contribute four snapshots (01, 04, 07, 10); the two 2025
# snapshots are listed explicitly because the first one is labelled 02, not 01.
data_order = [
    f"{month}_{year}"
    for year in ("2022", "2023", "2024")
    for month in ("01", "04", "07", "10")
] + ["02_2025", "04_2025"]

In [None]:
# Names of the continuous health outcome variables.
# The entries are laid out by name prefix purely for readability; the order of
# the list itself is unchanged.
hospital_quality_outcomes = [
    # COMP_* / MORT_30_*
    "COMP_HIP_KNEE",
    "MORT_30_AMI", "MORT_30_CABG", "MORT_30_COPD",
    "MORT_30_HF", "MORT_30_PN", "MORT_30_STK",
    # PSI_*
    "PSI_03", "PSI_04", "PSI_06", "PSI_08", "PSI_09", "PSI_10",
    "PSI_11", "PSI_12", "PSI_13", "PSI_14", "PSI_15", "PSI_90",
    # Total HAC Score
    "Total HAC Score",
    # READM-30-*-HRRP
    "READM-30-AMI-HRRP", "READM-30-CABG-HRRP", "READM-30-COPD-HRRP",
    "READM-30-HF-HRRP", "READM-30-HIP-KNEE-HRRP", "READM-30-PN-HRRP",
    # MSPB
    "MSPB-1",
    # EDV .. VTE_2
    "EDV", "ED_2_Strata_1", "ED_2_Strata_2",
    "HCP_COVID_19",
    "HH_01", "HH_02",
    "IMM_3",
    "OP_18b", "OP_18c", "OP_22", "OP_23", "OP_29", "OP_31", "OP_40",
    "SAFE_USE_OF_OPIOIDS",
    "SEP_1", "SEP_SH_3HR", "SEP_SH_6HR", "SEV_SEP_3HR", "SEV_SEP_6HR",
    "STK_02", "STK_03", "STK_05", "STK_06",
    "VTE_1", "VTE_2",
    # EDAC_30_* / OP_3x / READM_30_*
    "EDAC_30_AMI", "EDAC_30_HF", "EDAC_30_PN",
    "OP_32", "OP_35_ADM", "OP_35_ED", "OP_36",
    "READM_30_AMI", "READM_30_CABG", "READM_30_COPD", "READM_30_HF",
    "READM_30_HIP_KNEE", "READM_30_HOSP_WIDE", "READM_30_PN",
]

# Independent copy of the list so it can be modified without touching the original
outcome_columns = list(hospital_quality_outcomes)


### A3_2 Merge longitudinal health outcome data 

In [13]:
def _normalize_facility_id(df):
    """Rename any case variant of 'Facility ID' to that spelling and zero-pad its values to 6 characters."""
    if 'Facility ID' not in df.columns:
        variants = [col for col in df.columns if str(col).lower() == 'facility id']
        if not variants:
            # No facility identifier in this file; nothing to normalise
            return df
        # Use the canonical spelling so column selection and pivoting can rely on it
        df = df.rename(columns={variants[0]: 'Facility ID'})
    df['Facility ID'] = df['Facility ID'].astype(str).str.zfill(6)
    return df


def _select_columns(df, wanted, label):
    """Keep the requested columns that exist in df (plus 'time_point'), warning about missing ones."""
    present = [col for col in wanted if col in df.columns]
    missing = [col for col in wanted if col not in df.columns]
    if missing:
        print(f"Warning: columns not found in {label}: {missing}")
    # The snapshot label must survive the selection; it is needed after concatenation
    if 'time_point' not in present:
        present.append('time_point')
    return df[present]


def load_and_merge_time_series_data(base_path, time_points, file_dict, column_selections=None, pivot_configs=None):
    """
    Load and merge datasets from multiple time points with column selection and pivoting.

    For every file type in ``file_dict``, the CSV in each
    ``<base_path>/hospitals_<time_point>/`` folder is read, tagged with a
    ``time_point`` column, optionally reduced to the requested columns, and
    stacked into one long DataFrame. The stacked frame is then optionally
    pivoted with ``DataFrame.pivot_table``.

    Parameters
    ----------
    base_path : str
        Directory containing one ``hospitals_<time_point>`` folder per snapshot.
    time_points : iterable of str
        Snapshot labels; the part after the first underscore is used as the
        year (e.g. '01_2022' -> '2022').
    file_dict : dict
        Maps a file type to its CSV file name. A name containing the
        ``{year}`` placeholder has the snapshot year substituted into it.
    column_selections : dict, optional
        Maps a file type to the list of columns to keep. Requested columns
        that are absent from a file are reported and skipped; ``time_point``
        is always kept.
    pivot_configs : dict, optional
        Maps a file type to a dict with the keys 'index', 'columns', 'values'
        and optionally 'aggfunc' (default 'first'), passed to ``pivot_table``.

    Returns
    -------
    dict
        Maps each file type for which at least one file was found to its
        merged DataFrame. File types with no file at any time point are
        omitted.

    Notes
    -----
    Missing files and pivot failures are reported with ``print`` and do not
    abort the run; after a pivot failure the unpivoted data is returned for
    that file type.
    """
    merged_data = {}

    for file_type, pattern in file_dict.items():
        all_time_data = []

        for time_point in time_points:
            # Construct folder path with format 'hospitals_{time_point}'
            folder_path = os.path.join(base_path, f'hospitals_{time_point}')

            # Extract year from time_point (e.g., '01_2022' -> '2022')
            year = time_point.split('_')[1]

            # Substitute the year only where the file name asks for it
            file_name = pattern.format(year=year) if '{year}' in pattern else pattern
            file_path = os.path.join(folder_path, file_name)

            # Skip snapshots that do not ship this file
            if not os.path.exists(file_path):
                print(f"Warning: File not found: {file_path}")
                continue

            df = pd.read_csv(file_path, low_memory=False, encoding='utf-8')
            df['time_point'] = time_point
            df = _normalize_facility_id(df)

            if column_selections and file_type in column_selections:
                df = _select_columns(
                    df, column_selections[file_type], f"{file_type} ({time_point})"
                )

            all_time_data.append(df)

        # Merge all time points for this file type
        if all_time_data:
            merged_df = pd.concat(all_time_data, ignore_index=True)

            # Apply pivot if specified
            if pivot_configs and file_type in pivot_configs:
                pivot_config = pivot_configs[file_type]
                try:
                    merged_df = merged_df.pivot_table(
                        index=pivot_config['index'],
                        columns=pivot_config['columns'],
                        values=pivot_config['values'],
                        aggfunc=pivot_config.get('aggfunc', 'first')
                    ).reset_index()
                except Exception as e:
                    # Deliberately best-effort: keep the long-format data rather than lose the file type
                    print(f"Error pivoting {file_type}: {str(e)}")

            merged_data[file_type] = merged_df
            print(f"Successfully merged {len(all_time_data)} time points for {file_type}")

    return merged_data

In [15]:
# File name of each dataset inside a snapshot folder, keyed by file type.
# The HAC and readmission files carry the year in their name, so their
# entries use a {year} placeholder that is filled in per snapshot.
file_dict = {
    "general_hospital_info": "Hospital_General_Information.csv",
    "death_complication": "Complications_and_Deaths-Hospital.csv",
    "HAC_reduction": "FY_{year}_HAC_Reduction_Program_Hospital.csv",
    "readmission": "FY_{year}_Hospital_Readmissions_Reduction_Program_Hospital.csv",
    "Medicare_Hospital_Spending_Per_Patient": "Medicare_Hospital_Spending_Per_Patient-Hospital.csv",
    "Timely_and_Effective_Care": "Timely_and_Effective_Care-Hospital.csv",
    "Unplanned_Hospital_Visits": "Unplanned_Hospital_Visits-Hospital.csv",
}

In [16]:
# Columns to keep from each file type.
# Four of the files share the same three columns, so their entries are
# generated together; each one still gets its own list object.
column_selections = {
    "general_hospital_info": [
        "Facility ID",
        "Hospital overall rating",
        "Count of Facility MORT Measures",
        "Count of MORT Measures Better",
        "Count of MORT Measures Worse",
        "Count of Facility Safety Measures",
        "Count of Safety Measures Better",
        "Count of Safety Measures Worse",
        "Count of Facility READM Measures",
        "Count of READM Measures Better",
        "Count of READM Measures Worse",
    ],
    "HAC_reduction": ["Facility ID", "Total HAC Score"],
    "readmission": ["Facility ID", "Measure Name", "Excess Readmission Ratio"],
    **{
        measure_file: ["Facility ID", "Measure ID", "Score"]
        for measure_file in (
            "death_complication",
            "Medicare_Hospital_Spending_Per_Patient",
            "Timely_and_Effective_Care",
            "Unplanned_Hospital_Visits",
        )
    },
}

In [17]:
# Pivot configuration for the files that need pivoting.
# Every configuration uses one row per (Facility ID, time_point) and keeps the
# first value per cell; only the column that supplies the new column labels
# and the column that supplies the values differ between files.
pivot_configs = {
    pivot_file: {
        "index": ["Facility ID", "time_point"],
        "columns": label_column,
        "values": value_column,
        "aggfunc": "first",
    }
    for pivot_file, label_column, value_column in (
        ("death_complication", "Measure ID", "Score"),
        ("readmission", "Measure Name", "Excess Readmission Ratio"),
        ("Medicare_Hospital_Spending_Per_Patient", "Measure ID", "Score"),
        ("Timely_and_Effective_Care", "Measure ID", "Score"),
        ("Unplanned_Hospital_Visits", "Measure ID", "Score"),
    )
}

In [None]:

# Load every snapshot listed in data_order and merge it per file type,
# passing along the column selections and pivot configurations.
# NOTE(review): base_path is not defined in the cells shown here - confirm it
# is set to the folder holding the hospitals_<MM_YYYY> sub-folders before running.
merged_data = load_and_merge_time_series_data(
    base_path=base_path,
    time_points=data_order,
    file_dict=file_dict,
    column_selections=column_selections,
    pivot_configs=pivot_configs,
)


In [None]:
# Save each merged DataFrame to its own CSV file
output_dir = "./data/outcomes"
# to_csv does not create missing folders, so make sure the target exists first
os.makedirs(output_dir, exist_ok=True)
for file_type, df in merged_data.items():
    output_path = f"{output_dir}/merged_{file_type}.csv"
    df.to_csv(output_path, index=False)
    print(f"Saved {file_type} to {output_path}")