In [1]:
# Import our dependencies
import pandas as pd
from pathlib import Path
import numpy as np

In [2]:
# Read the FY 2025 Hospital Readmissions Reduction Program hospital file
# (path is relative to the notebook's working directory) and preview it
hrrp_csv_path = Path("Resources/FY_2025_Hospital_Readmissions_Reduction_Program_Hospital.csv")
df = pd.read_csv(hrrp_csv_path)
df.head()

Unnamed: 0,Facility Name,Facility ID,State,Measure Name,Number of Discharges,Footnote,Excess Readmission Ratio,Predicted Readmission Rate,Expected Readmission Rate,Number of Readmissions,Start Date,End Date
0,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-AMI-HRRP,296.0,,0.9483,13.0146,13.7235,36,07/01/2020,06/30/2023
1,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-CABG-HRRP,151.0,,0.9509,9.6899,10.1898,13,07/01/2020,06/30/2023
2,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-HF-HRRP,681.0,,1.0597,21.5645,20.3495,151,07/01/2020,06/30/2023
3,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-HIP-KNEE-HRRP,,,0.9654,4.268,4.4211,Too Few to Report,07/01/2020,06/30/2023
4,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-PN-HRRP,490.0,,0.9715,16.1137,16.5863,77,07/01/2020,06/30/2023


In [3]:
# Cleaning the data
# Columns retained for the cleaned dataset; 'Measure Name', 'Footnote',
# 'Start Date' and 'End Date' from the raw file are left out.
kept_columns = [
    'Facility Name',
    'Facility ID',
    'State',
    'Number of Discharges',
    'Excess Readmission Ratio',
    'Predicted Readmission Rate',
    'Expected Readmission Rate',
    'Number of Readmissions',
]
working_df = df[kept_columns]
working_df.head()

Unnamed: 0,Facility Name,Facility ID,State,Number of Discharges,Excess Readmission Ratio,Predicted Readmission Rate,Expected Readmission Rate,Number of Readmissions
0,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,296.0,0.9483,13.0146,13.7235,36
1,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,151.0,0.9509,9.6899,10.1898,13
2,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,681.0,1.0597,21.5645,20.3495,151
3,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,,0.9654,4.268,4.4211,Too Few to Report
4,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,490.0,0.9715,16.1137,16.5863,77


In [4]:
# State postal abbreviations grouped by census division.
# DC is included alongside the states (see s_atlantic).
new_england = [
    'CT', 'ME', 'MA', 'NH', 'RI', 'VT',
]
middle_atlantic = [
    'NJ', 'NY', 'PA',
]
e_n_central = [
    'IL', 'IN', 'MI', 'OH', 'WI',
]
w_n_central = [
    'IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD',
]
s_atlantic = [
    'DE', 'DC', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'WV',
]
e_s_central = [
    'AL', 'KY', 'MS', 'TN',
]
w_s_central = [
    'AR', 'LA', 'OK', 'TX',
]
mountain = [
    'AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY',
]
pacific = [
    'AK', 'CA', 'HI', 'OR', 'WA',
]

# Division names grouped by census region. These strings must match the
# labels returned by get_division exactly, because get_region tests membership.
northeast = [
    'New England', 'Middle Atlantic',
]
midwest = [
    'East North Central', 'West North Central',
]
south = [
    'South Atlantic', 'East South Central', 'West South Central',
]
west = [
    'Mountain', 'Pacific',
]

# Function to get division
def get_division(state):
    """Return the census division name for a state abbreviation.

    Parameters
    ----------
    state : value compared against the module-level division lists
        (e.g. 'AL'). Matching is exact, so case and whitespace matter.

    Returns
    -------
    str or None
        The division label (e.g. 'East South Central'), or None when the
        value appears in none of the division lists.
    """
    # Checked in the same order as the division lists are declared;
    # the first list containing the state wins.
    division_lookup = (
        ('New England', new_england),
        ('Middle Atlantic', middle_atlantic),
        ('East North Central', e_n_central),
        ('West North Central', w_n_central),
        ('South Atlantic', s_atlantic),
        ('East South Central', e_s_central),
        ('West South Central', w_s_central),
        ('Mountain', mountain),
        ('Pacific', pacific),
    )
    for label, members in division_lookup:
        if state in members:
            return label
    return None  # Handle cases where the state is not found

# Function to get region
def get_region(division):
    """Return the census region name for a census division label.

    Parameters
    ----------
    division : value compared against the module-level region lists
        (e.g. 'Mountain'). Matching is exact.

    Returns
    -------
    str or None
        'Northeast', 'Midwest', 'South' or 'West', or None when the value
        appears in none of the region lists (including a None division).
    """
    region_lookup = (
        ('Northeast', northeast),
        ('Midwest', midwest),
        ('South', south),
        ('West', west),
    )
    for label, divisions in region_lookup:
        if division in divisions:
            return label
    return None

# Run both functions on working_df
# assign() returns a new DataFrame, so working_df itself is not modified.
# Region is derived from the Division column, so it is added second.
working_df2 = (
    working_df
    .assign(Division=lambda frame: frame['State'].apply(get_division))
    .assign(Region=lambda frame: frame['Division'].apply(get_region))
)

In [5]:
# Drop null values
# Removes every row that has a missing value in any column. This includes rows
# whose State matched no census division, because get_division / get_region
# return None for those.
# NOTE(review): dropna only removes missing values. Text placeholders such as
# 'Too Few to Report' in 'Number of Readmissions' are not treated as missing;
# confirm none remain after this step.
working_df2 = working_df2.dropna(axis=0, how='any')

print(f'Row Count: {len(working_df2)}')
working_df2.head()

Row Count: 8121


Unnamed: 0,Facility Name,Facility ID,State,Number of Discharges,Excess Readmission Ratio,Predicted Readmission Rate,Expected Readmission Rate,Number of Readmissions,Division,Region
0,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,296.0,0.9483,13.0146,13.7235,36,East South Central,South
1,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,151.0,0.9509,9.6899,10.1898,13,East South Central,South
2,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,681.0,1.0597,21.5645,20.3495,151,East South Central,South
4,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,490.0,0.9715,16.1137,16.5863,77,East South Central,South
5,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,130.0,0.933,15.4544,16.5637,16,East South Central,South


In [6]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file
working_df2.to_csv(Path('Resources/hospital_clean.csv'), index=False)