In [2]:
import pandas as pd
import datetime as dt

df = pd.read_csv('FY_2025_Hospital_Readmissions_Reduction_Program_Hospital.csv')

In [3]:
df

Unnamed: 0,Facility Name,Facility ID,State,Measure Name,Number of Discharges,Footnote,Excess Readmission Ratio,Predicted Readmission Rate,Expected Readmission Rate,Number of Readmissions,Start Date,End Date
0,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-AMI-HRRP,296.0,,0.9483,13.0146,13.7235,36,07/01/2020,06/30/2023
1,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-CABG-HRRP,151.0,,0.9509,9.6899,10.1898,13,07/01/2020,06/30/2023
2,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-HF-HRRP,681.0,,1.0597,21.5645,20.3495,151,07/01/2020,06/30/2023
3,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-HIP-KNEE-HRRP,,,0.9654,4.2680,4.4211,Too Few to Report,07/01/2020,06/30/2023
4,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-PN-HRRP,490.0,,0.9715,16.1137,16.5863,77,07/01/2020,06/30/2023
...,...,...,...,...,...,...,...,...,...,...,...,...
18505,EXCEPTIONAL COMMUNITY HOSPITAL LUBBOCK,670327,TX,READM-30-HIP-KNEE-HRRP,,5.0,,,,,07/01/2020,06/30/2023
18506,EXCEPTIONAL COMMUNITY HOSPITAL LUBBOCK,670327,TX,READM-30-HF-HRRP,,5.0,,,,,07/01/2020,06/30/2023
18507,EXCEPTIONAL COMMUNITY HOSPITAL LUBBOCK,670327,TX,READM-30-COPD-HRRP,,5.0,,,,,07/01/2020,06/30/2023
18508,EXCEPTIONAL COMMUNITY HOSPITAL LUBBOCK,670327,TX,READM-30-CABG-HRRP,,5.0,,,,,07/01/2020,06/30/2023


In [4]:
df['Start Date'] = pd.to_datetime(df['Start Date'], format="%m/%d/%Y")
df['End Date'] = pd.to_datetime(df['End Date'], format="%m/%d/%Y")
df['stay_length'] = (df['End Date'] - df['Start Date']).dt.days

df = df.drop(['Footnote', 'Facility ID', 'Facility Name', 'End Date', 'Start Date'], axis=1)

df['Number of Readmissions'] = df['Number of Readmissions'].replace('Too Few to Report', 0)

df['Number of Readmissions'] = df['Number of Readmissions'].astype(float)

In [5]:
correlation_matrix = df.corr(numeric_only=True)
correlation_matrix

Unnamed: 0,Number of Discharges,Excess Readmission Ratio,Predicted Readmission Rate,Expected Readmission Rate,Number of Readmissions,stay_length
Number of Discharges,1.0,-0.144654,-0.018601,0.009785,0.894155,
Excess Readmission Ratio,-0.144654,1.0,0.203273,0.021479,0.122949,
Predicted Readmission Rate,-0.018601,0.203273,1.0,0.978263,0.41005,
Expected Readmission Rate,0.009785,0.021479,0.978263,1.0,0.387094,
Number of Readmissions,0.894155,0.122949,0.41005,0.387094,1.0,
stay_length,,,,,,


In [6]:
na_counts = df.isna().sum()
print(na_counts)
print()
skew_values = df.iloc[:, 2:].skew()
print(skew_values)

State                             0
Measure Name                      0
Number of Discharges          10170
Excess Readmission Ratio       6583
Predicted Readmission Rate     6583
Expected Readmission Rate      6583
Number of Readmissions         6583
stay_length                       0
dtype: int64

Number of Discharges          3.388118
Excess Readmission Ratio      0.393122
Predicted Readmission Rate   -0.753145
Expected Readmission Rate    -0.885156
Number of Readmissions        3.571046
stay_length                   0.000000
dtype: float64


In [7]:
discharges_mean = df['Number of Discharges'].mean()
df['Number of Discharges'] = df['Number of Discharges'].fillna(discharges_mean)

excess_median = df['Excess Readmission Ratio'].median()
df['Excess Readmission Ratio'] = df['Excess Readmission Ratio'].fillna(excess_median)

predicted_median = df['Predicted Readmission Rate'].median()
df['Predicted Readmission Rate'] = df['Predicted Readmission Rate'].fillna(predicted_median)

expected_median = df['Expected Readmission Rate'].median()
df['Expected Readmission Rate'] = df['Expected Readmission Rate'].fillna(expected_median)

number_of_mean = df['Number of Readmissions'].mean()
df['Number of Readmissions'] = df['Number of Readmissions'].fillna(number_of_mean)

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X = df.drop('Excess Readmission Ratio', axis=1)
y = df['Excess Readmission Ratio']  

X = pd.get_dummies(X, columns=['Measure Name', 'State'], drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [9]:
from sklearn.model_selection import cross_val_score, KFold

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rf, X, y, cv=cv, scoring='r2')

print("CV scores:", scores)
print("Average R^2:", scores.mean())

CV scores: [0.99026972 0.98696497 0.98290469 0.9878133  0.98673278]
Average R^2: 0.9869370923852667


In [13]:
importances = rf.feature_importances_

feature_names = X.columns

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
1,Predicted Readmission Rate,0.563069
2,Expected Readmission Rate,0.414184
0,Number of Discharges,0.012387
3,Number of Readmissions,0.004825
8,Measure Name_READM-30-HIP-KNEE-HRRP,0.000655
12,State_AZ,0.000404
5,Measure Name_READM-30-CABG-HRRP,0.000402
13,State_CA,0.000353
18,State_FL,0.000226
52,State_TX,0.000222


In [14]:
dummy_cols = [col for col in X.columns if col.startswith('State_') or col.startswith('Measure Name_')]
X[dummy_cols].sum().sort_values(ascending=False)

Measure Name_READM-30-CABG-HRRP        3085
Measure Name_READM-30-HF-HRRP          3085
Measure Name_READM-30-HIP-KNEE-HRRP    3085
Measure Name_READM-30-PN-HRRP          3085
Measure Name_READM-30-COPD-HRRP        3085
State_TX                               1704
State_CA                               1674
State_FL                               1002
State_PA                                804
State_NY                                792
State_OH                                714
State_IL                                696
State_GA                                576
State_MI                                546
State_NC                                492
State_LA                                486
State_AL                                486
State_IN                                486
State_TN                                480
State_OK                                456
State_VA                                426
State_WI                                390
State_MO                        