### 1. Loading and Inspecting the cleaned data

In [16]:
import pandas as pd
import numpy as np

# Read the cleaned hospital readmission CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="data/hospital_readmission_clean.csv",
)

# Print (rows, columns), then preview the first five records.
print(df.shape)
df.head(n=5)

(8121, 12)


Unnamed: 0,Facility Name,Facility ID,State,Measure Name,Number of Discharges,Footnote,Excess Readmission Ratio,Predicted Readmission Rate,Expected Readmission Rate,Number of Readmissions,Start Date,End Date
0,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-AMI-HRRP,296.0,,0.9483,13.0146,13.7235,36,2020-07-01,2023-06-30
1,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-CABG-HRRP,151.0,,0.9509,9.6899,10.1898,13,2020-07-01,2023-06-30
2,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-HF-HRRP,681.0,,1.0597,21.5645,20.3495,151,2020-07-01,2023-06-30
3,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-PN-HRRP,490.0,,0.9715,16.1137,16.5863,77,2020-07-01,2023-06-30
4,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-COPD-HRRP,130.0,,0.933,15.4544,16.5637,16,2020-07-01,2023-06-30


In [17]:
# Columns retained for modeling: the facility identifier, the two
# categorical descriptors, the discharge/readmission counts, and the
# three rate/ratio measures.
cols = [
    "Facility ID", "State", "Measure Name",
    "Number of Discharges", "Number of Readmissions",
    "Predicted Readmission Rate", "Expected Readmission Rate",
    "Excess Readmission Ratio",
]

# Copy the selection so the narrowed frame owns its data, then show the
# dtype and non-null count of every retained column.
df = df.loc[:, cols].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8121 entries, 0 to 8120
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Facility ID                 8121 non-null   int64  
 1   State                       8121 non-null   object 
 2   Measure Name                8121 non-null   object 
 3   Number of Discharges        8121 non-null   float64
 4   Number of Readmissions      8121 non-null   int64  
 5   Predicted Readmission Rate  8121 non-null   float64
 6   Expected Readmission Rate   8121 non-null   float64
 7   Excess Readmission Ratio    8121 non-null   float64
dtypes: float64(4), int64(2), object(2)
memory usage: 507.7+ KB


### 2. Creating the target variable

In [18]:
# Binary label: 1 when the Excess Readmission Ratio is strictly greater
# than 1.0, otherwise 0 (a ratio of exactly 1.0 is labeled 0).
df["High_Readmit"] = df["Excess Readmission Ratio"].gt(1.0).astype(int)

# Share of rows in each class, as a quick balance check.
(
    df["High_Readmit"]
    .value_counts(normalize=True)
    .rename("Class_Share")
)

High_Readmit
1    0.548455
0    0.451545
Name: Class_Share, dtype: float64

### 3. Choosing features and target variable

In [25]:
from sklearn.model_selection import train_test_split

# Numeric predictors.
feature_cols_num = [
    "Number of Discharges",
    "Predicted Readmission Rate",
    "Expected Readmission Rate",
]

# Categorical predictors.
feature_cols_cat = ["State", "Measure Name"]

# Name of the label column (kept as a one-element list).
target_col = ["High_Readmit"]

# NOTE(review): possible target leakage. High_Readmit is derived from
# "Excess Readmission Ratio" > 1.0, and in the rows shown by df.head() that
# ratio equals Predicted / Expected (e.g. 13.0146 / 13.7235 ~= 0.9483).
# If that holds for every row, the label is fully determined by the two rate
# features below -- confirm whether both should remain in the feature set.
x = df[feature_cols_num + feature_cols_cat].copy()

# Select the single target column as a 1-D Series rather than an (n, 1)
# DataFrame: scikit-learn estimators expect a 1-D y and warn when handed a
# column vector.
y = df[target_col[0]].copy()


### 4. Train / Test split

In [33]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed seed so it can be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train shape:", x_train.shape, "Test shape:", x_test.shape)

# Report the share of each class in both partitions, formatted as percentages.
for _heading, _labels in (
    ("Class balance (train):", y_train),
    ("Class balance (test):", y_test),
):
    print(_heading)
    print(_labels.value_counts(normalize=True).rename("Share").map("{:.2%}".format))

Train shape: (6496, 5) Test shape: (1625, 5)
Class balance (train):
High_Readmit
1               54.85%
0               45.15%
Name: Share, dtype: object
Class balance (test):
High_Readmit
1               54.83%
0               45.17%
Name: Share, dtype: object


### 5. Preprocessing pipeline

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Column-wise preprocessing:
#   * "num": the numeric feature columns are forwarded unchanged.
#   * "cat": the categorical feature columns are one-hot encoded, with the
#     first level of each column dropped.
# NOTE(review): drop="first" is combined with handle_unknown="ignore".
# In recent scikit-learn releases an unseen category is then encoded as all
# zeros, i.e. the same as the dropped first level; older releases reject this
# combination -- confirm against the installed version.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", feature_cols_num),
        (
            "cat",
            OneHotEncoder(drop="first", handle_unknown="ignore"),
            feature_cols_cat,
        ),
    ],
)

# Creating a reusable pipeline shell
from sklearn.linear_model import LogisticRegression

logit_pipe = Pipeline