### 1. Importing the Libraries and Loading the Dataset

In [11]:
import pandas as pd
import numpy as np

# Load the FY 2025 Hospital Readmissions Reduction Program (HRRP) hospital-level CSV
# into the raw frame `df`; later cells derive `df_clean` from it.
# NOTE(review): "Facility ID" is inferred as an integer column, so any leading zeros in
# the IDs would be dropped — if the IDs are zero-padded codes, consider
# dtype={"Facility ID": str}; confirm against the data dictionary.
df = pd.read_csv("data/FY_2025_Hospital_Readmissions_Reduction_Program_Hospital.csv")
print(df.shape)  # (number of rows, number of columns)
df.head()  # preview the first five rows

(18510, 12)


Unnamed: 0,Facility Name,Facility ID,State,Measure Name,Number of Discharges,Footnote,Excess Readmission Ratio,Predicted Readmission Rate,Expected Readmission Rate,Number of Readmissions,Start Date,End Date
0,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-AMI-HRRP,296.0,,0.9483,13.0146,13.7235,36,07/01/2020,06/30/2023
1,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-CABG-HRRP,151.0,,0.9509,9.6899,10.1898,13,07/01/2020,06/30/2023
2,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-HF-HRRP,681.0,,1.0597,21.5645,20.3495,151,07/01/2020,06/30/2023
3,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-HIP-KNEE-HRRP,,,0.9654,4.268,4.4211,Too Few to Report,07/01/2020,06/30/2023
4,SOUTHEAST HEALTH MEDICAL CENTER,10001,AL,READM-30-PN-HRRP,490.0,,0.9715,16.1137,16.5863,77,07/01/2020,06/30/2023


### 2. Finding missing values and dropping them from the dataset

In [12]:
# Count the missing (NaN) entries in every column of the raw frame.
df.isna().sum()

Facility Name                     0
Facility ID                       0
State                             0
Measure Name                      0
Number of Discharges          10170
Footnote                      11927
Excess Readmission Ratio       6583
Predicted Readmission Rate     6583
Expected Readmission Rate      6583
Number of Readmissions         6583
Start Date                        0
End Date                          0
dtype: int64

In [13]:
# A row is kept only when every one of these measures is present.
required_columns = [
    'Excess Readmission Ratio',
    'Predicted Readmission Rate',
    'Expected Readmission Rate',
    'Number of Discharges',
]
has_all_required = df[required_columns].notna().all(axis=1)

# Copy so that later column assignments modify an independent frame, not a view of `df`.
df_clean = df.loc[has_all_required].copy()
print("Remaining rows:", len(df_clean))

Remaining rows: 8121


### 3. Checking the data types and converting columns to numeric or datetime where necessary

In [14]:
# Show each column's dtype; `object` columns hold text and may need conversion.
df_clean.dtypes

Facility Name                  object
Facility ID                     int64
State                          object
Measure Name                   object
Number of Discharges          float64
Footnote                      float64
Excess Readmission Ratio      float64
Predicted Readmission Rate    float64
Expected Readmission Rate     float64
Number of Readmissions         object
Start Date                     object
End Date                       object
dtype: object

In [15]:
# 'Number of Readmissions' is read as object dtype because the raw file mixes numeric
# text with status strings (the preview shows "Too Few to Report").
# List the known status strings explicitly so they are blanked out on purpose rather
# than only by the catch-all coercion below.
non_numeric_markers = ['Not Available', 'Too Few to Report']

readmissions_text = (
    df_clean['Number of Readmissions']
    .astype(str)
    .str.strip()
    # The comma is a literal thousands separator, so no regex engine is needed.
    .str.replace(',', '', regex=False)
    .replace(non_numeric_markers, np.nan)
)
# errors='coerce' still turns any other unparseable text into NaN.
df_clean['Number of Readmissions'] = pd.to_numeric(readmissions_text, errors='coerce')

# The dates are written as MM/DD/YYYY (e.g. 06/30/2023). Passing the format explicitly
# removes any day/month ambiguity and avoids per-value format inference; values that do
# not match become NaT.
date_format = '%m/%d/%Y'
for date_column in ['Start Date', 'End Date']:
    df_clean[date_column] = pd.to_datetime(
        df_clean[date_column], format=date_format, errors='coerce'
    )

In [16]:
# Cast the identifier-like text columns to str and trim surrounding whitespace.
text_columns = ("Facility Name", "State", "Measure Name")
for column in text_columns:
    as_text = df_clean[column].astype(str)
    df_clean[column] = as_text.str.strip()

# Upper-case the state codes so differently-cased spellings collapse to a single value.
upper_states = df_clean["State"].str.upper()
df_clean["State"] = upper_states

# Number of distinct state codes after normalisation.
df_clean['State'].nunique()

51

In [17]:
# Save the cleaned frame to CSV for later EDA and predictive modeling.
# index=False leaves the DataFrame index out of the file.
# NOTE: CSV does not store dtypes, so 'Start Date' / 'End Date' are written as text and
# must be parsed again (e.g. with parse_dates) when this file is read back.

cleaned_csv = "data/hospital_readmission_clean.csv"
df_clean.to_csv(cleaned_csv, index=False)
print("Saved:", cleaned_csv)

Saved: data/hospital_readmission_clean.csv
