In [1]:
# STEP 1: Load the data
import pandas as pd

# The extract is decoded as Windows-1252. low_memory=False asks pandas to
# parse the file in a single pass rather than in chunks, which avoids
# mixed-type inference within a column.
cms = pd.read_csv(
    "data/inpatient.csv",
    encoding="windows-1252",
    low_memory=False,
)

row_total = len(cms)
column_names = list(cms.columns)

print(f"Total CMS rows: {row_total}")
print("\nColumns available:")
print(column_names)


Total CMS rows: 146427

Columns available:
['Rndrng_Prvdr_CCN', 'Rndrng_Prvdr_Org_Name', 'Rndrng_Prvdr_City', 'Rndrng_Prvdr_St', 'Rndrng_Prvdr_State_FIPS', 'Rndrng_Prvdr_Zip5', 'Rndrng_Prvdr_State_Abrvtn', 'Rndrng_Prvdr_RUCA', 'Rndrng_Prvdr_RUCA_Desc', 'DRG_Cd', 'DRG_Desc', 'Tot_Dschrgs', 'Avg_Submtd_Cvrd_Chrg', 'Avg_Tot_Pymt_Amt', 'Avg_Mdcr_Pymt_Amt']


In [2]:
# Preview the DRG descriptions present in the loaded table.
all_drg_desc = cms['DRG_Desc']

print("\nUnique DRGs (first 20):")
print(all_drg_desc.unique()[:20])

# nunique() ignores missing values, so it is used for the count instead of
# len(unique()).
unique_drg_total = all_drg_desc.nunique()
print(f"\nTotal unique DRGs: {unique_drg_total}")


Unique DRGs (first 20):
['ECMO OR TRACHEOSTOMY WITH MV >96 HOURS OR PRINCIPAL DIAGNOSIS EXCEPT FACE, MOUTH AND NEC'
 'CRANIOTOMY WITH MAJOR DEVICE IMPLANT OR ACUTE COMPLEX CNS PRINCIPAL DIAGNOSIS WITH MCC O'
 'CRANIOTOMY WITH MAJOR DEVICE IMPLANT OR ACUTE COMPLEX CNS PRINCIPAL DIAGNOSIS WITHOUT MC'
 'CRANIOTOMY AND ENDOVASCULAR INTRACRANIAL PROCEDURES WITH MCC'
 'EXTRACRANIAL PROCEDURES WITH CC'
 'EXTRACRANIAL PROCEDURES WITHOUT CC/MCC'
 'DEGENERATIVE NERVOUS SYSTEM DISORDERS WITHOUT MCC'
 'INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION WITH MCC'
 'INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION WITH CC OR TPA IN 24 HOURS'
 'TRANSIENT ISCHEMIA WITHOUT THROMBOLYTIC'
 'TRAUMATIC STUPOR AND COMA <1 HOUR WITH MCC'
 'OTHER DISORDERS OF NERVOUS SYSTEM WITH CC' 'SEIZURES WITH MCC'
 'SEIZURES WITHOUT MCC' 'MAJOR CHEST PROCEDURES WITH MCC'
 'OTHER RESPIRATORY SYSTEM O.R. PROCEDURES WITH MCC'
 'RESPIRATORY INFECTIONS AND INFLAMMATIONS WITH MCC'
 'RESPIRATORY INFECTIONS AND INFLAMMATIONS WITH CC'

In [3]:
# STEP 2: Keep only the 3 diseases we want

# Survey which DRG descriptions match each disease before filtering.
# Matching is a case-insensitive substring/regex search; na=False counts a
# missing description as "no match".
drg_desc = cms['DRG_Desc']
hf_mask = drg_desc.str.contains('HEART FAILURE', case=False, na=False)
copd_mask = drg_desc.str.contains('COPD|CHRONIC OBSTRUCTIVE', case=False, na=False)
diabetes_mask = drg_desc.str.contains('DIABETES', case=False, na=False)

# Distinct descriptions matched per disease.
hf_drgs = drg_desc[hf_mask].unique()
copd_drgs = drg_desc[copd_mask].unique()
diabetes_drgs = drg_desc[diabetes_mask].unique()

for heading, matched_drgs in (
    ("DRGs containing 'HEART FAILURE':", hf_drgs),
    ("\nDRGs containing 'COPD' or 'CHRONIC OBSTRUCTIVE':", copd_drgs),
    ("\nDRGs containing 'DIABETES':", diabetes_drgs),
):
    print(heading)
    print(matched_drgs)
    print(f"Count: {len(matched_drgs)}\n")

# Number of rows matched per disease (count of True values in each mask).
hf_rows = int(hf_mask.sum())
copd_rows = int(copd_mask.sum())
diabetes_rows = int(diabetes_mask.sum())

print("\nTotal rows for each disease:")
row_counts = {
    "Heart Failure": hf_rows,
    "COPD": copd_rows,
    "Diabetes": diabetes_rows,
}
for label, n_rows in row_counts.items():
    print(f"{label}: {n_rows} rows")
print(f"Total: {sum(row_counts.values())} rows")

DRGs containing 'HEART FAILURE':
['HEART FAILURE AND SHOCK WITH MCC' 'HEART FAILURE AND SHOCK WITH CC'
 'HEART FAILURE AND SHOCK WITHOUT CC/MCC']
Count: 3


DRGs containing 'COPD' or 'CHRONIC OBSTRUCTIVE':
['CHRONIC OBSTRUCTIVE PULMONARY DISEASE WITH MCC'
 'CHRONIC OBSTRUCTIVE PULMONARY DISEASE WITH CC'
 'CHRONIC OBSTRUCTIVE PULMONARY DISEASE WITHOUT CC/MCC']
Count: 3


DRGs containing 'DIABETES':
['DIABETES WITH MCC' 'DIABETES WITH CC' 'DIABETES WITHOUT CC/MCC']
Count: 3


Total rows for each disease:
Heart Failure: 2962 rows
COPD: 2337 rows
Diabetes: 2278 rows
Total: 7577 rows


In [4]:
# Keep only rows whose DRG description mentions one of the three target
# diseases. A single case-insensitive regex covers all of them; na=False
# treats a missing description as "no match", so such rows are dropped.
DISEASE_PATTERN = "HEART FAILURE|COPD|CHRONIC OBSTRUCTIVE|DIABETES"
is_target_disease = cms["DRG_Desc"].str.contains(DISEASE_PATTERN, case=False, na=False)

# .copy() makes the filtered table an independent frame. A "Disease" column
# is assigned onto `cms` in the next step, and assigning into a filtered
# slice triggers SettingWithCopyWarning in pandas without copy-on-write.
cms = cms[is_target_disease].copy()
print(f"Rows after filtering to 3 diseases: {len(cms)}")


Rows after filtering to 3 diseases: 7577


In [5]:
# STEP 3: Create disease column
def get_disease(drg_desc):
    """Map a DRG description to one of the three disease labels.

    Matching is case-insensitive and checked in this order: heart failure,
    COPD, diabetes.

    Args:
        drg_desc: DRG description text. Non-string values (for example None,
            or the float NaN that pandas uses for a missing cell) are
            accepted and treated as matching nothing.

    Returns:
        "Heart_Failure", "COPD" or "Diabetes", or None when the description
        matches none of them.
    """
    # NaN/None have no .upper(); return None rather than raising
    # AttributeError in the middle of an element-wise apply.
    if not isinstance(drg_desc, str):
        return None

    text = drg_desc.upper()  # normalise once instead of once per comparison
    if "HEART FAILURE" in text:
        return "Heart_Failure"
    if "COPD" in text or "CHRONIC OBSTRUCTIVE" in text:
        return "COPD"
    if "DIABETES" in text:
        return "Diabetes"
    return None

# Label every row with its disease, then report how many rows each label got.
cms["Disease"] = cms["DRG_Desc"].map(get_disease)

print("Diseases created:")
disease_counts = cms["Disease"].value_counts()
print(disease_counts)

Diseases created:
Disease
Heart_Failure    2962
COPD             2337
Diabetes         2278
Name: count, dtype: int64


In [6]:
# STEP 4: Keep only required columns
required_columns = [
    "Rndrng_Prvdr_State_Abrvtn",
    "Disease",
    "Tot_Dschrgs",
    "Avg_Tot_Pymt_Amt",
]

# Source column name -> readable name used from here on.
# "Disease" already has its final name and is left as is.
readable_names = {
    "Rndrng_Prvdr_State_Abrvtn": "State",
    "Tot_Dschrgs": "Total_Discharges",
    "Avg_Tot_Pymt_Amt": "Avg_Total_Payment",
}

# Select first, then rename; rename() returns a new DataFrame instead of
# modifying the selection in place.
cms = cms[required_columns].rename(columns=readable_names)



In [7]:
# STEP 5: Remove missing values
# Only rows without an average payment are dropped here.
print("\nSTEP 5: REMOVE MISSING VALUES")
rows_before_dropna = len(cms)
print(f"Rows before: {rows_before_dropna}")
cms = cms.dropna(subset=["Avg_Total_Payment"])
rows_after_dropna = len(cms)
print(f"Rows after: {rows_after_dropna}")

# STEP 6: Remove zero discharge rows
# Keep rows with a strictly positive discharge count. A missing (NaN) count
# compares False against 0, so such rows are removed by this filter as well.
print("\nSTEP 6: REMOVE ZERO DISCHARGE ROWS")
print(f"Rows before: {rows_after_dropna}")
has_discharges = cms["Total_Discharges"].gt(0)
cms = cms.loc[has_discharges]
print(f"Rows after: {len(cms)}")


STEP 5: REMOVE MISSING VALUES
Rows before: 7577
Rows after: 7577

STEP 6: REMOVE ZERO DISCHARGE ROWS
Rows before: 7577
Rows after: 7577


In [8]:
# STEP 7: Create weighted columns
# Row-level payment x discharges. Summing this per group gives the numerator
# of a discharge-weighted average payment.
cms["Weighted_Total_Payment"] = cms["Avg_Total_Payment"].mul(cms["Total_Discharges"])

# STEP 8: Aggregate by State and Disease
print(f"Rows before aggregation: {len(cms)}")

by_state_disease = cms.groupby(["State", "Disease"], as_index=False)
cms_state = by_state_disease.agg(
    Total_Discharges=("Total_Discharges", "sum"),
    Weighted_Total_Payment=("Weighted_Total_Payment", "sum"),
)

print(f"Rows after aggregation: {len(cms_state)}")


#  STEP 9: Calculate weighted average for Total Payment
# sum(payment x discharges) / sum(discharges) for each State/Disease pair.
cms_state["Avg_Total_Payment"] = cms_state["Weighted_Total_Payment"].div(
    cms_state["Total_Discharges"]
)



Rows before aggregation: 7577
Rows after aggregation: 153


In [9]:

# STEP 10: Keep only final columns
# Any column not listed here (such as the intermediate weighted sum) is
# left behind by the selection.
final_columns = ["State", "Disease", "Total_Discharges", "Avg_Total_Payment"]
cms_state = cms_state[final_columns]

# Show the first five rows of the long-format result.
cms_state.head(5)

Unnamed: 0,State,Disease,Total_Discharges,Avg_Total_Payment
0,AK,COPD,110,16610.6
1,AK,Diabetes,68,12859.014706
2,AK,Heart_Failure,590,16814.910169
3,AL,COPD,1355,8025.652399
4,AL,Diabetes,684,8570.195906


In [10]:
# STEP 11: Pivot to wide format
print(f"Rows before pivot: {len(cms_state)}")


def _wide_by_disease(value_column, suffix):
    """Pivot `cms_state` to one row per State for a single value column.

    The three disease columns produced by the pivot are renamed to
    `<Disease>_<suffix>`; "State" is returned as a regular column.
    """
    wide = cms_state.pivot_table(
        index="State",
        columns="Disease",
        values=value_column,
        aggfunc="first",
    ).reset_index()
    # Clear the leftover "Disease" label on the column axis.
    wide.columns.name = None
    return wide.rename(columns={
        disease: f"{disease}_{suffix}"
        for disease in ("COPD", "Diabetes", "Heart_Failure")
    })


# Pivot for costs
cms_pivot_cost = _wide_by_disease("Avg_Total_Payment", "Cost")

# Pivot for discharges
cms_pivot_discharge = _wide_by_disease("Total_Discharges", "Discharges")

# Merge both pivots on State
cms_pivot = cms_pivot_cost.merge(cms_pivot_discharge, on="State")

print(f"Rows after pivot: {len(cms_pivot)}")


Rows before pivot: 153
Rows after pivot: 51


In [11]:
# Preview the first five rows of the pivoted table.
cms_pivot.head(5)

Unnamed: 0,State,COPD_Cost,Diabetes_Cost,Heart_Failure_Cost,COPD_Discharges,Diabetes_Discharges,Heart_Failure_Discharges
0,AK,16610.6,12859.014706,16814.910169,110,68,590
1,AL,8025.652399,8570.195906,9643.388742,1355,684,4548
2,AR,7521.352232,8574.209581,8912.16938,829,501,3501
3,AZ,8773.520249,9925.730189,10827.994527,963,530,5116
4,CA,11831.60925,13126.219394,14486.320625,4865,4125,28538


In [13]:
# STEP 12: Final verification
print(f"Number of states: {len(cms_pivot)}")

# Display label -> value stored in the Disease column, in report order.
disease_labels = {
    "Heart Failure": "Heart_Failure",
    "COPD": "COPD",
    "Diabetes": "Diabetes",
}

# Per-disease series of state-level average payments, selected once and
# reused for every statistic below.
state_costs = {
    label: cms_state.loc[cms_state["Disease"] == disease, "Avg_Total_Payment"]
    for label, disease in disease_labels.items()
}

# Note: this is the simple (unweighted) mean of the per-state averages.
print("\nAverage costs by disease across all states:")
for label, costs in state_costs.items():
    print(f"{label}: ${costs.mean():,.0f}")

# Lowest and highest state-level average per disease.
print("\nCost ranges by disease:")
for label, costs in state_costs.items():
    print(f"{label}: ${costs.min():,.0f} to ${costs.max():,.0f}")



Number of states: 51

Average costs by disease across all states:
Heart Failure: $11,311
COPD: $9,542
Diabetes: $10,443

Cost ranges by disease:
Heart Failure: $8,585 to $16,815
COPD: $7,419 to $16,611
Diabetes: $7,666 to $15,267
