In [1]:
import pandas as pd
# Load the three cleaned CSV inputs in one pass (read order: cms, cdi, hrrp).
cms, cdi, hrrp = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/clean_cms.csv",
        "data/clean_cdi.csv",
        "data/clean_hrrp.csv",
    )
)

In [2]:
# Preview the first five rows of the CMS table loaded above.
cms.head(n=5)

Unnamed: 0,State,COPD_Cost,Heart_Failure_Cost
0,AK,16610.6,16814.910169
1,AL,8025.652399,9643.388742
2,AR,7521.352232,8912.16938
3,AZ,8773.520249,10827.994527
4,CA,11831.60925,14486.320625


In [3]:
# Preview the first five rows of the CDI table loaded above.
cdi.head(n=5)

Unnamed: 0,State,HeartDisease_Rate,COPD_Rate
0,AK,27.7,5.0
1,AL,83.8,9.4
2,AR,72.8,13.3
3,AZ,28.1,2.6
4,CA,30.7,3.3


In [4]:
# Preview the first five rows of the HRRP table loaded above.
hrrp.head(n=5)

Unnamed: 0,State,COPD_Predicted_Rate,Heart_Failure_Predicted_Rate,COPD_Expected_Rate,Heart_Failure_Expected_Rate,COPD_Excess_Ratio,Heart_Failure_Excess_Ratio
0,AK,18.730869,18.664527,18.658266,19.087871,1.003891,0.977821
1,AL,17.872373,19.633992,17.901635,19.634685,0.998365,0.999965
2,AR,18.326514,19.59735,18.129895,19.357507,1.010845,1.01239
3,AZ,16.737891,19.168292,16.955757,19.116684,0.987151,1.0027
4,CA,19.645774,20.095834,19.210772,19.74425,1.022644,1.017807


In [5]:
# MERGE ALL THREE CLEANED DATASETS
#
# Inner joins on "State": a state missing from any one source is dropped from
# the result, so the printed row count below is the check that nothing was lost.
#
# validate="one_to_one" makes pandas raise MergeError if "State" is duplicated
# on either side of a join. Without it, a duplicated key would silently multiply
# rows in the merged table. This assumes each cleaned file holds one row per
# State (the 51-row result is consistent with that) -- relax if that changes.
merged = (
    cdi
    .merge(cms, on="State", how="inner", validate="one_to_one")
    .merge(hrrp, on="State", how="inner", validate="one_to_one")
)

print("\nFinal Merged Dataset:")
print(f"Rows: {len(merged)}")

merged.head(10)


Final Merged Dataset:
Rows: 51


Unnamed: 0,State,HeartDisease_Rate,COPD_Rate,COPD_Cost,Heart_Failure_Cost,COPD_Predicted_Rate,Heart_Failure_Predicted_Rate,COPD_Expected_Rate,Heart_Failure_Expected_Rate,COPD_Excess_Ratio,Heart_Failure_Excess_Ratio
0,AK,27.7,5.0,16610.6,16814.910169,18.730869,18.664527,18.658266,19.087871,1.003891,0.977821
1,AL,83.8,9.4,8025.652399,9643.388742,17.872373,19.633992,17.901635,19.634685,0.998365,0.999965
2,AR,72.8,13.3,7521.352232,8912.16938,18.326514,19.59735,18.129895,19.357507,1.010845,1.01239
3,AZ,28.1,2.6,8773.520249,10827.994527,16.737891,19.168292,16.955757,19.116684,0.987151,1.0027
4,CA,30.7,3.3,11831.60925,14486.320625,19.645774,20.095834,19.210772,19.74425,1.022644,1.017807
5,CO,28.6,2.2,10360.385827,10798.765909,17.651251,17.595898,18.073222,18.829706,0.976652,0.934475
6,CT,92.3,5.0,11154.817819,13535.904184,18.510919,19.823882,18.342643,19.335565,1.009174,1.025255
7,DC,60.8,8.0,13002.272727,15562.930796,22.357205,19.649238,21.272567,20.571871,1.050988,0.955151
8,DE,62.3,7.3,9122.795678,11539.787047,17.950422,18.143992,18.292941,18.914121,0.981276,0.959283
9,FL,82.8,13.3,8508.778213,10210.235044,18.842651,20.795498,18.601526,20.215305,1.012963,1.028701


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
merged.describe()

Unnamed: 0,HeartDisease_Rate,COPD_Rate,COPD_Cost,Heart_Failure_Cost,COPD_Predicted_Rate,Heart_Failure_Predicted_Rate,COPD_Expected_Rate,Heart_Failure_Expected_Rate,COPD_Excess_Ratio,Heart_Failure_Excess_Ratio
count,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0
mean,42.147059,7.211765,9541.515123,11311.174172,18.525882,19.0291,18.480355,19.478418,1.001867,0.976257
std,20.834552,4.050662,1846.302404,1922.239449,1.274388,1.264886,0.955531,0.587398,0.029399,0.045709
min,12.1,0.0,7418.512,8584.822446,15.020907,15.951943,14.970923,18.252962,0.916436,0.856337
25%,27.55,4.7,8379.02422,9956.435865,17.857318,18.11519,18.019175,18.997576,0.987188,0.948799
50%,33.3,6.4,8906.066667,10798.765909,18.642262,19.476089,18.512534,19.548815,1.004096,0.989663
75%,57.85,9.3,10063.449999,11754.078951,19.251736,19.963039,18.877545,19.941905,1.015773,1.009002
max,92.3,20.2,16610.6,16814.910169,22.357205,21.56964,21.272567,20.833739,1.097901,1.065285


In [7]:
# Count missing values per column of the merged table.
merged.isna().sum()

State                           0
HeartDisease_Rate               0
COPD_Rate                       0
COPD_Cost                       0
Heart_Failure_Cost              0
COPD_Predicted_Rate             0
Heart_Failure_Predicted_Rate    0
COPD_Expected_Rate              0
Heart_Failure_Expected_Rate     0
COPD_Excess_Ratio               0
Heart_Failure_Excess_Ratio      0
dtype: int64

In [8]:
# Persist the merged table; index=False keeps the row index out of the CSV.
merged.to_csv("data/final_merged_dataset.csv", index=False)

In [10]:
# Section banner for the outlier report printed by this cell.
print("\n4. OUTLIER DETECTION (IQR Method)")

def find_outliers(data: pd.DataFrame, column: str, multiplier: float = 1.5):
    """Return the rows of ``data`` whose ``column`` value falls outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``, where
    Q1/Q3 are the 25th/75th percentiles of ``data[column]`` and ``IQR = Q3 - Q1``.
    Values exactly on a fence are not flagged. Missing values (NaN) are ignored
    when computing the quartiles and are never flagged as outliers.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``column``.
    column : str
        Name of the numeric column to screen.
    multiplier : float, default 1.5
        Width of the fences in IQR units (1.5 is the conventional Tukey fence;
        3.0 is commonly used for "extreme" outliers).

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound)`` where ``outliers`` is the
        subset of ``data`` (original index preserved) lying strictly outside
        ``[lower_bound, upper_bound]``.

    Raises
    ------
    ValueError
        If ``multiplier`` is negative (the fences would cross).
    KeyError
        If ``column`` is not present in ``data``.
    """
    if multiplier < 0:
        raise ValueError(f"multiplier must be non-negative, got {multiplier!r}")

    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    # Explicit comparisons rather than ~values.between(...): comparisons with
    # NaN are False, so rows with a missing value are never reported.
    outliers = data[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

# Columns screened for outliers; the two *_Expected_Rate columns are not included.
metrics_to_check = [
    'HeartDisease_Rate', 'COPD_Rate',
    'COPD_Cost', 'Heart_Failure_Cost',
    'COPD_Predicted_Rate', 'Heart_Failure_Predicted_Rate',
    'COPD_Excess_Ratio', 'Heart_Failure_Excess_Ratio',
]

# Running total of flagged rows; a state flagged on several metrics is counted
# once per metric.
outlier_count = 0
for metric in metrics_to_check:
    outliers, lower, upper = find_outliers(merged, metric)
    if len(outliers) == 0:
        # Nothing outside the fences for this metric: print no section for it.
        continue

    print(f"\n   {metric}:")
    print(f"   Normal Range: [{lower:.2f}, {upper:.2f}]")
    for state, value in zip(outliers['State'], outliers[metric]):
        # Every flagged value is outside the fences, so "not above the upper
        # fence" means it is below the lower one.
        if value > upper:
            status = "HIGH"
        else:
            status = "LOW"
        print(f"     → {state}: {value:.2f} ({status})")
    outlier_count += len(outliers)

print(f"\n   Total Outliers Found: {outlier_count}")



4. OUTLIER DETECTION (IQR Method)

   COPD_Rate:
   Normal Range: [-2.20, 16.20]
     → KY: 20.20 (HIGH)
     → OK: 16.30 (HIGH)

   COPD_Cost:
   Normal Range: [5852.39, 12590.09]
     → AK: 16610.60 (HIGH)
     → DC: 13002.27 (HIGH)
     → HI: 13954.81 (HIGH)
     → MD: 13687.82 (HIGH)

   Heart_Failure_Cost:
   Normal Range: [7259.97, 14450.54]
     → AK: 16814.91 (HIGH)
     → CA: 14486.32 (HIGH)
     → DC: 15562.93 (HIGH)
     → HI: 16125.35 (HIGH)
     → MD: 15313.20 (HIGH)
     → NY: 14991.23 (HIGH)

   COPD_Predicted_Rate:
   Normal Range: [15.77, 21.34]
     → DC: 22.36 (HIGH)
     → HI: 15.02 (LOW)
     → WY: 15.70 (LOW)

   COPD_Excess_Ratio:
   Normal Range: [0.94, 1.06]
     → ID: 0.94 (LOW)
     → RI: 1.10 (HIGH)
     → WY: 0.92 (LOW)

   Heart_Failure_Excess_Ratio:
   Normal Range: [0.86, 1.10]
     → ID: 0.86 (LOW)

   Total Outliers Found: 19
