In [None]:
import pandas as pd
cdi =  pd.read_csv("data/Chronic_Disease.csv")


In [6]:
print(f"Total rows in CDC data: {len(cdi)}")


Total rows in CDC data: 74978


In [None]:
cdi.head()


In [4]:
# STEP 2: Keep only the 3 diseases we want
diseases = ["Diabetes", "Cardiovascular Disease", "Chronic Obstructive Pulmonary Disease"]
cdi = cdi[cdi["Topic"].isin(diseases)]
print(f"Rows after filtering: {len(cdi)}")

Rows after filtering: 74978


In [5]:
# STEP 3: Look at all three diseases in detail
selected_topics = ["Diabetes", "Cardiovascular Disease", "Chronic Obstructive Pulmonary Disease"]

for topic in selected_topics:
    topic_data = cdi[cdi['Topic'] == topic]
    print(f"\n{topic}:")
    print(f"  Total rows: {len(topic_data)}")
    print(f"  Unique questions:")
    for q in topic_data['Question'].unique():
        print(f"    - {q}")


Diabetes:
  Total rows: 17318
  Unique questions:
    - Diabetic ketoacidosis mortality among all people, underlying or contributing cause
    - Diabetes among adults
    - Gestational diabetes among women with a recent live birth
    - Diabetes mortality among all people, underlying or contributing cause

Cardiovascular Disease:
  Total rows: 30709
  Unique questions:
    - Taking medicine to control high blood pressure among adults with high blood pressure
    - Coronary heart disease mortality among all people, underlying cause
    - High cholesterol among adults who have been screened
    - Taking medicine for high cholesterol among adults
    - Cerebrovascular disease (stroke) mortality among all people, underlying cause
    - High blood pressure among adults
    - Diseases of the heart mortality among all people, underlying cause
    - Hospitalization for heart failure as principal diagnosis, Medicare-beneficiaries aged 65 years and older

Chronic Obstructive Pulmonary Disease:


In [8]:
# STEP 4: Keep only questions with "among adults" (prevalence, not mortality)
cdi = cdi[cdi["Question"].str.contains("among adults", case=False, na=False)]
print(f"Step 4 - Rows after filtering to 'among adults': {len(cdi)}")

Step 4 - Rows after filtering to 'among adults': 34640


In [10]:
# STEP 5: Remove empty DataValue
cdi = cdi[cdi["DataValue"].notna()]
print(f"Step 5 - Rows after removing empty values: {len(cdi)}")

Step 5 - Rows after removing empty values: 23194


In [11]:
# STEP 6: Convert DataValue to number
cdi["DataValue"] = (
    cdi["DataValue"]
    .astype(str)
    .str.replace(",", "")
    .str.replace("%", "")
)
cdi["DataValue"] = pd.to_numeric(cdi["DataValue"], errors="coerce")
print(f"Step 6 - DataValue converted to numeric")

Step 6 - DataValue converted to numeric


In [12]:
# STEP 7: Keep latest year per state and disease
cdi = cdi.sort_values(["LocationAbbr", "Topic", "YearStart"])
cdi = cdi.drop_duplicates(subset=["LocationAbbr", "Topic"], keep="last")
print(f"Step 7 - Rows after keeping latest year only: {len(cdi)}")

Step 7 - Rows after keeping latest year only: 165


In [15]:
# STEP 8: Pivot to wide format
cdi_pivot = cdi.pivot_table(
    index="LocationAbbr",
    columns="Topic",
    values="DataValue",
    aggfunc="first"
).reset_index()
cdi_pivot.columns.name = None

print(f"Step 8 - After pivot: {len(cdi_pivot)} rows")


Step 8 - After pivot: 55 rows


In [16]:
# STEP 9: Rename columns
cdi_pivot.rename(columns={
    "LocationAbbr": "State",
    "Diabetes": "Diabetes_Rate",
    "Cardiovascular Disease": "HeartDisease_Rate",
    "Chronic Obstructive Pulmonary Disease": "COPD_Rate"
}, inplace=True)

In [19]:
# STEP 10: Verify final data
print(f"\nStep 10 - Final Data Check:")
print(f"HeartDisease_Rate: {cdi_pivot['HeartDisease_Rate'].min():.2f} to {cdi_pivot['HeartDisease_Rate'].max():.2f}")
print(f"COPD_Rate: {cdi_pivot['COPD_Rate'].min():.2f} to {cdi_pivot['COPD_Rate'].max():.2f}")
print(f"Diabetes_Rate: {cdi_pivot['Diabetes_Rate'].min():.2f} to {cdi_pivot['Diabetes_Rate'].max():.2f}")

print(f"\nIowa data:")
print(cdi_pivot[cdi_pivot['State'] == 'IA'])


Step 10 - Final Data Check:
HeartDisease_Rate: 12.10 to 92.30
COPD_Rate: 0.00 to 20.20
Diabetes_Rate: 1.80 to 24.40

Iowa data:
   State  HeartDisease_Rate  COPD_Rate  Diabetes_Rate
13    IA               54.4        8.1           15.2

Final cleaned CDC data (first 10 rows):


In [21]:
cdi_pivot.head(10)

Unnamed: 0,State,HeartDisease_Rate,COPD_Rate,Diabetes_Rate
0,AK,27.7,5.0,20.6
1,AL,83.8,9.4,21.5
2,AR,72.8,13.3,5.4
3,AZ,28.1,2.6,10.5
4,CA,30.7,3.3,24.4
5,CO,28.6,2.2,7.6
6,CT,92.3,5.0,7.1
7,DC,60.8,8.0,12.9
8,DE,62.3,7.3,18.1
9,FL,82.8,13.3,10.8
