In [None]:
1. Imports & File Paths

In [None]:
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt

# Locations of the two NHANES body-measures CSV exports (one file per sex).
female_path, male_path = (
    "/mnt/data/nhanes_adult_female_bmx_2020.csv",
    "/mnt/data/nhanes_adult_male_bmx_2020.csv",
)

In [None]:
2. Function to Load and Clean Data

In [None]:
def read_nhanes_bmx(path: str, sex_label: str) -> pd.DataFrame:
    """Load one body-measures CSV and tag every row with a sex label.

    Any lines that precede the header row (the first line whose stripped text
    starts with ``BMXWT`` or ``"BMXWT"``) are skipped. If no such line exists,
    the file is parsed from its first line.

    Parameters
    ----------
    path : str
        Path to the CSV file. It is decoded as UTF-8; a leading byte-order
        mark, if present, is discarded.
    sex_label : str
        Value written to the ``sex`` column of every returned row.

    Returns
    -------
    pd.DataFrame
        The subset of the expected measurement columns that the file actually
        contains (in the order listed in ``expected_cols``), plus ``sex``.
    """
    # "utf-8-sig" decodes plain UTF-8 exactly like "utf-8" but also drops a
    # leading BOM. With plain "utf-8" a BOM glued to a first-line header made
    # the startswith() check below fail and left the BOM inside the first
    # column name, so that column was silently discarded further down.
    with open(path, "r", encoding="utf-8-sig") as f:
        lines = f.readlines()

    # Index of the header row; default 0 keeps the previous fallback of
    # parsing the whole file when no header line is recognised.
    header_idx = next(
        (
            i
            for i, line in enumerate(lines)
            if line.strip().startswith(('"BMXWT"', "BMXWT"))
        ),
        0,
    )

    content = "".join(lines[header_idx:])
    df = pd.read_csv(io.StringIO(content))

    # Normalise header names: remove stray double quotes and outer whitespace.
    df.columns = [c.replace('"', '').strip() for c in df.columns]

    # Keep only the measurement columns that are present in this file.
    expected_cols = ["BMXWT", "BMXHT", "BMXARML", "BMXLEG", "BMXARMC", "BMXHIP", "BMXWAIST"]
    present = [c for c in expected_cols if c in df.columns]
    df = df[present].copy()

    df["sex"] = sex_label
    return df


In [None]:
3. Load & Combine Male and Female Data

In [None]:
# Read the female file first, then the male file, and stack them into one
# frame with a fresh 0..n-1 index.
df_f, df_m = (
    read_nhanes_bmx(csv_path, label)
    for csv_path, label in ((female_path, "Female"), (male_path, "Male"))
)
df = pd.concat([df_f, df_m], ignore_index=True)


In [None]:
4. Create New Health Metrics

In [None]:
# Derived ratios. BMXHT is divided by 100 to obtain height_m (presumably a
# centimetre-to-metre conversion; confirm against the data dictionary).
# Each lambda sees the columns created by the keyword arguments before it,
# so BMI can use height_m.
df = df.assign(
    height_m=lambda d: d["BMXHT"] / 100.0,
    BMI=lambda d: d["BMXWT"] / (d["height_m"] ** 2),
    WHR=lambda d: d["BMXWAIST"] / d["BMXHIP"],
    WHtR=lambda d: d["BMXWAIST"] / d["BMXHT"],
)


In [None]:
5. Filter Out Bad Data

In [None]:
# A row is valid only when weight, height, hip and waist are all strictly
# positive; missing values compare as False and are therefore dropped too.
mask_valid = df[["BMXWT", "BMXHT", "BMXHIP", "BMXWAIST"]].gt(0).all(axis=1)
df = df.loc[mask_valid].copy()


In [None]:
6. Classify BMI Categories

In [None]:
def bmi_category(bmi: float) -> str:
    """Map a BMI value to its category label.

    Returns "Missing" for a missing value; otherwise the label of the first
    upper bound that ``bmi`` falls strictly below, or "Obese" when it is at
    or above every bound.
    """
    if pd.isna(bmi):
        return "Missing"

    # (exclusive upper bound, label), checked in ascending order.
    upper_bounds = (
        (18.5, "Underweight"),
        (25, "Normal"),
        (30, "Overweight"),
    )
    for limit, label in upper_bounds:
        if bmi < limit:
            return label
    return "Obese"

# Label every row by passing its BMI through bmi_category, element by element.
df["BMI_Category"] = df["BMI"].map(bmi_category)


In [None]:
7. Summary Statistics

In [None]:
def summary_by_sex(metric: str, data: "pd.DataFrame | None" = None) -> pd.DataFrame:
    """Summarise one numeric column separately for each value of ``sex``.

    Parameters
    ----------
    metric : str
        Name of the column to summarise.
    data : pd.DataFrame, optional
        Frame holding a ``sex`` column and the ``metric`` column. When omitted,
        the notebook-level ``df`` is used, as before; passing it explicitly
        removes the dependence on hidden notebook state.

    Returns
    -------
    pd.DataFrame
        One row per ``sex`` value with the columns count, mean, std, min,
        median and max, rounded to two decimals.
    """
    frame = df if data is None else data
    stats = frame.groupby("sex")[metric].agg(
        ["count", "mean", "std", "min", "median", "max"]
    )
    return stats.round(2)


In [None]:
8. BMI Category Distribution

In [None]:
# Counts of each BMI category per sex. Columns are forced into a fixed order,
# absent categories are filled with 0, and any other label is left out.
bmi_ct = (
    pd.crosstab(index=df["sex"], columns=df["BMI_Category"])
    .reindex(columns=["Underweight", "Normal", "Overweight", "Obese"], fill_value=0)
)
# Share of each category within a sex, as a percentage of that row's total.
bmi_pct = bmi_ct.div(bmi_ct.sum(axis="columns"), axis="index").mul(100).round(1)


In [None]:
9. Correlation Matrix

In [None]:
# Pairwise correlations between the raw measurements and the derived ratios;
# columns that are not in the frame are skipped instead of raising.
corr_cols = [
    "BMXWT",
    "BMXHT",
    "BMXWAIST",
    "BMXHIP",
    "BMI",
    "WHR",
    "WHtR",
]
present_corr = [name for name in corr_cols if name in df.columns]
corr = df.loc[:, present_corr].corr().round(3)


In [None]:
10. Plots

In [None]:
# One BMI histogram per sex, each on its own figure.
for sex in ["Female", "Male"]:
    sub = df.loc[df["sex"] == sex]
    fig, ax = plt.subplots()
    ax.hist(sub["BMI"].dropna(), bins=20)
    ax.set_title(f"BMI distribution — {sex}")
    ax.set_xlabel("BMI")
    ax.set_ylabel("Frequency")
    plt.show()

# Same layout for the waist-to-height ratio.
for sex in ["Female", "Male"]:
    sub = df.loc[df["sex"] == sex]
    fig, ax = plt.subplots()
    ax.hist(sub["WHtR"].dropna(), bins=20)
    ax.set_title(f"Waist-to-Height Ratio (WHtR) — {sex}")
    ax.set_xlabel("WHtR")
    ax.set_ylabel("Frequency")
    plt.show()
