### Healthy Eating Index (HEI) Notebook

In [2]:
!pip install pyreadstat

Defaulting to user installation because normal site-packages is not writeable
Collecting pyreadstat
  Downloading pyreadstat-1.2.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading pyreadstat-1.2.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (617 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 617.7/617.7 kB 14.2 MB/s eta 0:00:00
Installing collected packages: pyreadstat
Successfully installed pyreadstat-1.2.9


In [None]:
import pandas as pd
import numpy as np
import requests
import tempfile

def download_xpt(url, timeout=60):
    """Download an NHANES .xpt file and return a pandas DataFrame.

    Parameters
    ----------
    url : str
        Address of a SAS transport (XPORT) file.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    pandas.DataFrame
        The table parsed by ``pd.read_sas(..., format="xport")``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (``raise_for_status``).
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    import io  # local import so the function does not rely on a new top-level import

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Parse straight from memory. Re-opening a NamedTemporaryFile by name while
    # it is still open is not portable (it fails on Windows), and the temp file
    # added nothing because the whole payload is already held in r.content.
    return pd.read_sas(io.BytesIO(r.content), format="xport")

# 1) NHANES 2017-2018 FILE LOCATIONS (Day 1 and Day 2 recalls, demographics)
DR1IFF_URL = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DR1IFF_J.xpt"
DR1TOT_URL = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DR1TOT_J.xpt"
DR2IFF_URL = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DR2IFF_J.xpt"
DR2TOT_URL = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DR2TOT_J.xpt"
DEMO_URL   = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DEMO_J.xpt"

# Only the Day 1 files are fetched here, in this order:
# individual foods, nutrient totals, demographics.
dr1iff, dr1tot, demo = [
    download_xpt(source_url)
    for source_url in (DR1IFF_URL, DR1TOT_URL, DEMO_URL)
]

# 2) FPED TABLE READ FROM A LOCAL CSV
FPED_PATH = "/workspaces/enterntainment720/nutrition/data/raw/FPED_1718.csv"
fped = pd.read_csv(FPED_PATH)

# 3) CAST THE RESPONDENT ID (SEQN) TO INT IN EVERY NHANES TABLE
for df_ in (dr1iff, dr1tot, demo):
    df_["SEQN"] = df_["SEQN"].astype(int)

# 4) JOIN FOODS → FPED ON FOOD CODE
# DR1IFDCD is the food code column of the individual-foods file; FOODCODE is
# the key of the FPED table. Both are cast to int so the keys compare equal.
dr1iff["FOODCODE"] = dr1iff["DR1IFDCD"].astype(int)
fped["FOODCODE"]   = fped["FOODCODE"].astype(int)
foods_fped = dr1iff.merge(fped, on="FOODCODE", how="left")

# 5) AGGREGATE FPED TO PERSON-LEVEL
# Every FPED column except the key and the text description is an amount.
agg_cols = [c for c in fped.columns if c not in ("FOODCODE","DESCRIPTION")]
# The FPED table keyed by FOODCODE/DESCRIPTION reports equivalents per 100 g
# of food, so each reported item must be weighted by the grams eaten
# (DR1IGRMS) before summing; otherwise a 10 g and a 500 g serving count alike.
# NOTE(review): confirm FPED_1718.csv is the per-100 g food-code table.
foods_fped[agg_cols] = foods_fped[agg_cols].mul(foods_fped["DR1IGRMS"] / 100.0, axis=0)
# Foods with no FPED match carry NaN after the left join and add nothing here.
fped_person = foods_fped.groupby("SEQN")[agg_cols].sum().reset_index()

# 6) ONE ROW PER DAY-1 RESPONDENT
# Start from the nutrient totals, then attach demographics and the summed FPED
# amounts; left joins keep every row of the totals file.
df = dr1tot.merge(demo, on="SEQN", how="left")
df = df.merge(fped_person, on="SEQN", how="left")

# 7) COMPUTE PER-1 000 KCAL & % ENERGY (use exact FPED names)
# Zero-calorie days become NaN so the densities below are missing, not inf.
df["DR1TKCAL"] = df["DR1TKCAL"].replace(0, np.nan)  # calories

df["total_fruit_1000kcal"]      = df["F_TOTAL (cup eq.)"]   / df["DR1TKCAL"] * 1000
# Whole fruit = all fruit except juice.
df["whole_fruit_1000kcal"]      = (df["F_TOTAL (cup eq.)"] - df["F_JUICE (cup eq.)"]) / df["DR1TKCAL"] * 1000
# HEI-2015 counts legumes towards Total Vegetables; FPED V_TOTAL excludes them.
df["total_veg_1000kcal"]        = (df["V_TOTAL (cup eq.)"] + df["V_LEGUMES (cup eq.)"]) / df["DR1TKCAL"] * 1000
df["greens_beans_1000kcal"]     = (df["V_DRKGR (cup eq.)"] + df["V_LEGUMES (cup eq.)"]) / df["DR1TKCAL"] * 1000
df["whole_grain_1000kcal"]      = df["G_WHOLE (oz. eq.)"]    / df["DR1TKCAL"] * 1000
df["dairy_1000kcal"]            = df["D_TOTAL (cup eq.)"]    / df["DR1TKCAL"] * 1000
# HEI-2015 counts legumes towards Total Protein Foods; FPED PF_TOTAL excludes them.
df["total_protein_1000kcal"]    = (df["PF_TOTAL (oz. eq.)"] + df["PF_LEGUMES (oz. eq.)"]) / df["DR1TKCAL"] * 1000
df["sea_plant_protein_1000kcal"]= (
    df["PF_SEAFD_HI (oz. eq.)"] + df["PF_SEAFD_LOW (oz. eq.)"] +
    df["PF_SOY (oz. eq.)"] + df["PF_NUTSDS (oz. eq.)"] +
    df["PF_LEGUMES (oz. eq.)"]
) / df["DR1TKCAL"] * 1000

# Fatty acid ratio (PUFA+MUFA)/SFA
# Uses the TOTAL mono- and polyunsaturated fat columns (DR1TMFAT, DR1TPFAT).
# DR1TM181/DR1TP182/DR1TP183 are single fatty acids and understate the numerator.
df["fatty_acid_ratio"] = (df["DR1TMFAT"] + df["DR1TPFAT"]) / df["DR1TSFAT"]

# Moderation: lower is better
df["refined_grain_1000kcal"] = df["G_REFINED (oz. eq.)"]   / df["DR1TKCAL"] * 1000
# DR1TSODI is in milligrams; the HEI cut-points (1.1-2.0) are grams per
# 1 000 kcal, so convert mg -> g before taking the density.
df["sodium_1000kcal"]        = (df["DR1TSODI"] / 1000)    / df["DR1TKCAL"] * 1000
# One teaspoon equivalent of added sugars is about 16 kcal (4.2 g x ~4 kcal/g);
# multiplying by 4 treated teaspoons as grams.
df["added_sugars_pct"]       = df["ADD_SUGARS (tsp. eq.)"] * 16 / df["DR1TKCAL"] * 100
# Fat supplies 9 kcal per gram.
df["sat_fat_pct"]            = df["DR1TSFAT"] * 9         / df["DR1TKCAL"] * 100
# 8) SCORING HELPERS
def capped_score(val, lo, hi, max_score):
    """Score an adequacy component: more intake earns more points.

    ``val`` at or below ``lo`` scores 0, at or above ``hi`` scores
    ``max_score``, and values in between are interpolated linearly.
    Works element-wise on scalars, arrays and pandas Series.
    """
    fraction_of_range = (val - lo) / (hi - lo)
    points = fraction_of_range * max_score
    return np.clip(points, 0, max_score)

def reverse_capped(val, lo, hi, max_score):
    """Score a moderation component: less intake earns more points.

    ``val`` at or below ``lo`` scores ``max_score``, at or above ``hi``
    scores 0, and values in between are interpolated linearly.
    Works element-wise on scalars, arrays and pandas Series.
    """
    headroom_fraction = (hi - val) / (hi - lo)
    points = headroom_fraction * max_score
    return np.clip(points, 0, max_score)

# 9) APPLY HEI-2015 CUTPOINTS
# Adequacy components (more is better): 0 points at `lo`, full points at `hi`.
df["hei_total_fruit"]       = capped_score(df["total_fruit_1000kcal"],    0, 0.8,  5)
df["hei_whole_fruit"]       = capped_score(df["whole_fruit_1000kcal"],    0, 0.4,  5)
df["hei_total_veg"]         = capped_score(df["total_veg_1000kcal"],      0, 1.1,  5)
df["hei_greens_beans"]      = capped_score(df["greens_beans_1000kcal"],   0, 0.2,  5)
df["hei_whole_grains"]      = capped_score(df["whole_grain_1000kcal"],    0, 1.5, 10)
df["hei_dairy"]             = capped_score(df["dairy_1000kcal"],          0, 1.3, 10)
df["hei_total_protein"]     = capped_score(df["total_protein_1000kcal"],  0, 2.5,  5)
df["hei_sea_plant_protein"] = capped_score(df["sea_plant_protein_1000kcal"],0, 0.8,  5)
df["hei_fatty_acids"]       = capped_score(df["fatty_acid_ratio"],        1.2,2.5, 10)
# A day with no saturated AND no unsaturated fat gives a 0/0 (NaN) ratio; the
# HEI-2015 scoring rule assigns that case 0 points rather than leaving it missing.
df["hei_fatty_acids"]       = df["hei_fatty_acids"].mask(
    df["fatty_acid_ratio"].isna() & (df["DR1TSFAT"] == 0), 0.0
)
# Moderation components (less is better): full points at `lo`, 0 points at `hi`.
df["hei_refined_grains"]    = reverse_capped(df["refined_grain_1000kcal"],1.8,4.3, 10)
df["hei_sodium"]            = reverse_capped(df["sodium_1000kcal"],       1.1,2.0, 10)
df["hei_added_sugars"]      = reverse_capped(df["added_sugars_pct"],      6.5,26.0,10)
df["hei_sat_fat"]           = reverse_capped(df["sat_fat_pct"],           8.0,16.0,10)

# 10) TOTAL HEI SCORE
hei_cols = [c for c in df.columns if c.startswith("hei_")]
# min_count forces the total to NaN unless every component is present. A plain
# sum() skips NaN, so respondents with no usable recall (all components
# missing) used to be reported with a total score of 0.
df["HEI2015_TOTAL_SCORE"] = df[hei_cols].sum(axis=1, min_count=len(hei_cols))

# 11) EXPORT HEI SCORES TO CSV
output_path = "/workspaces/enterntainment720/nutrition/data/output/hei_scores.csv"
df[["SEQN", "HEI2015_TOTAL_SCORE"] + hei_cols].to_csv(output_path, index=False)
print(f"HEI scores written to {output_path}")

KeyError: 'DR1IFDCD'

In [31]:
import pandas as pd
import numpy as np
import requests
import tempfile

def download_xpt(url, timeout=60):
    """Download an NHANES .xpt file and return a pandas DataFrame.

    Parameters
    ----------
    url : str
        Address of a SAS transport (XPORT) file.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    pandas.DataFrame
        The table parsed by ``pd.read_sas(..., format="xport")``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (``raise_for_status``).
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    import io  # local import so the function does not rely on a new top-level import

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Parse straight from memory. Re-opening a NamedTemporaryFile by name while
    # it is still open is not portable (it fails on Windows), and the temp file
    # added nothing because the whole payload is already held in r.content.
    return pd.read_sas(io.BytesIO(r.content), format="xport")

# 1) NHANES 2017–2018 file locations: both recall days plus demographics
DR1IFF_URL = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DR1IFF_J.xpt"
DR1TOT_URL = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DR1TOT_J.xpt"
DR2IFF_URL = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DR2IFF_J.xpt"
DR2TOT_URL = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DR2TOT_J.xpt"
DEMO_URL   = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DEMO_J.xpt"

# 2) Fetch the five tables in the order listed
#    (individual foods and nutrient totals per day, then demographics)
dr1iff, dr1tot, dr2iff, dr2tot, demo = [
    download_xpt(source_url)
    for source_url in (DR1IFF_URL, DR1TOT_URL, DR2IFF_URL, DR2TOT_URL, DEMO_URL)
]

# 3) Read the FPED table (one row per food code) from the local CSV
FPED_PATH = "/workspaces/enterntainment720/nutrition/data/raw/FPED_1718.csv"
fped = pd.read_csv(FPED_PATH)

# 4) Cast the respondent id (SEQN) to int in every NHANES table
for df_ in (dr1iff, dr1tot, dr2iff, dr2tot, demo):
    df_["SEQN"] = df_["SEQN"].astype(int)

# 5) Helper to build person-level FPED totals from individual foods
def build_fped_person(ind_df, code_col, grams_col=None, fped_df=None):
    """Sum FPED equivalents over each person's reported foods.

    Parameters
    ----------
    ind_df : pandas.DataFrame
        Individual-foods table with one row per reported food. Must contain
        ``SEQN``, ``code_col`` and the grams column.
    code_col : str
        Column of ``ind_df`` holding the food code (e.g. ``"DR1IFDCD"``).
    grams_col : str, optional
        Column of ``ind_df`` holding grams consumed. When omitted it is derived
        from ``code_col`` by swapping the ``FDCD`` suffix for ``GRMS``
        (``"DR1IFDCD"`` -> ``"DR1IGRMS"``).
    fped_df : pandas.DataFrame, optional
        FPED table with a ``FOODCODE`` key. Defaults to the module-level
        ``fped`` frame.

    Returns
    -------
    pandas.DataFrame
        One row per ``SEQN`` with every FPED amount column summed.

    Raises
    ------
    ValueError
        If ``grams_col`` is omitted and cannot be derived from ``code_col``.

    Notes
    -----
    The FPED table keyed by FOODCODE/DESCRIPTION reports equivalents per 100 g
    of food, so each row is weighted by grams eaten / 100 before summing.
    Summing the unweighted rows would count a 10 g and a 500 g serving alike.
    NOTE(review): confirm the CSV in use is the per-100 g food-code table.
    """
    table = fped if fped_df is None else fped_df

    if grams_col is None:
        if not code_col.endswith("FDCD"):
            raise ValueError(
                f"cannot derive the grams column from {code_col!r}; pass grams_col explicitly"
            )
        grams_col = code_col[: -len("FDCD")] + "GRMS"

    # Everything except the key and the text description is an amount column.
    agg_cols = [c for c in table.columns if c not in ("FOODCODE", "DESCRIPTION")]

    # Work on copies so neither the caller's frame nor the shared FPED table
    # is mutated; cast both keys to int so they compare equal in the merge.
    foods = ind_df[["SEQN", grams_col]].copy()
    foods["FOODCODE"] = ind_df[code_col].astype(int)
    lookup = table[["FOODCODE"] + agg_cols].copy()
    lookup["FOODCODE"] = lookup["FOODCODE"].astype(int)

    merged = foods.merge(lookup, on="FOODCODE", how="left")
    merged[agg_cols] = merged[agg_cols].mul(merged[grams_col] / 100.0, axis=0)
    # Foods with no FPED match are NaN after the left join and add nothing.
    return merged.groupby("SEQN")[agg_cols].sum().reset_index()

# Person-level FPED sums for each recall day; the food-code column differs by day.
fped_p1 = build_fped_person(ind_df=dr1iff, code_col="DR1IFDCD")
fped_p2 = build_fped_person(ind_df=dr2iff, code_col="DR2IFDCD")

# 6) Function to calculate HEI for a single day, using the given prefix (DR1 or DR2)
def calc_day_hei(dr_tot, fped_person, prefix):
    """Compute HEI-2015 component and total scores for one recall day.

    Parameters
    ----------
    dr_tot : pandas.DataFrame
        Nutrient-totals table. Must contain ``SEQN`` and the columns
        ``{prefix}TKCAL``, ``{prefix}TMFAT``, ``{prefix}TPFAT``,
        ``{prefix}TSFAT`` and ``{prefix}TSODI``.
    fped_person : pandas.DataFrame
        Person-level FPED sums keyed by ``SEQN``.
    prefix : str
        Column prefix for the recall day (``"DR1"`` or ``"DR2"``).

    Returns
    -------
    pandas.DataFrame
        Columns ``SEQN``, ``HEI`` and ``s1`` .. ``s13``, one row per row of
        ``dr_tot``. ``HEI`` is NaN when any component is missing.

    Notes
    -----
    Differences from the previous version of this function:
    * sodium is converted from mg to g before applying the 1.1-2.0 cut-points;
    * added sugars use 16 kcal per teaspoon equivalent (not 4);
    * the fatty-acid ratio uses total MUFA + total PUFA over SFA;
    * legumes are counted in Total Vegetables and Total Protein Foods;
    * a 0/0 fatty-acid ratio scores 0 and the total requires all 13 components;
    * the merge with the demographics table was dropped because no
      demographic column is read or returned here.
    """
    df = dr_tot.merge(fped_person, on="SEQN", how="left")
    # Zero-calorie days become NaN so densities are missing instead of inf.
    kcal = df[f"{prefix}TKCAL"].replace(0, np.nan)

    def per_1000_kcal(amount):
        """Express an amount per 1 000 kcal of the day's energy intake."""
        return amount / kcal * 1000

    def cap(v, lo, hi, mx):
        """Adequacy score: 0 at ``lo``, ``mx`` at ``hi``, clipped outside."""
        return np.clip((v - lo) / (hi - lo) * mx, 0, mx)

    def rcap(v, lo, hi, mx):
        """Moderation score: ``mx`` at ``lo``, 0 at ``hi``, clipped outside."""
        return np.clip((hi - v) / (hi - lo) * mx, 0, mx)

    legumes_cup = df["V_LEGUMES (cup eq.)"]
    legumes_oz = df["PF_LEGUMES (oz. eq.)"]
    sfat = df[f"{prefix}TSFAT"]
    unsat = df[f"{prefix}TMFAT"] + df[f"{prefix}TPFAT"]  # total MUFA + total PUFA

    # Adequacy densities (per 1 000 kcal)
    total_fruit = per_1000_kcal(df["F_TOTAL (cup eq.)"])
    whole_fruit = per_1000_kcal(df["F_TOTAL (cup eq.)"] - df["F_JUICE (cup eq.)"])
    # FPED V_TOTAL / PF_TOTAL exclude legumes; HEI-2015 counts them in both.
    total_veg = per_1000_kcal(df["V_TOTAL (cup eq.)"] + legumes_cup)
    greens_beans = per_1000_kcal(df["V_DRKGR (cup eq.)"] + legumes_cup)
    whole_grain = per_1000_kcal(df["G_WHOLE (oz. eq.)"])
    dairy = per_1000_kcal(df["D_TOTAL (cup eq.)"])
    total_protein = per_1000_kcal(df["PF_TOTAL (oz. eq.)"] + legumes_oz)
    sea_plant_protein = per_1000_kcal(
        df["PF_SEAFD_HI (oz. eq.)"] + df["PF_SEAFD_LOW (oz. eq.)"]
        + df["PF_SOY (oz. eq.)"] + df["PF_NUTSDS (oz. eq.)"]
        + legumes_oz
    )

    # Moderation inputs
    refined_grain = per_1000_kcal(df["G_REFINED (oz. eq.)"])
    # Sodium is reported in mg; the cut-points are g per 1 000 kcal.
    sodium = per_1000_kcal(df[f"{prefix}TSODI"] / 1000)
    # One teaspoon equivalent of added sugars is about 16 kcal.
    added_sugars = df["ADD_SUGARS (tsp. eq.)"] * 16 / kcal * 100
    # Fat supplies 9 kcal per gram.
    sat_fat = sfat * 9 / kcal * 100

    fatty_acids = cap(unsat / sfat, 1.2, 2.5, 10)
    # No saturated and no unsaturated fat gives 0/0; HEI-2015 scores that as 0.
    fatty_acids = fatty_acids.mask((sfat == 0) & (unsat == 0), 0.0)

    components = [
        cap(total_fruit,       0, 0.8,  5),   # s1
        cap(whole_fruit,       0, 0.4,  5),   # s2
        cap(total_veg,         0, 1.1,  5),   # s3
        cap(greens_beans,      0, 0.2,  5),   # s4
        cap(whole_grain,       0, 1.5, 10),   # s5
        cap(dairy,             0, 1.3, 10),   # s6
        cap(total_protein,     0, 2.5,  5),   # s7
        cap(sea_plant_protein, 0, 0.8,  5),   # s8
        fatty_acids,                          # s9
        rcap(refined_grain,  1.8,  4.3, 10),  # s10
        rcap(sodium,         1.1,  2.0, 10),  # s11
        rcap(added_sugars,   6.5, 26.0, 10),  # s12
        rcap(sat_fat,        8.0, 16.0, 10),  # s13
    ]

    score_cols = [f"s{i}" for i in range(1, 14)]
    out = pd.DataFrame({"SEQN": df["SEQN"]})
    for col, score in zip(score_cols, components):
        out[col] = score
    # min_count keeps the total missing unless all 13 components are present;
    # a plain sum() would report respondents without a usable recall as 0.
    out["HEI"] = out[score_cols].sum(axis=1, min_count=len(score_cols))
    return out[["SEQN", "HEI"] + score_cols]

# 7) Score each recall day on its own
day1 = calc_day_hei(dr_tot=dr1tot, fped_person=fped_p1, prefix="DR1")
day2 = calc_day_hei(dr_tot=dr2tot, fped_person=fped_p2, prefix="DR2")

# 8) Pair the two days per respondent (inner join on SEQN) and average the
#    day-level totals into a single column
hei = pd.merge(day1, day2, on="SEQN", suffixes=("_day1", "_day2"))
hei["HEI_usual"] = hei.loc[:, ["HEI_day1", "HEI_day2"]].mean(axis="columns")

# 9) Write the combined table to disk
out = "/workspaces/enterntainment720/nutrition/data/output/hei_usual_intake.csv"
hei.to_csv(out, index=False)
print("Usual‐intake HEI saved to", out)


Usual‐intake HEI saved to /workspaces/enterntainment720/nutrition/data/output/hei_usual_intake.csv
