In [14]:
import pandas as pd
import numpy as np

paq = pd.read_sas("../data/PAQ_L.xpt")
demo = pd.read_sas("../data/DEMO_L.xpt")

df = paq.merge(demo, on="SEQN", how="left")


In [15]:
NHANES_MISSING = 5.397605e-79

df.replace(NHANES_MISSING, np.nan, inplace=True)


Unnamed: 0,SEQN,PAD790Q,PAD790U,PAD800,PAD810Q,PAD810U,PAD820,PAD680,SDDSRVYR,RIDSTATR,...,DMDHRGND,DMDHRAGZ,DMDHREDZ,DMDHRMAZ,DMDHSEDZ,WTINT2YR,WTMEC2YR,SDMVSTRA,SDMVPSU,INDFMPIR
0,130378.0,3.000000e+00,b'W',45.0,3.000000e+00,b'W',45.0,360.0,12.0,2.0,...,,,,,,50055.450807,5.437446e+04,173.0,2.0,5.00
1,130379.0,4.000000e+00,b'W',45.0,3.000000e+00,b'W',45.0,480.0,12.0,2.0,...,,,,,,29087.450605,3.408472e+04,173.0,2.0,5.00
2,130380.0,1.000000e+00,b'W',20.0,5.397605e-79,b'',,240.0,12.0,2.0,...,,,,,,80062.674301,8.119628e+04,174.0,1.0,1.41
3,130384.0,5.397605e-79,b'',,5.397605e-79,b'',,60.0,12.0,1.0,...,,,,,,15078.746749,5.397605e-79,179.0,2.0,0.63
4,130385.0,1.000000e+00,b'D',90.0,1.000000e+00,b'W',60.0,180.0,12.0,1.0,...,,,,,,16151.033173,5.397605e-79,187.0,2.0,5.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8148,142305.0,2.000000e+00,b'W',40.0,5.397605e-79,b'',,480.0,12.0,2.0,...,,,,,,28979.979165,4.348341e+04,180.0,2.0,2.25
8149,142307.0,3.000000e+00,b'W',15.0,5.397605e-79,b'',,480.0,12.0,2.0,...,,,,,,69419.620456,6.496233e+04,181.0,1.0,
8150,142308.0,1.000000e+00,b'W',45.0,5.397605e-79,b'',,600.0,12.0,2.0,...,,,,,,32696.313477,4.436753e+04,183.0,2.0,1.95
8151,142309.0,2.000000e+00,b'D',15.0,5.397605e-79,b'',,240.0,12.0,2.0,...,,,,,,30547.974564,4.624936e+04,176.0,1.0,3.11


In [16]:
def decode_unit(x):
    if isinstance(x, bytes):
        return x.decode("utf-8")
    return np.nan

df["PAD790U"] = df["PAD790U"].apply(decode_unit)
df["PAD810U"] = df["PAD810U"].apply(decode_unit)


In [17]:
UNIT_TO_WEEK = {
    "D": 7,
    "W": 1,
    "M": 1/4.33,
    "Y": 1/52
}

def weekly_minutes(freq, unit, mins):
    if pd.isna(freq) or pd.isna(unit) or pd.isna(mins):
        return np.nan
    mult = UNIT_TO_WEEK.get(unit)
    if mult is None:
        return np.nan
    return freq * mins * mult

df["moderate_min_week"] = df.apply(
    lambda r: weekly_minutes(r["PAD790Q"], r["PAD790U"], r["PAD800"]),
    axis=1
)

df["vigorous_min_week"] = df.apply(
    lambda r: weekly_minutes(r["PAD810Q"], r["PAD810U"], r["PAD820"]),
    axis=1
)

In [18]:
df["mvpa_equiv_min_week"] = (
    df["moderate_min_week"] + 2 * df["vigorous_min_week"]
)


In [19]:
df["sedentary_hours_day"] = df["PAD680"] / 60
df.loc[df["sedentary_hours_day"] > 16, "sedentary_hours_day"] = np.nan


In [20]:
df_clean = df.rename(columns={
    "SEQN": "seqn",
    "RIDAGEYR": "age",
    "RIAGENDR": "sex"
})


In [21]:
df_clean = df_clean[
    [
        "seqn",
        "age",
        "sex",
        "moderate_min_week",
        "vigorous_min_week",
        "mvpa_equiv_min_week",
        "sedentary_hours_day"
    ]
]


In [22]:
df_clean.describe()
df_clean.isna().mean()


seqn                   0.000000
age                    0.000000
sex                    0.000000
moderate_min_week      0.216239
vigorous_min_week      0.547896
mvpa_equiv_min_week    0.560039
sedentary_hours_day    0.016926
dtype: float64

In [23]:
df_clean.to_csv("../data/paq_activity_clean.csv", index=False)
print("Saved: data/paq_activity_clean.csv")


Saved: data/paq_activity_clean.csv
