## Overview

In this notebook, we read the downloaded datasets, merge them, and prepare them for exploratory data analysis.

## Dataset

This study utilises data from the August 2021 - August 2023 cycle of the National Health and Nutrition Examination Survey (NHANES), conducted by the Centers for Disease Control and Prevention (CDC).

[NHANES Aug 2021 - Aug 2023 Cycle](https://wwwn.cdc.gov/nchs/nhanes/continuousnhanes/default.aspx?Cycle=2021-2023)

A few datasets of interest were selected for the purpose of analysis.

Demographic Data
- DEMO_L.xpt : Demographic Variables and Sample Weights

Examination Data
- BMX_L.xpt : Body Measures

Laboratory Data
- HDL_L.xpt : Cholesterol - High-Density Lipoprotein
- GHB_L.xpt : Glycohemoglobin
- GLU_L.xpt : Plasma Fasting Glucose

Questionnaire Data
- ALQ_L.xpt : Alcohol Use
- BPQ_L.xpt : Blood Pressure & Cholesterol
- DIQ_L.xpt : Diabetes
- SMQ_L.xpt : Smoking - Cigarette Use



In [1]:
from functools import reduce
from pathlib import Path

import numpy as np
import pandas as pd

# Directory that holds the raw NHANES SAS transport (.xpt) files
data_dir = Path("../data/raw")

# Map a readable dataset name to the NHANES file it is loaded from
files = {
    "alcohol": "ALQ_L.xpt",
    "bmi": "BMX_L.xpt",
    "blood_pressure": "BPQ_L.xpt",
    "demographics": "DEMO_L.xpt",
    "diabetes": "DIQ_L.xpt",
    "glycohemoglobin": "GHB_L.xpt",
    "fasting_plasma_glucose": "GLU_L.xpt",
    "high_density_lipoprotein": "HDL_L.xpt",
    "smoking": "SMQ_L.xpt"
}

# Fail fast, naming every missing file at once, instead of stopping part-way
# through the loop after some files have already been read.
missing_files = [name for name in files.values() if not (data_dir / name).is_file()]
if missing_files:
    raise FileNotFoundError(f"Missing NHANES files in {data_dir}: {missing_files}")

# Load each dataset into a pandas DataFrame, keyed by its readable name.
# NOTE(review): the previews below show 5.397605e-79 where a zero would be expected
# (e.g. ALQ142, WTSAF2YR) - presumably an artifact of decoding the XPORT floats;
# confirm before relying on those columns.
datasets = {}
total_files_count = len(files)
for current_count, (key, filename) in enumerate(files.items(), start=1):
    path = data_dir / filename
    print(f"File {current_count}/{total_files_count} Loading {filename} into DataFrame...")
    datasets[key] = pd.read_sas(path, format="xport")


File 1/9 Loading ALQ_L.xpt into DataFrame...
File 2/9 Loading BMX_L.xpt into DataFrame...
File 3/9 Loading BPQ_L.xpt into DataFrame...
File 4/9 Loading DEMO_L.xpt into DataFrame...
File 5/9 Loading DIQ_L.xpt into DataFrame...
File 6/9 Loading GHB_L.xpt into DataFrame...
File 7/9 Loading GLU_L.xpt into DataFrame...
File 8/9 Loading HDL_L.xpt into DataFrame...
File 9/9 Loading SMQ_L.xpt into DataFrame...


In [2]:
# Preview the first rows of each dataset (one cell per table) to see what the data looks like
# and to identify features shared across tables, such as the SEQN column.
# This cell shows the Alcohol Use questionnaire (ALQ_L.xpt).
datasets["alcohol"].head()

Unnamed: 0,SEQN,ALQ111,ALQ121,ALQ130,ALQ142,ALQ270,ALQ280,ALQ151,ALQ170
0,130378.0,,,,,,,,
1,130379.0,1.0,2.0,3.0,5.397605e-79,,,2.0,
2,130380.0,1.0,10.0,1.0,5.397605e-79,,,2.0,
3,130386.0,1.0,4.0,2.0,10.0,5.397605e-79,10.0,2.0,5.397605e-79
4,130387.0,1.0,5.397605e-79,,,,,2.0,


In [3]:
# Preview the Body Measures examination data (BMX_L.xpt)
datasets["bmi"].head()

Unnamed: 0,SEQN,BMDSTATS,BMXWT,BMIWT,BMXRECUM,BMIRECUM,BMXHEAD,BMIHEAD,BMXHT,BMIHT,...,BMXLEG,BMILEG,BMXARML,BMIARML,BMXARMC,BMIARMC,BMXWAIST,BMIWAIST,BMXHIP,BMIHIP
0,130378.0,1.0,86.9,,,,,,179.5,,...,42.8,,42.0,,35.7,,98.3,,102.9,
1,130379.0,1.0,101.8,,,,,,174.2,,...,38.5,,38.7,,33.7,,114.7,,112.4,
2,130380.0,1.0,69.4,,,,,,152.9,,...,38.5,,35.5,,36.3,,93.5,,98.0,
3,130381.0,1.0,34.3,,,,,,120.1,,...,,,25.4,,23.4,,70.4,,,
4,130382.0,3.0,13.6,,,1.0,,,,1.0,...,,,,1.0,,1.0,,1.0,,


In [4]:
# Preview the Blood Pressure & Cholesterol questionnaire (BPQ_L.xpt)
datasets["blood_pressure"].head()

Unnamed: 0,SEQN,BPQ020,BPQ030,BPQ150,BPQ080,BPQ101D
0,130378.0,1.0,1.0,1.0,2.0,2.0
1,130379.0,1.0,1.0,1.0,2.0,2.0
2,130380.0,2.0,,,1.0,1.0
3,130384.0,2.0,,,2.0,2.0
4,130385.0,2.0,,,2.0,2.0


In [5]:
# Preview the Demographic Variables and Sample Weights (DEMO_L.xpt)
datasets["demographics"].head()

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,DMDHRGND,DMDHRAGZ,DMDHREDZ,DMDHRMAZ,DMDHSEDZ,WTINT2YR,WTMEC2YR,SDMVSTRA,SDMVPSU,INDFMPIR
0,130378.0,12.0,2.0,1.0,43.0,,5.0,6.0,2.0,,...,,,,,,50055.450807,54374.463898,173.0,2.0,5.0
1,130379.0,12.0,2.0,1.0,66.0,,3.0,3.0,2.0,,...,,,,,,29087.450605,34084.721548,173.0,2.0,5.0
2,130380.0,12.0,2.0,2.0,44.0,,2.0,2.0,1.0,,...,,,,,,80062.674301,81196.277992,174.0,1.0,1.41
3,130381.0,12.0,2.0,2.0,5.0,,5.0,7.0,1.0,71.0,...,2.0,2.0,2.0,3.0,,38807.268902,55698.607106,182.0,2.0,1.53
4,130382.0,12.0,2.0,1.0,2.0,,3.0,3.0,2.0,34.0,...,2.0,2.0,3.0,1.0,2.0,30607.519774,36434.146346,182.0,2.0,3.6


In [6]:
# Preview the Diabetes questionnaire (DIQ_L.xpt)
datasets["diabetes"].head()

Unnamed: 0,SEQN,DIQ010,DID040,DIQ160,DIQ180,DIQ050,DID060,DIQ060U,DIQ070
0,130378.0,2.0,,2.0,2.0,,,,
1,130379.0,2.0,,2.0,1.0,,,,
2,130380.0,1.0,35.0,,,2.0,,,1.0
3,130381.0,2.0,,,,,,,
4,130382.0,2.0,,,,,,,


In [7]:
# Preview the Glycohemoglobin laboratory data (GHB_L.xpt)
datasets["glycohemoglobin"].head()

Unnamed: 0,SEQN,WTPH2YR,LBXGH
0,130378.0,56042.12941,5.6
1,130379.0,37435.705647,5.6
2,130380.0,85328.844519,6.2
3,130386.0,44526.214135,5.1
4,130387.0,22746.296353,5.9


In [8]:
# Preview the Plasma Fasting Glucose laboratory data (GLU_L.xpt)
datasets["fasting_plasma_glucose"].head()

Unnamed: 0,SEQN,WTSAF2YR,LBXGLU,LBDGLUSI
0,130378.0,120025.3,113.0,6.27
1,130379.0,5.397605e-79,99.0,5.5
2,130380.0,145090.8,156.0,8.66
3,130386.0,82599.62,100.0,5.55
4,130394.0,100420.3,88.0,4.88


In [9]:
# Preview the High-Density Lipoprotein cholesterol laboratory data (HDL_L.xpt)
datasets["high_density_lipoprotein"].head()

Unnamed: 0,SEQN,WTPH2YR,LBDHDD,LBDHDDSI
0,130378.0,56042.12941,45.0,1.16
1,130379.0,37435.705647,60.0,1.55
2,130380.0,85328.844519,49.0,1.27
3,130386.0,44526.214135,46.0,1.19
4,130387.0,22746.296353,42.0,1.09


In [10]:
# Preview the Smoking - Cigarette Use questionnaire (SMQ_L.xpt)
datasets["smoking"].head()

Unnamed: 0,SEQN,SMQ020,SMQ040,SMD641,SMD650,SMD100MN,SMQ621,SMD630,SMAQUEX2
0,130378.0,1.0,3.0,,,,,,1.0
1,130379.0,1.0,3.0,,,,,,1.0
2,130380.0,2.0,,,,,,,1.0
3,130384.0,2.0,,,,,,,1.0
4,130385.0,2.0,,,,,,,1.0


In [11]:
# Merge the datasets on "SEQN" into a single DataFrame using reduce() and pandas merge().
# The outer join keeps a row for every SEQN that appears in any table; values from
# tables that lack that SEQN are left as NaN.
# validate="one_to_one" makes pandas raise a MergeError if SEQN is duplicated on
# either side, instead of silently multiplying rows.
# Columns that share a name across tables (WTPH2YR is in both GHB_L and HDL_L)
# receive the default _x / _y suffixes.
df = reduce(
    lambda left, right: pd.merge(
        left, right, on="SEQN", how="outer", validate="one_to_one"
    ),
    datasets.values(),
)

In [12]:
# Show information about the merged DataFrame (column names, non-null counts, dtypes).
# pd.set_option changes a global pandas setting, so the row limit stays lifted in later cells too.
# NOTE(review): df.info() verbosity is presumably governed by display.max_info_columns rather
# than display.max_rows - confirm this option is still needed here.
pd.set_option('display.max_rows', None)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11933 entries, 0 to 11932
Data columns (total 85 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SEQN       11933 non-null  float64
 1   ALQ111     5481 non-null   float64
 2   ALQ121     4922 non-null   float64
 3   ALQ130     4069 non-null   float64
 4   ALQ142     4082 non-null   float64
 5   ALQ270     2366 non-null   float64
 6   ALQ280     2362 non-null   float64
 7   ALQ151     4901 non-null   float64
 8   ALQ170     2358 non-null   float64
 9   BMDSTATS   8860 non-null   float64
 10  BMXWT      8754 non-null   float64
 11  BMIWT      345 non-null    float64
 12  BMXRECUM   454 non-null    float64
 13  BMIRECUM   18 non-null     float64
 14  BMXHEAD    70 non-null     float64
 15  BMIHEAD    0 non-null      float64
 16  BMXHT      8499 non-null   float64
 17  BMIHT      134 non-null    float64
 18  BMXBMI     8471 non-null   float64
 19  BMDBMIC    2492 non-null   float64
 20  BMXLEG

In [13]:
# (rows, columns) of the merged DataFrame
df.shape

(11933, 85)

In [14]:
# Show the first 5 rows of the merged DataFrame
# Lift the column and width limits so every column is rendered instead of being elided with "...".
# These are global pandas options and remain in effect for later cells.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

df.head()

Unnamed: 0,SEQN,ALQ111,ALQ121,ALQ130,ALQ142,ALQ270,ALQ280,ALQ151,ALQ170,BMDSTATS,BMXWT,BMIWT,BMXRECUM,BMIRECUM,BMXHEAD,BMIHEAD,BMXHT,BMIHT,BMXBMI,BMDBMIC,BMXLEG,BMILEG,BMXARML,BMIARML,BMXARMC,BMIARMC,BMXWAIST,BMIWAIST,BMXHIP,BMIHIP,BPQ020,BPQ030,BPQ150,BPQ080,BPQ101D,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,DMQMILIZ,DMDBORN4,DMDYRUSR,DMDEDUC2,DMDMARTZ,RIDEXPRG,DMDHHSIZ,DMDHRGND,DMDHRAGZ,DMDHREDZ,DMDHRMAZ,DMDHSEDZ,WTINT2YR,WTMEC2YR,SDMVSTRA,SDMVPSU,INDFMPIR,DIQ010,DID040,DIQ160,DIQ180,DIQ050,DID060,DIQ060U,DIQ070,WTPH2YR_x,LBXGH,WTSAF2YR,LBXGLU,LBDGLUSI,WTPH2YR_y,LBDHDD,LBDHDDSI,SMQ020,SMQ040,SMD641,SMD650,SMD100MN,SMQ621,SMD630,SMAQUEX2
0,130378.0,,,,,,,,,1.0,86.9,,,,,,179.5,,27.0,,42.8,,42.0,,35.7,,98.3,,102.9,,1.0,1.0,1.0,2.0,2.0,12.0,2.0,1.0,43.0,,5.0,6.0,2.0,,2.0,2.0,6.0,5.0,1.0,,4.0,,,,,,50055.450807,54374.463898,173.0,2.0,5.0,2.0,,2.0,2.0,,,,,56042.12941,5.6,120025.3,113.0,6.27,56042.12941,45.0,1.16,1.0,3.0,,,,,,1.0
1,130379.0,1.0,2.0,3.0,5.397605e-79,,,2.0,,1.0,101.8,,,,,,174.2,,33.5,,38.5,,38.7,,33.7,,114.7,,112.4,,1.0,1.0,1.0,2.0,2.0,12.0,2.0,1.0,66.0,,3.0,3.0,2.0,,2.0,1.0,,5.0,1.0,,2.0,,,,,,29087.450605,34084.721548,173.0,2.0,5.0,2.0,,2.0,1.0,,,,,37435.705647,5.6,5.397605e-79,99.0,5.5,37435.705647,60.0,1.55,1.0,3.0,,,,,,1.0
2,130380.0,1.0,10.0,1.0,5.397605e-79,,,2.0,,1.0,69.4,,,,,,152.9,,29.7,,38.5,,35.5,,36.3,,93.5,,98.0,,2.0,,,1.0,1.0,12.0,2.0,2.0,44.0,,2.0,2.0,1.0,,2.0,2.0,6.0,3.0,1.0,2.0,7.0,,,,,,80062.674301,81196.277992,174.0,1.0,1.41,1.0,35.0,,,2.0,,,1.0,85328.844519,6.2,145090.8,156.0,8.66,85328.844519,49.0,1.27,2.0,,,,,,,1.0
3,130381.0,,,,,,,,,1.0,34.3,,,,,,120.1,,23.8,4.0,,,25.4,,23.4,,70.4,,,,,,,,,12.0,2.0,2.0,5.0,,5.0,7.0,1.0,71.0,,1.0,,,,,2.0,2.0,2.0,2.0,3.0,,38807.268902,55698.607106,182.0,2.0,1.53,2.0,,,,,,,,,,,,,,,,,,,,,,,
4,130382.0,,,,,,,,,3.0,13.6,,,1.0,,,,1.0,,,,,,1.0,,1.0,,1.0,,,,,,,,12.0,2.0,1.0,2.0,,3.0,3.0,2.0,34.0,,1.0,,,,,4.0,2.0,2.0,3.0,1.0,2.0,30607.519774,36434.146346,182.0,2.0,3.6,2.0,,,,,,,,,,,,,,,,,,,,,,,


The following features are deemed relevant to the problem and will be used for analysis:
- BMXBMI [Continuous] : Body Mass Index (kg/m^2)
- BPQ020 [Binary] : History of high blood pressure (0=N, 1=Y)
- RIAGENDR [Binary] : Gender (0=F, 1=M)
- RIDAGEYR [Discrete] : Age at screening (years)
- DIQ010 [Multiclass] : Diabetic (Yes, No, Borderline)
- LBXGH [Continuous] : Glycohemoglobin level (%)
- LBDGLUSI [Continuous] : Fasting Glucose (mmol/L)
- LBDHDDSI [Continuous] : Direct HDL-Cholesterol (mmol/L)
- SMQ020 [Binary] : Smoked at least 100 cigarettes in lifetime
- SMQ040 [Multiclass] : Do you now smoke cigarettes (Every day, Some days, Not at all)

In [15]:
# Keep only the columns needed for the analysis, then summarise what is left
keep_cols = [
    "BMXBMI",
    "BPQ020",
    "RIAGENDR",
    "RIDAGEYR",
    "DIQ010",
    "LBXGH",
    "LBDGLUSI",
    "LBDHDDSI",
    "SMQ020",
    "SMQ040",
]

df = df.loc[:, keep_cols]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11933 entries, 0 to 11932
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   BMXBMI    8471 non-null   float64
 1   BPQ020    8498 non-null   float64
 2   RIAGENDR  11933 non-null  float64
 3   RIDAGEYR  11933 non-null  float64
 4   DIQ010    11740 non-null  float64
 5   LBXGH     6715 non-null   float64
 6   LBDGLUSI  3672 non-null   float64
 7   LBDHDDSI  6890 non-null   float64
 8   SMQ020    8135 non-null   float64
 9   SMQ040    3243 non-null   float64
dtypes: float64(10)
memory usage: 932.4 KB


In [16]:
# Swap the NHANES variable codes for descriptive snake_case column names
readable_names = {
    "BMXBMI": "bmi",
    "BPQ020": "high_blood_pressure",
    "RIAGENDR": "gender",
    "RIDAGEYR": "age",
    "DIQ010": "diabetes",
    "LBXGH": "hba1c_percentage",
    "LBDGLUSI": "fpg_mmol_L",
    "LBDHDDSI": "hdl_mmol_L",
    "SMQ020": "lifetime_100_cigs_smoked",
    "SMQ040": "current_smoking_freq",
}
df = df.rename(columns=readable_names)

In [17]:
# Count the missing (NaN) entries per column
df.isna().sum()

bmi                         3462
high_blood_pressure         3435
gender                         0
age                            0
diabetes                     193
hba1c_percentage            5218
fpg_mmol_L                  8261
hdl_mmol_L                  5043
lifetime_100_cigs_smoked    3798
current_smoking_freq        8690
dtype: int64

In [18]:
# Value counts for the "SMQ020" column (smoking status)
# df["lifetime_100_cigs_smoked"].value_counts()

In [19]:
# Verify that every row with SMQ020 == 2.0 has at least one missing value, which explains why no SMQ020 == 2.0 rows remain in the cleaned DataFrame
# sum(df[df["lifetime_100_cigs_smoked"] == 2.0].isnull().sum(axis=1).value_counts())

In [20]:
# Remove rows with missing or unwanted values.
# NOTE(review): dropna() requires every column to be present. The previews above show
# current_smoking_freq (SMQ040) empty whenever lifetime_100_cigs_smoked (SMQ020) is 2.0,
# so this step appears to discard every respondent who answered "no" to SMQ020 -
# confirm that restricting the sample this way is intended.
# NHANES questionnaire special codes: 7 = Refused, 9 = Don't know. Both are excluded
# so that high_blood_pressure only holds real yes/no answers.
# NOTE(review): the other questionnaire columns (diabetes, smoking) may carry the same
# codes - check the unique values printed further down.
NON_ANSWER_CODES = [7.0, 9.0]

df = (
    df.dropna()
    .loc[lambda frame: ~frame["high_blood_pressure"].isin(NON_ANSWER_CODES)]
    .loc[lambda frame: frame["age"] >= 18.0]  # adults only
    .copy()  # independent frame, so later column assignments act on their own data
)

In [21]:
# Resultant DataFrame info after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1209 entries, 0 to 11925
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   bmi                       1209 non-null   float64
 1   high_blood_pressure       1209 non-null   float64
 2   gender                    1209 non-null   float64
 3   age                       1209 non-null   float64
 4   diabetes                  1209 non-null   float64
 5   hba1c_percentage          1209 non-null   float64
 6   fpg_mmol_L                1209 non-null   float64
 7   hdl_mmol_L                1209 non-null   float64
 8   lifetime_100_cigs_smoked  1209 non-null   float64
 9   current_smoking_freq      1209 non-null   float64
dtypes: float64(10)
memory usage: 103.9 KB


In [22]:
# Helper that collects the distinct values found in each column of a DataFrame
def display_unique_valeus(df: pd.DataFrame) -> dict:
    """Return a mapping of column name to the list of unique values in that column.

    Values are listed in the order produced by ``Series.unique``. Nothing is
    printed here; the caller decides how to show the result.
    """
    return {col: df[col].unique().tolist() for col in df.columns}

In [23]:
# Print one line per column: the column name, how many distinct values it has, and the values
unique_col_values = display_unique_valeus(df)
for column_name, distinct_values in unique_col_values.items():
    n_distinct = len(distinct_values)
    print(f"{column_name} ({n_distinct}): {distinct_values}")

bmi (282): [27.0, 33.5, 30.2, 27.3, 21.4, 37.0, 23.0, 37.9, 33.2, 16.7, 29.0, 32.2, 28.3, 25.2, 28.2, 32.1, 27.1, 50.2, 29.7, 38.4, 26.4, 31.9, 19.7, 30.9, 25.9, 35.0, 24.8, 27.4, 39.2, 21.9, 22.9, 20.9, 32.3, 36.8, 26.5, 29.1, 20.7, 32.0, 28.7, 35.2, 21.5, 33.7, 31.4, 32.4, 24.7, 36.6, 34.2, 25.0, 31.3, 27.5, 17.9, 28.9, 33.0, 40.8, 28.6, 23.8, 24.2, 28.1, 40.7, 26.1, 42.4, 20.2, 25.3, 31.1, 37.3, 32.6, 20.5, 28.4, 29.5, 36.1, 40.0, 19.4, 23.2, 26.2, 27.7, 23.7, 37.2, 26.7, 40.6, 29.4, 27.2, 26.6, 24.3, 36.0, 29.9, 33.1, 26.0, 50.1, 28.8, 32.7, 29.3, 28.5, 25.6, 22.7, 20.6, 31.8, 29.2, 31.7, 24.4, 42.1, 20.0, 23.4, 18.6, 25.8, 28.0, 26.3, 43.9, 21.3, 25.7, 41.9, 35.5, 31.0, 36.7, 49.5, 30.1, 32.5, 22.5, 23.9, 34.8, 31.2, 24.6, 42.9, 26.9, 37.5, 25.5, 30.8, 38.8, 30.3, 38.0, 22.4, 45.7, 33.6, 24.1, 30.5, 39.4, 27.8, 38.1, 22.0, 34.0, 41.2, 35.9, 35.6, 46.7, 45.6, 20.4, 33.9, 23.1, 42.5, 40.5, 19.1, 38.6, 21.0, 37.4, 23.5, 21.2, 30.0, 34.9, 54.2, 24.5, 25.1, 30.7, 40.9, 39.7, 30.4, 22.8

In [24]:
# Recode gender so that 2.0 becomes 0.0 (giving 0=F, 1=M); other values are left unchanged
df["gender"] = df["gender"].replace({2.0: 0.0})

In [25]:
# Recode high_blood_pressure so that 2.0 becomes 0.0 (giving 0 = No, 1 = Yes); other values are left unchanged
df["high_blood_pressure"] = df["high_blood_pressure"].replace({2.0: 0.0})

In [26]:
# Cast the coded and whole-number columns from float to int in a single astype call
cols_to_int = [
    "high_blood_pressure",
    "gender",
    "age",
    "diabetes",
    "lifetime_100_cigs_smoked",
    "current_smoking_freq",
]

df = df.astype({col: int for col in cols_to_int})

In [27]:
# Save the cleaned DataFrame to a CSV file (without the index column).
# The output folder is created first so the write does not fail when
# ../data/interim does not exist yet.
interim_path = Path("../data/interim/nhanes_data_interim.csv")
interim_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(interim_path, index=False)