# NHANES project: descriptive and regression analysis
> This notebook collects all the analyses performed on the NHANES dataset for a medical paper project.

Requirements and Information:
1. Nhanes dataset from 1999/00 to 2001/02
2. Oral Frailty Index with:
    - 1: Do you have any difficulties eating tough foods compared to 6 months ago? (OHQ080)
    - 2: Have you choked on your tea or soup recently? (OHQ100 and OHQ105)
    - 3: Do you use dentures? (OHXEDEN)
    - 4: Do you often have a dry mouth?	(OHQ110)
    - 5: Do you go out less frequently than you did last year? (PAQ500)
    - 6: Can you eat hard foods like squid jerky or pickled radish?	(OHQ020)
    - 7: How many times do you brush your teeth in a day? (3 or more times/day)	(OHQ040 and OHQ010)
    - 8:  Do you visit a dental clinic at least annually? (OHQ050)
3. Outcome/Exposure is
4. Demographic Data:
    - Gender (RIAGENDR)
    - Age at screening (RIDAGEYR)
    - Race (RIDRETH1)
    - Education	(DMDEDUC2)
    - Poverty income ratio (INDFMPIR)
    - Smoking status (SMQ020)
5. Confounding Variables:
    - Heart failure (MCQ160B)
    - Coronary heart disease (MCQ160C)
    - Stroke (MCQ160F)
    - COPD / chronic bronchitis (MCQ160K)
    - Liver disease (MCQ160L)
    - Cancer (MCQ220)
    - Diabetes (DIQ010)
    - High blood pressure (BPQ020)
6. Age >= 60
7. Oral Frailty Index cutoff: 
8. Oral Frailty Index groups:

## Install packages

Reference for nhanesA [here](https://cran.r-project.org/web/packages/nhanesA/nhanesA.pdf)

In [None]:
install.packages(c("nhanesA", "survey", "MASS", "dplyr", "tidyr", "tidyverse"))


In [None]:
install.packages("devtools")
devtools::install_github("cjendres1/nhanes")


## Load packages

In [74]:
library(nhanesA)
library(survey)
library(MASS)
library(dplyr)
library(tidyr)
library(tidyverse)

## Load NHANES data (1999/00-2001/02)

In [75]:
# The browser may be directed to a specific year, survey, or table

browseNHANES(
    year = 1999,
    data_group = "DEMO"
)


In [76]:
# Demographics data

demo_99_00 <- nhanes(nh_table = "DEMO")
demo_01_02 <- nhanes(nh_table = "DEMO_B")


In [None]:
nhanesTableSummary("DEMO", use = "codebook")

In [78]:
# Smoking status

smoking_99_00 <- nhanes(nh_table = "SMQ")
smoking_01_02 <- nhanes(nh_table = "SMQ_B")

In [79]:
# Oral health questionnaire (OHQ) tables, one per survey cycle

oral_99_00 <- nhanes(nh_table = "OHQ")
oral_01_02 <- nhanes(nh_table = "OHQ_B")

In [80]:
# Physical Activity

physical_99_00 <- nhanes(nh_table = "PAQ")
physical_01_02 <- nhanes(nh_table = "PAQ_B")

In [81]:
# Medical conditions questionnaire (MCQ) tables, one per survey cycle

med_cond_99_00 <- nhanes(nh_table = "MCQ")
med_cond_01_02 <- nhanes(nh_table = "MCQ_B")

In [82]:
# Diabetes

diabetes_99_00 <- nhanes(nh_table = "DIQ")
diabetes_01_02 <- nhanes(nh_table = "DIQ_B")

In [83]:
# High blood pressure

blood_pressure_99_00 <- nhanes(nh_table = "BPQ")
blood_pressure_01_02 <- nhanes(nh_table = "BPQ_B")

In [84]:
# Edentulous

eden_99_00 <- nhanes(nh_table = "OHXDENT")
eden_01_02 <- nhanes(nh_table = "OHXDEN_B")

## Data Preparation

### Features Selection

In [None]:
# Keep the respondent id (SEQN) plus the demographic columns listed in the
# requirements: gender, age at screening, race, education and poverty
# income ratio. The same columns are taken from both survey cycles.

demo_99_00_selected <- select(
    demo_99_00,
    SEQN, RIAGENDR, RIDAGEYR, RIDRETH1, DMDEDUC2, INDFMPIR
)

demo_01_02_selected <- select(
    demo_01_02,
    SEQN, RIAGENDR, RIDAGEYR, RIDRETH1, DMDEDUC2, INDFMPIR
)

In [None]:
# Keep the respondent id (SEQN) and the smoking status item (SMQ020)
# from each survey cycle.

smoking_99_00_selected <- select(smoking_99_00, SEQN, SMQ020)

smoking_01_02_selected <- select(smoking_01_02, SEQN, SMQ020)

In [None]:
# Keep the respondent id (SEQN) and the medical-condition items
# MCQ160B, MCQ160C, MCQ160F, MCQ160K, MCQ160L and MCQ220 from each cycle.

med_cond_99_00_selected <- select(
    med_cond_99_00,
    SEQN, MCQ160B, MCQ160C, MCQ160F, MCQ160K, MCQ160L, MCQ220
)

med_cond_01_02_selected <- select(
    med_cond_01_02,
    SEQN, MCQ160B, MCQ160C, MCQ160F, MCQ160K, MCQ160L, MCQ220
)

In [None]:
# Keep the respondent id (SEQN) and the diabetes item (DIQ010)
# from each survey cycle.

diabetes_99_00_selected <- select(diabetes_99_00, SEQN, DIQ010)

diabetes_01_02_selected <- select(diabetes_01_02, SEQN, DIQ010)

In [None]:
# Keep the respondent id (SEQN) and the high blood pressure item (BPQ020)
# from each survey cycle.

blood_pressure_99_00_selected <- select(blood_pressure_99_00, SEQN, BPQ020)

blood_pressure_01_02_selected <- select(blood_pressure_01_02, SEQN, BPQ020)

In [None]:
# Keep the respondent id (SEQN) and the physical activity item (PAQ500)
# from each survey cycle.

physical_99_00_selected <- select(physical_99_00, SEQN, PAQ500)

physical_01_02_selected <- select(physical_01_02, SEQN, PAQ500)

In [None]:
# Keep the respondent id (SEQN) and the edentulous item (OHXEDEN)
# from each survey cycle.

eden_99_00_selected <- select(eden_99_00, SEQN, OHXEDEN)

eden_01_02_selected <- select(eden_01_02, SEQN, OHXEDEN)

In [None]:
# Keep the respondent id (SEQN) and the oral health questionnaire items.
# Three items are named differently in the two cycles:
#   1999-2000: OHQ080, OHQ100, OHQ110
#   2001-2002: OHQ085, OHQ105, OHQ115
# OHQ020, OHQ040 and OHQ050 carry the same name in both cycles.

oral_99_00_selected <- select(
    oral_99_00,
    SEQN, OHQ080, OHQ100, OHQ110, OHQ020, OHQ040, OHQ050
)

oral_01_02_selected <- select(
    oral_01_02,
    SEQN, OHQ085, OHQ105, OHQ115, OHQ020, OHQ040, OHQ050
)

### Data Information

In [None]:
# Print a short structural summary of a data frame.
#
# Arguments:
#   df   - data frame to describe
#   name - label printed in the "Dataset:" header line
#
# Prints the row count, column count, in-memory size in MB and one line per
# column with the column's class. Called for its printed output; returns
# NULL invisibly.

print_dataset_info <- function(df, name) {
    cat(sprintf("\nDataset: %s\n", name))
    cat(sprintf("Number of rows: %d\n", nrow(df)))
    cat(sprintf("Number of columns: %d\n", ncol(df)))
    cat(sprintf("Memory usage: %.2f MB\n", object.size(df) / 1024^2))
    cat("\nColumn details:\n")
    for (col_name in colnames(df)) {
        # class() may return several entries (e.g. c("ordered", "factor"));
        # collapse them so sprintf() is not recycled into one line per class.
        col_class <- paste(class(df[[col_name]]), collapse = "/")
        cat(sprintf("- %s: %s\n", col_name, col_class))
    }
    invisible(NULL)
}


In [None]:
print_dataset_info(demo_99_00_selected, "Demographics 1999-2000")

In [None]:
print_dataset_info(demo_01_02_selected, "Demographics 2001-2002")

In [None]:
print_dataset_info(smoking_99_00_selected, "Smoking Status 1999-2000")

In [None]:
print_dataset_info(smoking_01_02_selected, "Smoking Status 2001-2002")

In [None]:
print_dataset_info(med_cond_99_00_selected, "Medical Conditions 1999-2000")

In [None]:
print_dataset_info(med_cond_01_02_selected, "Medical Conditions 2001-2002")

In [None]:
print_dataset_info(diabetes_99_00_selected, "Diabetes 1999-2000")

In [None]:
print_dataset_info(diabetes_01_02_selected, "Diabetes 2001-2002")

In [None]:
print_dataset_info(blood_pressure_99_00_selected, "High Blood Pressure 1999-2000")

In [None]:
print_dataset_info(blood_pressure_01_02_selected, "High Blood Pressure 2001-2002")

In [None]:
print_dataset_info(physical_99_00_selected, "Physical Activities 1999-2000")

In [None]:
print_dataset_info(physical_01_02_selected, "Physical Activities 2001-2002")

In [None]:
print_dataset_info(eden_99_00_selected, "Edentulous 1999-2000")

In [None]:
print_dataset_info(eden_01_02_selected, "Edentulous 2001-2002")

In [None]:
print_dataset_info(oral_99_00_selected, "Oral Health 1999-2000")

In [None]:
print_dataset_info(oral_01_02_selected, "Oral Health 2001-2002")

### Unique values and % for each categorical feature in the datasets

In [None]:
# Print value counts and percentages for every categorical column.
#
# Arguments:
#   df   - data frame to inspect
#   name - label printed in the header line
#
# Factor and character columns are tabulated with table(), which leaves NA
# out, so each percentage is relative to the non-missing values of that
# column. Called for its printed output; returns NULL invisibly.

print_categorical_info <- function(df, name) {
  cat(sprintf("\nCategorical Column Analysis for Dataset: %s\n", name))

  # vapply() always yields a logical vector, even for a zero-column data
  # frame, where sapply() would return an empty list and make `|` fail.
  is_categorical <- vapply(
    df,
    function(column) is.factor(column) || is.character(column),
    logical(1)
  )
  categorical_cols <- names(df)[is_categorical]

  if (length(categorical_cols) == 0) {
    cat("No categorical columns found.\n")
    # Invisible so the early exit does not echo "NULL" in the notebook.
    return(invisible(NULL))
  }

  for (col_name in categorical_cols) {
    cat(sprintf("\nColumn: %s\n", col_name))
    value_counts <- table(df[[col_name]])
    total_count <- sum(value_counts)
    percentages <- round(100 * value_counts / total_count, 2)

    for (i in seq_along(value_counts)) {
      cat(sprintf("- %s: %d (%.2f%%)\n", names(value_counts)[i], value_counts[i], percentages[i]))
    }
  }

  invisible(NULL)
}

In [None]:
print_categorical_info(demo_99_00_selected, "Demographics 1999-2000")

In [None]:
print_categorical_info(demo_01_02_selected, "Demographics 2001-2002")

In [None]:
print_categorical_info(smoking_99_00_selected, "Smoking Status 1999-2000")

In [None]:
print_categorical_info(smoking_01_02_selected, "Smoking Status 2001-2002")

In [None]:
print_categorical_info(med_cond_99_00_selected, "Medical Conditions 1999-2000")

In [None]:
print_categorical_info(med_cond_01_02_selected, "Medical Conditions 2001-2002")

In [None]:
print_categorical_info(diabetes_99_00_selected, "Diabetes 1999-2000")

In [None]:
print_categorical_info(diabetes_01_02_selected, "Diabetes 2001-2002")

In [None]:
print_categorical_info(blood_pressure_99_00_selected, "High Blood Pressure 1999-2000")

In [None]:
print_categorical_info(blood_pressure_01_02_selected, "High Blood Pressure 2001-2002")

In [None]:
print_categorical_info(physical_99_00_selected, "Physical Activities 1999-2000")

In [None]:
print_categorical_info(physical_01_02_selected, "Physical Activities 2001-2002")

In [None]:
print_categorical_info(eden_99_00_selected, "Edentulous 1999-2000")

In [None]:
print_categorical_info(eden_01_02_selected, "Edentulous 2001-2002")

In [None]:
print_categorical_info(oral_99_00_selected, "Oral Health 1999-2000")

In [None]:
print_categorical_info(oral_01_02_selected, "Oral Health 2001-2002")

### Missing Values and % for each dataset

In [86]:
# Summarise missing data per column.
#
# For every column of `df`, counts the NA entries and expresses the count as
# a percentage of the number of rows, rounded to two decimals. Returns a data
# frame with the columns `variable`, `n_missing` and `percent_missing`,
# ordered by descending `n_missing`.

missing_values <- function(df) {
    na_per_column <- colSums(is.na(df))
    na_share <- (na_per_column / nrow(df)) * 100

    summary_table <- data.frame(
        variable = names(na_per_column),
        n_missing = na_per_column,
        percent_missing = round(na_share, 2)
    )

    arrange(summary_table, desc(n_missing))
}

In [87]:
demo_99_00_missing <- missing_values(demo_99_00_selected)
print(demo_99_00_missing)

         variable n_missing percent_missing
DMDEDUC2 DMDEDUC2      5087           51.05
INDFMPIR INDFMPIR      1483           14.88
SEQN         SEQN         0            0.00
RIAGENDR RIAGENDR         0            0.00
RIDAGEYR RIDAGEYR         0            0.00
RIDRETH1 RIDRETH1         0            0.00


In [88]:
demo_01_02_missing <- missing_values(demo_01_02_selected)
print(demo_01_02_missing)

         variable n_missing percent_missing
DMDEDUC2 DMDEDUC2      5632           51.02
INDFMPIR INDFMPIR       794            7.19
SEQN         SEQN         0            0.00
RIAGENDR RIAGENDR         0            0.00
RIDAGEYR RIDAGEYR         0            0.00
RIDRETH1 RIDRETH1         0            0.00


In [89]:
smoking_99_00_missing <- missing_values(smoking_99_00_selected)
print(smoking_99_00_missing)

       variable n_missing percent_missing
SMQ020   SMQ020         3            0.06
SEQN       SEQN         0            0.00


In [90]:
smoking_01_02_missing <- missing_values(smoking_01_02_selected)
print(smoking_01_02_missing)

       variable n_missing percent_missing
SMQ020   SMQ020         4            0.07
SEQN       SEQN         0            0.00


In [91]:
med_cond_99_00_missing <- missing_values(med_cond_99_00_selected)
print(med_cond_99_00_missing)

        variable n_missing percent_missing
MCQ160B  MCQ160B      4613           48.59
MCQ160C  MCQ160C      4613           48.59
MCQ160F  MCQ160F      4613           48.59
MCQ160K  MCQ160K      4613           48.59
MCQ160L  MCQ160L      4613           48.59
MCQ220    MCQ220      4613           48.59
SEQN        SEQN         0            0.00


In [93]:
med_cond_01_02_missing <- missing_values(med_cond_01_02_selected)
print(med_cond_01_02_missing)

        variable n_missing percent_missing
MCQ160B  MCQ160B      5060           48.33
MCQ160C  MCQ160C      5060           48.33
MCQ160F  MCQ160F      5060           48.33
MCQ160K  MCQ160K      5060           48.33
MCQ160L  MCQ160L      5060           48.33
MCQ220    MCQ220      5060           48.33
SEQN        SEQN         0            0.00


In [94]:
diabetes_99_00_missing <- missing_values(diabetes_99_00_selected)
print(diabetes_99_00_missing)

       variable n_missing percent_missing
DIQ010   DIQ010         5            0.05
SEQN       SEQN         0            0.00


In [95]:
diabetes_01_02_missing <- missing_values(diabetes_01_02_selected)
print(diabetes_01_02_missing)

       variable n_missing percent_missing
DIQ010   DIQ010         2            0.02
SEQN       SEQN         0            0.00


In [96]:
blood_pressure_99_00_missing <- missing_values(blood_pressure_99_00_selected)
print(blood_pressure_99_00_missing)

       variable n_missing percent_missing
BPQ020   BPQ020        93            1.54
SEQN       SEQN         0            0.00


In [97]:
blood_pressure_01_02_missing <- missing_values(blood_pressure_01_02_selected)
print(blood_pressure_01_02_missing)

       variable n_missing percent_missing
BPQ020   BPQ020       100            1.51
SEQN       SEQN         0            0.00


In [98]:
physical_99_00_missing <- missing_values(physical_99_00_selected)
print(physical_99_00_missing)

       variable n_missing percent_missing
PAQ500   PAQ500      1990           21.66
SEQN       SEQN         0            0.00


In [99]:
physical_01_02_missing <- missing_values(physical_01_02_selected)
print(physical_01_02_missing)

       variable n_missing percent_missing
PAQ500   PAQ500      2310           22.88
SEQN       SEQN         0            0.00


In [100]:
eden_99_00_missing <- missing_values(eden_99_00_selected)
print(eden_99_00_missing)

        variable n_missing percent_missing
OHXEDEN  OHXEDEN      8151           94.92
SEQN        SEQN         0            0.00


In [101]:
eden_01_02_missing <- missing_values(eden_01_02_selected)
print(eden_01_02_missing)

        variable n_missing percent_missing
OHXEDEN  OHXEDEN      9172           95.56
SEQN        SEQN         0            0.00


In [103]:
oral_99_00_missing <- missing_values(oral_99_00_selected)
print(oral_99_00_missing)

       variable n_missing percent_missing
OHQ080   OHQ080      7364           79.78
OHQ100   OHQ100      7364           79.78
OHQ110   OHQ110      7364           79.78
OHQ040   OHQ040      6898           74.73
OHQ020   OHQ020      3787           41.03
OHQ050   OHQ050      3457           37.45
SEQN       SEQN         0            0.00


In [104]:
oral_01_02_missing <- missing_values(oral_01_02_selected)
print(oral_01_02_missing)

       variable n_missing percent_missing
OHQ040   OHQ040      7543           74.48
OHQ085   OHQ085      4317           42.63
OHQ105   OHQ105      4317           42.63
OHQ115   OHQ115      4317           42.63
OHQ020   OHQ020      4138           40.86
OHQ050   OHQ050      3541           34.97
SEQN       SEQN         0            0.00


### Combine Demographics datasets and filter with age >= 60

In [None]:
# Stack the 1999-2000 rows on top of the 2001-2002 rows.
combined_demo <- demo_99_00_selected %>%
    bind_rows(demo_01_02_selected)

In [119]:
dim(combined_demo)

In [120]:
# Restrict the combined demographics to participants aged 60 or older.
elderly_demo_data <- filter(combined_demo, RIDAGEYR >= 60)

In [123]:
dim(elderly_demo_data)

### Next steps

In [None]:
# Feature Engineering
#
# process_features(data): adds derived demographic, income, dentition and
# periodontal columns to `data` and returns the augmented data frame.
#
# Columns read from `data`: RIAGENDR, RIDRETH1, DMDEDUC2, INDHHIN2, every
# column whose name starts with "OHX" and ends with "TC", and every column
# whose name ends with "CSC".
#
# NOTE(review): all recodes compare against numeric codes (1, 2, ...); this
# presumably expects untranslated NHANES codes rather than label factors —
# confirm against how the tables are downloaded.
# NOTE(review): INDHHIN2 is not among the columns kept by the demographics
# selection in this notebook (SEQN, RIAGENDR, RIDAGEYR, RIDRETH1, DMDEDUC2,
# INDFMPIR); the income recodes cannot be evaluated if the column is absent —
# confirm the intended input.

process_features <- function(data) {
    processed_data <- data %>%
        # Gender label; RIAGENDR values other than 1 or 2 become NA.
        mutate(
            gender = factor(x = RIAGENDR, levels = c(1, 2), labels = c("Male", "Female"))
        ) %>%
        # Ethnicity label plus the code as a plain number.
        mutate(
            ethnicity = case_when(
                RIDRETH1 == 1 ~ "Mexican American",
                RIDRETH1 == 2 ~ "Other Hispanic",
                RIDRETH1 == 3 ~ "Non-Hispanic White",
                RIDRETH1 == 4 ~ "Non-Hispanic Black",
                RIDRETH1 == 5 ~ "Other Race - Including Multi-Racial",
                TRUE ~ NA_character_
            ),
            ethnicity_num = as.numeric(RIDRETH1)
        ) %>%
        # Education in three forms: five-level label, three-level label and
        # an approximate number of years of schooling.
        mutate(
            # Codes 7 and 9 are mapped to NA explicitly; any other unmatched
            # value also ends up NA because no branch applies.
            education = case_when(
                DMDEDUC2 == 1 ~ "Less than 9th grade",
                DMDEDUC2 == 2 ~ "9-11th grade",
                DMDEDUC2 == 3 ~ "High school graduate/GED",
                DMDEDUC2 == 4 ~ "Some college or AA degree",
                DMDEDUC2 == 5 ~ "College graduate or above",
                DMDEDUC2 %in% c(7, 9) ~ NA_character_
            ),
            education_3cat = case_when(
                DMDEDUC2 %in% c(1, 2) ~ "Less than 12 years",
                DMDEDUC2 == 3 ~ "12 years",
                DMDEDUC2 %in% c(4, 5) ~ "More than 12 years",
                TRUE ~ NA_character_
            ),
            education_years = case_when(
                DMDEDUC2 == 1 ~ 8, # Less than 9th grade
                DMDEDUC2 == 2 ~ 10, # 9-11th grade
                DMDEDUC2 == 3 ~ 12, # High school/GED
                DMDEDUC2 == 4 ~ 14, # Some college/AA
                DMDEDUC2 == 5 ~ 16, # College graduate
                TRUE ~ NA_real_
            )
        ) %>%
        # Binary indicators for each ethnic category
        mutate(
            is_mexican_american = RIDRETH1 == 1,
            is_other_hispanic = RIDRETH1 == 2,
            is_nh_white = RIDRETH1 == 3,
            is_nh_black = RIDRETH1 == 4,
            is_other_race = RIDRETH1 == 5
        ) %>%
        # Binary indicators for each educational level
        # (is_less_than_hs uses %in%, so it is FALSE rather than NA when
        # DMDEDUC2 is missing; the == indicators are NA in that case)
        mutate(
            is_less_than_hs = DMDEDUC2 %in% c(1, 2),
            is_hs_grad = DMDEDUC2 == 3,
            is_some_college = DMDEDUC2 == 4,
            is_college_grad = DMDEDUC2 == 5
        ) %>%
        # Income bracket label. Codes 11 and 12 have no branch of their own
        # and therefore fall through to NA.
        mutate(
            income_category = case_when(
                INDHHIN2 %in% c(1:4) ~ "Under $20,000",
                INDHHIN2 %in% c(5:7) ~ "$20,000 to $44,999",
                INDHHIN2 %in% c(8:10) ~ "$45,000 to $74,999",
                INDHHIN2 == 13 ~ "$75,000 to $99,999",
                INDHHIN2 == 14 ~ "$100,000 and Over",
                INDHHIN2 %in% c(77, 99) ~ NA_character_,
                TRUE ~ NA_character_
            )
        ) %>%
        # Numeric income value per bracket (see the per-line range comments).
        mutate(
            income_numeric = case_when(
                INDHHIN2 == 1 ~ 2500, # $0-$4,999
                INDHHIN2 == 2 ~ 7500, # $5,000-$9,999
                INDHHIN2 == 3 ~ 12500, # $10,000-$14,999
                INDHHIN2 == 4 ~ 17500, # $15,000-$19,999
                INDHHIN2 == 5 ~ 22500, # $20,000-$24,999
                INDHHIN2 == 6 ~ 30000, # $25,000-$34,999
                INDHHIN2 == 7 ~ 40000, # $35,000-$44,999
                INDHHIN2 == 8 ~ 50000, # $45,000-$54,999
                INDHHIN2 == 9 ~ 60000, # $55,000-$64,999
                INDHHIN2 == 10 ~ 70000, # $65,000-$74,999
                INDHHIN2 == 13 ~ 87500, # $75,000-$99,999
                INDHHIN2 == 14 ~ 125000, # $100,000 and Over
                TRUE ~ NA_real_
            )
        ) %>%
        # Tooth counts: per row, count the OHX*TC columns equal to 1, 2 and 3.
        # Because of na.rm = TRUE a row with no tooth-count data gets 0 for
        # all three counts, so perc_permanent is 0/0 (NaN) and dental_status
        # falls through to "Poor dentition".
        mutate(
            permanent_teeth = rowSums(select(., starts_with("OHX") & ends_with("TC")) == 1, na.rm = TRUE),
            missing_teeth = rowSums(select(., starts_with("OHX") & ends_with("TC")) == 2, na.rm = TRUE),
            root_fragments = rowSums(select(., starts_with("OHX") & ends_with("TC")) == 3, na.rm = TRUE),
            perc_permanent = (permanent_teeth / (permanent_teeth + missing_teeth + root_fragments)) * 100,
            dental_status = case_when(
                permanent_teeth >= 20 ~ "Adequate dentition",
                permanent_teeth >= 10 ~ "Partial dentition",
                TRUE ~ "Poor dentition"
            )
        ) %>%
        # Periodontal status from the row mean of all *CSC columns. Rows whose
        # mean cannot be computed do not satisfy the first branch and are
        # therefore labelled "No/Mild".
        mutate(
            periodontal_disease = case_when(
                # Definition CDC/AAP as in the paper (this is a simplified example)
                rowMeans(select(., ends_with("CSC")), na.rm = TRUE) >= 4 ~ "Moderate/Severe",
                TRUE ~ "No/Mild"
            )
        )

    return(processed_data)
}


In [None]:
# NOTE(review): `features_selected` is not defined in the visible part of this
# notebook — confirm which merged data frame is meant before running this cell.
preprocessed_data <- process_features(features_selected)
head(preprocessed_data)


## nhanesA packages functionalities

### Create dataframe using NHANES codes

In [None]:
DF <- nhanesA::nhanes(nh_table = "OHQ")
DF$OHQ080


In [None]:
nhanesA::nhanesCodebook("OHQ", "OHQ080")


In [None]:
head(DF[, c("OHQ020", "OHQ040", "OHQ050", "OHQ080", "OHQ100", "OHQ110")])


### How to drop all NA values

In [None]:
DF_filtered <- na.omit(DF[, c("OHQ020", "OHQ040", "OHQ050", "OHQ080", "OHQ100", "OHQ110")])
dim(DF_filtered)


### Explore Nhanes attributes

In [None]:
attributes_df <- nhanesAttr(nh_table = "OHQ")
attributes_df


In [None]:
attributes_df <- nhanesTableSummary(nh_table = "OHQ")
attributes_df


### Display codebook for selected variable

In [None]:
nhanesCodebook(nh_table = "OHQ")$OHQ100


In [None]:
# You can do the same with nhanesCodebookFromURL: Download and parse an NHANES doc file from a URL

nhanesCodebookFromURL(url = "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2011/DataFiles/OHQ_G.htm")$OHQ885


In [None]:
dxa_c_s <- nhanesDXA(2003, suppl = TRUE)
head(dxa_c_s)


### Download and parse NHANES manifests

In [None]:
manifest <- nhanesManifest(sizes = FALSE)
dim(manifest)


In [None]:
varmf <- nhanesManifest("variables", component = "lab")
head(varmf)


### Perform a search over the comprehensive NHANES variable list

In [None]:
# Search the NHANES variable descriptions for "bladder" (2001-2008 cycles).
# This notebook never assigned `bladder` before inspecting it, so the name
# would be undefined or could resolve to an unrelated object of the same
# name exported by an attached package (e.g. survival — TODO confirm).
bladder <- nhanesSearch("bladder", ystart = 2001, ystop = 2008, nchar = 50)
length(bladder)


In [None]:
head(bladder)


### Explore Data Groups such as Questionnaire, Demographics...

In [None]:
nhanesA::nhanesTables(data_group = "QUESTIONNAIRE", year = 2000)


In [None]:
exam <- nhanesA::nhanesTables("QUESTIONNAIRE", 2000)
dim(exam)


In [None]:
nhanesA::nhanesTableSummary("DEMO_D", use = "codebook")


In [None]:
# Displays a list of variables in the specified NHANES table.
exam_ohx <- nhanesTableVars("EXAM", "OHX_E", details = TRUE, nchar = 50)
dim(exam_ohx)


In [None]:
head(exam_ohx)


In [None]:
# Display code translation information.

nhanesTranslate("DEMO_B", c("DMDBORN", "DMDCITZN"))
