# SID Genetics Study Step 3: Exclusions and Matching

## Objective
The purpose of this notebook is to create the final statin-induced diabetes (SID) phenotype by excluding participants with missing data, type 1 diabetes (T1D), and no short-read whole genome sequence (srWGS) data. Statin users are then matched to 2 non-users based on a combination of exact, threshold, and propensity score matching, and any participants who are not matched are excluded as well.

# Pull in Data

**Objective**: The purpose of this section is to load packages and pull in data from the All of Us Research Program (AoURP). The AoURP dataset code (R and SQL) is generated using the AoURP's cohort builder.

In [None]:
# Load necessary packages

# install.packages("allofus")
# install.packages("tidyverse")
# install.packages("stats")
# install.packages("ggplot2")
# install.packages("cowplot")
# install.packages("tableone")
# install.packages("DiagrammeR")


library(allofus)
library(tidyverse)
library(stats)
library(ggplot2)
library(cowplot)
library(tableone)
library(DiagrammeR)

my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

In [None]:
# Pull in covariate data frames
# Each CSV is copied from the workspace bucket to the local disk with gsutil,
# read into memory, and its dimensions are printed as a sanity check.
pheno_dir <- paste0(my_bucket, "/sid_pheno_files/")

# Statin users
system(paste0("gsutil cp ", pheno_dir, "statin_user_covs_df_v2.csv", " ."), intern = TRUE)
statin_user_covs_df <- read.csv(file = "statin_user_covs_df_v2.csv")

dim(statin_user_covs_df)

# Non-users
system(paste0("gsutil cp ", pheno_dir, "non_users_covs_df_v2.csv", " ."), intern = TRUE)
non_user_covs_df <- read.csv(file = "non_users_covs_df_v2.csv")

dim(non_user_covs_df)

In [None]:
# Pull in T1D data

library(tidyverse)
library(bigrquery)

# This query represents dataset "T1D ICD Codes" for domain "condition" and was generated for All of Us Controlled Tier Dataset v7
# It selects person_id, the condition concept id and its standard concept name,
# the condition start datetime, and the source value from `condition_occurrence`,
# keeping rows whose condition_source_concept_id is one of the non-standard
# (is_standard = 0), selectable concepts found under the listed `cb_criteria`
# concept ids. The concept name comes from a LEFT JOIN to `concept`.
# NOTE(review): this query was generated for CDR v7, while the other queries in
#               this notebook were generated for v8 -- confirm it is still valid
#               against the current WORKSPACE_CDR.
dataset_81335991_condition_sql <- paste("
    SELECT
        c_occurrence.person_id,
        c_occurrence.condition_concept_id,
        c_standard_concept.concept_name as standard_concept_name,
        c_occurrence.condition_start_datetime,
        c_occurrence.condition_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `condition_occurrence` c_occurrence 
        WHERE
            (
                condition_source_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (35206878, 35206879, 44819501, 44819502, 44819504, 44820682, 44820683, 44820684, 44821787, 44822934, 44822935, 44822936, 44824071, 44825264, 44829881, 44831046, 44832190, 44832191, 44832192, 44833368, 44834549, 44836918, 45533018, 45537960, 45542736, 45547622, 45547623, 45547624, 45552379, 45552381, 45552382, 45552383, 45557110, 45566729, 45576438, 45581350, 45600636, 45600637, 45600638, 45600639, 45600640, 45605398)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 0 
                    AND is_selectable = 1)
            )) c_occurrence 
    LEFT JOIN
        `concept` c_standard_concept 
            ON c_occurrence.condition_concept_id = c_standard_concept.concept_id", sep="")

# Build the Cloud Storage destination for the BigQuery export.
# NOTE: The path contains today's date, so repeated exports on the same day
#       overwrite each other while exports on different days are kept side by
#       side as the dataset definition changes.
export_day <- strftime(lubridate::now(), "%Y%m%d")  # Drop this component if you want the export to always overwrite.
condition_81335991_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  export_day,
  "condition_81335991",
  "condition_81335991_*.csv")
message(str_glue('The data will be written to {condition_81335991_path}. Use this path when reading ',
                 'the data into your notebooks in the future.'))

# Run the query against the workspace CDR, then export the result to Cloud
# Storage as CSV shards.
# NOTE: `bq_table_save` only needs to run once. Afterwards the CSVs in Cloud
#       Storage can be read directly.
t1d_query_table <- bq_dataset_query(
  Sys.getenv("WORKSPACE_CDR"),
  dataset_81335991_condition_sql,
  billing = Sys.getenv("GOOGLE_PROJECT"))
bq_table_save(t1d_query_table, condition_81335991_path, destination_format = "CSV")


# Read the exported CSV shards straight from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {condition_81335991_path}` to copy these files
#       to the Jupyter disk.
#
# export_path: a gs:// path (wildcards allowed) that `gsutil ls` expands into
#              the individual CSV shards.
# Returns the row-bound contents of every shard.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Force the free-text columns to character; all other columns are guessed.
  col_types <- cols(standard_concept_name = col_character(), condition_source_value = col_character())

  # One entry per line printed by `gsutil ls` (stdout and stderr are both captured).
  shard_uris <- system2('gsutil', args = c('ls', export_path), stdout = TRUE, stderr = TRUE)

  load_shard <- function(csv) {
    message(str_glue('Loading {csv}.'))
    read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
  }

  bind_rows(map(shard_uris, load_shard))
}
dataset_81335991_condition_df <- read_bq_export_from_workspace_bucket(condition_81335991_path)

dim(dataset_81335991_condition_df)

# head(dataset_81335991_condition_df, 5)

In [None]:
# Pull in LDL data

library(tidyverse)
library(bigrquery)

# This query represents dataset "LDLC" for domain "measurement" and was generated for All of Us Controlled Tier Dataset v8
# It selects person_id, the measurement concept id and its standard concept
# name, the measurement datetime, the numeric value, and the unit (concept name
# and source value) from `measurement`, keeping rows whose
# measurement_concept_id is one of the standard (is_standard = 1), selectable
# concepts found under the five listed `cb_criteria` concept ids. Concept and
# unit names come from LEFT JOINs to `concept`.
dataset_97827399_measurement_sql <- paste("
    SELECT
        measurement.person_id,
        measurement.measurement_concept_id,
        m_standard_concept.concept_name as standard_concept_name,
        measurement.measurement_datetime,
        measurement.value_as_number,
        m_unit.concept_name as unit_concept_name,
        measurement.unit_source_value 
    FROM
        ( SELECT
            * 
        FROM
            `measurement` measurement 
        WHERE
            (
                measurement_concept_id IN (SELECT
                    DISTINCT c.concept_id 
                FROM
                    `cb_criteria` c 
                JOIN
                    (SELECT
                        CAST(cr.id as string) AS id       
                    FROM
                        `cb_criteria` cr       
                    WHERE
                        concept_id IN (3009966, 3028288, 3028437, 3053341, 40795800)       
                        AND full_text LIKE '%_rank1]%'      ) a 
                        ON (c.path LIKE CONCAT('%.', a.id, '.%') 
                        OR c.path LIKE CONCAT('%.', a.id) 
                        OR c.path LIKE CONCAT(a.id, '.%') 
                        OR c.path = a.id) 
                WHERE
                    is_standard = 1 
                    AND is_selectable = 1)
            )) measurement 
    LEFT JOIN
        `concept` m_standard_concept 
            ON measurement.measurement_concept_id = m_standard_concept.concept_id 
    LEFT JOIN
        `concept` m_unit 
            ON measurement.unit_concept_id = m_unit.concept_id", sep="")

# Build the Cloud Storage destination for the BigQuery export.
# NOTE: The path contains today's date, so repeated exports on the same day
#       overwrite each other while exports on different days are kept side by
#       side as the dataset definition changes.
ldl_export_day <- strftime(lubridate::now(), "%Y%m%d")  # Drop this component if you want the export to always overwrite.
measurement_97827399_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  ldl_export_day,
  "measurement_97827399",
  "measurement_97827399_*.csv")
message(str_glue('The data will be written to {measurement_97827399_path}. Use this path when reading ',
                 'the data into your notebooks in the future.'))

# Run the query against the workspace CDR, then export the result to Cloud
# Storage as CSV shards.
# NOTE: `bq_table_save` only needs to run once. Afterwards the CSVs in Cloud
#       Storage can be read directly.
ldl_query_table <- bq_dataset_query(
  Sys.getenv("WORKSPACE_CDR"),
  dataset_97827399_measurement_sql,
  billing = Sys.getenv("GOOGLE_PROJECT"))
bq_table_save(ldl_query_table, measurement_97827399_path, destination_format = "CSV")


# Read the exported CSV shards straight from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {measurement_97827399_path}` to copy these files
#       to the Jupyter disk.
#
# export_path: a gs:// path (wildcards allowed) that `gsutil ls` expands into
#              the individual CSV shards.
# Returns the row-bound contents of every shard.
read_bq_export_from_workspace_bucket <- function(export_path) {
  # Force the free-text columns to character; all other columns are guessed.
  col_types <- cols(standard_concept_name = col_character(), unit_concept_name = col_character(), unit_source_value = col_character())

  # One entry per line printed by `gsutil ls` (stdout and stderr are both captured).
  shard_uris <- system2('gsutil', args = c('ls', export_path), stdout = TRUE, stderr = TRUE)

  load_shard <- function(csv) {
    message(str_glue('Loading {csv}.'))
    read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
  }

  bind_rows(map(shard_uris, load_shard))
}
dataset_97827399_measurement_df <- read_bq_export_from_workspace_bucket(measurement_97827399_path)

dim(dataset_97827399_measurement_df)

# head(dataset_97827399_measurement_df, 5)

In [None]:
# Pull in statin prescription data
# Copy the prescription CSV from the workspace bucket to the local disk, then load it.
statin_rx_uri <- paste0(my_bucket, "/sid_pheno_files/", "statin_rx_df_v2_rx.csv")
system(paste0("gsutil cp ", statin_rx_uri, " ."), intern = TRUE)
statin_rx_df_itt <- read.csv(file = "statin_rx_df_v2_rx.csv")

In [None]:
# Participants with genomic data

library(tidyverse)
library(bigrquery)

# This query represents dataset "SRWGS IDs" for domain "person" and was generated for All of Us Controlled Tier Dataset v8
# It returns the person_id of every row in `person` whose id appears in
# `cb_search_person` with has_whole_genome_variant = 1.
dataset_91464632_person_sql <- paste("
    SELECT
        person.person_id 
    FROM
        `person` person   
    WHERE
        person.PERSON_ID IN (SELECT
            distinct person_id  
        FROM
            `cb_search_person` cb_search_person  
        WHERE
            cb_search_person.person_id IN (SELECT
                person_id 
            FROM
                `cb_search_person` p 
            WHERE
                has_whole_genome_variant = 1 ) )", sep="")

# Build the Cloud Storage destination for the BigQuery export.
# NOTE: The path contains today's date, so repeated exports on the same day
#       overwrite each other while exports on different days are kept side by
#       side as the dataset definition changes.
srwgs_export_day <- strftime(lubridate::now(), "%Y%m%d")  # Drop this component if you want the export to always overwrite.
person_91464632_path <- file.path(
  Sys.getenv("WORKSPACE_BUCKET"),
  "bq_exports",
  Sys.getenv("OWNER_EMAIL"),
  srwgs_export_day,
  "person_91464632",
  "person_91464632_*.csv")
message(str_glue('The data will be written to {person_91464632_path}. Use this path when reading ',
                 'the data into your notebooks in the future.'))

# Run the query against the workspace CDR, then export the result to Cloud
# Storage as CSV shards.
# NOTE: `bq_table_save` only needs to run once. Afterwards the CSVs in Cloud
#       Storage can be read directly.
srwgs_query_table <- bq_dataset_query(
  Sys.getenv("WORKSPACE_CDR"),
  dataset_91464632_person_sql,
  billing = Sys.getenv("GOOGLE_PROJECT"))
bq_table_save(srwgs_query_table, person_91464632_path, destination_format = "CSV")


# Read the data directly from Cloud Storage into memory.
# NOTE: Alternatively you can `gsutil -m cp {person_91464632_path}` to copy these files
#       to the Jupyter disk.
#
# export_path: a gs:// path (wildcards allowed) that `gsutil ls` expands into
#              the individual CSV shards.
# Returns the row-bound contents of every shard.
#
# No column types are fixed up front: the first shard is read with guessed
# types and its column specification is then reused for every later shard, so
# all shards are parsed consistently. (Previously the specification was
# assigned inside the per-shard closure, where it was discarded after each
# call, so every shard was guessed independently.)
read_bq_export_from_workspace_bucket <- function(export_path) {
  col_types <- NULL

  # One entry per line printed by `gsutil ls` (stdout and stderr are both captured).
  shard_uris <- system2('gsutil', args = c('ls', export_path), stdout = TRUE, stderr = TRUE)

  chunks <- vector("list", length(shard_uris))
  for (k in seq_along(shard_uris)) {
    csv <- shard_uris[[k]]
    message(str_glue('Loading {csv}.'))
    chunk <- read_csv(pipe(str_glue('gsutil cat {csv}')), col_types = col_types, show_col_types = FALSE)
    if (is.null(col_types)) {
      # Remember the first shard's inferred types for the remaining shards.
      col_types <- spec(chunk)
    }
    chunks[[k]] <- chunk
  }

  bind_rows(chunks)
}
dataset_91464632_person_df <- read_bq_export_from_workspace_bucket(person_91464632_path)

dim(dataset_91464632_person_df)

# head(dataset_91464632_person_df, 5)

In [None]:
# Prep genetic ancestry PCs
library(dplyr)
library(tidyr)
library(stringr)

# Pull in ancestry PCs
# The requester-pays copy is billed to ${GOOGLE_PROJECT}, which the shell expands.
ancestry_uri <- "gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv"
system(paste0("gsutil -u ${GOOGLE_PROJECT} cp ", ancestry_uri, " ."), intern = TRUE)
ancestry_pcs <- read.csv("ancestry_preds.tsv", sep = "\t")

# head(ancestry_pcs, 5)

# pca_features arrives as one bracketed, comma-separated string per person.
# Strip the brackets, split it into 16 numeric columns (PC1..PC16), and keep
# only the participant id (renamed to person_id) and the PC columns.
pc_names <- paste0("PC", 1:16)
ancestry_pcs <- ancestry_pcs %>%
  mutate(pca_features = str_remove_all(pca_features, "\\[|\\]")) %>%
  separate(pca_features, into = pc_names, sep = ", ", convert = TRUE) %>%
  select(person_id = research_id, starts_with("PC"))

# head(ancestry_pcs, 5)

# Exclusions

**Objective**: The purpose of this section is to exclude participants with missing data, type 1 diabetes (T1D), and no short-read whole genome sequence (srWGS) data.

In [None]:
# Create NAs for unanswered demo characteristics and filter

# Recode non-informative survey answers for sex at birth, race, and ethnicity
# to NA, then drop participants with no sex at birth and participants with
# neither race nor ethnicity. The same rules apply to both cohorts, so they
# live in one helper instead of two copies that could drift apart.
#
# covs_df: a covariate data frame with sex_at_birth, race, and ethnicity columns.
# Returns covs_df with those three columns recoded and the rows above removed.
recode_and_filter_demographics <- function(covs_df) {
  uninformative_sex <- c("I prefer not to answer", "No matching concept",
                         "None", "PMI: Skip")
  uninformative_race <- c("I prefer not to answer", "None Indicated",
                          "None of these", "PMI: Skip")
  uninformative_ethnicity <- c("PMI: Prefer Not To Answer", "PMI: Skip",
                               "What Race Ethnicity: Race Ethnicity None Of These")

  covs_df %>%
    # %in% is FALSE for values that are already NA, so they pass through as NA.
    mutate(sex_at_birth = ifelse(sex_at_birth %in% uninformative_sex, NA, sex_at_birth),
           race = ifelse(race %in% uninformative_race, NA, race),
           ethnicity = ifelse(ethnicity %in% uninformative_ethnicity, NA, ethnicity)) %>%
    filter(!is.na(sex_at_birth)) %>%
    filter(!(is.na(race) & is.na(ethnicity)))
}

# For statin users
statin_user_covs_df_filtered <- recode_and_filter_demographics(statin_user_covs_df)

length(unique(statin_user_covs_df_filtered$person_id))
dim(statin_user_covs_df_filtered)
# head(statin_user_covs_df_filtered)

# For non-users
non_user_covs_df_filtered <- recode_and_filter_demographics(non_user_covs_df)

length(unique(non_user_covs_df_filtered$person_id))
dim(non_user_covs_df_filtered)
# head(non_user_covs_df_filtered)

In [None]:
# Filtering out participants with T1D
# Anyone with at least one row in the T1D condition export is excluded.
# Any demographic value still NA after that is labelled "Other".
t1d_person_ids <- unique(dataset_81335991_condition_df$person_id)

# Statin users (the triglyceride and BMI flags are also renamed to high_tg / high_bmi)
statin_users_without_t1d <- statin_user_covs_df_filtered[!(statin_user_covs_df_filtered$person_id %in% t1d_person_ids), ]
statin_user_covs_df_filtered_2 <- statin_users_without_t1d %>%
  rename(high_tg = tg_over_150,
         high_bmi = bmi_over_25) %>%
  mutate(across(c(sex_at_birth, race, ethnicity),
                ~ ifelse(is.na(.x), "Other", .x)))

print("Statin Users")
length(unique(statin_user_covs_df_filtered_2$person_id))
dim(statin_user_covs_df_filtered_2)
# head(statin_user_covs_df_filtered_2)

# Statin non-users
non_users_without_t1d <- non_user_covs_df_filtered[!(non_user_covs_df_filtered$person_id %in% t1d_person_ids), ]
non_user_covs_df_filtered_2 <- non_users_without_t1d %>%
  mutate(across(c(sex_at_birth, race, ethnicity),
                ~ ifelse(is.na(.x), "Other", .x)))

print("Statin Non-users")
length(unique(non_user_covs_df_filtered_2$person_id))
dim(non_user_covs_df_filtered_2)
# head(non_user_covs_df_filtered_2)

In [None]:
# Filtering out participants with NA covariates
# A row is dropped when any of these covariates is missing.
required_covariates <- c("low_hdl", "high_tg", "high_bmi", "smoking_status",
                         "htn_status", "pd_status", "gd_status")

# Statin users
statin_user_covs_df_filtered_3 <- drop_na(statin_user_covs_df_filtered_2,
                                          all_of(required_covariates))

print("Statin Users")
length(unique(statin_user_covs_df_filtered_3$person_id))
dim(statin_user_covs_df_filtered_3)
# head(statin_user_covs_df_filtered_3)

# Statin non-users
non_user_covs_df_filtered_3 <- drop_na(non_user_covs_df_filtered_2,
                                       all_of(required_covariates))

print("Non-users")
length(unique(non_user_covs_df_filtered_3$person_id))
dim(non_user_covs_df_filtered_3)
# head(non_user_covs_df_filtered_3)

In [None]:
# Filtering out participants without genomic data
# Only participants listed in the srWGS person export are kept.
srwgs_person_ids <- dataset_91464632_person_df$person_id

# Statin users
statin_user_covs_df_filtered_4 <- filter(statin_user_covs_df_filtered_3,
                                         person_id %in% srwgs_person_ids)

print("Statin Users")
length(unique(statin_user_covs_df_filtered_4$person_id))
dim(statin_user_covs_df_filtered_4)
# head(statin_user_covs_df_filtered_4)

# Statin non-users
non_user_covs_df_filtered_4 <- filter(non_user_covs_df_filtered_3,
                                      person_id %in% srwgs_person_ids)

print("Non-users")
length(unique(non_user_covs_df_filtered_4$person_id))
dim(non_user_covs_df_filtered_4)
# head(non_user_covs_df_filtered_4)

# Merging Statin Users and Non-users

**Objective**: The purpose of this section is to harmonize the statin user and non-user data frames so they can be merged successfully.

In [None]:
# Calculate index ages for non-users
# Age at index = days between date of birth and the T2D measurement index
# date, converted to years (365.2425 days per year) and rounded to 1 decimal.
nu_index_dates <- as.Date(non_user_covs_df_filtered_4$t2d_measurement_index_date)
nu_birth_dates <- as.Date(non_user_covs_df_filtered_4$date_of_birth)
nu_age_in_days <- difftime(nu_index_dates, nu_birth_dates, units = "days")
non_user_covs_df_filtered_4$index_age <- round(as.numeric(nu_age_in_days / 365.2425), digits = 1)

# Label each data frame with its treatment group
non_user_covs_df_filtered_4$group <- "non-user"
# head(non_user_covs_df_filtered_4)

statin_user_covs_df_filtered_4$group <- "user"
# head(statin_user_covs_df_filtered_4)

In [None]:
# Merge statin and non-user data frames

# Prep data frames for merging by harmonizing columns
# Non-users: the T2D measurement index date becomes the shared index_date.
non_user_covs_df_merging <- non_user_covs_df_filtered_4 %>% 
                                select(person_id, t2d_status, eof_age, eof_date,
                                       sex_at_birth, race, ethnicity, 
                                       low_hdl, high_tg, high_bmi, pd_status, 
                                       smoking_status, htn_status, gd_status, 
                                       index_age, group, t2d_measurement_concept_id, 
                                       t2d_value_as_number, t2d_measurement_index_date, index_id) %>%
                                rename(index_date = t2d_measurement_index_date)

# Statin users: statin initiation age/date become the shared index_age/index_date,
# and eof_datetime is renamed to eof_date to line up with the non-user frame.
statin_user_covs_df_merging <- statin_user_covs_df_filtered_4 %>% rename(index_age = statin_init_age) %>% 
                            mutate(person_id = as.integer(person_id)) %>%
                            select(person_id, t2d_status, eof_age, eof_datetime,
                                   statin_init_date, statin_type_start, statin_dose_start, 
                                   statin_end_date, statin_type_end, statin_dose_end,
                                   sex_at_birth, race, ethnicity, low_hdl, high_tg, high_bmi, 
                                   pd_status, smoking_status, htn_status, gd_status, index_age, group) %>%
                            rename(index_date = statin_init_date,
                                      eof_date = eof_datetime)
# Merge final data frames
# No `by` is given, so the join uses every column the two frames share.
covs_df_merged <- full_join(non_user_covs_df_merging, statin_user_covs_df_merging)

# Change categorical variables to factors for propensity score calculation
covs_df_merged$group <- as.factor(covs_df_merged$group)
binary_cols <- c("low_hdl", "high_tg", "high_bmi", "pd_status", "smoking_status", "htn_status", "gd_status")
covs_df_merged[binary_cols] <- lapply(covs_df_merged[binary_cols], as.factor)

# Create population and time variables
# population: race, except that participants with ethnicity "Hispanic or Latino"
#             and race "Other" are labelled "Hispanic or Latino".
# time: follow-up duration in years from index date to end of follow-up.
#       units = "days" is set explicitly because difftime()'s default ("auto")
#       picks one unit for the whole vector from its smallest absolute
#       difference. A single participant with zero follow-up would switch every
#       value to seconds and silently corrupt the column.
covs_df_merged <- covs_df_merged %>%
                mutate(population = ifelse(ethnicity == "Hispanic or Latino" & race == "Other",
                                                         "Hispanic or Latino", as.character(race))) %>%
                mutate(time = as.numeric(difftime(as.Date(eof_date), as.Date(index_date),
                                                  units = "days")) / 365.2425)

In [None]:
# Check merged df
# Prints the number of distinct participants and the dimensions of the merged
# frame. The two differ when a participant contributes more than one row.
length(unique(covs_df_merged$person_id))
dim(covs_df_merged)
# head(covs_df_merged)

# Matching

**Objective**: The purpose of this section is to use exact, threshold, and propensity score matching to match each statin user to 2 non-users at the most appropriate index dates.

## **Matching Algorithm**
1. Assign each eligible non-user a pool of potential index dates based on the dates of their random glucose, fasting glucose, and HbA1c measurements
2. Determine covariate status at index
3. Calculate propensity scores by predicting statin use based on covariates using logistic regression
4. Create separate data frames for each combination of demographic characteristics (sex at birth, population) to match them exactly
5. Split each combination of demographic characteristics back into statin users and non-users 
6. Arrange statin users from youngest to oldest
7. Starting with the youngest statin user, find out which potential non-user indexes:
        a. Are within 3 years of the statin user's index age
        b. Have a follow-up duration within 6 months
        c. Have a propensity score within 0.01
8. If there are at least two potential matches, arrange them in order of closest propensity score and choose the closest
9. Remove the matched non-user and all their index dates from the potential matching pool
10. With the matched non-user removed, refilter for eligible non-users and choose the one with the next closest propensity score
11. Add the matched statin user and the two matched non-users to a data frame
12. Remove the second matched non-user and their index dates from the potential matching pool
13. Repeat with all statin users
14. If a statin user is not matched with two non-users, do not include them in the final data frame

In [None]:
# Calculate propensity scores

# Model the odds of using statins based on covariates
# Logistic regression of `group` on the demographic and clinical covariates.
# `group` was converted to a factor earlier, and a binomial glm models the
# probability of the factor's second level.
# NOTE(review): assumes the levels sort as "non-user" < "user", so the fitted
#               probability is P(user) -- confirm with levels(covs_df_merged$group).
prop_glm <- glm(group ~ sex_at_birth + race + ethnicity + low_hdl + high_tg + high_bmi + smoking_status + 
                htn_status + pd_status + gd_status,
                data = covs_df_merged,
                family = 'binomial')
summary(prop_glm)

# Predict propensity scores based on the model
# type = "response" returns predicted probabilities rather than log-odds.
covs_df_merged$propensity_scores <- predict(prop_glm, covs_df_merged, type = "response")

length(unique(covs_df_merged$person_id))
dim(covs_df_merged)
# head(covs_df_merged)

In [None]:
# Check propensity scores pre-matching using summary statistics and box plots
# Box plot of the propensity score distribution in each treatment group
boxplot(propensity_scores ~ group,
        data = covs_df_merged,
        col = c("coral", "lightblue"))

# Per-group row count and propensity score summary statistics
covs_df_merged %>%
  group_by(group) %>%
  summarise(
    Count = n(),
    Mean = mean(propensity_scores),
    Max = max(propensity_scores),
    Min = min(propensity_scores),
    Median = median(propensity_scores),
    Std = sd(propensity_scores)
  )

In [None]:
# Helper function to create data frames for each sex/population group
#
# data:         data frame to split.
# sex_at_birth: name (string) of the column holding sex at birth.
# population:   name (string) of the column holding the population label.
#
# Returns a named list with one data frame per combination of the two
# columns. Combinations with no rows are kept as empty data frames
# (drop = FALSE). Stops with an error if either column is absent from `data`.
split_by_demo <- function(data, sex_at_birth, population) {
  grouping_cols <- c(sex_at_birth, population)

  # Guard: both grouping columns must be present
  absent_cols <- setdiff(grouping_cols, names(data))
  if (length(absent_cols) > 0) {
    stop("One or more specified variables are not in the data frame.")
  }

  # Split on every combination of the two grouping columns
  split(data, data[grouping_cols], drop = FALSE)
}

In [None]:
# Run helper function on merged df
# Produces one data frame per sex_at_birth x population combination, so the
# matching below only pairs participants who share both characteristics.
merged_df_list <- split_by_demo(covs_df_merged, "sex_at_birth", "population")

In [None]:
# Function to filter list of data frames
#
# data_frame_list: a (named) list of data frames, each with a `group` column.
#
# Returns the sub-list of data frames containing at least one row whose
# `group` is "user". Names are preserved. An empty input list gives an empty
# list. The previous sapply()-based version errored there, because sapply()
# returns list() for empty input and `!list()` is invalid.
filter_data_frames <- function(data_frame_list) {
  has_user <- vapply(
    data_frame_list,
    function(df) any(df[["group"]] == "user", na.rm = TRUE),
    logical(1)
  )
  data_frame_list[has_user]
}

In [None]:
# Run helper function on list of data frames
# Keeps only the demographic strata that contain at least one statin user.
merged_df_filtered_list <- filter_data_frames(merged_df_list)

In [None]:
# Match statin users to 2 non-users
#
# Within each demographic stratum, statin users are processed from youngest to
# oldest. For each user, the candidate non-user index rows are those within
# `age_diff` years of index age, `time_diff` years of follow-up duration, and
# `caliper` of propensity score. The two closest candidates by propensity score
# that belong to two different participants are taken as controls, and every
# row of both participants is removed from the pool.
#
# A control is removed from the pool only when the user ends up with a
# complete pair. Previously the first control was removed before checking that
# a second one existed, so a user who could not be fully matched still used up
# a non-user who might have matched someone else.

# Parameters
caliper <- 0.01 # Caliper for propensity score
age_diff <- 3  # Maximum age difference for matching
time_diff <- 0.5 # Maximum duration difference (years)

# Matched trios are collected in a list and bound once at the end, which
# avoids re-copying a growing data frame on every iteration.
matched_sets <- list()

for (stratum in names(merged_df_filtered_list)) {
  stratum_df <- merged_df_filtered_list[[stratum]]

  # Arrange statin users by index age from youngest to oldest
  su_age_order <- stratum_df %>% filter(group == "user") %>% arrange(index_age)

  # Pool of non-user index rows still available in this stratum
  nu_df <- stratum_df %>% filter(group == "non-user")

  for (row_idx in seq_len(nrow(su_age_order))) {
    treated_id <- su_age_order[[row_idx, "person_id"]]
    treated_row <- su_age_order[row_idx, ] %>% mutate(match_group = treated_id)

    # Eligible non-user rows, closest propensity score first
    potential_controls <- nu_df %>%
      filter(abs(index_age - treated_row$index_age) <= age_diff,
             abs(time - treated_row$time) <= time_diff,
             abs(propensity_scores - treated_row$propensity_scores) <= caliper) %>%
      mutate(score_diff = abs(propensity_scores - treated_row$propensity_scores)) %>%
      arrange(score_diff)

    if (nrow(potential_controls) < 2) {
      next
    }

    first_control <- potential_controls[1, ] %>% mutate(match_group = treated_id)

    # The second control must be a different participant; a non-user can
    # appear several times because each has a pool of index dates.
    remaining_controls <- potential_controls %>%
      filter(person_id != first_control$person_id)

    if (nrow(remaining_controls) < 1) {
      # No complete pair: leave the pool untouched and skip this user.
      next
    }

    second_control <- remaining_controls[1, ] %>% mutate(match_group = treated_id)

    # Record the matched trio (user + two controls)
    matched_sets[[length(matched_sets) + 1]] <- bind_rows(treated_row, first_control, second_control)

    # Remove both matched non-users and all their index rows from the pool
    used_ids <- c(first_control$person_id, second_control$person_id)
    nu_df <- nu_df %>% filter(!(person_id %in% used_ids))
  }
}

# Combine all trios; stay an empty data frame when nothing was matched
matched <- if (length(matched_sets) > 0) bind_rows(matched_sets) else data.frame()

In [None]:
# Check final matched data frame
# Prints the number of distinct participants, the dimensions, and the row
# count per treatment group of the matched frame.
# NOTE(review): with 1:2 matching the table should show twice as many
#               "non-user" rows as "user" rows -- confirm on output.
length(unique(matched$person_id))
dim(matched)
table(matched$group)
# head(matched)

# Create Main Intention-to-Treat Population

**Objective**: The purpose of this section is to create the main study phenotype, which is based on an intention-to-treat (ITT) model. That means that the first statin prescription a participant received is assumed to continue throughout the study, as in randomized controlled trials. Statin users are assigned statin types, statin intensities, and statin lipophilicities based on their first statin prescription.

In [None]:
# Helper function to determine statin intensity based on statin type and dosage
#
# statin_type: character or factor vector of statin names (e.g. "atorvastatin").
# statin_dose: character vector of dose labels in the form "<n> MG" (e.g. "40 MG").
#
# Returns a character vector of "low", "moderate", or "high", one element per
# input pair. Any statin/dose combination that is not listed (including a
# missing type or dose, or a "non-user" type) gives NA. Pitavastatin is
# classified as "moderate" regardless of dose.
#
# The function is vectorized, so it can be used directly inside mutate().
# The earlier scalar if-chain failed on vectors, on NA inputs, and returned
# NULL (not NA) for pravastatin with an unlisted dose, which makes ifelse() in
# the caller error.
statin_intensity <- function(statin_type, statin_dose) {
  type <- as.character(statin_type)
  dose <- as.character(statin_dose)

  # Lookup keyed by "<type>|<dose>"
  intensity_by_key <- c(
    "atorvastatin|40 MG" = "high",
    "atorvastatin|80 MG" = "high",
    "atorvastatin|10 MG" = "moderate",
    "atorvastatin|20 MG" = "moderate",
    "simvastatin|20 MG"  = "moderate",
    "simvastatin|40 MG"  = "moderate",
    "simvastatin|10 MG"  = "low",
    # NOTE(review): the original code listed fluvastatin "40 MG" under both
    # "moderate" and "low", and "moderate" always won because it was checked
    # first. That behavior is kept here -- confirm the intended class against
    # the guideline used for the study.
    "fluvastatin|80 MG"  = "moderate",
    "fluvastatin|40 MG"  = "moderate",
    "fluvastatin|20 MG"  = "low",
    "lovastatin|80 MG"   = "moderate",
    "lovastatin|40 MG"   = "moderate",
    "lovastatin|20 MG"   = "low",
    "rosuvastatin|20 MG" = "high",
    "rosuvastatin|40 MG" = "high",
    "rosuvastatin|5 MG"  = "moderate",
    "rosuvastatin|10 MG" = "moderate",
    "pravastatin|40 MG"  = "moderate",
    "pravastatin|80 MG"  = "moderate",
    "pravastatin|10 MG"  = "low",
    "pravastatin|20 MG"  = "low"
  )

  keys <- paste(type, dose, sep = "|")
  intensity <- unname(intensity_by_key[keys])  # unlisted keys become NA

  # Pitavastatin is moderate at every dose, including a missing dose
  is_pitavastatin <- rep_len(!is.na(type) & type == "pitavastatin", length(keys))
  intensity[is_pitavastatin] <- "moderate"

  intensity
}

In [None]:
# Add columns for starting statin type, statin intensity, statin lipophilicity, and numeric T2D status for regression
itt_df <- matched %>% 
            # End-of-follow-up age rounded to one decimal
            mutate(eof_age = round(eof_age, digits = 1)) %>% 
            # Non-users with no statin type get the explicit label "non-user"; the
            # column then becomes a factor with "non-user" as the reference level.
            mutate(statin_type_start = ifelse(is.na(statin_type_start) & group == 'non-user', "non-user", statin_type_start),
                   statin_type_start = factor(statin_type_start, levels = c("non-user", "lovastatin", "rosuvastatin", 
                                                                "simvastatin", "atorvastatin", "pravastatin", 
                                                                "fluvastatin", "pitavastatin")),
                   statin_type_start = relevel(statin_type_start, "non-user")) %>%
            # The following mutate() calls run per person_id group.
            # NOTE(review): presumably grouped so that statin_intensity() receives
            #               one row at a time -- this assumes one row per person; confirm.
            group_by(person_id) %>%
            # Intensity: "non-user" for non-users; otherwise derived from starting
            # type and dose when both are present, else NA.
            mutate(statin_intensity_start = ifelse(group == 'non-user', "non-user", 
                                             ifelse(!is.na(statin_dose_start) & !is.na(statin_type_start), 
                                                    statin_intensity(statin_type_start, statin_dose_start), NA)),
                   statin_intensity_start = factor(statin_intensity_start, levels = c("non-user", "low", "moderate", "high")),
                   statin_intensity_start = relevel(statin_intensity_start, "non-user")) %>% 
            # Lipophilicity: rosuvastatin and pravastatin are "hydro", non-users stay
            # "non-user", and every other statin type is "lipo".
            mutate(statin_lipo_start = ifelse(statin_type_start == "rosuvastatin" | statin_type_start == "pravastatin",
                                        "hydro", ifelse(statin_type_start == "non-user", "non-user", "lipo")),
                   statin_lipo_start = factor(statin_lipo_start, levels = c("non-user", "lipo", "hydro")),
                   statin_lipo_start = relevel(statin_lipo_start, "non-user")) %>%
            # Numeric outcome: 1 for "Event", 0 for "Censor"; any other value becomes NA
            mutate(status = case_when(t2d_status == "Event" ~ 1,
                                      t2d_status == "Censor" ~ 0)) %>%
            # FID is set to 0 and IID to person_id.
            # NOTE(review): these look like PLINK-style family/individual ids -- confirm.
            mutate(FID = 0,
                   IID = person_id) %>% 
            ungroup()

In [None]:
# Add columns for last statin type, statin intensity, and statin lipophilicity
# Mirrors the statin_*_start columns, but is derived from statin_type_end / statin_dose_end
itt_df <- itt_df %>% 
            mutate(eof_age = round(eof_age, digits = 1)) %>% 
            # Non-users have no last statin, so label them "non-user" instead of leaving NA;
            # "non-user" is the first factor level and therefore the reference category
            mutate(statin_type_end = ifelse(is.na(statin_type_end) & group == 'non-user', "non-user", statin_type_end),
                   statin_type_end = factor(statin_type_end, levels = c("non-user", "lovastatin", "rosuvastatin", 
                                                                "simvastatin", "atorvastatin", "pravastatin", 
                                                                "fluvastatin", "pitavastatin")),
                   statin_type_end = relevel(statin_type_end, "non-user")) %>%
            # statin_intensity() uses scalar `if` tests, so it is called once per person_id group
            group_by(person_id) %>%
            # Users with a missing last dose or type get NA intensity
            mutate(statin_intensity_end = ifelse(group == 'non-user', "non-user", 
                                             ifelse(!is.na(statin_dose_end) & !is.na(statin_type_end), 
                                                    statin_intensity(statin_type_end, statin_dose_end), NA)),
                   statin_intensity_end = factor(statin_intensity_end, levels = c("non-user", "low", "moderate", "high")),
                   statin_intensity_end = relevel(statin_intensity_end, "non-user")) %>% 
            # Rosuvastatin and pravastatin are coded "hydro"; every other statin type is coded "lipo"
            mutate(statin_lipo_end = ifelse(statin_type_end == "rosuvastatin" | statin_type_end == "pravastatin",
                                        "hydro", ifelse(statin_type_end == "non-user", "non-user", "lipo")),
                   statin_lipo_end = factor(statin_lipo_end, levels = c("non-user", "lipo", "hydro")),
                   statin_lipo_end = relevel(statin_lipo_end, "non-user")) %>%
            # Drop the person_id grouping (as the "start" cell does) so later cells
            # do not silently operate per person on a grouped data frame
            ungroup()

In [None]:
# Check intention-to-treat (ITT) data frame
# Prints the number of distinct participants, the overall dimensions, and the row count per treatment group
length(unique(itt_df$person_id))
dim(itt_df)
table(itt_df$group)
# head(itt_df)

In [None]:
# Define variables for Table 1
# myVars lists every variable to summarize; catVars is the subset to treat as categorical
# NOTE(review): "statin_type", "statin_intensity", and "statin_lipo" carry no _start/_end suffix,
# unlike the statin_*_start / statin_*_end columns built for itt_df in this notebook —
# confirm that columns with these exact names exist in the data
library(tableone)
myVars <- c("t2d_status", "sex_at_birth", "race", "ethnicity", "low_hdl", "high_tg", "high_bmi", "pd_status", 
            "smoking_status", "htn_status", "gd_status", "index_age", "eof_age", "group", "propensity_scores", 
            "time", "statin_type", "statin_intensity", "statin_lipo")

catVars <- c("t2d_status", "sex_at_birth", "race", "ethnicity", "low_hdl", "high_tg", "high_bmi", "pd_status", 
            "smoking_status", "htn_status", "gd_status", "group", "statin_type", "statin_intensity", "statin_lipo")

In [None]:
# Create Table 1 for ITT cohort
# Summarizes myVars stratified by treatment group, with between-group tests requested (test = TRUE)
tab_itt <- CreateTableOne(vars = myVars, strata = "group", data = itt_df, factorVars = catVars, test = TRUE)
# Convert the formatted table to a data frame so the notebook renders it as a table
as.data.frame(print(tab_itt, noSpaces = TRUE, printToggle = FALSE))

In [None]:
# Add ancestry PCs and split categorical variables in preparation for genomic analysis
# NOTE(review): no `by` is given, so the join uses every column name shared by itt_df and
# ancestry_pcs — confirm the intended key (presumably person_id)
itt_df <- left_join(itt_df, ancestry_pcs) %>%
            # 0/1 indicator columns; a missing sex_at_birth or population yields NA rather than 0.
            # Population values without an indicator below are 0 in every pop_* column
            mutate(male = ifelse(sex_at_birth == "Male", 1, 0),
                   pop_black = ifelse(population == "Black or African American", 1, 0),
                   pop_lat = ifelse(population == "Hispanic or Latino", 1, 0),
                   pop_more = ifelse(population == "More than one population", 1, 0),
                   pop_asian = ifelse(population == "Asian", 1, 0),
                   pop_aian = ifelse(population == "American Indian or Alaska Native", 1, 0),
                   pop_mena = ifelse(population == "Middle Eastern or North African", 1, 0)) %>%
            ungroup()

# Check df
# The participant count and row count should be unchanged if the join matched at most one row per person
length(unique(itt_df$person_id))
dim(itt_df)
# head(itt_df)

In [None]:
# Save data frame
# write.csv() keeps its default row names, which come back as a column named X when the file
# is re-read with read.csv(); a later cell drops that column with select(-X)
write.csv(itt_df, "itt_df_v2.csv")
# Copy the file to the workspace bucket; intern = TRUE returns the command output to the notebook
system(paste0("gsutil cp ./", "itt_df_v2.csv", " ", my_bucket, "/sid_pheno_files/"), intern = TRUE)

In [None]:
# Save genomic analysis versions of the data - statin users

# Regenie phenotype file
# Identifiers plus the event indicator (status) and follow-up time for statin users only
itt_statin_pheno_df <- itt_df %>%
                            filter(group == "user") %>%
                            select(FID, IID, status, time)

# Tab-separated, unquoted, header kept, missing values written as the string NA
write.table(itt_statin_pheno_df, "itt_statin_pheno_df.tsv", row.names = FALSE, sep = "\t", quote = FALSE, na = "NA")
system(paste0("gsutil cp ./", "itt_statin_pheno_df.tsv", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

# Regenie covariate file
# Clinical covariates, 16 ancestry PCs, and the 0/1 population and sex indicators
itt_statin_covs_df <- itt_df %>% 
                            filter(group == "user") %>%
                            select(FID, IID, low_hdl, high_tg, high_bmi,
                                   pd_status, smoking_status, htn_status, gd_status, index_age,
                                   PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10, PC11, PC12, PC13, 
                                   PC14, PC15, PC16, pop_black, pop_lat, pop_more, pop_asian, 
                                   pop_aian, pop_mena, male) 

write.table(itt_statin_covs_df, "itt_statin_covs_df.tsv", row.names = FALSE, sep = "\t", quote = FALSE, na = "NA")
system(paste0("gsutil cp ./", "itt_statin_covs_df.tsv", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

In [None]:
# Save genomic analysis versions of the data - non-users

# Regenie phenotype file
# Identifiers plus the event indicator (status) and follow-up time for non-users only
itt_nu_pheno_df <- itt_df %>%
                            filter(group == "non-user") %>%
                            select(FID, IID, status, time) 

# Tab-separated, unquoted, header kept, missing values written as the string NA
write.table(itt_nu_pheno_df, "itt_nu_pheno_df.tsv", row.names = FALSE, sep = "\t", quote = FALSE, na = "NA")
system(paste0("gsutil cp ./", "itt_nu_pheno_df.tsv", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

# Regenie covariate file
# Clinical covariates, 16 ancestry PCs, and the 0/1 population and sex indicators
itt_nu_covs_df <- itt_df %>% 
                            filter(group == "non-user") %>%
                            select(FID, IID, low_hdl, high_tg, high_bmi,
                                   pd_status, smoking_status, htn_status, gd_status, index_age,
                                   PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10, PC11, PC12, PC13, 
                                   PC14, PC15, PC16, pop_black, pop_lat, pop_more, pop_asian, 
                                   pop_aian, pop_mena, male) 

write.table(itt_nu_covs_df, "itt_nu_covs_df.tsv", row.names = FALSE, sep = "\t", quote = FALSE, na = "NA")
system(paste0("gsutil cp ./", "itt_nu_covs_df.tsv", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

In [None]:
# Select ID columns
itt_ids <- itt_df %>% select(FID, IID)

# Save IDs
# Written without a header row (col.names = FALSE), one tab-separated FID/IID pair per line
write.table(itt_ids, "itt_ids_v2.txt", sep = "\t", col.names = FALSE, quote = FALSE, row.names = FALSE)
system(paste0("gsutil cp ./", "itt_ids_v2.txt", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

# Select ID columns - statin users
itt_statin_ids <- itt_df %>% filter(group == "user") %>% select(FID, IID)

# Save IDs
write.table(itt_statin_ids, "itt_statin_ids_v2.txt", sep = "\t", col.names = FALSE, quote = FALSE, row.names = FALSE)
system(paste0("gsutil cp ./", "itt_statin_ids_v2.txt", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

# Create Change in LDL Subset

**Objective**: The purpose of this section is to create a subset of the main study population in which statin users demonstrated at least a 30% decrease in low-density lipoprotein cholesterol (LDL-C) after starting statins. This is because moderate intensity statins should lower LDL-C by at least 30%, and we want to determine who in our cohort took an at least moderate intensity statin, and whether or not those statins were effective. Only non-users that were matched with eligible statin users are retained in this subset, maintaining the matching structure.

This section also involves the creation of baseline and change in LDL-C variables, for further analysis of LDL-C.

In [None]:
# Pull in intention-to-treat data frame
# Copy the saved ITT file from the workspace bucket into the working directory, then read it
system(paste0("gsutil cp ", my_bucket, "/sid_pheno_files/", "itt_df_v2.csv", " ."), intern = TRUE)
itt_df <- read.csv("itt_df_v2.csv")

In [None]:
# Baseline LDL-C for statin users: the latest valid measurement taken on or before the index date
ldl30_df_1 <- itt_df %>%
                filter(group == "user") %>%
                left_join(dataset_97827399_measurement_df) %>%
                # Keep mg/dL measurements with a plausible value (strictly between 0 and 1000)
                filter(unit_concept_name %in% c('milligram per deciliter',
                                                'milligram per deciliter calculated',
                                                'mg/dL'),
                       value_as_number > 0,
                       value_as_number < 1000) %>%
                filter(as.Date(measurement_datetime) <= as.Date(index_date)) %>%
                group_by(person_id) %>%
                arrange(person_id, as.Date(measurement_datetime)) %>%
                # After the date sort, the last row per person is the most recent pre-index value
                slice_tail(n = 1) %>%
                select(person_id, eof_date, index_date,
                       first_ldl_date = measurement_datetime,
                       first_ldl = value_as_number)

# Check data frame
length(unique(ldl30_df_1$person_id))
dim(ldl30_df_1)
# head(ldl30_df_1)

# Follow-up LDL-C for statin users: every valid measurement after the index date and up to end of follow-up
ldl30_df_2 <- itt_df %>%
                filter(group == "user") %>%
                left_join(dataset_97827399_measurement_df) %>%
                # Keep mg/dL measurements with a plausible value (strictly between 0 and 1000)
                filter(unit_concept_name %in% c('milligram per deciliter',
                                                'milligram per deciliter calculated',
                                                'mg/dL'),
                       value_as_number > 0,
                       value_as_number < 1000) %>%
                filter(as.Date(measurement_datetime) > as.Date(index_date),
                       as.Date(measurement_datetime) <= as.Date(eof_date)) %>%
                group_by(person_id) %>%
                arrange(person_id, as.Date(measurement_datetime)) %>%
                select(person_id, eof_date, index_date, measurement_datetime, value_as_number)

# Check data frame
length(unique(ldl30_df_2$person_id))
dim(ldl30_df_2)
# head(ldl30_df_2)

In [None]:
# Join before and after measurements and filter for participants with at least a 30% decrease in LDL-C
# (change_ldl <= -0.30) after starting statins
# The join yields one row per follow-up measurement, so a participant qualifies if ANY follow-up
# measurement meets the threshold
ldl30_ids <- inner_join(ldl30_df_1, ldl30_df_2, by = join_by(person_id, eof_date, index_date)) %>%
                # Relative change from baseline: negative values are decreases (-0.30 = 30% lower)
                mutate(change_ldl = (value_as_number-first_ldl)/first_ldl) %>%
                filter(change_ldl <= -0.30) %>%
                ungroup() %>%
                # Reduce to one row per eligible participant
                select(person_id) %>% 
                distinct(.keep_all = TRUE)

# Check eligible IDs
length(unique(ldl30_ids$person_id))
dim(ldl30_ids)
# head(ldl30_ids)

In [None]:
# Subset the original ITT df using the LDL-filtered IDs for statin users and their matched non-users
# Keeps every row whose match_group equals an eligible statin user's person_id
# (assumes match_group stores the matched statin user's person_id — TODO confirm)
ldl30_df <- itt_df %>% filter(match_group %in% ldl30_ids$person_id)

# Check LDL-filtered df
table(ldl30_df$group)
length(unique(ldl30_df$person_id))
dim(ldl30_df)
# head(ldl30_df)

In [None]:
# Define variables for Table 1
# myVars lists every variable to summarize; catVars is the subset to treat as categorical
# NOTE(review): "statin_type", "statin_intensity", and "statin_lipo" carry no _start/_end suffix,
# unlike the statin_*_start / statin_*_end columns built for itt_df in this notebook —
# confirm that columns with these exact names exist in the data
library(tableone)
myVars <- c("t2d_status", "sex_at_birth", "race", "ethnicity", "low_hdl", "high_tg", "high_bmi", "pd_status", 
            "smoking_status", "htn_status", "gd_status", "index_age", "eof_age", "group", "propensity_scores", 
            "time", "statin_type", "statin_intensity", "statin_lipo")

catVars <- c("t2d_status", "sex_at_birth", "race", "ethnicity", "low_hdl", "high_tg", "high_bmi", "pd_status", 
            "smoking_status", "htn_status", "gd_status", "group", "statin_type", "statin_intensity", "statin_lipo")

In [None]:
# Create Table 1 for LDL-filtered subset
# Summarizes myVars stratified by treatment group, with between-group tests requested (test = TRUE)
tab_ldl30 <- CreateTableOne(vars = myVars, strata = "group", data = ldl30_df, factorVars = catVars, test = TRUE)
# Convert the formatted table to a data frame so the notebook renders it as a table
as.data.frame(print(tab_ldl30, noSpaces = TRUE, printToggle = FALSE))

In [None]:
# Save data frame
# Write the LDL-filtered subset locally, then copy it to the workspace bucket
write.csv(ldl30_df, "ldl30_df_v2.csv")
system(paste0("gsutil cp ./", "ldl30_df_v2.csv", " ", my_bucket, "/sid_pheno_files/"), intern = TRUE)

In [None]:
# Save genomic analysis versions of the data - statin users

# Regenie phenotype file
# Identifiers plus the event indicator (status) and follow-up time for statin users only
ldl30_statin_pheno_df <- ldl30_df %>%
                            filter(group == "user") %>%
                            select(FID, IID, status, time)

# Tab-separated, unquoted, header kept, missing values written as the string NA
write.table(ldl30_statin_pheno_df, "ldl30_statin_pheno_df.tsv", row.names = FALSE, sep = "\t", quote = FALSE, na = "NA")
system(paste0("gsutil cp ./", "ldl30_statin_pheno_df.tsv", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

# Regenie covariate file
# Clinical covariates, 16 ancestry PCs, and the 0/1 population and sex indicators
ldl30_statin_covs_df <- ldl30_df %>% 
                            filter(group == "user") %>%
                            select(FID, IID, low_hdl, high_tg, high_bmi,
                                   pd_status, smoking_status, htn_status, gd_status, index_age,
                                   PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10, PC11, PC12, PC13, 
                                   PC14, PC15, PC16, pop_black, pop_lat, pop_more, pop_asian, 
                                   pop_aian, pop_mena, male) 

write.table(ldl30_statin_covs_df, "ldl30_statin_covs_df.tsv", row.names = FALSE, sep = "\t", quote = FALSE, na = "NA")
system(paste0("gsutil cp ./", "ldl30_statin_covs_df.tsv", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

In [None]:
# Save genomic analysis versions of the data - non-users

# Regenie phenotype file
# Identifiers plus the event indicator (status) and follow-up time for non-users only
ldl30_nu_pheno_df <- ldl30_df %>%
                            filter(group == "non-user") %>%
                            select(FID, IID, status, time) 

# Tab-separated, unquoted, header kept, missing values written as the string NA
write.table(ldl30_nu_pheno_df, "ldl30_nu_pheno_df.tsv", row.names = FALSE, sep = "\t", quote = FALSE, na = "NA")
system(paste0("gsutil cp ./", "ldl30_nu_pheno_df.tsv", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

# Regenie covariate file
# Clinical covariates, 16 ancestry PCs, and the 0/1 population and sex indicators
ldl30_nu_covs_df <- ldl30_df %>% 
                            filter(group == "non-user") %>%
                            select(FID, IID, low_hdl, high_tg, high_bmi,
                                   pd_status, smoking_status, htn_status, gd_status, index_age,
                                   PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10, PC11, PC12, PC13, 
                                   PC14, PC15, PC16, pop_black, pop_lat, pop_more, pop_asian, 
                                   pop_aian, pop_mena, male) 

write.table(ldl30_nu_covs_df, "ldl30_nu_covs_df.tsv", row.names = FALSE, sep = "\t", quote = FALSE, na = "NA")
system(paste0("gsutil cp ./", "ldl30_nu_covs_df.tsv", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

In [None]:
# Select ID columns
# NOTE: this reassigns ldl30_ids, which earlier in this section held the eligible person_id values
# used to build ldl30_df. Re-create that ID list before re-running the subsetting cell.
ldl30_ids <- ldl30_df %>% select(FID, IID)

# Save IDs
# Written without a header row (col.names = FALSE), one tab-separated FID/IID pair per line
write.table(ldl30_ids, "ldl30_ids_v2.txt", sep = "\t", col.names = FALSE, quote = FALSE, row.names = FALSE)
system(paste0("gsutil cp ./", "ldl30_ids_v2.txt", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

# Select ID columns - statin users
ldl30_statin_ids <- ldl30_df %>% filter(group == "user") %>% select(FID, IID)

# Save IDs
write.table(ldl30_statin_ids, "ldl30_statin_ids_v2.txt", sep = "\t", col.names = FALSE, quote = FALSE, row.names = FALSE)
system(paste0("gsutil cp ./", "ldl30_statin_ids_v2.txt", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

## Create baseline LDL-C and change in LDL-C variables

In [None]:
# Pull in intention-to-treat data frame
# Copy the saved ITT file from the workspace bucket into the working directory, then read it
system(paste0("gsutil cp ", my_bucket, "/sid_pheno_files/", "itt_df_v2.csv", " ."), intern = TRUE)
itt_df <- read.csv("itt_df_v2.csv")

In [None]:
# Baseline LDL-C for all participants: the latest valid measurement taken on or before the index date
ldl_df_1 <- itt_df %>%
                left_join(dataset_97827399_measurement_df) %>%
                # Keep mg/dL measurements with a plausible value (strictly between 0 and 1000)
                filter(unit_concept_name %in% c('milligram per deciliter',
                                                'milligram per deciliter calculated',
                                                'mg/dL'),
                       value_as_number > 0,
                       value_as_number < 1000) %>%
                filter(as.Date(measurement_datetime) <= as.Date(index_date)) %>%
                group_by(person_id) %>%
                arrange(person_id, as.Date(measurement_datetime)) %>%
                # After the date sort, the last row per person is the most recent pre-index value
                slice_tail(n = 1) %>%
                ungroup() %>%
                select(person_id, match_group, group, eof_date, index_date, statin_end_date,
                       first_ldl_date = measurement_datetime,
                       first_ldl = value_as_number)

# Check data frame
length(unique(ldl_df_1$person_id))
table(ldl_df_1$group)
dim(ldl_df_1)
# head(ldl_df_1)
ldl_df_1 %>% group_by(group) %>% summarize(na_baseline_ldl = sum(is.na(first_ldl)))

# Follow-up LDL-C: every valid measurement after the index date and up to end of follow-up
ldl_df_2 <- itt_df %>%
                left_join(dataset_97827399_measurement_df) %>%
                # Keep mg/dL measurements with a plausible value (strictly between 0 and 1000)
                filter(unit_concept_name %in% c('milligram per deciliter',
                                                'milligram per deciliter calculated',
                                                'mg/dL'),
                       value_as_number > 0,
                       value_as_number < 1000) %>%
                filter(as.Date(measurement_datetime) > as.Date(index_date),
                       as.Date(measurement_datetime) <= as.Date(eof_date)) %>%
                group_by(person_id) %>%
                arrange(person_id, as.Date(measurement_datetime)) %>%
                select(person_id, match_group, group, eof_date, index_date, statin_end_date,
                       post_ldl_date = measurement_datetime,
                       post_ldl_value = value_as_number) %>%
                # Largest follow-up values first, for inspection
                arrange(desc(post_ldl_value))

# Check data frame
length(unique(ldl_df_2$person_id))
table(ldl_df_2$group)
dim(ldl_df_2)
# head(ldl_df_2)
ldl_df_2 %>% group_by(group) %>% summarize(na_post_ldl = sum(is.na(post_ldl_value)))

In [None]:
# Join before and after measurements
# inner_join keeps only participants with both a baseline value and at least one follow-up value;
# the result has one row per follow-up measurement, each carrying that participant's baseline
ldl_df <- inner_join(ldl_df_1, ldl_df_2, by = join_by(person_id, eof_date, index_date, match_group, group, statin_end_date))

In [None]:
# Find the last LDL-C measurement before EoF
# Adds last_ldl_date / last_ldl (the latest follow-up measurement on or before end of follow-up)
# to every follow-up row of ldl_df
ldl_df_last <- ldl_df %>%
                group_by(person_id) %>%
                arrange(person_id, as.Date(post_ldl_date)) %>%
                filter(as.Date(post_ldl_date) <= as.Date(eof_date) & post_ldl_value != 0) %>%
                # After the date sort, the last row per person is the latest follow-up measurement
                slice_tail() %>%
                mutate(last_ldl_date = post_ldl_date,
                      last_ldl = post_ldl_value) %>%
                select(person_id, match_group, group, eof_date, index_date, statin_end_date, 
                       last_ldl_date, last_ldl) %>% 
                ungroup() %>%
                # No `by` given: joins on the columns shared with ldl_df (the six selected identifier/date columns)
                full_join(ldl_df)

dim(ldl_df_last)
# head(ldl_df_last)

In [None]:
# Find the first LDL-C measurement taken on or within 180 days after the last statin prescription date
# Adds last_statin_ldl_date / last_statin_ldl to the rows of ldl_df_last; participants with no
# qualifying measurement (including those with a missing statin_end_date) get NA in both columns
ldl_df_last_statin <- ldl_df %>%
                group_by(person_id) %>%
                arrange(person_id, as.Date(post_ldl_date)) %>%
                # units = "days" is set explicitly: with the default units = "auto", difftime() picks
                # the unit from the smallest difference in the group, which is seconds when a
                # measurement falls on the statin end date itself, so the 180 cutoff would not be in days
                filter(as.Date(post_ldl_date) >= as.Date(statin_end_date) &
                       as.numeric(difftime(as.Date(post_ldl_date), as.Date(statin_end_date), units = "days")) <= 180 &
                       post_ldl_value != 0) %>%
                # After the date sort, the first row per person is the measurement closest to the statin end date
                slice_head() %>%
                mutate(last_statin_ldl_date = post_ldl_date,
                      last_statin_ldl = post_ldl_value) %>%
                select(person_id, match_group, group, eof_date, index_date, 
                       last_statin_ldl_date, last_statin_ldl) %>% 
                ungroup() %>%
                # No `by` given: joins on the columns shared with ldl_df_last
                full_join(ldl_df_last)

dim(ldl_df_last_statin)
# head(ldl_df_last_statin)

In [None]:
# Find the first LDL-C measurement after index
# Adds first_post_ldl_date / first_post_ldl (the earliest follow-up measurement after the index date)
# to the rows of ldl_df_last_statin
ldl_df_first_post <- ldl_df %>%
                group_by(person_id) %>%
                arrange(person_id, as.Date(post_ldl_date)) %>%
                filter(as.Date(post_ldl_date) > as.Date(index_date) & post_ldl_value != 0) %>%
                # After the date sort, the first row per person is the earliest post-index measurement
                slice_head() %>%
                mutate(first_post_ldl_date = post_ldl_date,
                      first_post_ldl = post_ldl_value) %>%
                select(person_id, match_group, group, eof_date, index_date, 
                       first_post_ldl_date, first_post_ldl) %>% 
                ungroup() %>%
                # No `by` given: joins on the columns shared with ldl_df_last_statin
                full_join(ldl_df_last_statin)

dim(ldl_df_first_post)
# head(ldl_df_first_post)

In [None]:
# Create change in LDL-C variables to try
# All changes are relative to baseline: (value - baseline) / baseline, so -0.30 means 30% lower
ldl_df_change <- ldl_df_first_post %>% 
                mutate(change_ldl = (post_ldl_value-first_ldl)/first_ldl,
                       change_ldl_last = (last_ldl-first_ldl)/first_ldl,
                       change_ldl_last_statin = (last_statin_ldl-first_ldl)/first_ldl,
                       change_ldl_first_post = (first_post_ldl-first_ldl)/first_ldl) %>%
                group_by(person_id) %>%
                # Per person: the per-measurement columns keep one value per row, while the two
                # max_* summaries are single values recycled across that person's rows
                reframe(person_id = person_id, 
                        match_group = match_group, 
                        group = group, 
                        baseline_ldl = first_ldl, 
                        # Signed change with the largest absolute size; ifelse() with a length-1 test
                        # returns only the first element, so ties resolve to the first match
                        max_change_ldl = ifelse(any(abs(change_ldl) == max(abs(change_ldl))), 
                                                change_ldl[which(abs(change_ldl) == max(abs(change_ldl)))], 
                                                NA),
                        # Most negative change; NA when no follow-up value is below baseline
                        max_decrease_ldl = ifelse(any(change_ldl < 0), min(change_ldl), NA),
                        change_ldl_last = change_ldl_last,
                        change_ldl_last_statin = change_ldl_last_statin,
                        change_ldl_first_post = change_ldl_first_post
                       ) %>%
                ungroup() %>%
                # Collapse the repeated per-measurement rows (intended to leave one row per person)
                distinct(.keep_all = TRUE) %>%
                # Non-users have no statin end date, so their change to the last LDL-C is used instead
                mutate(change_ldl_last_statin = case_when(group == "non-user" ~ change_ldl_last,
                                                          group == "user" ~ change_ldl_last_statin)) %>%
                # If any member of a match group lacks this value, blank it for the whole match group
                # (change_ldl_last_statin2 is a temporary flag: NA = incomplete group, "X" = complete)
                group_by(match_group) %>%
                mutate(change_ldl_last_statin2 = ifelse(any(is.na(change_ldl_last_statin)), NA, "X")) %>%
                ungroup() %>%
                mutate(change_ldl_last_statin = ifelse(is.na(change_ldl_last_statin2), NA, change_ldl_last_statin)) %>%
                arrange(person_id) %>%
                select(-change_ldl_last_statin2)

# Check
length(unique(ldl_df_change$person_id))
table(ldl_df_change$group)
dim(ldl_df_change)
# head(ldl_df_change %>% arrange(match_group))

In [None]:
# Join change in LDL columns to main data frame to capture adjustment variables - statin users + 2 non-users
# NOTE: this reuses the name ldl_df, replacing the per-measurement data frame built earlier in this section
ldl_df <- full_join(ldl_df_change, itt_df, by = join_by(person_id, match_group, group)) %>% 
                # X is the row-name column created when the ITT csv (written with row names) was re-read
                select(-X) %>%
                # Users must have a baseline and a maximum-change value; non-users are kept regardless
                filter((group == "user" & !is.na(baseline_ldl) & !is.na(max_change_ldl)) | group == "non-user") %>%
                # Categorize each change variable for users:
                #   "high"     = decrease of at least 50%   (<= -0.5)
                #   "moderate" = decrease of 30% up to 50%  (<= -0.3 and > -0.5)
                #   "low"      = decrease of less than 30%  (> -0.3 and < 0)
                #   "increase" = no change or an increase   (>= 0)
                # Non-users are labelled "non_user"; users with a missing change value get NA
                mutate(max_change_group = case_when(max_change_ldl <= -0.5 & group == 'user' ~ "high",
                                                    max_change_ldl <= -0.3 & max_change_ldl > -0.5 & group == 'user' ~ "moderate",
                                                    max_change_ldl > -0.3 & max_change_ldl < 0 & group == 'user' ~ "low",
                                                    max_change_ldl >= 0 & group == 'user' ~ "increase",
                                                    group == 'non-user' ~ "non_user"),
                      # No "increase" level here: max_decrease_ldl is NA when a user never went below baseline
                      max_dec_group = case_when(max_decrease_ldl <= -0.5 & group == 'user' ~ "high",
                                                    max_decrease_ldl <= -0.3 & max_decrease_ldl > -0.5 & group == 'user' ~ "moderate",
                                                    max_decrease_ldl > -0.3 & max_decrease_ldl < 0 & group == 'user' ~ "low",
                                                    group == 'non-user' ~ "non_user"),
                      last_change_group = case_when(change_ldl_last <= -0.5 & group == 'user' ~ "high",
                                                    change_ldl_last <= -0.3 & change_ldl_last > -0.5 & group == 'user' ~ "moderate",
                                                    change_ldl_last > -0.3 & change_ldl_last < 0 & group == 'user' ~ "low",
                                                    change_ldl_last >= 0 & group == 'user' ~ "increase",
                                                    group == 'non-user' ~ "non_user"),
                      last_statin_change_group = case_when(change_ldl_last_statin <= -0.5 & group == 'user' ~ "high",
                                                    change_ldl_last_statin <= -0.3 & change_ldl_last_statin > -0.5 & group == 'user' ~ "moderate",
                                                    change_ldl_last_statin > -0.3 & change_ldl_last_statin < 0 & group == 'user' ~ "low",
                                                    change_ldl_last_statin >= 0 & group == 'user' ~ "increase",
                                                    group == 'non-user' ~ "non_user"),
                      first_post_change_group = case_when(change_ldl_first_post <= -0.5 & group == 'user' ~ "high",
                                                    change_ldl_first_post <= -0.3 & change_ldl_first_post > -0.5 & group == 'user' ~ "moderate",
                                                    change_ldl_first_post > -0.3 & change_ldl_first_post < 0 & group == 'user' ~ "low",
                                                    change_ldl_first_post >= 0 & group == 'user' ~ "increase",
                                                    group == 'non-user' ~ "non_user")) %>%
                # Keep only match groups that still have exactly three rows
                # (presumably the statin user plus both matched non-users — confirm one row per person)
                group_by(match_group) %>%
                filter(n() == 3) %>%
                ungroup() %>%
                arrange(match_group)

# Check
# Cross-tabulate treatment group against each change category, showing NA counts
length(unique(ldl_df$person_id))
table(ldl_df$group, ldl_df$max_change_group, useNA = 'always')
table(ldl_df$group, ldl_df$max_dec_group, useNA = 'always')
table(ldl_df$group, ldl_df$last_change_group, useNA = 'always')
table(ldl_df$group, ldl_df$last_statin_change_group, useNA = 'always')
table(ldl_df$group, ldl_df$first_post_change_group, useNA = 'always')
dim(ldl_df)
# head(ldl_df)

In [None]:
# Save data frames
# Write the LDL-C change data frame locally, then copy it to the workspace bucket
write.csv(ldl_df, "ldl_df_v3.csv")
system(paste0("gsutil cp ./", "ldl_df_v3.csv", " ", my_bucket, "/sid_pheno_files/"), intern = TRUE)

In [None]:
# Function to summarize LDL-C variables
#
# Prints, in order: (1) the per-group min/mean/median/max of the chosen variable,
# (2) a Table 1 of t2d_status and the variable stratified by group, and
# (3) a boxplot stacked above a density plot of the variable by group.
#
# Args:
#   ldl:  Name (single string) of a numeric column in `data`, e.g. "baseline_ldl".
#   data: Data frame containing `group`, `t2d_status`, and the column named by `ldl`.
#         Defaults to the global `ldl_df`, so existing calls such as
#         summarize_ldl("baseline_ldl") behave as before.
#
# Returns:
#   The result of printing the combined plot; the function is called for its printed output.
summarize_ldl <- function(ldl, data = ldl_df) {

    # Fail early with a clear message instead of a lookup error deep inside dplyr/ggplot2
    stopifnot(is.character(ldl), length(ldl) == 1, ldl %in% names(data))

    # Summarize the chosen LDL-C variable by group
    # (.data[[ldl]] looks the column up in the data only, matching the plots below)
    ldl_summary <- data %>% group_by(group) %>%
            summarize(min = min(.data[[ldl]], na.rm = TRUE),
                      mean = mean(.data[[ldl]], na.rm = TRUE),
                      median = median(.data[[ldl]], na.rm = TRUE),
                      max = max(.data[[ldl]], na.rm = TRUE))

    print(ldl_summary)

    # Define variables for Table 1
    myVars <- c("t2d_status", ldl)
    catVars <- c("t2d_status")

    # Create Table 1 to see differences in the LDL-C variable between groups
    tab_change_ldl <- CreateTableOne(vars = myVars, strata = "group", data = data, factorVars = catVars, test = TRUE)
    print(as.data.frame(print(tab_change_ldl, noSpaces = TRUE, printToggle = FALSE)))

    # Create grouped boxplots (axes and legend removed so it sits as a strip above the density plot)
    boxplot <- ggplot(data, aes(x = group, y = .data[[ldl]], fill = group)) +
                    geom_boxplot() +
                    coord_flip() +
                    theme_void() +
                    scale_fill_manual(values = c("#C32882", "#178CCB")) +
                    theme(legend.position = "none")

    # Create grouped density plots
    density_plot <- ggplot(data, aes(x = .data[[ldl]], fill = group)) +
                        geom_density(alpha = 0.5) +
                        labs(x = paste0("Change in LDL-C (", ldl, ")"), y = "Density") +
                        scale_fill_manual(values = c("#C32882", "#178CCB")) +
                        theme_minimal()

    # Combine plots: thin boxplot strip on top, density plot below
    combined_plot <- plot_grid(boxplot, density_plot, ncol = 1, rel_heights = c(1, 8))
    print(combined_plot)
    }


In [None]:
# Print summary for baseline LDL-C
summarize_ldl("baseline_ldl")

In [None]:
# Print summary for maximum change in LDL-C
summarize_ldl("max_change_ldl")

In [None]:
# Print summary for maximum decrease in LDL-C
summarize_ldl("max_decrease_ldl")

In [None]:
# Print summary for change in LDL-C from baseline to last LDL-C
summarize_ldl("change_ldl_last")

In [None]:
# Print summary for change in LDL-C from baseline to most recent LDL-C post statin end
summarize_ldl("change_ldl_last_statin")

In [None]:
# Print summary for change in LDL-C from baseline to first LDL-C post statin initiation
summarize_ldl("change_ldl_first_post")

# Create Per Protocol Subset

**Objective**: The purpose of this section is to create a subset of statin users who followed a full statin protocol and their matched non-users. To be eligible for this subset, statin users must:
- have statin prescriptions covering at least 80% of their follow-up duration 
- have a statin prescription ending less than 30 days before the end of follow-up

In [None]:
# Calculate total time covered by statin prescriptions while considering overlaps - Code provided by lab member
#
# Overlapping or touching prescription intervals are merged first, so a day covered by
# several prescriptions is counted only once.
#
# Args:
#   data: Data frame with one row per prescription and the columns
#         `rx_start_date` (interval start) and `new_rx_end_date` (interval end).
#
# Returns:
#   A single number: the summed length, in days, of the merged intervals (0 when `data` has no rows).
calculate_total_time <- function(data) {
    
    # No prescriptions means no covered time. The previous `1:nrow(data)` loop
    # iterated over c(1, 0) for an empty data frame and failed.
    if (nrow(data) == 0) {
        return(0)
    }
    
    # Sort data by start date so overlapping prescriptions are adjacent
    data <- data[order(data$rx_start_date), , drop = FALSE]
    
    # Running total of covered days, plus the merged interval currently being extended
    total_days <- 0
    prev_start <- data$rx_start_date[1]
    prev_end <- data$new_rx_end_date[1]
    
    # Walk the remaining prescriptions (empty sequence when there is only one row)
    for (i in seq_len(nrow(data))[-1]) {
        current_start <- data$rx_start_date[i]
        current_end <- data$new_rx_end_date[i]
        
        if (current_start > prev_end) {
            # Gap before this prescription: close the merged interval and start a new one
            total_days <- total_days + as.numeric(difftime(prev_end, prev_start, units = "days"))
            prev_start <- current_start
            prev_end <- current_end
        } else if (current_end > prev_end) {
            # Overlap that reaches further: extend the merged interval
            prev_end <- current_end
        }
    }
    
    # Add the final merged interval, which is still open when the loop ends
    total_days + as.numeric(difftime(prev_end, prev_start, units = "days"))
}

In [None]:
# Filter statin users whose last statin prescription ended fewer than 30 days before end of follow-up
# (recent_use_time > -30) and who are ≥80% adherent
# Returns one row per eligible statin user (person_id only)
per_protocol_ids <- inner_join(itt_df, statin_rx_df_itt, by = join_by(person_id, t2d_status, eof_age)) %>% 
                group_by(person_id) %>%
                # Ignore prescriptions that start after end of follow-up
                filter(as.Date(rx_start_date) <= as.Date(eof_date)) %>%
                # max_rx_end_date: latest prescription end date for this person
                mutate(max_rx_end_date = max(rx_end_date),
                       # Days from end of follow-up to the last prescription end (negative = ended before EoF)
                       # NOTE(review): this uses eof_datetime while the other comparisons use eof_date — confirm
                       recent_use_time = as.numeric(difftime(as.Date(max_rx_end_date), 
                                                                     as.Date(eof_datetime), 
                                                                     units = 'days')),
                       recent_user = ifelse(recent_use_time > -30, 1, 0),
                       # Truncate prescriptions that run past end of follow-up
                       # NOTE(review): ifelse() drops the Date class, so this relies on rx_end_date and
                       # eof_date being character date strings at this point — confirm their types
                       new_rx_end_date = as.Date(ifelse(rx_end_date > eof_date, eof_date, rx_end_date)),
                       # Days covered by this person's prescriptions, counting overlaps once
                       # NOTE(review): cur_data() is deprecated in dplyr 1.1.0 in favour of pick()
                       totalSUP = calculate_total_time(cur_data()),
                       # Proportion of days covered (presumably follow_up_time_1 is follow-up length in days — confirm)
                       PDC = as.numeric(totalSUP/follow_up_time_1),
                       pdc_adherent = ifelse(PDC >= 0.8, 1, 0)) %>% 
                filter(recent_user == 1 & pdc_adherent == 1) %>% 
                ungroup() %>%
                select(person_id) %>%
                distinct(.keep_all = TRUE)

# Check eligible IDs
length(unique(per_protocol_ids$person_id))
dim(per_protocol_ids)
# head(per_protocol_ids)

In [None]:
# Subset the original ITT df using the per protocol IDs for statin users and their matched non-users
# Keeps every row whose match_group equals an eligible statin user's person_id
# (assumes match_group stores the matched statin user's person_id — TODO confirm)
per_protocol_df <- itt_df %>% filter(match_group %in% per_protocol_ids$person_id)

# Check per protocol df
table(per_protocol_df$group)
length(unique(per_protocol_df$person_id))
dim(per_protocol_df)
# head(per_protocol_df)

In [None]:
# Variable lists for Table 1
library(tableone)

# Variables passed to CreateTableOne() as factorVars
catVars <- c(
    "t2d_status", "sex_at_birth", "race", "ethnicity",
    "low_hdl", "high_tg", "high_bmi", "pd_status",
    "smoking_status", "htn_status", "gd_status",
    "group", "statin_type", "statin_intensity", "statin_lipo"
)

# Full variable list: the categorical variables above with the remaining
# variables (ages, propensity score, time) placed in their reporting order
myVars <- c(
    catVars[1:11],
    "index_age", "eof_age", "group", "propensity_scores", "time",
    catVars[13:15]
)

In [None]:
# Table 1 for the per-protocol subset, stratified by group
tab_per_protocol <- CreateTableOne(
    data = per_protocol_df,
    vars = myVars,
    factorVars = catVars,
    strata = "group",
    test = TRUE
)

# Render the table without echoing it, then show it as a data frame
tab_per_protocol %>%
    print(printToggle = FALSE, noSpaces = TRUE) %>%
    as.data.frame()

In [None]:
# Write the per-protocol data frame locally, then copy it to the workspace bucket
write.csv(per_protocol_df, file = "per_protocol_df_v2.csv")
system(
    paste0("gsutil cp ./per_protocol_df_v2.csv ", my_bucket, "/sid_pheno_files/"),
    intern = TRUE
)

In [None]:
# Genomic analysis files for the per-protocol subset - statin users only

# Regenie phenotype file: IDs plus the status and time columns
per_protocol_statin_pheno_df <- per_protocol_df %>%
    filter(group == "user") %>%
    select(all_of(c("FID", "IID", "status", "time")))

write.table(
    per_protocol_statin_pheno_df,
    file = "per_protocol_statin_pheno_df.tsv",
    sep = "\t", quote = FALSE, row.names = FALSE, na = "NA"
)
system(
    paste0("gsutil cp ./per_protocol_statin_pheno_df.tsv ", my_bucket, "/sid_pheno_files/genomic/"),
    intern = TRUE
)

# Regenie covariate file: IDs, covariates, PC1-PC16, population indicators, sex
per_protocol_statin_covs_df <- per_protocol_df %>%
    filter(group == "user") %>%
    select(
        FID, IID,
        low_hdl, high_tg, high_bmi, pd_status, smoking_status, htn_status, gd_status,
        index_age,
        all_of(paste0("PC", 1:16)),
        pop_black, pop_lat, pop_more, pop_asian, pop_aian, pop_mena,
        male
    )

write.table(
    per_protocol_statin_covs_df,
    file = "per_protocol_statin_covs_df.tsv",
    sep = "\t", quote = FALSE, row.names = FALSE, na = "NA"
)
system(
    paste0("gsutil cp ./per_protocol_statin_covs_df.tsv ", my_bucket, "/sid_pheno_files/genomic/"),
    intern = TRUE
)

In [None]:
# Genomic analysis files for the per-protocol subset - non-users only

# Regenie phenotype file: IDs plus the status and time columns
per_protocol_nu_pheno_df <- per_protocol_df %>%
    filter(group == "non-user") %>%
    select(all_of(c("FID", "IID", "status", "time")))

write.table(
    per_protocol_nu_pheno_df,
    file = "per_protocol_nu_pheno_df.tsv",
    sep = "\t", quote = FALSE, row.names = FALSE, na = "NA"
)
system(
    paste0("gsutil cp ./per_protocol_nu_pheno_df.tsv ", my_bucket, "/sid_pheno_files/genomic/"),
    intern = TRUE
)

# Regenie covariate file: IDs, covariates, PC1-PC16, population indicators, sex
per_protocol_nu_covs_df <- per_protocol_df %>%
    filter(group == "non-user") %>%
    select(
        FID, IID,
        low_hdl, high_tg, high_bmi, pd_status, smoking_status, htn_status, gd_status,
        index_age,
        all_of(paste0("PC", 1:16)),
        pop_black, pop_lat, pop_more, pop_asian, pop_aian, pop_mena,
        male
    )

write.table(
    per_protocol_nu_covs_df,
    file = "per_protocol_nu_covs_df.tsv",
    sep = "\t", quote = FALSE, row.names = FALSE, na = "NA"
)
system(
    paste0("gsutil cp ./per_protocol_nu_covs_df.tsv ", my_bucket, "/sid_pheno_files/genomic/"),
    intern = TRUE
)

In [None]:
# FID/IID pairs for every participant in the per-protocol subset (statin users and
# their matched non-users). Kept under its own name rather than reassigning
# `per_protocol_ids`: that object holds the person_id list used to build
# per_protocol_df, and overwriting it with a table that has no person_id column
# would make a later re-run of that subsetting step silently return zero rows.
per_protocol_fid_iid <- per_protocol_df %>% select(FID, IID)

# Save IDs (tab-separated, no header row)
write.table(per_protocol_fid_iid, "per_protocol_ids_v2.txt", sep = "\t", col.names = FALSE, quote = FALSE, row.names = FALSE)
system(paste0("gsutil cp ./", "per_protocol_ids_v2.txt", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

# FID/IID pairs for the statin users only
per_protocol_statin_ids <- per_protocol_df %>% filter(group == "user") %>% select(FID, IID)

# Save IDs (tab-separated, no header row)
write.table(per_protocol_statin_ids, "per_protocol_statin_ids_v2.txt", sep = "\t", col.names = FALSE, quote = FALSE, row.names = FALSE)
system(paste0("gsutil cp ./", "per_protocol_statin_ids_v2.txt", " ", my_bucket, "/sid_pheno_files/genomic/"), intern = TRUE)

# Self-Identified White subset

**Objective**: The purpose of this section is to create a subset of self-identified white statin users and non-users. Since most of our analysis is done in a pooled cohort to maintain statistical power, and white participants make up a large proportion of said cohort (71.9%), they may have an outsized impact on the associations modeled. Creating this subset allows us to see the magnitude of said impact.

In [None]:
# person_ids of statin users whose population is recorded as "White"
white_ids <- itt_df %>%
    filter(population == "White", group == "user") %>%
    select(person_id)

# Keep those users together with the non-users matched to them (same match_group)
white_df <- filter(itt_df, match_group %in% white_ids$person_id)

In [None]:
# Variable lists for Table 1
library(tableone)

# Variables passed to CreateTableOne() as factorVars
catVars <- c(
    "t2d_status", "sex_at_birth", "race", "ethnicity",
    "low_hdl", "high_tg", "high_bmi", "pd_status",
    "smoking_status", "htn_status", "gd_status",
    "group", "statin_type", "statin_intensity", "statin_lipo"
)

# Full variable list: the categorical variables above with the remaining
# variables (ages, propensity score, time) placed in their reporting order
myVars <- c(
    catVars[1:11],
    "index_age", "eof_age", "group", "propensity_scores", "time",
    catVars[13:15]
)

In [None]:
# Table 1 for the self-identified white subset, stratified by group
tab_white <- CreateTableOne(
    data = white_df,
    vars = myVars,
    factorVars = catVars,
    strata = "group",
    test = TRUE
)

# Render the table without echoing it, then show it as a data frame
tab_white %>%
    print(printToggle = FALSE, noSpaces = TRUE) %>%
    as.data.frame()

In [None]:
# Write the self-identified white data frame locally, then copy it to the workspace bucket
write.csv(white_df, file = "white_df_v2.csv")
system(
    paste0("gsutil cp ./white_df_v2.csv ", my_bucket, "/sid_pheno_files/"),
    intern = TRUE
)

In [None]:
# Genomic analysis files for the self-identified white subset - statin users only

# Regenie phenotype file: IDs plus the status and time columns
white_statin_pheno_df <- white_df %>%
    filter(group == "user") %>%
    select(all_of(c("FID", "IID", "status", "time")))

write.table(
    white_statin_pheno_df,
    file = "white_statin_pheno_df.tsv",
    sep = "\t", quote = FALSE, row.names = FALSE, na = "NA"
)
system(
    paste0("gsutil cp ./white_statin_pheno_df.tsv ", my_bucket, "/sid_pheno_files/genomic/"),
    intern = TRUE
)

# Regenie covariate file: IDs, covariates, PC1-PC16 and sex
# (no population indicator columns are selected for this subset)
white_statin_covs_df <- white_df %>%
    filter(group == "user") %>%
    select(
        FID, IID,
        low_hdl, high_tg, high_bmi, pd_status, smoking_status, htn_status, gd_status,
        index_age,
        all_of(paste0("PC", 1:16)),
        male
    )

write.table(
    white_statin_covs_df,
    file = "white_statin_covs_df.tsv",
    sep = "\t", quote = FALSE, row.names = FALSE, na = "NA"
)
system(
    paste0("gsutil cp ./white_statin_covs_df.tsv ", my_bucket, "/sid_pheno_files/genomic/"),
    intern = TRUE
)

In [None]:
# Genomic analysis files for the self-identified white subset - non-users only

# Regenie phenotype file: IDs plus the status and time columns
white_nu_pheno_df <- white_df %>%
    filter(group == "non-user") %>%
    select(all_of(c("FID", "IID", "status", "time")))

write.table(
    white_nu_pheno_df,
    file = "white_nu_pheno_df.tsv",
    sep = "\t", quote = FALSE, row.names = FALSE, na = "NA"
)
system(
    paste0("gsutil cp ./white_nu_pheno_df.tsv ", my_bucket, "/sid_pheno_files/genomic/"),
    intern = TRUE
)

# Regenie covariate file: IDs, covariates, PC1-PC16 and sex
# (no population indicator columns are selected for this subset)
white_nu_covs_df <- white_df %>%
    filter(group == "non-user") %>%
    select(
        FID, IID,
        low_hdl, high_tg, high_bmi, pd_status, smoking_status, htn_status, gd_status,
        index_age,
        all_of(paste0("PC", 1:16)),
        male
    )

write.table(
    white_nu_covs_df,
    file = "white_nu_covs_df.tsv",
    sep = "\t", quote = FALSE, row.names = FALSE, na = "NA"
)
system(
    paste0("gsutil cp ./white_nu_covs_df.tsv ", my_bucket, "/sid_pheno_files/genomic/"),
    intern = TRUE
)

In [None]:
# FID/IID pairs for everyone in the self-identified white subset.
# NOTE: this reuses the name white_ids, replacing the person_id table that was
# created when white_df was built.
white_ids <- select(white_df, FID, IID)

# Save IDs (tab-separated, no header row)
write.table(
    white_ids,
    file = "white_ids_v2.txt",
    sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)
system(
    paste0("gsutil cp ./white_ids_v2.txt ", my_bucket, "/sid_pheno_files/genomic/"),
    intern = TRUE
)

# FID/IID pairs for the statin users only
white_statin_ids <- white_df %>%
    filter(group == "user") %>%
    select(FID, IID)

# Save IDs (tab-separated, no header row)
write.table(
    white_statin_ids,
    file = "white_statin_ids_v2.txt",
    sep = "\t", quote = FALSE, row.names = FALSE, col.names = FALSE
)
system(
    paste0("gsutil cp ./white_statin_ids_v2.txt ", my_bucket, "/sid_pheno_files/genomic/"),
    intern = TRUE
)

# Check Adherence

**Objective**: The purpose of this section is to check the statin adherence for each subset, measured as the proportion of days covered (PDC): the total time covered by statin prescriptions divided by the follow-up time. This is to make sure that, on average, statin users are covered by statin prescriptions for a reasonable share of their follow-up period so that the conclusions drawn are more credible.

In [None]:
# Use calculate_total_time() function from previous section

# Intention-to-treat adherence: one row per prescription, with the person-level
# proportion of days covered (PDC) attached to each row
ad_check_itt <- inner_join(itt_df, statin_rx_df_itt, by = join_by(person_id, t2d_status, eof_age)) %>%
    group_by(person_id) %>%
    # Ignore prescriptions that start after the end of follow-up
    filter(as.Date(rx_start_date) <= as.Date(eof_date)) %>%
    mutate(
        # Truncate each prescription at the end of follow-up. pmin() on Date values keeps
        # the Date class, whereas ifelse() drops attributes.
        new_rx_end_date = pmin(as.Date(rx_end_date), as.Date(eof_date)),
        totalSUP = calculate_total_time(cur_data()),
        PDC = as.numeric(totalSUP / follow_up_time_1),
        pdc_adherent = ifelse(PDC >= 0.8, 1, 0)
    ) %>%
    ungroup()

# rx_count/user_count describe the prescription-level table. PDC is computed per
# person, so its statistics are taken over one row per person; summarising the
# prescription rows directly would count users with many prescriptions many times
# in the median and mean.
ad_check_itt %>%
    summarize(rx_count = n(), user_count = n_distinct(person_id)) %>%
    bind_cols(
        ad_check_itt %>%
            distinct(person_id, .keep_all = TRUE) %>%
            summarize(min = min(PDC), median = median(PDC), mean = mean(PDC), max = max(PDC))
    )

In [None]:
# LDL-C filtered adherence: one row per prescription, with the person-level
# proportion of days covered (PDC) attached to each row
ad_check_ldl <- inner_join(ldl30_df, statin_rx_df_itt, by = join_by(person_id, t2d_status, eof_age)) %>%
    group_by(person_id) %>%
    # Ignore prescriptions that start after the end of follow-up
    filter(as.Date(rx_start_date) <= as.Date(eof_date)) %>%
    mutate(
        # Truncate each prescription at the end of follow-up. pmin() on Date values keeps
        # the Date class, whereas ifelse() drops attributes.
        new_rx_end_date = pmin(as.Date(rx_end_date), as.Date(eof_date)),
        totalSUP = calculate_total_time(cur_data()),
        PDC = as.numeric(totalSUP / follow_up_time_1),
        pdc_adherent = ifelse(PDC >= 0.8, 1, 0)
    ) %>%
    ungroup()

# rx_count/user_count describe the prescription-level table. PDC is computed per
# person, so its statistics are taken over one row per person; summarising the
# prescription rows directly would count users with many prescriptions many times
# in the median and mean.
ad_check_ldl %>%
    summarize(rx_count = n(), user_count = n_distinct(person_id)) %>%
    bind_cols(
        ad_check_ldl %>%
            distinct(person_id, .keep_all = TRUE) %>%
            summarize(min = min(PDC), median = median(PDC), mean = mean(PDC), max = max(PDC))
    )

In [None]:
# Per protocol adherence: one row per prescription, with the person-level
# proportion of days covered (PDC) attached to each row
ad_check_pp <- inner_join(per_protocol_df, statin_rx_df_itt, by = join_by(person_id, t2d_status, eof_age)) %>%
    group_by(person_id) %>%
    # Ignore prescriptions that start after the end of follow-up
    filter(as.Date(rx_start_date) <= as.Date(eof_date)) %>%
    mutate(
        # Truncate each prescription at the end of follow-up. pmin() on Date values keeps
        # the Date class, whereas ifelse() drops attributes.
        new_rx_end_date = pmin(as.Date(rx_end_date), as.Date(eof_date)),
        totalSUP = calculate_total_time(cur_data()),
        PDC = as.numeric(totalSUP / follow_up_time_1),
        pdc_adherent = ifelse(PDC >= 0.8, 1, 0)
    ) %>%
    ungroup()

# rx_count/user_count describe the prescription-level table. PDC is computed per
# person, so its statistics are taken over one row per person; summarising the
# prescription rows directly would count users with many prescriptions many times
# in the median and mean.
ad_check_pp %>%
    summarize(rx_count = n(), user_count = n_distinct(person_id)) %>%
    bind_cols(
        ad_check_pp %>%
            distinct(person_id, .keep_all = TRUE) %>%
            summarize(min = min(PDC), median = median(PDC), mean = mean(PDC), max = max(PDC))
    )

In [None]:
# Self-identified white adherence: one row per prescription, with the person-level
# proportion of days covered (PDC) attached to each row
ad_check_white <- inner_join(white_df, statin_rx_df_itt, by = join_by(person_id, t2d_status, eof_age)) %>%
    group_by(person_id) %>%
    # Ignore prescriptions that start after the end of follow-up
    filter(as.Date(rx_start_date) <= as.Date(eof_date)) %>%
    mutate(
        # Truncate each prescription at the end of follow-up. pmin() on Date values keeps
        # the Date class, whereas ifelse() drops attributes.
        new_rx_end_date = pmin(as.Date(rx_end_date), as.Date(eof_date)),
        totalSUP = calculate_total_time(cur_data()),
        PDC = as.numeric(totalSUP / follow_up_time_1),
        pdc_adherent = ifelse(PDC >= 0.8, 1, 0)
    ) %>%
    ungroup()

# rx_count/user_count describe the prescription-level table. PDC is computed per
# person, so its statistics are taken over one row per person; summarising the
# prescription rows directly would count users with many prescriptions many times
# in the median and mean.
ad_check_white %>%
    summarize(rx_count = n(), user_count = n_distinct(person_id)) %>%
    bind_cols(
        ad_check_white %>%
            distinct(person_id, .keep_all = TRUE) %>%
            summarize(min = min(PDC), median = median(PDC), mean = mean(PDC), max = max(PDC))
    )

# Create inclusion flowchart

**Objective**: The purpose of this section is to create a flowchart of inclusion criteria for publication.

In [None]:
# Label text for each flowchart node, in node order: entry i fills placeholder @@i.
# Entries 3-12 form the statin-user branch and entries 13-21 the non-user branch.
# NOTE(review): entry 5 reads "at least ≥30 days", which is redundant wording.
eval_expressions <- list(
  'Individuals in the AoURP \nn=633,547',
  'Has an end of follow-up date (random glucose, \nfasting glucose, HbA1c, or death date) \nn=297,459',
  'Statin users',
  '≥1 statin prescription before the end of follow-up \nn=64,146',
  '≥6 months of EHR data before statin initiation \nand at least ≥30 days of follow-up after statin initiation \nn=50,205',
  'No missing demographic characteristics \nn=48,095',
  'Does not have Type 1 diabetes \nn=47,576',
  'No missing covariate data \nn=41,995',
  'Has srWGS data \nn=35,668',
  'Matched to two statin non-users \nn=15,767',
  '≥80% adherence and statin Rx ≤30 days \nbefore end of follow-up (per protocol) \nn=3,782',
  'Self-identified white \nn=11,339',
  'Statin non-users',
  'Did not have statin Rx or had <30 days of \nfollow-up after starting statins \nn=247,254',
  'No missing demographic characteristics \nn=159,011',
  'Does not have Type 1 diabetes \nn=156,759',
  'No missing covariate data \nn=73,255',
  'Has srWGS data \nn=61,138',
  'Matched to a statin user \nn=31,534',
  'Matched to a statin user adhering \nto per protocol conditions \nn=7,564',
  'Self-identified white \nn=22,678'
)

# Graphviz DOT template; each node label is an @@<index> placeholder
diagram <- "
digraph flowchart {
  # node definitions with substituted label text
  node [fontname = Helvetica, shape = rectangle]
  tab1 [label = '@@1']
  tab2 [label = '@@2']
  tab3 [label = '@@3']
  tab4 [label = '@@4']
  tab5 [label = '@@5']
  tab6 [label = '@@6']
  tab7 [label = '@@7']
  tab8 [label = '@@8']
  tab9 [label = '@@9']
  tab10 [label = '@@10']
  tab11 [label = '@@11']
  tab12 [label = '@@12']
  tab13 [label = '@@13']
  tab14 [label = '@@14']
  tab15 [label = '@@15']
  tab16 [label = '@@16']
  tab17 [label = '@@17']
  tab18 [label = '@@18']
  tab19 [label = '@@19']
  tab20 [label = '@@20']
  tab21 [label = '@@21'] 

  # edge definitions with the node IDs
  tab1 -> tab2;
  tab2 -> tab3 -> tab4 -> tab5 -> tab6 -> tab7 -> tab8 -> tab9 -> tab10;
  tab10 -> tab11;
  tab10 -> tab12;
  tab2 -> tab13 -> tab14 -> tab15 -> tab16 -> tab17 -> tab18 -> tab19;
  tab19 -> tab20;
  tab19 -> tab21;
}
"

# Fill in the placeholders from the highest index down, matching literally.
# Working downwards means '@@1' is only substituted after '@@10'..'@@19' are
# already gone (likewise '@@2' after '@@20'/'@@21'), so a short placeholder can
# never consume the prefix of a longer one.
for (node_idx in rev(seq_along(eval_expressions))) {
  diagram <- gsub(
    paste0("@@", node_idx),
    eval_expressions[[node_idx]],
    diagram,
    fixed = TRUE
  )
}

# Render the completed DOT source
grViz(diagram)