# SID Genetics Study Step 4: Survival Analysis

## Objective
The purpose of this notebook is to run survival analyses on the main study population and its subsets. 
- Statistical techniques: survival analysis, adjusted Cox regression models, Kaplan-Meier curves

# Pull in Data

**Objective**: The purpose of this section is to load necessary packages + citations, and pull in data frames created in previous steps.

In [None]:
# One-time package installation; uncomment the lines below on a fresh environment.
# NOTE(review): tableone is listed here and cited in the next cell, but it is not
# attached with library() in this cell.
# install.packages('allofus')
# install.packages('tidyverse')
# install.packages('stats')
# install.packages('survival')
# install.packages('survminer')
# install.packages('multcomp')
# install.packages("cowplot")
# install.packages("tableone")

# Attach the packages used throughout this notebook
library(allofus)
library(tidyverse)
library(stats)
library(survival)
library(survminer)
library(multcomp)
library(ggplot2)
library(cowplot)

# Workspace bucket location, read from the WORKSPACE_BUCKET environment variable;
# it is used below as the prefix of the gsutil source paths.
my_bucket <- Sys.getenv('WORKSPACE_BUCKET')

In [None]:
# Retrieve the citation entry of every package listed in the setup cell
# (each call is auto-printed because it sits at the top level of the cell)
citation('allofus')
citation('tidyverse')
citation('stats')
citation('survival')
citation('survminer')
citation('multcomp')
citation('cowplot')
citation('tableone')

In [None]:
# Pull in data frames for each subset.
# Each file is copied from the workspace bucket into the working directory with gsutil
# and then read with read.csv(); dim() is printed as a quick check of the loaded size.
# `intern = TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.

# Intention-to-treat population
system(paste0("gsutil cp ", my_bucket, "/sid_pheno_files/", "itt_df_v2.csv", " ."), intern = TRUE)
itt_df <- read.csv("itt_df_v2.csv")

dim(itt_df)
# head(itt_df)

# >=30% decrease in LDL-C subset
system(paste0("gsutil cp ", my_bucket, "/sid_pheno_files/", "ldl30_df_v2.csv", " ."), intern = TRUE)
ldl30_df <- read.csv("ldl30_df_v2.csv")

dim(ldl30_df)
# head(ldl30_df)

# Per-protocol subset
system(paste0("gsutil cp ", my_bucket, "/sid_pheno_files/", "per_protocol_df_v2.csv", " ."), intern = TRUE)
per_protocol_df <- read.csv("per_protocol_df_v2.csv")

dim(per_protocol_df)
# head(per_protocol_df)

# Self-identified white subset
system(paste0("gsutil cp ", my_bucket, "/sid_pheno_files/", "white_df_v2.csv", " ."), intern = TRUE)
white_df <- read.csv("white_df_v2.csv")

dim(white_df)
# head(white_df)

# LDL-C Analysis data frame
system(paste0("gsutil cp ", my_bucket, "/sid_pheno_files/", "ldl_df_v3.csv", " ."), intern = TRUE)
ldl30_df2 <- read.csv("ldl_df_v3.csv")

dim(ldl30_df2)
# head(ldl30_df2)

In [None]:
# Convert the LDL-C change group columns to factors with an explicit level order,
# so that "non_user" is the first (reference) level in every model.
# NOTE(review): these levels spell the reference group "non_user", while the statin
# type/intensity columns used later spell it "non-user" -- confirm against the data,
# because values that match no level become NA.
ldl_df <- ldl30_df2 %>%
    mutate(
        # Groups that can also record an increase in LDL-C
        across(
            all_of(c("max_change_group", "last_change_group",
                     "last_statin_change_group", "first_post_change_group")),
            function(x) factor(x, levels = c("non_user", "increase", "low", "moderate", "high"))
        ),
        # Maximum-decrease groups have no "increase" level
        max_dec_group = factor(max_dec_group, levels = c("non_user", "low", "moderate", "high"))
    )

# Helper Functions

**Objective**: The purpose of this section is to create functions to streamline survival analysis.

In [None]:
# Set plot size
options(repr.plot.width = 8, repr.plot.height = 8)

# Plot Kaplan-Meier curves by treatment group with a risk table.
#
# Args:
#   df:   data frame with columns time, status and group.
#   main: plot title.
#
# Returns:
#   The object produced by ggsurvplot() with the labels applied.
plot_surv <- function(df, main) {

    # Fit the unadjusted Kaplan-Meier estimator per group
    km_fit <- survfit(Surv(time, status) ~ group, data = df)

    # fun = "event" draws cumulative events (1 - survival), so the y axis is labelled
    # as an event probability rather than a survival probability.
    # NOTE(review): the x label assumes `time` is recorded in years -- confirm.
    ggsurvplot(km_fit, data = df, risk.table = TRUE, fun = "event") +
        labs(
            title = main,
            x = "Years",
            y = "Cumulative event probability"
        )
}

In [None]:
# Fit the adjusted Cox regression for treatment group and tabulate hazard ratios.
#
# Args:
#   df: data frame with time, status, group and the adjustment covariates named in the
#       formulas below. `population` is only used as a covariate when it varies.
#
# Returns:
#   A data frame with one row per model coefficient: `variables` (coefficient name) and
#   `hazard`, formatted as "HR (95% CI lower-upper , P=p)".
get_cox <- function(df) {

    # The population covariate is dropped when every recorded value is "White" (it would
    # have a single level). na.rm = TRUE keeps a missing population value from turning
    # the condition into NA, which `if` cannot evaluate.
    white_only <- all(df$population == "White", na.rm = TRUE)

    if (white_only) {

        # Adjusted Cox regression model without the population covariate
        surv_adj <- coxph(Surv(time, status) ~ group + index_age + sex_at_birth +
                        low_hdl + high_tg + high_bmi + smoking_status + htn_status +
                        pd_status + gd_status, data = df)

    } else {

        # Adjusted Cox regression model including the population covariate
        surv_adj <- coxph(Surv(time, status) ~ group + index_age + sex_at_birth + population +
                        low_hdl + high_tg + high_bmi + smoking_status + htn_status +
                        pd_status + gd_status, data = df)

    }

    # Compute the confidence limits once and move them to the hazard-ratio scale
    ci <- exp(confint(surv_adj))
    hr <- round(exp(coef(surv_adj)), digits = 2)
    lower <- round(ci[, 1], digits = 2)
    upper <- round(ci[, 2], digits = 2)
    p_val <- sprintf("%.2e", coef(summary(surv_adj))[, "Pr(>|z|)"])

    res_df <- data.frame(variables = names(coef(surv_adj)),
                         hazard = paste0(hr, " (95% CI ", lower, "-", upper, " , P=", p_val, ")"))

    return(res_df)

}

In [None]:
# Fit an adjusted Cox regression for one exposure variable (statin type, intensity,
# lipophilicity, or any other column named in `statin`) and tabulate hazard ratios.
#
# Args:
#   df:     data frame with time, status, the exposure column and the adjustment
#           covariates listed below.
#   statin: name of the exposure column, as a string.
#
# Returns:
#   A data frame with one row per model coefficient: `variables` (coefficient name) and
#   `hazard`, formatted as "HR (95% CI lower-upper , P=p)".
#
# NOTE(review): unlike get_cox(), this model does not adjust for pd_status -- confirm
# that the difference is intended.
get_cox_statin <- function(df, statin) {

    # The population covariate is dropped when every recorded value is "White" (it would
    # have a single level). na.rm = TRUE keeps a missing population value from turning
    # the condition into NA, which `if` cannot evaluate.
    white_only <- all(df$population == "White", na.rm = TRUE)

    # Right-hand-side terms, in the same order as the fitted model reports them
    rhs <- c(statin, "index_age", if (!white_only) "population", "sex_at_birth",
             "low_hdl", "high_tg", "high_bmi", "smoking_status", "htn_status", "gd_status")
    model_formula <- as.formula(paste0("Surv(time, status) ~ ", paste(rhs, collapse = " + ")))

    # Run adjusted Cox regression model
    surv_adj <- coxph(model_formula, data = df)

    # Compute the confidence limits once and move them to the hazard-ratio scale
    ci <- exp(confint(surv_adj))
    hr <- round(exp(coef(surv_adj)), digits = 2)
    lower <- round(ci[, 1], digits = 2)
    upper <- round(ci[, 2], digits = 2)
    p_val <- sprintf("%.2e", coef(summary(surv_adj))[, "Pr(>|z|)"])

    # Create data frame with outcomes
    res_df <- data.frame(variables = names(coef(surv_adj)),
                         hazard = paste0(hr, " (95% CI ", lower, "-", upper, " , P=", p_val, ")"))

    return(res_df)

}

In [None]:
# Plot Kaplan-Meier curves stratified by statin type, intensity, or lipophilicity.
#
# Args:
#   df:     data frame with time, status and the stratifying column.
#   statin: name of the stratifying column, as a string.
#   main:   plot title.
#
# Returns:
#   The object produced by ggsurvplot() with the labels applied.
plot_surv_statin <- function(df, statin, main) {

    # Build the model formula from the column name
    km_formula <- as.formula(paste0("Surv(time, status) ~ ", statin))

    # Fit with survminer's surv_fit() wrapper: the formula lives in a local variable, and
    # surv_fit() stores the formula and data on the fit so ggsurvplot() can recover them.
    km_fit <- surv_fit(km_formula, data = df)

    # fun = "event" draws cumulative events (1 - survival), so the y axis is labelled
    # as an event probability rather than a survival probability.
    # NOTE(review): the x label assumes `time` is recorded in years -- confirm.
    ggsurvplot(km_fit, data = df, risk.table = TRUE, fun = "event") +
        labs(
            title = main,
            x = "Years",
            y = "Cumulative event probability"
        )
}

In [None]:
# Fit an adjusted Cox regression for one exposure variable that additionally adjusts for
# the first_ldl column, and tabulate hazard ratios.
#
# Args:
#   df:     data frame with time, status, the exposure column, first_ldl and the
#           adjustment covariates listed below.
#   statin: name of the exposure column, as a string.
#
# Returns:
#   A data frame with one row per model coefficient: `variables` (coefficient name) and
#   `hazard`, formatted as "HR (95% CI lower-upper , P=p)".
get_cox_ldl_adj <- function(df, statin) {

    # The population covariate is dropped when every recorded value is "White" (it would
    # have a single level). na.rm = TRUE keeps a missing population value from turning
    # the condition into NA, which `if` cannot evaluate.
    white_only <- all(df$population == "White", na.rm = TRUE)

    # Right-hand-side terms, in the same order as the fitted model reports them
    rhs <- c(statin, "first_ldl", "index_age", if (!white_only) "population", "sex_at_birth",
             "low_hdl", "high_tg", "high_bmi", "smoking_status", "htn_status", "gd_status")
    model_formula <- as.formula(paste0("Surv(time, status) ~ ", paste(rhs, collapse = " + ")))

    # Run adjusted Cox regression model
    surv_adj <- coxph(model_formula, data = df)

    # Compute the confidence limits once and move them to the hazard-ratio scale
    ci <- exp(confint(surv_adj))
    hr <- round(exp(coef(surv_adj)), digits = 2)
    lower <- round(ci[, 1], digits = 2)
    upper <- round(ci[, 2], digits = 2)
    p_val <- sprintf("%.2e", coef(summary(surv_adj))[, "Pr(>|z|)"])

    res_df <- data.frame(variables = names(coef(surv_adj)),
                         hazard = paste0(hr, " (95% CI ", lower, "-", upper, " , P=", p_val, ")"))

    return(res_df)

}

In [None]:
# Tabulate adjusted linear-regression estimates (with 95% CI and p-values) for every
# combination of a continuous LDL-C change outcome and a categorical grouping variable.
#
# Args:
#   df:   data frame with the outcome columns, the grouping columns and the adjustment
#         covariates named in the formula below.
#   cont: character vector of continuous outcome column names.
#   cat:  character vector of categorical grouping column names.
#
# Returns:
#   A data frame with four rows per outcome (levels "increase", "low", "moderate",
#   "high", in that order): a `continuous` column holding the outcome name and one column
#   per grouping variable. Each cell reads
#   "<level> <estimate> (95% CI <lower>-<upper> , P=<p>)", or just "<level>" when the
#   model has no estimate for that level. Outcomes appear in reverse order of `cont`.
ldl_change_reg <- function(df, cont, cat) {

    # Levels reported for every grouping variable, in display order
    group_levels <- c("increase", "low", "moderate", "high")

    # Format the coefficient of one level. Looking the term up in the coefficient table
    # (rather than testing the raw column with any()) stays well-defined when the column
    # contains missing values or the level has no estimable coefficient.
    format_level <- function(lvl, var, coef_tab, ci_tab) {
        term <- paste0(var, lvl)
        if (!(term %in% rownames(coef_tab)) || is.na(coef_tab[term, 1])) {
            return(lvl)
        }
        paste0(lvl, " ", round(coef_tab[term, 1], digits = 2),
               " (95% CI ", round(ci_tab[term, 1], digits = 2), "-",
               round(ci_tab[term, 2], digits = 2), " , P=",
               sprintf("%.2e", coef_tab[term, "Pr(>|t|)"]), ")")
    }

    # Initialize final data frame
    final_df <- data.frame()

    for (ci in cont) {

        # One row per reported level for the current outcome
        new_df <- data.frame(continuous = rep(ci, length(group_levels)))

        for (a in cat) {

            # Create formula
            model_formula <- as.formula(paste0(ci, " ~ ", a, " + index_age + population + sex_at_birth +
                                    low_hdl + high_tg + high_bmi + smoking_status + htn_status + gd_status"))

            # Run linear regression; the coefficient table and CIs are computed once per model
            model <- lm(model_formula, data = df)
            coef_tab <- coef(summary(model))
            ci_tab <- confint(model)

            # Add one column named after the current grouping variable
            new_df[[a]] <- vapply(group_levels, format_level, character(1),
                                  var = a, coef_tab = coef_tab, ci_tab = ci_tab,
                                  USE.NAMES = FALSE)

        }

        # Prepend, so that the most recently processed outcome comes first
        final_df <- rbind(new_df, final_df)

    }

    return(final_df)
}

In [None]:
# Tabulate adjusted Cox-regression hazard ratios (with 95% CI and p-values) for each
# categorical LDL-C grouping variable.
#
# Args:
#   df:   data frame with time, status, the grouping columns and the adjustment
#         covariates named in the formula below.
#   cont: character vector of labels; each label only fills the `continuous` column of
#         its block of rows (the fitted model does not depend on it).
#   cat:  character vector of categorical grouping column names.
#
# Returns:
#   A data frame with four rows per label (levels "increase", "low", "moderate", "high",
#   in that order): a `continuous` column holding the label and one column per grouping
#   variable. Each cell reads "<level> <HR> (95% CI <lower>-<upper> , P=<p>)", or just
#   "<level>" when the model has no estimate for that level. Labels appear in reverse
#   order of `cont`.
ldl_change_reg_surv <- function(df, cont, cat) {

    # Levels reported for every grouping variable, in display order
    group_levels <- c("increase", "low", "moderate", "high")

    # Format the hazard ratio of one level. Looking the term up in the coefficient table
    # (rather than testing the raw column with any()) stays well-defined when the column
    # contains missing values or the level has no estimable coefficient.
    format_level <- function(lvl, var, coef_tab, ci_tab) {
        term <- paste0(var, lvl)
        if (!(term %in% rownames(coef_tab)) || is.na(coef_tab[term, 1])) {
            return(lvl)
        }
        paste0(lvl, " ", round(exp(coef_tab[term, 1]), digits = 2),
               " (95% CI ", round(exp(ci_tab[term, 1]), digits = 2), "-",
               round(exp(ci_tab[term, 2]), digits = 2), " , P=",
               sprintf("%.2e", coef_tab[term, "Pr(>|z|)"]), ")")
    }

    # Initialize final data frame
    final_df <- data.frame()

    for (ci in cont) {

        # One row per reported level for the current label
        new_df <- data.frame(continuous = rep(ci, length(group_levels)))

        for (a in cat) {

            # Create formula
            model_formula <- as.formula(paste0("Surv(time, status) ~ ", a, " + index_age + population + sex_at_birth +
                                    low_hdl + high_tg + high_bmi + smoking_status + htn_status + gd_status"))

            # Run Cox regression; the coefficient table and CIs are computed once per model
            model <- coxph(model_formula, data = df)
            coef_tab <- coef(summary(model))
            ci_tab <- confint(model)

            # Add one column named after the current grouping variable
            new_df[[a]] <- vapply(group_levels, format_level, character(1),
                                  var = a, coef_tab = coef_tab, ci_tab = ci_tab,
                                  USE.NAMES = FALSE)

        }

        # Prepend, so that the most recently processed label comes first
        final_df <- rbind(new_df, final_df)

    }

    return(final_df)
}

In [None]:
# Summarize one LDL-C change measure within each level of a grouping variable.
#
# Args:
#   df:  data frame with the grouping column, the LDL-C column and t2d_status.
#   var: name of the grouping column, as a string.
#   ldl: name of the LDL-C change column, as a string.
#
# Returns:
#   One row per group (sorted by group) with the measure name (`label`), the number of
#   rows with a non-missing measure (`count`), the number and percentage of rows whose
#   t2d_status is "Event", and the min / median / mean / max of the measure, each
#   rounded to three decimals and multiplied by 100.
summarize_ldl <- function(df, var, ldl) {

    # Round to three decimals, then scale by 100
    scale_100 <- function(x) round(x, digits = 3) * 100

    # Keep only rows where the measure is recorded, then group
    observed <- filter(df, !is.na(.data[[ldl]]))
    grouped <- group_by(observed, .data[[var]])

    per_group <- summarize(
        grouped,
        label = ldl,
        count = n(),
        t2d = sum(t2d_status == "Event"),
        t2d_prop = scale_100(sum(t2d_status == "Event") / n()),
        min = scale_100(min(.data[[ldl]], na.rm = TRUE)),
        median = scale_100(median(.data[[ldl]], na.rm = TRUE)),
        mean = scale_100(mean(.data[[ldl]], na.rm = TRUE)),
        max = scale_100(max(.data[[ldl]], na.rm = TRUE))
    )

    arrange(per_group, .data[[var]])

}

# ITT Subset

**Objective**: The purpose of this section is to run survival analyses on the intention-to-treat (ITT) study population, along with statin type, intensity, and lipophilicity analyses.

## Preliminary survival analysis

In [None]:
# Median follow-up time per treatment group, reported for each analysis subset
summarize(group_by(itt_df, group), median = median(time))
summarize(group_by(ldl30_df, group), median = median(time))
summarize(group_by(per_protocol_df, group), median = median(time))
summarize(group_by(white_df, group), median = median(time))

In [None]:
# Primary analysis in the intention-to-treat population: statin users vs. non-users.
# Draw the Kaplan-Meier curves, then tabulate the adjusted Cox regression results.
plot_surv(df = itt_df, main = "Intention-to-Treat")
get_cox(df = itt_df)

## Statin Type

In [None]:
# Adjusted Cox regression by statin type at the start, compared with non-users.
# Make "non-user" the reference level before fitting.
itt_df$statin_type_start <- relevel(as.factor(itt_df$statin_type_start), ref = "non-user")
get_cox_statin(df = itt_df, statin = "statin_type_start")

In [None]:
# Adjusted Cox regression by statin type at the end, compared with non-users.
# Make "non-user" the reference level before fitting.
itt_df$statin_type_end <- relevel(as.factor(itt_df$statin_type_end), ref = "non-user")
get_cox_statin(df = itt_df, statin = "statin_type_end")

## Statin Intensity

In [None]:
# Count rows per statin intensity category (start and end), including missing values
table(ldl_df[["statin_intensity_start"]], useNA = "always")
table(ldl_df[["statin_intensity_end"]], useNA = "always")

In [None]:
# Rows whose statin intensity is missing (some are missing due to missing dosages)
statin_intensity_missing_start <- filter(ldl_df, is.na(statin_intensity_start))
statin_intensity_missing_end <- filter(ldl_df, is.na(statin_intensity_end))

# Drop every row whose match_group equals the person_id of a row with missing intensity,
# so the matching structure is maintained.
# NOTE(review): presumably match_group holds the person_id that identifies the matched
# set -- confirm against the matching step.
statin_intensity_start_df <- filter(ldl_df, !(match_group %in% statin_intensity_missing_start$person_id))
statin_intensity_end_df <- filter(ldl_df, !(match_group %in% statin_intensity_missing_end$person_id))

In [None]:
# Re-count the statin intensity categories after removing the affected match groups
table(statin_intensity_start_df[["statin_intensity_start"]], useNA = "always")
table(statin_intensity_end_df[["statin_intensity_end"]], useNA = "always")

In [None]:
# Run adjusted Cox regression model
# Primary objective is to find the hazard of developing NOD by starting statin intensity when compared to non-users
# Order the intensity levels explicitly; listing 'non-user' first makes it the reference
# level, so no separate relevel() step is needed.
statin_intensity_start_df$statin_intensity_start <- factor(statin_intensity_start_df$statin_intensity_start, levels = c('non-user', 'low', 'moderate', 'high'))
get_cox_statin(statin_intensity_start_df, "statin_intensity_start")

In [None]:
# Run adjusted Cox regression model
# Primary objective is to find the hazard of developing NOD by ending statin intensity when compared to non-users
# Order the intensity levels explicitly; listing 'non-user' first makes it the reference
# level, so no separate relevel() step is needed.
statin_intensity_end_df$statin_intensity_end <- factor(statin_intensity_end_df$statin_intensity_end, levels = c('non-user', 'low', 'moderate', 'high'))
get_cox_statin(statin_intensity_end_df, "statin_intensity_end")

In [None]:
# Summarize every LDL-C change measure by starting statin intensity and stack the
# per-measure summaries in the order the measures are listed
ldl_summ_start <- Reduce(
    rbind,
    lapply(
        c('max_change_ldl', 'max_decrease_ldl', 'change_ldl_last',
          'change_ldl_last_statin', 'change_ldl_first_post'),
        function(ldl_var) summarize_ldl(statin_intensity_start_df, 'statin_intensity_start', ldl_var)
    ),
    data.frame()
)

ldl_summ_start

In [None]:
# Summarize every LDL-C change measure by ending statin intensity and stack the
# per-measure summaries in the order the measures are listed
ldl_summ_end <- Reduce(
    rbind,
    lapply(
        c('max_change_ldl', 'max_decrease_ldl', 'change_ldl_last',
          'change_ldl_last_statin', 'change_ldl_first_post'),
        function(ldl_var) summarize_ldl(statin_intensity_end_df, 'statin_intensity_end', ldl_var)
    ),
    data.frame()
)

ldl_summ_end

In [None]:
# Make "non-user" the reference level of both statin intensity columns
ldl_df <- ldl_df %>%
    mutate(
        statin_intensity_start = relevel(as.factor(statin_intensity_start), ref = "non-user"),
        statin_intensity_end = relevel(as.factor(statin_intensity_end), ref = "non-user")
    )

# Regress each LDL-C change measure on starting and ending statin intensity
ldl_change_reg(
    df = ldl_df,
    cont = c("max_change_ldl", "max_decrease_ldl", "change_ldl_last",
             "change_ldl_last_statin", "change_ldl_first_post"),
    cat = c("statin_intensity_start", "statin_intensity_end")
)

## Statin Lipophilicity

In [None]:
# Adjusted Cox regression by statin lipophilicity at the start, compared with non-users.
# Make "non-user" the reference level before fitting.
itt_df$statin_lipo_start <- relevel(as.factor(itt_df$statin_lipo_start), ref = "non-user")
get_cox_statin(df = itt_df, statin = "statin_lipo_start")

In [None]:
# Adjusted Cox regression by statin lipophilicity at the end, compared with non-users.
# Make "non-user" the reference level before fitting.
itt_df$statin_lipo_end <- relevel(as.factor(itt_df$statin_lipo_end), ref = "non-user")
get_cox_statin(df = itt_df, statin = "statin_lipo_end")

# LDL Analysis

**Objective**: The purpose of this section is to analyze the associations of baseline LDL-C and change in LDL-C with NOD.

## ≥30% Decrease in LDL-C Subset

In [None]:
# Hazard of developing NOD by treatment group in the ≥30% decrease in LDL-C subset.
# Draw the Kaplan-Meier curves, then tabulate the adjusted Cox regression results.
plot_surv(df = ldl30_df, main = "≥30% Decrease in LDL-C")
get_cox(df = ldl30_df)

## Statin Intensity subsets based on LDL-C

In [None]:
# Sanity check: regress each LDL-C change measure on each LDL-lowering intensity group
# to confirm that the groups follow the expected trends
ldl_change_reg(
    df = ldl_df,
    cont = c("max_change_ldl", "max_decrease_ldl", "change_ldl_last",
             "change_ldl_last_statin", "change_ldl_first_post"),
    cat = c("max_change_group", "max_dec_group", "last_change_group",
            "last_statin_change_group", "first_post_change_group")
)

In [None]:
# Print the summary of every LDL-C change measure within every LDL-C change group.
# summarize_ldl() takes the data frame as its first argument, followed by the grouping
# column and the LDL-C column.
for (l in c('max_change_ldl', 'max_decrease_ldl', 'change_ldl_last',
            'change_ldl_last_statin', 'change_ldl_first_post')) {

    for (i in c('max_change_group', 'max_dec_group', 'last_change_group',
                'last_statin_change_group', 'first_post_change_group')) {

        print(summarize_ldl(ldl_df, i, l))

    }

}

In [None]:
# Adjusted Cox regression hazard ratios for each LDL-C change group variable;
# "status" is passed as the row label for the resulting table
ldl_change_reg_surv(
    df = ldl_df,
    cont = c("status"),
    cat = c("max_change_group", "max_dec_group", "last_change_group",
            "last_statin_change_group", "first_post_change_group")
)

### Baseline LDL-C Analysis

In [None]:
# Adjusted Cox regression among statin users:
# hazard of developing NOD as a function of baseline LDL-C
get_cox_statin(df = filter(ldl_df, group == "user"), statin = "baseline_ldl")

In [None]:
# Effect of a 10 mg/dL increase in baseline LDL-C, via a linear function of the coefficient

# Adjusted Cox regression model restricted to statin users
# NOTE(review): this model adjusts for pd_status, which get_cox_statin() does not -- confirm
model <- coxph(
    Surv(time, status) ~ baseline_ldl + index_age + sex_at_birth + population +
        low_hdl + high_tg + high_bmi + smoking_status + htn_status +
        pd_status + gd_status,
    data = ldl_df %>% filter(group == 'user')
)

# General linear hypothesis test for 10 times the baseline_ldl coefficient
base_sum <- summary(glht(model, linfct = "10 * baseline_ldl = 0"))
base_sum

# Estimate and confidence limits on the log-hazard scale
df <- confint(base_sum)$confint %>% as_tibble()
df

# Exponentiate all three columns to obtain the hazard ratio and its limits
mutate(df, across(c(Estimate, lwr, upr), exp))

### Max Change in LDL-C Analysis

In [None]:
# Adjusted Cox regression among statin users:
# hazard of developing NOD as a function of the maximum change in LDL-C
get_cox_statin(df = filter(ldl_df, group == "user"), statin = "max_change_ldl")

In [None]:
# Effect of a 10% decrease in the maximum change in LDL-C, via a linear function of the coefficient

# Adjusted Cox regression model restricted to statin users
# NOTE(review): this model adjusts for pd_status, which get_cox_statin() does not -- confirm
model <- coxph(
    Surv(time, status) ~ max_change_ldl + index_age + sex_at_birth + population +
        low_hdl + high_tg + high_bmi + smoking_status + htn_status +
        pd_status + gd_status,
    data = ldl_df %>% filter(group == 'user')
)

# General linear hypothesis test for -0.1 times the max_change_ldl coefficient
base_sum <- summary(glht(model, linfct = "-0.1 * max_change_ldl = 0"))
base_sum

# Estimate and confidence limits on the log-hazard scale
df <- confint(base_sum)$confint %>% as_tibble()
df

# Exponentiate all three columns to obtain the hazard ratio and its limits
mutate(df, across(c(Estimate, lwr, upr), exp))

### Max Decrease in LDL-C

In [None]:
# Adjusted Cox regression among statin users:
# hazard of developing NOD as a function of the maximum decrease in LDL-C
get_cox_statin(df = filter(ldl_df, group == "user"), statin = "max_decrease_ldl")

In [None]:
# Effect of a 10% decrease in the maximum decrease in LDL-C, via a linear function of the coefficient

# Adjusted Cox regression model restricted to statin users
# NOTE(review): this model adjusts for pd_status, which get_cox_statin() does not -- confirm
model <- coxph(
    Surv(time, status) ~ max_decrease_ldl + index_age + sex_at_birth + population +
        low_hdl + high_tg + high_bmi + smoking_status + htn_status +
        pd_status + gd_status,
    data = ldl_df %>% filter(group == 'user')
)

# General linear hypothesis test for -0.1 times the max_decrease_ldl coefficient
base_sum <- summary(glht(model, linfct = "-0.1 * max_decrease_ldl = 0"))
base_sum

# Estimate and confidence limits on the log-hazard scale
df <- confint(base_sum)$confint %>% as_tibble()
df

# Exponentiate all three columns to obtain the hazard ratio and its limits
mutate(df, across(c(Estimate, lwr, upr), exp))

# Per Protocol Subset

**Objective**: The purpose of this section is to run survival analyses on the per protocol subset.

In [None]:
# Per-protocol subset: hazard of developing NOD in statin users vs. non-users.
# Draw the Kaplan-Meier curves, then tabulate the adjusted Cox regression results.
plot_surv(df = per_protocol_df, main = "Per Protocol")
get_cox(df = per_protocol_df)

# Self-Identified White Subset

**Objective**: The purpose of this section is to run survival analyses on the self-identified white subset.

In [None]:
# Self-identified white subset: hazard of developing NOD in statin users vs. non-users.
# Draw the Kaplan-Meier curves, then tabulate the adjusted Cox regression results.
plot_surv(df = white_df, main = "White")
get_cox(df = white_df)