In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# ---- inputs and parameters ----

# root directory of the analysis; all input paths below are built from it
data_d <- "/oak/stanford/groups/mrivas/users/gmcinnes/statin_risk_score"

# phenotype table
phe_f <- file.path(data_d, "statin_PRS_all.phe")

# per-phenotype path templates: the "__PHENOTYPE__" placeholder is replaced
# with the phenotype name inside eval_phe()
sscore_f <- file.path(
    data_d, "__PHENOTYPE__", "__PHENOTYPE__.sscore.zst"
)
snpnet_BETAs_f <- file.path(
    data_d, "__PHENOTYPE__", "snpnet.tsv"
)
snpnet_covar_BETAs_f <- file.path(
    data_d, "__PHENOTYPE__", "snpnet.covars.tsv"
)

# covariates: age, sex and PC1..PC10
covariates <- c("age", "sex", sprintf("PC%d", 1:10))

# splits for which eval_phe() refits the covariate-only model
refit_split_strs <- c("non_british_white", "african", "s_asian", "e_asian")

# ---- output ----
eval_f <- "3_performance_eval.tsv"


In [3]:
#' Evaluate the predictive performance of a snpnet PRS for one phenotype.
#'
#' Reads the predicted genotype and covariate scores for `phenotype`, joins
#' them to the phenotype table, optionally refits the covariate-only score
#' within selected splits, and returns one row per split with the metric of
#' each model plus case/control counts.
#'
#' NOTE(review): `read_predicted_scores()`, `compute_covar_score()`,
#' `build_eval_df()` and `read_BETAs()` are not defined in this file; they are
#' presumably provided by the sourced snpnet helper script. What they return
#' is assumed from how their results are used below -- confirm.
#'
#' @param phe_df Data frame with columns `FID`, `IID`, `split`, the
#'   covariates, and a column named after `phenotype`.
#' @param phenotype Name of the phenotype column; also substituted for the
#'   `__PHENOTYPE__` placeholder in the three path templates.
#' @param sscore_f Path template of the score file.
#' @param snpnet_BETAs_f Path template of the snpnet coefficient table; its
#'   number of rows is reported as `n_variables`.
#' @param snpnet_covar_BETAs_f Path template of the covariate coefficient
#'   table.
#' @param covariates Character vector of covariate column names.
#' @param family `'binomial'` selects the `'auc'` metric, anything else `'r2'`.
#'   The case/control tabulation below labels `phe == 2` as case and every
#'   other value as control, so it presumes a binary phenotype.
#' @param refit_split_strs Splits in which the covariate score is recomputed
#'   with `compute_covar_score()`. `NULL` (default) keeps the scores as read.
#'   Listed splits with no rows in the data are skipped.
#' @return A data frame with columns `phenotype_name`, `split`, `geno`,
#'   `covar`, `geno_covar`, `geno_delta`, `n_variables`, `case_n` and
#'   `control_n`, restricted to splits having both cases and controls.
eval_phe <- function(phe_df, phenotype, sscore_f, snpnet_BETAs_f, snpnet_covar_BETAs_f, covariates, family, refit_split_strs=NULL){
    # `family` is a scalar, so use if/else rather than the vectorized ifelse()
    metric.type <- if (family == 'binomial') 'auc' else 'r2'

    # fill the phenotype name into a path template
    fill_template <- function(path_template) {
        str_replace_all(path_template, '__PHENOTYPE__', phenotype)
    }

    # phenotype (renamed to `phe`), split label and covariates per individual
    phe_cols_df <- phe_df %>%
        rename(phe = all_of(phenotype)) %>%
        select(FID, IID, phe, split, all_of(covariates))

    # scores joined to the phenotype; rows with missing scores, split or
    # phenotype, or with the -9 phenotype code, are dropped
    phe_score_before_refit_df <- phe_df %>%
        read_predicted_scores(
            fill_template(sscore_f),
            fill_template(snpnet_covar_BETAs_f),
            covariates
        ) %>%
        drop_na(geno_score, covar_score) %>%
        inner_join(phe_cols_df, by = c('FID', 'IID')) %>%
        drop_na(split, phe) %>%
        filter(phe != -9)

    # Refit only within splits that actually have rows, so that no model is
    # fit on an empty data frame. A NULL `refit_split_strs` yields length 0.
    refit_splits <- refit_split_strs[
        refit_split_strs %in% phe_score_before_refit_df[['split']]
    ]

    if (length(refit_splits) == 0) {
        phe_score_df <- phe_score_before_refit_df %>%
            select(FID, IID, split, phe, geno_score, covar_score)
    } else {
        # refit covar models for non-WB populations
        refit_df <- refit_splits %>%
            lapply(function(split_str) {
                phe_score_before_refit_df %>%
                    filter(split == split_str) %>%
                    compute_covar_score('phe', covariates, family)
            }) %>%
            bind_rows() %>%
            select(FID, IID, covar_score)

        phe_score_df <- bind_rows(
            # the ones without refit
            phe_score_before_refit_df %>%
                filter(!split %in% refit_splits) %>%
                select(FID, IID, split, phe, geno_score, covar_score),

            # the ones from refit: the inner join keeps only refit individuals
            phe_score_before_refit_df %>%
                select(FID, IID, split, phe, geno_score) %>%
                inner_join(refit_df, by = c('FID', 'IID'))
        )
    }

    # case (phe == 2) and control counts per split; a split lacking either
    # class has NA in that column and is removed by the filter
    split_cnt_df <- phe_score_df %>%
        count(split, phe) %>%
        mutate(phe = if_else(phe == 2, 'case_n', 'control_n')) %>%
        pivot_wider(names_from = phe, values_from = n) %>%
        filter(control_n > 0, case_n > 0) %>%
        arrange(desc(case_n))

    # number of rows in the snpnet coefficient table
    n_selected <- nrow(read_BETAs(fill_template(snpnet_BETAs_f)))

    phe_score_df %>%
        build_eval_df(pull(split_cnt_df, split), metric.type) %>%
        mutate(
            # gain of the full model over the covariate-only model
            geno_delta = geno_covar - covar,
            phenotype_name = phenotype,
            n_variables = n_selected
        ) %>%
        left_join(split_cnt_df, by = 'split') %>%
        select(phenotype_name, split, geno, covar, geno_covar, geno_delta, n_variables, case_n, control_n)
}


In [4]:
# Load helper functions from the snpnet repository.
# NOTE(review): read_predicted_scores(), compute_covar_score(), build_eval_df()
# and read_BETAs(), which eval_phe() calls, are not defined in this notebook and
# are presumably provided by this script -- confirm.
source('/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/snpnet/helpers/snpnet_misc.R')


In [5]:
# Read the phenotype table as a plain data.frame, keeping FID/IID as character.
# The `split` label is then rewritten: rows with a missing `split` take their
# `population` value, all other rows get a 'WB:' prefix (e.g. 'WB:train').
phe_df <- phe_f %>%
    fread(
        colClasses = c('FID' = 'character', 'IID' = 'character'),
        # spelled out: `F` is an ordinary variable that can be reassigned
        data.table = FALSE
    ) %>%
    mutate(
        split = if_else(is.na(split), population, paste('WB', split, sep = ':'))
    )


In [6]:
# Evaluate both statin phenotypes with identical settings and stack the
# per-phenotype results (atorvastatin rows first, then simvastatin).
eval_df <- c(
    'atorvastatin_v_hc_and_Ostatin',
    'simvastatin_v_hc_and_Ostatin'
) %>%
    lapply(function(phenotype_name) {
        eval_phe(
            phe_df, phenotype_name,
            sscore_f, snpnet_BETAs_f, snpnet_covar_BETAs_f,
            covariates, 'binomial', refit_split_strs
        )
    }) %>%
    bind_rows()


In [7]:
# Write the evaluation table as tab-separated text. The first column is
# renamed so that the header line starts with '#'.
eval_df %>%
    rename('#phenotype_name' = 'phenotype_name') %>%
    # quote = FALSE spelled out: `F` is an ordinary variable that can be reassigned
    fwrite(eval_f, sep = '\t', na = "NA", quote = FALSE)


In [8]:
# Display order of the splits in the plots (1 = first).
# stringsAsFactors = FALSE keeps `split` a character column on R < 4.0 as
# well, so it joins to the character `split` column of eval_df without
# factor coercion.
plot_order <- data.frame(
    split = c('WB:train', 'WB:val', 'WB:test', 'non_british_white', 'african', 's_asian', 'e_asian'),
    order = 1:7,
    stringsAsFactors = FALSE
)

In [9]:
# Horizontal bar chart of geno_delta (full model minus covariate-only model)
# per split, one facet per phenotype. Splits are ordered through the `order`
# column joined from plot_order; a gray reference line marks zero.
p1_delta <- ggplot(
    data = left_join(eval_df, plot_order, by = 'split'),
    mapping = aes(
        x = reorder(as.factor(split), -order),
        y = geno_delta
    )
) +
    geom_bar(stat = "identity") +
    geom_hline(yintercept = 0, color = 'gray') +
    facet_wrap(~phenotype_name) +
    coord_flip() +
    theme_bw() +
    labs(
        title = 'Incremental predictive performance of snpnet PRS',
        x = 'training/validation/test split or population',
        y = 'incremental AUC (full model - covariate-only model)'
    )


In [10]:
# Horizontal bar chart of the covariate-only model's metric (`covar`) per
# split, one facet per phenotype. Splits are ordered through the `order`
# column joined from plot_order; a gray reference line marks zero.
p2_covar <- ggplot(
    data = left_join(eval_df, plot_order, by = 'split'),
    mapping = aes(
        x = reorder(as.factor(split), -order),
        y = covar
    )
) +
    geom_bar(stat = "identity") +
    geom_hline(yintercept = 0, color = 'gray') +
    facet_wrap(~phenotype_name) +
    coord_flip() +
    theme_bw() +
    labs(
        title = 'Predictive performance of covariate-only model',
        x = 'training/validation/test split or population',
        y = 'AUC of the covariate-only model'
    )


In [11]:
# Save both figures as 8 x 4 PNG files in the working directory.
ggsave(
    filename = '3_performance_eval.incremental_AUC.png',
    plot = p1_delta,
    width = 8,
    height = 4
)
ggsave(
    filename = '3_performance_eval.covar_AUC.png',
    plot = p2_covar,
    width = 8,
    height = 4
)
