In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table) 
}))


In [2]:
# Evaluate the path definitions file in this session.
# NOTE(review): the file has a .sh extension but is read with R's source(),
# so it presumably contains only statements that are also valid R -- confirm.
source("paths.sh")

# Trait identifiers used to subset the baseline results table.
GBE_IDs <- c(
    "INI50", "INI21001",
    "HC269", "HC382"
)



In [3]:
# Baseline evaluation results (2021-10-28 freeze):
#   1. strip the '#' from any column name that starts with it,
#   2. drop the trait-category and WBtest_* columns,
#   3. keep only the traits listed in GBE_IDs.
base_results_df <- fread("../20211028_freeze/PRSmap.eval.tsv.gz") %>%
    rename_with(
        function(col_name) str_replace(col_name, "#", ""),
        starts_with("#")
    ) %>%
    select(-c(
        trait_category, trait_category_plot,
        WBtest_P, WBtest_BYq, WBtest_is_significant
    )) %>%
    filter(trait %in% GBE_IDs)


In [4]:
# Additional evaluation results read from the working directory: strip the
# '#' from header names that start with it and drop the `predictors` and
# `response` columns.
additional_results_df <- fread("predictive_performance.tsv") %>%
    rename_with(
        function(col_name) str_replace(col_name, "#", ""),
        starts_with("#")
    ) %>%
    select(-c(predictors, response))


In [48]:
# Stack the two result sets (labelled by `genotype`) and attach the trait
# annotation (trait_name, family) taken from the baseline table.
# Both inputs are restricted to PRS-model rows with metric r2 or auc, and the
# 'train_val' split is excluded.
combined_df <- base_results_df %>%
    select(trait, trait_name, family) %>%
    unique() %>%
    left_join(
        bind_rows(
            # baseline results: annotation columns are dropped here because
            # they are re-attached by the join
            base_results_df %>%
                select(-trait_name, -family) %>%
                filter(
                    split != "train_val" &
                        model == "PRS" &
                        metric %in% c("r2", "auc")
                ) %>%
                mutate(genotype = "without imputed variants"),
            # additional results
            additional_results_df %>%
                filter(
                    split != "train_val" &
                        model == "PRS" &
                        metric %in% c("r2", "auc")
                ) %>%
                mutate(genotype = "with imputed variants")
        ),
        by = "trait"
    )


In [49]:
# Build the lookup table that maps raw `split` codes to display labels and a
# plotting order.
#
# Takes no arguments.
#
# Returns a data.frame with one row per split and three columns:
#   split       - split code (character)
#   split_plot  - display label for the split (character)
#   split_order - -1, -2, ..., in listing order (numeric); the values are
#                 negative so that an ascending sort (e.g. via reorder())
#                 puts the first listed split last
get_split_order_df <- function(){
    split <- c(
        'train', 'val', 'test', 'non_british_white',
        's_asian', 'e_asian', 'african'
    )
    split_plot <- c(
        'Training (WB)', 'Validation (WB)', 'Test (WB)',
        'Non-British white', 'South Asian', 'East Asian', 'African'
    )
    # Guard against the two vectors drifting apart when one is edited.
    stopifnot(length(split) == length(split_plot))
    data.frame(
        split = split,
        split_plot = split_plot,
        # `-1 *` keeps the column double, and seq_along() is safe for length 0
        split_order = -1 * seq_along(split),
        # spelled out: `F` is an ordinary variable that can be reassigned
        stringsAsFactors = FALSE
    )
}


In [56]:
# One row per trait: the trait annotation plus, for the 'test' split only,
# the `eval` value and `n_variables` for each genotype label, spread into
# columns prefixed 'metric_' and 'n_variables_' respectively.
summary_table_df <- base_results_df %>%
    select(trait, trait_name, family) %>%
    unique() %>%
    left_join(
        # test-split `eval`, one column per genotype label
        combined_df %>%
            filter(split == "test") %>%
            mutate(genotype = paste0("metric_", genotype)) %>%
            select(trait, `eval`, genotype) %>%
            spread(key = genotype, value = `eval`),
        by = "trait"
    ) %>%
    left_join(
        # test-split `n_variables`, one column per genotype label
        combined_df %>%
            filter(split == "test") %>%
            mutate(genotype = paste0("n_variables_", genotype)) %>%
            select(trait, n_variables, genotype) %>%
            spread(key = genotype, value = n_variables),
        by = "trait"
    )


In [60]:
# Write the summary table as an unquoted, tab-separated file. The `trait`
# column is renamed to '#trait' in the header, and missing values are
# written as the literal string NA.
summary_table_df %>%
rename('#trait' = 'trait') %>%
# `FALSE` rather than `F`: `F` is an ordinary variable that can be reassigned
fwrite('imputed_variants_performance.tsv', sep='\t', na = "NA", quote=FALSE)


In [45]:
# Bar plot of AUC per split for the rows of combined_df with metric 'auc',
# with one bar per genotype label and error bars from l_eval / u_eval.
# Splits are labelled and ordered using get_split_order_df().
p_binomial <- combined_df %>%
filter(metric == 'auc') %>%
left_join(get_split_order_df(), by='split') %>%
ggplot(aes(x = reorder(split_plot, split_order), y=`eval`, color=genotype, fill=genotype)) +
# reference line at AUC = 0.5
geom_hline(yintercept = .5, color='black') +
geom_col(position = "dodge2") +
geom_errorbar(aes(ymin = l_eval, ymax = u_eval), color='black', alpha=1, position = "dodge2") +
# flip so splits run down the page; the value axis starts at 0.45
coord_flip(ylim = c(.45, NA)) +
theme_bw(base_size=14) +
theme(legend.position='bottom') +
labs(
    title = 'Binary traits (Binomial model)',
    x = 'Hold-out test set',
    y = 'Predictive performance of snpnet PRS model (AUC)',
    fill='Genotype data',
    color='Genotype data'
) +
# NOTE(review): facets use the `trait` ID here, whereas p_gaussian facets on
# `trait_name` -- confirm this difference is intentional.
facet_wrap(~ trait, nrow = 1)

In [46]:
# Bar plot of R^2 per split for the rows of combined_df with metric 'r2',
# with one bar per genotype label and error bars from l_eval / u_eval.
# Splits are labelled and ordered using get_split_order_df().
p_gaussian <- combined_df %>%
filter(metric == 'r2') %>%
left_join(get_split_order_df(), by='split') %>%
ggplot(aes(x = reorder(split_plot, split_order), y=`eval`, color=genotype, fill=genotype)) +
geom_col(position = "dodge2") +
geom_errorbar(aes(ymin = l_eval, ymax = u_eval), color='black', alpha=1, position = "dodge2") +
# flip so splits run down the page
coord_flip() +
theme_bw(base_size=14) +
theme(legend.position='bottom') +
labs(
    title = 'Quantitative traits (Gaussian model)',
    x = 'Hold-out test set',
    y = latex2exp::TeX('Predictive performance of snpnet PRS model (\\textit{R}$^2$)'),
    fill='Genotype data',
    color='Genotype data'
) +
facet_wrap(~ trait_name, nrow = 1)


In [47]:
# Save the two panels stacked in a single column (binary traits on top),
# once as PNG and once as PDF, at 8 x 8.
for (ext in c("png", "pdf")) {
    ggsave(
        filename = sprintf("imputed_variants_performance.%s", ext),
        plot = gridExtra::arrangeGrob(p_binomial, p_gaussian, ncol = 1),
        width = 8,
        height = 8
    )
}
