In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
source('0_parameters.sh')


In [3]:
devtools::load_all('/oak/stanford/groups/mrivas/users/ytanigaw/repos/yk-tanigawa/cud4')


Loading cud4



In [4]:
# Read the trait annotation table from the data directory and rename the
# '#GBE_category' column to 'GBE_category'.
trait_info_df <- rename(
    fread(file.path(data_d, 'trait_info.tsv')),
    GBE_category = '#GBE_category'
)


In [5]:
# Show the column names of the trait annotation table.
colnames(trait_info_df)

In [6]:
# Read the blacklist file as a header-less table and keep it as a plain
# vector. pull() with no column argument returns the last column of the table.
# `header = FALSE` is spelled out: the earlier `head=F` relied on partial
# argument matching and on `F`, which is an ordinary (reassignable) variable.
mrp_blacklist <- mrp_blacklist_f %>%
    fread(header = FALSE) %>%
    pull()


In [7]:
# Read the PRS p-value table from the results directory and rename the
# '#phe' column to 'phe'.
PRS_pval_df <- rename(
    fread(file.path(res_d, PRS_pval_f)),
    phe = '#phe'
)


In [34]:
# Assemble the full evaluation table:
#   1. read the snpnet evaluation results (GBE_ID forced to character),
#   2. keep traits present in trait_info_df and absent from mrp_blacklist,
#   3. attach the p-value of the 'PRS' term as WB_test_P,
#   4. attach the trait category and short name.
# Both joins are left joins keyed on GBE_ID, so filtering on GBE_ID before
# the joins yields the same rows as filtering afterwards.
eval_full_df <- file.path(data_d, 'snpnet.eval.2_refit.tsv') %>%
    fread(colClasses = c('#GBE_ID' = 'character')) %>%
    rename(GBE_ID = '#GBE_ID') %>%
    filter(
        GBE_ID %in% trait_info_df$GBE_ID,
        !(GBE_ID %in% mrp_blacklist)
    ) %>%
    left_join(
        PRS_pval_df %>%
            filter(variable == 'PRS') %>%
            select(phe, WB_test_P = P),
        by = c('GBE_ID' = 'phe')
    ) %>%
    left_join(
        select(trait_info_df, GBE_ID, GBE_category, GBE_short_name),
        by = 'GBE_ID'
    )


In [35]:
# Number of traits per model family among the 'test' split rows.
count(filter(eval_full_df, split == 'test'), family)

Unnamed: 0_level_0,family,n
Unnamed: 0_level_1,<chr>,<int>
1,binomial,708
2,gaussian,909


In [36]:
# Number of traits per model family in the 'test' split whose WB_test_P is
# below 0.05 / 2000.
# NOTE(review): 0.05 / 2000 looks like a multiple-testing-corrected cutoff;
# confirm where the 2000 comes from.
eval_full_df %>%
    filter(split == 'test', WB_test_P < 0.05 / 2000) %>%
    count(family)

Unnamed: 0_level_0,family,n
Unnamed: 0_level_1,<chr>,<int>
1,binomial,242
2,gaussian,186


In [37]:
# Keep only rows (all splits) whose WB_test_P is below 0.05 / 2000; rows with
# a missing WB_test_P are dropped by filter().
eval_df <- filter(eval_full_df, WB_test_P < 0.05 / 2000)


In [38]:
# Spot check: 'test' split rows for a handful of specific GBE_IDs.
eval_df %>%
    filter(
        split == 'test',
        GBE_ID %in% c(
            'BIN_FC20020488',
            'QT_FC10021057', 'QT_FC21057',
            'QT_FC10003476', 'QT_FC1003476',
            'BIN3571', 'BIN_FC10003571'
        )
    )

GBE_ID,split,geno,covar,geno_covar,geno_delta,n_variables,family,n,case_n,control_n,WB_test_P,GBE_category,GBE_short_name
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<int>,<int>,<int>,<dbl>,<chr>,<chr>


In [39]:
# Binomial traits in the 'test' split, tallied by GBE category and listed
# from the most to the least frequent category.
eval_df %>%
    filter(family == 'binomial', split == 'test') %>%
    count(GBE_category) %>%
    arrange(desc(n))


GBE_category,n
<chr>,<int>
Disease_outcome,138
Lifestyle_and_environment,35
Health_and_medical_history,31
cancer,9
Psychosocial_factors,9
Family_history,7
Others,4
Work_environment,3
Blood_assays,2
Diet_by_24-hour_recall,2


## Predictive performance vs. model size plot

- x-axis: number of variants in the PRS model (log10 scale)
- y-axis: the delta (incremental) predictive performance, distinguished by model family (gaussian, binomial, etc.), e.g. by color coding


In [40]:
# Scatter plot for binomial models in the 'test' split: geno_delta against
# the number of variables on a log10 x-axis, with a red reference line at 0.05.
p_geno_delta_vs_size_binomial <- ggplot(
    filter(eval_df, split == 'test', family == 'binomial'),
    aes(x = n_variables, y = geno_delta)
) +
    geom_hline(yintercept = 0.05, color = 'red') +
    geom_point() +
    scale_x_continuous(trans = 'log10') +
    theme_bw(base_size = 16) +
    labs(
        title = 'Binomial models',
        x = 'Number of genetic variants',
        y = 'Incremental AUC'
    )


In [42]:
# Scatter plot for gaussian models in the 'test' split: geno_delta against
# the number of variables on a log10 x-axis, with a red reference line at 0.05.
p_geno_delta_vs_size_gaussian <- ggplot(
    filter(eval_df, split == 'test', family == 'gaussian'),
    aes(x = n_variables, y = geno_delta)
) +
    geom_hline(yintercept = 0.05, color = 'red') +
    geom_point() +
    scale_x_continuous(trans = 'log10') +
    theme_bw(base_size = 16) +
    labs(
        title = 'Gaussian models',
        x = 'Number of genetic variants',
        y = latex2exp::TeX('Incremental r^2')
    )


In [50]:
# Write the binomial and gaussian panels side by side, once per output format.
for (ext in c('png', 'pdf')) {
    ggsave(
        filename = paste0('test_set_performance_vs_size', '.', ext),
        plot = gridExtra::arrangeGrob(
            p_geno_delta_vs_size_binomial,
            p_geno_delta_vs_size_gaussian,
            ncol = 2
        ),
        width = 16,
        height = 8
    )
}


## Trans-ethnic transferability

Violin plot of the predictive performance in each population split, expressed as a fraction of the test-split performance in WB
(i.e., one violin each for non-British white, South Asian, East Asian, and African).

In [29]:
# Express a metric in each non-test split as a ratio to the 'test' split.
#
# Args:
#   eval_df:    long-format evaluation table with columns GBE_ID, split,
#               geno, covar, geno_covar, geno_delta, family (and the column
#               named by `metric_col`). After dropping 'train_val' it must
#               contain the splits 'test', 'non_british_white', 'african',
#               's_asian' and 'e_asian'; mutate() fails if one of these
#               columns is missing after reshaping.
#   metric_col: name (string) of the metric column to convert.
#
# Returns:
#   One row per (GBE_ID, split) for the non-test splits with a non-missing
#   ratio, holding the ratio in `<metric_col>_relative`, the display label
#   (`split_plot`) and ordering (`split_order`) of the split, and the
#   original metric columns and family of that (GBE_ID, split) row.
eval_to_transethnic_eval_df <- function(eval_df, metric_col){
    eval_df %>% filter(split != 'train_val') %>%
    rename(!!'metric_col__' := all_of(metric_col)) %>%
    select(GBE_ID, metric_col__, split) %>%
    # Wide format: one column per split, so each split can be divided by `test`.
    spread(split,  metric_col__) %>%
    mutate(
        non_british_white = non_british_white / test,
        african           = african / test,
        s_asian           = s_asian / test,
        e_asian           = e_asian / test
    ) %>% select(-test) %>%
    # Back to long format; traits lacking a value in a given split are dropped.
    gather(split, eval_relative, -GBE_ID) %>% drop_na() %>%
    left_join(data.frame(
        split = c('non_british_white', 's_asian', 'e_asian', 'african'),
        split_plot = c('Non-British white', 'South Asian', 'East Asian', 'African'),
        # FALSE rather than F: F is an ordinary variable and can be reassigned.
        split_order=1:4, stringsAsFactors=FALSE
    ), by='split') %>% 
    # Re-attach the raw (non-relative) metrics of each population split.
    left_join(
        eval_df %>% select(GBE_ID, split, geno, covar, geno_covar, geno_delta, family),
        by=c('GBE_ID', 'split')
    ) %>%
    rename(!!sprintf('%s_relative', metric_col) :=eval_relative)    
}


In [30]:
# Return the GBE_IDs whose metric in the 'test' split exceeds a threshold.
#
# Args:
#   eval_df:          evaluation table with columns GBE_ID, split and the
#                     column named by `metric_col`.
#   metric_col:       name (string) of the metric column to threshold.
#   metric_threshold: strict lower bound on the metric.
#
# Returns: vector of GBE_ID values, one per qualifying 'test' row.
get_filtered_GBE_IDs <- function(eval_df, metric_col, metric_threshold){
    test_rows <- filter(eval_df, split == 'test')
    passing_rows <- filter(test_rows, .data[[metric_col]] > metric_threshold)
    pull(passing_rows, GBE_ID)
}


In [31]:
# Violin plot (with jittered points) of a relative metric per ancestry group.
#
# Args:
#   transethnic_df: table with columns split_plot, split_order and
#                   `<metric_col>_relative` (the shape produced by
#                   eval_to_transethnic_eval_df()).
#   metric_col:     base metric name (string); also used as the y-axis label.
#
# Returns: a ggplot object.
plot_violin <- function(transethnic_df, metric_col){
    relative_col <- sprintf('%s_relative', metric_col)
    violin_df <- rename(transethnic_df, metric_col__ = all_of(relative_col))

    ggplot(
        violin_df,
        aes(x = reorder(split_plot, split_order), y = metric_col__)
    ) +
        # Gray reference line at a ratio of 1.
        geom_hline(yintercept = 1, color = 'gray') +
        geom_violin() +
        # Jitter horizontally only, so the plotted y values stay exact.
        geom_jitter(height = 0, width = 0.3, alpha = .2) +
        theme_bw(base_size = 16) +
        labs(x = 'Ancestry group in UK Biobank', y = metric_col)
}

In [32]:
# Scatter plot of a metric in each non-test split against its 'test' value.
#
# Args:
#   eval_df:    long-format evaluation table with columns GBE_ID, split and
#               the column named by `metric_col`; it must include the 'test'
#               split. Splits other than the four listed below receive no
#               display label, so callers should filter them out first.
#   metric_col: name (string) of the metric column; also the default y label.
#
# Returns: a ggplot object, with points colored by ancestry group.
plot_scatter <- function(eval_df, metric_col){
    # Display labels for the population splits. stringsAsFactors is set
    # explicitly (as in eval_to_transethnic_eval_df) so the join key stays a
    # character column on R versions whose data.frame() default is TRUE.
    split_labels <- data.frame(
        split = c('non_british_white', 's_asian', 'e_asian', 'african'),
        split_plot = c('Non-British white', 'South Asian', 'East Asian', 'African'),
        stringsAsFactors = FALSE
    )

    # One palette lookup instead of four; colors are matched to the labels
    # by position.
    # NOTE(review): assumes cud4_colors() returns a collection indexable by
    # these color names -- confirm against the cud4 package.
    cud_palette <- cud4_colors()
    split_colors <- setNames(
        cud_palette[c('gray', 'green', 'sky_blue', 'red')],
        split_labels$split_plot
    )

    eval_df %>%
    rename(!!'metric_col__' := all_of(metric_col)) %>%
    select(GBE_ID, metric_col__, split) %>%
    # Wide then long again, keeping `test` as its own column so every
    # population row carries the matching 'test' value.
    spread(split, metric_col__) %>% 
    gather(split, metric_col__, -GBE_ID, -test) %>% drop_na() %>%
    left_join(split_labels, by='split') %>%
    ggplot(aes(x = test, y = metric_col__, color=split_plot)) +
    geom_point(alpha = .3) + theme_bw(base_size = 16) + labs(
        x = 'Incremental predictive performance in WB',
        y = metric_col,
        color = 'Ancestry group in UK Biobank'
    ) + theme(legend.position = c(.3, .85)) +
    scale_color_manual(values=split_colors)
}


In [33]:
# Histogram of a metric with a red vertical line marking a threshold.
#
# Args:
#   eval_df:          table containing the column named by `metric_col`.
#   metric_col:       name (string) of the metric column; also the x label.
#   metric_threshold: x position of the vertical reference line.
#
# Returns: a ggplot object.
plot_histogram <- function(eval_df, metric_col, metric_threshold){
    hist_df <- rename(eval_df, metric_col__ = all_of(metric_col))

    ggplot(hist_df, aes(x = metric_col__)) +
        geom_vline(xintercept = metric_threshold, color = 'red') +
        geom_histogram(bins = 30) +
        theme_bw(base_size = 16) +
        labs(x = metric_col)
}


In [46]:
# Number of traits with a relative geno_delta, per model family and split.
count(
    eval_to_transethnic_eval_df(eval_df, 'geno_delta'),
    family, split
)

family,split,n
<chr>,<chr>,<int>
binomial,african,242
binomial,e_asian,227
binomial,non_british_white,242
binomial,s_asian,241
gaussian,african,186
gaussian,e_asian,186
gaussian,non_british_white,186
gaussian,s_asian,186


In [44]:
# Same tally as above, restricted to traits whose 'test' geno_delta exceeds 0.05.
eval_to_transethnic_eval_df(eval_df, 'geno_delta') %>%
    filter(GBE_ID %in% get_filtered_GBE_IDs(eval_df, 'geno_delta', 0.05)) %>%
    count(family, split)

family,split,n
<chr>,<chr>,<int>
binomial,african,59
binomial,e_asian,46
binomial,non_british_white,59
binomial,s_asian,59
gaussian,african,100
gaussian,e_asian,100
gaussian,non_british_white,100
gaussian,s_asian,100


In [48]:
# For each metric, save a two-panel (binomial | gaussian) violin plot of the
# metric relative to the 'test' split, restricted to traits whose 'test'
# value of that metric exceeds 0.05. Output file names are unchanged.
for(metric in c('geno_delta', 'geno', 'geno_covar', 'covar')){
    # The table does not depend on the output format: build it once per metric
    # rather than once per panel per format.
    violin_df <- eval_df %>%
        eval_to_transethnic_eval_df(metric) %>%
        filter(GBE_ID %in% get_filtered_GBE_IDs(eval_df, metric, 0.05))

    # Only geno_delta is an incremental measure; previously every metric was
    # labelled "Incremental ...". For the other metrics the metric name is put
    # in the (plain-text) title rather than in the TeX-processed axis label.
    # NOTE(review): assumes geno / covar / geno_covar are on the same AUC
    # (binomial) and r^2 (gaussian) scales as geno_delta -- confirm.
    is_incremental <- metric == 'geno_delta'
    label_prefix <- if (is_incremental) 'Incremental ' else ''
    title_suffix <- if (is_incremental) '' else sprintf(' (%s)', metric)

    violin_panels <- gridExtra::arrangeGrob(
        violin_df %>%
        filter(family == 'binomial') %>%
        plot_violin(metric) + labs(
            title = paste0('Binomial models', title_suffix),
            y = latex2exp::TeX(paste0(label_prefix, 'AUC relative to WB'))
        ),

        violin_df %>%
        filter(family == 'gaussian') %>%
        plot_violin(metric) + labs(
            title = paste0('Gaussian models', title_suffix),
            y = latex2exp::TeX(paste0(label_prefix, 'r^2 relative to WB'))
        ),

        ncol=2
    )

    for(ext in c('png', 'pdf')){ggsave(
        sprintf('%s.%s', sprintf('trans-ethnic_%s_violin', metric), ext),
        violin_panels,
        width=16, height=8
    )}
}


In [49]:
# Number of traits with a relative geno_delta, per model family and split
# (no threshold on the 'test' metric).
count(
    eval_to_transethnic_eval_df(eval_df, 'geno_delta'),
    family, split
)


family,split,n
<chr>,<chr>,<int>
binomial,african,242
binomial,e_asian,227
binomial,non_british_white,242
binomial,s_asian,241
gaussian,african,186
gaussian,e_asian,186
gaussian,non_british_white,186
gaussian,s_asian,186


In [51]:
metric <- 'geno_delta'
for(ext in c('png', 'pdf')){ggsave(
    sprintf('%s.%s', sprintf('trans_ethnic_%s_full_violin', metric), ext),
    gridExtra::arrangeGrob(
        eval_df %>% 
        eval_to_transethnic_eval_df(metric) %>%
        filter(family == 'binomial') %>%
        plot_violin(metric) + labs(
            title = 'Binomial models',
            y = latex2exp::TeX('Incremental AUC relative to WB') 
        ),

        eval_df %>% 
        eval_to_transethnic_eval_df(metric) %>%
        filter(family == 'gaussian') %>%
        plot_violin(metric) + labs(
            title = 'Gaussian models',
            y = latex2exp::TeX('Incremental r^2 relative to WB') 
        ),

        ncol=2
    ),
    width=16, height=8
)}    


In [52]:
# Two-panel (binomial | gaussian) scatter plot of the metric in each
# population split against its 'test' value, restricted to traits whose
# 'test' value exceeds 0.05, written once per output format.
metric <- 'geno_delta'
# The passing IDs depend only on eval_df and metric, so look them up once.
scatter_GBE_IDs <- get_filtered_GBE_IDs(eval_df, metric, 0.05)
for(ext in c('png', 'pdf')){ggsave(
    sprintf('%s.%s', sprintf('trans_ethnic_%s_scatter', metric), ext),
    gridExtra::arrangeGrob(
        eval_df %>% filter(split != 'train_val', family == 'binomial') %>%
        filter(GBE_ID %in% scatter_GBE_IDs) %>%
        # Use `metric` (not a hard-coded 'geno_delta') so the plotted column
        # always matches the column used for the file name and the filter.
        plot_scatter(metric) + labs(
            title = 'Binomial models',
            x = latex2exp::TeX('Incremental AUC in WB'),
            y = latex2exp::TeX('Trans-ethnic incremental AUC')
        ),

        eval_df %>% filter(split != 'train_val', family == 'gaussian') %>%
        filter(GBE_ID %in% scatter_GBE_IDs) %>%
        plot_scatter(metric) + labs(
            title = 'Gaussian models',
            x = latex2exp::TeX('Incremental r^2 in WB'),
            y = latex2exp::TeX('Trans-ethnic incremental r^2')
        ),
        
        ncol=2
    ),
    width=16, height=8
)}    


## Predictive performance

- Histogram of the delta (incremental) performance across traits (possibly split into binary and gaussian models)


In [53]:
# Two-panel (binomial | gaussian) histogram of geno_delta in the 'test'
# split with a red line at 0.05, written once per output format.
metric <- 'geno_delta'
for (ext in c('png', 'pdf')) {
    ggsave(
        filename = paste0('test_set_', metric, '.', ext),
        plot = gridExtra::arrangeGrob(
            plot_histogram(
                filter(eval_df, split == 'test', family == 'binomial'),
                metric, 0.05
            ) + labs(
                title = 'Binomial models',
                x = latex2exp::TeX('Incremental AUC')
            ),

            plot_histogram(
                filter(eval_df, split == 'test', family == 'gaussian'),
                metric, 0.05
            ) + labs(
                title = 'Gaussian models',
                x = latex2exp::TeX('Incremental r^2')
            ),

            ncol = 2
        ),
        width = 16,
        height = 8
    )
}
