# PRS map - summary plots


## Predictive performance vs. model size plot

- x-axis: number of variants in the PRS model (shown on a log10 scale),
- y-axis: the delta (incremental) predictive performance; points are colored by trait category, and each model family (gaussian, binomial, etc.) gets its own figure


## library, functions, and constants

In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# Load project paths. 'paths.sh' is read with R's source(), so it has to be
# parseable as R code (e.g. plain `name="value"` assignments).
# NOTE(review): presumably this is where cud4_d, snpnet_helper,
# traits_w_metrics_f and eval_fullwDelta_f get defined — confirm.
source('paths.sh')
# Load the development packages from their source trees (the cell output
# reports them as 'cud4' and 'snpnet'), then source the helper script itself.
devtools::load_all(cud4_d)
devtools::load_all(dirname(dirname(snpnet_helper)))
source(snpnet_helper)


[36mℹ[39m Loading [34m[34mcud4[34m[39m

[36mℹ[39m Loading [34m[34msnpnet[34m[39m



In [3]:
# Read the trait-level metrics table and strip the '#' from any column name
# that starts with it.
traits_w_metrics_df <- rename_with(
    fread(traits_w_metrics_f),
    function(col_name) str_replace(col_name, '#', ''),
    starts_with("#")
)

# Same treatment for the long-format evaluation table (including the
# "delta" rows used throughout this notebook).
eval_long_df <- rename_with(
    fread(eval_fullwDelta_f),
    function(col_name) str_replace(col_name, '#', ''),
    starts_with("#")
)


## relationship between the two variables

In [4]:
#' Correlation test between the `value` and `n_variables` columns of a table.
#'
#' @param df Data frame holding the columns `value` and `n_variables`.
#' @param cor_test_method Forwarded to `cor.test()` as its `method` argument
#'   (e.g. "spearman").
#' @return The object returned by `cor.test()`.
cor_test_wrapper <- function(df, cor_test_method) {
    cor.test(
        x = df %>% pull(value),
        y = df %>% pull(n_variables),
        method = cor_test_method
    )
}


In [6]:
# Tally the rows per (family, metric) among the significant, test-split,
# "delta" entries; the result is printed as the cell output.
count(
    filter(
        eval_long_df,
        WBtest_is_significant, split == "test", model == "delta"
    ),
    family, metric
)

family,metric,n
<chr>,<chr>,<int>
binomial,auc,244
binomial,NagelkerkeR2,244
binomial,TjurR2,244
gaussian,r2,569


In [8]:
# One Spearman test (value vs. n_variables, via cor_test_wrapper) per metric,
# each on the significant test-split "delta" rows of that metric.
size_vs_delta_rho <- list()

for (metric_selected in c("r2", "auc", "NagelkerkeR2", "TjurR2")) {
    size_vs_delta_rho[[metric_selected]] <- cor_test_wrapper(
        filter(
            eval_long_df,
            WBtest_is_significant,
            split == "test",
            model == "delta",
            metric == metric_selected
        ),
        "spearman"
    )
}


“Cannot compute exact p-value with ties”
“Cannot compute exact p-value with ties”
“Cannot compute exact p-value with ties”
“Cannot compute exact p-value with ties”


In [50]:
# Same Spearman test for the r2 metric, restricted to traits whose category
# is not "Biomarkers"; stored under its own key next to the per-metric tests.
size_vs_delta_rho[["r2noBiomarkers"]] <- cor_test_wrapper(
    filter(
        eval_long_df,
        trait_category != "Biomarkers",
        WBtest_is_significant,
        split == "test",
        model == "delta",
        metric == "r2"
    ),
    "spearman"
)


“Cannot compute exact p-value with ties”


In [51]:
# Combine the per-(family, metric) row counts with the tidied correlation
# tests. Keys present only in size_vs_delta_rho (e.g. "r2noBiomarkers") are
# kept by the full join with NA for family and n.
size_vs_delta_rho_df <- full_join(
    count(
        filter(
            eval_long_df,
            WBtest_is_significant, split == "test", model == "delta"
        ),
        family, metric
    ),
    bind_rows(lapply(names(size_vs_delta_rho), function(rho_name) {
        tidied <- as.data.frame(broom::tidy(size_vs_delta_rho[[rho_name]]))
        mutate(tidied, metric = rho_name)
    })),
    by = "metric"
)


In [52]:
# Display the joined table (row counts plus correlation test statistics).
size_vs_delta_rho_df


family,metric,n,estimate,statistic,p.value,method,alternative
<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
binomial,auc,244.0,0.02992588,2348636.8,0.6418162,Spearman's rank correlation rho,two.sided
binomial,NagelkerkeR2,244.0,0.21002801,1912593.3,0.000964134,Spearman's rank correlation rho,two.sided
binomial,TjurR2,244.0,0.60544347,955256.9,8.525137999999999e-26,Spearman's rank correlation rho,two.sided
gaussian,r2,569.0,0.61039907,11962010.7,2.210457e-59,Spearman's rank correlation rho,two.sided
,r2noBiomarkers,,0.68493676,8040930.7,2.566473e-75,Spearman's rank correlation rho,two.sided


### list of traits to annotate in the plot

In [13]:
# Traits to annotate in the scatter plots. `plot_label` is the text drawn
# next to the point (with manual line breaks) and `trait` is the join key
# matched against the `trait` column of the evaluation table. The two vectors
# are positional pairs, so entries must stay in the same order in both.
p_delta_vs_size_labels <- data.frame(
    plot_label = c(
        # Binary traits
        'Iritis',
        'Porphyrin and\nbilirubin metabolism\ndisorders (TTE)',
        'Hair color (red)',
        'Hair color (blonde)',
        'Hair color (dark brown)',
        'Celiac disease',
        'Genetic\nhematological\ndisorder',
        'Type 1 diabetes',
        'Ankylosing\nspondylitis',
        'Other coagulation\ndefects (TTE)',
        'Intestinal malabsorption (TTE)',
#         'Never Smoker',
        # Quantitative traits
        'Lipoprotein A',
        'Total bilirubin',
        'Direct bilirubin',
        'Mean platelet volume',
        'Apolipoprotein B',
        'EBNA-1 antigen for\nEpstein-Barr Virus',
        'Standing\nheight',
        'Heel bone mineral density'
    ),
    trait = c(
        # Binary traits
        'HC26',
        'HC702',
        'BIN_FC2001747',
        'BIN_FC1001747',
        'BIN_FC4001747',
        'HC303',
        'HC413',
        'HC337',
        'HC422',
        'HC624',
        'HC1132',
#         'BIN_FC10020116',
        # Quantitative traits
        'INI30790',
        'INI30840',
        'INI30660',
        'INI30100',
        'INI30640',
        'INI23004',
        'INI50',
        'INI3148'
    ),
    # Spelled out: `F` is an ordinary variable that can be reassigned.
    stringsAsFactors = FALSE
)


## plots

In [21]:
#' Scatter plot of `value` against `n_variables` for one metric.
#'
#' Keeps the significant, test-split, "delta" rows of the requested metric,
#' attaches the optional annotation labels by `trait` (rows without a label
#' get an empty `plot_label`), and draws the points on a log10 x-axis colored
#' by `trait_category_plot`.
#'
#' @param eval_long_df Long-format evaluation table.
#' @param p_delta_vs_size_labels Data frame with `trait` and `plot_label`.
#' @param metric_selected Value of the `metric` column to plot.
#' @return A ggplot object.
plot_delta_vs_size <- function(eval_long_df, p_delta_vs_size_labels, metric_selected){
    plot_df <- eval_long_df %>%
        filter(
            WBtest_is_significant,
            split == "test",
            model == "delta",
            metric == metric_selected
        ) %>%
        left_join(p_delta_vs_size_labels, by='trait') %>%
        replace_na(list('plot_label'=''))

    # Legend keys are drawn fully opaque and without the text label.
    category_legend <- guide_legend(
        title = 'Trait category',
        override.aes = aes(label = "", alpha=1),
        ncol=2
    )

    ggplot(
        plot_df,
        aes(x  = n_variables, y = value, color=trait_category_plot, label=trait_label)
    ) +
        geom_point(alpha=.5) +
        scale_x_continuous(trans='log10') +
        theme_bw(base_size = 16) +
        labs(x = 'Number of genetic variants', color = 'Trait category') +
        theme(legend.position = 'bottom') +
        guides(color = category_legend)
}


In [39]:
#' Add the Spearman's rho estimate and its p-value as text to a plot.
#'
#' Two text annotations are placed at x = 1, left-aligned, at the heights
#' given by `ypos`; the labels are plotmath strings (`parse = TRUE`).
#'
#' @param plot_obj Plot to annotate (combined with the annotations via `+`).
#' @param size_vs_delta_rho_df Table with columns `metric`, `estimate` and
#'   `p.value`.
#' @param metric_selected Value of `metric` identifying the row to report.
#' @param ypos Length-2 numeric: y positions of the rho line and the p-value
#'   line, in that order.
#' @return `plot_obj` with the two annotation layers added.
plot_delta_vs_size_annotate_rho <- function(plot_obj, size_vs_delta_rho_df, metric_selected, ypos = c(.35, .32)){
    # Look the statistics row up once; which() also drops NA comparisons.
    row_idx <- which(size_vs_delta_rho_df[["metric"]] == metric_selected)
    if (length(row_idx) != 1L) {
        # Zero rows would give an empty label and several rows would overplot
        # multiple labels, so fail early with an explicit message instead.
        stop(
            sprintf(
                "expected exactly one row with metric == '%s' in size_vs_delta_rho_df, found %d",
                metric_selected, length(row_idx)
            ),
            call. = FALSE
        )
    }
    rho_estimate <- size_vs_delta_rho_df[["estimate"]][row_idx]
    rho_p_value <- size_vs_delta_rho_df[["p.value"]][row_idx]

    plot_obj +
    annotate(
        geom="text", x = 1, y = ypos[1], color="black",
        hjust = 0, parse = TRUE, size = 7,
        label=sprintf(
            "\"Spearman's\" ~ rho == %0.2f",
            round(rho_estimate, 2)
        )
    ) +
    annotate(
        geom="text", x = 1, y = ypos[2],color="black",
        hjust = 0, parse = TRUE, size = 7,
        label = sprintf(
            "\"(p-value: \" * %.1e * \")\"",
            rho_p_value
        )
    )
}


In [40]:
# Two-panel summary figure: binary traits (incremental Nagelkerke's R^2) on
# the left and quantitative traits (incremental R^2) on the right, each with
# the Spearman annotation and the repelled trait labels.

# ggsave() writes into "plots/"; make sure it exists on a fresh checkout.
dir.create("plots", showWarnings = FALSE, recursive = TRUE)

# The panels do not depend on the output format, so build them once instead
# of once per extension.
# NOTE(review): this y label escapes \textit with a doubled backslash while
# the Gaussian label below uses a single one — kept as-is, confirm rendering.
size_vs_delta_panel_binomial <- (
    eval_long_df %>%
    filter(family == 'binomial') %>%
    plot_delta_vs_size(p_delta_vs_size_labels, "NagelkerkeR2") +
    labs(
        title = 'Binary traits (Binomial model)',
        y = latex2exp::TeX("Incremental Nagelkerke's $\\\\textit{R}^{2}$")
    )
) %>%
plot_delta_vs_size_annotate_rho(size_vs_delta_rho_df, "NagelkerkeR2", ypos = c(.58, .53)) +
ggrepel::geom_text_repel(force=20, mapping = aes(label = plot_label))

size_vs_delta_panel_gaussian <- (
    eval_long_df %>%
    filter(family == 'gaussian') %>%
    plot_delta_vs_size(p_delta_vs_size_labels, "r2") +
    labs(
        title = 'Quantitative traits (Gaussian model)',
        y = latex2exp::TeX('Incremental \\textit{R}$^2$')
    )
) %>%
plot_delta_vs_size_annotate_rho(size_vs_delta_rho_df, "r2")+
ggrepel::geom_text_repel(force=20, mapping = aes(label = plot_label))

for(ext in c('png', 'pdf')){
    ggsave(
        file.path("plots", sprintf('size_vs_delta.%s', ext)),
        # The grob is still assembled inside the ggsave() call, as before.
        gridExtra::arrangeGrob(
            size_vs_delta_panel_binomial,
            size_vs_delta_panel_gaussian,
            ncol=2
        ),
        width=16, height=8
    )
}


“ggrepel: 1 unlabeled data points (too many overlaps). Consider increasing max.overlaps”
“ggrepel: 1 unlabeled data points (too many overlaps). Consider increasing max.overlaps”


In [53]:
# One stand-alone figure per metric, each written as PNG and PDF to
# plots/size_vs_delta_<name>.<ext>, where <name> is the list key below.

# ggsave() writes into "plots/"; make sure it exists on a fresh checkout.
dir.create("plots", showWarnings = FALSE, recursive = TRUE)

# The figures do not depend on the output format, so each one is built once
# and reused for both extensions. List order matches the original save order.
# NOTE(review): the Nagelkerke/Tjur y labels escape \textit with a doubled
# backslash while the Gaussian labels use a single one — kept as-is, confirm
# rendering.
size_vs_delta_plots <- list(
    # Nagelkerke's pseudo-R2
    NagelkerkeR2 = (
        eval_long_df %>%
        filter(family == 'binomial') %>%
        plot_delta_vs_size(p_delta_vs_size_labels, "NagelkerkeR2") +
        labs(
            title = 'Binary traits (Binomial model)',
            y = latex2exp::TeX("Incremental Nagelkerke's $\\\\textit{R}^{2}$")
        )
    ) %>%
    plot_delta_vs_size_annotate_rho(size_vs_delta_rho_df, "NagelkerkeR2", ypos = c(.58, .53)) +
    ggrepel::geom_text_repel(force=20, mapping = aes(label = plot_label)),

    # quantitative traits (R2)
    gaussian = (
        eval_long_df %>%
        filter(family == 'gaussian') %>%
        plot_delta_vs_size(p_delta_vs_size_labels, "r2") +
        labs(
            title = 'Quantitative traits (Gaussian model)',
            y = latex2exp::TeX('Incremental \\textit{R}$^2$')
        )
    ) %>%
    plot_delta_vs_size_annotate_rho(size_vs_delta_rho_df, "r2")+
    ggrepel::geom_text_repel(force=20, mapping = aes(label = plot_label)),

    # quantitative traits (R2), non-biomarker traits only
    gaussian_noBiomarkers = (
        eval_long_df %>%
        filter(family == 'gaussian') %>%
        filter(trait_category != "Biomarkers") %>%
        plot_delta_vs_size(p_delta_vs_size_labels, "r2") +
        labs(
            title = 'Quantitative traits (Gaussian model)\n(non-biomarker traits only)',
            y = latex2exp::TeX('Incremental \\textit{R}$^2$')
        )
    ) %>%
    plot_delta_vs_size_annotate_rho(size_vs_delta_rho_df, "r2noBiomarkers")+
    ggrepel::geom_text_repel(force=20, mapping = aes(label = plot_label)),

    # binomial traits (AUC)
    AUC = (
        eval_long_df %>%
        filter(family == 'binomial') %>%
        plot_delta_vs_size(p_delta_vs_size_labels, "auc") +
        labs(
            title = 'Binary traits (Binomial model)',
            y = latex2exp::TeX("Incremental AUC")
        )
    ) %>%
    plot_delta_vs_size_annotate_rho(size_vs_delta_rho_df, "auc", ypos = c(.38, .35)) +
    ggrepel::geom_text_repel(force=20, mapping = aes(label = plot_label)),

    # Tjur's pseudo-R2
    TjurR2 = (
        eval_long_df %>%
        filter(family == 'binomial') %>%
        plot_delta_vs_size(p_delta_vs_size_labels, "TjurR2") +
        labs(
            title = 'Binary traits (Binomial model)',
            y = latex2exp::TeX("Incremental Tjur's $\\\\textit{R}^{2}$")
        )
    ) %>%
    plot_delta_vs_size_annotate_rho(size_vs_delta_rho_df, "TjurR2", ypos = c(.48, .43)) +
    ggrepel::geom_text_repel(force=20, mapping = aes(label = plot_label))
)

for(ext in c('png', 'pdf')){
    for(plot_name in names(size_vs_delta_plots)){
        ggsave(
            file.path("plots", sprintf('size_vs_delta_%s.%s', plot_name, ext)),
            size_vs_delta_plots[[plot_name]],
            width=8, height=9
        )
    }
}


“ggrepel: 1 unlabeled data points (too many overlaps). Consider increasing max.overlaps”
“ggrepel: 5 unlabeled data points (too many overlaps). Consider increasing max.overlaps”
“ggrepel: 1 unlabeled data points (too many overlaps). Consider increasing max.overlaps”
“ggrepel: 5 unlabeled data points (too many overlaps). Consider increasing max.overlaps”


### Export the plots as interactive plotly widgets
The exported HTML files are hosted on AFS (/afs/ir.stanford.edu/users/y/t/ytanigaw/WWW/PRSmap):
- http://web.stanford.edu/~ytanigaw/PRSmap/



In [44]:
# Interactive versions of the two main scatter plots. These skip the rho
# annotation and the repelled labels and use plain-text axis titles instead
# of TeX.

# saveWidget() writes into "ggplotly/"; make sure it exists on a fresh
# checkout.
dir.create('ggplotly', showWarnings = FALSE, recursive = TRUE)

# Binary traits: incremental Nagelkerke's R^2
htmlwidgets::saveWidget(
    (
        eval_long_df %>%
        filter(family == 'binomial') %>%
        plot_delta_vs_size(p_delta_vs_size_labels, "NagelkerkeR2") +
        labs(
            title = 'Binary traits (Binomial model)',
            y = "Incremental Nagelkerke's R^2"
        )
    ) %>%
    plotly::ggplotly(),
    'ggplotly/size_vs_delta_binomial.html'
)

# Quantitative traits: incremental R^2
htmlwidgets::saveWidget(
    (
        eval_long_df %>%
        filter(family == 'gaussian') %>%
        plot_delta_vs_size(p_delta_vs_size_labels, "r2") +
        labs(
            title = 'Quantitative traits (Gaussian model)',
            y = 'Incremental R^2'
        )
    ) %>%
    plotly::ggplotly(),
    'ggplotly/size_vs_delta_gaussian.html'
)
