# PRS map - format tables

Yosuke Tanigawa


## library, functions, and constants

In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# Load shared path definitions.
# NOTE(review): the file-path variables used below (GWAS_h2_f, eval_full_f,
# eval_fullwDelta_f, traits_w_metrics_f, PRS202110_d) are not defined in this
# notebook, so they are presumably assigned in this file - confirm.
source("paths.sh")

In [3]:
# GWAS h2 table (heritability)
# Read the table, then remove the first '#' from every column name that
# starts with '#'.
GWAS_h2_df <- fread(GWAS_h2_f) %>%
    rename_with(
        function(col_name) str_replace(col_name, '#', ''),
        starts_with("#")
    )


In [4]:
# PGS catalog ID mapping (read from a TSV in the working directory)
PGScatalog_IDs_df <- fread("PRSmap_PGScatalog_mapping.tsv")

In [5]:
# evaluation full results
# Read the table, then remove the first '#' from every column name that
# starts with '#'.
eval_full_df <- fread(eval_full_f) %>%
    rename_with(
        function(col_name) str_replace(col_name, '#', ''),
        starts_with("#")
    )


In [6]:
# Convert the evaluation results to a long format that includes a derived
# "delta" model:
#   1. keep the identifying columns and the evaluation value,
#   2. spread the per-model values into one column per model,
#   3. compute delta = full - covar,
#   4. stack the four model columns back into (model, value) pairs.
eval_long_without_trait_info_df <- eval_full_df %>%
    select(trait, split, model, metric, eval) %>%
    pivot_wider(names_from = "model", values_from = "eval") %>%
    mutate(delta = full - covar) %>%
    pivot_longer(
        cols = all_of(c("PRS", "covar", "full", "delta")),
        names_to = "model",
        values_to = "value"
    )


In [7]:
# Compile trait info into one data frame:
#   - test-split r2 / NagelkerkeR2 values, one pred_<model> column per model
#   - trait annotations and WBtest_* columns taken from the full results
#   - PGS catalog / EFO identifiers
#   - a display label of the form "<trait_name> (<trait>)"
traits_w_metrics_df <- eval_long_without_trait_info_df %>%
    filter(split == "test", metric %in% c("r2", "NagelkerkeR2")) %>%
    select(trait, model, value) %>%
    mutate(model = paste0("pred_", model)) %>%
    pivot_wider(names_from = "model", values_from = "value") %>%
    left_join(
        # trait-level columns, de-duplicated across the rows of the full table
        eval_full_df %>%
            select(all_of(c(
                'trait', 'trait_name', 'trait_category',
                'trait_category_plot', 'family',
                'n_variables', 'WBtest_P', 'WBtest_BYq',
                'WBtest_is_significant', 'WBtest_BHq'
            ))) %>%
            unique(),
        by = "trait"
    ) %>%
    left_join(
        PGScatalog_IDs_df %>%
            select(trait, PGScatalog_ID, EFO_ID, EFO_label),
        by = "trait"
    ) %>%
    mutate(trait_label = sprintf('%s (%s)', trait_name, trait)) %>%
    # fix the final column order
    select(all_of(c(
        'trait', 'trait_name', 'trait_category', 'trait_category_plot',
        'family',
        "pred_PRS", "pred_covar", "pred_full", "pred_delta",
        'n_variables', 'WBtest_P', 'WBtest_BYq',
        'WBtest_is_significant', 'WBtest_BHq', "trait_label",
        'PGScatalog_ID', "EFO_ID", "EFO_label"
    )))


In [8]:
# Attach trait characteristics to the long-format evaluation table.
# The pred_* metric columns are dropped before joining.
eval_long_df <- left_join(
    eval_long_without_trait_info_df,
    traits_w_metrics_df %>%
        select(-c(pred_PRS, pred_covar, pred_full, pred_delta)),
    by = "trait"
)


## write to files

In [10]:
# Write the long-format evaluation table (including delta) as a TSV.
# The key column is renamed to '#trait' so that the header line starts
# with '#'. `quote = FALSE` is spelled out because `F` can be reassigned.
eval_long_df %>%
    rename('#trait' = 'trait') %>%
    fwrite(eval_fullwDelta_f, sep = '\t', na = "NA", quote = FALSE)


In [11]:
# Write a second copy of the same table into PRS202110_d, keeping the
# file name of eval_fullwDelta_f.
# `quote = FALSE` is spelled out because `F` can be reassigned.
eval_long_df %>%
    rename('#trait' = 'trait') %>%
    fwrite(
        file.path(PRS202110_d, basename(eval_fullwDelta_f)),
        sep = '\t', na = "NA", quote = FALSE
    )


In [9]:
# Quick check of the number of rows and columns in the trait table.
dim(traits_w_metrics_df)

In [13]:
# Write the trait-level table as a TSV.
# The key column is renamed to '#trait' so that the header line starts
# with '#'. `quote = FALSE` is spelled out because `F` can be reassigned.
traits_w_metrics_df %>%
    rename('#trait' = 'trait') %>%
    fwrite(traits_w_metrics_f, sep = '\t', na = "NA", quote = FALSE)


In [52]:
# Build the liability-scale heritability table.
# The whole step is skipped when 'LDSCh2.tsv' already exists.
if (!file.exists('LDSCh2.tsv')) {
    # Per-trait case/control counts summed over the train_val and test
    # splits, and the prevalence computed from them.
    WB_prev_df <- eval_full_df %>%
        filter(split %in% c("train_val", "test")) %>%
        select(trait, split, case_n, control_n) %>%
        # the full table repeats these counts on several rows per split
        unique() %>%
        group_by(trait) %>%
        summarise(
            WB_case_n = sum(case_n),
            WB_control_n = sum(control_n)
        ) %>%
        mutate(
            WB_prev = WB_case_n / (WB_case_n + WB_control_n),
            # standard normal density at the upper-tail quantile of WB_prev
            WB_prev_z = dnorm(qnorm(WB_prev, lower.tail = FALSE))
        )

    traits_w_metrics_df %>%
        filter(WBtest_is_significant) %>%
        select(trait, trait_name, trait_category, family) %>%
        # NOTE(review): column 'p' of the GWAS h2 table is used as the trait
        # identifier here - confirm against the h2 file's schema.
        left_join(GWAS_h2_df, by = c('trait' = 'p')) %>%
        left_join(WB_prev_df, by = "trait") %>%
        mutate(
            # compute liability-scale heritability:
            #   h2_obs * K * (1 - K) / z^2, with K = WB_prev, z = WB_prev_z
            # NOTE(review): the same prevalence seems to serve as both the
            # sample and the population prevalence - confirm this is intended.
            h2_liability = h2_obs * WB_prev * (1 - WB_prev) /
                (WB_prev_z * WB_prev_z)
        ) %>%
        rename('#trait' = 'trait') %>%
        fwrite('LDSCh2.tsv', sep = '\t', na = "NA", quote = FALSE)
}


In [50]:
# Number of traits by WBtest significance (rows) and family (columns),
# with the significant (TRUE) row first.
# pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
# the family columns in sorted order.
traits_w_metrics_df %>%
    count(WBtest_is_significant, family) %>%
    pivot_wider(names_from = family, values_from = n, names_sort = TRUE) %>%
    arrange(desc(WBtest_is_significant))


WBtest_is_significant,binomial,gaussian
<lgl>,<int>,<int>
True,244,569
False,450,302
