In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
source('paths.sh')

In [3]:
traits_w_metrics_f %>%
fread() %>%
rename_with(
    function(x){str_replace(x, '#', '')}, starts_with("#")
) -> traits_w_metrics_df


In [4]:
eval_full_f %>%
fread() %>%
rename_with(
    function(x){str_replace(x, '#', '')}, starts_with("#")
) -> eval_full_df


In [5]:
PGS_scores_all_df <- readxl::read_excel(
#     "https://ftp.ebi.ac.uk/pub/databases/spot/pgs/metadata/pgs_all_metadata.xlsx", 
    "pgs_all_metadata.xlsx",
    sheet = "Scores"
)


In [6]:
biomarkers_mapping_f %>%
fread() %>%
rename_with(
    function(x){str_replace(x, '#', '')}, starts_with("#")
) -> biomarkers_mapping_df


In [7]:
'Biomarkers_PGS_catalog_mapping.csv' %>%
fread() %>%
rename_with(
    function(x){str_replace(x, '#', '')}, starts_with("#")
) %>%
select(all_of(c('GBE ID', 'PGS catalog score ID'))) %>%
left_join(
    PGS_scores_all_df %>%
    select(all_of(c('PGS Name', 'Polygenic Score (PGS) ID'))),
    by=c('PGS catalog score ID'='Polygenic Score (PGS) ID')
) -> biomarkers_PGScatalog_mapping_df


## Biomarker traits

- https://www.pgscatalog.org/publication/PGP000128/

In [8]:
biomarkers_mapping_df %>%
select(trait, Biomarkers_covariate_adjusted_GBE_ID) %>%
gather(col, val) %>%
mutate(PGS_Name = paste0('GBE_', val)) %>%
pull(PGS_Name) %>%
unique -> biomarker_PGS_Names


In [9]:
PGS_scores_all_df %>%
filter(
    `PGS Name` %in% biomarker_PGS_Names
) -> PGS_drop_request_df



In [10]:
PGS_drop_request_df %>% dim %>% print


[1] 52 19


In [11]:
PGS_drop_request_df %>%
fwrite('PGScatalog/PGS_drop_request.tsv', sep='\t', na = "NA", quote=F)


## List of PGS scores

In [12]:
traits_w_metrics_df %>%
count(
    trait_category != 'Biomarkers',
    WBtest_is_significant
)

"trait_category != ""Biomarkers""",WBtest_is_significant,n
<lgl>,<lgl>,<int>
False,False,1
False,True,34
True,False,751
True,True,779


In [13]:
traits_w_metrics_df %>%
filter(
    # we do not include biomarkers
    trait_category != 'Biomarkers',
    WBtest_is_significant
) %>%
select(trait, trait_name, n_variables) %>%
unique %>%
mutate(
    URL = paste0('https://biobankengine.stanford.edu/RIVAS_HG19/snpnet/', trait),
    trait = paste0('GBE_', trait)
) %>% 
select(trait, trait_name, URL, n_variables) %>%
left_join(
    PGS_scores_all_df %>%
    select(all_of(c('PGS Name', 'Polygenic Score (PGS) ID', 'Mapped Trait(s) (EFO ID)'))),
    by=c('trait'='PGS Name')
) %>%
mutate(
    `PGS Development Method` = 'snpnet',
    `Original Genome Build` = 'GRCh37',
    `Score Development Details` = '',
    `Number of interaction terms` = 0
) %>%
rename(
    'PGS Name' = 'trait',
    'Reported Trait' = 'trait_name',
    'Number of variants' = 'n_variables'
) %>%
select(all_of(c(
    'Polygenic Score (PGS) ID',
    'PGS Name',
    'Reported Trait',
    'URL',
    'Mapped Trait(s) (EFO ID)',
    'PGS Development Method',
    'Score Development Details',
    'Original Genome Build',
    'Number of variants',
    'Number of interaction terms'
))) -> PGS_scores_df


In [14]:
PGS_scores_df %>%
count(`Polygenic Score (PGS) ID` == '')

"`Polygenic Score (PGS) ID` == """"",n
<lgl>,<int>
False,364
,415


In [15]:
PGS_scores_df %>% dim %>% print
PGS_scores_df %>%
fwrite('PGScatalog/PGS_scores.tsv', sep='\t', na = "", quote=F)


[1] 779  10


## Sample table

In [16]:
get_ancestry_df <- function(){
    data.frame(
        split = c('train', 'val', 'train_val', 'test', 'non_british_white', 's_asian', 'e_asian', 'african'),
        broad_ancestral_category=c(        
            'European',
            'European',
            'European',
            'European',
            'European',
            'South Asian',
            'East Asian',
            'African unspecified'
        ),
        additional_ancestry_description = c(
            'white British ancestry',
            'white British ancestry',
            'white British ancestry',
            'white British ancestry',
            'non-white British ancestry',
            '','',''
        ),
        additional_sample_cohort_information = c(
            'Training cohort (train)',
            'Validation cohort (val) to optimize sparsity',            
            'Training + validation cohort (train_val)',
            'Testing cohort (heldout set)',
            '',
            '',
            '',
            ''
        ),
        stringsAsFactors=F
    )
}


In [17]:
eval_full_df %>%
left_join(
    biomarkers_PGScatalog_mapping_df %>%
    select(all_of(c('GBE ID', 'PGS Name'))),
    by=c('trait'='GBE ID')
) %>%
mutate(
    associated_score_name = if_else(
        is.na(`PGS Name`),
        paste0('GBE_', trait),
        `PGS Name`
    )
) %>%
select(-`PGS Name`) -> eval_w_score_name_df


In [18]:
eval_w_score_name_df %>%
filter(
    ((trait_category == 'Biomarkers') & (split != 'train_val')) |
    ((trait_category != 'Biomarkers') & WBtest_is_significant)
) %>%
select(associated_score_name, trait, split, n, case_n, control_n) %>%
unique %>%
left_join(get_ancestry_df(), by="split") %>%
mutate(
    study_stage = if_else(split == 'train_val', 'Score development', 'Testing'),
    sample_set = if_else(split == 'train_val', '', paste0(trait, '_', split)),
    country_of_recruitment = 'UK',
    cohort='UKB'
) %>%
select(
    associated_score_name, study_stage, sample_set,
    n, case_n, control_n,
    broad_ancestral_category,
    country_of_recruitment,
    additional_ancestry_description,
    cohort,
    additional_sample_cohort_information
) -> PGS_sample_df


In [19]:
PGS_sample_df %>% dim %>% print
PGS_sample_df %>%
fwrite('PGScatalog/PGS_sample.tsv', sep='\t', na = "", quote=F)


[1] 4833   11


## Performance metrics

In [20]:
get_split_order_df <- function(){
    data.frame(
        split = c('train', 'val', 'train_val', 'test', 'non_british_white', 's_asian', 'e_asian', 'african'),
        split_order=1:8,
        stringsAsFactors=F
    )
}


In [21]:
get_covariates_str_df <- function(){
    data.frame(
        model = c('PRS', 'full'),
        covariates = c('', 'age, sex, UKB array type, Genotype PCs'),
        stringsAsFactors=F
    )
}


In [22]:
eval_w_score_name_df %>%
filter(
    split != 'train_val',
    (trait_category == 'Biomarkers') | WBtest_is_significant
) %>%
mutate(
    sample_set = paste0(trait, '_', split)
) %>%
select(
    associated_score_name, sample_set, split,
    trait_name, model, metric, `eval`, l_eval, u_eval
) -> performance_metric_all_df


In [23]:
performance_metric_all_df %>%
filter(
    metric %in% c('auc', 'r2'),
    model %in% c('full', 'PRS'),
    ! is.na(`eval`)
) %>%
mutate(eval_95CI = sprintf('%.5f [%.5f, %.5f]', `eval`, l_eval, u_eval)) %>%
select(-l_eval, -u_eval, -`eval`) %>%
spread(metric, eval_95CI) %>%
rename('AUROC'='auc', 'R2'='r2') -> performance_metric_main_df


In [24]:
performance_metric_all_df %>%
filter(
    metric %in% c('auc', 'r2'),
    model %in% c('full', 'covar'),
    ! is.na(`eval`)
) %>%
select(-l_eval, -u_eval, -split) %>%
spread(model, `eval`) %>%
drop_na(full, covar) %>%
mutate(
    model = 'full',
    other_metric_type = case_when(
        metric == 'auc' ~ 'Incremental AUROC (full-covars)',
        metric == 'r2'  ~ 'Incremental R2 (full-covars)',
        TRUE            ~ 'Incremental (full-covars)'
    ),
    other_metric_val = full - covar
) %>%
select(-metric,-covar,-full) -> performance_metric_incremental_df


In [25]:
performance_metric_all_df %>%
filter(
    ! metric %in% c('auc', 'r2'),
    model %in% c('full', 'PRS'),
    ! is.na(`eval`)
) %>%
select(-l_eval, -u_eval, -split) %>%
mutate(
    other_metric_type = case_when(
        metric == 'NagelkerkeR2' ~ "Nagelkerke's R2",
        metric == 'TjurR2'       ~ "Tjur's R2",
        TRUE                     ~ 'Unknown'
    ),
) %>%
rename('other_metric_val'='eval') %>%
select(-metric) -> performance_metric_pseudoR2_df

In [26]:
bind_rows(
    performance_metric_incremental_df,
    performance_metric_pseudoR2_df
) %>% 
mutate(
    other_metric_str = sprintf('%s = %.5f', other_metric_type, other_metric_val)
) %>%
select(-other_metric_type, -other_metric_val) %>%
group_by(associated_score_name, sample_set, trait_name, model) %>%
summarise(
    other_mterics = paste(other_metric_str, collapse=', '),
    .groups='drop'
) -> performance_metric_others_df


In [27]:
performance_metric_main_df %>%
left_join(
    performance_metric_others_df,
    by = c("associated_score_name", "sample_set", "trait_name", "model")
) %>%
left_join(
    eval_w_score_name_df %>%
    select(associated_score_name) %>% 
    unique %>%
    mutate(associated_score_sort_order = 1:n()),
    by = "associated_score_name"
) %>%
left_join(
    get_split_order_df(),
    by = "split"
) %>%
left_join(
    get_covariates_str_df(),
    by = "model"
) %>%
arrange(associated_score_sort_order, split_order, covariates) %>%
select(
    -split, -split_order, -associated_score_sort_order, -model
) -> PGS_performance_metric_df


In [28]:
PGS_performance_metric_df %>%
pull(associated_score_name) %>%
unique %>%
length

813 + micro albumin in urine

In [29]:
PGS_performance_metric_df %>% dim %>% print
PGS_performance_metric_df %>%
fwrite('PGScatalog/PGS_performance_metric.tsv', sep='\t', na = "", quote=F)


[1] 8078    7
