In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# Load shared parameters into the R session.
# NOTE(review): later cells read `data_d` and `GBE_data_d`, which are not
# defined anywhere else in this notebook -- presumably this file sets them;
# confirm. source() evaluates the file as R code, so despite the `.sh`
# extension its contents must parse as valid R.
source('0_parameters.sh')


In [7]:
# Read the full evaluation table, strip the '#' from every header that starts
# with one, and keep only the rows whose `is_significant_in_WB` flag is TRUE.
eval_full_df <- fread(file.path(data_d, 'eval_full.tsv')) %>%
    rename_with(
        function(col_names){str_replace(col_names, '#', '')},
        starts_with("#")
    ) %>%
    filter(is_significant_in_WB)


## Scores table

In [26]:
# Scores table: one row per distinct (trait, trait_name, n_variables)
# combination, with a URL built from the trait ID and the trait ID itself
# prefixed with 'GBE_'.
PGS_scores_df <- eval_full_df %>%
    select(trait, trait_name, n_variables) %>%
    unique() %>%
    mutate(
        # URL must be derived before `trait` is overwritten with its prefixed form.
        URL = paste0('https://biobankengine.stanford.edu/RIVAS_HG19/snpnet/', trait),
        trait = paste0('GBE_', trait)
    ) %>%
    select(trait, trait_name, URL, n_variables)

In [96]:
# Write the scores table as an unquoted, tab-separated file. The first column
# header is written as '#trait'.
PGS_scores_df %>%
    rename('#trait' = 'trait') %>%
    # `FALSE` rather than `F`: `F` is an ordinary variable that can be reassigned.
    fwrite('PGS_scores.tsv', sep = '\t', na = "NA", quote = FALSE)


## Performance table

In [56]:
# Return a lookup table mapping each split name to its display order.
#
# Returns a data.frame with two columns:
#   split       - character, the split name
#   split_order - integer, position of the split in the listing below (1-8)
get_split_order_df <- function(){
    split_names <- c(
        'train', 'val', 'train_val', 'test',
        'non_british_white', 's_asian', 'e_asian', 'african'
    )
    data.frame(
        split = split_names,
        split_order = seq_along(split_names),
        # Spelled out as FALSE: `F` is an ordinary variable and can be
        # reassigned, which would silently turn `split` into a factor.
        stringsAsFactors = FALSE
    )
}

In [61]:
# Master performance table: every evaluation row except the 'train_val'
# split, annotated with the split display order. `split` becomes
# '<trait>_<split>' and `trait` is prefixed with 'GBE_'.
performance_master_df <- eval_full_df %>%
    filter(split != 'train_val') %>%
    left_join(get_split_order_df(), by = 'split') %>%
    mutate(
        # `split` is rebuilt first because it needs the un-prefixed trait ID.
        split = paste0(trait, '_', split),
        trait = paste0('GBE_', trait)
    ) %>%
    select(
        trait, split, split_order, trait_name,
        family, geno, geno_covar, geno_delta
    )


In [69]:
# Performance table for the 'gaussian' family (metric column: R2).
#
# Two sets of rows are stacked:
#   * rows taking R2 from `geno_covar`, labelled with covariates
#     'age, sex Genotype PCs' and carrying `geno_delta` in `other_metric`;
#   * rows taking R2 from `geno`, with empty `covariates` / `other_metric`.
#
# Rows are ordered by first appearance of the trait in
# `performance_master_df`, then by `covariates` (the empty string sorts
# first), then by split order. The two helper ordering columns are dropped.
performance_gaussian_df <- bind_rows(
    performance_master_df %>%
        filter(family == 'gaussian') %>%
        mutate(
            covariates = 'age, sex Genotype PCs',
            other_metric = paste0('Diff R2 (full-covars) = ', geno_delta)
        ) %>%
        select(
            trait, split, split_order, trait_name,
            R2 = geno_covar, other_metric, covariates
        ),

    performance_master_df %>%
        filter(family == 'gaussian') %>%
        mutate(covariates = '', other_metric = '') %>%
        select(
            trait, split, split_order, trait_name,
            R2 = geno, other_metric, covariates
        )
) %>%
    left_join(
        performance_master_df %>%
            select(trait) %>%
            unique() %>%
            # row_number() instead of 1:n(): `1:n()` evaluates to c(1, 0) on
            # an empty table and makes mutate() fail.
            mutate(trait_order = row_number()),
        by = 'trait'
    ) %>%
    arrange(trait_order, covariates, split_order) %>%
    select(-trait_order, -split_order)


In [70]:
# Performance table for the 'binomial' family (metric column: AUROC).
#
# Two sets of rows are stacked:
#   * rows taking AUROC from `geno_covar`, labelled with covariates
#     'age, sex Genotype PCs' and carrying `geno_delta` in `other_metric`;
#   * rows taking AUROC from `geno`, with empty `covariates` / `other_metric`.
#
# Rows are ordered by first appearance of the trait in
# `performance_master_df`, then by `covariates` (the empty string sorts
# first), then by split order. The two helper ordering columns are dropped.
performance_binomial_df <- bind_rows(
    performance_master_df %>%
        filter(family == 'binomial') %>%
        mutate(
            covariates = 'age, sex Genotype PCs',
            # NOTE(review): the label previously read 'Diff R2', apparently
            # copied from the gaussian table. This table reports AUROC, so the
            # label now says so; this assumes `geno_delta` for binomial traits
            # is a difference in AUROC -- confirm against eval_full.tsv.
            other_metric = paste0('Diff AUROC (full-covars) = ', geno_delta)
        ) %>%
        select(
            trait, split, split_order, trait_name,
            AUROC = geno_covar, other_metric, covariates
        ),

    performance_master_df %>%
        filter(family == 'binomial') %>%
        mutate(covariates = '', other_metric = '') %>%
        select(
            trait, split, split_order, trait_name,
            AUROC = geno, other_metric, covariates
        )
) %>%
    left_join(
        performance_master_df %>%
            select(trait) %>%
            unique() %>%
            # row_number() instead of 1:n(): `1:n()` evaluates to c(1, 0) on
            # an empty table and makes mutate() fail.
            mutate(trait_order = row_number()),
        by = 'trait'
    ) %>%
    arrange(trait_order, covariates, split_order) %>%
    select(-trait_order, -split_order)


In [99]:
# Write the gaussian performance table as an unquoted, tab-separated file.
# The first column header is written as '#trait'.
performance_gaussian_df %>%
    rename('#trait' = 'trait') %>%
    # `FALSE` rather than `F`: `F` is an ordinary variable that can be reassigned.
    fwrite('PGS_performance_gaussian.tsv', sep = '\t', na = "NA", quote = FALSE)


In [100]:
# Write the binomial performance table as an unquoted, tab-separated file.
# The first column header is written as '#trait'.
performance_binomial_df %>%
    rename('#trait' = 'trait') %>%
    # `FALSE` rather than `F`: `F` is an ordinary variable that can be reassigned.
    fwrite('PGS_performance_binomial.tsv', sep = '\t', na = "NA", quote = FALSE)


## Sample table

In [11]:
# Exploratory: display the column names of the evaluation table.
colnames(eval_full_df)

In [80]:
# Sample sizes for gaussian traits: for each distinct gaussian trait, read
# '<trait>.eval.tsv' from `GBE_data_d` (phenotype name, split and n columns
# only), stack the per-trait tables, and strip the '#' from headers that
# start with one.
sample_gaussian_df <- eval_full_df %>%
    filter(family == 'gaussian') %>%
    pull(trait) %>%
    unique() %>%
    lapply(function(trait_id){
        fread(
            file.path(GBE_data_d, sprintf('%s.eval.tsv', trait_id)),
            select = c('#phenotype_name', 'split', 'n')
        )
    }) %>%
    bind_rows() %>%
    rename_with(
        function(col_names){str_replace(col_names, '#', '')},
        starts_with("#")
    )


In [81]:
# Sample sizes for binomial traits: for each distinct binomial trait, read
# '<trait>.eval.tsv' from `GBE_data_d` (phenotype name, split, case_n and
# control_n columns only), stack the per-trait tables, and strip the '#'
# from headers that start with one.
sample_binomial_df <- eval_full_df %>%
    filter(family == 'binomial') %>%
    pull(trait) %>%
    unique() %>%
    lapply(function(trait_id){
        fread(
            file.path(GBE_data_d, sprintf('%s.eval.tsv', trait_id)),
            select = c('#phenotype_name', 'split', 'case_n', 'control_n')
        )
    }) %>%
    bind_rows() %>%
    rename_with(
        function(col_names){str_replace(col_names, '#', '')},
        starts_with("#")
    )


In [103]:
# Return per-split ancestry and cohort annotations.
#
# Returns a data.frame with one row per split and four character columns:
#   split
#   broad_ancestral_category
#   additional_ancestry_description
#   additional_sample_cohort_information
# Every vector below is listed in the same order as `split`.
get_ancestry_df <- function(){
    data.frame(
        split = c(
            'train', 'val', 'train_val', 'test',
            'non_british_white', 's_asian', 'e_asian', 'african'
        ),
        broad_ancestral_category = c(
            'European',
            'European',
            'European',
            'European',
            'European',
            'South Asian',
            'East Asian',
            'African unspecified'
        ),
        additional_ancestry_description = c(
            'white British ancestry',
            'white British ancestry',
            'white British ancestry',
            'white British ancestry',
            # Previously 'non-white British ancestry', which contradicts both
            # the split name (non_british_white) and its 'European' category.
            'non-British white ancestry',
            '',
            '',
            ''
        ),
        additional_sample_cohort_information = c(
            'Training cohort (train)',
            'Validation cohort (val) to optimize sparsity',
            'Training + validation cohort (train_val)',
            'Testing cohort (heldout set)',
            '',
            '',
            '',
            ''
        ),
        # Spelled out as FALSE: `F` is an ordinary variable and can be
        # reassigned, which would silently turn the columns into factors.
        stringsAsFactors = FALSE
    )
}

In [117]:
# Sample table: stack the gaussian and binomial sample-size tables (for
# binomial traits n = case_n + control_n), attach the per-split ancestry
# annotations, and derive the remaining descriptive columns.
#   study_stage - 'Score development' for the 'train_val' split, otherwise
#                 'Testing'
#   sample_set  - empty for 'train_val', otherwise '<phenotype_name>_<split>'
sample_df <- bind_rows(
    sample_gaussian_df,
    sample_binomial_df %>% mutate(n = case_n + control_n)
) %>%
    left_join(get_ancestry_df(), by = 'split') %>%
    mutate(
        # Helper flag; not part of the output (dropped by the select below).
        is_development = split == 'train_val',
        study_stage = if_else(is_development, 'Score development', 'Testing'),
        sample_set = if_else(
            is_development, '', paste0(phenotype_name, '_', split)
        ),
        associated_score_name = paste0('GBE_', phenotype_name),
        country_of_recruitment = 'UK',
        cohort = 'UKB'
    ) %>%
    select(
        associated_score_name,
        study_stage,
        sample_set,
        n,
        case_n,
        control_n,
        broad_ancestral_category,
        country_of_recruitment,
        additional_ancestry_description,
        cohort,
        additional_sample_cohort_information
    )



In [118]:
# Write the sample table as an unquoted, tab-separated file. The first column
# header is written as '#associated_score_name'.
sample_df %>%
    rename('#associated_score_name' = 'associated_score_name') %>%
    # `FALSE` rather than `F`: `F` is an ordinary variable that can be reassigned.
    fwrite('PGS_sample.tsv', sep = '\t', na = "NA", quote = FALSE)
