In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# reference data
# var_annot_f: gzipped TSV of variant annotations. The columns read from it
# further down are #CHROM, POS, ID, REF, ALT, Csq and geno_data_source.
var_annot_f <- '/oak/stanford/groups/mrivas/ukbb24983/array-combined/annotation/annotation_20201012/ukb24983_cal_hla_cnv.annot_compact_20201023.tsv.gz'
# clinvar_f: TSV whose CLNSIG column is reduced to a per-variant ClinVar label
# further down (it must also carry #CHROM, POS, REF and ALT for the join).
clinvar_f <- '/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/ukbb-tools/03_filtering/array-combined/clinvar_20200914_patho.tsv'

# without penalty factor
# Per-phenotype results are looked up as
#   <dir_wo_pf>/<GBE_ID>/2_refit/snpnet.tsv       (coefficients)
#   <dir_wo_pf>/<GBE_ID>/2_refit/snpnet.eval.tsv  (evaluation metrics)
dir_wo_pf <- '/oak/stanford/groups/mrivas/projects/PRS/private_output/20211019_no-pfactor'

# with penalty factor
# beta_d: directory with one coefficient file per phenotype, <GBE_ID>.tsv.
beta_d <- '/oak/stanford/groups/mrivas/projects/PRS/GBE_data'
# eval_f: a single evaluation table; the columns read from it further down are
# #GBE_ID, split, geno_delta and GBE_short_name.
eval_f <- '/oak/stanford/groups/mrivas/projects/PRS/private_output/202009_batch/snpnet.eval.2_refit.tsv'


In [14]:
# Phenotype IDs to analyze. The list file has no header row; pull() without
# a column argument extracts the last column of the table as a vector.
GBE_IDs <- pull(fread('GBE_ID_list.txt', header = FALSE))


In [34]:
# Build the lookup table that maps each evaluation split key to its display
# label and to a sort key.
#
# Returns:
#   A data.frame with one row per split and three columns:
#     split       - split key (character)
#     split_plot  - human-readable label used on plots (character)
#     split_order - -1, -2, ... in the order listed, so the first split
#                   gets the largest value
get_split_order_df <- function(){
    split_keys <- c('train', 'val', 'test', 'non_british_white', 's_asian', 'e_asian', 'african')
    split_labels <- c('Training (WB)', 'Validation (WB)', 'Test (WB)', 'Non-British white', 'South Asian', 'East Asian', 'African')

    split_df <- data.frame(
        split = split_keys,
        split_plot = split_labels,
        stringsAsFactors = FALSE
    )
    # Negative row index: row 1 -> -1, row 2 -> -2, ...
    split_df$split_order <- -1 * seq_len(nrow(split_df))
    split_df
}


In [3]:
# Tally the variants listed in one coefficient file by annotation category.
#
# Args:
#   beta_file:      path to a table readable by fread() that has CHROM, POS,
#                   ID, REF and ALT columns.
#   var_annot_df:   annotation table with the same five key columns plus
#                   Genotype_or_allelotype, Consequence_group and ClinVar.
#   count_col_name: name given to the count column (default 'n').
#
# Returns:
#   One row per observed (Genotype_or_allelotype, Consequence_group, ClinVar)
#   combination with its count. Variants without a match in var_annot_df
#   are dropped by the inner join.
count_consequence <- function(beta_file, var_annot_df, count_col_name = 'n'){
    variant_keys <- c("CHROM", "POS", "ID", "REF", "ALT")

    # CHROM is read as character so it can be joined against a character
    # CHROM key in var_annot_df.
    beta_df <- fread(beta_file, colClasses = c('CHROM'='character'))
    annotated_df <- inner_join(beta_df, var_annot_df, by = variant_keys)

    count(
        annotated_df,
        Genotype_or_allelotype, Consequence_group, ClinVar,
        name = count_col_name
    )
}


In [4]:
# Count variants per annotation category in two coefficient files and put
# the two counts side by side.
#
# Args:
#   beta_file_wo_pf: coefficient file of the model fit without penalty factor.
#   beta_file_w_pf:  coefficient file of the model fit with penalty factor.
#   var_annot_df:    annotation table passed through to count_consequence().
#
# Returns:
#   One row per (Genotype_or_allelotype, Consequence_group, ClinVar) seen in
#   either file, with columns without_penalty_factor and with_penalty_factor
#   (0 where a category is missing from one file). Rows are sorted by
#   genotype class, then consequence order, then ClinVar descending.
#
# Note: the sort order comes from the global `consequence_sort_df`, which
# must exist when this function is called.
count_and_compare_consequence <- function(beta_file_wo_pf, beta_file_w_pf, var_annot_df){
    category_cols <- c("Genotype_or_allelotype", "Consequence_group", "ClinVar")

    counts_wo_pf <- count_consequence(
        beta_file_wo_pf, var_annot_df, count_col_name = 'without_penalty_factor'
    )
    counts_w_pf <- count_consequence(
        beta_file_w_pf, var_annot_df, count_col_name = 'with_penalty_factor'
    )

    # Full join keeps categories that appear in only one of the two models.
    merged_df <- full_join(counts_wo_pf, counts_w_pf, by = category_cols)

    # Consequence_order is attached only to drive the sort, then removed.
    sorted_df <- merged_df %>%
        left_join(consequence_sort_df, by='Consequence_group') %>%
        arrange(Genotype_or_allelotype, Consequence_order, desc(ClinVar)) %>%
        select(-Consequence_order)

    replace_na(
        sorted_df,
        list(without_penalty_factor = 0, with_penalty_factor = 0)
    )
}

In [5]:
# Load the ClinVar table, strip the leading '#' from header names, and
# collapse CLNSIG into a single label: everything from the first ',' or '/'
# onward is discarded. The original CLNSIG column is then dropped.
clinvar_df <- clinvar_f %>%
    fread(colClasses = c('#CHROM'='character')) %>%
    rename_with(
        function(col_name){
            str_replace(col_name, '#', '')
        },
        starts_with("#")
    ) %>%
    mutate(ClinVar = str_replace_all(CLNSIG, '[,/].+', '')) %>%
    select(-CLNSIG)


In [6]:
# Display order of the consequence groups (1 = first). The empty string is
# the group assigned to variants that are not from the 'cal' data source.
consequence_sort_df <- data.frame(
    Consequence_group = c('PTVs', 'PAVs', 'PCVs/Intronic/UTRs/Others', ''),
    Consequence_order = seq_len(4),
    stringsAsFactors = FALSE
)


In [7]:
# Build the variant annotation table used for all consequence counts:
#   1. read the key columns, Csq and geno_data_source from the annotation file
#   2. label each variant by data source (Genotype_or_allelotype)
#   3. label each variant by consequence group (only 'cal' variants get a
#      non-empty group)
#   4. attach the consequence sort order and the ClinVar label ('' if none)
# NOTE(review): the ClinVar join would duplicate annotation rows if clinvar_df
# has more than one record per CHROM/POS/REF/ALT - confirm it is unique.
var_annot_df <- var_annot_f %>%
    fread(
        colClasses = c('#CHROM'='character'),
        select=c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'Csq', 'geno_data_source')
    ) %>%
    rename_with(
        function(col_name){
            str_replace(col_name, '#', '')
        },
        starts_with("#")
    ) %>%
    mutate(
        Genotype_or_allelotype = case_when(
            geno_data_source == 'cal' ~ 'Genotyped variants',
            geno_data_source == 'hla' ~ 'HLA allelotype',
            geno_data_source == 'cnv' ~ 'Copy number variants',
            TRUE ~ 'Others'
        )
    ) %>%
    mutate(
        Consequence_group = case_when(
            (geno_data_source == 'cal') & (Csq == 'ptv') ~ 'PTVs',
            (geno_data_source == 'cal') & (Csq == 'pav') ~ 'PAVs',
            geno_data_source == 'cal' ~ 'PCVs/Intronic/UTRs/Others',
            TRUE ~ ''
        )
    ) %>%
    left_join(consequence_sort_df, by='Consequence_group') %>%
    left_join(clinvar_df, by=c('CHROM', 'POS', 'REF', 'ALT')) %>%
    replace_na(list(ClinVar = ''))


In [57]:
# Evaluation metrics of the models fit with penalty factor. Only the
# phenotype ID, split, geno_delta and short name are read; the leading '#'
# is stripped from the header so the ID column becomes GBE_ID.
eval_w_penalty_factor_df <- eval_f %>%
    fread(
        colClasses = c('#GBE_ID'='character'),
        select=c('#GBE_ID', 'split', 'geno_delta', 'GBE_short_name')
    ) %>%
    rename_with(
        function(col_name){
            str_replace(col_name, '#', '')
        },
        starts_with("#")
    )


In [81]:
# Stack the evaluation metrics of both model families into one long table,
# distinguished by the Model column.
#   - 'with penalty factor': every row of eval_w_penalty_factor_df
#   - 'without penalty factor': one snpnet.eval.tsv per ID in GBE_IDs, with
#     GBE_short_name looked up from the with-penalty-factor table
eval_comparison_df <- bind_rows(
    eval_w_penalty_factor_df %>%
        mutate(Model = 'with penalty factor'),

    GBE_IDs %>%
        lapply(function(gbe_id){
            eval_path <- file.path(dir_wo_pf, gbe_id, '2_refit', 'snpnet.eval.tsv')
            eval_path %>%
                fread(
                    colClasses = c('#phenotype_name'='character'),
                    select=c('#phenotype_name', 'split', 'geno_delta')
                ) %>%
                rename('GBE_ID' = '#phenotype_name')
        }) %>%
        bind_rows() %>%
        left_join(
            eval_w_penalty_factor_df %>%
                select(GBE_ID, GBE_short_name) %>%
                unique(),
            by='GBE_ID'
        ) %>%
        mutate(Model = 'without penalty factor')
)


In [82]:
# Horizontal dodged bar chart of geno_delta per evaluation split, one panel
# per phenotype, comparing the two models. Restricted to the phenotypes in
# GBE_IDs; the combined 'train_val' split is excluded.
p_eval_comparison <- eval_comparison_df %>%
    filter(GBE_ID %in% GBE_IDs, split != 'train_val') %>%
    left_join(get_split_order_df(), by='split') %>%
    ggplot(
        aes(
            x = reorder(split_plot, split_order),
            y = geno_delta,
            color = Model,
            fill = Model
        )
    ) +
    geom_bar(stat='identity', position = "dodge2") +
    facet_wrap(~ GBE_short_name, nrow = 2) +
    coord_flip() +
    labs(
        title = 'Incremental predictive performance',
        x = 'Hold-out test set',
        y = 'Incremental predictive performance'
    ) +
    theme_bw(base_size=14) +
    theme(legend.position='bottom')


In [85]:
# Save the combined evaluation table as TSV; the ID column is written with a
# leading '#' in its header.
fwrite(
    rename(eval_comparison_df, '#GBE_ID' = 'GBE_ID'),
    'eval_comparison.tsv',
    sep = '\t',
    na = "NA",
    quote = FALSE
)


In [86]:
# Write the comparison figure once per output format (PNG and PDF), at
# width 8 and height 8 in ggsave's default units.
for (ext in c('png', 'pdf')) {
    ggsave(
        filename = sprintf('%s.%s', 'eval_comparison', ext),
        plot = p_eval_comparison,
        width = 8,
        height = 8
    )
}


In [78]:
# For every phenotype in GBE_IDs, compare the per-category variant counts of
# the model without penalty factor against the model with penalty factor,
# then add the phenotype short name and the count ratio.
#
# enrichment = with_penalty_factor / without_penalty_factor. Missing counts
# were filled with 0 upstream, so a category absent from the
# no-penalty-factor model has a zero denominator and its ratio is Inf.
count_df <- GBE_IDs %>%
    lapply(function(gbe_id){
        count_and_compare_consequence(
            beta_file_wo_pf = file.path(dir_wo_pf, gbe_id, '2_refit', 'snpnet.tsv'),
            beta_file_w_pf = file.path(beta_d, sprintf('%s.tsv', gbe_id)),
            var_annot_df = var_annot_df
        ) %>%
            mutate(GBE_ID = gbe_id)
    }) %>%
    bind_rows() %>%
    left_join(
        eval_w_penalty_factor_df %>%
            select(GBE_ID, GBE_short_name) %>%
            unique(),
        by='GBE_ID'
    ) %>%
    mutate(enrichment = with_penalty_factor / without_penalty_factor) %>%
    select(
        GBE_ID, GBE_short_name,
        Genotype_or_allelotype, Consequence_group, ClinVar,
        without_penalty_factor, with_penalty_factor, enrichment
    )


In [87]:
# Save the variant count comparison as TSV; the ID column is written with a
# leading '#' in its header.
fwrite(
    rename(count_df, '#GBE_ID' = 'GBE_ID'),
    'variant_count_comparison.tsv',
    sep = '\t',
    na = "NA",
    quote = FALSE
)
