In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [8]:
# Reference data: the compact variant annotation table and the ClinVar
# table that are loaded in the cells below.
var_annot_f <- "/oak/stanford/groups/mrivas/ukbb24983/array-combined/annotation/annotation_20201012/ukb24983_cal_hla_cnv.annot_compact_20201023.tsv.gz"
clinvar_f <- "/oak/stanford/groups/mrivas/users/ytanigaw/repos/rivas-lab/ukbb-tools/03_filtering/array-combined/clinvar_20200914_patho.tsv"

# Results fit WITHOUT the penalty factor; the per-trait file is looked up
# as <dir_wo_pf>/<GBE_ID>/2_refit/snpnet.tsv.
dir_wo_pf <- "/oak/stanford/groups/mrivas/projects/PRS/private_output/20211019_no-pfactor"

# Results fit WITH the penalty factor; the per-trait file is looked up
# as <beta_d>/<GBE_ID>.snpnetBETAs.tsv.
beta_d <- "/scratch/groups/mrivas/projects/PRS/20211028_freeze/per_trait"


In [3]:
# Load the IDs to process from 'GBE_ID_list.txt', read as a headerless table.
# pull() without a column argument returns the last column, i.e. the only
# column when the file holds one ID per line.
# `header = FALSE` is spelled out in full: the previous `head=F` relied on
# partial argument matching and on `F`, which is a reassignable variable.
GBE_IDs <- fread('GBE_ID_list.txt', header = FALSE) %>% pull()


In [4]:
# Count the variants listed in one BETA file, broken down by annotation.
#
# beta_file      : path to a table with CHROM, POS, ID, REF and ALT columns
#                  (CHROM is read as character).
# var_annot_df   : annotation table carrying the same five key columns plus
#                  Genotype_or_allelotype, Consequence_group and ClinVar.
# count_col_name : name given to the resulting count column (default 'n').
#
# Returns one row per (Genotype_or_allelotype, Consequence_group, ClinVar)
# combination present among the rows of beta_file that match var_annot_df;
# rows without a match in the annotation are dropped by the inner join.
count_consequence <- function(beta_file, var_annot_df, count_col_name = 'n'){
    variant_keys <- c("CHROM", "POS", "ID", "REF", "ALT")

    beta_df <- fread(beta_file, colClasses = c('CHROM'='character'))
    annotated_df <- inner_join(beta_df, var_annot_df, by = variant_keys)

    count(
        annotated_df,
        Genotype_or_allelotype, Consequence_group, ClinVar,
        name = count_col_name
    )
}

# Put the per-category variant counts of two BETA files side by side.
#
# beta_file_wo_pf : BETA file whose counts go to 'without_penalty_factor'.
# beta_file_w_pf  : BETA file whose counts go to 'with_penalty_factor'.
# var_annot_df    : annotation table passed through to count_consequence().
#
# Returns one row per (Genotype_or_allelotype, Consequence_group, ClinVar)
# seen in either file; a category missing from one file gets a count of 0.
# Rows are sorted by Genotype_or_allelotype, then by the order given in the
# global `consequence_sort_df`, then by descending ClinVar.
count_and_compare_consequence <- function(beta_file_wo_pf, beta_file_w_pf, var_annot_df){
    category_cols <- c("Genotype_or_allelotype", "Consequence_group", "ClinVar")

    counts_wo_pf <- count_consequence(
        beta_file_wo_pf, var_annot_df, count_col_name = 'without_penalty_factor'
    )
    counts_w_pf <- count_consequence(
        beta_file_w_pf, var_annot_df, count_col_name = 'with_penalty_factor'
    )

    # full_join keeps categories that appear in only one of the two files.
    merged_df <- full_join(counts_wo_pf, counts_w_pf, by = category_cols)

    # Consequence_order is attached only to drive the sort, then removed.
    sorted_df <- merged_df %>%
        left_join(consequence_sort_df, by='Consequence_group') %>%
        arrange(Genotype_or_allelotype, Consequence_order, desc(ClinVar))

    sorted_df %>%
        select(-Consequence_order) %>%
        replace_na(list(without_penalty_factor = 0, with_penalty_factor = 0))
}


In [5]:
# Load the ClinVar table (chromosome kept as character), strip the '#' from
# any column name that starts with it, and derive a ClinVar column from
# CLNSIG by cutting everything from the first ',' or '/' onwards.
# The original CLNSIG column is dropped afterwards.
clinvar_df <- fread(clinvar_f, colClasses = c('#CHROM'='character')) %>%
    rename_with(
        function(col_names) str_replace(col_names, '#', ''),
        starts_with("#")
    ) %>%
    mutate(ClinVar = str_replace_all(CLNSIG, '[,/].+', '')) %>%
    select(-CLNSIG)


In [6]:
# Lookup table giving the sort rank of each consequence group
# (1 = listed first). The empty string is the group label assigned to
# variants that receive no consequence group.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
consequence_sort_df <- data.frame(
    Consequence_group = c('PTVs', 'PAVs', 'PCVs/Intronic/UTRs/Others', ''),
    Consequence_order = 1:4,
    stringsAsFactors = FALSE
)


In [7]:
# Build the variant annotation table used for all counts.
# Only the seven listed columns are read; '#CHROM' is kept as character and
# renamed to 'CHROM'.
var_annot_df <- fread(
    var_annot_f,
    select = c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'Csq', 'geno_data_source'),
    colClasses = c('#CHROM'='character')
) %>%
    rename_with(
        function(col_names) str_replace(col_names, '#', ''),
        starts_with("#")
    ) %>%
    # Human-readable label for the data source of each variant.
    mutate(
        Genotype_or_allelotype = case_when(
            geno_data_source == 'cal' ~ 'Genotyped variants',
            geno_data_source == 'hla' ~ 'HLA allelotype',
            geno_data_source == 'cnv' ~ 'Copy number variants',
            TRUE ~ 'Others'
        )
    ) %>%
    # Consequence group is only defined for 'cal' variants; every other
    # source gets the empty string. Rules are tried top to bottom.
    mutate(
        Consequence_group = case_when(
            (geno_data_source == 'cal') & (Csq == 'ptv') ~ 'PTVs',
            (geno_data_source == 'cal') & (Csq == 'pav') ~ 'PAVs',
            geno_data_source == 'cal' ~ 'PCVs/Intronic/UTRs/Others',
            TRUE ~ ''
        )
    ) %>%
    left_join(consequence_sort_df, by='Consequence_group') %>%
    # NOTE(review): if clinvar_df holds several rows for one
    # CHROM/POS/REF/ALT, this join duplicates the variant - confirm the
    # ClinVar table is unique on these keys.
    left_join(clinvar_df, by=c('CHROM', 'POS', 'REF', 'ALT')) %>%
    # Variants with no ClinVar match get an empty label instead of NA.
    replace_na(list(ClinVar = ''))


In [33]:
# For every ID in GBE_IDs, compare the per-category variant counts of the
# fit without the penalty factor against the fit with it, then stack the
# per-ID tables and annotate each category.
count_df <- lapply(GBE_IDs, function(gbe_id){
    wo_pf_file <- file.path(dir_wo_pf, gbe_id, '2_refit', 'snpnet.tsv')
    w_pf_file <- file.path(beta_d, sprintf('%s.snpnetBETAs.tsv', gbe_id))

    count_and_compare_consequence(wo_pf_file, w_pf_file, var_annot_df) %>%
        mutate(GBE_ID = gbe_id)
}) %>%
    bind_rows() %>%
    mutate(
        # Raw ratio of the two counts; a zero denominator yields Inf or NaN.
        enrichment = with_penalty_factor / without_penalty_factor,
        # Penalty factor attached to each category; the first matching rule
        # wins, so the 0.5 rules take precedence over the 0.75 rules.
        # NOTE(review): presumably mirrors the factors used at fit time -
        # confirm against the fitting script.
        penalty_factor = case_when(
            Consequence_group == 'PTVs' ~ 0.5,
            ClinVar == 'Pathogenic' ~ 0.5,
            Consequence_group == 'PAVs' ~ 0.75,
            ClinVar == 'Likely_pathogenic' ~ 0.75,
            Genotype_or_allelotype == 'HLA allelotype' ~ 0.75,
            TRUE ~ 1
        )
    ) %>%
    arrange(GBE_ID, penalty_factor, Genotype_or_allelotype, Consequence_group, ClinVar) %>%
    select(
        GBE_ID,
        Genotype_or_allelotype, Consequence_group, ClinVar,
        without_penalty_factor, with_penalty_factor, enrichment, penalty_factor
    )


In [34]:
# Write the per-category comparison as a tab-separated file. The ID column
# is renamed to '#GBE_ID' so that the header line starts with '#'.
# Missing values are written as "NA" and no field is quoted
# (`FALSE` spelled out; `F` is a reassignable variable).
count_df %>%
    rename('#GBE_ID' = 'GBE_ID') %>%
    fwrite('variant_count_comparison.tsv', sep='\t', na = "NA", quote = FALSE)


In [52]:
# Collapse count_df to one row per GBE_ID describing the "prioritized"
# categories, i.e. those whose penalty_factor is not 1.
# enrichment = (prioritized share of variants with the penalty factor) /
#              (prioritized share of variants without the penalty factor).
# IDs with no prioritized category at all do not appear in the result.
count_simplified_df <- count_df %>%
    mutate(is_prioritized = (penalty_factor != 1)) %>%
    group_by(GBE_ID, is_prioritized) %>%
    summarise(
        without_penalty_factor = sum(without_penalty_factor),
        with_penalty_factor = sum(with_penalty_factor),
        .groups='drop'
    ) %>%
    # Only the prioritized rows are reported; the per-ID totals joined next
    # are still computed over every category.
    filter(is_prioritized) %>%
    left_join(
        count_df %>%
            group_by(GBE_ID) %>%
            summarise(
                sum_without_penalty_factor = sum(without_penalty_factor),
                sum_with_penalty_factor = sum(with_penalty_factor),
                .groups='drop'
            ),
        by = "GBE_ID"
    ) %>%
    mutate(
        share_with = with_penalty_factor / sum_with_penalty_factor,
        share_without = without_penalty_factor / sum_without_penalty_factor,
        enrichment = share_with / share_without
    ) %>%
    # The explicit column list also drops the helper share_* columns.
    select(
        GBE_ID,
        with_penalty_factor,    sum_with_penalty_factor,
        without_penalty_factor, sum_without_penalty_factor,
        enrichment
    )

In [53]:
# Write the per-ID summary as a tab-separated file. The ID column is renamed
# to '#GBE_ID' so that the header line starts with '#'.
# Missing values are written as "NA" and no field is quoted
# (`FALSE` spelled out; `F` is a reassignable variable).
count_simplified_df %>%
    rename('#GBE_ID' = 'GBE_ID') %>%
    fwrite('variant_count_comparison_simplified.tsv', sep='\t', na = "NA", quote = FALSE)
