In [2]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


# penalty factor file for snpnet

## Yosuke Tanigawa, 2021/1/8

As with the array-combined dataset, we generate a penalty factor file for snpnet based on the VEP-predicted consequence and the ClinVar annotation.

https://github.com/rivas-lab/ukbb-tools/tree/master/03_filtering/array-combined

In [3]:
# input: variant annotation table (gzipped TSV) that is read with fread() below
annot_f <- '/oak/stanford/groups/mrivas/ukbb24983/exome/annotation/20201025_exome_oqfe_2020/ukb24983_exomeOQFE.annotation.20210108.tsv.gz'

# output: RDS file that the named penalty-factor vector is saved to
p_factor <- '/oak/stanford/groups/mrivas/ukbb24983/exome/snpnet/ukb24983_exomeOQFE.snpnet.penalty.v1.rds'


In [9]:
# Read only the annotation columns used in this notebook, forcing the
# chromosome column to character, and strip the '#' from the '#CHROM' header.
annot_df <- fread(
    annot_f,
    select = c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'Csq', 'CLIN_SIG'),
    colClasses = c('#CHROM' = 'character')
) %>%
    rename(CHROM = '#CHROM')


In [11]:
# Number of variants for each value of the FILTER column.
count(annot_df, FILTER)

FILTER,n
<chr>,<int>
.,17582030
duplicated,91
duplicated;sample_miss,146
duplicated;sample_miss;WB_HWE_p,76
duplicated;WB_HWE_p,13
sample_miss,150824
sample_miss;WB_HWE_p,20165
WB_HWE_p,24605


In [43]:
# Number of variants for each consequence group in the Csq column.
count(annot_df, Csq)


Csq,n
<chr>,<int>
intron,6688092
others,1180120
pav,5412581
pcv,2196041
ptv,496158
utr,1804958


In [47]:
# Curate the ClinVar significance into three tiers and derive the penalty
# factor w for every variant:
#   0.5  if Csq is 'ptv' or the curated tier is '1_pathogenic'
#   0.75 if Csq is 'pav' or the curated tier is '2_likely_pathogenic'
#   1    otherwise
annot_weights_df <- annot_df %>%
    mutate(
        CLIN_SIG_curated = case_when(
            # missing, not pathogenic, conflicting, or (also) benign
            is.na(CLIN_SIG) |
                !str_detect(CLIN_SIG, 'pathogenic') |
                str_detect(CLIN_SIG, 'conflicting_interpretation') |
                str_detect(CLIN_SIG, 'benign') ~ '3_no_pathogenic_info',
            # any remaining entry that mentions 'likely'
            str_detect(CLIN_SIG, 'likely') ~ '2_likely_pathogenic',
            TRUE ~ '1_pathogenic'
        ),
        w = if_else(
            Csq == 'ptv' | CLIN_SIG_curated == '1_pathogenic',
            0.5,
            if_else(
                Csq == 'pav' | CLIN_SIG_curated == '2_likely_pathogenic',
                0.75,
                1
            )
        )
    )


In [49]:
# Cross-tabulate weight, curated ClinVar tier and consequence group for the
# variants whose FILTER is '.'.
count(
    filter(annot_weights_df, FILTER == '.'),
    w, CLIN_SIG_curated, Csq
)

w,CLIN_SIG_curated,Csq,n
<dbl>,<chr>,<chr>,<int>
0.5,1_pathogenic,intron,50
0.5,1_pathogenic,others,14
0.5,1_pathogenic,pav,4640
0.5,1_pathogenic,pcv,27
0.5,1_pathogenic,ptv,5351
0.5,1_pathogenic,utr,20
0.5,2_likely_pathogenic,ptv,3326
0.5,3_no_pathogenic_info,ptv,484663
0.75,2_likely_pathogenic,intron,34
0.75,2_likely_pathogenic,others,11


In [53]:
# Save the penalty factors as a vector of w named by '<ID>_<ALT>', keeping
# only the variants whose FILTER is '.'.
annot_weights_df %>%
    filter(FILTER == '.') %>%
    transmute(ID_ALT = paste(ID, ALT, sep = '_'), w) %>%
    deframe() %>%
    saveRDS(file = p_factor)


## Appendix. ClinVar significance info

In [12]:
# Number of variants for each distinct (raw) ClinVar significance string.
annot_CLIN_SIG_cnt_df <- count(annot_df, CLIN_SIG)


In [50]:
# The 20 most frequent ClinVar significance strings.
head(arrange(annot_CLIN_SIG_cnt_df, desc(n)), 20)

CLIN_SIG,n
<chr>,<int>
,17568276
uncertain_significance,79167
likely_benign,40553
benign,21468
pathogenic,8987
benign&likely_benign,7331
likely_benign&benign,6530
uncertain_significance&likely_benign,4238
likely_pathogenic,4024
likely_benign&uncertain_significance,3979


Some variants have multiple ClinVar significance levels, joined with `&`.


In [31]:
# Among the ClinVar significance strings that mention 'pathogenic' but are
# neither conflicting nor benign, split multi-valued entries on '&' and total
# the variant counts for each individual significance term.
annot_CLIN_SIG_cnt_df %>%
    filter(
        str_detect(CLIN_SIG, 'pathogenic'),
        !str_detect(CLIN_SIG, 'conflicting_interpretation'),
        !str_detect(CLIN_SIG, 'benign')
    ) %>%
    # One row per term, regardless of how many terms an entry carries; this
    # avoids a fixed number of helper columns that would drop extra terms.
    separate_rows(CLIN_SIG, sep = '&') %>%
    group_by(CLIN_SIG) %>%
    summarise(n = sum(n), .groups = 'drop') %>%
    arrange(-n)


CLIN_SIG,n
<chr>,<int>
pathogenic,12673
likely_pathogenic,7212
uncertain_significance,1909
pathogenic/likely_pathogenic,1015
not_provided,478
risk_factor,74
drug_response,47
other,45
affects,14
protective,13
