In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


# UK Biobank Exome OQFE 200k dataset variant-level QC

## Yosuke Tanigawa (ytanigaw@stanford.edu), 2020/12/22

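The QC criteria are described in `README.md`. This notebook applies them to the variant annotation table and writes the list of variants that pass every criterion.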


In [20]:
# Date stamp substituted into the output file names built with sprintf().
# NOTE: assigning to `version` shadows base R's `version` object in this session.
version <- "20201222"

In [27]:
# Input files ----

# Variant annotation table (gzip-compressed TSV).
annot_f <- "/oak/stanford/groups/mrivas/ukbb24983/exome/annotation/20201025_exome_oqfe_2020/ukb24983_exomeOQFE.annotation.20201217.compact.tsv.gz"
# Table whose `ID` column is used to flag duplicated variants.
dup_f <- "/oak/stanford/groups/mrivas/ukbb24983/exome/pgen/oqfe_2020/ukb24983_exomeOQFE.duplicates.tsv.gz"

# Output file ----

# Destination of the QC-passing variant list; `version` replaces the %s.
QC_pass_f <- sprintf(
    fmt = "/oak/stanford/groups/mrivas/ukbb24983/exome/qc/oqfe_2020/ukb24983_exomeOQFE.passQC.%s.tsv",
    version
)


In [10]:
# Load the duplicates table, reading the chromosome column as character and
# renaming it from '#CHROM' to 'CHROM'.
dup_df <- rename(
    fread(dup_f, colClasses = c("#CHROM" = "character")),
    CHROM = "#CHROM"
)


In [13]:
# Read only the annotation columns needed for QC and derive one logical flag
# per criterion. For every flag, TRUE means the variant PASSES that criterion
# (so `QC_duplicated == TRUE` means the variant is NOT in the duplicates table).
annot_df <- fread(
    annot_f,
    colClasses = c("#CHROM" = "character"),
    select = c(
        "#CHROM", "POS", "ID", "REF", "ALT",
        "FILTER", "f_miss", "UKB_white_british_hwe_p"
    )
) %>%
    rename(CHROM = "#CHROM") %>%
    mutate(
        # Pass when the variant ID is absent from the duplicates table.
        QC_duplicated = !(ID %in% dup_df$ID),
        # Pass when `f_miss` is below 0.1 (presumably the fraction of missing
        # genotype calls -- confirm against README.md).
        QC_sample_miss = f_miss < 0.1,
        # Pass when log10 of the HWE p-value is above -15. A missing p-value
        # yields NA here; downstream cells convert NA to FALSE.
        QC_WB_HWE_p = log10(UKB_white_british_hwe_p) > -15
    )


In [15]:
# Tabulate the variants by every observed combination of the three QC flags.
count(annot_df, QC_duplicated, QC_sample_miss, QC_WB_HWE_p)

QC_duplicated,QC_sample_miss,QC_WB_HWE_p,n
<lgl>,<lgl>,<lgl>,<int>
False,False,False,76
False,False,True,146
False,True,False,13
False,True,True,91
True,False,False,20157
True,False,True,150824
True,False,,8
True,True,False,24479
True,True,True,17582030
True,True,,126


In [18]:
# Collect the variants that fail at least one QC criterion. A missing flag is
# first converted to FALSE, so it counts as a failure.
annot_QC_fail_df <- annot_df %>%
    replace_na(list(
        QC_duplicated = FALSE,
        QC_sample_miss = FALSE,
        QC_WB_HWE_p = FALSE
    )) %>%
    # "Not all three pass" is the same as "at least one fails".
    filter(!(QC_duplicated & QC_sample_miss & QC_WB_HWE_p))


In [26]:
# Number of rows (failing variants) and columns in the QC-fail table.
dim(annot_QC_fail_df)

In [19]:
# Tabulate the failing variants by combination of QC flags (NA already
# replaced by FALSE in this table).
count(annot_QC_fail_df, QC_duplicated, QC_sample_miss, QC_WB_HWE_p)

QC_duplicated,QC_sample_miss,QC_WB_HWE_p,n
<lgl>,<lgl>,<lgl>,<int>
False,False,False,76
False,False,True,146
False,True,False,13
False,True,True,91
True,False,False,20165
True,False,True,150824
True,True,False,24605


In [25]:
# Draw an UpSet plot of the variants removed by QC, with one set per failed
# criterion, and save it as a PNG file in the current working directory.
png(
    file = sprintf('variant.QC.%s.UpSetR.png', version),
    width = 800, height = 600, units = "px", family = "Helvetica"
)
# The drawing is wrapped in tryCatch(..., finally = dev.off()) so the PNG
# device is closed even if building or rendering the plot fails; otherwise an
# error would leave the device open and later plots would be sent to this file.
# print() is called explicitly because the plot is rendered when printed, and
# top-level auto-printing does not apply inside tryCatch().
invisible(tryCatch(
    print(UpSetR::upset(
        UpSetR::fromList(list(
            # IDs of variants failing each criterion (flag is FALSE).
            'duplicated' = annot_QC_fail_df %>%
                filter(!QC_duplicated) %>%
                pull(ID),

            'missingness' = annot_QC_fail_df %>%
                filter(!QC_sample_miss) %>%
                pull(ID),

            'HWE p-value' = annot_QC_fail_df %>%
                filter(!QC_WB_HWE_p) %>%
                pull(ID)
        )),
        mainbar.y.label = "Number of removed variants",
        sets.x.label = "# variants", nsets = 20, nintersects = NA,
        text.scale = 1.8, order.by = "freq", show.numbers = "yes"
    )),
    finally = dev.off()
))


In [28]:
# Write the variants that pass all three QC criteria to `QC_pass_f` as a
# tab-separated file with columns #CHROM, POS, ID, REF, ALT.
annot_df %>%
    # Treat a missing flag as a failure, mirroring the QC-fail table.
    replace_na(list(
        QC_duplicated = FALSE,
        QC_sample_miss = FALSE,
        QC_WB_HWE_p = FALSE
    )) %>%
    filter(QC_duplicated & QC_sample_miss & QC_WB_HWE_p) %>%
    select(CHROM, POS, ID, REF, ALT) %>%
    # Restore the leading '#' on the chromosome column for the output header.
    rename("#CHROM" = "CHROM") %>%
    # Use FALSE rather than the reassignable alias `F`.
    fwrite(QC_pass_f, sep = "\t", na = "NA", quote = FALSE)


In [29]:
# Display the path of the QC-pass variant list for reference.
QC_pass_f
