In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# ---- input paths ----
# pvar file of the exome OQFE 2020 dataset (zstd-compressed).
pvar_f <- "/oak/stanford/groups/mrivas/ukbb24983/exome/pgen/oqfe_2020/ukb24983_exomeOQFE.pvar.zst"
# Directory that holds the remaining inputs; the outputs are written here too.
data_d <- "/scratch/groups/mrivas/ukbb24983/exome/annotation/20201025_exome_oqfe_2020"
af_hwe_f <- file.path(data_d, "ukb24983_exomeOQFE.afreq_hwe.20201025.pvar.zst")
vep_f <- file.path(data_d, "UKBexomeOQFE.vep101.tsv.gz")
liftOver_f <- file.path(data_d, "UKBexomeOQFE.hg19.tsv.gz")
# Consequence-group table, resolved relative to the parent of the working directory.
vep_csq_f <- file.path("..", "VEP_consequence_group.tsv")

# ---- output paths ----
# Full annotation table and its compact (selected columns only) counterpart.
annot_f <- file.path(data_d, "ukb24983_exomeOQFE.annotation.tsv")
annot_compact_f <- file.path(data_d, "ukb24983_exomeOQFE.annotation.compact.tsv")



In [3]:
# Columns kept in the compact annotation table, in output order. The same
# vector also defines which columns lead the full annotation table.
compact_fields <- c(
    # variant coordinates / identifiers
    "CHROM", "POS", "ID", "REF", "ALT", "FILTER",
    # consequence annotation
    "Allele", "Csq", "Consequence", "SYMBOL", "Gene",
    # missingness, HWE and allele-frequency summaries
    "f_miss",
    "UKB_white_british_hwe_p",
    "UKB_white_british_AF",
    "UKB_AF",
    # hg19 coordinates and the liftOver failure reason
    "CHROM_hg19", "POS_hg19", "REF_hg19", "ALT_hg19",
    "liftOver_unmapped_reason"
)


In [33]:
# functions

# Pick the shell command that streams file `f` to stdout, based on its suffix:
# 'zstdcat' for '.zst', 'zcat' for '.gz', and plain 'cat' for anything else.
# Vectorised over `f`.
cat_or_zcat <- function(f) {
    is_zst <- endsWith(f, '.zst')
    is_gz <- endsWith(f, '.gz')
    gz_or_plain <- ifelse(is_gz, 'zcat', 'cat')
    ifelse(is_zst, 'zstdcat', gz_or_plain)
}

# Read a (possibly compressed) table whose header contains a '#CHROM' column.
#
# The file is streamed through a shell pipeline: it is written to stdout by the
# command chosen by cat_or_zcat(), `sed` strips a leading "chr" from every line,
# and (optionally) `head` keeps only the first `head_n` lines. The result is
# parsed with fread(), reading '#CHROM' as character, and that column is renamed
# to 'CHROM'.
#
# Args:
#   f:      path to the input file (single string). It is shell-quoted, so
#           paths containing spaces or shell metacharacters are safe.
#   select: passed through to fread(select=); NULL reads every column.
#           '#CHROM' must be among the columns read, since it is renamed below.
#   head_n: maximum number of lines (header included) taken from the stream,
#           or NULL to read the whole file.
#           NOTE(review): the default of 10000 preserves the original
#           hard-coded `head -n10000`, which truncates every input; confirm
#           whether the full files should be read (head_n = NULL) when the
#           final tables are produced.
#
# Returns: the table returned by fread(), with '#CHROM' renamed to 'CHROM'.
fread_CHROM <- function(f, select=NULL, head_n=10000){
    stopifnot(is.character(f), length(f) == 1L, !is.na(f))
    # Quote the path so that it reaches the shell as a single literal argument.
    cmd <- paste(cat_or_zcat(f), shQuote(f), "| sed -e 's/^chr//g'")
    if (!is.null(head_n)) {
        stopifnot(is.numeric(head_n), length(head_n) == 1L, !is.na(head_n), head_n >= 1)
        # '%.0f' avoids scientific notation (e.g. 1e+05) in the shell command.
        cmd <- paste(cmd, '| head -n', sprintf('%.0f', floor(head_n)))
    }
    fread(cmd=cmd, colClasses = c('#CHROM'='character'), select=select) %>%
        rename('CHROM'='#CHROM')
}


In [34]:
# Load the pvar table and the liftOver (hg19) table via fread_CHROM().
pvar_df <- fread_CHROM(pvar_f)
liftOver_df <- fread_CHROM(liftOver_f)


In [35]:
# Load the allele-frequency/HWE table and the VEP table via fread_CHROM().
af_hwe_df <- fread_CHROM(af_hwe_f)
vep_df <- fread_CHROM(vep_f)


In [36]:
# Load the '#Consequence' -> 'Csq' lookup table, dropping the '#' from the key
# column name so that it can be joined on 'Consequence'.
vep_csq_df <- fread(vep_csq_f, select = c('#Consequence', 'Csq'))
vep_csq_df <- rename(vep_csq_df, 'Consequence' = '#Consequence')


In [37]:
# Build the full annotation table. Every row of the pvar table is kept (left
# joins throughout) and the other tables are attached one after another:
#   1. VEP table, matched on position and alleles (no ID in the key);
#   2. consequence-group lookup, matched on 'Consequence';
#   3. allele-frequency/HWE table, matched on position, ID and alleles;
#   4. liftOver (hg19) table, matched on position, ID and alleles.
full_df <- pvar_df %>%
    left_join(vep_df, by = c('CHROM', 'POS', 'REF', 'ALT')) %>%
    left_join(vep_csq_df, by = 'Consequence') %>%
    left_join(af_hwe_df, by = c('CHROM', 'POS', 'ID', 'REF', 'ALT')) %>%
    left_join(liftOver_df, by = c('CHROM', 'POS', 'ID', 'REF', 'ALT'))


In [34]:
# Write the full annotation table as an unquoted TSV: the compact columns come
# first (in `compact_fields` order), followed by every remaining column in its
# existing order. 'CHROM' is renamed back to '#CHROM' for the header line, and
# missing values are written as "NA".
full_df %>%
    select(all_of(compact_fields), everything()) %>%
    rename('#CHROM' = 'CHROM') %>%
    # FALSE rather than F: `F` is an ordinary variable that can be reassigned.
    fwrite(annot_f, sep='\t', na = "NA", quote=FALSE)


In [35]:
# Write the compact annotation table as an unquoted TSV: only the columns
# listed in `compact_fields`, in that order. 'CHROM' is renamed back to
# '#CHROM' for the header line, and missing values are written as "NA".
full_df %>%
    select(all_of(compact_fields)) %>%
    rename('#CHROM' = 'CHROM') %>%
    # FALSE rather than F: `F` is an ordinary variable that can be reassigned.
    fwrite(annot_compact_f, sep='\t', na = "NA", quote=FALSE)


In [33]:
# Sanity check: number of rows and columns of the joined annotation table.
dim(full_df)

In [7]:
# Inspect the column names of the pvar table.
colnames(pvar_df)

In [8]:
# Inspect the column names of the liftOver table.
colnames(liftOver_df)

In [9]:
# Inspect the column names of the allele-frequency/HWE table.
colnames(af_hwe_df)

In [10]:
# Inspect the column names of the VEP table.
colnames(vep_df)