In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [48]:
# input files
# var_qc_f: variant QC table; loaded as var_qc_df, the table every annotation is
#   joined onto (its row order is restored after the join).
# cal_annot_f / cnv_annot_f: per-variant annotation tables (file names suggest
#   VEP + LOFTEE output -- confirm); the two are stacked into one table.
# ld_indep_f: table with an '#ID' column; IDs listed in it get ld_indep = TRUE.
var_qc_f    <- '/oak/stanford/groups/mrivas/ukbb24983/array-combined/annotation/annotation_20201012/ukb24983_cal_hla_cnv.var_QC.tsv.gz'
cal_annot_f <- '/oak/stanford/groups/mrivas/ukbb24983/cal/annotation_20201002/ukb24983_cal_cALL_v2_hg19.vep101-loftee.Csq.tsv.gz'
cnv_annot_f <- '/oak/stanford/groups/mrivas/ukbb24983/cnv/annotation_20201003/cnv.vep101-loftee.20201009.Csq.tsv.gz'
ld_indep_f  <- '/oak/stanford/groups/mrivas/ukbb24983/array-combined/annotation/ld_indep_20201015/ukb24983_cal_hla_cnv.white_british.bool.prune.in.tsv.gz'

# output
# var_annot_f: tab-separated table with every column of the merged data.
# var_annot_compact_f: tab-separated table restricted to the curated columns
#   (`cols`) plus HGVSp.
var_annot_f <- '/oak/stanford/groups/mrivas/ukbb24983/array-combined/annotation/annotation_20201012/ukb24983_cal_hla_cnv.annot_20201023.tsv'
var_annot_compact_f <- '/oak/stanford/groups/mrivas/ukbb24983/array-combined/annotation/annotation_20201012/ukb24983_cal_hla_cnv.annot_compact_20201023.tsv'


In [2]:
cat_or_zcat <- function(f){
    # Choose the shell command that streams file `f` to stdout, based on its
    # suffix: '.zst' -> 'zstdcat', '.gz' -> 'zcat', anything else -> 'cat'.
    # Vectorised over `f`.
    is_zst <- endsWith(f, '.zst')
    is_gz  <- endsWith(f, '.gz')
    plain_or_gz <- ifelse(is_gz, 'zcat', 'cat')
    ifelse(is_zst, 'zstdcat', plain_or_gz)
}

fread_CHROM <- function(f){
    # Read a plain, gzip- or zstd-compressed table that has a '#CHROM' column.
    #
    # Args:
    #   f: path to the file; the suffix decides which command decompresses it
    #      (see cat_or_zcat).
    # Returns:
    #   The table read by fread, with '#CHROM' read as character and renamed
    #   to 'CHROM'.

    # The path is handed to a shell, so quote it: an unquoted path breaks on
    # spaces or shell metacharacters. path.expand() keeps a leading '~'
    # working, since the shell will not expand it inside quotes.
    cmd <- paste(cat_or_zcat(f), shQuote(path.expand(f)))
    fread(cmd=cmd, colClasses = c('#CHROM'='character')) %>%
        rename('CHROM'='#CHROM')
}


In [93]:
get_POS_total <- function(df){
    # Compute each variant's location on one linear coordinate system
    # (chr1-22, X, Y, MT laid end to end) for plotting.
    #
    # Args:
    #   df: data frame with columns CHROM, POS and ID.
    # Returns:
    #   A data frame with columns ID and POS_total, where POS_total is POS plus
    #   the summed lengths of the chromosomes ordered before the variant's own.
    #   Rows follow chromosome order rather than the input order.

    # 'XY' is folded into 'X' so both share a single offset.
    CHROM_POS_df <- df %>%
        select(CHROM, POS, ID) %>%
        mutate(CHROM_X = if_else(CHROM == 'XY', 'X', CHROM))

    # Plotting order of the chromosomes. Labels outside this list get an NA
    # order (arrange() should place them last -- verify if such labels occur).
    CHROM_order_df <- data.frame(
        CHROM_X = c(1:22, 'X', 'Y', 'MT'),
        CHROM_order = 1:25,
        stringsAsFactors = FALSE
    )

    # A chromosome's "length" is the largest POS observed on it in `df`;
    # CHROM_tot is the total length of everything ordered before it.
    CHROM_tot_df <- CHROM_POS_df %>%
        group_by(CHROM_X) %>%
        summarise(chr_len = max(POS), .groups = 'drop') %>%
        left_join(CHROM_order_df, by='CHROM_X') %>%
        arrange(CHROM_order) %>%
        # as.numeric: presumably guards the running sum against integer overflow
        mutate(CHROM_tot = cumsum(as.numeric(chr_len)) - chr_len) %>%
        select(CHROM_X, CHROM_tot)

    CHROM_tot_df %>%
        left_join(CHROM_POS_df, by='CHROM_X') %>%
        mutate(POS_total = POS + CHROM_tot) %>%
        select(ID, POS_total)
}


In [20]:
# Load the input tables. The first three have a '#CHROM' column and go through
# fread_CHROM; the LD-independent list has an '#ID' column, which is read as
# character and renamed to 'ID'.
var_qc_df    <- fread_CHROM(var_qc_f)
cal_annot_df <- fread_CHROM(cal_annot_f)
cnv_annot_df <- fread_CHROM(cnv_annot_f)
ld_indep_df  <- rename(
    fread(ld_indep_f, colClasses = c('#ID'='character')),
    'ID'='#ID'
)


In [30]:
# Stack the CNV annotation on top of the array annotation. The CNV table first
# loses pvar_order and has its POS_s / POS_e columns prefixed with 'CNV_'.
cal_cnv_annot_df <- bind_rows(
    cnv_annot_df %>%
        select(-pvar_order) %>%
        rename('CNV_POS_s'='POS_s', 'CNV_POS_e'='POS_e'),
    cal_annot_df
)


In [95]:
# Build the merged table: every row of var_qc_df, with the annotation columns,
# an ld_indep flag and the linear plotting coordinate POS_total attached.
full_df <- cal_cnv_annot_df %>%
    # Drop the annotation's own coordinate/allele columns so the joined table
    # carries var_qc_df's copies only.
    select(-CHROM, -POS, -REF, -ALT) %>%
    right_join(
        # sort_order remembers var_qc_df's row order so it can be restored
        # after the join. seq_len(n()) is also correct for a zero-row table,
        # where 1:n() would give c(1, 0).
        var_qc_df %>%
            mutate(sort_order = seq_len(n())),
        by='ID'
    ) %>%
    # TRUE when the variant ID appears in the LD-independent list.
    mutate(ld_indep = ID %in% (ld_indep_df$ID)) %>%
    arrange(sort_order) %>%
    select(-sort_order) %>%
    left_join(get_POS_total(var_qc_df), by='ID')


In [96]:
# Show the number of rows and columns of the merged table.
dim(full_df)

In [97]:
# Curated columns, in output order. They lead the full table and, together
# with HGVSp, make up the compact table.
cols <- c(
    # variant identity
    'CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'POS_total',
    # consequence annotation
    'Allele', 'Csq', 'Consequence', 'SYMBOL', 'Gene',
    # LD flag and data source
    'ld_indep', 'geno_data_source', 'array',
    # CNV coordinates
    'CNV_POS_s', 'CNV_POS_e',
    # QC metrics
    'UKB_white_british_MAF', 'hwe_p', 'mgi_notes',
    'f_miss', 'f_miss_UKBB', 'f_miss_UKBL',
    # loss-of-function fields
    'LoF', 'LoF_filter', 'LoF_flags', 'LoF_info'
)


In [98]:
# Write the compact annotation table: the curated `cols` plus HGVSp, as
# tab-separated text with 'CHROM' renamed to '#CHROM' in the header.
# quote=FALSE rather than quote=F: F is an ordinary variable that can be
# reassigned, FALSE is a reserved word.
full_df %>%
    select(all_of(c(cols, 'HGVSp'))) %>%
    rename('#CHROM' = 'CHROM') %>%
    fwrite(var_annot_compact_f, sep='\t', na = "NA", quote=FALSE)


In [99]:
# Write the full annotation table: the curated `cols` first, then every
# remaining column in its existing order, as tab-separated text with 'CHROM'
# renamed to '#CHROM' in the header.
# quote=FALSE rather than quote=F: F is an ordinary variable that can be
# reassigned, FALSE is a reserved word.
full_df %>%
    select(all_of(cols), everything()) %>%
    rename('#CHROM' = 'CHROM') %>%
    fwrite(var_annot_f, sep='\t', na = "NA", quote=FALSE)


In [100]:
# List the column names in the order used for the full output table:
# the curated `cols` first, then all remaining columns.
colnames(select(full_df, all_of(cols), everything()))