In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# Root directory that every path below is built from.
ukb_d <- '/scratch/groups/mrivas/ukbb24983'

# Output: the combined (array + exome) annotation table written at the end.
out_f <- file.path(
    ukb_d,
    'array-exome-combined/annotation/20201216/ukb24983_cal_hla_cnv_exomeOQFE.annotation.tsv'
)

# Inputs: the per-dataset annotation tables ...
array_f <- file.path(
    ukb_d,
    'array-combined/annotation/annotation_20201012/ukb24983_cal_hla_cnv.annot_20201023.tsv.gz'
)
exome_f <- file.path(
    ukb_d,
    'exome/annotation/20201025_exome_oqfe_2020/ukb24983_exomeOQFE.annotation.20201217.tsv.gz'
)

# ... and the pvar (variant list) of the combined dataset.
pvar_f <- file.path(
    ukb_d,
    'array-exome-combined/pgen/ukb24983_cal_hla_cnv_exomeOQFE.pvar.gz'
)


In [3]:
get_POS_total <- function(df){
    # Place every variant on one linear axis (chr1-22, X, Y, MT laid end to end),
    # which is convenient for genome-wide plots.
    # original: https://github.com/rivas-lab/ukbb-tools/blob/master/17_annotation/20201012_array-combined/7_finalize.ipynb
    #
    # Args:
    #   df: a data frame with (at least) the columns CHROM, POS and ID.
    # Returns:
    #   a data frame with the columns ID and POS_total, where POS_total is POS
    #   plus the summed lengths of all chromosomes ordered before the variant's.

    # Plotting order of the chromosomes; c() coerces 1:22 to character labels.
    chrom_rank <- data.frame(
        CHROM_X = c(1:22, 'X', 'Y', 'MT'),
        CHROM_order = 1:25,
        stringsAsFactors = FALSE
    )

    # 'XY' variants are placed on the X chromosome.
    variants <- mutate(
        select(df, CHROM, POS, ID),
        CHROM_X = if_else(CHROM == 'XY', 'X', CHROM)
    )

    # A chromosome's "length" is the largest POS observed on it in `df`;
    # its offset is the total length of the chromosomes ordered before it.
    offsets <- variants %>%
        group_by(CHROM_X) %>%
        summarise(chr_len = max(POS), .groups = 'drop') %>%
        left_join(chrom_rank, by = 'CHROM_X') %>%
        arrange(CHROM_order) %>%
        mutate(CHROM_tot = cumsum(as.numeric(chr_len)) - chr_len) %>%
        select(CHROM_X, CHROM_tot)

    # Attach the offset to each variant and shift its position.
    offsets %>%
        left_join(variants, by = 'CHROM_X') %>%
        mutate(POS_total = POS + CHROM_tot) %>%
        select(ID, POS_total)
}

In [4]:
# read pvar file
# '#CHROM' is read as character (the code below compares it against labels
# such as 'XY') and renamed to CHROM. `sort_order` stores the row order of the
# pvar file so that it can be restored after the joins below.
# seq_len(n()) is used instead of 1:n() because 1:0 is c(1, 0), which would
# make mutate() fail on an empty table.
pvar_df <- pvar_f %>%
fread(colClasses = c('#CHROM'='character')) %>%
rename('CHROM'='#CHROM') %>%
mutate(sort_order = seq_len(n()))


In [5]:
# Split the variant IDs by genotype data source:
# everything that is not 'exome200k' is taken from the array annotation, ...
ID_array <- pull(filter(pvar_df, geno_data_source != 'exome200k'), ID)

# ... and the 'exome200k' variants are taken from the exome annotation.
ID_exome <- pull(filter(pvar_df, geno_data_source == 'exome200k'), ID)


In [6]:
# read & filter array
# Every column is read as character. The table is read in full (no `nrows`
# cap): reading only the first few rows would silently drop almost all array
# variants from the combined annotation written to `out_f`.
# Only the variants listed in ID_array are kept.
array_df <- array_f %>%
fread(colClasses = 'character') %>%
rename('CHROM'='#CHROM') %>%
filter(ID %in% ID_array)


In [10]:
# read & filter exome
# Every column is read as character. The table is read in full (no `nrows`
# cap): reading only the first few rows would silently drop almost all exome
# variants from the combined annotation written to `out_f`.
# Only the variants listed in ID_exome are kept.
exome_df <- exome_f %>%
fread(colClasses = 'character') %>%
rename('CHROM'='#CHROM') %>%
filter(ID %in% ID_exome) %>%
# Swap the coordinate columns: the file's CHROM/POS/REF/ALT get an `_hg38`
# suffix and the `*_hg19` columns take over the plain names.
rename(
    'CHROM_hg38'='CHROM', 'POS_hg38'='POS', 'REF_hg38'='REF', 'ALT_hg38'='ALT',
    'CHROM'='CHROM_hg19', 'POS'='POS_hg19', 'REF'='REF_hg19', 'ALT'='ALT_hg19'
)


In [11]:
# rows x columns of the filtered exome annotation
dim(exome_df)

In [12]:
# rows x columns of the filtered array annotation
dim(array_df)

In [13]:
# number of column names shared by the two annotation tables
length(intersect(colnames(array_df), colnames(exome_df)))


In [14]:
# columns present only in the array annotation
colnames(array_df) %>% setdiff(colnames(exome_df))


In [15]:
# columns present only in the exome annotation
colnames(exome_df) %>% setdiff(colnames(array_df))


In [21]:
# combine
# Coordinates, alleles, data source and sort order come from the pvar table;
# the remaining annotation columns come from the array / exome tables,
# matched on ID. local() keeps the two intermediate tables out of the
# global environment.
combined_df <- local({
    # Array side: drop the columns supplied by pvar_df (and POS_total, which
    # is recomputed afterwards), align the HWE column name, make MAF numeric.
    array_part <- array_df %>%
        select(-c(CHROM, POS, REF, ALT, geno_data_source, POS_total)) %>%
        rename('UKB_white_british_hwe_p' = 'hwe_p') %>%
        mutate(UKB_white_british_MAF = as.numeric(UKB_white_british_MAF))

    # Exome side: derive the minor allele frequency as min(AF, 1 - AF).
    exome_part <- exome_df %>%
        select(-c(CHROM, POS, REF, ALT, liftOver_unmapped_reason)) %>%
        mutate(
            UKB_white_british_MAF = pmin(
                1 - as.numeric(UKB_white_british_AF),
                as.numeric(UKB_white_british_AF)
            )
        )

    pvar_df %>%
        select(CHROM, POS, ID, REF, ALT, geno_data_source, sort_order) %>%
        inner_join(bind_rows(array_part, exome_part), by='ID')
})


In [23]:
# Add the linear plotting coordinate (POS_total), put the rows back into the
# order of the pvar file, and drop the helper column used for sorting.
full_df <- combined_df %>%
    left_join(get_POS_total(combined_df), by = 'ID') %>%
    arrange(sort_order) %>%
    select(-sort_order)


In [27]:
# Write the combined annotation as an unquoted, tab-separated file whose
# first column is named '#CHROM' again; missing values are written as "NA".
fwrite(
    rename(full_df, '#CHROM' = 'CHROM'),
    out_f,
    sep = '\t', na = "NA", quote = FALSE
)


In [28]:
# Show the path of the annotation file written by the previous cell.
out_f