# reference data prep for the combined dataset of array and exome

In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# Evaluate the shared parameter file as R code in the current environment.
# The file carries a .sh extension but is parsed by R here, so it must
# contain only syntax that R accepts.
# Presumably this is where `data_dir` and `annotation_array_exome_f`
# (used below to build the annotation path) come from -- TODO confirm.
# NOTE(review): the file name is spelled 'paramters'; confirm it matches the
# file on disk before "correcting" it.
source(file = "paramters.sh")


In [3]:
# Output: combined array + exome table (the name carries an hg19 tag).
out_f <- "../ukb_cal-exm-hg19-consequence_wb_maf_gene_ld_indep_mpc_pli.tsv.gz"

# Inputs: per-variant tables for the array and the exome data.
in_array_f <- "../ukb_cal-consequence_wb_maf_gene_ld_indep_mpc_pli_qc.tsv.gz"
in_exome_f <- "../ukb_exm_oqfe-consequence_wb_maf_gene_ld_indep_mpc_pli.tsv.gz"

# Annotation table for the combined array/exome variant set.
# `data_dir` and `annotation_array_exome_f` are not defined in this cell;
# they must already exist in the session when this line runs.
annotation_f <- file.path(data_dir, annotation_array_exome_f)


In [12]:
# Load the array and exome variant tables.
#   * `ld_indep` is read as character rather than letting fread guess a type.
#   * For every column whose name starts with '#', the first '#' is removed.
in_array_df <- fread(in_array_f, colClasses = c(ld_indep = "character")) %>%
    rename_with(function(nm) str_replace(nm, "#", ""), starts_with("#"))

in_exome_df <- fread(in_exome_f, colClasses = c(ld_indep = "character")) %>%
    rename_with(function(nm) str_replace(nm, "#", ""), starts_with("#"))


In [5]:
# Load only the columns needed from the annotation table: the primary
# coordinates and ID, the genotype data source label, and the *_hg38
# coordinates. A leading '#' in a column name ('#CHROM') is stripped after
# reading.
annotation_df <- fread(
    annotation_f,
    select = c(
        "#CHROM", "POS", "ID", "REF", "ALT",
        "geno_data_source",
        "CHROM_hg38", "POS_hg38", "REF_hg38", "ALT_hg38"
    )
) %>%
    rename_with(function(nm) str_replace(nm, "#", ""), starts_with("#"))


In [17]:
# Re-key the exome table from hg38-based variant IDs to the annotation's
# primary CHROM:POS:REF:ALT coordinates.
#   1. Keep the annotation rows labelled 'exome200k' and build a join key `V`
#      from their *_hg38 columns.
#   2. Inner-join onto the exome table by `V`; exome rows without a matching
#      annotation row are dropped.
#   3. Overwrite `V` with the key built from CHROM/POS/REF/ALT and then drop
#      those helper columns again.
exome_hg19_df <- annotation_df %>%
    filter(geno_data_source == "exome200k") %>%
    mutate(V = paste(CHROM_hg38, POS_hg38, REF_hg38, ALT_hg38, sep = ":")) %>%
    select(V, CHROM, POS, REF, ALT) %>%
    inner_join(in_exome_df, by = "V") %>%
    mutate(
        V = paste(CHROM, POS, REF, ALT, sep = ":"),
        # In the combined array-exome dataset the LD-independent set is taken
        # from the array only, so every exome variant is flagged 'False'
        # (a string, because `ld_indep` was read as character).
        ld_indep = "False"
    ) %>%
    select(-c(CHROM, POS, REF, ALT))


In [22]:
# Stack the re-keyed exome rows and the array rows, then put them in the row
# order of the annotation table.
#   * `sort_order` records each annotation row's position. `row_number()` is
#     used instead of `1:n()` because `1:n()` evaluates to c(1, 0) on an empty
#     table, which makes `mutate()` fail.
#   * The inner join keeps only variants whose `V` appears in the annotation
#     table, so the result can have fewer rows than the two inputs combined.
#   * NOTE(review): if `V` is repeated in annotation_df (or in the stacked
#     table), the join multiplies the matching rows -- confirm `V` is unique.
combined_df <- annotation_df %>%
    mutate(V = paste(CHROM, POS, REF, ALT, sep = ":"), sort_order = row_number()) %>%
    select(sort_order, V) %>%
    inner_join(bind_rows(exome_hg19_df, in_array_df), by = "V") %>%
    arrange(sort_order) %>%
    select(-sort_order)


In [25]:
# Report (rows, columns) of the two inputs and of the combined table.
print(dim(in_array_df))
print(dim(in_exome_df))
print(dim(combined_df))

[1] 651935      8
[1] 17582164        8
[1] 18118191        8


In [26]:
# Row count the combined table would have if every array and exome row had
# been kept; compare against nrow(combined_df) to see how many were dropped.
# Computed from the loaded tables so it stays correct when the inputs change.
nrow(in_array_df) + nrow(in_exome_df)

In [29]:
# Write the combined table as an uncompressed, tab-separated file.
# The target path is `out_f` with its trailing ".gz" removed; the dot is
# escaped so that only a literal ".gz" suffix is stripped. Missing values are
# written as "NA" and fields are never quoted.
combined_df %>%
fwrite(str_replace(out_f, '\\.gz$', ''), sep='\t', na = "NA", quote=FALSE)


In [30]:
# Build (but do not run) the shell command that compresses the written file
# with bgzip at level 9. The dot in the pattern is escaped so that only a
# literal ".gz" suffix is removed from `out_f`.
sprintf('bgzip -l9 %s', str_replace(out_f, '\\.gz$', ''))
