In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# Input files.
# ukb_d is the root directory used to build the array and exome annotation paths.
# ukb_d <- '/oak/stanford/groups/mrivas/ukbb24983'
ukb_d <- '/scratch/groups/mrivas/ukbb24983'

# Compact annotation table for the array (cal / hla / cnv) variants.
array_f <- file.path(
    ukb_d,
    'array-combined/annotation/annotation_20201012/ukb24983_cal_hla_cnv.annot_compact_20201023.tsv.gz'
)

# Compact annotation table for the exome variants.
exome_f <- file.path(
    ukb_d,
    'exome/annotation/20201025_exome_oqfe_2020/ukb24983_exomeOQFE.annotation.20201217.compact.tsv.gz'
)

# List of exome variant IDs flagged as duplicates.
# NOTE(review): this path is hard-coded under /oak rather than derived from ukb_d
# (/scratch) -- confirm that is intentional.
dup_f <- '/oak/stanford/groups/mrivas/ukbb24983/exome/qc/oqfe_2020/intermediate_files/ukb24983_exomeOQFE.duplicates.tsv.gz'


In [3]:
# Output file: the merged array + exome variant table written at the end of
# this notebook (rows are not sorted, hence the '.unsorted.pvar' suffix).
combined_f <- file.path(
    ukb_d,
    'array-exome-combined/pgen/merge_list_pvar/ukb24983_cal_hla_cnv_exomeOQFE.unsorted.pvar'
)


In [4]:
####################################################################
# functions
# Pick the shell command that streams file `f` as plain text, based on its
# suffix: '.zst' -> 'zstdcat', '.gz' -> 'zcat', anything else -> 'cat'.
# Vectorised over `f` (one command name per element).
cat_or_zcat <- function(f){
    is_zst <- endsWith(f, '.zst')
    is_gz  <- endsWith(f, '.gz')
    # Fallback used whenever the name is not zstd-compressed.
    gz_or_plain <- ifelse(is_gz, 'zcat', 'cat')
    ifelse(is_zst, 'zstdcat', gz_or_plain)
}

# Read a (possibly compressed) delimited table whose header contains a
# '#CHROM' column, and return it with that column renamed to 'CHROM'.
#
# Args:
#   f:      path to the file; the decompression command is chosen from the
#           suffix by cat_or_zcat().
#   select: optional vector of column names passed to fread(); NULL reads
#           every column. When given, it must use the on-disk name '#CHROM'.
#
# Returns:
#   The table read by fread(), with '#CHROM' read as character (so values
#   such as 'X' and '1' share one type) and renamed to 'CHROM'.
fread_CHROM <- function(f, select=NULL){
    # The path is interpolated into a shell command, so quote it: an unquoted
    # path containing spaces or shell metacharacters would be split or
    # interpreted by the shell. path.expand() keeps a leading '~' working,
    # since the shell does not expand it inside quotes.
    read_cmd <- paste(cat_or_zcat(f), shQuote(path.expand(f)))
    fread(cmd=read_cmd, colClasses = c('#CHROM'='character'), select=select) %>%
        rename('CHROM'='#CHROM')
}


In [5]:
# Load the table of duplicated exome variants; its ID column is used below
# to exclude those variants.
dup_df <- fread_CHROM(dup_f)


In [9]:
# Array annotation: keep only the variant key columns and the data-source tag.
array_df <- fread_CHROM(
    array_f,
    select=c('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'geno_data_source')
)

# Exome annotation. The native CHROM/POS/REF/ALT columns are moved to
# *_hg38 names and the *_hg19 columns take over the plain
# CHROM/POS/REF/ALT names, so that exome rows can be matched to array_df
# on those columns. Every row is tagged with geno_data_source = 'exome200k'.
exome_df <- mutate(
    rename(
        fread_CHROM(
            exome_f,
            select=c(
                '#CHROM', 'POS', 'ID', 'REF', 'ALT',
                'CHROM_hg19', 'POS_hg19', 'REF_hg19', 'ALT_hg19', 'liftOver_unmapped_reason'
            )
        ),
        # Both groups are renamed in a single call so the swap is simultaneous.
        'CHROM_hg38'='CHROM', 'POS_hg38'='POS', 'REF_hg38'='REF', 'ALT_hg38'='ALT',
        'CHROM'='CHROM_hg19', 'POS'='POS_hg19', 'REF'='REF_hg19', 'ALT'='ALT_hg19'
    ),
    geno_data_source = 'exome200k'
)


In [10]:
# Peek at the first rows of the array table.
head(array_df)

CHROM,POS,ID,REF,ALT,geno_data_source
<chr>,<int>,<chr>,<chr>,<chr>,<chr>
1,723307,rs28659788,C,G,cal
1,727841,rs116587930,G,A,cal
1,729632,rs116720794,C,T,cal
1,751314,1:723307-779322_-,N,+,cnv
1,751315,1:723307-779322_+,N,+,cnv
1,752721,rs3131972,A,G,cal


In [11]:
# Dimensions of the exome rows whose REF or ALT differs between the hg19
# and hg38 columns (rows where the comparison is NA are dropped by filter()).
dim(filter(exome_df, REF != REF_hg38 | ALT != ALT_hg38))

In [12]:
# Array ('cal' only) x exome overlap matched on position alone (CHROM, POS).
# Exome rows are limited to those with no liftOver failure reason and not
# listed in dup_df. REF/ALT/ID from each side get _array / _exome suffixes.
intersection_pos_only_df <- array_df %>%
    filter(geno_data_source == 'cal') %>%
    select(CHROM, POS, REF, ALT, ID) %>%
    inner_join(
        exome_df %>%
            filter(is.na(liftOver_unmapped_reason), !ID %in% dup_df$ID) %>%
            select(CHROM, POS, REF, ALT, ID),
        by=c('CHROM', 'POS'),
        suffix = c("_array", "_exome")
    )


In [13]:
# Array ('cal' only) x exome overlap matched on the full variant key
# (CHROM, POS, REF, ALT). Same exome filters as the position-only join;
# only ID is duplicated across sides, giving ID_array / ID_exome.
intersection_df <- array_df %>%
    filter(geno_data_source == 'cal') %>%
    select(CHROM, POS, REF, ALT, ID) %>%
    inner_join(
        exome_df %>%
            filter(is.na(liftOver_unmapped_reason), !ID %in% dup_df$ID) %>%
            select(CHROM, POS, REF, ALT, ID),
        by=c('CHROM', 'POS', 'REF', 'ALT'),
        suffix = c("_array", "_exome")
    )


In [14]:
# Compare the sizes of the position-only and exact-allele overlaps.
print(dim(intersection_pos_only_df))
print(dim(intersection_df))


[1] 140567      8
[1] 115002      6


In [15]:
# Cross-tabulate position matches by whether REF and/or ALT disagree.
count(intersection_pos_only_df, REF_array != REF_exome, ALT_array != ALT_exome)


REF_array != REF_exome,ALT_array != ALT_exome,n
<lgl>,<lgl>,<int>
False,False,115002
False,True,20623
True,False,195
True,True,4747


In [16]:
# Examples: same REF, different ALT.
head(
    filter(intersection_pos_only_df, REF_array == REF_exome, ALT_array != ALT_exome),
    3
)


CHROM,POS,REF_array,ALT_array,ID_array,REF_exome,ALT_exome,ID_exome
<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,871267,C,T,Affx-89007868,C,G,1:935887:C:G
1,881627,G,A,rs2272757,G,C,1:946247:G:C
1,889238,G,A,rs3828049,G,C,1:953858:G:C


In [17]:
# Examples: different REF, same ALT.
head(
    filter(intersection_pos_only_df, REF_array != REF_exome, ALT_array == ALT_exome),
    3
)


CHROM,POS,REF_array,ALT_array,ID_array,REF_exome,ALT_exome,ID_exome
<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,11140616,AAAAAAC,A,Affx-52345802,AAAAAACACACACAC,A,1:11080559:D:14
1,11561617,TC,T,Affx-80267250,TCCGCGG,T,1:11501560:D:6
1,13695819,GA,G,Affx-80267273,GAAA,G,1:13369361:D:3


In [18]:
# Examples: both REF and ALT differ.
head(
    filter(intersection_pos_only_df, REF_array != REF_exome, ALT_array != ALT_exome),
    3
)


CHROM,POS,REF_array,ALT_array,ID_array,REF_exome,ALT_exome,ID_exome
<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,874809,G,C,rs200584816,GCATGATCCCCCTCATCACCTCCCCAGCCA,G,1:939429:D:29
1,876498,GA,G,Affx-52325915,G,A,1:941118:G:A
1,889255,CGACA,C,Affx-89023000,C,T,1:953875:C:T


In [19]:
# Tabulate exome variants by (in the exact-allele overlap) x (not a duplicate).
count(exome_df, ID %in% (intersection_df$ID_exome), !ID %in% dup_df$ID)


ID %in% (intersection_df$ID_exome),!ID %in% dup_df$ID,n
<lgl>,<lgl>,<int>
False,False,326
False,True,17662622
True,True,115002


In [20]:
# Among exome variants that are neither in the overlap nor duplicates,
# tabulate liftOver status, missing CHROM, and whether CHROM is 1-22/X/Y.
count(
    filter(exome_df, !ID %in% (intersection_df$ID_exome), !ID %in% dup_df$ID),
    is.na(liftOver_unmapped_reason), is.na(CHROM), CHROM %in% c(1:22, 'X', 'Y')
)


is.na(liftOver_unmapped_reason),is.na(CHROM),"CHROM %in% c(1:22, ""X"", ""Y"")",n
<lgl>,<lgl>,<lgl>,<int>
False,True,False,1924
True,False,False,5
True,False,True,17660693


In [21]:
# Build the combined variant table: exome rows first, then every array row.
# Exome rows are kept only if they are not already in the exact-allele
# overlap with the array, are not listed as duplicates, have no liftOver
# failure reason, and sit on chromosomes 1-22, X or Y.
combined_unsorted_df <- bind_rows(
    exome_df %>%
        filter(
            !ID %in% (intersection_df$ID_exome),
            !ID %in% dup_df$ID,
            is.na(liftOver_unmapped_reason),
            CHROM %in% c(1:22, 'X', 'Y')
        ) %>%
        select(CHROM, POS, ID, REF, ALT, geno_data_source),
    array_df
)


In [26]:
# Number of variants contributed by each data source.
count(combined_unsorted_df, geno_data_source)


geno_data_source,n
<chr>,<int>
cal,805426
cnv,275180
exome200k,17660693
hla,362


In [27]:
# Overall size of the combined table.
dim(combined_unsorted_df)

In [28]:
# Attach a numeric chromosome rank (CHROM_order: 1-22, then X, Y, XY, MT as
# 23-26) to every variant and write the table to combined_f as a
# tab-separated file whose first column header is '#CHROM'.
# No sorting is done in this cell; CHROM_order is only written out as an
# extra column. Any CHROM value missing from the lookup gets NA for
# CHROM_order, which fwrite() emits as the literal string "NA".
combined_unsorted_df %>%
left_join(
    data.frame(
        CHROM = c(1:22, 'X', 'Y', 'XY', 'MT'),
        CHROM_order = 1:26,
        # Spell out FALSE: `F` is an ordinary variable that can be reassigned.
        stringsAsFactors=FALSE
    ), by='CHROM'
) %>%
rename('#CHROM' = 'CHROM') %>%
fwrite(combined_f, sep='\t', na = "NA", quote=FALSE)
