## compare `cal` and `imp`

We compare the variants in the array genotype calls (`cal`, v2, hg19) with the variants in the imputed dataset (`imp`, v3, biallelic variants after the MAF 1% filter) and measure the overlap between the two sets, matching variants by genomic coordinates (chromosome and position).




In [1]:
# Attach the packages used throughout the notebook.
# `library()` is used instead of `require()` so that a missing package stops
# the notebook here with a clear error rather than returning FALSE and
# failing later at the first call that needs it.
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(data.table))


In [2]:
# Root directory of the dataset on the cluster file system.
ukb_dir <- "/oak/stanford/groups/mrivas/ukbb/24983"

# Directory with the per-chromosome imputed variant files (also used as the
# output location for the joined table).
imp_dir <- file.path(ukb_dir, "imp/pgen/maf1")

# Path prefixes (no extension) of the array-call, HLA and CNV file sets;
# the `.pvar` suffix is appended when the variant tables are read.
cal_pfile <- file.path(ukb_dir, "cal/pgen/ukb24983_cal_cALL_v2_hg19")
hla_pfile <- file.path(ukb_dir, "hla/pgen/ukb_hla_v3")
cnv_pfile <- file.path(ukb_dir, "cnv/pgen/cnv")


In [3]:
# Read the variant table (`<pfile>.pvar`) of a file set.
#
# The `#CHROM` header is renamed to `CHROM`. The column is forced to
# character so that its type does not depend on which chromosome labels
# happen to be present in the file; this matches the imputed table, which
# also reads CHROM as character, and keeps joins on CHROM type-compatible.
#
# pfile: path prefix of the file set (without the `.pvar` extension).
# Returns the variant table with a character `CHROM` column.
read_pvar <- function(pfile) {
  fread(
    paste0(pfile, ".pvar"),
    colClasses = c("#CHROM" = "character")
  ) %>%
    rename("CHROM" = "#CHROM")
}

cal_df <- read_pvar(cal_pfile)

hla_df <- read_pvar(hla_pfile)

cnv_df <- read_pvar(cnv_pfile)


In [4]:
# Build the imputed variant table by reading one compressed pvar file per
# chromosome label (1-22, X, XY) and stacking the results row-wise.
# Each file is decompressed on the fly with `zstdcat`, and `sed` rewrites
# `#CHROM` to `CHROM` before `fread` parses the stream. CHROM is read as
# character.
imp_df <- c(1:22, "X", "XY") %>%
  lapply(function(chrom) {
    pvar_zst <- file.path(
      imp_dir,
      paste0("ukb24983_imp_chr", chrom, "_v3_maf1.pvar.zst")
    )
    fread(
      cmd = paste0("zstdcat ", pvar_zst, ' | sed -e "s/#CHROM/CHROM/g"'),
      colClasses = c(CHROM = "character")
    )
  }) %>%
  bind_rows()


In [15]:
# Match imputed and array variants by chromosome and position.
# The non-key columns of each table get an `_imp` / `_cal` suffix first so
# both versions of ID/REF/ALT are kept side by side in the result
# (imputed columns first, then array columns).
merged_df <- inner_join(
  rename(
    imp_df,
    ID_imp = ID, REF_imp = REF, ALT_imp = ALT, INFO_imp = INFO
  ),
  rename(
    cal_df,
    ID_cal = ID, REF_cal = REF, ALT_cal = ALT
  ),
  by = c("CHROM", "POS")
)


In [16]:
# Report (rows, columns) of the imputed table, the array table and their join.
print(dim(imp_df))
print(dim(cal_df))
print(dim(merged_df))

[1] 10231518        6
[1] 805426      5
[1] 673909      9


In [19]:
# Save the joined table as a tab-separated file next to the imputed data.
fwrite(
  merged_df,
  file = file.path(imp_dir, "ukb24983_cal_v2_hg19_imp_v3_maf1.join.tsv"),
  sep = "\t"
)

In [22]:
# Size of the subset of matched positions where the two datasets disagree on
# at least one allele, i.e. REF and ALT are not both identical.
dim(
  filter(
    merged_df,
    !(REF_imp == REF_cal & ALT_imp == ALT_cal)
  )
)

In [26]:
# Number of matched variants per chromosome.
count(merged_df, CHROM)

CHROM,n
<chr>,<int>
1,51546
10,32858
11,32274
12,31217
13,22684
14,21272
15,20571
16,23022
17,21341
18,19644
