In [1]:
suppressPackageStartupMessages(require(tidyverse))
suppressPackageStartupMessages(require(data.table))


## Imputation v3 dataset variant QC
#### Yosuke Tanigawa (ytanigaw@stanford.edu)
#### 2019/10/14

This notebook describes the variant QC applied to the imputation dataset.

```
97,059,329 Variants on the imputation dataset
 | 
 | MAF >= 0.01
 | Imputation quality >= 0.7
 |  
 | where MAF and imputation quality are obtained from the UKB data showcase
 | (http://biobank.ctsu.ox.ac.uk/crystal/refer.cgi?id=1967)
 | 
 | Note, MAF is computed for the entire 500k (by UKB)
 | 
10,061,256
 | 
 | Biallelic only
 |
10,028,119
 |
 | The variant position is not present on the genotyping array dataset
 |
 9,354,600
```


In [2]:
# Project root directory; the paths used in this notebook are built from it.
ukb_dir <- '/oak/stanford/groups/mrivas/ukbb/24983'

# Path prefix of the 'cal' fileset; its variant table has a '.pvar' suffix.
cal_pfile <- file.path(ukb_dir, 'cal/pgen/ukb24983_cal_cALL_v2_hg19')

# Read the variant table and rename the '#CHROM' column to 'CHROM' so that it
# can be referenced as a plain column name.
cal_df <- rename(
    fread(paste0(cal_pfile, '.pvar')),
    CHROM = `#CHROM`
)


In [3]:
# zstd-compressed, tab-separated summary table of the imputed variants.
imp_mfi_file   <- file.path(ukb_dir, 'imp', 'mfi', 'ukb_mfi_v3.tsv.zst')

# Decompress on the fly and pre-filter with awk so that only the rows passing
# the thresholds are loaded into R:
#   NR > 1               skip the first line (presumably the header row)
#   $NF >= 0.7           last column (named INFO below) is at least 0.7
#   0.01 <= $4 <= 0.99   4th column (named AF_A1 below) is within [0.01, 0.99]
# awk emits no header line, so the table is read with header = FALSE and the
# column names are supplied explicitly instead of relying on fread's header
# auto-detection. The separator is pinned to the tab that awk splits on, and
# the file path is shell-quoted before being spliced into the command.
imp_df <- fread(
    cmd = paste0(
        'zstdcat ', shQuote(imp_mfi_file),
        " | awk -v FS='\t' '(NR>1 && $NF>=0.7 && 0.01 <= $4 && $4 <= 0.99)'"
    ),
    sep = '\t',
    header = FALSE,
    col.names = c('ID', 'UKB_VAR_ID', 'ORIGINAL_VAR_ID', 'AF_A1', 'INFO')
)


In [8]:
# Number of rows and columns remaining after the awk pre-filter.
dim(imp_df)

In [55]:
# Collect the positions that appear more than once in the filtered table.
# ID is split on ':' into CHROM / POS / REF / ALT (the original ID column is
# kept), rows are counted per (CHROM, POS), and every pair seen more than once
# is returned as a 'CHROM:POS' string.
multi_allelic_pos <- imp_df %>%
    separate(
        ID, c("CHROM", "POS", "REF", "ALT"),
        sep = ':', remove = FALSE
    ) %>%
    count(CHROM, POS) %>%
    filter(n > 1) %>%
    mutate(CHR_POS = paste(CHROM, POS, sep = ':')) %>%
    pull(CHR_POS)


In [57]:
# Number of positions flagged as appearing more than once.
multi_allelic_pos %>% length()


In [56]:
# Keep only the variants whose position is not in multi_allelic_pos.
# ID is split on ':' into CHROM / POS / REF / ALT (the original ID column is
# kept) and a 'CHROM:POS' key is added for the membership test; the key is
# left in the result as the CHR_POS column.
imp_biallelic_df <- imp_df %>%
    separate(
        ID, c("CHROM", "POS", "REF", "ALT"),
        sep = ':', remove = FALSE
    ) %>%
    mutate(CHR_POS = paste(CHROM, POS, sep = ':')) %>%
    filter(!CHR_POS %in% multi_allelic_pos)


In [58]:
# Number of rows and columns after removing the multi-allelic positions.
dim(imp_biallelic_df)


In [59]:
# 'CHROM:POS' key for every row of cal_df, as a plain character vector.
cal_pos <- with(cal_df, paste(CHROM, POS, sep = ':'))


In [60]:
# Number of position keys taken from cal_df.
cal_pos %>% length()


In [61]:
# Drop every variant whose 'CHROM:POS' key also occurs in cal_pos.
imp_biallelic_non_cal_df <- filter(
    imp_biallelic_df,
    !(CHR_POS %in% cal_pos)
)


In [62]:
# Number of rows and columns in the final variant set.
dim(imp_biallelic_non_cal_df)


In [63]:
# Write the final variant set as a tab-separated file, restricted to the
# columns of imp_df (the helper columns added for filtering are left out).
fwrite(
    select(imp_biallelic_non_cal_df, colnames(imp_df)),
    file = file.path(
        ukb_dir, 'imp', 'mfi', 'ukb_mfi_v3.info.maf.biallelic.noncal.tsv'
    ),
    sep = '\t'
)
