In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [51]:
# input
# pvar_f: '.zst' variant (pvar) file; read below via fread_CHROM()
pvar_f <- '/oak/stanford/groups/mrivas/ukbb24983/exome/pgen/oqfe_2020/ukb24983_exomeOQFE.pvar.zst'
# data_d: annotation directory; the output files are written under it
data_d <- '/oak/stanford/groups/mrivas/ukbb24983/exome/annotation/20201025_exome_oqfe_2020'
# afreq_hwe_out_d: data_d with the '/oak/stanford/' prefix replaced by '/scratch/',
# plus an 'afreq_hwe' subdirectory; the afreq / hardy / gcount tables are read from here
afreq_hwe_out_d <- file.path(str_replace(data_d, '/oak/stanford/', '/scratch/'), 'afreq_hwe')
# output
# out_f: full per-variant table (pvar columns + AF, HWE p, genotype counts, f_miss)
out_f <- file.path(data_d, 'ukb24983_exomeOQFE.afreq_hwe.20201025.pvar')

# constants
# population labels; each one appears in an input file name and in the
# generated 'UKB_<pop>_<suffix>' column names
pops <- c('white_british', 'non_british_white', 'african', 's_asian', 'e_asian', 'related', 'others')


In [20]:
# functions

cat_or_zcat <- function(f){
    # Choose the command that streams `f` to stdout from its extension:
    # '.zst' -> 'zstdcat', '.gz' -> 'zcat', anything else -> 'cat'.
    # Vectorised over `f`.
    gz_or_plain <- ifelse(endsWith(f, '.gz'), 'zcat', 'cat')
    ifelse(endsWith(f, '.zst'), 'zstdcat', gz_or_plain)
}

fread_CHROM <- function(f, select=NULL){
    # Read a (possibly compressed) table whose header contains '#CHROM'.
    #
    # Args:
    #   f:      path to the file; it is streamed through the command chosen
    #           by cat_or_zcat().
    #   select: optional column selection forwarded to fread(). It has to
    #           include '#CHROM', because that column is typed and renamed below.
    # Returns:
    #   The table with '#CHROM' read as character and renamed to 'CHROM'.
    #
    # The path is shell-quoted so that spaces or shell metacharacters in `f`
    # cannot split or alter the command line handed to fread(cmd=).
    fread(
        cmd = paste(cat_or_zcat(f), shQuote(f)),
        colClasses = c('#CHROM'='character'),
        select = select
    ) %>% rename('CHROM'='#CHROM')
}

read_tbl <- function(pops, prefix, suffix, select=NULL){
    # Read one table per population label and stack them in long format.
    #
    # The file for label 'all' is '<prefix>.<suffix>'; every other label
    # reads '<prefix>.<pop>.<suffix>'. Each table gets a `population`
    # column holding its label before the tables are row-bound.
    read_one <- function(pop){
        if (pop == 'all') {
            path <- sprintf('%s.%s', prefix, suffix)
        } else {
            path <- sprintf('%s.%s.%s', prefix, pop, suffix)
        }
        mutate(fread_CHROM(path, select=select), population = pop)
    }
    bind_rows(lapply(pops, read_one))
}

spread_pop <- function(df, col, pops, col_prefix, col_suffix, col_sep='_'){
    # Reshape one value column from long (one row per ID x population) to
    # wide (one row per ID, one column per population).
    #
    # Args:
    #   df:         long table with columns `ID`, `population` and `col`.
    #   col:        name (string) of the value column to spread.
    #   pops:       population labels to keep, in output column order.
    #   col_prefix, col_suffix, col_sep:
    #               output columns are named
    #               '<col_prefix><col_sep><pop><col_sep><col_suffix>'.
    # Returns:
    #   A table with `ID` followed by one renamed column per entry of `pops`.
    #
    # `col` is an external string, so it is wrapped in all_of(); a bare
    # `col` would silently pick a column literally named "col" if present.
    # `population` is a column of `df`, so it is passed bare to spread().
    # spread() is kept (rather than pivot_wider) so row order and the
    # error on duplicated ID x population pairs stay as before.
    new_names <- paste(col_prefix, pops, col_suffix, sep=col_sep)
    df %>%
    select(ID, all_of(col), population) %>%
    rename(VAL = all_of(col)) %>%
    spread(population, VAL) %>%
    select(ID, all_of(pops)) %>%
    # named lookup (new = old); also well-defined when `pops` is empty
    rename(all_of(setNames(pops, new_names)))
}


## read files

### `pvar` file

In [5]:
# variant table: one row per variant
pvar_df <- fread_CHROM(pvar_f)


### `afreq` file

In [23]:
# ALT allele frequencies for 'all' and each population, in long format
afreq_long_df <- read_tbl(
    pops   = c('all', pops),
    prefix = file.path(afreq_hwe_out_d, 'ukb24983_exomeOQFE'),
    suffix = 'afreq.zst',
    select = c('#CHROM', 'ID', 'ALT_FREQS')
) %>%
    select(-CHROM)


In [25]:
print(dim(pvar_df))
print(dim(afreq_long_df))
# expected long-table rows: 8 labels ('all' + the 7 pops) x 17777950
print(8 * 17777950)


[1] 17777950        5
[1] 142223600         3
[1] 142223600


In [26]:
# one AF column per population; column 2 is the 'all' column, renamed to UKB_AF
afreq_wide_df <- spread_pop(afreq_long_df, 'ALT_FREQS', c('all', pops), 'UKB', 'AF') %>%
    rename('UKB_AF'=2)


In [27]:
print(dim(afreq_wide_df))


[1] 17777950        9


In [28]:
# drop the long table now that afreq_wide_df has been built from it
rm(afreq_long_df)


### `hwe` file

In [30]:
# HWE mid-p values from the 'hardy.zst' files, in long format
hwe_autosomes_long_df <- read_tbl(
    pops   = c('all', pops),
    prefix = file.path(afreq_hwe_out_d, 'ukb24983_exomeOQFE'),
    suffix = 'hardy.zst',
    select = c('#CHROM', 'ID', 'MIDP')
) %>%
    select(-CHROM)


In [31]:
# HWE mid-p values from the 'hardy.x.zst' files, in long format
hwe_chrX_long_df <- read_tbl(
    pops   = c('all', pops),
    prefix = file.path(afreq_hwe_out_d, 'ukb24983_exomeOQFE'),
    suffix = 'hardy.x.zst',
    select = c('#CHROM', 'ID', 'MIDP')
) %>%
    select(-CHROM)


In [32]:
# stack both HWE tables and expose the mid-p column as `hwe_p`
hwe_long_df <- rename(
    bind_rows(hwe_autosomes_long_df, hwe_chrX_long_df),
    'hwe_p'='MIDP'
)


In [33]:
print(dim(hwe_autosomes_long_df))
print(dim(hwe_chrX_long_df))
print(dim(hwe_long_df))


[1] 139374240         3
[1] 2848288       3
[1] 142222528         3


In [34]:
# rows of the 'hardy' table + rows of the 'hardy.x' table; should equal
# the row count printed for hwe_long_df
139374240 + 2848288


In [35]:
# both parts are now contained in hwe_long_df
rm(hwe_autosomes_long_df, hwe_chrX_long_df)

In [36]:
# one hwe_p column per population; column 2 is the 'all' column, renamed to UKB_hwe_p
hwe_wide_df <- spread_pop(hwe_long_df, 'hwe_p', c('all', pops), 'UKB', 'hwe_p') %>%
    rename('UKB_hwe_p'=2)


In [37]:
print(dim(hwe_wide_df))


[1] 17777816        9


In [38]:
# wide rows x 8 population labels; should equal the row count of hwe_long_df
17777816 * 8 

In [39]:
# drop the long table now that hwe_wide_df has been built from it
rm(hwe_long_df)

### `gcount` file

In [11]:
# genotype-count columns for 'all' and each population, in long format
gcount_long_df <- read_tbl(
    pops   = c('all', pops),
    prefix = file.path(afreq_hwe_out_d, 'ukb24983_exomeOQFE'),
    suffix = 'gcount.zst',
    select = c(
        '#CHROM', 'ID', 'OBS_CT', 'MISSING_CT', 'HOM_REF_CT',
        'HET_REF_ALT_CTS', 'TWO_ALT_GENO_CTS', 'HAP_REF_CT', 'HAP_ALT_CTS'
    )
) %>%
    select(-CHROM)


In [40]:
# dimensions of the long genotype-count table
dim(gcount_long_df)

## join data frames

### `AF` and `hwe_p`


In [41]:
# attach the per-population AF columns to the variant table
pvar_af_hwe_df <- left_join(pvar_df, afreq_wide_df, by='ID')


In [42]:
# attach the per-population HWE p-value columns
pvar_af_hwe_df <- left_join(pvar_af_hwe_df, hwe_wide_df, by='ID')


In [43]:
print(dim(pvar_af_hwe_df))

[1] 17777950       21


In [44]:
colnames(pvar_af_hwe_df)

In [45]:
# all three tables are now merged into pvar_af_hwe_df
rm(pvar_df, afreq_wide_df, hwe_wide_df)


### Other outputs from `gcount` file

In [46]:
# For each genotype-count column: widen it to one column per population,
# rename the 'all' column (position 2) to 'UKB_<col>', and left-join the
# result onto the per-variant table by ID.
for(col in c('OBS_CT', 'MISSING_CT', 'HOM_REF_CT', 'HET_REF_ALT_CTS', 'TWO_ALT_GENO_CTS', 'HAP_REF_CT', 'HAP_ALT_CTS')){
    # progress indicator
    message(col)

    tmp <- gcount_long_df %>%
        spread_pop(col, c('all', pops), 'UKB', col) %>%
        rename(!!sprintf('UKB_%s', col) := 2)

    pvar_af_hwe_df <- left_join(pvar_af_hwe_df, tmp, by='ID')

    rm(tmp)
}

OBS_CT

MISSING_CT

HOM_REF_CT

HET_REF_ALT_CTS

TWO_ALT_GENO_CTS

HAP_REF_CT

HAP_ALT_CTS



## compute the missingness

In [48]:
# per-variant missingness: missing count over (missing + observed) count,
# using the 'all'-population columns
pvar_af_hwe_df <- mutate(
    pvar_af_hwe_df,
    f_miss = UKB_MISSING_CT / (UKB_MISSING_CT + UKB_OBS_CT)
)


In [49]:
dim(pvar_af_hwe_df)

In [55]:
# write the full table, restoring the '#CHROM' header name
fwrite(
    rename(pvar_af_hwe_df, '#CHROM' = 'CHROM'),
    out_f,
    sep='\t', na = "NA", quote=FALSE
)


In [52]:
# show the path the full table was written to
out_f

In [53]:
colnames(pvar_af_hwe_df)

In [54]:
# path of the compact output; the dot is escaped so only a literal
# '.pvar' extension at the end of the name is replaced
str_replace(out_f, '\\.pvar$', '.compact.pvar')

In [57]:
# write a compact table holding only the variant columns, missingness and
# the white-British / overall summary columns; the regex dot is escaped so
# only a literal '.pvar' extension is replaced
pvar_af_hwe_df %>%
select(CHROM, POS, ID, REF, ALT, f_miss, UKB_white_british_hwe_p, UKB_white_british_AF, UKB_AF) %>%
rename('#CHROM' = 'CHROM') %>%
fwrite(str_replace(out_f, '\\.pvar$', '.compact.pvar'), sep='\t', na = "NA", quote=FALSE)


In [None]:
# cumulative distribution of the raw white-British HWE p-value.
# plot_cumsum() maps x to a column named VALUE, so the p-value column is
# renamed to VALUE; plot_cumsum() must be defined before this cell is run.
pvar_af_hwe_df %>% select(ID, UKB_white_british_hwe_p) %>% 
rename('VALUE' = 'UKB_white_british_hwe_p') %>% plot_cumsum() + 
labs(title='Exome 200k HWE p-value (white British)') -> p


In [71]:
plot_cumsum <- function(df){
    # Line plot of the cumulative fraction of rows against df$VALUE.
    #
    # VALUE is binned into 100 bins; the y value is the running sum of bin
    # counts divided by the total number of rows of `df`. A gray horizontal
    # line marks y = 1.
    n_total <- nrow(df)
    big_text <- element_text(size=16)

    cum_line <- stat_bin(
        aes(y=cumsum(..count..)/n_total),
        geom="line", color="black", bins=100
    )
    text_theme <- theme(
        legend.title = big_text,
        legend.text  = big_text,
        axis.text    = big_text,
        axis.title   = big_text
    )

    ggplot(df, aes(x = VALUE)) +
        cum_line +
        labs(y = 'cumsum(n) / n ') +
        theme_bw() +
        text_theme +
        geom_hline(yintercept = 1, color='gray')
}


In [77]:
# cumulative distribution of -log10(HWE mid-p) for white British, with a
# red marker at 7; double.xmin keeps log10 finite when p == 0
p_hwe <- pvar_af_hwe_df %>%
    select(ID, UKB_white_british_hwe_p) %>%
    mutate(VALUE=-log10(UKB_white_british_hwe_p + .Machine$double.xmin)) %>%
    plot_cumsum() +
    xlim(0, 20) +
    ylim(0, 1) +
    labs(
        x = latex2exp::TeX('HWE (midp) -log_{10}(P)'),
        title='Exome 200k HWE p-value (white British)'
    ) +
    geom_vline(xintercept = 7, color='red')


In [78]:
# save the HWE plot as 6x6 PNG and PDF
suppressWarnings(suppressPackageStartupMessages(
    for(ext in c('png', 'pdf')){
        ggsave(sprintf('6_hwe_midp_plot.%s', ext), p_hwe, width=6, height=6)
    }
))


In [75]:
# cumulative distribution of -log10(per-variant missingness), with a red
# marker at 3; double.xmin keeps log10 finite when f_miss == 0.
# Label and title spelling corrected ("Missingness").
pvar_af_hwe_df %>% select(ID, f_miss) %>% 
mutate(VALUE=-log10(f_miss + .Machine$double.xmin)) %>%
plot_cumsum() + xlim(0, 6) + ylim(0, 1) + 
labs(x = latex2exp::TeX('-log_{10}(Missingness)'), title='Exome 200k missingness (per variant)') +
geom_vline(xintercept = 3, color='red') -> p_miss


In [76]:
# save the missingness plot as 6x6 PNG and PDF
suppressWarnings(suppressPackageStartupMessages(
    for(ext in c('png', 'pdf')){
        ggsave(sprintf('6_miss_plot.%s', ext), p_miss, width=6, height=6)
    }
))


In [85]:
# cumulative distribution of -log10(minor allele frequency) in white
# British, where MAF = min(AF, 1 - AF); the red marker sits at
# -log10(1 / (2 * 137920))
p_maf_wb <- pvar_af_hwe_df %>%
    select(ID, UKB_white_british_AF) %>%
    mutate(VALUE=-log10(pmin(1-UKB_white_british_AF, UKB_white_british_AF) + .Machine$double.xmin)) %>%
    plot_cumsum() +
    xlim(0, 6) +
    ylim(0, 1) +
    labs(
        x = latex2exp::TeX('-log_{10}(minor allele frequency)'),
        title='Exome 200k minor allele frequency in white British subset'
    ) +
    geom_vline(xintercept = -log10(1/(2*137920)), color='red')


In [87]:
# save the MAF plot as 6x6 PNG and PDF
suppressWarnings(suppressPackageStartupMessages(
    for(ext in c('png', 'pdf')){
        ggsave(sprintf('6_maf_wb_plot.%s', ext), p_maf_wb, width=6, height=6)
    }
))


In [86]:
# x-position of the red vertical line in the MAF plot
-log10(1/(2*137920))