In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# Variant table that every annotation below is joined onto (by ID).
pvar_f <- "/oak/stanford/groups/mrivas/ukbb24983/array-combined/pgen/ukb24983_cal_hla_cnv.pvar"
# Directory holding the 'plink_output' inputs and receiving the output file.
data_d <- "/oak/stanford/groups/mrivas/ukbb24983/array-combined/annotation/afreq_20201012"
# SNP QC table (zstd-compressed); its 'ID' and 'array' columns are read below.
snp_qc_f <- "/oak/stanford/groups/mrivas/ukbb24983/snp/ukb_snp_qc.pvar.zst"
# Destination of the final annotated variant table.
out_f <- file.path(data_d, "ukb24983_cal_hla_cnv.afreq_20201012.pvar")

In [48]:
# Population labels; each one names a per-population input file.
pops <- c(
    "white_british", "non_british_white", "african",
    "s_asian", "e_asian", "related", "others"
)
# Array labels; used in addition to `pops` when reading the gcount tables.
arrays <- c("UKBB", "UKBL")


In [4]:
# Pick the shell command that streams a file to stdout, based on its extension:
# 'zstdcat' for '.zst', 'zcat' for '.gz', and plain 'cat' otherwise.
# Vectorized over `f` (character vector of file paths).
cat_or_zcat <- function(f){
    is_zst <- endsWith(f, '.zst')
    is_gz <- endsWith(f, '.gz')
    ifelse(is_zst, 'zstdcat', ifelse(is_gz, 'zcat', 'cat'))
}


In [5]:
# Read one table per population label and stack them in long format.
#
# Args:
#   pops:   character vector of population labels. The special label 'all'
#           maps to '<prefix>.<suffix>'; any other label maps to
#           '<prefix>.<label>.<suffix>'.
#   prefix: path prefix shared by all input files.
#   suffix: file suffix (e.g. 'afreq.zst'); its extension selects the
#           reader command via cat_or_zcat().
#
# Returns the row-bound tables with '#CHROM' renamed to 'CHROM' (read as
# character) and a `population` column holding the label of the source file.
read_tbl <- function(pops, prefix, suffix){
    lapply(pops, function(pop){
        f <- if(pop == 'all'){
            sprintf('%s.%s', prefix, suffix)
        }else{
            sprintf('%s.%s.%s', prefix, pop, suffix)
        }
        fread(
            # shQuote() keeps a path containing spaces or shell metacharacters
            # from being split or interpreted by the shell.
            cmd=paste(cat_or_zcat(f), shQuote(f)),
            colClasses=c('#CHROM'='character')
        ) %>%
        rename('CHROM'='#CHROM') %>%
        mutate(population = pop)
    }) %>% bind_rows()
}


In [6]:
# Reshape a long per-population table into one wide column per population.
#
# Args:
#   df:         long data frame with columns `ID`, `population`, and the
#               column named by `col`.
#   col:        name (string) of the value column to spread.
#   pops:       population labels to keep, in output column order.
#   col_prefix, col_suffix, col_sep:
#               output columns are named
#               <col_prefix><col_sep><pop><col_sep><col_suffix>.
#
# Returns a data frame with `ID` followed by one renamed column per entry
# of `pops`.
spread_pop <- function(df, col, pops, col_prefix, col_suffix, col_sep='_'){
    wide_names <- paste(
        rep(col_prefix, length(pops)), pops, rep(col_suffix, length(pops)),
        sep=col_sep
    )
    df %>%
    # all_of() makes tidyselect read `col` as the string held by the argument,
    # never as a data column that happens to be called "col".
    select(ID, all_of(col), population) %>%
    rename(VAL = all_of(col)) %>%
    spread(population, VAL) %>%
    select(ID, all_of(pops)) %>%
    # After the select above, positions 2..(length(pops) + 1) are the
    # population columns, in the order given by `pops`.
    rename(setNames(seq_along(pops) + 1, wide_names))
}


In [19]:
# Per-variant array membership, read from the SNP QC table.
# The raw code is kept as `array_code`; `array` holds the decoded label.
snp_qc_df <- fread(
    cmd=paste('zstdcat', snp_qc_f),
    select=c('ID', 'array'),
    colClasses = 'character'
) %>%
    rename('array_code'='array') %>%
    mutate(
        # 0=BiLEVE, 1=Axiom, 2=both
        # see: http://biobank.ctsu.ox.ac.uk/crystal/refer.cgi?id=1955
        # Any non-missing code other than '0' or '1' is labelled 'both'.
        array = if_else(
            array_code == '0',
            'UKBL',
            if_else(array_code == '1', 'UKBB', 'both')
        )
    )


In [20]:
# Sanity check: tabulate the decoded array labels against the raw codes.
count(snp_qc_df, array, array_code)


array,array_code,n
<chr>,<chr>,<int>
both,2,753693
UKBB,1,34197
UKBL,0,17536


In [18]:
# Load the variant table; keep chromosome as character and drop the leading
# '#' from its column name so it is easier to reference.
pvar_df <- rename(
    fread(pvar_f, colClasses = c('#CHROM'='character')),
    'CHROM'='#CHROM'
)


In [9]:
# Allele-frequency tables for the full cohort ('all') and each population,
# stacked in long format.
afreq_long_df <- read_tbl(
    pops = c('all', pops),
    prefix = file.path(data_d, 'plink_output', 'ukb24983_cal_hla_cnv'),
    suffix = 'afreq.zst'
)


In [49]:
# Genotype-count tables for the full cohort ('all'), each population, and
# each array, stacked in long format.
gcount_long_df <- read_tbl(
    pops = c('all', pops, arrays),
    prefix = file.path(data_d, 'plink_output', 'ukb24983_cal_hla_cnv'),
    suffix = 'gcount.zst'
)


In [50]:
# Start the annotated table: variant info + array label + one ALT allele
# frequency column per population.
pvar_af_df <- pvar_df %>%
    left_join(select(snp_qc_df, -array_code), by='ID') %>%
    left_join(
        # Column 2 of the wide table is the 'all' column (UKB_all_AF);
        # it is renamed to plain UKB_AF.
        rename(
            spread_pop(afreq_long_df, 'ALT_FREQS', c('all', pops), 'UKB', 'AF'),
            'UKB_AF'=2
        ),
        by='ID'
    )


In [51]:
# Append observed / missing counts for the full cohort, each population, and
# each array. Column 2 of every wide table (the 'all' column) is renamed to
# UKB_<col>.
for(col in c('OBS_CT', 'MISSING_CT')){
    message(col)
    pvar_af_df <- left_join(
        pvar_af_df,
        rename(
            spread_pop(gcount_long_df, col, c('all', pops, arrays), 'UKB', col),
            !!sprintf('UKB_%s', col) := 2
        ),
        by='ID'
    )
}

OBS_CT

MISSING_CT



In [52]:
# Append the genotype count columns for the full cohort and each population
# (arrays are not included here). Column 2 of every wide table (the 'all'
# column) is renamed to UKB_<col>.
for(col in c('HOM_REF_CT', 'HET_REF_ALT_CTS', 'TWO_ALT_GENO_CTS', 'HAP_REF_CT', 'HAP_ALT_CTS')){
    message(col)
    pvar_af_df <- left_join(
        pvar_af_df,
        rename(
            spread_pop(gcount_long_df, col, c('all', pops), 'UKB', col),
            !!sprintf('UKB_%s', col) := 2
        ),
        by='ID'
    )
}

HOM_REF_CT

HET_REF_ALT_CTS

TWO_ALT_GENO_CTS

HAP_REF_CT

HAP_ALT_CTS



In [54]:
# Missingness fractions: overall, and per array. The per-array value is set
# to NA when the variant is labelled as belonging only to the other array;
# variants with a missing array label get both values computed.
pvar_af_df <- mutate(
    pvar_af_df,
    f_miss = UKB_MISSING_CT / (UKB_MISSING_CT + UKB_OBS_CT),
    f_miss_UKBB = if_else(
        array %in% 'UKBL',
        NA_real_,
        UKB_UKBB_MISSING_CT / (UKB_UKBB_MISSING_CT + UKB_UKBB_OBS_CT)
    ),
    f_miss_UKBL = if_else(
        array %in% 'UKBB',
        NA_real_,
        UKB_UKBL_MISSING_CT / (UKB_UKBL_MISSING_CT + UKB_UKBL_OBS_CT)
    )
)


In [59]:
# Row and column counts of the annotated table.
dim(pvar_af_df)

In [60]:
# Column names of the annotated table.
colnames(pvar_af_df)

In [61]:
# Write the annotated table as tab-separated text, restoring the '#CHROM'
# header name that was stripped on load.
pvar_af_df %>%
rename('#CHROM' = 'CHROM') %>%
# FALSE rather than F: F is an ordinary variable that can be reassigned.
fwrite(out_f, sep='\t', na = "NA", quote=FALSE)


In [62]:
# Show the path of the output file.
out_f