In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


# LD indep analysis for the array + exome combined dataset
## Yosuke Tanigawa

We will prepare the input file needed to run the LD indep command.

We follow the example analysis notebook here: https://github.com/rivas-lab/ukbb-tools/blob/master/17_annotation/20201012_array-combined/6a_LD_indep_input.ipynb

In [2]:
# Load shared parameters by evaluating the file as R code.
# NOTE(review): the file has a .sh extension but is parsed by R here, so it
# must contain only syntax valid in R (e.g. plain `name="value"` assignments).
# `ldmap_d`, used for the output path below, is not defined in this notebook
# and is presumably set by this file — TODO confirm.
source('0_parameters.sh')

In [3]:
# ---- file paths ----
# Base directory under which all input files below are located
ukb_d <- "/scratch/groups/mrivas/ukbb24983"

# Inputs: array annotation, exome annotation, and the combined pvar file
array_f <- file.path(
    ukb_d,
    "array-combined/annotation/annotation_20201012/ukb24983_cal_hla_cnv.annot_compact_20201023.tsv.gz"
)
exome_f <- file.path(
    ukb_d,
    "exome/annotation/20201025_exome_oqfe_2020/ukb24983_exomeOQFE.annotation.20210108.compact.tsv.gz"
)
pvar_f <- file.path(
    ukb_d,
    "array-exome-combined/pgen/ukb24983_cal_hla_cnv_exomeOQFE.pvar.gz"
)

# Output: variant table written at the end of this notebook.
# `ldmap_d` is not defined in this notebook; presumably it is set by
# 0_parameters.sh — TODO confirm.
var_QC_f <- file.path(
    ldmap_d,
    "ukb24983_cal_hla_cnv_exomeOQFE.input.variants.tsv"
)


In [4]:
# Load the combined pvar file. `#CHROM` and `ID` are read as character rather
# than letting fread() guess their types. `#CHROM` is renamed to `CHROM` for
# the steps below (it is renamed back when the output is written).
pvar_df <- fread(
    pvar_f,
    colClasses = c("#CHROM" = "character", "ID" = "character")
) %>%
    rename("CHROM" = "#CHROM")


In [5]:
# Stack the array and exome annotations into a single table.
# The `ld_indep` column is read from the array annotation only (and renamed to
# `ld_indep_array`); no such column is read from the exome annotation, so
# bind_rows() fills it with NA for the exome rows.
annot_df <- bind_rows(
    fread(array_f, select = c("ID", "Csq", "ld_indep")) %>%
        rename("ld_indep_array" = "ld_indep"),
    fread(exome_f, select = c("ID", "Csq"))
)


In [6]:
# Build the variant table that is written out as the LD indep input:
#   1. keep only the variants whose FILTER field is '.'
#   2. attach the annotation columns (Csq, ld_indep_array) by variant ID
#   3. keep the columns listed in select(), in that order
# The filter runs before the join: FILTER exists only in pvar_df, and
# left_join() preserves the rows and row order of its left input, so the
# result is the same while rows that would be dropped are never joined.
# NOTE(review): left_join() returns one row per match, so an ID that appears
# more than once in annot_df (e.g. in both annotation files) would duplicate
# that variant here — confirm IDs are unique in annot_df.
merged_df <- pvar_df %>%
    filter(FILTER == '.') %>%
    left_join(annot_df, by = 'ID') %>%
    select(
        CHROM, POS, ID, REF, ALT, FILTER, geno_data_source, ld_indep_array, Csq
    )


In [7]:
# Sanity check: number of rows (variants) and columns in the merged table
print(dim(merged_df))


[1] 18134874        9


In [8]:
# Write the merged table to `var_QC_f` as a tab-separated file.
# The chromosome column is renamed back to `#CHROM` for the header line,
# missing values are written as the string "NA", and fields are not quoted.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved constant.
merged_df %>%
    rename('#CHROM' = 'CHROM') %>%
    fwrite(var_QC_f, sep = '\t', na = "NA", quote = FALSE)


In [9]:
# Display the path of the output file written above
var_QC_f