In [1]:
source('cascade_helper.R')


### list of parameters and filenames

In [2]:
# All relative locations below are resolved against the directory the
# notebook is run from.
wd <- getwd()

# Inputs: the variant annotation table handed to read_annot_arr() and the
# trait table read with fread().
annot.tbl <- "@@@@@@/private_data/ukbb/variant_filtering/variant_filter_table.tsv.gz"
cascade_files <- file.path(wd, "cascade.input.files.tsv")

# Output: the filtered association table, plus the value passed to
# `gdrive upload -p` when the upload command is composed.
out_array_hits <- file.path(wd, "out_v3", "cascade.array.hits.tsv")
gdrive_dir <- "1fGlnrj8Zu1Pox3fmxmFKdcbNVWiN9vhd"

# Cut-offs compared against the P and maf columns of the summary statistics.
p_thr <- 5e-9
maf_min_thr <- 1e-4


In [3]:
# Load the trait table named by `cascade_files` and show its dimensions.
traits <- fread(cascade_files)
print(dim(traits))

[1] 35  4


#### read the variant annotation file for the array

In [4]:
# Load the array variant annotation from `annot.tbl` via read_annot_arr().
# NOTE(review): read_annot_arr() is presumably defined in cascade_helper.R
# (sourced at the top of the notebook) — confirm.
annot.arr <- read_annot_arr(annot.tbl)


#### read the GWAS summary statistics for the array

In [6]:
# Read the array summary statistics for every entry in `traits`
# (read_array_sumstats_all() receives `p_thr`), then attach the variant
# annotation held in `annot.arr`.
array_anno_unfiltered_df <- annotate_array_df(
  read_array_sumstats_all(traits, p_thr),
  annot.arr
)


In [7]:
# Sanity check: row counts by "P below p_thr" and the autosome flag.
# The unnamed expression is written exactly as before so the label of the
# resulting count column does not change.
count(
  array_anno_unfiltered_df,
  (as.numeric(P) < p_thr),
  is_autosome
)


(as.numeric(P) < p_thr),is_autosome,n
<lgl>,<lgl>,<int>
True,True,79390


In [8]:
# Row counts for every combination of "maf below maf_min_thr" and the three
# filter flags, taken before any filtering is applied.
count(
  array_anno_unfiltered_df,
  (as.numeric(maf) < maf_min_thr),
  all_filters,
  is_outside_of_MHC,
  ld_indep
)


(as.numeric(maf) < maf_min_thr),all_filters,is_outside_of_MHC,ld_indep,n
<lgl>,<int>,<lgl>,<lgl>,<int>
False,0,False,False,22356
False,0,False,True,2676
False,0,True,False,35143
False,0,True,True,9735
False,1,False,False,2457
False,1,True,False,4720
False,2,False,False,982
False,2,True,False,1240
False,3,False,False,12
False,3,True,False,1


In [9]:
# Keep only the association rows that pass every filter:
#   * P below `p_thr` and flagged as autosomal;
#   * maf at or above `maf_min_thr`. This is the exact complement of the
#     `maf < maf_min_thr` tabulation in the preceding check, so a variant
#     sitting exactly on the threshold is retained instead of being dropped
#     while the check reports it as passing;
#   * `all_filters` equal to 0, `is_outside_of_MHC` and `ld_indep` both TRUE.
# filter() drops rows for which any condition evaluates to NA.
# Afterwards the listed columns are removed from the result (`is_autosome`
# is TRUE for every remaining row by construction).
array_anno_df <- array_anno_unfiltered_df %>%
  filter(
    as.numeric(P) < p_thr,
    is_autosome,
    as.numeric(maf) >= maf_min_thr,
    all_filters == 0,
    is_outside_of_MHC,
    ld_indep
  ) %>%
  select(
    -is_autosome,
    -filter,
    -hwe,
    -gnomad_af,
    -missingness,
    -mcpi
  )


In [10]:
# Sanity check: tabulate the filter flags that remain in the filtered table.
count(array_anno_df, all_filters, is_outside_of_MHC, ld_indep)


all_filters,is_outside_of_MHC,ld_indep,n
<int>,<lgl>,<lgl>,<int>
0,True,True,9735


In [11]:
# Count the unique (Csq, is_rare, ID) rows within each Csq / is_rare pair,
# so an ID appearing in several rows is counted once per pair.
count(
  unique(select(array_anno_df, Csq, is_rare, ID)),
  Csq,
  is_rare
)


Csq,is_rare,n
<chr>,<lgl>,<int>
non-coding,False,4389
non-coding,True,75
protein-altering,False,402
protein-altering,True,192
protein-truncating,False,10
protein-truncating,True,28


In [12]:
# Write the filtered hits to `out_array_hits` as a tab-separated file.
# The target lives in a sub-directory that nothing else in this notebook
# creates, and fwrite() fails when the directory is missing, so create it
# first. An existing directory is left as is (warning suppressed).
dir.create(dirname(out_array_hits), showWarnings = FALSE, recursive = TRUE)
fwrite(array_anno_df, out_array_hits, sep = "\t")


In [13]:
# Compose (but do not run) the `gdrive upload` command for the hits table.
# Both arguments are shell-quoted: `out_array_hits` is built from getwd(),
# so it may contain spaces or other shell metacharacters that would break
# the command if interpolated verbatim.
paste('gdrive', 'upload', '-p', shQuote(gdrive_dir), shQuote(out_array_hits))


In [15]:
# Compose (but do not run) the `zstd --rm -15` command for the hits table.
# The path is shell-quoted because it is built from getwd() and may contain
# spaces or other shell metacharacters.
paste('zstd', '--rm', '-15', shQuote(out_array_hits))
