In [1]:
suppressPackageStartupMessages(require(tidyverse))
suppressPackageStartupMessages(require(data.table))
library(latex2exp)


“package ‘forcats’ was built under R version 3.6.2”

In [2]:
# Directory (under the current working directory) that holds every file this
# notebook reads from or writes to.
outd <- file.path(getwd(), "out_v3")

# Load cascade.input.files.tsv from the working directory.
traits <- fread("cascade.input.files.tsv")


In [32]:
# Stream the gzip-compressed variant annotation table through `zcat`, rename
# the '#CHROM' header to 'CHROM', and drop the LoF* and maf columns.
imp_var_annot_df <- select(
  rename(
    fread(cmd = paste("zcat", file.path(outd, "cascade.imp.hits.var.annot.tsv.gz"))),
    "CHROM" = "#CHROM"
  ),
  -LoF, -LoF_filter, -LoF_flags, -LoF_info, -maf
)


In [33]:
dim(imp_var_annot_df)  # rows x columns of the variant annotation table

In [34]:
colnames(imp_var_annot_df)  # columns remaining after the drop above

In [3]:
# Read the zstd-compressed, tab-separated table cascade.imp.hits.tsv.zst by
# decompressing it on the fly with `zstdcat`.
imp_anno_df <- fread(
  cmd = paste("zstdcat", file.path(outd, "cascade.imp.hits.tsv.zst")),
  sep = "\t"
)


In [4]:
dim(imp_anno_df)  # rows x columns of the imputed-hits table

In [5]:
colnames(imp_anno_df)  # column names of the imputed-hits table

In [52]:
# Attach variant-level annotation to the imputed hits.
# The hits' own ID column is kept as ID_old so that the ID coming from the
# annotation table can take the name ID; rows are matched on
# CHROM / POS / REF / ALT against the de-duplicated annotation columns.
imp_anno_2_df <- left_join(
  rename(imp_anno_df, "ID_old" = "ID"),
  unique(
    select(
      imp_var_annot_df,
      CHROM, POS, REF, ALT, ID, Gene, Gene_symbol, Consequence, HGVSp
    )
  ),
  by = c("CHROM", "POS", "REF", "ALT")
)

In [53]:
dim(imp_anno_2_df)  # compare with dim(imp_anno_df) to spot rows added by the join

In [54]:
colnames(imp_anno_2_df)  # column names after the annotation join

In [6]:
# Read the zstd-compressed, tab-separated table cascade.array.hits.tsv.zst by
# decompressing it on the fly with `zstdcat`.
array_anno_df <- fread(
  cmd = paste("zstdcat", file.path(outd, "cascade.array.hits.tsv.zst")),
  sep = "\t"
)


In [7]:
dim(array_anno_df)  # rows x columns of the array-hits table

In [8]:
colnames(array_anno_df)  # column names of the array-hits table

In [55]:
# Columns present in both the array table and the annotated imputed table;
# only these are kept when the two tables are stacked.
common_cols <- intersect(colnames(array_anno_df), colnames(imp_anno_2_df))
common_cols

In [63]:
# Number of imputed-hit rows per Consequence value.
count(imp_anno_2_df, Consequence)

Consequence,n
<chr>,<int>
3_prime_UTR_variant,21
5_prime_UTR_variant,9
downstream_gene_variant,37
intergenic_variant,140
intron_variant,460
missense_variant,4
non_coding_transcript_exon_variant,16
regulatory_region_variant,27
splice_region_variant,1
stop_gained,1


In [64]:
# Stack the non-coding associations from both sources, restricted to the
# columns the two tables share:
#   * array hits whose Csq is 'non-coding';
#   * imputed hits whose Consequence is none of missense_variant, stop_gained
#     or synonymous_variant.
non_coding_df <- bind_rows(
  array_anno_df %>%
    filter(Csq == "non-coding") %>%
    select(all_of(common_cols)),
  imp_anno_2_df %>%
    filter(!(Consequence %in% c("missense_variant", "stop_gained", "synonymous_variant"))) %>%
    select(all_of(common_cols))
)


In [65]:
dim(non_coding_df)  # rows x columns of the stacked non-coding table

In [66]:
# Effect size against minor allele frequency for the non-coding associations
# flagged is_outside_of_MHC, one facet per `name`.
#   * outlier: |Effect| >= 0.1; drives the point colour.
#   * rank_abs_effect: rank of |Effect| within each `name`, largest first.
#   * ggrepel: text label, shown only for outliers with rank_abs_effect < 5;
#     every other point gets an empty label.
p.imp <- ggplot(
  data = non_coding_df %>%
    filter(is_outside_of_MHC) %>%
    group_by(name) %>%
    mutate(rank_abs_effect = rank(-abs(Effect))) %>%
    ungroup() %>%
    mutate(
      outlier = abs(Effect) >= .1,
      # Label = ID with a trailing "()" or "(NA)" removed and the first "_"
      # turned into ":".
      # NOTE(review): str_replace() only touches the first "_" -- confirm that
      # is intended (str_replace_all() would replace every one).
      plot_label = ID %>%
        str_replace("[(][)]$", "") %>%
        str_replace("[(]NA[)]$", "") %>%
        str_replace("_", ":"),
      ggrepel = if_else(outlier & (rank_abs_effect < 5), plot_label, "")
    ),
  mapping = aes(x = maf, y = Effect, color = outlier, label = ggrepel)
) +
  # Layers, bottom to top: zero line, points, repelled labels.
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_point(alpha = .5) +
  ggrepel::geom_text_repel(size = 1.5) +
  facet_wrap(
    ~name,
    ncol = 6,
    strip.position = "bottom",
    labeller = label_wrap_gen(17)
  ) +
  # Alternative breaks kept for reference:
  # scale_x_continuous(trans="log10", breaks=10 ** c(-2, -1, 0)) +
  scale_x_continuous(trans = "log10", breaks = c(0.001, 0.01, 0.1)) +
  scale_color_brewer(palette = "Dark2") +
  labs(
    title = TeX("Non-coding variant associations ($p \\,<\\, 5x10^{-9}$)"),
    x = "Minor allele frequency (log-scale)",
    y = "BETAs"
  ) +
  # theme_bw() must come before theme() so the overrides below are not reset.
  theme_bw() +
  theme(
    strip.text = element_text(size = 7),
    legend.position = "none"
  )


In [67]:
# Save the figure to `outd` as cascade.imp.png and cascade.imp.pdf.
for (ext in c("png", "pdf")) {
  ggsave(
    filename = file.path(outd, paste0("cascade.imp.", ext)),
    plot = p.imp,
    width = 12,
    height = 14
  )
}


## Prepare supplementary table files
- Reformat the tables

In [68]:
# Column renames for the supplementary tables, written as
# new name = current name.
supl_tbl_rename_list <- list(
  "BETA" = "Effect",
  "SE" = "StdErr",
  "trait" = "name",
  "MAF" = "maf"
)

# Columns of the supplementary table, in output order (names are the ones in
# effect after the renames above).
supl_tbl_cols <- c(
  # variant identity
  "CHROM", "POS", "REF", "ALT", "ID", "variant",
  # association statistics
  "trait", "BETA", "SE", "P",
  "Direction", "HetISq", "HetChiSq", "HetDf", "HetPVal",
  # annotation
  "MAF", "Consequence", "Gene", "Gene_symbol", "HGVSp",
  "is_outside_of_MHC", "Comments"
)


In [69]:
# Export the non-coding associations as a tab-separated supplementary table
# (cascade.nc.tsv in `outd`).
non_coding_df %>%
  mutate(
    Comments = "",                                   # empty placeholder column
    P = sprintf("%.2e", P),                          # P as text, 2-decimal scientific notation
    variant = paste(CHROM, POS, REF, ALT, sep = ":")
  ) %>%
  # Apply the new = old renames (Effect -> BETA, StdErr -> SE, ...).
  rename(unlist(supl_tbl_rename_list)) %>%
  # all_of() marks supl_tbl_cols as an external character vector, matching the
  # select(all_of(common_cols)) calls used earlier; a bare vector name in
  # select() is ambiguous and deprecated in tidyselect. A missing column is
  # reported as an error.
  select(all_of(supl_tbl_cols)) %>%
  arrange(CHROM, POS, trait) %>%
  # Restore the leading '#' on the first header only after sorting on CHROM.
  rename("#CHROM" = "CHROM") %>%
  fwrite(
    file.path(outd, "cascade.nc.tsv"),
    sep = "\t", na = "NA", quote = FALSE
  )
