In [1]:
suppressPackageStartupMessages(require(tidyverse))
suppressPackageStartupMessages(require(data.table))
library(latex2exp)


“package ‘forcats’ was built under R version 3.6.2”

In [2]:
# All inputs and outputs are resolved relative to the current working directory.
wd <- getwd()

# Input: the table read into `traits` below.
cascade_files <- file.path(wd, "cascade.input.files.tsv")

# Input: zstd-compressed table of annotated array hits.
array_hits <- file.path(wd, "out_v3", "cascade.array.hits.tsv.zst")

# Output path stems (no extension); the extension is appended where the
# figures and tables are written.
plot_filehead <- lapply(
    c(PAVs = "cascade.PAVs", PTVs = "cascade.PTVs"),
    function(stem) file.path(wd, "out_v3", stem)
)


In [3]:
# Table whose `name` column is compared against the hits further down.
traits <- fread(cascade_files)

# Stream the zstd-compressed hits table through `zstdcat`.
# The path is shell-quoted because it is derived from getwd(): a directory
# name containing spaces or shell metacharacters would otherwise split or
# alter the command line handed to the shell.
array_anno_df <- fread(
    cmd = paste("zstdcat", shQuote(array_hits)),
    sep = "\t"
)


In [4]:
# Empty container for the figures; entries are added under the keys
# 'PTVs' and 'PAVs' and read back when the figures are saved.
plots <- vector("list", length = 0L)

In [5]:
# Effect vs. minor allele frequency for protein-truncating rows that pass
# the `ld_indep` and `is_outside_of_MHC` flags, faceted by `name`.
# Rows with |Effect| >= 0.1 get a different colour and a gene-symbol label.
plots[["PTVs"]] <- array_anno_df %>%
    filter(Csq == "protein-truncating", ld_indep, is_outside_of_MHC) %>%
    mutate(
        outlier = abs(Effect) >= 0.1,
        # Empty string = no label drawn for non-outliers.
        ggrepel = if_else(outlier, Gene_symbol, "")
    ) %>%
    ggplot(aes(x = maf, y = Effect, color = outlier, label = ggrepel)) +
    # Layers (drawn in this order): zero line, points, repelled labels.
    geom_hline(yintercept = 0, linetype = "dashed") +
    geom_point() +
    ggrepel::geom_text_repel(size = 2, fontface = 3) +
    # Scales and faceting.
    scale_x_log10(breaks = 10^c(-4, -2, 0)) +
    scale_color_brewer(palette = "Dark2") +
    facet_wrap(
        ~name,
        ncol = 6,
        strip.position = "bottom",
        labeller = label_wrap_gen(17)
    ) +
    # Labels and theme (theme_bw() first so theme() tweaks are kept).
    labs(
        title = TeX("Protein-truncating variant associations ($p \\,<\\, 5x10^{-9}$)"),
        x = "Minor allele frequency (log-scale)",
        y = "BETA"
    ) +
    theme_bw() +
    theme(
        strip.text = element_text(size = 7),
        legend.position = "none"
    )


In [6]:
# Effect vs. minor allele frequency for protein-altering rows that pass
# the `ld_indep` and `is_outside_of_MHC` flags, faceted by `name`.
# Rows with |Effect| >= 0.1 get a different colour; only those that also
# rank below 7 by |Effect| within their `name` group get a gene-symbol label.
plots[["PAVs"]] <- array_anno_df %>%
    filter(Csq == "protein-altering", ld_indep, is_outside_of_MHC) %>%
    group_by(name) %>%
    # Rank 1 = largest |Effect| within the group.
    mutate(rank_abs_effect = rank(-abs(Effect))) %>%
    ungroup() %>%
    mutate(
        outlier = abs(Effect) >= 0.1,
        # Empty string = no label drawn.
        ggrepel = if_else(outlier & (rank_abs_effect < 7), Gene_symbol, "")
    ) %>%
    ggplot(aes(x = maf, y = Effect, color = outlier, label = ggrepel)) +
    # Layers (drawn in this order): zero line, points, repelled labels.
    geom_hline(yintercept = 0, linetype = "dashed") +
    geom_point() +
    ggrepel::geom_text_repel(size = 2, fontface = 3) +
    # Scales and faceting.
    scale_x_log10(breaks = 10^c(-4, -2, 0)) +
    scale_color_brewer(palette = "Dark2") +
    facet_wrap(
        ~name,
        ncol = 6,
        strip.position = "bottom",
        labeller = label_wrap_gen(17)
    ) +
    # Labels and theme (theme_bw() first so theme() tweaks are kept).
    labs(
        title = TeX("Protein-altering variant associations ($p \\,<\\, 5x10^{-9}$)"),
        x = "Minor allele frequency (log-scale)",
        y = "BETA"
    ) +
    theme_bw() +
    theme(
        strip.text = element_text(size = 7),
        legend.position = "none"
    )


In [7]:
# Save every figure in `plots` twice, once as PNG and once as PDF, using the
# matching path stem from `plot_filehead` plus the extension.
for (variant_class in c("PTVs", "PAVs")) {
    for (extension in c("png", "pdf")) {
        ggsave(
            paste0(plot_filehead[[variant_class]], ".", extension),
            plot = plots[[variant_class]],
            width = 10,
            height = 10
        )
    }
}

In [8]:
# Names listed in `traits` that have no protein-truncating row in the hits table.
setdiff(
    pull(traits, name),
    array_anno_df %>%
        filter(Csq == "protein-truncating") %>%
        pull(name) %>%
        unique()
)

In [9]:
# Names listed in `traits` that have no protein-altering row in the hits table.
setdiff(
    pull(traits, name),
    array_anno_df %>%
        filter(Csq == "protein-altering") %>%
        pull(name) %>%
        unique()
)

## Prepare supplementary table files
- Reformat the tables

In [45]:
# List the available columns before choosing which ones to export.
colnames(array_anno_df)


In [46]:
# Column renames for the exported tables, written as new_name = old_name.
supl_tbl_rename_list <- list(
    BETA = "Effect",
    SE = "StdErr",
    trait = "name"
)


In [47]:
# Columns of the exported tables, in output order (names are post-rename).
supl_tbl_cols <- c(
    # variant identity
    "CHROM", "POS", "REF", "ALT", "ID", "variant",
    # association statistics
    "trait", "BETA", "SE", "P",
    # direction and heterogeneity columns
    "Direction", "HetISq", "HetChiSq", "HetDf", "HetPVal",
    # frequency and annotation
    "MAF", "Consequence", "Gene", "Gene_symbol", "HGVSp",
    # filter flags
    "is_outside_of_MHC", "ld_indep",
    # free-text column, created empty when the tables are built
    "Comments"
)

In [58]:
# Supplementary table of protein-truncating rows (no LD / MHC filtering here;
# those flags are exported as columns instead).
df_PTVs <- array_anno_df %>%
    filter(Csq == "protein-truncating") %>%
    # Add an empty Comments column and format P as a string in scientific
    # notation with two decimals.
    mutate(Comments = "", P = sprintf("%.2e", P)) %>%
    # Apply the new_name = old_name mapping.
    rename(unlist(supl_tbl_rename_list)) %>%
    # all_of() marks supl_tbl_cols as an external character vector, so it can
    # never be mistaken for a data column of the same name and avoids the
    # tidyselect deprecation for bare external vectors.
    select(all_of(supl_tbl_cols)) %>%
    arrange(CHROM, POS, trait)


In [59]:
# Supplementary table of protein-altering rows (no LD / MHC filtering here;
# those flags are exported as columns instead).
df_PAVs <- array_anno_df %>%
    filter(Csq == "protein-altering") %>%
    # Add an empty Comments column and format P as a string in scientific
    # notation with two decimals.
    mutate(Comments = "", P = sprintf("%.2e", P)) %>%
    # Apply the new_name = old_name mapping.
    rename(unlist(supl_tbl_rename_list)) %>%
    # all_of() marks supl_tbl_cols as an external character vector, so it can
    # never be mistaken for a data column of the same name and avoids the
    # tidyselect deprecation for bare external vectors.
    select(all_of(supl_tbl_cols)) %>%
    arrange(CHROM, POS, trait)


In [60]:
# Write the protein-truncating table as an unquoted TSV next to the figures.
# The CHROM column is exported under the header '#CHROM'.
df_PTVs %>%
    rename("#CHROM" = "CHROM") %>%
    fwrite(
        sprintf("%s.%s", plot_filehead[["PTVs"]], "tsv"),
        # FALSE rather than F: F is an ordinary variable that can be reassigned.
        sep = "\t", na = "NA", quote = FALSE
    )


In [61]:
# Write the protein-altering table as an unquoted TSV next to the figures.
# The CHROM column is exported under the header '#CHROM'.
df_PAVs %>%
    rename("#CHROM" = "CHROM") %>%
    fwrite(
        sprintf("%s.%s", plot_filehead[["PAVs"]], "tsv"),
        # FALSE rather than F: F is an ordinary variable that can be reassigned.
        sep = "\t", na = "NA", quote = FALSE
    )
