In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# Directory that the cascade.<type>.tsv tables are read from; the locus list
# produced at the end of the notebook is written to the same place.
in_d <- "@@@@@@/users/ytanigaw/repos/rivas-lab/biomarkers/cascade/out_v3"


In [3]:
# Read one table per variant class into the named list `dfs`.
# Each table lives at <in_d>/cascade.<vtype>.tsv; its '#CHROM' column is
# renamed to 'CHROM', the name the later cells select by.
dfs <- list()
for (vtype in c('PTVs', 'PAVs', 'nc')) {
    cascade_tsv <- file.path(in_d, sprintf('cascade.%s.tsv', vtype))
    dfs[[vtype]] <- rename(fread(cascade_tsv), 'CHROM' = '#CHROM')
}


In [27]:
# Assemble a single long table of associations, keeping only rows where
# is_outside_of_MHC is TRUE.
#   - PTV and PAV tables are stacked, each tagged with its class in `Csq`,
#     and reduced to the shared set of columns.
#   - The 'nc' table is reduced to the same columns, tagged Csq = 'nc', and
#     given an empty Gene_symbol for every row.
df <- bind_rows(
    dfs[['PTVs']] %>% mutate(Csq = 'PTVs'),
    dfs[['PAVs']] %>% mutate(Csq = 'PAVs')
) %>%
filter(is_outside_of_MHC) %>%
select(CHROM, POS, REF, ALT, ID, trait, Gene_symbol, BETA, MAF, Csq, is_outside_of_MHC, P) %>%
bind_rows(
    dfs[['nc']] %>%
    filter(is_outside_of_MHC) %>%
    select(CHROM, POS, REF, ALT, ID, trait, BETA, MAF, is_outside_of_MHC, P) %>%
    # Gene_symbol is not selected above, so it is created here as '' for every
    # nc row; no NA replacement on the input column is needed beforehand.
    mutate(Gene_symbol = '', Csq = 'nc')
)


In [28]:
# List the column names of the combined table.
colnames(df)

### the number of associations

In [20]:
# Count rows of df per (is_outside_of_MHC, Csq), split into MAF < 0.01
# ('rare') versus everything else ('non_rare'), one column per category,
# then add the row total n.
df %>%
    count(
        is_outside_of_MHC,
        Csq,
        is_rare = if_else(MAF < 0.01, 'rare', 'non_rare')
    ) %>%
    spread(key = is_rare, value = n, fill = 0) %>%
    mutate(n = rare + non_rare)


is_outside_of_MHC,Csq,non_rare,rare,n
<lgl>,<chr>,<dbl>,<dbl>,<dbl>
True,nc,9001,116,9117
True,PAVs,1017,306,1323
True,PTVs,15,43,58


In [8]:
# Same tabulation restricted to rows with |BETA| > 0.1, additionally split
# by the sign of BETA ('+' when BETA > 0, '-' otherwise).
df %>%
    filter(abs(BETA) > .1) %>%
    count(
        is_outside_of_MHC,
        Csq,
        sign = if_else(BETA > 0, '+', '-'),
        is_rare = if_else(MAF < 0.01, 'rare', 'non_rare')
    ) %>%
    spread(key = is_rare, value = n, fill = 0) %>%
    mutate(n = rare + non_rare)


is_outside_of_MHC,Csq,sign,non_rare,rare,n
<lgl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
True,nc,-,298,41,339
True,nc,+,263,46,309
True,PAVs,-,79,161,240
True,PAVs,+,57,125,182
True,PTVs,-,2,17,19
True,PTVs,+,2,24,26


### the number of variants

In [21]:
# Number of distinct (CHROM, POS, REF, ALT, ID) combinations in df.
nrow(unique(select(df, CHROM, POS, REF, ALT, ID)))


In [22]:
# Number of distinct values in the ID column (a missing ID counts as one value).
n_distinct(pull(df, ID))


In [23]:
# Show the rows whose ID is missing.
filter(df, is.na(ID))


CHROM,POS,REF,ALT,ID,trait,Gene_symbol,BETA,MAF,Csq,is_outside_of_MHC
<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<lgl>
1,198994696,TA,T,,Cholesterol,,0.0173,0.279055,nc,True
1,198994696,TA,T,,LDL cholesterol,,0.0176,0.279055,nc,True
5,87703384,GA,G,,C-reactive protein,,-0.0178,0.256082,nc,True
22,44424192,G,T,,Gamma glutamyltransferase,,-0.0146,0.392425,nc,True


In [9]:
# Count distinct variants per (is_outside_of_MHC, Csq), split into
# MAF < 0.01 ('rare') versus everything else ('non_rare'), plus the total n.
# Rows are de-duplicated on (CHROM, POS, ID, MAF, Csq, is_outside_of_MHC).
# NOTE(review): this key omits REF/ALT, unlike the earlier distinct-variant
# count; confirm that is intended for variants with a missing ID.
df %>%
    select(CHROM, POS, ID, MAF, Csq, is_outside_of_MHC) %>%
    unique() %>%
    count(
        is_outside_of_MHC,
        Csq,
        is_rare = if_else(MAF < 0.01, 'rare', 'non_rare')
    ) %>%
    spread(key = is_rare, value = n, fill = 0) %>%
    mutate(n = rare + non_rare)


is_outside_of_MHC,Csq,non_rare,rare,n
<lgl>,<chr>,<dbl>,<dbl>,<dbl>
True,nc,5087,75,5162
True,PAVs,402,192,594
True,PTVs,10,28,38


In [16]:
# Sum of three hard-coded counts; they match the n column (nc, PAVs, PTVs)
# printed by the per-class variant tabulation above.
sum(5162, 594, 38)

In [10]:
# Distinct-variant tabulation restricted to rows with |BETA| > 0.1.
# Rows are de-duplicated on (CHROM, POS, ID, MAF, Csq, is_outside_of_MHC)
# after the filter, then counted as 'rare' (MAF < 0.01) or 'non_rare'.
df %>%
    filter(abs(BETA) > .1) %>%
    select(CHROM, POS, ID, MAF, Csq, is_outside_of_MHC) %>%
    unique() %>%
    count(
        is_outside_of_MHC,
        Csq,
        is_rare = if_else(MAF < 0.01, 'rare', 'non_rare')
    ) %>%
    spread(key = is_rare, value = n, fill = 0) %>%
    mutate(n = rare + non_rare)


is_outside_of_MHC,Csq,non_rare,rare,n
<lgl>,<chr>,<dbl>,<dbl>,<dbl>
True,nc,382,59,441
True,PAVs,62,179,241
True,PTVs,4,27,31


In [11]:
# Number of rows in df for each trait.
count(df, trait)

Unnamed: 0_level_0,trait,n
Unnamed: 0_level_1,<chr>,<int>
1,Alanine aminotransferase,189
2,Albumin,202
3,Alkaline phosphatase,715
4,Apolipoprotein A,454
5,Apolipoprotein B,593
6,Aspartate aminotransferase,265
7,AST to ALT ratio,226
8,C-reactive protein,351
9,Calcium,208
10,Cholesterol,596


In [30]:
# Write the distinct (CHROM, POS, REF, ALT, ID) rows of df, sorted by
# CHROM, POS and ID, to <in_d>/list_of_loci.tsv as a tab-separated file.
# The CHROM column is renamed back to '#CHROM' for the header line, missing
# values are written as the literal string "NA", and fields are not quoted.
df %>%
    select(CHROM, POS, REF, ALT, ID) %>%
    unique() %>%
    arrange(CHROM, POS, ID) %>%
    rename('#CHROM' = 'CHROM') %>%
    fwrite(file.path(in_d, 'list_of_loci.tsv'), sep = '\t', na = "NA", quote = FALSE)
