In [1]:
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(data.table))


# summary

In this notebook, we do the following:

- load the list of variants that are present from PheWAS and linked variants (due to LD) from `phewas_hits_ld` file.
- dump the list of phenotype coding pair (GBE_ID and EBI) for manual inspection (`GBE_EBI_pheno_match`).
- perform a manual annotation on Google Spreadsheet (mark whether associations discovered for the phenotype coded by GBE_ID can be supported by EBI phenotypes).
  - https://docs.google.com/spreadsheets/d/1yocReg2dL84x1NQDEyc9ywEDJM6gdfr2ELNlCcP3zPo
  - `GBE_EBI_hits` table
- load the results of manual annotation
- check whether the PheWAS discovery is novel or not.
- write the results to two files:
  - `phewas_hits_ld_gwas_catalog`: each line is a PheWAS discovery and has columns describing the following. This table is for debugging.
    - `is_novel`: whether the PheWAS association is novel
    - rsID: the rsID for the original variant
    - EBI_hit_rsID: the rsID where we found previous reports for the associations in EMBL-EBI GWAS catalog
  - `phewas_hits_ld_gwas_catalog_summary`: each line is a unique combination of the original phewas discovery (variant and GBE_ID), variant in linkage, and EBI phenotype reported for the linked variant.
    - We apply a manual check against Open Targets.
- In addition, prepare a list of FinnGen variants for the FinnGen R2 scan.
  


In [2]:
# Directory holding the local GWAS Catalog download, and the gzipped
# associations table inside it.
gwas_c_data_dir <- "@@@@@@@@/public_data/gwas_catalog_20200216"
gwas_c_file <- file.path(
  gwas_c_data_dir,
  "gwas_catalog_v1.0.2-associations_e98_r2020-02-08.tsv.gz"
)


In [37]:
# All inputs and outputs of this notebook live under `res_dir`.
res_dir <- "@@@@@@/projects/biomarkers/phewas/v2_imp_nc"

# Input: PheWAS hits together with their LD-linked variants.
phewas_hits_ld <- file.path(
  res_dir, "ukb24983_imp_v3.nc.phewas.hits.ld.tsv"
)

# Output: GBE_ID / EBI trait pairs dumped for manual annotation.
GBE_EBI_pheno_match <- file.path(res_dir, "GBE_EBI_hits.tsv")

# Output: PheWAS hits annotated with GWAS Catalog matches and novelty flags.
phewas_hits_ld_gwas_catalog <- file.path(
  res_dir, "ukb24983_imp_v3.nc.phewas.hits.ld.gwascatalog.tsv"
)

# Output: de-duplicated summary of the annotated table.
phewas_hits_ld_gwas_catalog_summary <- file.path(
  res_dir, "ukb24983_imp_v3.nc.phewas.hits.ld.gwascatalog.summary.tsv"
)

# Output: list of FinnGen variant IDs.
finngen_var_ids <- file.path(
  res_dir, "ukb24983_imp_v3.nc.phewas.hits.ld.finngen_vars.lst"
)

In [4]:
# Read the GWAS Catalog associations table by streaming the gzipped TSV
# through `zcat`. The path is shell-quoted so that a directory or file name
# containing spaces or shell metacharacters reaches `zcat` as one argument
# instead of being split or interpreted by the shell.
gwas_c_df <- fread(cmd = paste('zcat', shQuote(gwas_c_file)), sep = '\t')


In [5]:
# Load the PheWAS hits (with LD-linked variants) and drop the leading '#'
# from the chromosome column name so it can be referenced as a bare symbol.
df <- rename(fread(phewas_hits_ld), CHROM = `#CHROM`)

In [6]:
# Keep only the GWAS Catalog associations whose SNP is one of the LD-linked
# rsIDs in `df`, retaining the SNP, p-value, mapped trait and trait URI
# (the latter three renamed with an `EBI_` prefix).
gwas_c_hits <- gwas_c_df %>%
  filter(SNPS %in% unique(df$LD_rsID)) %>%
  select(
    SNPS,
    EBI_P = `P-VALUE`,
    EBI_TRAIT = MAPPED_TRAIT,
    EBI_URI = MAPPED_TRAIT_URI
  )


In [7]:
# Number of rows and columns in the matched GWAS Catalog subset.
dim(gwas_c_hits)

In [8]:
# Sanity check: show catalog rows with a missing p-value.
# The column must be referenced as a bare name; `is.na('EBI_P')` would test
# the string literal itself, which is never NA, and so always return no rows.
gwas_c_hits %>% filter(is.na(EBI_P)) %>% head()

SNPS,EBI_P,EBI_TRAIT,EBI_URI
<chr>,<chr>,<chr>,<chr>


In [9]:
# Assign a running index to each distinct PheWAS variant ID, in the order
# the IDs first appear after sorting by chromosome, position and GBE_ID.
# `seq_len(n())` is used rather than `1:n()` so that an empty input yields
# an empty index instead of the length-2 vector c(1, 0).
idx_df <- df %>%
  arrange(CHROM, POS, GBE_ID) %>%
  select(ID) %>%
  unique() %>%
  mutate(phewas_idx = seq_len(n()))


In [10]:
# Number of distinct PheWAS variant IDs (rows) in the index table.
dim(idx_df)

In [11]:
# Attach the per-variant index and every GWAS Catalog record reported for
# the LD-linked rsID. The full join is followed by removal of rows without
# an LD_ID, the p-value is converted from character to numeric, and rows are
# ordered by PheWAS index and then genomic position.
df_with_ebi_unfiltered <- df %>%
  left_join(idx_df, by = 'ID') %>%
  full_join(gwas_c_hits, by = c(LD_rsID = 'SNPS')) %>%
  filter(!is.na(LD_ID)) %>%
  mutate(EBI_P = as.numeric(EBI_P)) %>%
  arrange(phewas_idx, CHROM, POS, LD_CHROM, LD_POS)


In [12]:
# Size of the joined PheWAS x GWAS Catalog table.
dim(df_with_ebi_unfiltered)


### Manual annotation.
We check whether each GBE phenotype / EBI trait pair should be considered a match.

In [22]:
# Dump every distinct (GBE phenotype, EBI trait) pair that has an EBI URI,
# with `is_hit` initialised to FALSE, as a template for manual annotation.
# NOTE: this overwrites the file at `GBE_EBI_pheno_match`.
df_with_ebi_unfiltered %>%
  filter(!is.na(EBI_URI)) %>%
  distinct(GBE_ID, GBE_short_name, EBI_TRAIT, EBI_URI) %>%
  mutate(is_hit = FALSE) %>%
  arrange(GBE_ID, EBI_URI) %>%
  fwrite(file = GBE_EBI_pheno_match, sep = '\t')


In [13]:
# Display the path of the GBE/EBI phenotype-pair table.
GBE_EBI_pheno_match

In [14]:
# Authenticate against Google Sheets with a stored token, then read the
# manually annotated `GBE_EBI_hits` worksheet.
library(googlesheets)
gs_auth(token = "/home/users/ytanigaw/.googlesheets_token.rds")

GBE_EBI_pheno_match_df <- gs_read(
  gs_url('https://docs.google.com/spreadsheets/d/1yocReg2dL84x1NQDEyc9ywEDJM6gdfr2ELNlCcP3zPo'),
  ws = 'GBE_EBI_hits'
)


Auto-refreshing stale OAuth token.
Sheet-identifying info appears to be a browser URL.
googlesheets will attempt to extract sheet key from the URL.
Putative key: 1yocReg2dL84x1NQDEyc9ywEDJM6gdfr2ELNlCcP3zPo
Sheet successfully identified: "phewas"
Accessing worksheet titled 'GBE_EBI_hits'.
Parsed with column specification:
cols(
  GBE_ID = col_character(),
  GBE_short_name = col_character(),
  EBI_TRAIT = col_character(),
  EBI_URI = col_character(),
  is_hit = col_logical(),
  note = col_character()
)


In [15]:
# Tally how many phenotype pairs were marked as matching during annotation.
count(GBE_EBI_pheno_match_df, is_hit)

is_hit,n
<lgl>,<int>
False,1118
True,71


In [16]:
# Bring the manual annotation onto the joined table as `EBI_is_hit`.
# Rows with no annotation for their (GBE_ID, EBI_URI) pair are treated as
# non-matches (FALSE).
df_with_ebi <- df_with_ebi_unfiltered %>%
  left_join(
    select(GBE_EBI_pheno_match_df, GBE_ID, EBI_URI, EBI_is_hit = is_hit),
    by = c('GBE_ID', 'EBI_URI')
  ) %>%
  mutate(EBI_is_hit = coalesce(EBI_is_hit, FALSE))


In [17]:
# Add two per-association (ID, GBE_ID) annotations:
#   is_novel     - TRUE when none of the association's rows is an EBI hit
#   EBI_hit_rsID - LD_rsID of the EBI hit with the smallest EBI_P, if any
df_with_novelty_check <- df_with_ebi %>%
  left_join(
    df_with_ebi %>%
      group_by(ID, GBE_ID) %>%
      summarise(is_novel = !any(EBI_is_hit)) %>%
      ungroup(),
    by = c('ID', 'GBE_ID')
  ) %>%
  left_join(
    df_with_ebi %>%
      filter(EBI_is_hit) %>%
      # sort globally by p-value first, then keep the first row per group
      arrange(EBI_P) %>%
      group_by(ID, GBE_ID) %>%
      filter(row_number() == 1) %>%
      ungroup() %>%
      select(ID, GBE_ID, EBI_hit_rsID = LD_rsID),
    by = c('ID', 'GBE_ID')
  )


In [71]:
# Write the fully annotated table, sorted by position and phenotype, with
# the chromosome column renamed back to '#CHROM'.
fwrite(
  df_with_novelty_check %>%
    arrange(CHROM, POS, GBE_ID) %>%
    rename(`#CHROM` = CHROM),
  file = phewas_hits_ld_gwas_catalog,
  sep = '\t'
)


In [18]:
# Display the path of the annotated PheWAS x GWAS Catalog table.
phewas_hits_ld_gwas_catalog

In [19]:
df_with_novelty_check %>% 
# List the columns available in the annotated table.
colnames(df_with_novelty_check)

In [20]:
# Build the summary table: drop the per-LD-variant and per-EBI-record
# columns, attach an `rsID` column for the original variant, and collapse
# to unique rows.
#
# The rsID lookup is de-duplicated BEFORE joining. `df_with_novelty_check`
# repeats each (ID, LD_rsID) pair once per phenotype and per EBI record, so
# joining the raw lookup would multiply every row by that repeat count, only
# for the final `unique()` to collapse it again. The result is unchanged;
# the intermediate table is much smaller.
#
# NOTE(review): the lookup takes rows with LD_R2 == 1 as "the variant
# itself". Another variant in perfect LD would also pass this filter and
# give an ID more than one rsID - confirm against the LD table.
df_summary <- df_with_novelty_check %>%
  select(
    -LD_CHROM, -LD_POS, -LD_ID, -LD_R2, -LD_rsID,
    -Consequence, -Gene, -Gene_symbol,
    -EBI_P, -EBI_TRAIT, -EBI_URI, -EBI_is_hit,
    -phewas_idx,
    -finngen_var_id
  ) %>%
  left_join(
    df_with_novelty_check %>%
      filter(LD_R2 == 1) %>%
      select(ID, LD_rsID) %>%
      rename('rsID' = 'LD_rsID') %>%
      unique(),
    by = 'ID'
  ) %>%
  unique()

In [21]:
# Tally novel versus previously reported associations in the summary.
count(df_summary, is_novel)

is_novel,n
<lgl>,<int>
False,83
True,186


In [73]:
# Write the summary table, sorted by position and phenotype, with the
# chromosome column renamed back to '#CHROM'.
fwrite(
  df_summary %>%
    arrange(CHROM, POS, GBE_ID) %>%
    rename(`#CHROM` = CHROM),
  file = phewas_hits_ld_gwas_catalog_summary,
  sep = '\t'
)

In [22]:
# Display the path of the summary table.
phewas_hits_ld_gwas_catalog_summary

In [28]:
# Sanity check: list PheWAS variants with no usable FinnGen variant ID
# (missing or empty) on any of their rows. An empty result means every
# variant is covered.
#
# Usable IDs are counted per variant WITHOUT filtering rows first. If the
# missing/empty rows were dropped before grouping, a variant with no usable
# ID would vanish from the table entirely, no group could ever have a count
# below 1, and the check would return nothing regardless of the data.
df_with_novelty_check %>%
  select(ID, finngen_var_id) %>%
  group_by(ID) %>%
  summarise(
    n_finngen = sum(!is.na(finngen_var_id) & finngen_var_id != '')
  ) %>%
  ungroup() %>%
  filter(n_finngen < 1)

ID,n_finngen
<chr>,<int>


In [38]:
# Write the distinct, non-missing, non-empty FinnGen variant IDs as a
# one-column list under the header '#finngen_var_id'.
df_with_novelty_check %>%
  filter(!is.na(finngen_var_id), finngen_var_id != '') %>%
  distinct(finngen_var_id) %>%
  rename(`#finngen_var_id` = finngen_var_id) %>%
  fwrite(file = finngen_var_ids)


In [39]:
# Display the path of the FinnGen variant ID list.
finngen_var_ids