In [1]:
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(data.table))


In [2]:
# Directory that holds the PheWAS result tables used in this notebook.
res_dir <- "@@@@@@/projects/biomarkers/phewas/v2_imp_nc"

# Path of the PheWAS hits table (only the path is built here).
phewas_hits_ld <- file.path(
  res_dir, "ukb24983_imp_v3.nc.phewas.hits.ld.tsv"
)

# Unlike its neighbours, this one is loaded immediately: read the
# tab-separated file and strip the leading '#' from the `#chrom` header.
finngen_R2_extracted <- rename(
  fread(
    file.path(res_dir, "ukb24983_imp_v3.nc.phewas.hits.ld.finngen.R2.tsv"),
    sep = "\t"
  ),
  chrom = `#chrom`
)

# Path of the combined summary table (only the path is built here).
phewas_hits_ld_gwas_catalog_finngen_summary <- file.path(
  res_dir,
  "ukb24983_imp_v3.nc.phewas.hits.ld.gwascatalog.finngen.summary.tsv"
)


In [7]:
# Path of the GBE/FinnGen phenotype-pair table, located under res_dir.
GBE_Finngen_pheno_match <- file.path(
  res_dir,
  "GBE_Finngen_hits.tsv"
)

In [3]:
# Load the tab-separated FinnGen R2 manifest; its `phenocode` and `name`
# columns are joined onto the association table further down.
finngen_R2_manifest <- fread(
  "@@@@@@@@/users/ytanigaw/20200114_FinnGen_R2/summary_stats/r2_manifest.tsv",
  sep = "\t"
)


In [4]:
# Build the FinnGen table that is later joined to the PheWAS hits.
#   1. Add `var_id`, the "chrom-pos-ref-alt" string used as the join key.
#   2. Attach the phenotype `name` from the manifest via `phenocode`.
#   3. Drop the four coordinate columns now encoded in `var_id`.
finngen_R2_df <- finngen_R2_extracted %>%
  mutate(var_id = paste(chrom, pos, ref, alt, sep = "-")) %>%
  left_join(
    finngen_R2_manifest %>%
      select(phenocode, name),
    by = "phenocode"
  ) %>%
  select(-chrom, -pos, -ref, -alt)

# Prefix every column with "finngen_" so the names cannot collide with the
# PheWAS columns after joining. paste0() is vectorised and returns a plain
# character vector, which is what `colnames<-` expects (the previous lapply()
# produced a list and relied on implicit coercion).
finngen_R2_df.colnames <- paste0("finngen_", colnames(finngen_R2_df))

colnames(finngen_R2_df) <- finngen_R2_df.colnames

In [5]:
# Load the PheWAS hits table and strip the leading '#' from `#CHROM`.
df <- phewas_hits_ld %>%
  fread() %>%
  rename(CHROM = `#CHROM`)


In [8]:
# Write the template of (GBE phenotype, FinnGen phenotype) pairs.
# PheWAS hits and FinnGen rows are matched on `finngen_var_id`; rows that
# lack either side are discarded, the variant-level columns are removed, and
# the distinct pairs are written with `is_hit` = FALSE and an empty `Note`.
df %>%
  select(ID, GBE_ID, GBE_short_name, finngen_var_id) %>%
  full_join(
    select(finngen_R2_df, finngen_var_id, finngen_name, finngen_phenocode),
    by = "finngen_var_id"
  ) %>%
  # keep only rows present on both sides of the full join
  drop_na(ID, finngen_phenocode) %>%
  select(-c(ID, finngen_var_id)) %>%
  arrange(GBE_ID, finngen_phenocode) %>%
  # header of the output file starts with '#'
  rename(`#GBE_ID` = GBE_ID) %>%
  unique() %>%
  mutate(is_hit = FALSE, Note = "") %>%
  fwrite(GBE_Finngen_pheno_match, sep = "\t")


In [8]:
# Show the path of the phenotype-pair table (a file path, not a data frame).
GBE_Finngen_pheno_match

#### We performed manual annotation of the phenotype matches

In [9]:
library(googlesheets)

# Authenticate against Google Sheets with a stored OAuth token.
gs_auth(token = "/home/users/ytanigaw/.googlesheets_token.rds")

# Read the 'GBE_Finngen_hits' worksheet of the spreadsheet at this URL.
GBE_Finngen_pheno_match_df <- gs_read(
  gs_url(
    "https://docs.google.com/spreadsheets/d/1yocReg2dL84x1NQDEyc9ywEDJM6gdfr2ELNlCcP3zPo"
  ),
  ws = "GBE_Finngen_hits"
)


Auto-refreshing stale OAuth token.
Sheet-identifying info appears to be a browser URL.
googlesheets will attempt to extract sheet key from the URL.
Putative key: 1yocReg2dL84x1NQDEyc9ywEDJM6gdfr2ELNlCcP3zPo
Sheet successfully identified: "phewas"
Accessing worksheet titled 'GBE_Finngen_hits'.
Parsed with column specification:
cols(
  GBE_ID = col_character(),
  GBE_short_name = col_character(),
  finngen_name = col_character(),
  finngen_phenocode = col_character(),
  is_related = col_logical(),
  is_hit = col_logical(),
  Note = col_logical()
)


In [10]:
# Join the PheWAS hits to the FinnGen associations and attach the `is_hit`
# flag from the annotation sheet.
finngen_hits <- df %>%
  # keep only hits that carry a usable FinnGen variant ID
  drop_na(finngen_var_id) %>%
  filter(finngen_var_id != "") %>%
  unique() %>%
  # Remember the input row order so it can be restored after later sorting.
  # seq_len(n()) is used instead of 1:n() because 1:0 evaluates to c(1, 0),
  # which makes mutate() fail when no rows are left at this point.
  mutate(sort_order = seq_len(n())) %>%
  full_join(
    finngen_R2_df,
    by = "finngen_var_id"
  ) %>%
  # drop rows that are missing either side of the full join
  drop_na(ID, finngen_phenocode) %>%
  left_join(
    GBE_Finngen_pheno_match_df %>%
      select(GBE_ID, finngen_phenocode, is_hit),
    by = c("GBE_ID", "finngen_phenocode")
  )


In [11]:
# Reduce the annotated hits to one row per (ID, GBE_ID): the row with the
# smallest `finngen_pval`, then restore the original ordering.
finngen_hits_summary <- finngen_hits %>%
  filter(is_hit) %>%
  unique() %>%
  # sort the whole table by p-value first, so that the first row seen
  # within each group is the one with the smallest p-value
  arrange(finngen_pval) %>%
  group_by(ID, GBE_ID) %>%
  filter(row_number() == 1L) %>%
  ungroup() %>%
  # sort_order is only a helper for ordering; remove it once used
  arrange(sort_order) %>%
  select(-sort_order)


In [12]:
# Read the 'ukb24983_imp_v3.nc.phewas.summary' worksheet of the spreadsheet
# at this URL and strip the leading '#' from `#CHROM`.
phewas_hits_ld_gwas_catalog_summary_df <- rename(
  gs_read(
    gs_url(
      "https://docs.google.com/spreadsheets/d/1yocReg2dL84x1NQDEyc9ywEDJM6gdfr2ELNlCcP3zPo"
    ),
    ws = "ukb24983_imp_v3.nc.phewas.summary"
  ),
  CHROM = `#CHROM`
)


Sheet-identifying info appears to be a browser URL.
googlesheets will attempt to extract sheet key from the URL.
Putative key: 1yocReg2dL84x1NQDEyc9ywEDJM6gdfr2ELNlCcP3zPo
Sheet successfully identified: "phewas"
Accessing worksheet titled 'ukb24983_imp_v3.nc.phewas.summary'.
Parsed with column specification:
cols(
  `#CHROM` = col_double(),
  POS = col_number(),
  ID = col_character(),
  OBS_CT = col_double(),
  OR = col_double(),
  `LOG(OR)_SE` = col_double(),
  Z_STAT = col_double(),
  P = col_double(),
  GBE_ID = col_character(),
  GBE_short_name = col_character(),
  is_novel = col_logical(),
  EBI_hit_rsID = col_character(),
  rsID = col_character(),
  Note = col_character()
)


In [13]:
# List the columns available in the per-(ID, GBE_ID) FinnGen summary.
colnames(finngen_hits_summary)

In [14]:
# Add the FinnGen rsIDs to the summary table as `Finngen_hit_rsID`,
# matching on (ID, GBE_ID); summary rows without a match get NA.
phewas_hits_ld_gwas_catalog_finngen_summary_df <- left_join(
  phewas_hits_ld_gwas_catalog_summary_df,
  # select() renames finngen_rsids while picking the three columns
  select(finngen_hits_summary, ID, GBE_ID, Finngen_hit_rsID = finngen_rsids),
  by = c("ID", "GBE_ID")
)

In [15]:
# Print the dimensions before and after the join to compare row counts.
print(dim(phewas_hits_ld_gwas_catalog_summary_df))
print(dim(phewas_hits_ld_gwas_catalog_finngen_summary_df))


[1] 269  14
[1] 269  15


In [16]:
# Write the combined summary as a tab-separated file, putting the leading
# '#' back on the CHROM header.
fwrite(
  rename(phewas_hits_ld_gwas_catalog_finngen_summary_df, `#CHROM` = CHROM),
  phewas_hits_ld_gwas_catalog_finngen_summary,
  sep = "\t"
)


In [17]:
# Show the path of the combined summary file (a file path, not a data frame).
phewas_hits_ld_gwas_catalog_finngen_summary

In [23]:
# Cross-tabulate `is_novel` against whether a FinnGen rsID is missing.
count(
  phewas_hits_ld_gwas_catalog_finngen_summary_df,
  is_novel,
  is.na(Finngen_hit_rsID)
)


is_novel,is.na(Finngen_hit_rsID),n
<lgl>,<lgl>,<int>
False,False,40
False,True,118
True,False,19
True,True,92
