In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# Script arguments; the first element is used as the phenotype ID (GBE_ID).
args <- c("INI50")

In [3]:
# Phenotype ID analysed in this run: the first script argument.
GBE_ID <- args[1L]

In [4]:
# Evaluate paths.sh as R code.
# NOTE(review): presumably this defines the path variables used below
# (out_d, covar_model_BETAs_f, centers_f, phe_f, PRS202110_f) - confirm.
source("paths.sh")

In [5]:
# Covariate columns: age, sex, Array and PC1..PC10.
covariates <- c("age", "sex", "Array", paste0("PC", seq_len(10)))

In [6]:
# Per-phenotype output path: <out_d>/output/<GBE_ID>.tsv
out_f <- file.path(
    out_d,
    "output",
    sprintf("%s.tsv", GBE_ID)
)
# Development override: the path computed above is replaced, so results are
# written to dev.tsv in the working directory.
out_f <- "dev.tsv"


In [10]:
# Choose the shell command that streams file `f` as plain text, based on its
# extension: 'zstdcat' for *.zst, 'zcat' for *.gz, 'cat' for anything else.
# Vectorised over `f`.
cat_or_zcat <- function(f){
    is_zst <- endsWith(f, '.zst')
    is_gz <- endsWith(f, '.gz')
    gz_or_plain <- ifelse(is_gz, 'zcat', 'cat')
    ifelse(is_zst, 'zstdcat', gz_or_plain)
}


In [8]:
# Read the covariate-model coefficient table for this phenotype: substitute
# GBE_ID for the __TRAIT__ placeholder in the path template, read the file,
# and strip '#' from column names that start with it.
covar_model_BETAs_df <- covar_model_BETAs_f %>%
    str_replace_all('__TRAIT__', GBE_ID) %>%
    fread() %>%
    rename_with(~ str_replace(.x, '#', ''), starts_with("#"))


In [9]:
# GLM family for this phenotype: IDs whose prefix (trailing digits removed)
# is INI or QT_FC are fitted as gaussian; everything else as binomial.
fam <- if (str_replace(GBE_ID, '[0-9]+$', '') %in% c('INI', 'QT_FC')) {
    'gaussian'
} else {
    'binomial'
}


In [None]:
# analysis center
# Rows without a value in f.54.0.0 are dropped; the remaining codes become the
# factor center_id. The reference level 11010 denotes "Leeds", which is the
# most common assessment center.
centers_df <- centers_f %>%
    fread(colClasses = c('#FID' = 'character', 'IID' = 'character')) %>%
    rename_with(~ str_replace(.x, '#', ''), starts_with("#")) %>%
    drop_na(f.54.0.0) %>%
    mutate(
        center_id = relevel(as.factor(f.54.0.0), ref = "11010")
    )

# phenotype file
# Stream the (possibly compressed) phenotype table and keep only the ID
# columns, the population/split labels, the covariates and the trait column.
# FID/IID are read as character; the leading '#' is stripped from '#FID'.
phe_df <- fread(
    cmd = paste(cat_or_zcat(phe_f), phe_f),
    colClasses = c('#FID' = 'character', 'IID' = 'character'),
    select = c('#FID', 'IID', 'population', 'split', covariates, GBE_ID)
) %>%
    rename_with(
        function(x){str_replace(x, '#', '')}, starts_with("#")
    ) %>%
    # Treat -9 as missing in the trait column. na_if() works on a vector, so
    # it is applied to the column named by GBE_ID; the previous
    # na_if(list(GBE_ID = -9)) used the literal name "GBE_ID" and passed the
    # whole data frame, which newer dplyr rejects.
    mutate(across(all_of(GBE_ID), ~ na_if(.x, -9)))


In [12]:
# PRS
# Stream the PRS table and keep the ID columns plus the PRS_<GBE_ID> column;
# FID/IID are read as character and the leading '#' is stripped from '#FID'.
PRS_df <- fread(
    cmd = paste(cat_or_zcat(PRS202110_f), PRS202110_f),
    colClasses = c('#FID' = 'character', 'IID' = 'character'),
    select = c('#FID', 'IID', paste0('PRS_', GBE_ID))
) %>%
    rename_with(~ str_replace(.x, '#', ''), starts_with("#"))

# Individuals present in the phenotype, PRS and center tables, restricted to
# rows where both the trait and its PRS are non-missing.
full_df <- phe_df %>%
    inner_join(PRS_df, by = c('FID', 'IID')) %>%
    inner_join(centers_df, by = c('FID', 'IID')) %>%
    drop_na(all_of(c(GBE_ID, paste0('PRS_', GBE_ID))))

# Evaluation subset: white_british individuals in the test split.
test_df <- full_df %>%
    filter(population == 'white_british' & split == 'test')

# Covariate score per test individual: the (individuals x covariates) matrix
# multiplied by the train_val coefficient column of the covariate model.
# NOTE(review): assumes every covariate column is numeric - confirm.
covar_score_df <- test_df %>%
    column_to_rownames('IID') %>%
    select(all_of(covariates)) %>%
    as.matrix() %*% (
        covar_model_BETAs_df %>%
        filter(split == 'train_val') %>%
        filter(variable %in% covariates) %>%
        select(variable, estimate) %>%
        rename('covar_score' = 'estimate') %>%
        column_to_rownames('variable') %>%
        as.matrix() %>%
        # %*% pairs columns with rows by position, so put the coefficient
        # rows in the same order as `covariates`; a covariate missing from
        # the coefficient table fails here with a subscript error.
        .[covariates, , drop = FALSE]
    ) %>%
    as.data.frame() %>%
    rownames_to_column('IID')


# Attach each individual's covar_score to the test set, matching on IID.
test_df <- left_join(test_df, covar_score_df, by = 'IID')


In [13]:
# Fit (trait - 1) on covar_score, assessment center (center_id) and the
# trait's PRS in the test set, using the family chosen in `fam`.
# NOTE(review): the response is shifted by 1 for both families - presumably
# meant to map 1/2-coded binary traits to 0/1; confirm it is intended for
# gaussian traits.
# NOTE(review): inside a formula `*` is the crossing operator, not
# arithmetic, so `(1 * covar_score)` presumably enters covar_score as an
# ordinary term with a freely estimated coefficient; if a coefficient fixed
# at 1 was intended, offset(covar_score) would be needed - confirm.
glmfit_center <- glm(
    formula = stats::as.formula(sprintf(
        '(%s - 1) ~ 1 + (1 * covar_score) + %s + %s',
        GBE_ID, 'center_id', paste0('PRS_', GBE_ID)
    )),
    family = fam,
    data = test_df
)

In [14]:
# Tidy the coefficient table of the fitted model: term names move from the
# row names into `variable`, the four statistic columns are renamed by
# position, and the phenotype ID is added as the leading column.
results_df <- summary(glmfit_center)$coefficients %>%
    as.data.frame() %>%
    rownames_to_column('variable') %>%
    rename(
        'variable' = 1,
        'estimate' = 2,
        'SE' = 3,
        'z_or_t_value' = 4,
        'P' = 5
    ) %>%
    mutate(phenotype = GBE_ID) %>%
    select(phenotype, variable, estimate, SE, z_or_t_value, P)


In [15]:
# Display the coefficient table (phenotype, variable, estimate, SE,
# z_or_t_value, P) for inspection.
results_df

phenotype,variable,estimate,SE,z_or_t_value,P
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
INI50,(Intercept),164.90492307,0.107947421,1527.641162,0.0
INI50,center_id10003,2.06775672,1.109813779,1.8631565,0.06244461
INI50,center_id11001,-0.25980243,0.223322599,-1.1633504,0.2446915
INI50,center_id11002,1.04039764,0.214613245,4.8477793,1.251292e-06
INI50,center_id11003,-0.06121557,0.196105759,-0.3121559,0.754923
INI50,center_id11004,-1.08379564,0.194240695,-5.5796528,2.419237e-08
INI50,center_id11005,0.06109116,0.198473874,0.3078046,0.758232
INI50,center_id11006,0.63910935,0.191250913,3.3417323,0.0008330265
INI50,center_id11007,1.19016504,0.167003052,7.1266065,1.039028e-12
INI50,center_id11008,-0.7818782,0.167609409,-4.6648825,3.093864e-06


In [None]:
# Write the coefficient table to out_f as a tab-separated file. The first
# header is prefixed with '#'. results_df has no `model` column (its columns
# are phenotype, variable, estimate, SE, z_or_t_value, P), so the prefix is
# applied to `phenotype`.
results_df %>%
    rename('#phenotype' = 'phenotype') %>%
    fwrite(out_f, sep = '\t', na = "NA", quote = FALSE)