## MetaSTAAR_simulation.r

In [None]:
#import libraries
library(data.table)
library(dplyr)
library(STAAR)
library(MetaSTAAR)

# Load the merge helper and the per-group marker tables.

source("/media/leelabsg-storage0/eunjae/MetaSTAAR/MetaSTAAR_merge_simulation.R")

# Read one marker_info file, keep the four columns used downstream and rename
# them to chr / pos / ref / alt (Major_Allele -> ref, Minor_Allele -> alt).
read_marker_table <- function(path) {
  markers <- fread(path)
  markers <- markers[, c("CHR", "POS", "Major_Allele", "Minor_Allele")]
  colnames(markers) <- c("chr", "pos", "ref", "alt")
  markers
}

variant_info1 <- read_marker_table(
  "/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-saige/step3_res/group1.marker_info.txt"
)
variant_info2 <- read_marker_table(
  "/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-saige/step3_res/group2.marker_info.txt"
)
variant_info3 <- read_marker_table(
  "/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-saige/step3_res/group3.marker_info.txt"
)



# Genotype loading: restores `genotype`, used below as genotype[[1]]..[[3]].
load("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/genotype.RData")

# Phenotype loading
pheno <- fread("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/case1_group3_mock_pheno.txt")

# Per-group sample ID lists (first column is read as V1).
id_1 <- fread("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/genotype/group1_id.txt")
id_2 <- fread("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/genotype/group2_id.txt")
id_3 <- fread("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/genotype/group3_id.txt")

# sort = FALSE keeps the merged rows in ID-file order. The default (sort = TRUE)
# reorders them by V1, but the genotype matrices are paired with these
# phenotypes purely by row position further down.
# NOTE(review): assumes the genotype rows follow the group ID file order -- confirm.
pheno1 <- merge(id_1, pheno, by.x = "V1", by.y = "IID", sort = FALSE)
pheno2 <- merge(id_2, pheno, by.x = "V1", by.y = "IID", sort = FALSE)
pheno3 <- merge(id_3, pheno, by.x = "V1", by.y = "IID", sort = FALSE)

# The merge is an inner join, so IDs without a phenotype row are dropped; stop
# early if that leaves the phenotype and genotype row counts out of step.
stopifnot(
  nrow(pheno1) == nrow(genotype[[1]]),
  nrow(pheno2) == nrow(genotype[[2]]),
  nrow(pheno3) == nrow(genotype[[3]])
)

# One binomial null model per group, with the same covariates in each.
glm.null.study1 <- fit_null_glm(
  Y ~ Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
  data = pheno1, family = "binomial"
)
glm.null.study2 <- fit_null_glm(
  Y ~ Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
  data = pheno2, family = "binomial"
)
glm.null.study3 <- fit_null_glm(
  Y ~ Sex + Age + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
  data = pheno3, family = "binomial"
)

# Per-group summary statistics from the genotype matrix, null model and marker table.
sumstat1 <- MetaSTAAR_worker_sumstat(genotype = genotype[[1]], glm.null.study1, variant_info1)
sumstat2 <- MetaSTAAR_worker_sumstat(genotype = genotype[[2]], glm.null.study2, variant_info2)
sumstat3 <- MetaSTAAR_worker_sumstat(genotype = genotype[[3]], glm.null.study3, variant_info3)

# Time the three covariance computations together. region_midpos is set to the
# last position listed in each group's marker table.
start <- Sys.time()
cov1 <- MetaSTAAR_worker_cov(
  genotype = genotype[[1]], obj_nullmodel = glm.null.study1, cov_maf_cutoff = 0.05,
  variant_pos = variant_info1$pos, region_midpos = variant_info1$pos[nrow(variant_info1)]
)
cov2 <- MetaSTAAR_worker_cov(
  genotype = genotype[[2]], obj_nullmodel = glm.null.study2, cov_maf_cutoff = 0.05,
  variant_pos = variant_info2$pos, region_midpos = variant_info2$pos[nrow(variant_info2)]
)
cov3 <- MetaSTAAR_worker_cov(
  genotype = genotype[[3]], obj_nullmodel = glm.null.study3, cov_maf_cutoff = 0.05,
  variant_pos = variant_info3$pos, region_midpos = variant_info3$pos[nrow(variant_info3)]
)
end <- Sys.time()
# `end - start` is a difftime whose unit is chosen automatically, and cat()
# prints only the number. Force seconds and print the unit so logs from
# different runs are comparable.
cov_secs <- as.numeric(difftime(end, start, units = "secs"))
cat('elapsed time for MetaSTAAR_worker_cov ', cov_secs, ' secs\n')

# Only variant_info1 is stored alongside the statistics.
save(sumstat1, sumstat2, sumstat3, cov1, cov2, cov3, variant_info1,
     file = '/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/sumstat_cov.RData')

# Time the merge of the three groups plus the MetaSTAAR test itself.
start <- Sys.time()
# The region spans the first to the last position of group 1's marker table;
# sample sizes are hard-coded to 10000 per group.
test <- MetaSTAAR_merge_simulation(
  chr = 2,
  start.loc = variant_info1$pos[1],
  end.loc = variant_info1$pos[nrow(variant_info1)],
  sample.sizes = rep(10000, 3),
  sumstat.list = list(sumstat1, sumstat2, sumstat3),
  cov.list = list(cov1, cov2, cov3),
  cov_maf_cutoff = 0.05
)

# Top-level expression: the results_MetaSTAAR_O element is auto-printed.
MetaSTAAR(obj_MetaSTAAR_merge = test, annotation_phred = NULL, rv_num_cutoff = 2)$results_MetaSTAAR_O
end <- Sys.time()
# `end - start` is a difftime whose unit is chosen automatically, and cat()
# prints only the number. Force seconds and print the unit.
meta_secs <- as.numeric(difftime(end, start, units = "secs"))
cat('elapsed time for MetaSTAAR ', meta_secs, ' secs\n')


## Read Genotype

In [None]:
library(seqminer)
library(data.table)
library(dplyr)
library(argparser, quietly = TRUE)

# Command line: --n_sample is pasted into every input and output path below.
p <- arg_parser("Read genotype data")
p <- add_argument(p, "--n_sample", help = "number of samples")
argv <- parse_args(p)
n <- argv$n_sample
# Without this check an omitted --n_sample would be pasted into the paths as
# the text "NA" and only fail later with a confusing file-not-found error.
if (length(n) != 1L || is.na(n)) {
  stop("--n_sample is required", call. = FALSE)
}


# PLINK file prefixes for the three groups. Each opened object gets an `idx`
# column holding the row number of every marker in its .bim table; it is used
# later to turn matched markers into read indexes.
# seq_len() instead of 1:nrow() so an empty .bim yields an empty index rather
# than c(1, 0).
fname1 <- paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/genotype/group1_', n, '_chr2')
plinkObj1 <- openPlink(fname1)
plinkObj1$bim$idx <- seq_len(nrow(plinkObj1$bim))

fname2 <- paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/genotype/group2_', n, '_chr2')
plinkObj2 <- openPlink(fname2)
plinkObj2$bim$idx <- seq_len(nrow(plinkObj2$bim))

fname3 <- paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/genotype/group3_', n, '_chr2')
plinkObj3 <- openPlink(fname3)
plinkObj3$bim$idx <- seq_len(nrow(plinkObj3$bim))

# Sample ID lists; only their row counts are used when reading genotypes.
group1_id <- fread(paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/genotype/group1_', n, '_id.txt'))
group2_id <- fread(paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/genotype/group2_', n, '_id.txt'))
group3_id <- fread(paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/genotype/group3_', n, '_id.txt'))

# Marker tables for each group.
variant_info1 <- fread(paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-saige/step3_res/group1_', n, '.marker_info.txt'))
variant_info2 <- fread(paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-saige/step3_res/group2_', n, '.marker_info.txt'))
variant_info3 <- fread(paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-saige/step3_res/group3_', n, '.marker_info.txt'))

# Keep the marker rows whose Set column equals 'TTN'.
info_gene1 <- variant_info1[which(variant_info1$Set == 'TTN'), ]
info_gene2 <- variant_info2[which(variant_info2$Set == 'TTN'), ]
info_gene3 <- variant_info3[which(variant_info3$Set == 'TTN'), ]

# Attach each marker's .bim row by matching position and both alleles
# (POS = V4, Minor_Allele = V5, Major_Allele = V6).
var_id1 <- left_join(info_gene1, plinkObj1$bim, by = c('POS' = 'V4', 'Minor_Allele' = 'V5', 'Major_Allele' = 'V6'))
var_id2 <- left_join(info_gene2, plinkObj2$bim, by = c('POS' = 'V4', 'Minor_Allele' = 'V5', 'Major_Allele' = 'V6'))
var_id3 <- left_join(info_gene3, plinkObj3$bim, by = c('POS' = 'V4', 'Minor_Allele' = 'V5', 'Major_Allele' = 'V6'))

# A left join keeps markers that have no .bim match and gives them idx = NA.
# Stop here rather than hand NA indexes to the PLINK reader.
check_marker_index <- function(idx, label) {
  if (anyNA(idx)) {
    stop(sum(is.na(idx)), " marker(s) from ", label,
         " have no match in the PLINK .bim table", call. = FALSE)
  }
  idx
}

var_idx1 <- check_marker_index(var_id1$idx, "group1")
var_idx2 <- check_marker_index(var_id2$idx, "group2")
var_idx3 <- check_marker_index(var_id3$idx, "group3")



# Read the selected markers for the first `n_samples` samples of a PLINK file
# and recode the matrix:
#   1. entries equal to -9 are turned into NA,
#   2. every value g becomes 2 - g,
#   3. NA entries are then set to 0.
# seq_len() replaces 1:nrow() so an empty ID table gives an empty sample index
# instead of c(1, 0).
read_recoded_genotype <- function(plink_prefix, n_samples, marker_idx) {
  dosage <- readPlinkToMatrixByIndex(plink_prefix, seq_len(n_samples), marker_idx)
  dosage[dosage == -9] <- NA
  dosage <- 2 - dosage
  dosage[is.na(dosage)] <- 0
  dosage
}

genotype1 <- read_recoded_genotype(fname1, nrow(group1_id), var_idx1)
genotype2 <- read_recoded_genotype(fname2, nrow(group2_id), var_idx2)
genotype3 <- read_recoded_genotype(fname3, nrow(group3_id), var_idx3)

# Bundle the three matrices; later steps index this list as genotype[[k]].
genotype <- list(genotype1, genotype2, genotype3)

save(genotype, file = paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/genotype_', n, '.RData'))


## Fit the null model

In [None]:
#import libraries
library(data.table)
library(dplyr)
library(STAAR)
library(MetaSTAAR)
library(argparser, quietly = TRUE)
library(Matrix)

# Command line: --n_sample is pasted into every input and output path below.
# The parser description was copied from the genotype-reading script; it now
# says what this script does.
p <- arg_parser("Fit the null model")
p <- add_argument(p, "--n_sample", help = "number of samples")
argv <- parse_args(p)
n <- argv$n_sample
# Without this check an omitted --n_sample would be pasted into the paths as
# the text "NA" and only fail later with a confusing file-not-found error.
if (length(n) != 1L || is.na(n)) {
  stop("--n_sample is required", call. = FALSE)
}

# Load the merge helper and the per-group marker tables.

# Timestamp taken before any loading starts.
# NOTE(review): this value is not read again in the visible part of the script.
start_time_fitting <- Sys.time()

source("/media/leelabsg-storage0/eunjae/MetaSTAAR/MetaSTAAR_merge_simulation.R")

# Read one marker_info file, keep the four columns used downstream and rename
# them to chr / pos / ref / alt (Major_Allele -> ref, Minor_Allele -> alt).
read_marker_table <- function(path) {
  markers <- fread(path)
  markers <- markers[, c("CHR", "POS", "Major_Allele", "Minor_Allele")]
  colnames(markers) <- c("chr", "pos", "ref", "alt")
  markers
}

variant_info1 <- read_marker_table(
  paste0("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-saige/step3_res/group1_", n, ".marker_info.txt")
)
variant_info2 <- read_marker_table(
  paste0("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-saige/step3_res/group2_", n, ".marker_info.txt")
)
variant_info3 <- read_marker_table(
  paste0("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-saige/step3_res/group3_", n, ".marker_info.txt")
)

# Genotype loading is disabled in this script:
# load(paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/genotype_', n, '.RData'))

# Phenotype table (one file shared by all three groups).
pheno_path <- "/media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES/seed_1_WES_200k_WB_tau_1_prev_0.01.pheno.evenChr_pca"
pheno <- fread(pheno_path)

# Per-group sample ID lists.
id_path_1 <- paste0("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/genotype/group1_", n, "_id.txt")
id_path_2 <- paste0("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/genotype/group2_", n, "_id.txt")
id_path_3 <- paste0("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/genotype/group3_", n, "_id.txt")
id_1 <- fread(id_path_1)
id_2 <- fread(id_path_2)
id_3 <- fread(id_path_3)

# Sparse kinship (GRM) matrices in Matrix Market format, one per group.
grm_path_1 <- paste0("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-saige/step0_res/group1_", n, "_relatednessCutoff_0.125_2000_randomMarkersUsed.sparseGRM.mtx")
grm_path_2 <- paste0("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-saige/step0_res/group2_", n, "_relatednessCutoff_0.125_2000_randomMarkersUsed.sparseGRM.mtx")
grm_path_3 <- paste0("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-saige/step0_res/group3_", n, "_relatednessCutoff_0.125_2000_randomMarkersUsed.sparseGRM.mtx")
kin1 <- readMM(grm_path_1)
kin2 <- readMM(grm_path_2)
kin3 <- readMM(grm_path_3)

# Join each group's ID list to the phenotype table.
# sort = FALSE keeps the rows in ID-file order. The default (sort = TRUE)
# reorders them by V1, and the kinship matrices below are labelled positionally
# from pheno*$V1, so a sorted merge would attach the wrong IDs to kinship rows
# whenever the ID file is not already sorted.
# NOTE(review): assumes the kinship rows follow the group ID file order -- confirm
# against the sample ID list written alongside each sparse GRM.
pheno1 <- merge(id_1, pheno, by.x = 'V1', by.y = 'IND_ID', sort = FALSE)
# genotype[[1]] = genotype[[1]][which(rownames(genotype[[1]]) %in% pheno1$V1),]

pheno2 <- merge(id_2, pheno, by.x = 'V1', by.y = 'IND_ID', sort = FALSE)
# genotype[[2]] = genotype[[2]][which(rownames(genotype[[2]]) %in% pheno2$V1),]

pheno3 <- merge(id_3, pheno, by.x = 'V1', by.y = 'IND_ID', sort = FALSE)
# genotype[[3]] = genotype[[3]][which(rownames(genotype[[3]]) %in% pheno3$V1),]

# Keep the leading block of the kinship matrix that matches the number of
# phenotyped samples and name its rows/columns with their IDs.
label_kinship <- function(kin, sample_ids, label) {
  n_keep <- length(sample_ids)
  if (n_keep != nrow(kin)) {
    # The truncation below is positional, so a size mismatch means the kept
    # block may not correspond to the samples that survived the merge.
    warning(label, ": kinship matrix has ", nrow(kin), " rows but ", n_keep,
            " phenotyped samples; keeping the first ", n_keep, call. = FALSE)
  }
  kin <- kin[seq_len(n_keep), seq_len(n_keep)]
  colnames(kin) <- sample_ids
  rownames(kin) <- sample_ids
  kin
}

kin1 <- label_kinship(kin1, pheno1$V1, "group1")
kin2 <- label_kinship(kin2, pheno2$V1, "group2")
kin3 <- label_kinship(kin3, pheno3$V1, "group3")

# Fit one null mixed model per group, using that group's labelled kinship matrix.
# Samples are identified by column V1. use_sparse is spelled TRUE rather than T,
# because T is an ordinary variable that can be reassigned.
# No `family` is passed, so the function's own default applies.
glm.null.study1 <- fit_null_glmmkin(
  y ~ x1 + x2 + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
  data = pheno1, kins = kin1, id = 'V1', use_sparse = TRUE
)
glm.null.study2 <- fit_null_glmmkin(
  y ~ x1 + x2 + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
  data = pheno2, kins = kin2, id = 'V1', use_sparse = TRUE
)
glm.null.study3 <- fit_null_glmmkin(
  y ~ x1 + x2 + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
  data = pheno3, kins = kin3, id = 'V1', use_sparse = TRUE
)

# Persist the fits with the marker and phenotype tables the next step reloads.
save(glm.null.study1, glm.null.study2, glm.null.study3,
     variant_info1, variant_info2, variant_info3,
     pheno1, pheno2, pheno3,
     file = paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/glm_null_', n, '_.RData'))


## Get Summary Statistics

In [None]:
#import libraries
library(data.table)
library(dplyr)
library(STAAR)
library(MetaSTAAR)
library(argparser, quietly = TRUE)
library(Matrix)

# Command line: --n_sample is pasted into every input and output path below.
# The parser description was copied from the genotype-reading script; it now
# says what this script does.
p <- arg_parser("Compute MetaSTAAR summary statistics")
p <- add_argument(p, "--n_sample", help = "number of samples")
argv <- parse_args(p)
n <- argv$n_sample
# Without this check an omitted --n_sample would be pasted into the paths as
# the text "NA" and only fail later with a confusing file-not-found error.
if (length(n) != 1L || is.na(n)) {
  stop("--n_sample is required", call. = FALSE)
}

# Load the merge helper, the fitted null models and the genotype matrices.

source('/media/leelabsg-storage0/eunjae/MetaSTAAR/MetaSTAAR_merge_simulation.R')
# Restores glm.null.study1..3, variant_info1..3 and pheno1..3.
load(paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/glm_null_', n, '_.RData'))
# `genotype` is not stored in glm_null_<n>_.RData, so it has to be loaded from
# the file written by the genotype-reading step. Without this the calls below
# stop with "object 'genotype' not found".
load(paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/genotype_', n, '.RData'))

# Per-group summary statistics from the genotype matrix, null model and marker table.
# NOTE(review): genotype rows are paired with the null-model samples by position -- confirm the order matches.
sumstat1 <- MetaSTAAR_worker_sumstat(genotype = genotype[[1]], glm.null.study1, variant_info1)
sumstat2 <- MetaSTAAR_worker_sumstat(genotype = genotype[[2]], glm.null.study2, variant_info2)
sumstat3 <- MetaSTAAR_worker_sumstat(genotype = genotype[[3]], glm.null.study3, variant_info3)

# Group sample sizes, taken from the merged phenotype tables.
sample.sizes <- c(nrow(pheno1), nrow(pheno2), nrow(pheno3))

save(genotype, sumstat1, sumstat2, sumstat3,
     glm.null.study1, glm.null.study2, glm.null.study3,
     variant_info1, variant_info2, variant_info3, sample.sizes,
     file = paste0('/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/sumstat_', n, '_.RData'))


## Run MetaAnalysis

In [None]:
#import libraries
library(data.table)
library(dplyr)
library(STAAR)
library(MetaSTAAR)
library(argparser, quietly = TRUE)

# Command line: --n_sample is pasted into the input path below.
# The parser description was copied from the genotype-reading script; it now
# says what this script does.
p <- arg_parser("Run the MetaSTAAR meta-analysis")
p <- add_argument(p, "--n_sample", help = "number of samples")
argv <- parse_args(p)
n <- argv$n_sample
# Without this check an omitted --n_sample would be pasted into the path as
# the text "NA" and only fail later with a confusing file-not-found error.
if (length(n) != 1L || is.na(n)) {
  stop("--n_sample is required", call. = FALSE)
}

# Load the merge helper and the stored summary statistics / covariances.

source("/media/leelabsg-storage0/eunjae/MetaSTAAR/MetaSTAAR_merge_simulation.R")

# The calls below use sumstat1..3, cov1..3, variant_info1 and sample.sizes,
# all of which are expected to come from this file.
load(paste0("/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/sumstat_cov_", n, "_.RData"))

# The merged region runs from the first to the last position of group 1's marker table.
region_start <- variant_info1$pos[1]
region_end <- variant_info1$pos[nrow(variant_info1)]

test <- MetaSTAAR_merge_simulation(
  chr = 2,
  start.loc = region_start,
  end.loc = region_end,
  sample.sizes = sample.sizes,
  sumstat.list = list(sumstat1, sumstat2, sumstat3),
  cov.list = list(cov1, cov2, cov3),
  cov_maf_cutoff = 0.05
)

# Top-level expression: the results_MetaSTAAR_O element is auto-printed.
meta_fit <- MetaSTAAR(obj_MetaSTAAR_merge = test, annotation_phred = NULL, rv_num_cutoff = 2)
meta_fit$results_MetaSTAAR_O


## Running bash script

In [None]:

# Run outside docker: genotype loading
# Step 1: read genotypes, one run per sample size. /usr/bin/time -v writes its
# report for each run to the matching log file.
# Set the MKL / OpenMP thread-count variables to 1 and the *_DYNAMIC ones to false.
export MKL_NUM_THREADS=1 MKL_DYNAMIC=false OMP_NUM_THREADS=1 OMP_DYNAMIC=false
for n in 10000 50000 100000 179176; do
    /usr/bin/time -v \
        -o "/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/logs/1_read_genotype_${n}.log" \
        Rscript /media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/1_read_genotype.r \
        --n_sample "${n}"
done

# Step 2: load genotypes, one run per sample size, logged by /usr/bin/time -v.
export MKL_NUM_THREADS=1 MKL_DYNAMIC=false OMP_NUM_THREADS=1 OMP_DYNAMIC=false
for n in 10000 50000 100000 179176; do
    /usr/bin/time -v \
        -o "/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/logs/2_load_genotype_${n}.log" \
        Rscript /media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/2_load_genotype.r \
        --n_sample "${n}"
done

#Need docker for MetaSTAAR
# Starts an interactive (-it) container from the meta-starr image. Three host
# directories are bind-mounted at identical paths inside the container
# (computational_cost, MetaSTAAR and whole_WES), and the working directory is
# set to /media/leelabsg-storage0/eunjae/.
# NOTE(review): the steps below call Rscript with paths relative to that working
# directory, so they are presumably meant to be typed inside this container -- confirm.
docker run -v /media/leelabsg-storage0/eunjae/Meta-code/computational_cost:/media/leelabsg-storage0/eunjae/Meta-code/computational_cost \
    -v /media/leelabsg-storage0/eunjae/MetaSTAAR/:/media/leelabsg-storage0/eunjae/MetaSTAAR/ \
    -v /media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES:/media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES \
    -w /media/leelabsg-storage0/eunjae/ -it meta-starr

# Step 3: run 3_get_fit.r, one run per sample size. The 3_get_sumstat.r
# invocation is kept below but commented out.
export MKL_NUM_THREADS=1 MKL_DYNAMIC=false OMP_NUM_THREADS=1 OMP_DYNAMIC=false
for n in 10000 50000 100000 179176; do
    /usr/bin/time -v \
        -o "/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/logs/3_get_fit${n}.log" \
        Rscript Meta-code/computational_cost/meta-starr/3_get_fit.r \
        --n_sample "${n}"

    # /usr/bin/time -o /media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/logs/3_get_sumstat${n}.log -v \
    #     Rscript Meta-code/computational_cost/meta-starr/3_get_sumstat.r \
    #     --n_sample ${n}
done

# Step 4: run 4_get_cov.r, one run per sample size, logged by /usr/bin/time -v.
export MKL_NUM_THREADS=1 MKL_DYNAMIC=false OMP_NUM_THREADS=1 OMP_DYNAMIC=false
for n in 10000 50000 100000 179176; do
    /usr/bin/time -v \
        -o "/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/logs/4_get_cov${n}.log" \
        Rscript Meta-code/computational_cost/meta-starr/4_get_cov.r \
        --n_sample "${n}"
done

# Step 5: run 5_run_meta.r, one run per sample size, logged by /usr/bin/time -v.
export MKL_NUM_THREADS=1 MKL_DYNAMIC=false OMP_NUM_THREADS=1 OMP_DYNAMIC=false
for n in 10000 50000 100000 179176; do
    /usr/bin/time -v \
        -o "/media/leelabsg-storage0/eunjae/Meta-code/computational_cost/meta-starr/logs/5_run_meta${n}.log" \
        Rscript Meta-code/computational_cost/meta-starr/5_run_meta.r \
        --n_sample "${n}"
done