## 1. Compute allele frequencies of the variants

In [None]:
%%bash

# Compute allele frequencies (--freq) for the LD-pruned WES PLINK fileset.
#   --bfile : plink file of pruned WES file for 470k white british samples
#   --out   : output prefix for the frequency report
# NOTE(review): the input is described as 470k samples while the output prefix
# says "WES_200k" -- confirm which is intended.
# Comments are kept above the command on purpose: a '#' line placed between
# backslash-continued lines ends the command there, and the remaining options
# would then be executed as a separate (failing) command.
plink2 \
--bfile /home/leelabsg/media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES/LD_pruned/prune_all \
--freq \
--out /home/leelabsg/media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES/WES_200k_freq_ldpruned

## 2. Get random beta values

In [None]:
%%R

options(stringsAsFactors = FALSE)
# Seed for the random marker sub-sampling performed further down in this cell.
set.seed(1)

## load R libraries
#library(SKAT)
#library(MetaSKAT)
library(optparse)
library(data.table)
## set list of cmd line arguments
option_list <- list(
#  make_option("--seedNum", type="integer",default=1,
#    help="seed number for simulation"),
  make_option("--tau", type = "numeric", default = 1,
    help = "tau for variance component"),
  # A numeric option needs a numeric default; NA_real_ marks "not supplied"
  # instead of the character "" used previously.
  make_option("--prev", type = "numeric", default = NA_real_,
    help = "prevalance")
)
## list of options
parser <- OptionParser(usage = "%prog [options]", option_list = option_list)
args <- parse_args(parser, positional_arguments = 0)
opt <- args$options
print(opt)

## output folder
outputPath <- "/home/leelabsg/media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES/"

# Difference between the mean logistic probability and a target prevalence.
#
# Args:
#   x:      value added to every element of eta1_a (the intercept candidate).
#   prev_a: target prevalence that the mean probability is compared against.
#   eta1_a: numeric vector of linear predictors without the intercept.
#
# Returns:
#   mean(logistic(eta1_a + x)) - prev_a, i.e. zero when x makes the mean
#   probability equal to prev_a (the form expected by a root finder).
GetMean <- function(x, prev_a, eta1_a) {
  # plogis(eta) equals exp(eta) / (1 + exp(eta)) but stays finite for large
  # eta, where exp(eta) overflows to Inf and the ratio would become NaN.
  mean(plogis(eta1_a + x)) - prev_a
}

### 1. Randomly draw beta for genetic markers 

#freqfile is output by Binary_typeIerror_simu_0.sh 
freqfile <- "/home/leelabsg/media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES/WES_200k_freq_ldpruned.afreq" #output of the previous step
markerData <- fread(freqfile, header = TRUE, data.table = FALSE)

# Column layout used below: column 1 = chromosome, columns 2 and 4 are written
# next to the betas as the variant/allele columns of the score file.
# NOTE(review): plink2 .afreq files are expected to name the frequency column
# "ALT_FREQS"; `markerData$ALT_FREQ` only found it through `$` partial
# matching. The column is now resolved explicitly -- confirm against the header.
freq_col <- intersect(c("ALT_FREQ", "ALT_FREQS"), names(markerData))
if (length(freq_col) == 0) {
  stop("No ALT_FREQ/ALT_FREQS column found in ", freqfile, call. = FALSE)
}
alt_freq <- markerData[[freq_col[1]]]

# Chromosome codes are read as character when non-numeric codes (e.g. "X") are
# present; those become NA here and are dropped by which() below.
chrom <- markerData[, 1]
if (!is.numeric(chrom)) {
  chrom <- suppressWarnings(as.integer(chrom))
}

# Keep odd-numbered chromosomes with ALT allele frequency above 1%.
# NOTE(review): this thresholds the ALT frequency, not the minor allele
# frequency -- confirm that is intended.
markerData <- markerData[which(chrom %% 2 == 1 & alt_freq > 0.01), ]

# Number of markers to sample; the downstream plink2 --score call hard-codes
# the resulting "30000Markers" file name, so fail loudly rather than shrink.
n_markers <- 30000L
if (nrow(markerData) < n_markers) {
  stop("Only ", nrow(markerData), " markers pass the filters; ",
       n_markers, " are required", call. = FALSE)
}
markerData <- markerData[sample(seq_len(nrow(markerData)), n_markers), ]

# One column of betas per seed, each drawn from N(0, tau / number of markers)
# after resetting the RNG to that seed. Built in one pass instead of growing
# the table with cbind() inside a loop.
n_seeds <- 100L
beta_sd <- sqrt(opt$tau / nrow(markerData))
beta_mat <- vapply(
  seq_len(n_seeds),
  function(seed_num) {
    set.seed(seed_num)
    rnorm(n = nrow(markerData), mean = 0, sd = beta_sd)
  },
  numeric(nrow(markerData))
)
colnames(beta_mat) <- paste0("seed_", seq_len(n_seeds))

BETAG <- cbind(markerData[, c(2, 4)], beta_mat)
write.table(
  BETAG,
  paste0(outputPath, "tau_", opt$tau, "_randomBeta.100seeds.", nrow(markerData), "Markers.txt.oddChr"),
  quote = FALSE, row.names = FALSE, col.names = TRUE
)



## 3. Generate G*beta (per-sample score sums from the random betas)

In [None]:
%%bash

# Compute per-sample score sums for the 100 random-beta columns.
#   --score          : output from the beta generation
#                      (header-read: the header line of that file is read;
#                       cols=-scoreavgs,+scoresums: report sums, not averages)
#   --score-col-nums : columns 3-102, i.e. the seed_1 ... seed_100 betas
#   --read-freq      : output from the frequency generation
# Comments are kept above the command on purpose: a '#' line placed between
# backslash-continued lines ends the command there, and the remaining
# modifiers/options would then be executed as a separate (failing) command.
plink2 \
    --bfile /home/leelabsg/media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES/LD_pruned/prune_all \
    --score /home/leelabsg/media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES/tau_1_randomBeta.100seeds.30000Markers.txt.oddChr \
    cols=-scoreavgs,+scoresums \
    list-variants \
    header-read \
    --score-col-nums 3-102 \
    --read-freq /home/leelabsg/media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES/WES_200k_freq_ldpruned.afreq \
    --out /home/leelabsg/media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES/WES_200k_freq_ldpruned.G_tildesum

## 4. Generate the Null Y (Phenotype)

In [None]:
%%R


options(stringsAsFactors = FALSE)


## load R libraries
#library(SKAT)
#library(MetaSKAT)
library(optparse)
library(data.table)
## set list of cmd line arguments
option_list <- list(
#  make_option("--seedNum", type="integer",default=1,
#    help="seed number for simulation"),
  make_option("--tau", type = "numeric", default = 1,
    help = "tau for variance component"),
  # A numeric option needs a numeric default; NA_real_ marks "not supplied"
  # instead of the character "" used previously. The prevalence is required
  # by the phenotype simulation further down in this cell.
  make_option("--prev", type = "numeric", default = NA_real_,
    help = "prevalance")
)
## list of options
parser <- OptionParser(usage = "%prog [options]", option_list = option_list)
args <- parse_args(parser, positional_arguments = 0)
opt <- args$options
print(opt)

## output folder
outputPath <- "/home/leelabsg/media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES/"

# Difference between the mean logistic probability and a target prevalence.
#
# Args:
#   x:      value added to every element of eta1_a (the intercept candidate).
#   prev_a: target prevalence that the mean probability is compared against.
#   eta1_a: numeric vector of linear predictors without the intercept.
#
# Returns:
#   mean(logistic(eta1_a + x)) - prev_a, i.e. zero when x makes the mean
#   probability equal to prev_a (the form expected by a root finder).
GetMean <- function(x, prev_a, eta1_a) {
  # plogis(eta) equals exp(eta) / (1 + exp(eta)) but stays finite for large
  # eta, where exp(eta) overflows to Inf and the ratio would become NaN.
  mean(plogis(eta1_a + x)) - prev_a
}


# Simulate one null binary phenotype per seed and write it, merged with the
# first ten PCs, to "<outputPath>seed_<k>_..._prev_<prev>.pheno.oddChr_pca".

# Per-sample score sums, read from the .sscore file.
dataG <- fread("/home/leelabsg/media/leelabsg-storage0/eunjae/Meta-code/Inflation_eval/whole_WES/WES_200k_freq_ldpruned.G_tildesum.sscore", header = TRUE, data.table = FALSE)

# Fail fast on a missing or invalid prevalence: the intercept search below has
# no root unless 0 < prev < 1, and would otherwise stop with an obscure error.
prev <- suppressWarnings(as.numeric(opt$prev))
if (length(prev) != 1 || is.na(prev) || prev <= 0 || prev >= 1) {
  stop("--prev must be a single number strictly between 0 and 1", call. = FALSE)
}

## this file contains the PCs estimated for the UKBB samples that we usually use as covariates in the null model
# Read once, outside the seed loop: it does not depend on the seed.
pcafile <- "/home/leelabsg/media/leelabsg-storage0/DATA/UKBB/PC/PEDMASTER_ALL_20180514_v1_MAPPED.txt"
pcadata <- fread(pcafile, header = TRUE, data.table = FALSE)
pcadata <- pcadata[, c("FID", paste0("PC", 1:10))]

n_seeds <- 100L
for (seedNum in seq_len(n_seeds)) {
  print(seedNum)
  outfile <- paste0(outputPath, "seed_", seedNum, "_WES_200k_WB_tau_", opt$tau, "_prev_", opt$prev, ".pheno.oddChr")
  set.seed(seedNum)

  # Prefer selecting the score column by name so a different number of leading
  # columns in the .sscore file cannot silently shift the seeds.
  # NOTE(review): assumes plink2 names the sums "<header>_SUM" (e.g.
  # "seed_1_SUM") -- confirm against the .sscore header. Falls back to the
  # original fixed position (4 + seedNum) when no such column exists.
  score_name <- paste0("seed_", seedNum, "_SUM")
  if (score_name %in% names(dataG)) {
    GTilde <- as.vector(dataG[[score_name]])
  } else {
    GTilde <- as.vector(dataG[, 4 + seedNum])
  }
  N <- length(GTilde)
  print(N)

  # Covariates: one standard-normal and one Bernoulli(0.5) per sample.
  x1 <- rnorm(N)
  x2 <- rbinom(N, 1, 0.5)

  # Linear predictor without intercept; the intercept is then solved so that
  # the mean case probability equals the requested prevalence.
  eta1 <- x1 + x2 + GTilde
  re <- uniroot(f = GetMean, c(-100, 100), prev_a = prev, eta1_a = eta1)
  alpha0_new <- re$root
  eta <- alpha0_new + eta1
  PI <- plogis(eta)

  # Vectorized Bernoulli draws; random numbers are consumed element by element
  # in the same order as a per-element rbinom(1, 1, p) loop.
  y <- rbinom(n = N, size = 1, prob = PI)

  # IND_ID is the first column of the score file; it is matched to FID below.
  pheno <- data.frame(y = y, x1 = x1, x2 = x2, IND_ID = dataG[, 1])
  print(head(pheno))

  # Attach the PCs and keep only samples with no missing value in any column.
  data_a <- merge(pheno, pcadata, by.x = "IND_ID", by.y = "FID")
  data_a <- data_a[complete.cases(data_a), ]
  write.table(data_a, paste0(outfile, "_pca"), sep = " ", quote = FALSE, col.names = TRUE, row.names = FALSE)
}
