# Regenie Interaction

This pipeline implements the regenie interaction test using a custom build of regenie. The build is based on the upstream source code from [https://github.com/rgcgithub/regenie](https://github.com/rgcgithub/regenie) (2021-11-24), which already contained the code and the command-line option for the interaction test.

The regenie program can be found in `/mnt/mfs/statgen/guangyou/containers/regenie.v.2.3`

## Method

### The interaction model

$$
y = \beta_{0}+\beta_{1} snp_{1}+\beta_{2} snp_{2}+\beta_{12} snp_{1}snp_{2}+\beta_{c} c+g+\varepsilon,\quad g\sim MVN(0,\sigma^{2}_{a}K),\quad \varepsilon\sim MVN(0,\sigma^{2}_{e}I_{N})
$$

where $K=\frac{G_{S}G_{S}^{T}}{M}$ is the genetic relatedness matrix, $G_{S}$ is the matrix of standardized genotypes, and $M$ is the number of variants used to build it.

Interaction between covariates and genotypes is also supported. Only one phenotype is supported per run.

In [1]:
sos run Regenie_interaction.ipynb -h

usage: sos run Regenie_interaction.ipynb
               [workflow_name | -t targets] [options] [workflow_options]
  workflow_name:        Single or combined workflows defined in this script
  targets:              One or more targets to generate
  options:              Single-hyphen sos parameters (see "sos run -h" for details)
  workflow_options:     Double-hyphen workflow-specific parameters

Workflows:
  regenie_qc
  regenie

Global Workflow Options:
  --cwd VAL (as path, required)
                        the output directory for generated files
  --sampleFile . (as path)
                        Path to sample file
  --bfile VAL (as path, required)
                        Genotype files in plink binary this is used for
                        computing the GRM
  --genoFile  paths('.')

                        Path to bgen or bed files
  --phenoFile VAL (as path, required)
                        Phenotype file for quantitative trait (BMI)
  --phenoCol VAL VAL ... (as type, required)

In [None]:
[global]
# the output directory for generated files
parameter: cwd = path
# Path to sample file
parameter: sampleFile = path('.')
# Genotype files in plink binary format; this is used for computing the GRM
parameter: bfile = path
# Phenotype file for quantitative trait (BMI)
parameter: phenoFile = path
# Phenotype to be analyzed (specify the column)
parameter: phenoCol = list
# Covariate file path. Will use phenoFile if empty
parameter: covarFile = path('.')
# Summary statistics format file path used for unifying output column names. Will not unify names if empty
parameter: formatFile = path('.')
# Qualitative covariates to be used in the analysis
parameter: covarCol = []
# Quantitative covariates to be used in the analysis
parameter: qCovarCol = []
# Path to bgen or bed files
parameter: genoFile = list
# Interacting environment covariates to be used in the analysis (one interaction run per covariate)
parameter: covariates = list
# Path to regenie v2.3
parameter: regenieFile = path
# Specific number of threads to use
parameter: numThreads = 2
# Minimum MAF to be used
parameter: bgenMinMAF = 0.001
# Minimum info score to be used
parameter: bgenMinINFO = 0.8
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# For cluster jobs, memory to request per job
parameter: mem = '2G'
# The container with the lmm software. Can be either a dockerhub image or a singularity `sif` file.
# Default is set to using dockerhub image
parameter: container_lmm = 'statisticalgenetics/lmm:2.4'
# Second container image. NOTE(review): not referenced by any step visible in this section; confirm where it is used
parameter: container_marp = 'gaow/marp'
# Fall back to the phenotype file when no separate covariate file was supplied
if not covarFile.is_file():
    covarFile = phenoFile
# Resolve the output directory to an absolute path
cwd = path(f"{cwd:a}")

In [None]:
# Select the SNPs and samples to be used based on maf, geno, hwe and mind options
[regenie_qc]
# Threshold passed to plink2 as --maf; the flag is omitted when the value is not > 0
parameter: maf_filter = 0.0
# Threshold passed to plink2 as --geno; the flag is omitted when the value is not > 0
parameter: geno_filter = 0.0
# Threshold passed to plink2 as --hwe; the flag is omitted when the value is not > 0
parameter: hwe_filter = 0.0
# Threshold passed to plink2 as --mind; the flag is omitted when the value is not > 0
parameter: mind_filter = 0.0
input: bfile
# Two outputs under <cwd>/cache: the retained sample IDs (.id) and the retained variant IDs (.snplist)
output: f'{cwd}/cache/{bfile:bn}.qc_pass.id', f'{cwd}/cache/{bfile:bn}.qc_pass.snplist' 
task: trunk_workers = 1, walltime = '10h', mem = '10G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
bash: container=container_lmm, expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout' 
    plink2 \
      --bfile ${bfile:n} --mac 1 \
      ${('--maf %s' % maf_filter) if maf_filter > 0 else ''} ${('--geno %s' % geno_filter) if geno_filter > 0 else ''} ${('--hwe %s' % hwe_filter) if hwe_filter > 0 else ''} ${('--mind %s' % mind_filter) if mind_filter > 0 else ''} \
      --write-snplist --write-samples --no-id-header \
      --threads ${numThreads} \
      --out ${_output[0]:n} 

In [None]:
# extract and prepare phenotype & covariate files
[regenie_1]
import pandas as pd
import numpy as np
# Phenotype table: FID, IID and the requested phenotype columns, with missing values written as "NA".
# It is written once here, in the step preamble, rather than by every per-covariate substep below.
dat = pd.read_csv(phenoFile, header=0, sep=r'\s+', dtype=str)
dat = dat.replace(to_replace=np.nan, value="NA")
if len(phenoCol) > 0:
    dat.to_csv(f"{cwd}/{phenoFile:bn}.regenie_phenotype", sep='\t', index=False, columns = ['FID', 'IID'] + phenoCol)
# Base covariate table (no interaction term). It is only written when at least one covariate was requested.
dat = pd.read_csv(covarFile, header=0, sep=r'\s+', dtype=str)
if len(covarCol) > 0 or len(qCovarCol) > 0:
    # Keep only samples with complete covariate information
    dat = dat.dropna(subset=covarCol + qCovarCol)
    dat = dat.replace(to_replace=np.nan, value="NA")
    dat1 = pd.DataFrame(dat, columns = ['FID','IID'] + covarCol)
    dat2 = pd.DataFrame(dat, columns = ['IID'] + qCovarCol)
    merged_left = pd.merge(left=dat1, right=dat2, how='left', left_on='IID', right_on='IID')
    merged_left.to_csv(f"{cwd}/{phenoFile:bn}.regenie_covar", sep=' ', index=False)
# One covariate file per interaction covariate: the base covariates plus that covariate
input: for_each = dict(cov=covariates)
output: f"{cwd}/{phenoFile:bn}.{cov}.regenie_covar"
task: trunk_workers = 1, trunk_size = 2, walltime = '24h', mem = '2G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
python: container=container_lmm, expand = "${ }" #, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    import pandas as pd

    # Workflow values are injected as Python literals; the covariate name must be
    # quoted (repr) or it would expand to a bare, undefined identifier.
    covar_cols = ${covarCol!r}
    qcovar_cols = ${qCovarCol!r}
    interaction_cov = ${cov!r}
    # Do not list the interaction covariate twice if it is already an adjustment covariate
    extra_cols = [] if interaction_cov in covar_cols + qcovar_cols else [interaction_cov]

    # Same whitespace-delimited parsing as the step preamble
    dat = pd.read_csv("${covarFile}", header=0, sep=r'\s+', dtype=str)
    # Drop samples missing any adjustment covariate or the interaction covariate
    dat = dat.dropna(subset=covar_cols + qcovar_cols + extra_cols)
    dat1 = pd.DataFrame(dat, columns=['FID', 'IID'] + covar_cols + extra_cols)
    dat2 = pd.DataFrame(dat, columns=['IID'] + qcovar_cols)
    merged_left = pd.merge(left=dat1, right=dat2, how='left', left_on='IID', right_on='IID')
    # Always write the declared output, even when no adjustment covariates were given
    merged_left.to_csv("${_output}", sep=' ', index=False, na_rep="NA")

In [None]:
# Run REGENIE step 1: fitting the null
[regenie_2]
# Size of the genotype blocks to be used
parameter: bsize = 400
# Path to temporarily store block predictions
parameter: lowmem_dir = cwd
# Trait type. 'bt' (the default here) adds --bt for binary traits coded 0=control,1=case,NA=missing;
# any other value omits the flag, leaving regenie on its quantitative-trait default
parameter: trait = 'bt'
# QC lists produced by regenie_qc must exist before this step runs
depends: f'{cwd}/cache/{bfile:bn}.qc_pass.snplist', f'{cwd}/cache/{bfile:bn}.qc_pass.id'
# Named inputs: genotypes, the phenotype/covariate tables written by regenie_1, and the QC lists (qc[0] = samples, qc[1] = variants)
input: geno = bfile, pheno = f"{cwd}/{phenoFile:bn}.regenie_phenotype", covar = f"{cwd}/{phenoFile:bn}.regenie_covar", qc = output_from("regenie_qc")
# List of per-phenotype prediction files; it is passed to step 2 via --pred
output: f'{cwd}/{phenoFile:bn}_' + "_".join([x for x in phenoCol]) + f'.regenie_pred.list'
task: trunk_workers = 1, trunk_size = 2, walltime = '24h', mem = '5G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container_lmm, expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', volumes = [f"{lowmem_dir:a}:{lowmem_dir:a}"]
    ${regenieFile} \
      --step 1 \
      --bed ${_input["geno"]:n} \
      --phenoFile ${_input["pheno"]} \
      --covarFile ${_input["covar"]} \
      --keep ${_input["qc"][0]} \
      --extract ${_input["qc"][1]} \
      ${('--' + trait) if trait in ['bt'] else ''} \
      --bsize ${bsize} \
      --lowmem --lowmem-prefix ${lowmem_dir:a}/${_output:bn} \
      --threads ${numThreads} \
      --out ${_output:nn}.regenie

In [1]:
# Run REGENIE step 2: association analysis
[regenie_3]
# Size of the genotype blocks to be used
parameter: bsize = 400
# Minimum allele count to be used
parameter: minMAC = int
# Trait type. 'bt' adds --bt (binary trait); any other value omits the flag
parameter: trait = 'bt'
# Number of workers used to process each trunk of cluster jobs
parameter: trunk_workers = 1
# For cluster jobs, number of commands to run per job
parameter: job_size = 5
# For cluster jobs, memory to request per job
parameter: mem = '3G'
# For cluster jobs, wall time to request per job
parameter: walltime = '2h'
# in the case of bgen data from UKBB ref_first should be set to true
parameter: ref_first= False
# One substep per (genotype file, interaction covariate) combination
input: for_each = [dict(geno = genoFile), dict(cov = covariates)]
# sampleFile defaults to '.', so --sample is only passed when a real sample file was supplied
sample_option = f" --sample {sampleFile}" if sampleFile.is_file() else ""
input_options = f"--bgen {geno}{sample_option}" if geno.endswith('.bgen') else "--bed " + geno.split('.bed')[0]
# Prediction list written by regenie step 1 (regenie_2)
info = f'{cwd}/{phenoFile:bn}_' + "_".join(phenoCol) + '.regenie_pred.list'
output: f'{cwd}/cache/' + geno.split('/')[-1].split('.bgen')[0] + f'.{cov}_' + "_".join(phenoCol) + ".regenie.gz"
# regenie appends "_<phenotype>.regenie" to this prefix, which yields the uncompressed form of the output above
outputprefix = f'{cwd}/cache/' + geno.split('/')[-1].split('.bgen')[0] + f'.{cov}'
task: trunk_workers = trunk_workers, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
# Logs are named after each substep's own output so that substeps do not overwrite one shared log file
bash: container=container_lmm, expand = "${ }", stderr = f'{_output:nn}.stderr', stdout = f'{_output:nn}.stdout', volumes = [f"{cwd:a}:{cwd:a}"]
    set -e
    ${regenieFile} \
     --step 2 \
     ${input_options} \
     --phenoFile ${cwd}/${phenoFile:bn}.regenie_phenotype \
     --covarFile ${cwd}/${phenoFile:bn}.${cov}.regenie_covar \
     --phenoColList ${','.join(phenoCol)} \
     ${('--' + trait) if trait in ['bt'] else ''} \
     ${('--ref-first') if ref_first else ''} \
     --interaction ${cov} \
     --firth 0.01 --approx \
     --pred ${info} \
     --bsize ${bsize} \
     --minMAC ${minMAC} \
     --minINFO ${bgenMinINFO} \
     --threads ${numThreads} \
     --out ${outputprefix} && \
     gzip -f --best ${_output:n}