# Regenie

In [None]:
[global]
# Output directory for generated files
parameter: cwd = path
# Path to the sample file; passed to regenie as `--sample` when genoFile ends with .bgen
parameter: sampleFile = path('.')
# Genotype files in plink binary format; used as the `--bed` input when fitting the null model (step 1)
parameter: bfile = path
# Path to the bgen or bed genotype file used for association testing (step 2)
parameter: genoFile = str
# Phenotype file (read as a tab-delimited table in the fit step).
# NOTE(review): the default is an empty list, yet the steps format this value with `:bn`
# and hand it to pd.read_csv as a single path -- confirm the intended parameter type.
parameter: phenoFile = []
# Phenotype to be analyzed (specify the column)
parameter: phenoCol = list
# Covariate file path. Will use phenoFile if this is not an existing file
parameter: covarFile = path('.')
# Summary statistics format file path used for unifying output column names. Will not unify names if this is not an existing file
parameter: formatFile = path('.')
# Qualitative covariates to be used in the analysis
parameter: covarCol = []
# Quantitative covariates to be used in the analysis
parameter: qCovarCol = []
# Specific number of threads to use
parameter: numThreads = 2
# Minimum MAF to be used
parameter: bgenMinMAF = 0.001
# Minimum info score to be used
parameter: bgenMinINFO = 0.8
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# The container with the lmm software. Can be either a dockerhub image or a singularity `sif` file.
# Default is set to using dockerhub image
parameter: container_lmm = 'statisticalgenetics/lmm:2.4'
# The container used by the report step (plot_3) to run marp
parameter: container_marp = 'gaow/marp'
# Fall back to the phenotype file when no covariate file was supplied
if not covarFile.is_file():
    covarFile = phenoFile
# Rebuild cwd from its `:a` (absolute path) form so later f-strings see an absolute directory
cwd = path(f"{cwd:a}")

In [None]:
# Run REGENIE step 1: fitting the null
[fit]
# Size of the genotype blocks to be used
parameter: bsize = 400
# Path to temporarily store block predictions
parameter: lowmem_dir = cwd
# Specify that traits are binary with 0=control,1=case,NA=missing (default is quantitative)
parameter: trait = 'bt'
# Extract and prepare the phenotype & covariate files consumed by regenie
import pandas as pd
import numpy as np
# Phenotypes: keep FID, IID and the requested columns; missing values are written as the literal "NA"
dat = pd.read_csv(phenoFile, header=0, delimiter='\t', dtype=str)
dat = dat.replace(to_replace =np.nan, value ="NA")
if len(phenoCol) > 0:
    dat.to_csv(f"{cwd}/{phenoFile:bn}.regenie_phenotype", sep='\t', index=False, columns = ['FID', 'IID'] + phenoCol)
# Covariates: samples with a missing value in any requested covariate are dropped
dat = pd.read_csv(covarFile, header=0, delimiter='\t', dtype=str)
if len(covarCol) > 0 or len(qCovarCol) > 0:
    dat = dat.dropna(subset=covarCol)
    dat = dat.dropna(subset=qCovarCol)
    # replace() returns a new frame: the result has to be assigned, otherwise any remaining
    # missing value is written as an empty field into the space-delimited covariate file
    dat = dat.replace(to_replace =np.nan, value ="NA")
    dat1 = pd.DataFrame(dat, columns = ['FID','IID'] + covarCol)
    dat2 = pd.DataFrame(dat, columns = ['IID'] + qCovarCol)
    merged_left = pd.merge(left=dat1, right=dat2, how='left', left_on='IID', right_on='IID')
    merged_left.to_csv(f"{cwd}/{phenoFile:bn}.regenie_covar", sep=' ', index=False)
depends: f'{cwd}/cache/{bfile:bn}.qc_pass.snplist', f'{cwd}/cache/{bfile:bn}.qc_pass.id'
input: geno = bfile, pheno = f"{cwd}/{phenoFile:bn}.regenie_phenotype", covar = f"{cwd}/{phenoFile:bn}.regenie_covar", qc = output_from("regenie_qc")
# regenie is given the prefix `<output minus two extensions>.regenie` and is expected to write
# `<prefix>_pred.list`, which is the file declared here
output: f'{cwd}/{phenoFile:bn}_' + "_".join([x for x in phenoCol]) + '.regenie_pred.list'
task: trunk_workers = 1, trunk_size = job_size, walltime = '12h', mem = '15G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
bash: container=container_lmm, expand = "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', volumes = [f"{lowmem_dir:a}:{lowmem_dir:a}"]
    regenie \
      --step 1 \
      --bed ${_input["geno"]:n} \
      --phenoFile ${_input["pheno"]} \
      --covarFile ${_input["covar"]} \
      --keep ${_input["qc"][0]} \
      --extract ${_input["qc"][1]} \
      ${('--' + trait) if trait in ['bt'] else ''} \
      --bsize ${bsize} \
      --lowmem --lowmem-prefix ${lowmem_dir:a}/${_output:bn} \
      --threads ${numThreads} \
      --out ${_output:nn}.regenie

In [None]:
# Run REGENIE step 2: association analysis
[asso]
# Size of the genotype blocks to be used
parameter: bsize = 400
# Minimum allele count to be used
parameter: minMAC = int
# Specify that traits are binary with 0=control,1=case,NA=missing (default is quantitative)
parameter: trait = 'bt'
# Prediction list file from step 1 (the default is a shell glob under cwd)
parameter: pred = f'{cwd}/*.regenie_pred.list'
input: phenoFile, group_by = 1
# Genotype input: bgen plus sample file, or the plink prefix obtained by dropping the `.bed` suffix
input_options = f"--bgen {genoFile} --sample {sampleFile}" if genoFile.endswith('.bgen') else f"--bed " + genoFile.split('.bed')[0]
# One gzipped result file per phenotype, named the way regenie names `<out>_<phenotype>.regenie`
output: [f'{cwd}/cache/{_input:bn}_'+ str(phenoCol[i]) + '.regenie.gz' for i in range(len(phenoCol))]
task: trunk_workers = 1, trunk_size = 10, walltime = '0.5h', mem = '1G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
bash:container=container_lmm, expand = "${ }", stderr = f'{cwd}/cache/{_input:bn}.stderr', stdout = f'{cwd}/cache/{_input:bn}.stdout', volumes = [f"{cwd:a}:{cwd:a}"]
    set -e
    # The phenotype and covariate files are the ones the fit step writes under cwd,
    # which is also the only directory mounted into the container
    regenie \
     --step 2 \
     ${input_options} \
     --phenoFile ${cwd}/${_input:bn}.regenie_phenotype \
     --covarFile ${cwd}/${_input:bn}.regenie_covar \
     --phenoColList ${','.join(phenoCol)} \
     ${('--' + trait) if trait in ['bt'] else ''} \
     --firth 0.01 --approx \
     --pred ${pred} \
     --bsize ${bsize} \
     --minMAC ${minMAC} \
     --minINFO ${bgenMinINFO} \
     --threads ${numThreads} \
     --out ${cwd}/cache/${_input:bn} && \
     gzip -f --best ${_output:n}

In [None]:
# Run REGENIE step 2: association analysis with a fresh permutation of the sample IDs for each job
[asso_permutation]
# ID column file containing only the sample FID and IID columns (with header)
parameter: id_column = path
# Phenotype file containing only the phenotypes in phenoCol (with header). The sample order should be the same as in [id_column]
parameter: pheno_column = path
# Covariate file prefix; each file contains only the covariates (with header) and is named "<prefix>.<phenotype>". The sample order should be the same as in [pheno_column]
parameter: covar_column = str
# Phenotype to be analyzed (specify the column)
parameter: phenoCol = list
# Space-separated phenotype names, iterated over by the shell loops below
parameter: phenolist = ' '.join([pheno for pheno in phenoCol])
# The first and last index of the permutations to run (both inclusive)
parameter: idstart = int
parameter: idend = int
# Size of the genotype blocks to be used
parameter: bsize = 400
# Minimum allele count to be used
parameter: minMAC = int
# Specify that traits are binary with 0=control,1=case,NA=missing (default is quantitative)
parameter: trait = 'bt'
# pred list file prefix from regenie step 1 result; "_<phenotype>.regenie_pred.list" is appended for each phenotype
parameter: pred = str
parameter: indexnum = list(range(idstart,idend+1))
input: for_each = "indexnum"
input_options = f"--bgen {genoFile} --sample {sampleFile}" if genoFile.endswith('.bgen') else f"--bed " + genoFile.split('.bed')[0]
output: [f'{cwd}/cache/permutation.{_indexnum}_'+ str(phenoCol[i]) + '.regenie.gz' for i in range(len(phenoCol))]
task: trunk_workers = 1, trunk_size = 10, walltime = '0.5h', mem = '1G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
bash:container=container_lmm, expand = "${ }", stderr = f'{cwd}/cache/permutation.{_indexnum}.stderr', stdout = f'{cwd}/cache/permutation.{_indexnum}.stdout', volumes = [f"{cwd:a}:{cwd:a}"]
    # Stop at the first failing command, including while the permuted input files are built
    set -e
    # ID shifting: shuffle the FID/IID pairs (header dropped) and write them back under a new header
    id=`sed '1d' ${id_column} | awk '{print $1,$2}' OFS='\t' | shuf`
    echo -e "FID\tIID\n$id" > ${cwd}/cache/${_indexnum}.id
    # Combine the ID column with pheno_column and covar_column
    for pheno in ${phenolist}
    do
    paste ${cwd}/cache/${_indexnum}.id \
    ${pheno_column} \
    > ${cwd}/cache/$pheno.${_indexnum}.regenie_phenotype
    paste ${cwd}/cache/${_indexnum}.id \
    ${covar_column}.$pheno \
    > ${cwd}/cache/$pheno.${_indexnum}.regenie_covar
    done
    # Regenie by phenotypes
    for pheno in ${phenolist}
    do
    regenie \
     --step 2 \
     ${input_options} \
     --phenoFile ${cwd}/cache/$pheno.${_indexnum}.regenie_phenotype \
     --covarFile ${cwd}/cache/$pheno.${_indexnum}.regenie_covar \
     --phenoColList $pheno \
     ${('--' + trait) if trait in ['bt'] else ''} \
     --firth 0.01 --approx \
     --pred ${pred}_$pheno.regenie_pred.list \
     --bsize ${bsize} \
     --minMAC ${minMAC} \
     --minINFO ${bgenMinINFO} \
     --threads ${numThreads} \
     --out ${cwd}/cache/permutation.${_indexnum} && \
    gzip -f --best ${cwd}/cache/permutation.${_indexnum}_$pheno.regenie
    done
    # Delete the permuted phenotype files
    #rm -rf ${cwd}/cache/${_indexnum}.id ${cwd}/cache/*.${_indexnum}.regenie_phenotype ${cwd}/cache/*.${_indexnum}.regenie_covar ${cwd}/cache/permutation.${_indexnum}.stderr ${cwd}/cache/permutation.${_indexnum}.stdout

In [None]:
# Run regenie burden test step 2
[regenie_burden]
# Size of the genotype blocks to be used
parameter: bsize = 400
# Minimum allele count to be used
parameter: minMAC = int
# Specify that traits are binary with 0=control,1=case,NA=missing (default is quantitative)
parameter: trait = 'bt'
# Prediction list file from step 1 (the default is a shell glob under cwd)
parameter: pred = f'{cwd}/*.regenie_pred.list'
# Annotation file format: variantID, gene and functional annotation (space/tab delimited)
parameter: anno_file = path
# This file lists variants within each set/gene to use when building masks. Format: set/gene name, chromosome, physical pos set/gene, then by a comma-separated list of variants included in the set/gene.
parameter: set_list = path
# Select specific genes/sets to test; only used when it is an existing file
parameter: keep_gene = path(".")
# Allele frequency file. format: variantId, alternative allele frequency; only used when it is an existing file
parameter: aaf_file = path(".")
# Select the annotations to be used in the mask file. format: mask# annotation type
parameter: mask_file = path(".")
# Select the upper MAF to generate masks
parameter: aaf_bins =[0.01]
# The way in which the alternative alleles are counted
parameter: build_mask = 'max'
input: phenoFile, group_by = 1
input_options = f"--bgen {genoFile} --sample {sampleFile}" if genoFile.endswith('.bgen') else f"--bed " + genoFile.split('.bed')[0]
output: [f'{cwd}/cache/{_input:bn}_burden_'+ str(phenoCol[i]) + '.regenie.gz' for i in range(len(phenoCol))]
task: trunk_workers = 1, trunk_size = 10, walltime = '0.5h', mem = '5G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
python:container=container_lmm, expand = "${ }", stderr = f'{cwd}/cache/{_input:bn}.stderr', stdout = f'{cwd}/cache/{_input:bn}.stdout', volumes = [f"{cwd:a}:{cwd:a}"]
    import pandas as pd
    import numpy as np
    # Column selections, interpolated once by the workflow as Python list literals
    pheno_cols = ${phenoCol}
    covar_cols = ${covarCol}
    qcovar_cols = ${qCovarCol}

    # Inputs are the tables the fit step writes under cwd, which is also the only
    # directory mounted into the container.
    # Phenotypes: samples with a missing phenotype are kept and written as the literal "NA"
    dat = pd.read_csv("${cwd}/${_input:bn}.regenie_phenotype", header=0, delimiter='\t', dtype=str)
    dat = dat.replace(to_replace =np.nan, value ="NA")
    if len(pheno_cols) > 0:
        dat.to_csv(f"${cwd}/cache/${_input:nb}.regenie_phenotype", sep='\t', index=False, columns = ['FID', 'IID'] + pheno_cols)

    # Covariates: the fit step writes this table space-delimited, so split on any
    # whitespace instead of on tabs only
    dat = pd.read_csv("${cwd}/${_input:bn}.regenie_covar", header=0, sep=r'\s+', dtype=str)
    if len(covar_cols) > 0 or len(qcovar_cols) > 0:
        dat = dat.dropna(subset=covar_cols)
        dat = dat.dropna(subset=qcovar_cols)
        # replace() returns a new frame, so the result must be assigned to take effect
        dat = dat.replace(to_replace =np.nan, value ="NA")
        dat1 = pd.DataFrame(dat, columns = ['FID','IID'] + covar_cols)
        dat2 = pd.DataFrame(dat, columns = ['IID'] + qcovar_cols)
        merged_left = pd.merge(left=dat1, right=dat2, how='left', left_on='IID', right_on='IID')
        merged_left.to_csv(f"${cwd}/cache/${_input:nb}.regenie_covar", sep=' ', index=False)
bash:container=container_lmm, expand = "${ }", stderr = f'{cwd}/cache/{_input:bn}.stderr', stdout = f'{cwd}/cache/{_input:bn}.stdout', volumes = [f"{cwd:a}:{cwd:a}"]
    set -e
    # Burden test on the phenotype/covariate tables prepared in cwd/cache by the python action above.
    # The thread count is passed explicitly so regenie stays within the cores reserved by the task.
    regenie \
        --step 2 \
        ${input_options} \
        --phenoFile ${cwd}/cache/${_input:nb}.regenie_phenotype \
        --covarFile ${cwd}/cache/${_input:nb}.regenie_covar \
        --phenoColList ${','.join(phenoCol)} \
        ${('--' + trait) if trait in ['bt'] else ''} \
        ${("--extract-sets " + str(keep_gene)) if keep_gene.is_file() else ""} \
        --firth --approx \
        --pred ${pred} \
        --set-list ${set_list} \
        --anno-file ${anno_file} \
        --mask-def ${mask_file} \
        --aaf-bins ${",".join([str(x) for x in aaf_bins])} \
        ${('--build-mask ' + build_mask) if build_mask in ['max','sum','comphet'] else ''} \
        ${('--aaf-file ' + str(aaf_file)) if aaf_file.is_file() else ""} \
        --singleton-carrier \
        --minMAC ${minMAC} \
        --bsize ${bsize} \
        --threads ${numThreads} \
        --check-burden-files \
        --out  ${cwd}/cache/${_input:bn}_burden && \
        gzip -f --best ${_output:n}

In [None]:
# Run regenie burden test step 2 with a fresh permutation of the sample IDs for each job
[regenie_burden_permutation]
# ID column file containing only the sample FID and IID columns (with header)
parameter: id_column = path
# Phenotype file containing only the phenotypes in phenoCol (with header). The sample order should be the same as in [id_column]
parameter: pheno_column = path
# Covariate file prefix; each file contains only the covariates (with header) and is named "<prefix>.<phenotype>". The sample order should be the same as in [pheno_column]
parameter: covar_column = str
# Phenotype to be analyzed (specify the column)
parameter: phenoCol = list
# Space-separated phenotype names, iterated over by the shell loops of this step
parameter: phenolist = ' '.join([pheno for pheno in phenoCol])
# The first and last index of the permutations to run (both inclusive)
parameter: idstart = int
parameter: idend = int
parameter: indexnum = list(range(idstart,idend+1))
# Size of the genotype blocks to be used
parameter: bsize = 400
# Minimum allele count to be used
parameter: minMAC = int
# Specify that traits are binary with 0=control,1=case,NA=missing (default is quantitative)
parameter: trait = 'bt'
# Annotation file format: variantID, gene and functional annotation (space/tab delimited)
parameter: anno_file = path
# This file lists variants within each set/gene to use when building masks. Format: set/gene name, chromosome, physical pos set/gene, then by a comma-separated list of variants included in the set/gene.
parameter: set_list = path
# Select specific genes/sets to test; only used when it is an existing file
parameter: keep_gene = path(".")
# Allele frequency file. format: variantId, alternative allele frequency; only used when it is an existing file
parameter: aaf_file = path(".")
# Select the annotations to be used in the mask file. format: mask# annotation type
parameter: mask_file = path(".")
# Select the upper MAF to generate masks
parameter: aaf_bins =[0.01]
# The way in which the alternative alleles are counted
parameter: build_mask = 'max'
input: for_each = "indexnum"
input_options = f"--bgen {genoFile} --sample {sampleFile}" if genoFile.endswith('.bgen') else f"--bed " + genoFile.split('.bed')[0]
output: [f'{cwd}/cache/permutation.{_indexnum}_burden_'+ str(phenoCol[i]) + '.regenie.gz' for i in range(len(phenoCol))]
task: trunk_workers = 1, trunk_size = 2, walltime = '2h', mem = '5G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'
bash:container=container_lmm, expand = "${ }", stderr = f'{cwd}/cache/permutation.{_indexnum}.stderr', stdout = f'{cwd}/cache/permutation.{_indexnum}.stdout', volumes = [f"{cwd:a}:{cwd:a}"]
    # Stop at the first failing command, including while the permuted input files are built
    set -e
    # ID shifting: shuffle the FID/IID pairs (header dropped) and write them back under a new header
    id=`sed '1d' ${id_column} | awk '{print $1,$2}' OFS='\t' | shuf`
    echo -e "FID\tIID\n$id" > ${cwd}/cache/${_indexnum}.id
    # Combine the ID column with pheno_column and covar_column
    for pheno in ${phenolist}
    do
    paste ${cwd}/cache/${_indexnum}.id \
    ${pheno_column} \
    > ${cwd}/cache/$pheno.${_indexnum}.regenie_phenotype
    paste ${cwd}/cache/${_indexnum}.id \
    ${covar_column}.$pheno \
    > ${cwd}/cache/$pheno.${_indexnum}.regenie_covar
    done
    # Regenie by phenotypes
    for pheno in ${phenolist}
    do
    regenie \
        --step 2 \
        ${input_options} \
        --phenoFile ${cwd}/cache/$pheno.${_indexnum}.regenie_phenotype \
        --covarFile ${cwd}/cache/$pheno.${_indexnum}.regenie_covar \
        --phenoColList $pheno \
        ${('--' + trait) if trait in ['bt'] else ''} \
        ${("--extract-sets " + str(keep_gene)) if keep_gene.is_file() else ""} \
        --firth --approx \
        --pred ${cwd}/$pheno.regenie_pred.list \
        --set-list ${set_list} \
        --anno-file ${anno_file} \
        --mask-def ${mask_file} \
        --aaf-bins ${",".join([str(x) for x in aaf_bins])} \
        ${('--build-mask ' + build_mask) if build_mask in ['max','sum','comphet'] else ''} \
        ${('--aaf-file ' + str(aaf_file)) if aaf_file.is_file() else ""} \
        --singleton-carrier \
        --minMAC ${minMAC} \
        --bsize ${bsize} \
        --check-burden-files \
        --out  ${cwd}/cache/permutation.${_indexnum}_burden && \
    gzip -f --best ${cwd}/cache/permutation.${_indexnum}_burden_$pheno.regenie
    done
    # Delete the permuted phenotype files
    #rm -rf ${cwd}/cache/${_indexnum}.id ${cwd}/cache/*.${_indexnum}.regenie_phenotype ${cwd}/cache/*.${_indexnum}.regenie_covar ${cwd}/cache/permutation.${_indexnum}.stderr ${cwd}/cache/permutation.${_indexnum}.stdout

In [None]:
# Merge results and log files
[plot_1]
# Set to True when the P column holds -log10(p) values that should be converted back to p-values
parameter:reverse_log_p = False
depends: formatFile
input: [f'{cwd}/cache/{phenoFile:bn}_'+ str(phenoCol[i]) + '.regenie.gz' for i in range(len(phenoCol))], group_by=1
output: f'{cwd}/{_input:bn}.{step_name.rsplit("_",1)[0]}.snp_stats.gz',
        f'{cwd}/{_input:bn}.{step_name.rsplit("_",1)[0]}.snp_counts.txt'
task: trunk_workers = 1, trunk_size = 1, walltime = '0.5h', mem = '16G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'
python: container=container_lmm, expand ='${ }', stderr = f'{cwd}/cache/{_input:bn}.stderr', stdout = f'{cwd}/cache/{_input:bn}.stdout', volumes = [f"{cwd:a}:{cwd:a}"]
    import pandas as pd
    has_format_file = ${formatFile.is_file()}
    reverse_log_p = ${reverse_log_p}
    # When column names are going to be unified, the raw merge is kept next to the final
    # output under an "_original_columns" name; otherwise the raw merge is the final output
    if has_format_file:
        output = '${_output[0]:n}' + '_original_columns' + '${_output[0]:x}'
    else:
        output = '${_output[0]}'

    data = pd.concat([pd.read_csv(f, compression='gzip', header=0, sep=r'\s+', quotechar='"', comment='#') for f in [${_input:r,}]], ignore_index=True)
    data.to_csv(output, compression='gzip', sep='\t', header = True, index = False)
    # unify output format
    if has_format_file or reverse_log_p:
        sumstats = pd.read_csv(output, compression='gzip', header=0, sep=r'\s+', quotechar='"')
        # Column selection and renaming only make sense when a format file was loaded
        if has_format_file:
            import yaml
            with open(${formatFile:r}, 'r') as handle:
                config = yaml.safe_load(handle)
            try:
                sumstats = sumstats.loc[:,list(config.values())]
            except KeyError as e:
                raise ValueError(f'According to ${formatFile}, input summary statistics should have the following columns: {list(config.values())}.') from e
            sumstats.columns = list(config.keys())
        if reverse_log_p:
            if 'P' not in sumstats.columns:
                raise ValueError(f"reverse_log_p needs a 'P' column, but the summary statistics only have: {list(sumstats.columns)}.")
            sumstats['P'] = sumstats['P'].apply(lambda row: 10**-row)
        sumstats.to_csv(${_output[0]:r}, compression='gzip', sep='\t', header = True, index = False)

bash: container=container_lmm, expand="$( )"
    # count result SNPs; the first line of each file is the column header, so it is skipped
    for f in $(_input); do echo "$f: `zcat $f | tail -n +2 | wc -l`"; done > $(_output[1])
    # merge stderr and stdout files into one log, removing each file once it has been copied
    for f in $(_input); do 
        for ext in stderr stdout log; do
            echo "$f $ext:"
            cat ${f%.gz}.$ext 2>/dev/null || true
            rm -f ${f%.gz}.$ext 
        done
    done > $(_output[0]:n).log

In [None]:
# Manhattan and QQ plots using `qqman`
[plot_2]
# Column name for BP
parameter: bp = 'POS'
# Column name for p-value
parameter: pval = 'P'
# Column name for SNP
parameter: snp = 'SNP'
# Plot only on p-values smaller than this
parameter: p_filter = '0.05'
# Highlight SNPs with P-values lower than this
parameter: sigp = 5e-08
# ylim set to 0 to use maximum -log10(p) in data
parameter: ylim = 0
# Path to the annotation file if present
parameter: anno_file = path()
# The label to annotate in the manhattan plot options are (SNP, avsnp150, Gene)
parameter: label_annotate = str
# Option to add annotation data
parameter: annotate = False
# Decide whether to read the SNPs to highlight from snp_list instead of selecting them by sigp
parameter: top_snps = False
# Provide a list of SNPs to highlight
parameter: snp_list = path()
# Separator string written after each section of the markdown summary
sep = '\n\n---\n'
# Heritability is only looked up (from the log next to the first input) for steps whose
# name contains 'fastGWA'; for any other step name it stays None
if any(['fastGWA' in step_name]):
    heritability = get_output(f'grep Heritability {_input[0]:n}.log | head -1').strip()
else:
    heritability = None
depends: phenoFile
# Inputs are handled two at a time, each pair matched with one entry of phenoCol (used as _phenoCol below)
input: group_by = 2, group_with = 'phenoCol'
# All outputs share the prefix obtained by stripping two extensions from the first input
output: manhattan = f'{_input[0]:nn}.manhattan.png',
        qq = f'{_input[0]:nn}.qq.png',
        annotated_manhattan = f'{_input[0]:nn}.manhattan_annotated.png',
        analysis_summary = f'{_input[0]:nn}.analysis_summary.md',
        plot_data = f'{_input[0]:nn}.plot_data.rds',
        manhattan_pdf = f'{_input[0]:nn}.manhattan_annotated.pdf'
task: trunk_workers = 1, trunk_size = job_size, walltime = '3h', mem = '64G', tags = f'{step_name}_{_output[0]:bn}'    
# Start the markdown summary (output 3) with its front matter; later actions append to this file
bash: container=container_lmm, expand = "${ }"
    echo '''---
    theme: base-theme
    style: |
      img {
        height: 80%;
        display: block;
        margin-left: auto;
        margin-right: auto;
      }
    ---    
    ''' > ${_output[3]}
    
R: container=container_lmm, expand='${ }', stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    # some summary statistics for phenotype
    pheno = read.table(${phenoFile:r}, header=T, sep = "\t")$${_phenoCol}
    # Count distinct observed values only: a missing value must not make a two-class
    # trait look like it has three values and get summarised as quantitative
    if (length(unique(pheno[!is.na(pheno)]))>2) {
      out = capture.output(summary(pheno))
    } else {
      out = as.data.frame(table(pheno))
      # The control/case labels only apply when both classes are present
      if (nrow(out) == 2) rownames(out) = c('n_ctrl', 'n_case')
      out = out[,2,drop=F]
    }
    write('# ${_phenoCol} result summary\n## Phenotype summary:\n```', ${_output[3]:r}, append = T)
    write.table(out, ${_output[3]:r}, append = T)
    write('${(" Heritability is %s" % heritability) if heritability is not None else ''}', ${_output[3]:r}, append = T)
    write("```", ${_output[3]:r}, append = T)

R: container=container_lmm, expand='${ }', stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    library('qqman')
    data <- read.table(gzfile('${_input[0]}'), sep='\t', header=T)
    lambda <- median(qchisq(1-data$${pval},1), na.rm=TRUE)/qchisq(0.5,1)
    # Upper limit of the -log10(p) axis: a non-zero ylim parameter is used as given;
    # otherwise it comes from the smallest p-value, with the smallest positive double
    # as the floor when a p-value of exactly 0 is present
    min_p <- min(data$${pval}, na.rm=TRUE)
    if (${ylim} != 0) {
      ylim <- ${ylim}
    } else if (min_p != 0) {
      ylim <- abs(floor(log10(min_p)))
    } else {
      ylim <- abs(floor(log10(2.225074e-308)))
    }
    # Creating manhattan plot
    png('${_output[0]}', width = 6, height = 4, unit='in', res=300)
    manhattan_plot <- manhattan(data, chr='CHR', bp='${bp}', snp='${snp}', p='${pval}', main = 'Manhattan plot for ${_phenoCol} (${step_name.rsplit("_",1)[0]})', ylim = c(0, ylim), cex = 0.6, 
    cex.axis = 0.9, col = c("blue4", "orange3"), chrlabs = as.character(c(1:22)))
    dev.off()
    # Creating qqplot
    png('${_output[1]}', width = 5, height = 5, unit='in', res=300)
    qq_plot <- qq(data$${pval}, main = 'QQ Plot for ${_phenoCol} (${step_name.rsplit("_",1)[0]})', xlim = c(0, 8), ylim = c(0, ylim), pch = 18, col = "blue4", cex = 1.5, las = 1)
    dev.off()
    write('## p-value summary:', ${_output[3]:r}, append=T)
    write(paste("Genomic inflation factor is", round(lambda,3), "for", nrow(data), "variants analyzed.${sep}"), ${_output[3]:r}, append=T)
    
  
R: container=container_lmm, expand='${ }', stderr = f'{_output[2]:n}.stderr', stdout = f'{_output[2]:n}.stdout'
    library('dplyr')
    library('ggrepel')
    library('tidyr')
    library('ggplot2')
    library('svglite')
    # Load the merged summary statistics
    gwas <- read.table(gzfile('${_input[0]}'),sep='\t', header=T)
    # Select SNPs of interest: either the user-supplied list, or every variant below the
    # significance threshold (which is then also saved as a table)
    if(${"TRUE" if top_snps else "FALSE"}){
      snpsOfInterest <- read.table('${snp_list}', sep="\t", header=T)
      } else {
          snpsOfInterest <- gwas %>%
          filter(${pval}<${sigp})
          write.table(snpsOfInterest,'${_output[5]:nn}.top_snps.tsv', sep='\t', row.names=FALSE, quote=FALSE)
      }
    
    # Read in the annotation file if present
    if (${"TRUE" if annotate else "FALSE"}) {
         print("The annotation file has been provided")
         annot <- read.csv('${anno_file}')
         annot <- annot %>%
         select("alternate_id", "Gene.refGene", "avsnp150") %>%
         separate("Gene.refGene", "Gene") %>%
         rename(SNP = alternate_id)
     # Merge snps of interest with the annotation info
         snps_annot <- merge(snpsOfInterest,annot,by="SNP")
    } else {
          print("The annotation file has not been provided")
    }
  
    # Prepare the dataset
    dat <- gwas %>%  
    # Compute chromosome size
    group_by(CHR) %>% 
    summarise(chr_len=max(${bp})) %>%
    # Calculate cumulative position of each chromosome
    mutate(tot=cumsum(as.numeric(chr_len))- as.numeric(chr_len)) %>%
    select(-chr_len) %>%      
    # Add this info to the initial dataset
    left_join(gwas, ., by=c("CHR"="CHR")) %>%
    # Add a cumulative position of each SNP
    arrange(CHR, ${bp}) %>%
    mutate(BPcum=${bp}+tot) %>%  
    # Add highlight and annotation information
    mutate(is_highlight=ifelse(${snp} %in% snpsOfInterest$SNP, "yes", "no")) %>%
    mutate(is_annotate=ifelse(${pval}<${sigp}, "yes", "no")) %>%
    # Filter SNP to make the plot lighter: keep only p-values below the p_filter threshold,
    # read from the configured p-value column
    filter(${pval}<${p_filter})
    
    # Merge the gwas filtered data with the annotation
    if (${"TRUE" if annotate else "FALSE"}) {
          print("Merging the gwas results with the annotation file")
          dat_anno <- merge(x = dat, y = snps_annot[ , c("SNP", "Gene", "avsnp150")], by = "SNP", all.x=TRUE)
          # The variant with the smallest p-value in each gene is the one that gets labelled
          gene_anno <- dat_anno %>% 
              filter(!is.na(Gene)) %>%
              group_by(Gene) %>% 
              slice_min(order_by=${pval})
          dat_anno <- dat_anno %>% mutate(is_top = ifelse(SNP %in% gene_anno$SNP, "yes", "no"))
        } else {
          print("The gwas results are kept as they are because no annotation file is provided")
          dat_anno <- dat
        }
    
    # Create axis for the manhattan plot
    axisdf = dat %>% group_by(CHR) %>% summarize(center=( max(BPcum) + min(BPcum) ) / 2 )
  
    # Make the plot
    sig <- ${sigp}
    # Upper limit of the y axis: derived from the data when the ylim parameter is 0,
    # otherwise the value given by the user. It is always assigned so that the name
    # never falls through to the ggplot2 function of the same name.
    data_ylim <- abs(floor(log10(min(dat$${pval}, na.rm=TRUE)))) + 2
    if (${ylim} == 0) {
      ylim <- data_ylim
    } else {
      ylim <- ${ylim}
    }
    # Labels are only placed at -log10(p) of 8 or above
    y_limits <- c(8, NA)

    manhplot <- ggplot(dat_anno, aes(x=BPcum, y=-log10(${pval}))) +
        # Show all points
        geom_point( aes(color=as.factor(CHR)), alpha=0.8, size=1.3) +
        scale_color_manual(values = rep(c("black", "grey"), 22 )) +
        # custom X axis:
        scale_x_continuous(label = axisdf$CHR, breaks= axisdf$center) +
        scale_y_continuous(expand = c(0, 0), limits=c(NA, ylim)) +   # remove space between plot area and x axis
        # Add highlighted points
        geom_point(data=subset(dat_anno, is_highlight=="yes"), color="red", size=2) +
        # Add label using ggrepel to avoid overlapping, if annotate option is used label the top snps in each gene
        geom_label_repel( data=subset(dat_anno, ${'is_top=="yes"' if annotate else 'is_annotate=="yes"'}), aes(label=${label_annotate}), size=2,  max.overlaps = Inf, ylim = y_limits, box.padding = unit(0.35, "lines"), point.padding = unit(0.5, "lines")) +
        # Add significance level line
        geom_hline(yintercept = -log10(sig), color = "red1", linetype = "dashed") +
        #create X and Y axes labels
        labs(x = "Chromosome",
         y = "-log10(p)",
         title ='Manhattan plot for ${_phenoCol} (${step_name.rsplit("_",1)[0]})') +
        # Custom the theme:
        theme_classic() +
        theme( 
          legend.position="none",
          panel.border = element_blank(),
          panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank(),
          axis.text.x = element_text(angle = 90, size = 8, vjust = 0.5))
  
    # To save a plot created with ggplot2 you have to use to print() function

    png('${_output[2]}', width = 6, height = 4, unit='in', res=300)
    print(manhplot)
    dev.off()
    ggsave(${_output[5]:r}, plot= manhplot, device = "pdf", width = 6, height = 4, dpi=300, unit='in')
    # save the plotted data to a file for further evaluations
    tmp = dat[,c('CHR', '${bp}', 'BPcum', '${snp}', '${pval}')]
    colnames(tmp) = c('CHR', 'POS', 'POScum', 'SNP', 'pvalue')
    saveRDS(list(data = tmp, 
                 ylim = data_ylim,
                 axisdf = axisdf), ${_output[4]:r})

bash: container=container_lmm, expand = True
  set -e
  # Append one section per figure to the markdown summary. Only the PNG files written by the
  # actions above are embedded; no action of this step writes an SVG.
  echo -e "# QQ plot for {_phenoCol}\n" >> {_output[3]}
  echo -e "![]({_output[1]:bn}.png){sep}" >> {_output[3]}
  echo -e "# Manhattan plot for {_phenoCol}\n" >> {_output[3]}
  echo -e "![]({_output[0]:bn}.png){sep}" >> {_output[3]}
  echo -e "# Manhattan plot for {_phenoCol}\n" >> {_output[3]}
  echo -e "![]({_output[2]:bn}.png){sep}" >> {_output[3]}
  # List every file sharing the input prefix, leaving out the captured stderr/stdout files
  echo -e "# Result files\n\`\`\`" >> {_output[3]}
  ls {_input[0]:nn}.* | grep -vP 'stderr|stdout'>> {_output[3]}
  echo -e "\`\`\`" >> {_output[3]}

In [None]:
# Generate analysis report: an HTML file plus a PPTX file rendered from the same markdown summary
[plot_3]
input: group_by = 6, group_with='phenoCol'
output: f"{_input['analysis_summary']:n}.html"
sh: container=container_marp, expand = True, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    # Same marp invocation for both targets: first the declared HTML output, then a PPTX next to it
    for target in {_output:a} {_output:an}.pptx
    do
        node /opt/marp/.cli/marp-cli.js {_input['analysis_summary']} -o "$target" \
            --title '{_phenoCol} {step_name.rsplit("_",1)[0]} analysis' \
            --allow-local-files
    done