## Minimal working example

Create the script that runs the LD clumping workflow (`LD_Clumping.ipynb`).

In [88]:
%save MWE_ldclump.sh -f

# Inputs and settings for the LD clumping minimal working example.

# Working directory handed to the workflow via --cwd
output=output

# Genotype, sample and summary-statistics inputs
bfile=~/MWE0612/genotypes.bed
# The glob is expanded by echo, so this holds a space-separated list of matching .bgen files
bgenFile=$(echo ~/MWE0612/imputed_genotypes_chr2?.bgen)
sampleFile=~/MWE0612/imputed_genotypes.sample
unrelated_samples=~/MWE0612/unrelated_samples.txt
sumstatsFile=~/MWE0612/output/phenotypes_BMI.fastGWA.snp_stats.gz

# Clumping settings
ld_sample_size=100
clump_field=P
clump_p1=0.05
clump_p2=1
clump_r2=0.2
clump_kb=1000
clump_annotate=OR

# Execution settings
numThreads=20
job_size=1

# Dry-run the LD clumping workflow for the BMI trait.
# Expects output, bfile, bgenFile, sampleFile, sumstatsFile, unrelated_samples,
# the clump_* settings, ld_sample_size, numThreads and job_size to be set already.
# stdout and stderr of the whole invocation go to 062420-sos-LD-MWE.log.
#
# $bgenFile is deliberately left unquoted: it holds a space-separated list of
# .bgen files that must reach sos as separate arguments.
# Every option line except the last ends with a backslash; a missing one ends
# the command early and the shell tries to run the next line (e.g. `-q`) as a
# command of its own.
sos dryrun ~/UKBB_GWAS_DEV/workflow/LD_Clumping.ipynb \
    --cwd "$output" \
    --bfile "$bfile" \
    --bgenFile $bgenFile \
    --sampleFile "$sampleFile" \
    --sumstatsFile "$sumstatsFile" \
    --unrelated_samples "$unrelated_samples" \
    --ld_sample_size "$ld_sample_size" \
    --clump_field "$clump_field" \
    --clump_p1 "$clump_p1" \
    --clump_p2 "$clump_p2" \
    --clump_r2 "$clump_r2" \
    --clump_kb "$clump_kb" \
    --clump_annotate "$clump_annotate" \
    --numThreads "$numThreads" \
    --job_size "$job_size" \
    -q localhost \
    -s build &> 062420-sos-LD-MWE.log

## MWE for REGENIE

In [6]:
# Run the REGENIE workflow from the home directory; the relative data/ and
# output paths below are resolved from there.
cd

# Options for the `regenie` workflow, one per line.
# The chr*.bgen glob is expanded by the shell when the array is built.
regenie_opts=(
    --cwd output
    --bfile data/genotypes21_22.bed
    --sampleFile data/imputed_genotypes.sample
    --bgenFile data/imputed_genotypes_chr*.bgen
    --phenoFile data/phenotypes.txt
    --formatFile data/regenie_template.yml
    --phenoCol ASTHMA T2D
    --covarCol SEX
    --qCovarCol AGE
    --numThreads 8
    --bsize 1000
    --lowmem_prefix /home/dc2325/output
    --trait bt
    --minMAC 4
    --bgenMinMAF 0.05
    --bgenMinINFO 0.8
    --reverse_log_p
    --p_filter 1
    --container_option /SAY/dbgapstg/scratch/UKBiobank/lmm.sif
)

sos run ~/project/bioworkflows/GWAS/LMM.ipynb regenie "${regenie_opts[@]}"

INFO: Running [32mregenie_1[0m: Run REGENIE step 1: fitting the null
INFO: [32mregenie_1[0m is [32mcompleted[0m.
INFO: [32mregenie_1[0m output:   [32moutput/phenotypes_ASTHMA_T2D.regenie_pred.list[0m
INFO: Running [32mregenie_2[0m: Run REGENIE step 2: association analysis
INFO: [32mregenie_2[0m (index=0) is [32mcompleted[0m.
INFO: [32mregenie_2[0m (index=1) is [32mcompleted[0m.
INFO: [32mregenie_2[0m output:   [32moutput/imputed_genotypes_chr21_ASTHMA.regenie.gz output/imputed_genotypes_chr21_T2D.regenie.gz... (4 items in 2 groups)[0m
INFO: Running [32mregenie_3[0m: Merge results and log files
INFO: [32mregenie_3[0m (index=0) is [32mcompleted[0m.
INFO: [32mregenie_3[0m (index=1) is [32mcompleted[0m.
INFO: [32mregenie_3[0m output:   [32moutput/phenotypes_ASTHMA.regenie.snp_stats.gz output/phenotypes_ASTHMA.regenie.snp_counts.txt... (4 items in 2 groups)[0m
INFO: Running [32mregenie_4[0m: Manhattan and QQ plots using `qqman`
INFO: [32mregenie_4[0

In [31]:
# ---- Inputs shared across workflows ----
bfile=~/data/genotypes.bed
sampleFile=~/data/imputed_genotypes.sample
# No glob expansion happens in an assignment: the pattern is stored literally
bgenFile=~/data/imputed_genotypes_chr*.bgen
phenoFile=~/data/phenotypes.txt

# ---- Job-script generation ----
tpl_file=~/project/UKBB_GWAS_DEV/farnam.yml
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
# The sbatch file name is prefixed with today's date (YYYY-MM-DD)
lmm_sbatch_regenie=../output/$(date +%Y-%m-%d)_mwe_regenie.sbatch

# ---- LMM directories ----
lmm_dir_regenie=~/output
lowmem=~/output

# ---- LMM settings ----
phenoCol=BMI
covarCol=SEX
qCovarCol=AGE
covarMaxLevels=10
bgenMinMAF=0.05
bgenMinINFO=0.8
minMAC=4
bsize=1000
ylim=0
numThreads=8
lmm_job_size=1

In [32]:
# Argument string passed verbatim to Get_Job_Script through --args: the
# `regenie` workflow name followed by its options, one per line.
# NOTE(review): the final `--trait` carries no value, whereas the direct
# REGENIE run in this file passes `--trait bt` -- confirm the intended value.
lmm_args="regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --sampleFile $sampleFile
    --bgenFile $bgenFile 
    --phenoFile $phenoFile  
    --phenoCol $phenoCol 
    --covarCol $covarCol 
    --qCovarCol $qCovarCol
    --bgenMinMAF $bgenMinMAF
    --bgenMinINFO $bgenMinINFO
    --bsize $bsize
    --minMAC $minMAC
    --lowmem $lowmem
    --numThreads $numThreads  
    --job_size $lmm_job_size
    --ylim $ylim
    --trait
"

# Options for the `farnam` step of Get_Job_Script; --to-script names the
# sbatch file to write. Only the argument string is quoted, so it stays one word.
job_script_opts=(
    --template-file $tpl_file
    --workflow-file $lmm_sos
    --to-script $lmm_sbatch_regenie
    --args "$lmm_args"
)

sos run ~/project/bioworkflows/GWAS/Get_Job_Script.ipynb farnam "${job_script_opts[@]}"

INFO: Running [32mfarnam[0m: Configuration for Yale `farnam` cluster
INFO: [32mfarnam[0m is [32mcompleted[0m.
INFO: [32mfarnam[0m output:   [32m../output/2020-08-10_mwe_regenie.sbatch[0m
INFO: Workflow farnam (ID=280055fda8fd6626) is executed successfully with 1 completed step.


## MWE for BOLT-LMM

In [4]:
# Run the BOLT-LMM workflow from the home directory; the relative data/ and
# output paths below are resolved from there.
cd

# Options for the `boltlmm` workflow, one per line.
# The chr*.bgen glob and the ~/software paths are expanded by the shell when
# the array is built. --lmm-option is passed without a value, as a bare switch.
boltlmm_opts=(
    --cwd output
    --bfile data/genotypes.bed
    --sampleFile data/imputed_genotypes.sample
    --bgenFile data/imputed_genotypes_chr*.bgen
    --phenoFile data/phenotypes.txt
    --formatFile data/boltlmm_template.yml
    --LDscoresFile ~/software/BOLT-LMM_v2.3.4/tables/LDSCORE.1000G_EUR.tab.gz
    --geneticMapFile ~/software/BOLT-LMM_v2.3.4/tables/genetic_map_hg19_withX.txt.gz
    --phenoCol BMI
    --covarCol SEX
    --covarMaxLevels 10
    --qCovarCol AGE
    --numThreads 5
    --bgenMinMAF 0.001
    --bgenMinINFO 0.1
    --lmm-option
    --p_filter 1
    --container_option /SAY/dbgapstg/scratch/UKBiobank/lmm.sif
)

sos run ~/project/bioworkflows/GWAS/LMM.ipynb boltlmm "${boltlmm_opts[@]}"

INFO: Running [32mboltlmm_1[0m: Run BOLT analysis
INFO: [32mboltlmm_1[0m (index=0) is [32mignored[0m due to saved signature
INFO: [32mboltlmm_1[0m (index=1) is [32mignored[0m due to saved signature
INFO: [32mboltlmm_1[0m output:   [32moutput/cache/imputed_genotypes_chr21.phenotypes_BMI.boltlmm.snp_stats.gz output/cache/imputed_genotypes_chr22.phenotypes_BMI.boltlmm.snp_stats.gz in 2 groups[0m
INFO: Running [32mboltlmm_2[0m: Merge results and log files
INFO: [32mboltlmm_2[0m (index=0) is [32mignored[0m due to saved signature
INFO: [32mboltlmm_2[0m output:   [32moutput/phenotypes_BMI.boltlmm.snp_stats.gz output/phenotypes_BMI.boltlmm.snp_counts.txt[0m
INFO: Running [32mboltlmm_3[0m: Manhattan and QQ plots using `qqman`
INFO: [32mboltlmm_3[0m is [32mcompleted[0m.
INFO: [32mboltlmm_3[0m output:   [32moutput/phenotypes_BMI.boltlmm.manhattan.png output/phenotypes_BMI.boltlmm.qq.png... (5 items)[0m
INFO: Running [32mboltlmm_4[0m: Generate analysis report: H

## MWE for FastGWA

In [5]:
# Options for the `fastGWA` workflow, one per line.
# The chr*.bgen glob is expanded by the shell when the array is built.
# $JOB_OPT is left unquoted so that it splits into separate arguments (and
# vanishes when empty); it is not defined in this cell -- presumably set
# earlier in the session.
fastgwa_opts=(
    --cwd output
    --bfile data/genotypes.bed
    --sampleFile data/imputed_genotypes.sample
    --bgenFile data/imputed_genotypes_chr*.bgen
    --phenoFile data/phenotypes.txt
    --formatFile data/fastGWA_template.yml
    --phenoCol BMI
    --covarCol SEX
    --qCovarCol AGE
    --numThreads 1
    --bgenMinMAF 0.001
    --bgenMinINFO 0.1
    --parts 2
    --p_filter 1
    --container_option /SAY/dbgapstg/scratch/UKBiobank/lmm.sif
    $JOB_OPT
)

sos run ~/project/bioworkflows/GWAS/LMM.ipynb fastGWA "${fastgwa_opts[@]}"

INFO: Running [32mfastGWA_1[0m: fastGWA mixed model (based on the sparse GRM generated above)
INFO: [32mfastGWA_1[0m (index=0) is [32mignored[0m due to saved signature
INFO: [32mfastGWA_1[0m (index=1) is [32mignored[0m due to saved signature
INFO: [32mfastGWA_1[0m output:   [32moutput/cache/imputed_genotypes_chr21.phenotypes.fastGWA.gz output/cache/imputed_genotypes_chr22.phenotypes.fastGWA.gz in 2 groups[0m
INFO: Running [32mfastGWA_2[0m: Merge results and log files
INFO: [32mfastGWA_2[0m (index=0) is [32mignored[0m due to saved signature
INFO: [32mfastGWA_2[0m output:   [32moutput/phenotypes_BMI.fastGWA.snp_stats.gz output/phenotypes_BMI.fastGWA.snp_counts.txt[0m
INFO: Running [32mfastGWA_3[0m: Manhattan and QQ plots using `qqman`
INFO: [32mfastGWA_3[0m is [32mcompleted[0m.
INFO: [32mfastGWA_3[0m output:   [32moutput/phenotypes_BMI.fastGWA.manhattan.png output/phenotypes_BMI.fastGWA.qq.png... (5 items)[0m
INFO: Running [32mfastGWA_4[0m: Generate an