# Analysis of Regenie Burden Test for hearing impairment traits

# Hearing impairment 091321 new

## Create annotation file with annovar for 200K exomes

In [3]:
# Write an sbatch script that runs the `annovar` workflow on the merged
# chr1-chr22 exome .bim file.

# Working directory of the annotation run and the .bim file inside it
annovar_dir=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge
bfiles=${annovar_dir}/ukb23155_chr1_chr22_091321.bim

# Workflow notebook, cluster template and the date-stamped script to generate
annovar_sos=~/working/bioworkflows/variant-annotation/annovar.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
annovar_sbatch=~/scripts/ukb23155_200kexome_annotation_$(date +%Y-%m-%d).sbatch

# Resources requested for the job
walltime=60h
mem=30G

# Arguments handed verbatim to the workflow through --args
annovar_args="annovar
    --cwd ${annovar_dir} 
    --bim_name ${bfiles} 
    --humandb /mnt/mfs/statgen/isabelle/REF/humandb  
    --xref_path /mnt/mfs/statgen/isabelle/REF/humandb 
    --job_size 1 
    --build 'hg38' 
    --name_prefix ukb23155_chr1_chr22_qcexomedata_091321
    --walltime ${walltime}
    --mem ${mem}
    --container_annovar /mnt/mfs/statgen/containers/gatk4-annovar.sif
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file ${tpl_file} \
    --workflow-file ${annovar_sos} \
    --to-script ${annovar_sbatch} \
    --args "${annovar_args}"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/ukb23155_200kexome_annotation_2021-09-13.sbatch
INFO: Workflow csg (ID=wdd488ffa49fe7402) is executed successfully with 1 completed step.



## Create the anno_file, set_list_file and mask_files necessary for burden test

In [3]:
# Write an sbatch script that runs the `burden_files` workflow step, using the
# annovar multianno CSV and the merged .bim file as inputs.

# Inputs: annotated variants and the matching .bim file
annotated_file_hg38=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv
bim_name=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.bim

# Outputs: working directory, file prefix and the date-stamped job script
burden_dir=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files
name_prefix=ukb23155_chr1_chr22_qcexomedata_091321_burden_files
anno_sbatch_burden=~/scripts/ukb23155_200Kexomes_burdenfiles_$(date +%Y-%m-%d).sbatch

# Workflow notebook, cluster template, container and job size
anno_sos=~/working/bioworkflows/variant-annotation/annovar.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
container_annovar=${HOME}/containers/gatk4-annovar.sif
job_size=1

# Arguments handed verbatim to the workflow through --args
anno_args="burden_files
    --cwd ${burden_dir}
    --annotated_file ${annotated_file_hg38}
    --bim_name ${bim_name}
    --name_prefix ${name_prefix}
    --job_size ${job_size}
    --container_annovar ${container_annovar}
    --container_lmm /mnt/mfs/statgen/containers/lmm.sif
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file ${tpl_file} \
    --workflow-file ${anno_sos} \
    --to-script ${anno_sbatch_burden} \
    --args "${anno_args}"


INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/ukb23155_200Kexomes_burdenfiles_2021-09-14.sbatch
INFO: Workflow csg (ID=wac133bb8c5b16846) is executed successfully with 1 completed step.



## Regenie Burden with 200K exomes

### 091321_f3393_hearing_aid

In [1]:
# Write an sbatch script that runs the `regenie_burden` workflow for phenotype
# column f3393, with results under results_burden_exome/091321_f3393_hearing_aid.

# Output directory and date-stamped job script
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_f3393_hearing_aid
lmm_sbatch_regenie=~/scripts/f3393_hearing_aid_200k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariate tables (both point at the same file)
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2.tsv
# NOTE(review): covarFile is assigned but never forwarded in lmm_args below;
# confirm whether the workflow expects a --covarFile argument.
covarFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2.tsv
phenoCol=f3393
covarCol=sex
qCovarCol='age PC1 PC2'

# This run uses the un-QC'ed plink files while we wait for the QC'ed ones.
# Brace expansion yields a space-separated list of the 22 per-chromosome .bed files.
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs produced by the burden_files step
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
# Left empty on purpose; the flag is only emitted when a value is supplied (see below)
keep_gene=
build_mask=max
aaf_bins='0.005 0.01'

# Workflow notebook, cluster template and containers
tpl_file=~/working/bioworkflows/admin/csg.yml
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
container_marp=/home/gl2776/containers/marp.sif
container_lmm=/home/gl2776/containers/lmm.sif

# Remaining workflow settings
lmm_job_size=1
ylim=0
reverse_log_p=True
numThreads=20
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
bsize=1000
trait=bt
minMAC=1
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# sampleFile is not assigned in this cell and keep_gene is empty, so an unguarded
# "--sampleFile $sampleFile" / "--keep_gene $keep_gene" would expand to a flag
# with no value. ${var:+...} emits the flag only when the variable is non-empty.
# NOTE(review): --label_annotate is passed without a value and appears only in
# this cell, not in the other regenie_burden cells — confirm it is intended.
lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie
    --bfile $bfile
    --genoFile $genoFile ${sampleFile:+--sampleFile $sampleFile}
    --phenoFile $phenoFile
    --formatFile $formatFile_regenie
    --phenoCol $phenoCol
    --covarCol $covarCol
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file ${keep_gene:+--keep_gene $keep_gene}
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --label_annotate
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

  msg['msg_id'] = self._parent_header['header']['msg_id']


ERROR: Failed to locate /home/dmc2245/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb.sos



### 090321_f2247_hearing_difficulty

In [16]:
# Write an sbatch script that runs the `regenie_burden` workflow for phenotype
# column f2247, with results under results_burden_exome/090321_f2247_hearing_difficulty.

# Output directory and date-stamped job script
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/090321_f2247_hearing_difficulty
lmm_sbatch_regenie=~/scripts/f2247_hearing_difficulty_200k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariate tables (both point at the same file)
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl_PC1_2.tsv
# NOTE(review): covarFile is assigned but never forwarded in lmm_args below;
# confirm whether the workflow expects a --covarFile argument.
covarFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl_PC1_2.tsv
phenoCol=f2247
covarCol=sex
qCovarCol='age PC1 PC2'

# This run uses the un-QC'ed plink files while we wait for the QC'ed ones.
# Brace expansion yields a space-separated list of the 22 per-chromosome .bed files.
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs produced by the burden_files step
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
# Left empty on purpose; the flag is only emitted when a value is supplied (see below)
keep_gene=
build_mask=max
aaf_bins='0.005 0.01'

# Workflow notebook, cluster template and containers
tpl_file=~/working/bioworkflows/admin/csg.yml
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
container_marp=/home/gl2776/containers/marp.sif
container_lmm=/home/gl2776/containers/lmm.sif

# Remaining workflow settings
lmm_job_size=1
ylim=0
reverse_log_p=True
numThreads=20
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
bsize=1000
trait=bt
minMAC=1
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# sampleFile is not assigned in this cell and keep_gene is empty, so an unguarded
# "--sampleFile $sampleFile" / "--keep_gene $keep_gene" would expand to a flag
# with no value. ${var:+...} emits the flag only when the variable is non-empty.
lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie
    --bfile $bfile
    --genoFile $genoFile ${sampleFile:+--sampleFile $sampleFile}
    --phenoFile $phenoFile
    --formatFile $formatFile_regenie
    --phenoCol $phenoCol
    --covarCol $covarCol
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file ${keep_gene:+--keep_gene $keep_gene}
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/f2247_hearing_difficulty_200k_exomes-regenie-burden_2021-09-15.sbatch
INFO: Workflow csg (ID=w90db456c6ca63366) is executed successfully with 1 completed step.



### 090321_f2257_hearing_noise

In [17]:
# Write an sbatch script that runs the `regenie_burden` workflow for phenotype
# column f2257, with results under results_burden_exome/090321_f2257_hearing_noise.

# Output directory and date-stamped job script
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/090321_f2257_hearing_noise
lmm_sbatch_regenie=~/scripts/f2257_hearing_noise_200k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariate tables (both point at the same file)
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_noise_f2257_expandedwhite_65660cases_96601ctrl_PC1_2.tsv
# NOTE(review): covarFile is assigned but never forwarded in lmm_args below;
# confirm whether the workflow expects a --covarFile argument.
covarFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_noise_f2257_expandedwhite_65660cases_96601ctrl_PC1_2.tsv
phenoCol=f2257
covarCol=sex
qCovarCol='age PC1 PC2'

# This run uses the un-QC'ed plink files while we wait for the QC'ed ones.
# Brace expansion yields a space-separated list of the 22 per-chromosome .bed files.
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs produced by the burden_files step
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
# Left empty on purpose; the flag is only emitted when a value is supplied (see below)
keep_gene=
build_mask=max
aaf_bins='0.005 0.01'

# Workflow notebook, cluster template and containers
tpl_file=~/working/bioworkflows/admin/csg.yml
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
container_marp=/home/gl2776/containers/marp.sif
container_lmm=/home/gl2776/containers/lmm.sif

# Remaining workflow settings
lmm_job_size=1
ylim=0
reverse_log_p=True
numThreads=20
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
bsize=1000
trait=bt
minMAC=1
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# sampleFile is not assigned in this cell and keep_gene is empty, so an unguarded
# "--sampleFile $sampleFile" / "--keep_gene $keep_gene" would expand to a flag
# with no value. ${var:+...} emits the flag only when the variable is non-empty.
lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie
    --bfile $bfile
    --genoFile $genoFile ${sampleFile:+--sampleFile $sampleFile}
    --phenoFile $phenoFile
    --formatFile $formatFile_regenie
    --phenoCol $phenoCol
    --covarCol $covarCol
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file ${keep_gene:+--keep_gene $keep_gene}
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/f2257_hearing_noise_200k_exomes-regenie-burden_2021-09-15.sbatch
INFO: Workflow csg (ID=w78800ab614faa54c) is executed successfully with 1 completed step.



### 090321_Combined_f2247_f2257

In [18]:
# Write an sbatch script that runs the `regenie_burden` workflow for phenotype
# column f2247_f2257, with results under results_burden_exome/090321_Combined_f2247_f2257.

# Output directory and date-stamped job script
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/090321_Combined_f2247_f2257
lmm_sbatch_regenie=~/scripts/Combined_f2247_f2257_200k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariate tables (both point at the same file)
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_96601ctrl_PC1_2.tsv
# NOTE(review): covarFile is assigned but never forwarded in lmm_args below;
# confirm whether the workflow expects a --covarFile argument.
covarFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_96601ctrl_PC1_2.tsv
phenoCol=f2247_f2257
covarCol=sex
qCovarCol='age PC1 PC2'

# This run uses the un-QC'ed plink files while we wait for the QC'ed ones.
# Brace expansion yields a space-separated list of the 22 per-chromosome .bed files.
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs produced by the burden_files step
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
# Left empty on purpose; the flag is only emitted when a value is supplied (see below)
keep_gene=
build_mask=max
aaf_bins='0.005 0.01'

# Workflow notebook, cluster template and containers
tpl_file=~/working/bioworkflows/admin/csg.yml
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
container_marp=/home/gl2776/containers/marp.sif
container_lmm=/home/gl2776/containers/lmm.sif

# Remaining workflow settings
lmm_job_size=1
ylim=0
reverse_log_p=True
numThreads=20
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
bsize=1000
trait=bt
minMAC=1
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# sampleFile is not assigned in this cell and keep_gene is empty, so an unguarded
# "--sampleFile $sampleFile" / "--keep_gene $keep_gene" would expand to a flag
# with no value. ${var:+...} emits the flag only when the variable is non-empty.
lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie
    --bfile $bfile
    --genoFile $genoFile ${sampleFile:+--sampleFile $sampleFile}
    --phenoFile $phenoFile
    --formatFile $formatFile_regenie
    --phenoCol $phenoCol
    --covarCol $covarCol
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file ${keep_gene:+--keep_gene $keep_gene}
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/Combined_f2247_f2257_200k_exomes-regenie-burden_2021-09-15.sbatch
INFO: Workflow csg (ID=w8797907b7fc089ea) is executed successfully with 1 completed step.



## Regenie Burden with 150K exomes

### 091321_f3393_hearing_aid

In [19]:
# Write an sbatch script that runs the `regenie_burden` workflow for phenotype
# column f3393, with results under results_burden_exome/150k/091321_f3393_hearing_aid.

# Output directory and date-stamped job script
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/150k/091321_f3393_hearing_aid
lmm_sbatch_regenie=~/scripts/f3393_hearing_aid_150k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariate tables (both point at the same file)
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_78848ind_150K.tsv
# NOTE(review): covarFile is assigned but never forwarded in lmm_args below;
# confirm whether the workflow expects a --covarFile argument.
covarFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_78848ind_150K.tsv
phenoCol=f3393
covarCol=sex
qCovarCol='age PC1 PC2'

# This run uses the un-QC'ed plink files while we wait for the QC'ed ones.
# Brace expansion yields a space-separated list of the 22 per-chromosome .bed files.
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs produced by the burden_files step
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
# Left empty on purpose; the flag is only emitted when a value is supplied (see below)
keep_gene=
build_mask=max
aaf_bins='0.005 0.01'

# Workflow notebook, cluster template and containers
tpl_file=~/working/bioworkflows/admin/csg.yml
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
container_marp=/home/gl2776/containers/marp.sif
container_lmm=/home/gl2776/containers/lmm.sif

# Remaining workflow settings
lmm_job_size=1
ylim=0
reverse_log_p=True
numThreads=20
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
bsize=1000
trait=bt
minMAC=1
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# sampleFile is not assigned in this cell and keep_gene is empty, so an unguarded
# "--sampleFile $sampleFile" / "--keep_gene $keep_gene" would expand to a flag
# with no value. ${var:+...} emits the flag only when the variable is non-empty.
lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie
    --bfile $bfile
    --genoFile $genoFile ${sampleFile:+--sampleFile $sampleFile}
    --phenoFile $phenoFile
    --formatFile $formatFile_regenie
    --phenoCol $phenoCol
    --covarCol $covarCol
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file ${keep_gene:+--keep_gene $keep_gene}
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/f3393_hearing_aid_150k_exomes-regenie-burden_2021-09-15.sbatch
INFO: Workflow csg (ID=w51162fcc771707b4) is executed successfully with 1 completed step.



### 090321_f2247_hearing_difficulty

In [20]:
# Write an sbatch script that runs the `regenie_burden` workflow for phenotype
# column f2247, with results under results_burden_exome/150k/090321_f2247_hearing_difficulty.

# Output directory and date-stamped job script
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/150k/090321_f2247_hearing_difficulty
lmm_sbatch_regenie=~/scripts/f2247_hearing_difficulty_150k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariate tables (both point at the same file)
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_difficulty_f2247_expandedwhite_107507ind_150K.tsv
# NOTE(review): covarFile is assigned but never forwarded in lmm_args below;
# confirm whether the workflow expects a --covarFile argument.
covarFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_difficulty_f2247_expandedwhite_107507ind_150K.tsv
phenoCol=f2247
covarCol=sex
qCovarCol='age PC1 PC2'

# This run uses the un-QC'ed plink files while we wait for the QC'ed ones.
# Brace expansion yields a space-separated list of the 22 per-chromosome .bed files.
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs produced by the burden_files step
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
# Left empty on purpose; the flag is only emitted when a value is supplied (see below)
keep_gene=
build_mask=max
aaf_bins='0.005 0.01'

# Workflow notebook, cluster template and containers
tpl_file=~/working/bioworkflows/admin/csg.yml
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
container_marp=/home/gl2776/containers/marp.sif
container_lmm=/home/gl2776/containers/lmm.sif

# Remaining workflow settings
lmm_job_size=1
ylim=0
reverse_log_p=True
numThreads=20
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
bsize=1000
trait=bt
minMAC=1
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# sampleFile is not assigned in this cell and keep_gene is empty, so an unguarded
# "--sampleFile $sampleFile" / "--keep_gene $keep_gene" would expand to a flag
# with no value. ${var:+...} emits the flag only when the variable is non-empty.
lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie
    --bfile $bfile
    --genoFile $genoFile ${sampleFile:+--sampleFile $sampleFile}
    --phenoFile $phenoFile
    --formatFile $formatFile_regenie
    --phenoCol $phenoCol
    --covarCol $covarCol
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file ${keep_gene:+--keep_gene $keep_gene}
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/f2247_hearing_difficulty_150k_exomes-regenie-burden_2021-09-15.sbatch
INFO: Workflow csg (ID=wad7f4d041448e8f0) is executed successfully with 1 completed step.



### 090321_f2257_hearing_noise

In [21]:
# Write an sbatch script that runs the `regenie_burden` workflow for phenotype
# column f2257, with results under results_burden_exome/150k/090321_f2257_hearing_noise.

# Output directory and date-stamped job script
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/150k/090321_f2257_hearing_noise
lmm_sbatch_regenie=~/scripts/f2257_hearing_noise_150k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariate tables (both point at the same file)
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_noise_f2257_expandedwhite_123538ind_150K.tsv
# NOTE(review): covarFile is assigned but never forwarded in lmm_args below;
# confirm whether the workflow expects a --covarFile argument.
covarFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_noise_f2257_expandedwhite_123538ind_150K.tsv
phenoCol=f2257
covarCol=sex
qCovarCol='age PC1 PC2'

# This run uses the un-QC'ed plink files while we wait for the QC'ed ones.
# Brace expansion yields a space-separated list of the 22 per-chromosome .bed files.
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs produced by the burden_files step
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
# Left empty on purpose; the flag is only emitted when a value is supplied (see below)
keep_gene=
build_mask=max
aaf_bins='0.005 0.01'

# Workflow notebook, cluster template and containers
tpl_file=~/working/bioworkflows/admin/csg.yml
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
container_marp=/home/gl2776/containers/marp.sif
container_lmm=/home/gl2776/containers/lmm.sif

# Remaining workflow settings
lmm_job_size=1
ylim=0
reverse_log_p=True
numThreads=20
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
bsize=1000
trait=bt
minMAC=1
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# sampleFile is not assigned in this cell and keep_gene is empty, so an unguarded
# "--sampleFile $sampleFile" / "--keep_gene $keep_gene" would expand to a flag
# with no value. ${var:+...} emits the flag only when the variable is non-empty.
lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie
    --bfile $bfile
    --genoFile $genoFile ${sampleFile:+--sampleFile $sampleFile}
    --phenoFile $phenoFile
    --formatFile $formatFile_regenie
    --phenoCol $phenoCol
    --covarCol $covarCol
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file ${keep_gene:+--keep_gene $keep_gene}
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/f2257_hearing_noise_150k_exomes-regenie-burden_2021-09-15.sbatch
INFO: Workflow csg (ID=wab5d433b4da56679) is executed successfully with 1 completed step.



### 090321_Combined_f2247_f2257

In [22]:
# Write an sbatch script that runs the `regenie_burden` workflow for phenotype
# column f2247_f2257, with results under results_burden_exome/150k/090321_Combined_f2247_f2257.

# Output directory and date-stamped job script
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/150k/090321_Combined_f2247_f2257
lmm_sbatch_regenie=~/scripts/Combined_f2247_f2257_150k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariate tables (both point at the same file)
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Combined_f2247_f2257_expandedwhite_102133ind_150K.tsv
# NOTE(review): covarFile is assigned but never forwarded in lmm_args below;
# confirm whether the workflow expects a --covarFile argument.
covarFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Combined_f2247_f2257_expandedwhite_102133ind_150K.tsv
phenoCol=f2247_f2257
covarCol=sex
qCovarCol='age PC1 PC2'

# This run uses the un-QC'ed plink files while we wait for the QC'ed ones.
# Brace expansion yields a space-separated list of the 22 per-chromosome .bed files.
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs produced by the burden_files step
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
# Left empty on purpose; the flag is only emitted when a value is supplied (see below)
keep_gene=
build_mask=max
aaf_bins='0.005 0.01'

# Workflow notebook, cluster template and containers
tpl_file=~/working/bioworkflows/admin/csg.yml
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
container_marp=/home/gl2776/containers/marp.sif
container_lmm=/home/gl2776/containers/lmm.sif

# Remaining workflow settings
lmm_job_size=1
ylim=0
reverse_log_p=True
numThreads=20
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
bsize=1000
trait=bt
minMAC=1
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# sampleFile is not assigned in this cell and keep_gene is empty, so an unguarded
# "--sampleFile $sampleFile" / "--keep_gene $keep_gene" would expand to a flag
# with no value. ${var:+...} emits the flag only when the variable is non-empty.
lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie
    --bfile $bfile
    --genoFile $genoFile ${sampleFile:+--sampleFile $sampleFile}
    --phenoFile $phenoFile
    --formatFile $formatFile_regenie
    --phenoCol $phenoCol
    --covarCol $covarCol
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file ${keep_gene:+--keep_gene $keep_gene}
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/Combined_f2247_f2257_150k_exomes-regenie-burden_2021-09-15.sbatch
INFO: Workflow csg (ID=w604dbcdcac6c65da) is executed successfully with 1 completed step.



## Regenie Burden with 50K exomes

### 091321_f3393_hearing_aid

In [23]:
# Write an sbatch script that runs the `regenie_burden` workflow for phenotype
# column f3393, with results under results_burden_exome/50k/091321_f3393_hearing_aid.

# Output directory and date-stamped job script
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/50k/091321_f3393_hearing_aid
lmm_sbatch_regenie=~/scripts/f3393_hearing_aid_50k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariate tables (both point at the same file)
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_24189ind_50K.tsv
# NOTE(review): covarFile is assigned but never forwarded in lmm_args below;
# confirm whether the workflow expects a --covarFile argument.
covarFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_24189ind_50K.tsv
phenoCol=f3393
covarCol=sex
qCovarCol='age PC1 PC2'

# This run uses the un-QC'ed plink files while we wait for the QC'ed ones.
# Brace expansion yields a space-separated list of the 22 per-chromosome .bed files.
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs produced by the burden_files step
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
# Left empty on purpose; the flag is only emitted when a value is supplied (see below)
keep_gene=
build_mask=max
aaf_bins='0.005 0.01'

# Workflow notebook, cluster template and containers
tpl_file=~/working/bioworkflows/admin/csg.yml
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
container_marp=/home/gl2776/containers/marp.sif
container_lmm=/home/gl2776/containers/lmm.sif

# Remaining workflow settings
lmm_job_size=1
ylim=0
reverse_log_p=True
numThreads=20
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
bsize=1000
trait=bt
minMAC=1
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# sampleFile is not assigned in this cell and keep_gene is empty, so an unguarded
# "--sampleFile $sampleFile" / "--keep_gene $keep_gene" would expand to a flag
# with no value. ${var:+...} emits the flag only when the variable is non-empty.
lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie
    --bfile $bfile
    --genoFile $genoFile ${sampleFile:+--sampleFile $sampleFile}
    --phenoFile $phenoFile
    --formatFile $formatFile_regenie
    --phenoCol $phenoCol
    --covarCol $covarCol
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file ${keep_gene:+--keep_gene $keep_gene}
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/f3393_hearing_aid_50k_exomes-regenie-burden_2021-09-15.sbatch
INFO: Workflow csg (ID=w9ee3bb0306c92e88) is executed successfully with 1 completed step.



### 090321_f2247_hearing_difficulty

In [24]:
# Write the sbatch script for the regenie_burden workflow on phenotype column
# f2247 (hearing difficulty); results go under the 50k output directory.

# Workflow, job template and containers.
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
container_lmm=/home/gl2776/containers/lmm.sif
container_marp=/home/gl2776/containers/marp.sif

# Output locations: workflow working directory and the dated job script.
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/50k/090321_f2247_hearing_difficulty
lmm_sbatch_regenie=~/scripts/f2247_hearing_difficulty_50k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariates; covarFile is the same table as phenoFile.
# NOTE(review): covarFile is assigned but no --covarFile flag is passed below - confirm.
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_difficulty_f2247_expandedwhite_34596ind_50K.tsv
covarFile=$phenoFile
phenoCol=f2247
covarCol=sex
qCovarCol='age PC1 PC2'

# Genotype inputs.
# This run uses the un-QC'ed PLINK files while we wait for the QC'ed ones.
# The brace expansion yields one .bed path per autosome (chr1..chr22).
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs (091321 burden files and annovar annotation).
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv
keep_gene=
aaf_bins='0.005 0.01'
build_mask=max

# Remaining workflow options.
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=0
reverse_log_p=True

# Argument string handed verbatim to Get_Job_Script via --args.
# NOTE(review): sampleFile is not assigned in this cell; it expands to whatever
# an earlier cell left in the session (empty if none) - confirm.
lmm_args="regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --keep_gene $keep_gene
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/f2247_hearing_difficulty_50k_exomes-regenie-burden_2021-09-15.sbatch
INFO: Workflow csg (ID=w43418dbdd66619db) is executed successfully with 1 completed step.



### 090321_f2257_hearing_noise

In [25]:
# Write the sbatch script for the regenie_burden workflow on phenotype column
# f2257 (hearing noise); results go under the 50k output directory.

# Workflow, job template and containers.
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
container_lmm=/home/gl2776/containers/lmm.sif
container_marp=/home/gl2776/containers/marp.sif

# Output locations: workflow working directory and the dated job script.
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/50k/090321_f2257_hearing_noise
lmm_sbatch_regenie=~/scripts/f2257_hearing_noise_50k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariates; covarFile is the same table as phenoFile.
# NOTE(review): covarFile is assigned but no --covarFile flag is passed below - confirm.
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_noise_f2257_expandedwhite_38723ind_50K.tsv
covarFile=$phenoFile
phenoCol=f2257
covarCol=sex
qCovarCol='age PC1 PC2'

# Genotype inputs.
# This run uses the un-QC'ed PLINK files while we wait for the QC'ed ones.
# The brace expansion yields one .bed path per autosome (chr1..chr22).
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs (091321 burden files and annovar annotation).
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv
keep_gene=
aaf_bins='0.005 0.01'
build_mask=max

# Remaining workflow options.
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=0
reverse_log_p=True

# Argument string handed verbatim to Get_Job_Script via --args.
# NOTE(review): sampleFile is not assigned in this cell; it expands to whatever
# an earlier cell left in the session (empty if none) - confirm.
lmm_args="regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --keep_gene $keep_gene
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/f2257_hearing_noise_50k_exomes-regenie-burden_2021-09-15.sbatch
INFO: Workflow csg (ID=w0b166151a3ada337) is executed successfully with 1 completed step.



### 090321_Combined_f2247_f2257

In [27]:
# Write the sbatch script for the regenie_burden workflow on the combined
# phenotype column f2247_f2257; results go under the 50k output directory.

# Workflow, job template and containers.
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
container_lmm=/home/gl2776/containers/lmm.sif
container_marp=/home/gl2776/containers/marp.sif

# Output locations: workflow working directory and the dated job script.
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/50k/090321_Combined_f2247_f2257
lmm_sbatch_regenie=~/scripts/Combined_f2247_f2257_50k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariates; covarFile is the same table as phenoFile.
# NOTE(review): covarFile is assigned but no --covarFile flag is passed below - confirm.
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Combined_f2247_f2257_expandedwhite_32878ind_50K.tsv
covarFile=$phenoFile
phenoCol=f2247_f2257
covarCol=sex
qCovarCol='age PC1 PC2'

# Genotype inputs.
# This run uses the un-QC'ed PLINK files while we wait for the QC'ed ones.
# The brace expansion yields one .bed path per autosome (chr1..chr22).
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs (091321 burden files and annovar annotation).
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv
keep_gene=
aaf_bins='0.005 0.01'
build_mask=max

# Remaining workflow options.
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=0
reverse_log_p=True

# Argument string handed verbatim to Get_Job_Script via --args.
# NOTE(review): sampleFile is not assigned in this cell; it expands to whatever
# an earlier cell left in the session (empty if none) - confirm.
lmm_args="regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --keep_gene $keep_gene
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/scripts/Combined_f2247_f2257_50k_exomes-regenie-burden_2021-09-15.sbatch
INFO: Workflow csg (ID=w61d178575df1ecf6) is executed successfully with 1 completed step.



## Regenie Burden with 200K exomes for paper

### 091321_f3393_hearing_aid

In [None]:
# Write the sbatch script for the regenie_burden workflow on phenotype column
# f3393 (hearing aid); results go under the 200k_paper output directory.

# Workflow, job template and containers.
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
container_lmm=/home/gl2776/containers/lmm.sif
container_marp=/home/gl2776/containers/marp.sif

# Output locations: workflow working directory and the dated job script.
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/200k_paper/091321_f3393_hearing_aid
lmm_sbatch_regenie=~/scripts/f3393_hearing_aid_200k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariates; covarFile is the same table as phenoFile.
# NOTE(review): covarFile is assigned but no --covarFile flag is passed below - confirm.
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2.tsv
covarFile=$phenoFile
phenoCol=f3393
covarCol=sex
qCovarCol='age PC1 PC2'

# Genotype inputs.
# This run uses the un-QC'ed PLINK files while we wait for the QC'ed ones.
# The brace expansion yields one .bed path per autosome (chr1..chr22).
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs (091321 burden files and annovar annotation).
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv
keep_gene=
aaf_bins='0.005 0.01'
build_mask=max

# Remaining workflow options.
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=0
reverse_log_p=True

# Argument string handed verbatim to Get_Job_Script via --args.
# NOTE(review): sampleFile is not assigned in this cell; it expands to whatever
# an earlier cell left in the session (empty if none) - confirm.
lmm_args="regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --keep_gene $keep_gene
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

# HSPG2

In [2]:
# Write the sbatch script that runs the annovar workflow on the HSPG2 bim file.

# Workflow notebook and cluster job template.
annovar_sos=~/working/bioworkflows/variant-annotation/annovar.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml

# Working directory (trailing slash kept) and the dated job script inside it.
annovar_dir=/mnt/mfs/statgen/guangyou/HSPG2/burden_files/
annovar_sbatch=${annovar_dir}HSPG2_annotation_$(date +"%Y-%m-%d").sbatch

# Input variant list and job resources.
bfiles=/mnt/mfs/statgen/guangyou/HSPG2/HSPG2_snp_0.005.bim
walltime=60h
mem=30G

# Argument string handed verbatim to Get_Job_Script via --args.
annovar_args="annovar
    --cwd $annovar_dir 
    --bim_name $bfiles 
    --humandb /mnt/mfs/statgen/isabelle/REF/humandb  
    --xref_path /mnt/mfs/statgen/isabelle/REF/humandb 
    --job_size 1 
    --build 'hg38' 
    --name_prefix HSPG2_snp_0.005
    --walltime $walltime
    --mem $mem
    --container_annovar /mnt/mfs/statgen/containers/gatk4-annovar.sif
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$annovar_sos" \
    --to-script "$annovar_sbatch" \
    --args "$annovar_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /mnt/mfs/statgen/guangyou/HSPG2/burden_files/HSPG2_annotation_2021-09-21.sbatch
INFO: Workflow csg (ID=w199a4fba9b04ecbf) is executed successfully with 1 completed step.



In [3]:
# Write the sbatch script that runs the burden_files workflow on the HSPG2
# annovar annotation.

# Workflow notebook, cluster job template and annovar container.
anno_sos=~/working/bioworkflows/variant-annotation/annovar.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
container_annovar=$HOME/containers/gatk4-annovar.sif

# Working directory (trailing slash kept); the job script and the annotated
# csv both live inside it.
burden_dir=/mnt/mfs/statgen/guangyou/HSPG2/burden_files/
anno_sbatch_burden=${burden_dir}HSPG2_burdenfiles_$(date +"%Y-%m-%d").sbatch
annotated_file_hg38=${burden_dir}HSPG2_snp_0.005.hg38.hg38_multianno.csv

# Remaining inputs.
bim_name=/mnt/mfs/statgen/guangyou/HSPG2/HSPG2_snp_0.005.bim
name_prefix=HSPG2_burden_files
job_size=1

# Argument string handed verbatim to Get_Job_Script via --args.
anno_args="burden_files
    --cwd $burden_dir
    --annotated_file $annotated_file_hg38
    --bim_name $bim_name
    --name_prefix $name_prefix
    --job_size $job_size
    --container_annovar $container_annovar
    --container_lmm /mnt/mfs/statgen/containers/lmm.sif
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$anno_sos" \
    --to-script "$anno_sbatch_burden" \
    --args "$anno_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /mnt/mfs/statgen/guangyou/HSPG2/burden_files/HSPG2_burdenfiles_2021-09-21.sbatch
INFO: Workflow csg (ID=w6e2eecbf7cf27212) is executed successfully with 1 completed step.



In [9]:
# Write the sbatch script for the regenie_burden workflow on phenotype column
# f41270, using the HSPG2 burden files and the chromosome 1 exome .bed file.

# Workflow, job template and containers.
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
container_lmm=/home/gl2776/containers/lmm.sif
container_marp=/home/gl2776/containers/marp.sif

# Output locations: the dated job script is written inside the working directory.
lmm_dir_regenie=/mnt/mfs/statgen/guangyou/HSPG2/burden_test
lmm_sbatch_regenie=$lmm_dir_regenie/HSPG2-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariates; covarFile is the same table as phenoFile.
# NOTE(review): covarFile is assigned but no --covarFile flag is passed below - confirm.
phenoFile=/mnt/mfs/statgen/guangyou/HSPG2/ukb47922_exomed_white_187908qced_092021_f41270_neurodegenerative.csv
covarFile=$phenoFile
phenoCol=f41270
covarCol=sex
qCovarCol=age

# Genotype inputs.
# This run uses the un-QC'ed PLINK files while we wait for the QC'ed ones.
genoFile=/home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c1.merged.filtered.bed
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs.
anno_file=/mnt/mfs/statgen/guangyou/HSPG2/burden_files/HSPG2_snp_0.005.hg38.hg38_multianno.anno_file
set_list=/mnt/mfs/statgen/guangyou/HSPG2/burden_files/HSPG2_snp_0.005.hg38.hg38_multianno.set_list_file
mask_file=/mnt/mfs/statgen/guangyou/HSPG2/burden_files/HSPG2_snp_0.005.hg38.hg38_multianno.mask_file
snpannofile=/mnt/mfs/statgen/guangyou/HSPG2/burden_files/HSPG2_snp_0.005.hg38.hg38_multianno.csv
# keep_gene is expanded in lmm_args below, so assign it here (empty, as in the
# other regenie_burden cells) instead of inheriting whatever value a previously
# executed cell left in the kernel session.
keep_gene=
# NOTE(review): the other regenie_burden cells use '0.005 0.01'; confirm that
# 0.0005 is intended here.
aaf_bins='0.0005 0.01'
build_mask=max

# Remaining workflow options.
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=0
reverse_log_p=True

# Argument string handed verbatim to Get_Job_Script via --args.
# NOTE(review): sampleFile is not assigned in this cell; it expands to whatever
# an earlier cell left in the session (empty if none) - confirm.
lmm_args="regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --keep_gene $keep_gene
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /mnt/mfs/statgen/guangyou/HSPG2/burden_test/HSPG2-regenie-burden_2021-09-21.sbatch
INFO: Workflow csg (ID=w36b89c1376c5bd42) is executed successfully with 1 completed step.



In [None]:
# Write an sbatch script for the regenie_burden workflow on phenotype column
# f3393 (hearing aid) with the 091321 burden files.
# NOTE(review): lmm_dir_regenie is the HSPG2 burden_test directory, while every
# other setting matches the "200K exomes for paper" f3393 cell. The sbatch path
# is also identical to that cell's, so running both on the same day writes the
# same script file. Confirm which working directory / script name was intended.

# Workflow, job template and containers.
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
container_lmm=/home/gl2776/containers/lmm.sif
container_marp=/home/gl2776/containers/marp.sif

# Output locations: workflow working directory and the dated job script.
lmm_dir_regenie=/mnt/mfs/statgen/guangyou/HSPG2/burden_test
lmm_sbatch_regenie=~/scripts/f3393_hearing_aid_200k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariates; covarFile is the same table as phenoFile.
# NOTE(review): covarFile is assigned but no --covarFile flag is passed below - confirm.
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2.tsv
covarFile=$phenoFile
phenoCol=f3393
covarCol=sex
qCovarCol='age PC1 PC2'

# Genotype inputs.
# This run uses the un-QC'ed PLINK files while we wait for the QC'ed ones.
# The brace expansion yields one .bed path per autosome (chr1..chr22).
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs (091321 burden files and annovar annotation).
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/091321_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.mask_file
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv
keep_gene=
aaf_bins='0.005 0.01'
build_mask=max

# Remaining workflow options.
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=0
reverse_log_p=True

# Argument string handed verbatim to Get_Job_Script via --args.
# NOTE(review): sampleFile is not assigned in this cell; it expands to whatever
# an earlier cell left in the session (empty if none) - confirm.
lmm_args="regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --keep_gene $keep_gene
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

# Hearing impairment 100421 300k exome

## Create annotation file with annovar for 300K exomes

In [2]:
# Write the sbatch script that runs the annovar workflow for the 300K exome
# section; the script is stored inside the annovar working directory.

# Workflow notebook and cluster job template.
annovar_sos=~/working/bioworkflows/variant-annotation/annovar.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml

# Working directory (trailing slash kept) and the dated job script inside it.
annovar_dir=~/UKBiobank/results/ukb23155_300Kexomes_annovar/
annovar_sbatch=${annovar_dir}ukb23155_300kexome_annotation_$(date +"%Y-%m-%d").sbatch

# Input variant list and job resources.
# NOTE(review): this is the 091321 bim under the 200K annovar directory, while
# the section and name_prefix refer to the 300K / 100421 data - confirm.
bfiles=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.bim
walltime=60h
mem=100G

# Argument string handed verbatim to Get_Job_Script via --args.
annovar_args="annovar
    --cwd $annovar_dir 
    --bim_name $bfiles 
    --humandb /mnt/mfs/statgen/isabelle/REF/humandb  
    --xref_path /mnt/mfs/statgen/isabelle/REF/humandb 
    --job_size 1 
    --build 'hg38' 
    --name_prefix ukb23155_chr1_chr22_qcexomedata_100421
    --walltime $walltime
    --mem $mem
    --container_annovar /mnt/mfs/statgen/containers/gatk4-annovar.sif
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$annovar_sos" \
    --to-script "$annovar_sbatch" \
    --args "$annovar_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/ukb23155_300Kexomes_annovar/ukb23155_300kexome_annotation_2021-10-04.sbatch
INFO: Workflow csg (ID=w930be3fb4f021648) is executed successfully with 1 completed step.



## Create the anno_file, set_list_file and mask_files necessary for burden test

In [1]:
# Write the sbatch script that runs the burden_files workflow for the 300K
# exome section; the script is stored inside the burden-files directory.

# Workflow notebook, cluster job template and annovar container.
anno_sos=~/working/bioworkflows/variant-annotation/annovar.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
container_annovar=$HOME/containers/gatk4-annovar.sif

# Working directory and the dated job script inside it.
burden_dir=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files
anno_sbatch_burden=$burden_dir/ukb23155_300Kexomes_burdenfiles_$(date +"%Y-%m-%d").sbatch

# Inputs.
# NOTE(review): both files below are the 091321 files under the 200K annovar
# directory, not outputs of the 300K annovar cell - confirm.
annotated_file_hg38=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv
bim_name=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.bim
name_prefix=ukb23155_chr1_chr22_qcexomedata_100421_burden_files
job_size=1

# Argument string handed verbatim to Get_Job_Script via --args.
anno_args="burden_files
    --cwd $burden_dir
    --annotated_file $annotated_file_hg38
    --bim_name $bim_name
    --name_prefix $name_prefix
    --job_size $job_size
    --container_annovar $container_annovar
    --container_lmm /mnt/mfs/statgen/containers/lmm.sif
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$anno_sos" \
    --to-script "$anno_sbatch_burden" \
    --args "$anno_args"


INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_300Kexomes_burdenfiles_2021-10-04.sbatch
INFO: Workflow csg (ID=w694b908ef55bc328) is executed successfully with 1 completed step.



## Regenie Burden with 300K exomes

### 100421_f3393_hearing_aid

In [3]:
# Write the sbatch script for the regenie_burden workflow on phenotype column
# f3393 (hearing aid); results and the job script go under the 300k directory.

# Workflow, job template and containers.
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
container_lmm=/home/gl2776/containers/lmm.sif
container_marp=/home/gl2776/containers/marp.sif

# Output locations: the dated job script is written inside the working directory.
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_f3393_hearing_aid
lmm_sbatch_regenie=$lmm_dir_regenie/f3393_hearing_aid_300k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariates; covarFile is the same table as phenoFile.
# NOTE(review): covarFile is assigned but no --covarFile flag is passed below - confirm.
phenoFile=~/project/guangyou/HI_UKBB/100421_UKBB_Hearing_aid_f3393_expandedwhite_9165cases_141003ctrl.tsv
covarFile=$phenoFile
phenoCol=f3393
covarCol=sex
qCovarCol='age PC1 PC2'

# Genotype inputs.
# This run uses the un-QC'ed PLINK files while we wait for the QC'ed ones.
# The brace expansion yields one .bed path per autosome (chr1..chr22).
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs.
# NOTE(review): these burden files carry a 100421 name, whereas the 300k
# burden_files cell was given the 091321 annotated csv, and snpannofile below
# is the 091321 csv as well - confirm these paths exist and belong together.
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_chr1_chr22_100421.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_chr1_chr22_100421.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_chr1_chr22_100421.hg38.hg38_multianno.mask_file
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv
# keep_gene is still expanded in lmm_args below. Commenting the assignment out
# does not clear it in a running kernel, so assign the empty value explicitly
# rather than inheriting whatever a previously executed cell left behind.
keep_gene=
aaf_bins='0.005 0.01'
build_mask=max

# Remaining workflow options.
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=0
reverse_log_p=True

# Argument string handed verbatim to Get_Job_Script via --args.
# NOTE(review): sampleFile is not assigned in this cell; it expands to whatever
# an earlier cell left in the session (empty if none) - confirm.
lmm_args="regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --keep_gene $keep_gene
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_f3393_hearing_aid/f3393_hearing_aid_300k_exomes-regenie-burden_2021-10-04.sbatch
INFO: Workflow csg (ID=wed9bcc3971241667) is executed successfully with 1 completed step.



### 100421_f2247_hearing_difficulty

In [5]:
# Write the sbatch script for the regenie_burden workflow on phenotype column
# f2247 (hearing difficulty); results and the job script go under the 300k
# directory.

# Workflow, job template and containers.
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
container_lmm=/home/gl2776/containers/lmm.sif
container_marp=/home/gl2776/containers/marp.sif

# Output locations: the dated job script is written inside the working directory.
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_f2247_hearing_difficulty
lmm_sbatch_regenie=$lmm_dir_regenie/f2247_hearing_difficulty_300k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariates; covarFile is the same table as phenoFile.
# NOTE(review): covarFile is assigned but no --covarFile flag is passed below - confirm.
phenoFile=~/project/guangyou/HI_UKBB/100421_UKBB_Hearing_difficulty_f2247_expandedwhite_64962cases_141003ctrl.tsv
covarFile=$phenoFile
phenoCol=f2247
covarCol=sex
qCovarCol='age PC1 PC2'

# Genotype inputs.
# This run uses the un-QC'ed PLINK files while we wait for the QC'ed ones.
# The brace expansion yields one .bed path per autosome (chr1..chr22).
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs.
# NOTE(review): these burden files carry a 100421 name, whereas the 300k
# burden_files cell was given the 091321 annotated csv, and snpannofile below
# is the 091321 csv as well - confirm these paths exist and belong together.
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_chr1_chr22_100421.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_chr1_chr22_100421.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_chr1_chr22_100421.hg38.hg38_multianno.mask_file
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv
# keep_gene is still expanded in lmm_args below. Commenting the assignment out
# does not clear it in a running kernel, so assign the empty value explicitly
# rather than inheriting whatever a previously executed cell left behind.
keep_gene=
aaf_bins='0.005 0.01'
build_mask=max

# Remaining workflow options.
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=0
reverse_log_p=True

# Argument string handed verbatim to Get_Job_Script via --args.
# NOTE(review): sampleFile is not assigned in this cell; it expands to whatever
# an earlier cell left in the session (empty if none) - confirm.
lmm_args="regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --keep_gene $keep_gene
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_f2247_hearing_difficulty/f2247_hearing_difficulty_300k_exomes-regenie-burden_2021-10-04.sbatch
INFO: Workflow csg (ID=w3024954673c4a868) is executed successfully with 1 completed step.



### 100421_f2257_hearing_noise

In [7]:
# Write the sbatch script for the regenie_burden workflow on phenotype column
# f2257 (hearing noise); results and the job script go under the 300k
# directory.

# Workflow, job template and containers.
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml
container_lmm=/home/gl2776/containers/lmm.sif
container_marp=/home/gl2776/containers/marp.sif

# Output locations: the dated job script is written inside the working directory.
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_f2257_hearing_noise
lmm_sbatch_regenie=$lmm_dir_regenie/f2257_hearing_noise_300k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

# Phenotype and covariates; covarFile is the same table as phenoFile.
# NOTE(review): covarFile is assigned but no --covarFile flag is passed below - confirm.
phenoFile=~/project/guangyou/HI_UKBB/100421_UKBB_Hearing_noise_f2257_expandedwhite_95796cases_141003ctrl.tsv
covarFile=$phenoFile
phenoCol=f2257
covarCol=sex
qCovarCol='age PC1 PC2'

# Genotype inputs.
# This run uses the un-QC'ed PLINK files while we wait for the QC'ed ones.
# The brace expansion yields one .bed path per autosome (chr1..chr22).
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs.
# NOTE(review): these burden files carry a 100421 name, whereas the 300k
# burden_files cell was given the 091321 annotated csv, and snpannofile below
# is the 091321 csv as well - confirm these paths exist and belong together.
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_chr1_chr22_100421.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_chr1_chr22_100421.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_chr1_chr22_100421.hg38.hg38_multianno.mask_file
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv
# keep_gene is still expanded in lmm_args below. Commenting the assignment out
# does not clear it in a running kernel, so assign the empty value explicitly
# rather than inheriting whatever a previously executed cell left behind.
keep_gene=
aaf_bins='0.005 0.01'
build_mask=max

# Remaining workflow options.
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=0
reverse_log_p=True

# Argument string handed verbatim to Get_Job_Script via --args.
# NOTE(review): sampleFile is not assigned in this cell; it expands to whatever
# an earlier cell left in the session (empty if none) - confirm.
lmm_args="regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --keep_gene $keep_gene
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_f2257_hearing_noise/f2257_hearing_noise_300k_exomes-regenie-burden_2021-10-04.sbatch
INFO: Workflow csg (ID=w8d460ba41b2de798) is executed successfully with 1 completed step.



### 100421_Combined_f2247_f2257

In [8]:
# Build the job script for the REGENIE burden test of the combined f2247_f2257 phenotype
# on the 300K exomes. Get_Job_Script.ipynb wraps the regenie_burden workflow of $lmm_sos
# and the arguments in $lmm_args into the sbatch file named by $lmm_sbatch_regenie.

## Working directory and the dated sbatch script written inside it
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_Combined_f2247_f2257
lmm_sbatch_regenie=$lmm_dir_regenie/Combined_f2247_f2257_300k_exomes-regenie-burden_$(date +"%Y-%m-%d").sbatch

## Phenotype table; covarFile points at the same file.
## NOTE: covarFile is assigned here but is not forwarded in lmm_args below.
phenoFile=~/project/guangyou/HI_UKBB/100421_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_141003ctrl.tsv
covarFile=$phenoFile
phenoCol=f2247_f2257
covarCol=sex
qCovarCol='age PC1 PC2'

## Genotypes
# This run uses the un-QC'ed plink exome files while the QC'ed ones are not yet available.
genoFile=$(echo /home/gl2776/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=/home/gl2776/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

## Burden-test inputs: annotation, set list and mask definitions
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_chr1_chr22_100421.hg38.hg38_multianno.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_chr1_chr22_100421.hg38.hg38_multianno.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_burden_files/ukb23155_chr1_chr22_100421.hg38.hg38_multianno.mask_file
# keep_gene is not assigned in this cell (the assignment below is commented out).
#keep_gene=
build_mask=max
aaf_bins='0.005 0.01'

## Workflow notebook, cluster template, containers and output format template
tpl_file=~/working/bioworkflows/admin/csg.yml
lmm_sos=~/working/bioworkflows/GWAS/LMM.ipynb
container_marp=/home/gl2776/containers/marp.sif
container_lmm=/home/gl2776/containers/lmm.sif
formatFile_regenie=/home/gl2776/working/UKBB_GWAS_dev/data/regenie_template.yml

## Run settings
lmm_job_size=1
ylim=0
reverse_log_p=True
numThreads=20
bsize=1000
trait=bt
minMAC=1
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# sampleFile and keep_gene are not assigned in this cell. The ${var:+...} form
# forwards each flag only when its variable is non-empty, so an unset variable
# no longer leaves a flag without a value in the argument list. The optional
# flags share a line with the preceding argument to avoid emitting a blank line.
lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile ${sampleFile:+--sampleFile $sampleFile}
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file ${keep_gene:+--keep_gene $keep_gene}
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/REGENIE_results/results_burden_exome/300k/100421_Combined_f2247_f2257/Combined_f2247_f2257_300k_exomes-regenie-burden_2021-10-04.sbatch
INFO: Workflow csg (ID=w58a9d65830c52bda) is executed successfully with 1 completed step.



## 100521 Get PCA for every phenotype for 300K samples to obtain PCs for LMM analysis

In [2]:
# Settings shared by the per-phenotype PCA cells that follow.

# Cluster template used by Get_Job_Script.ipynb
tpl_file=~/working/bioworkflows/admin/csg.yml

# Workflow notebooks: genotype QC (step 1) and PCA (step 2)
gwasqc_sos=~/working/bioworkflows/GWAS/GWAS_QC.ipynb
pca_sos=~/working/bioworkflows/GWAS/PCA.ipynb

# Container image and job sizing
container_lmm="$HOME/containers/lmm.sif"
job_size=1
numThreads=1




### f.3393

#### step 1

In [None]:
## Columbia's cluster
# Step 1 for f3393: write the sbatch script that runs the qc:1 step of $gwasqc_sos
# on the QC'ed genotype array, restricted to the samples and variants listed below.
# tpl_file, gwasqc_sos, job_size and container_lmm are taken from the PCA setup cell.

cwd=~/UKBiobank/results/092921_PCA_results_300K/100521_f3393_300Kimputed_pca
# The script is labelled 300K (it was 50K) to agree with the output directory
# and with the step-2 script name, flashpca_f3393_pc_300K.
gwas_sbatch=$cwd/qc1_f3393_qcarray_300K_$(date +"%Y-%m-%d").sbatch

## Use the QC'ed genotype array
genoFile=~/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
keep_samples=~/project/guangyou/HI_UKBB/100421_UKBB_Hearing_aid_f3393_expandedwhite_9165cases_141003ctrl.keep_id
# Keep the variants retained after LD pruning. The glob is not expanded here:
# it is passed on literally inside gwasqc_args.
keep_variants=~/UKBiobank/results/092921_PCA_results_300K/092921_ldprun_unrelated/cache/*.filtered.prune.in

# GWAS QC thresholds: all set to 0 to avoid any further filtering
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0

# Resources
numThreads=1
mem='30G'

gwasqc_args="""qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --keep_variants $keep_variants
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads 
    --job_size $job_size
    --container_lmm $container_lmm
    --mem $mem
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $gwasqc_sos \
    --to-script $gwas_sbatch \
    --args "$gwasqc_args"

#### step 2

In [None]:
## Columbia's cluster
# Step 2 for f3393: write the sbatch script that runs the flashpca workflow of
# $pca_sos on the bed file produced by step 1.
# tpl_file, pca_sos, numThreads, job_size and container_lmm come from earlier cells.

# Results directory and the dated job script stored in it
cwd=~/UKBiobank/results/092921_PCA_results_300K/100521_f3393_300Kimputed_pca
pca_sbatch="$cwd/flashpca_f3393_pc_300K_$(date +%Y-%m-%d).sbatch"

# bfile obtained in step 1 (the glob is kept literal and passed on as-is)
genoFile=$cwd/cache/*.filtered.extracted.bed

# Phenotype table in the format FID, IID, ethnicity; the same column is used
# for labelling and for population
phenoFile=~/project/guangyou/HI_UKBB/100421_UKBB_Hearing_aid_f3393_expandedwhite_9165cases_141003ctrl.phenopca
pop_col=ethnicity
label_col=ethnicity

# PCA settings
k=2
max_axis=0
min_axis=0

pca_args="""flashpca
    --cwd $cwd
    --genoFile $genoFile
    --phenoFile $phenoFile
    --k $k
    --label_col $label_col
    --pop_col $pop_col
    --min_axis $min_axis
    --max_axis $max_axis
    --numThreads $numThreads 
    --job_size $job_size
    --container_lmm $container_lmm
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$pca_sos" \
    --to-script "$pca_sbatch" \
    --args "$pca_args"

### f.2247

In [3]:
## Columbia's cluster
# Step 1 for f2247: write the sbatch script that runs the qc:1 step of $gwasqc_sos
# on the QC'ed genotype array, restricted to the samples and variants listed below.
# tpl_file, gwasqc_sos, job_size and container_lmm are taken from the PCA setup cell.

# Results directory and the dated job script stored in it
cwd=~/UKBiobank/results/092921_PCA_results_300K/100521_f2247_300Kimputed_pca
gwas_sbatch="$cwd/qc1_f2247_qcarray_300K_$(date +%Y-%m-%d).sbatch"

# QC'ed genotype array
genoFile=~/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Samples to keep for this phenotype
keep_samples=~/project/guangyou/HI_UKBB/100421_UKBB_Hearing_difficulty_f2247_expandedwhite_64962cases_141003ctrl.keep_id
# Variants kept after LD pruning (glob stays literal and is passed on as-is)
keep_variants=~/UKBiobank/results/092921_PCA_results_300K/092921_ldprun_unrelated/cache/*.filtered.prune.in

# Every GWAS QC threshold is 0 so that no additional filtering takes place
mind_filter=0
hwe_filter=0
geno_filter=0
maf_filter=0

# Resources
mem=30G
numThreads=1

gwasqc_args="""qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --keep_variants $keep_variants
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads 
    --job_size $job_size
    --container_lmm $container_lmm
    --mem $mem
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$gwasqc_sos" \
    --to-script "$gwas_sbatch" \
    --args "$gwasqc_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/092921_PCA_results_300K/100521_f2247_300Kimputed_pca/qc1_f3393_qcarray_50K_2021-10-05.sbatch
INFO: Workflow csg (ID=wb85bba47f155cccd) is executed successfully with 1 completed step.



In [4]:
## Columbia's cluster
# Step 2 for f2247: write the sbatch script that runs the flashpca workflow of
# $pca_sos on the bed file produced by step 1.
# tpl_file, pca_sos, numThreads, job_size and container_lmm come from earlier cells.

# Results directory and the dated job script stored in it
cwd=~/UKBiobank/results/092921_PCA_results_300K/100521_f2247_300Kimputed_pca
pca_sbatch="$cwd/flashpca_f2247_pc_300K_$(date +%Y-%m-%d).sbatch"

# bfile obtained in step 1 (the glob is kept literal and passed on as-is)
genoFile=$cwd/cache/*.filtered.extracted.bed

# Phenotype table in the format FID, IID, ethnicity; the same column is used
# for labelling and for population
phenoFile=~/project/guangyou/HI_UKBB/100421_UKBB_Hearing_difficulty_f2247_expandedwhite_64962cases_141003ctrl.phenopca
pop_col=ethnicity
label_col=ethnicity

# PCA settings
k=2
max_axis=0
min_axis=0

pca_args="""flashpca
    --cwd $cwd
    --genoFile $genoFile
    --phenoFile $phenoFile
    --k $k
    --label_col $label_col
    --pop_col $pop_col
    --min_axis $min_axis
    --max_axis $max_axis
    --numThreads $numThreads 
    --job_size $job_size
    --container_lmm $container_lmm
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$pca_sos" \
    --to-script "$pca_sbatch" \
    --args "$pca_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/092921_PCA_results_300K/100521_f2247_300Kimputed_pca/flashpca_f2247_pc_300K_2021-10-05.sbatch
INFO: Workflow csg (ID=w2697f21d631f38cb) is executed successfully with 1 completed step.



### f.2257

In [7]:
## Columbia's cluster
# Step 1 for f2257: write the sbatch script that runs the qc:1 step of $gwasqc_sos
# on the QC'ed genotype array, restricted to the samples and variants listed below.
# tpl_file, gwasqc_sos, job_size and container_lmm are taken from the PCA setup cell.

cwd=~/UKBiobank/results/092921_PCA_results_300K/100521_f2257_300Kimputed_pca
# The script is labelled 300K to agree with the output directory and with the
# step-2 script name, flashpca_f2257_pc_300K. Scripts generated before this
# change carry the old 50K label, as in this cell's recorded output.
gwas_sbatch=$cwd/qc1_f2257_qcarray_300K_$(date +"%Y-%m-%d").sbatch

## Use the QC'ed genotype array
genoFile=~/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
keep_samples=~/project/guangyou/HI_UKBB/100421_UKBB_Hearing_noise_f2257_expandedwhite_95796cases_141003ctrl.keep_id
# Keep the variants retained after LD pruning. The glob is not expanded here:
# it is passed on literally inside gwasqc_args.
keep_variants=~/UKBiobank/results/092921_PCA_results_300K/092921_ldprun_unrelated/cache/*.filtered.prune.in

# GWAS QC thresholds: all set to 0 to avoid any further filtering
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0

# Resources
numThreads=1
mem='30G'

gwasqc_args="""qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --keep_variants $keep_variants
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads 
    --job_size $job_size
    --container_lmm $container_lmm
    --mem $mem
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $gwasqc_sos \
    --to-script $gwas_sbatch \
    --args "$gwasqc_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/092921_PCA_results_300K/100521_f2257_300Kimputed_pca/qc1_f2257_qcarray_50K_2021-10-05.sbatch
INFO: Workflow csg (ID=w1c878f0c4ac82418) is executed successfully with 1 completed step.



In [8]:
## Columbia's cluster
# Step 2 for f2257: write the sbatch script that runs the flashpca workflow of
# $pca_sos on the bed file produced by step 1.
# tpl_file, pca_sos, numThreads, job_size and container_lmm come from earlier cells.

# Results directory and the dated job script stored in it
cwd=~/UKBiobank/results/092921_PCA_results_300K/100521_f2257_300Kimputed_pca
pca_sbatch="$cwd/flashpca_f2257_pc_300K_$(date +%Y-%m-%d).sbatch"

# bfile obtained in step 1 (the glob is kept literal and passed on as-is)
genoFile=$cwd/cache/*.filtered.extracted.bed

# Phenotype table in the format FID, IID, ethnicity; the same column is used
# for labelling and for population
phenoFile=~/project/guangyou/HI_UKBB/100421_UKBB_Hearing_noise_f2257_expandedwhite_95796cases_141003ctrl.phenopca
pop_col=ethnicity
label_col=ethnicity

# PCA settings
k=2
max_axis=0
min_axis=0

pca_args="""flashpca
    --cwd $cwd
    --genoFile $genoFile
    --phenoFile $phenoFile
    --k $k
    --label_col $label_col
    --pop_col $pop_col
    --min_axis $min_axis
    --max_axis $max_axis
    --numThreads $numThreads 
    --job_size $job_size
    --container_lmm $container_lmm
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$pca_sos" \
    --to-script "$pca_sbatch" \
    --args "$pca_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/092921_PCA_results_300K/100521_f2257_300Kimputed_pca/flashpca_f2257_pc_300K_2021-10-05.sbatch
INFO: Workflow csg (ID=w70f105a15ccd00af) is executed successfully with 1 completed step.



### f.2247_f.2257

In [9]:
## Columbia's cluster
# Step 1 for the combined f2247_f2257 phenotype: write the sbatch script that runs
# the qc:1 step of $gwasqc_sos on the QC'ed genotype array, restricted to the
# samples and variants listed below.
# tpl_file, gwasqc_sos, job_size and container_lmm are taken from the PCA setup cell.

# Results directory and the dated job script stored in it
cwd=~/UKBiobank/results/092921_PCA_results_300K/100521_f2247_f2257_300Kimputed_pca
gwas_sbatch="$cwd/qc1_f2247_f2257_qcarray_300K_$(date +%Y-%m-%d).sbatch"

# QC'ed genotype array
genoFile=~/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Samples to keep for this phenotype
keep_samples=~/project/guangyou/HI_UKBB/100421_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_141003ctrl.keep_id
# Variants kept after LD pruning (glob stays literal and is passed on as-is)
keep_variants=~/UKBiobank/results/092921_PCA_results_300K/092921_ldprun_unrelated/cache/*.filtered.prune.in

# Every GWAS QC threshold is 0 so that no additional filtering takes place
mind_filter=0
hwe_filter=0
geno_filter=0
maf_filter=0

# Resources
mem=30G
numThreads=1

gwasqc_args="""qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --keep_variants $keep_variants
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads 
    --job_size $job_size
    --container_lmm $container_lmm
    --mem $mem
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$gwasqc_sos" \
    --to-script "$gwas_sbatch" \
    --args "$gwasqc_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/092921_PCA_results_300K/100521_f2247_f2257_300Kimputed_pca/qc1_f2247_f2257_qcarray_300K_2021-10-05.sbatch
INFO: Workflow csg (ID=wfdc02a67f2ad3206) is executed successfully with 1 completed step.



In [10]:
## Columbia's cluster
# Step 2 for the combined f2247_f2257 phenotype: write the sbatch script that runs
# the flashpca workflow of $pca_sos on the bed file produced by step 1.
# tpl_file, pca_sos, numThreads, job_size and container_lmm come from earlier cells.

# Results directory and the dated job script stored in it
cwd=~/UKBiobank/results/092921_PCA_results_300K/100521_f2247_f2257_300Kimputed_pca
pca_sbatch="$cwd/flashpca_f2247_f2257_pc_300K_$(date +%Y-%m-%d).sbatch"

# bfile obtained in step 1 (the glob is kept literal and passed on as-is)
genoFile=$cwd/cache/*.filtered.extracted.bed

# Phenotype table in the format FID, IID, ethnicity; the same column is used
# for labelling and for population
phenoFile=~/project/guangyou/HI_UKBB/100421_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_141003ctrl.phenopca
pop_col=ethnicity
label_col=ethnicity

# PCA settings
k=2
max_axis=0
min_axis=0

pca_args="""flashpca
    --cwd $cwd
    --genoFile $genoFile
    --phenoFile $phenoFile
    --k $k
    --label_col $label_col
    --pop_col $pop_col
    --min_axis $min_axis
    --max_axis $max_axis
    --numThreads $numThreads 
    --job_size $job_size
    --container_lmm $container_lmm
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$pca_sos" \
    --to-script "$pca_sbatch" \
    --args "$pca_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/092921_PCA_results_300K/100521_f2247_f2257_300Kimputed_pca/flashpca_f2247_f2257_pc_300K_2021-10-05.sbatch
INFO: Workflow csg (ID=w4a20f8a740e103da) is executed successfully with 1 completed step.



# 10-13-21 Hearing impairment

## Create the anno_file, set_list_file and mask_files necessary for burden test

In [1]:
# Write the sbatch script that runs the burden_files workflow of $anno_sos, which
# takes the annotated variant table and the merged bim file as inputs.

# Output directory and the dated job script stored in it
burden_dir=~/UKBiobank/results/REGENIE_results/results_burden_exome/101321_burden_files
anno_sbatch_burden="$burden_dir/ukb23155_200Kexomes_burdenfiles_$(date +%Y-%m-%d).sbatch"

# Inputs: ANNOVAR multianno table (renamed columns, gzipped) and the merged exome bim
annotated_file_hg38=~/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.gz
bim_name=~/UKBiobank/results/ukb23155_200Kexomes_annovar/091321_exome_bim_merge/ukb23155_chr1_chr22_091321.bim
name_prefix=ukb23155_chr1_chr22_qcexomedata_101321_burden_files

# Workflow notebook, cluster template, container and job sizing
anno_sos=~/project/bioworkflows/variant-annotation/annovar.ipynb
tpl_file=~/project/bioworkflows/admin/csg.yml
container_annovar="$HOME/containers/gatk4-annovar.sif"
job_size=1

# The lmm container path is given literally in the argument list rather than
# through a variable.
anno_args="""burden_files
    --cwd $burden_dir
    --annotated_file $annotated_file_hg38
    --bim_name $bim_name
    --name_prefix $name_prefix
    --job_size $job_size
    --container_annovar $container_annovar
    --container_lmm /mnt/mfs/statgen/containers/lmm.sif
"""

sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$anno_sos" \
    --to-script "$anno_sbatch_burden" \
    --args "$anno_args"


INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/gl2776/UKBiobank/results/REGENIE_results/results_burden_exome/101321_burden_files/ukb23155_200Kexomes_burdenfiles_2021-10-13.sbatch
INFO: Workflow csg (ID=w2360ed87c5992d95) is executed successfully with 1 completed step.



## Regenie Burden with 200K exomes
### 101321_f3393_hearing_aid

In [3]:
# REGENIE burden test on the 200K exomes for f3393 (hearing_aid): collect the inputs
# and have Get_Job_Script.ipynb write the sbatch script for the regenie_burden
# workflow of $lmm_sos.

# Output directory and the dated job script stored in it
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/101321_f3393_hearing_aid
lmm_sbatch_regenie="$lmm_dir_regenie/f3393_hearing_aid_200k_exomes-regenie-burden_$(date +%Y-%m-%d).sbatch"

# Phenotype table; covarFile points at the same file.
# covarFile is assigned here but is not forwarded in lmm_args below.
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2.tsv
covarFile=$phenoFile
phenoCol=f3393
covarCol=sex
qCovarCol='age PC1 PC2'

# Genotypes: per-chromosome exome bed files (chr 1-22) and the genotype-array bed file
genoFile=$(echo ~/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=~/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs, all read from 102121_burden_files.
# NOTE(review): mask_file uses a different basename from the other three files
# (chr1_chr22_101321 and no ".csv") -- confirm it matches the file on disk.
# NOTE(review): the aaf_file suffix is spelled ".aff_file" -- confirm as well.
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_101321.hg38.hg38_multianno.renamedcols.mask_file
aaf_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.aff_file
aaf_bins='0.005 0.01'
build_mask=max

# Workflow notebook, cluster template, containers and output format template
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/project/bioworkflows/admin/csg.yml
container_lmm=~/containers/lmm.sif
container_marp=~/containers/marp.sif
formatFile_regenie=~/project/UKBB_GWAS_dev/data/regenie_template.yml

# Run settings
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
reverse_log_p=True
ylim=0
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --aaf_file $aaf_file
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/dmc2245/UKBiobank/results/REGENIE_results/results_burden_exome/101321_f3393_hearing_aid/f3393_hearing_aid_200k_exomes-regenie-burden_2021-11-05.sbatch
INFO: Workflow csg (ID=wb83e51d8a2499f76) is executed successfully with 1 completed step.



### 101321_f2247_hearing_difficulty

In [4]:
# REGENIE burden test on the 200K exomes for f2247 (hearing_difficulty): collect the
# inputs and have Get_Job_Script.ipynb write the sbatch script for the
# regenie_burden workflow of $lmm_sos.

# Output directory and the dated job script stored in it
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/101321_f2247_hearing_difficulty
lmm_sbatch_regenie="$lmm_dir_regenie/f2247_Hearing_difficulty_200k_exomes-regenie-burden_$(date +%Y-%m-%d).sbatch"

# Phenotype table; covarFile points at the same file.
# covarFile is assigned here but is not forwarded in lmm_args below.
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl_PC1_2.tsv
covarFile=$phenoFile
phenoCol=f2247
covarCol=sex
qCovarCol='age PC1 PC2'

# Genotypes: per-chromosome exome bed files (chr 1-22) and the genotype-array bed file
genoFile=$(echo ~/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=~/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs, all read from 102121_burden_files.
# NOTE(review): mask_file uses a different basename from the other three files
# (chr1_chr22_101321 and no ".csv") -- confirm it matches the file on disk.
# NOTE(review): the aaf_file suffix is spelled ".aff_file" -- confirm as well.
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_101321.hg38.hg38_multianno.renamedcols.mask_file
aaf_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.aff_file
aaf_bins='0.005 0.01'
build_mask=max

# Workflow notebook, cluster template, containers and output format template
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/project/bioworkflows/admin/csg.yml
container_lmm=~/containers/lmm.sif
container_marp=~/containers/marp.sif
formatFile_regenie=~/project/UKBB_GWAS_dev/data/regenie_template.yml

# Run settings
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
reverse_log_p=True
ylim=0
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --aaf_file $aaf_file
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/dmc2245/UKBiobank/results/REGENIE_results/results_burden_exome/101321_f2247_hearing_difficulty/f2247_Hearing_difficulty_200k_exomes-regenie-burden_2021-11-05.sbatch
INFO: Workflow csg (ID=w2f4723017cfb785a) is executed successfully with 1 completed step.



### 101321_f2257_hearing_noise

In [5]:
# REGENIE burden test on the 200K exomes for f2257 (hearing_noise): collect the
# inputs and have Get_Job_Script.ipynb write the sbatch script for the
# regenie_burden workflow of $lmm_sos.

# Output directory and the dated job script stored in it
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/101321_f2257_hearing_noise
lmm_sbatch_regenie="$lmm_dir_regenie/f2257_hearing_noise_200k_exomes-regenie-burden_$(date +%Y-%m-%d).sbatch"

# Phenotype table; covarFile points at the same file.
# covarFile is assigned here but is not forwarded in lmm_args below.
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_noise_f2257_expandedwhite_65660cases_96601ctrl_PC1_2.tsv
covarFile=$phenoFile
phenoCol=f2257
covarCol=sex
qCovarCol='age PC1 PC2'

# Genotypes: per-chromosome exome bed files (chr 1-22) and the genotype-array bed file
genoFile=$(echo ~/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=~/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs, all read from 102121_burden_files.
# NOTE(review): mask_file uses a different basename from the other three files
# (chr1_chr22_101321 and no ".csv") -- confirm it matches the file on disk.
# NOTE(review): the aaf_file suffix is spelled ".aff_file" -- confirm as well.
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_101321.hg38.hg38_multianno.renamedcols.mask_file
aaf_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.aff_file
aaf_bins='0.005 0.01'
build_mask=max

# Workflow notebook, cluster template, containers and output format template
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/project/bioworkflows/admin/csg.yml
container_lmm=~/containers/lmm.sif
container_marp=~/containers/marp.sif
formatFile_regenie=~/project/UKBB_GWAS_dev/data/regenie_template.yml

# Run settings
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
reverse_log_p=True
ylim=0
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --aaf_file $aaf_file
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/dmc2245/UKBiobank/results/REGENIE_results/results_burden_exome/101321_f2257_hearing_noise/f2257_hearing_noise_200k_exomes-regenie-burden_2021-11-05.sbatch
INFO: Workflow csg (ID=wbb95dcedc62a5741) is executed successfully with 1 completed step.



### 101321_combined_burden

In [6]:
# REGENIE burden test on the 200K exomes for the combined f2247_f2257 phenotype:
# collect the inputs and have Get_Job_Script.ipynb write the sbatch script for the
# regenie_burden workflow of $lmm_sos.

# Output directory and the dated job script stored in it
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/101321_Combined_f2247_f2257
lmm_sbatch_regenie="$lmm_dir_regenie/Combined_f2247_f2257_200K_exomes-regenie-burden_$(date +%Y-%m-%d).sbatch"

# Phenotype table; covarFile points at the same file.
# covarFile is assigned here but is not forwarded in lmm_args below.
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_96601ctrl_PC1_2.tsv
covarFile=$phenoFile
phenoCol=f2247_f2257
covarCol=sex
qCovarCol='age PC1 PC2'

# Genotypes: per-chromosome exome bed files (chr 1-22) and the genotype-array bed file
genoFile=$(echo ~/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)
bfile=~/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed

# Burden-test inputs, all read from 102121_burden_files.
# NOTE(review): mask_file uses a different basename from the other three files
# (chr1_chr22_101321 and no ".csv") -- confirm it matches the file on disk.
# NOTE(review): the aaf_file suffix is spelled ".aff_file" -- confirm as well.
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.set_list_file
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_101321.hg38.hg38_multianno.renamedcols.mask_file
aaf_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.aff_file
aaf_bins='0.005 0.01'
build_mask=max

# Workflow notebook, cluster template, containers and output format template
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
tpl_file=~/project/bioworkflows/admin/csg.yml
container_lmm=~/containers/lmm.sif
container_marp=~/containers/marp.sif
formatFile_regenie=~/project/UKBB_GWAS_dev/data/regenie_template.yml

# Run settings
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
reverse_log_p=True
ylim=0
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

lmm_args="""regenie_burden
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --trait $trait
    --anno_file $anno_file
    --set_list $set_list
    --mask_file $mask_file
    --aaf_file $aaf_file
    --aaf_bins $aaf_bins
    --build_mask $build_mask
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --minMAC $minMAC
    --snpannofile $snpannofile
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/dmc2245/UKBiobank/results/REGENIE_results/results_burden_exome/101321_Combined_f2247_f2257/Combined_f2247_f2257_200K_exomes-regenie-burden_2021-11-05.sbatch
INFO: Workflow csg (ID=w5e22927012c61d26) is executed successfully with 1 completed step.



## Regenie burden with 150K exomes

### 120621_f3393_hearing_aid

In [2]:
# Build the regenie_burden job script for f3393 (hearing aid), 150K exomes.

# --- Output locations ---------------------------------------------------
lmm_dir_regenie=~/UKBiobank/results/REGENIE_results/results_burden_exome/150k/091321_f3393_hearing_aid
# The job script is written inside the working directory, stamped with today's date.
lmm_sbatch_regenie=${lmm_dir_regenie}/f3393_hearing_aid_150k_exomes-regenie-burden_$(date +'%Y-%m-%d').sbatch

# --- Phenotype and covariates -------------------------------------------
phenoFile=~/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_78848ind_150K.tsv
# Same table as phenoFile.
# NOTE(review): covarFile is set here but is not referenced in lmm_args below -- confirm whether the workflow needs it.
covarFile="$phenoFile"
phenoCol=f3393
covarCol=sex
qCovarCol="age PC1 PC2"

# --- Genotype inputs ----------------------------------------------------
bfile=~/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Brace expansion yields one .bed path per chromosome (c1..c22), joined by spaces.
genoFile=$(echo ~/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)

# --- Burden-test inputs -------------------------------------------------
anno_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.anno_file
set_list=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.set_list_file
# NOTE(review): this file name carries 101321 while the sibling burden files carry 091321 -- confirm it is the intended file.
mask_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_101321.hg38.hg38_multianno.renamedcols.mask_file
# NOTE(review): suffix is ".aff_file" (not ".aaf_file") -- verify against the file on disk.
aaf_file=~/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.aff_file
aaf_bins="0.005 0.01"
build_mask=max
snpannofile=~/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# --- Analysis and plotting options (forwarded verbatim in lmm_args) -----
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=20
reverse_log_p=True

# --- Workflow, cluster template and containers --------------------------
tpl_file=~/project/bioworkflows/admin/csg.yml
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
formatFile_regenie=~/project/UKBB_GWAS_dev/data/regenie_template.yml
container_lmm=~/containers/lmm.sif
container_marp=~/containers/marp.sif

# Argument string handed to the workflow through --args.
# Whitespace inside the string is part of the value that is passed on.
lmm_args="regenie_burden
    --cwd ${lmm_dir_regenie} 
    --bfile ${bfile} 
    --genoFile ${genoFile}
    --phenoFile ${phenoFile} 
    --formatFile ${formatFile_regenie} 
    --phenoCol ${phenoCol}
    --covarCol ${covarCol}  
    --qCovarCol ${qCovarCol}
    --bsize ${bsize}
    --trait ${trait}
    --anno_file ${anno_file}
    --set_list ${set_list}
    --mask_file ${mask_file}
    --aaf_file ${aaf_file}
    --aaf_bins ${aaf_bins}
    --build_mask ${build_mask}
    --job_size ${lmm_job_size}
    --ylim ${ylim}
    --reverse_log_p ${reverse_log_p}
    --numThreads ${numThreads}
    --minMAC ${minMAC}
    --snpannofile ${snpannofile}
    --container_lmm ${container_lmm}
    --container_marp ${container_marp}
"

# Run the csg step of Get_Job_Script.ipynb, which writes the job script to $lmm_sbatch_regenie.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/dmc2245/UKBiobank/results/REGENIE_results/results_burden_exome/150k/091321_f3393_hearing_aid/f3393_hearing_aid_150k_exomes-regenie-burden_2021-12-06.sbatch
INFO: Workflow csg (ID=w450138e74160bfbb) is executed successfully with 1 completed step.



### 120621_f2247_hearing_difficulty

In [3]:
# Build the regenie_burden job script for f2247 (hearing difficulty), 150K exomes.

# --- Output locations ---------------------------------------------------
lmm_dir_regenie=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/150k/090321_f2247_hearing_difficulty
# The job script is written inside the working directory, stamped with today's date.
lmm_sbatch_regenie=${lmm_dir_regenie}/f2247_hearing_difficulty_150k_exomes-regenie-burden_$(date +'%Y-%m-%d').sbatch

# --- Phenotype and covariates -------------------------------------------
phenoFile=/mnt/mfs/statgen/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_difficulty_f2247_expandedwhite_107507ind_150K.tsv
# Same table as phenoFile.
# NOTE(review): covarFile is set here but is not referenced in lmm_args below -- confirm whether the workflow needs it.
covarFile="$phenoFile"
phenoCol=f2247
covarCol=sex
qCovarCol="age PC1 PC2"

# --- Genotype inputs ----------------------------------------------------
bfile=/mnt/mfs/statgen/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Brace expansion yields one .bed path per chromosome (c1..c22), joined by spaces.
# Uses the absolute /mnt/mfs/statgen location like every other UKBiobank data path
# in this cell, so the value does not depend on the invoking user's home directory.
genoFile=$(echo /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)

# --- Burden-test inputs -------------------------------------------------
anno_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.anno_file
set_list=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.set_list_file
# NOTE(review): this file name carries 101321 while the sibling burden files carry 091321 -- confirm it is the intended file.
mask_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_101321.hg38.hg38_multianno.renamedcols.mask_file
# NOTE(review): suffix is ".aff_file" (not ".aaf_file") -- verify against the file on disk.
aaf_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.aff_file
aaf_bins="0.005 0.01"
build_mask=max
snpannofile=/mnt/mfs/statgen/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# --- Analysis and plotting options (forwarded verbatim in lmm_args) -----
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=20
reverse_log_p=True

# --- Workflow, cluster template and containers --------------------------
tpl_file=~/project/bioworkflows/admin/csg.yml
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
formatFile_regenie=~/project/UKBB_GWAS_dev/data/regenie_template.yml
container_lmm=~/containers/lmm.sif
container_marp=~/containers/marp.sif

# Argument string handed to the workflow through --args.
# Whitespace inside the string is part of the value that is passed on.
lmm_args="regenie_burden
    --cwd ${lmm_dir_regenie} 
    --bfile ${bfile} 
    --genoFile ${genoFile}
    --phenoFile ${phenoFile} 
    --formatFile ${formatFile_regenie} 
    --phenoCol ${phenoCol}
    --covarCol ${covarCol}  
    --qCovarCol ${qCovarCol}
    --bsize ${bsize}
    --trait ${trait}
    --anno_file ${anno_file}
    --set_list ${set_list}
    --mask_file ${mask_file}
    --aaf_file ${aaf_file}
    --aaf_bins ${aaf_bins}
    --build_mask ${build_mask}
    --job_size ${lmm_job_size}
    --ylim ${ylim}
    --reverse_log_p ${reverse_log_p}
    --numThreads ${numThreads}
    --minMAC ${minMAC}
    --snpannofile ${snpannofile}
    --container_lmm ${container_lmm}
    --container_marp ${container_marp}
"

# Run the csg step of Get_Job_Script.ipynb, which writes the job script to $lmm_sbatch_regenie.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /home/dmc2245/UKBiobank/results/REGENIE_results/results_burden_exome/150k/090321_f2247_hearing_difficulty/f2247_hearing_difficulty_150k_exomes-regenie-burden_2021-12-06.sbatch
INFO: Workflow csg (ID=w7103c703d25d5a04) is executed successfully with 1 completed step.



### 120621_f2257_hearing_noise

In [5]:
# Build the regenie_burden job script for f2257 (hearing noise), 150K exomes.

# --- Output locations ---------------------------------------------------
lmm_dir_regenie=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/150k/090321_f2257_hearing_noise
# The job script is written inside the working directory, stamped with today's date.
lmm_sbatch_regenie=${lmm_dir_regenie}/f2257_hearing_noise_150k_exomes-regenie-burden_$(date +'%Y-%m-%d').sbatch

# --- Phenotype and covariates -------------------------------------------
phenoFile=/mnt/mfs/statgen/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_noise_f2257_expandedwhite_123538ind_150K.tsv
# Same table as phenoFile.
# NOTE(review): covarFile is set here but is not referenced in lmm_args below -- confirm whether the workflow needs it.
covarFile="$phenoFile"
phenoCol=f2257
covarCol=sex
qCovarCol="age PC1 PC2"

# --- Genotype inputs ----------------------------------------------------
bfile=/mnt/mfs/statgen/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Brace expansion yields one .bed path per chromosome (c1..c22), joined by spaces.
genoFile=$(echo /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)

# --- Burden-test inputs -------------------------------------------------
anno_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.anno_file
set_list=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.set_list_file
# NOTE(review): this file name carries 101321 while the sibling burden files carry 091321 -- confirm it is the intended file.
mask_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_101321.hg38.hg38_multianno.renamedcols.mask_file
# NOTE(review): suffix is ".aff_file" (not ".aaf_file") -- verify against the file on disk.
aaf_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.aff_file
aaf_bins="0.005 0.01"
build_mask=max
snpannofile=/mnt/mfs/statgen/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# --- Analysis and plotting options (forwarded verbatim in lmm_args) -----
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=20
reverse_log_p=True

# --- Workflow, cluster template and containers --------------------------
tpl_file=~/project/bioworkflows/admin/csg.yml
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
formatFile_regenie=~/project/UKBB_GWAS_dev/data/regenie_template.yml
container_lmm=~/containers/lmm.sif
container_marp=~/containers/marp.sif

# Argument string handed to the workflow through --args.
# Whitespace inside the string is part of the value that is passed on.
lmm_args="regenie_burden
    --cwd ${lmm_dir_regenie} 
    --bfile ${bfile} 
    --genoFile ${genoFile}
    --phenoFile ${phenoFile} 
    --formatFile ${formatFile_regenie} 
    --phenoCol ${phenoCol}
    --covarCol ${covarCol}  
    --qCovarCol ${qCovarCol}
    --bsize ${bsize}
    --trait ${trait}
    --anno_file ${anno_file}
    --set_list ${set_list}
    --mask_file ${mask_file}
    --aaf_file ${aaf_file}
    --aaf_bins ${aaf_bins}
    --build_mask ${build_mask}
    --job_size ${lmm_job_size}
    --ylim ${ylim}
    --reverse_log_p ${reverse_log_p}
    --numThreads ${numThreads}
    --minMAC ${minMAC}
    --snpannofile ${snpannofile}
    --container_lmm ${container_lmm}
    --container_marp ${container_marp}
"

# Run the csg step of Get_Job_Script.ipynb, which writes the job script to $lmm_sbatch_regenie.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/150k/090321_f2257_hearing_noise/f2257_hearing_noise_150k_exomes-regenie-burden_2021-12-06.sbatch
INFO: Workflow csg (ID=w93d54b67932d180a) is executed successfully with 1 completed step.



### 120621_combined

In [6]:
# Build the regenie_burden job script for the combined f2247_f2257 trait, 150K exomes.

# --- Output locations ---------------------------------------------------
lmm_dir_regenie=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/150k/090321_Combined_f2247_f2257
# The job script is written inside the working directory, stamped with today's date.
lmm_sbatch_regenie=${lmm_dir_regenie}/Combined_f2247_f2257_150k_exomes-regenie-burden_$(date +'%Y-%m-%d').sbatch

# --- Phenotype and covariates -------------------------------------------
phenoFile=/mnt/mfs/statgen/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Combined_f2247_f2257_expandedwhite_102133ind_150K.tsv
# Same table as phenoFile.
# NOTE(review): covarFile is set here but is not referenced in lmm_args below -- confirm whether the workflow needs it.
covarFile="$phenoFile"
phenoCol=f2247_f2257
covarCol=sex
qCovarCol="age PC1 PC2"

# --- Genotype inputs ----------------------------------------------------
bfile=/mnt/mfs/statgen/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Brace expansion yields one .bed path per chromosome (c1..c22), joined by spaces.
genoFile=$(echo /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)

# --- Burden-test inputs -------------------------------------------------
anno_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.anno_file
set_list=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.set_list_file
# NOTE(review): this file name carries 101321 while the sibling burden files carry 091321 -- confirm it is the intended file.
mask_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_101321.hg38.hg38_multianno.renamedcols.mask_file
# NOTE(review): suffix is ".aff_file" (not ".aaf_file") -- verify against the file on disk.
aaf_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.aff_file
aaf_bins="0.005 0.01"
build_mask=max
snpannofile=/mnt/mfs/statgen/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# --- Analysis and plotting options (forwarded verbatim in lmm_args) -----
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=20
reverse_log_p=True

# --- Workflow, cluster template and containers --------------------------
tpl_file=~/project/bioworkflows/admin/csg.yml
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
formatFile_regenie=~/project/UKBB_GWAS_dev/data/regenie_template.yml
container_lmm=~/containers/lmm.sif
container_marp=~/containers/marp.sif

# Argument string handed to the workflow through --args.
# Whitespace inside the string is part of the value that is passed on.
lmm_args="regenie_burden
    --cwd ${lmm_dir_regenie} 
    --bfile ${bfile} 
    --genoFile ${genoFile}
    --phenoFile ${phenoFile} 
    --formatFile ${formatFile_regenie} 
    --phenoCol ${phenoCol}
    --covarCol ${covarCol}  
    --qCovarCol ${qCovarCol}
    --bsize ${bsize}
    --trait ${trait}
    --anno_file ${anno_file}
    --set_list ${set_list}
    --mask_file ${mask_file}
    --aaf_file ${aaf_file}
    --aaf_bins ${aaf_bins}
    --build_mask ${build_mask}
    --job_size ${lmm_job_size}
    --ylim ${ylim}
    --reverse_log_p ${reverse_log_p}
    --numThreads ${numThreads}
    --minMAC ${minMAC}
    --snpannofile ${snpannofile}
    --container_lmm ${container_lmm}
    --container_marp ${container_marp}
"

# Run the csg step of Get_Job_Script.ipynb, which writes the job script to $lmm_sbatch_regenie.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/150k/090321_Combined_f2247_f2257/Combined_f2247_f2257_150k_exomes-regenie-burden_2021-12-06.sbatch
INFO: Workflow csg (ID=wc689c4864f6e9424) is executed successfully with 1 completed step.



## Regenie burden with 50K exomes

### 120621_f3393_hearing_aid

In [8]:
# Build the regenie_burden job script for f3393 (hearing aid), 50K exomes.

# --- Output locations ---------------------------------------------------
lmm_dir_regenie=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/50k/091321_f3393_hearing_aid
# The job script is written inside the working directory, stamped with today's date.
lmm_sbatch_regenie=${lmm_dir_regenie}/f3393_hearing_aid_50k_exomes-regenie-burden_$(date +'%Y-%m-%d').sbatch

# --- Phenotype and covariates -------------------------------------------
phenoFile=/mnt/mfs/statgen/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_24189ind_50K.tsv
# Same table as phenoFile.
# NOTE(review): covarFile is set here but is not referenced in lmm_args below -- confirm whether the workflow needs it.
covarFile="$phenoFile"
phenoCol=f3393
covarCol=sex
qCovarCol="age PC1 PC2"

# --- Genotype inputs ----------------------------------------------------
bfile=/mnt/mfs/statgen/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Brace expansion yields one .bed path per chromosome (c1..c22), joined by spaces.
genoFile=$(echo /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)

# --- Burden-test inputs -------------------------------------------------
anno_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.anno_file
set_list=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.set_list_file
# NOTE(review): this file name carries 101321 while the sibling burden files carry 091321 -- confirm it is the intended file.
mask_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_101321.hg38.hg38_multianno.renamedcols.mask_file
# NOTE(review): suffix is ".aff_file" (not ".aaf_file") -- verify against the file on disk.
aaf_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.aff_file
aaf_bins="0.005 0.01"
build_mask=max
snpannofile=/mnt/mfs/statgen/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# --- Analysis and plotting options (forwarded verbatim in lmm_args) -----
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=10
reverse_log_p=True

# --- Workflow, cluster template and containers --------------------------
tpl_file=~/project/bioworkflows/admin/csg.yml
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
formatFile_regenie=~/project/UKBB_GWAS_dev/data/regenie_template.yml
container_lmm=~/containers/lmm.sif
container_marp=~/containers/marp.sif

# Argument string handed to the workflow through --args.
# Whitespace inside the string is part of the value that is passed on.
lmm_args="regenie_burden
    --cwd ${lmm_dir_regenie} 
    --bfile ${bfile} 
    --genoFile ${genoFile}
    --phenoFile ${phenoFile} 
    --formatFile ${formatFile_regenie} 
    --phenoCol ${phenoCol}
    --covarCol ${covarCol}  
    --qCovarCol ${qCovarCol}
    --bsize ${bsize}
    --trait ${trait}
    --anno_file ${anno_file}
    --set_list ${set_list}
    --mask_file ${mask_file}
    --aaf_file ${aaf_file}
    --aaf_bins ${aaf_bins}
    --build_mask ${build_mask}
    --job_size ${lmm_job_size}
    --ylim ${ylim}
    --reverse_log_p ${reverse_log_p}
    --numThreads ${numThreads}
    --minMAC ${minMAC}
    --snpannofile ${snpannofile}
    --container_lmm ${container_lmm}
    --container_marp ${container_marp}
"

# Run the csg step of Get_Job_Script.ipynb, which writes the job script to $lmm_sbatch_regenie.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/50k/091321_f3393_hearing_aid/f3393_hearing_aid_50k_exomes-regenie-burden_2021-12-06.sbatch
INFO: Workflow csg (ID=w910860fed8cf33f7) is executed successfully with 1 completed step.



### 120621_f2247_hearing_difficulty

In [9]:
# Build the regenie_burden job script for f2247 (hearing difficulty), 50K exomes.

# --- Output locations ---------------------------------------------------
lmm_dir_regenie=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/50k/090321_f2247_hearing_difficulty
# The job script is written inside the working directory, stamped with today's date.
lmm_sbatch_regenie=${lmm_dir_regenie}/f2247_hearing_difficulty_50k_exomes-regenie-burden_$(date +'%Y-%m-%d').sbatch

# --- Phenotype and covariates -------------------------------------------
phenoFile=/mnt/mfs/statgen/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_difficulty_f2247_expandedwhite_34596ind_50K.tsv
# Same table as phenoFile.
# NOTE(review): covarFile is set here but is not referenced in lmm_args below -- confirm whether the workflow needs it.
covarFile="$phenoFile"
phenoCol=f2247
covarCol=sex
qCovarCol="age PC1 PC2"

# --- Genotype inputs ----------------------------------------------------
bfile=/mnt/mfs/statgen/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Brace expansion yields one .bed path per chromosome (c1..c22), joined by spaces.
genoFile=$(echo /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)

# --- Burden-test inputs -------------------------------------------------
anno_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.anno_file
set_list=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.set_list_file
# NOTE(review): this file name carries 101321 while the sibling burden files carry 091321 -- confirm it is the intended file.
mask_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_101321.hg38.hg38_multianno.renamedcols.mask_file
# NOTE(review): suffix is ".aff_file" (not ".aaf_file") -- verify against the file on disk.
aaf_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.aff_file
aaf_bins="0.005 0.01"
build_mask=max
snpannofile=/mnt/mfs/statgen/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# --- Analysis and plotting options (forwarded verbatim in lmm_args) -----
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=10
reverse_log_p=True

# --- Workflow, cluster template and containers --------------------------
tpl_file=~/project/bioworkflows/admin/csg.yml
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
formatFile_regenie=~/project/UKBB_GWAS_dev/data/regenie_template.yml
container_lmm=~/containers/lmm.sif
container_marp=~/containers/marp.sif

# Argument string handed to the workflow through --args.
# Whitespace inside the string is part of the value that is passed on.
lmm_args="regenie_burden
    --cwd ${lmm_dir_regenie} 
    --bfile ${bfile} 
    --genoFile ${genoFile}
    --phenoFile ${phenoFile} 
    --formatFile ${formatFile_regenie} 
    --phenoCol ${phenoCol}
    --covarCol ${covarCol}  
    --qCovarCol ${qCovarCol}
    --bsize ${bsize}
    --trait ${trait}
    --anno_file ${anno_file}
    --set_list ${set_list}
    --mask_file ${mask_file}
    --aaf_file ${aaf_file}
    --aaf_bins ${aaf_bins}
    --build_mask ${build_mask}
    --job_size ${lmm_job_size}
    --ylim ${ylim}
    --reverse_log_p ${reverse_log_p}
    --numThreads ${numThreads}
    --minMAC ${minMAC}
    --snpannofile ${snpannofile}
    --container_lmm ${container_lmm}
    --container_marp ${container_marp}
"

# Run the csg step of Get_Job_Script.ipynb, which writes the job script to $lmm_sbatch_regenie.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/50k/090321_f2247_hearing_difficulty/f2247_hearing_difficulty_50k_exomes-regenie-burden_2021-12-06.sbatch
INFO: Workflow csg (ID=w7f0943a69fe2c5ae) is executed successfully with 1 completed step.



### 120621_f2257_hearing_noise

In [13]:
# Build the regenie_burden job script for f2257 (hearing noise), 50K exomes.

# --- Output locations ---------------------------------------------------
lmm_dir_regenie=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/50k/090321_f2257_hearing_noise
# The job script is written inside the working directory, stamped with today's date.
lmm_sbatch_regenie=${lmm_dir_regenie}/f2257_hearing_noise_50k_exomes-regenie-burden_$(date +'%Y-%m-%d').sbatch

# --- Phenotype and covariates -------------------------------------------
phenoFile=/mnt/mfs/statgen/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_noise_f2257_expandedwhite_38723ind_50K.tsv
# Same table as phenoFile.
# NOTE(review): covarFile is set here but is not referenced in lmm_args below -- confirm whether the workflow needs it.
covarFile="$phenoFile"
phenoCol=f2257
covarCol=sex
qCovarCol="age PC1 PC2"

# --- Genotype inputs ----------------------------------------------------
bfile=/mnt/mfs/statgen/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Brace expansion yields one .bed path per chromosome (c1..c22), joined by spaces.
genoFile=$(echo /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)

# --- Burden-test inputs -------------------------------------------------
anno_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.anno_file
set_list=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.set_list_file
# NOTE(review): this file name carries 101321 while the sibling burden files carry 091321 -- confirm it is the intended file.
mask_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_101321.hg38.hg38_multianno.renamedcols.mask_file
# NOTE(review): suffix is ".aff_file" (not ".aaf_file") -- verify against the file on disk.
aaf_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.aff_file
aaf_bins="0.005 0.01"
build_mask=max
snpannofile=/mnt/mfs/statgen/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# --- Analysis and plotting options (forwarded verbatim in lmm_args) -----
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=10
reverse_log_p=True

# --- Workflow, cluster template and containers --------------------------
tpl_file=~/project/bioworkflows/admin/csg.yml
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
formatFile_regenie=~/project/UKBB_GWAS_dev/data/regenie_template.yml
container_lmm=~/containers/lmm.sif
container_marp=~/containers/marp.sif

# Argument string handed to the workflow through --args.
# Whitespace inside the string is part of the value that is passed on.
lmm_args="regenie_burden
    --cwd ${lmm_dir_regenie} 
    --bfile ${bfile} 
    --genoFile ${genoFile}
    --phenoFile ${phenoFile} 
    --formatFile ${formatFile_regenie} 
    --phenoCol ${phenoCol}
    --covarCol ${covarCol}  
    --qCovarCol ${qCovarCol}
    --bsize ${bsize}
    --trait ${trait}
    --anno_file ${anno_file}
    --set_list ${set_list}
    --mask_file ${mask_file}
    --aaf_file ${aaf_file}
    --aaf_bins ${aaf_bins}
    --build_mask ${build_mask}
    --job_size ${lmm_job_size}
    --ylim ${ylim}
    --reverse_log_p ${reverse_log_p}
    --numThreads ${numThreads}
    --minMAC ${minMAC}
    --snpannofile ${snpannofile}
    --container_lmm ${container_lmm}
    --container_marp ${container_marp}
"

# Run the csg step of Get_Job_Script.ipynb, which writes the job script to $lmm_sbatch_regenie.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/50k/090321_f2257_hearing_noise/f2257_hearing_noise_50k_exomes-regenie-burden_2021-12-06.sbatch
INFO: Workflow csg (ID=w7aa12a7ca24f72d5) is executed successfully with 1 completed step.



### 120621_combined

In [12]:
# Build the regenie_burden job script for the combined f2247_f2257 trait, 50K exomes.

# --- Output locations ---------------------------------------------------
lmm_dir_regenie=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/50k/090321_Combined_f2247_f2257
# The job script is written inside the working directory, stamped with today's date.
lmm_sbatch_regenie=${lmm_dir_regenie}/Combined_f2247_f2257_50k_exomes-regenie-burden_$(date +'%Y-%m-%d').sbatch

# --- Phenotype and covariates -------------------------------------------
phenoFile=/mnt/mfs/statgen/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Combined_f2247_f2257_expandedwhite_32878ind_50K.tsv
# Same table as phenoFile.
# NOTE(review): covarFile is set here but is not referenced in lmm_args below -- confirm whether the workflow needs it.
covarFile="$phenoFile"
phenoCol=f2247_f2257
covarCol=sex
qCovarCol="age PC1 PC2"

# --- Genotype inputs ----------------------------------------------------
bfile=/mnt/mfs/statgen/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Brace expansion yields one .bed path per chromosome (c1..c22), joined by spaces.
genoFile=$(echo /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c{1..22}.merged.filtered.bed)

# --- Burden-test inputs -------------------------------------------------
anno_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.anno_file
set_list=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.set_list_file
# NOTE(review): this file name carries 101321 while the sibling burden files carry 091321 -- confirm it is the intended file.
mask_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_101321.hg38.hg38_multianno.renamedcols.mask_file
# NOTE(review): suffix is ".aff_file" (not ".aaf_file") -- verify against the file on disk.
aaf_file=/mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/102121_burden_files/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.renamedcols.csv.aff_file
aaf_bins="0.005 0.01"
build_mask=max
snpannofile=/mnt/mfs/statgen/UKBiobank/results/ukb23155_200Kexomes_annovar/2021_10_12_hg38_exome/ukb23155_chr1_chr22_091321.hg38.hg38_multianno.csv

# --- Analysis and plotting options (forwarded verbatim in lmm_args) -----
trait=bt
bsize=1000
minMAC=1
numThreads=20
lmm_job_size=1
ylim=10
reverse_log_p=True

# --- Workflow, cluster template and containers --------------------------
tpl_file=~/project/bioworkflows/admin/csg.yml
lmm_sos=~/project/bioworkflows/GWAS/LMM.ipynb
formatFile_regenie=~/project/UKBB_GWAS_dev/data/regenie_template.yml
container_lmm=~/containers/lmm.sif
container_marp=~/containers/marp.sif

# Argument string handed to the workflow through --args.
# Whitespace inside the string is part of the value that is passed on.
lmm_args="regenie_burden
    --cwd ${lmm_dir_regenie} 
    --bfile ${bfile} 
    --genoFile ${genoFile}
    --phenoFile ${phenoFile} 
    --formatFile ${formatFile_regenie} 
    --phenoCol ${phenoCol}
    --covarCol ${covarCol}  
    --qCovarCol ${qCovarCol}
    --bsize ${bsize}
    --trait ${trait}
    --anno_file ${anno_file}
    --set_list ${set_list}
    --mask_file ${mask_file}
    --aaf_file ${aaf_file}
    --aaf_bins ${aaf_bins}
    --build_mask ${build_mask}
    --job_size ${lmm_job_size}
    --ylim ${ylim}
    --reverse_log_p ${reverse_log_p}
    --numThreads ${numThreads}
    --minMAC ${minMAC}
    --snpannofile ${snpannofile}
    --container_lmm ${container_lmm}
    --container_marp ${container_marp}
"

# Run the csg step of Get_Job_Script.ipynb, which writes the job script to $lmm_sbatch_regenie.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$lmm_sos" \
    --to-script "$lmm_sbatch_regenie" \
    --args "$lmm_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   /mnt/mfs/statgen/UKBiobank/results/REGENIE_results/results_burden_exome/50k/090321_Combined_f2247_f2257/Combined_f2247_f2257_50k_exomes-regenie-burden_2021-12-06.sbatch
INFO: Workflow csg (ID=we9e057c086015572) is executed successfully with 1 completed step.

