## Scripts for pVCF QC UKBB

In [1]:
# Shared settings: input VCFs, workflow notebook and job-script template
user_path='/home/dmc2245'
# The glob is stored as a literal pattern; an assignment does not expand it.
vcfs='/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/ukb23156_c*'
tpl_file="${user_path}/project/bioworkflows/admin/csg.yml"
vcf_sos="${user_path}/project/UKBB_GWAS_dev/workflow/VCF_QC_pipeline.ipynb"

# Output directory, dated job-script name and reference genome
vcf_output='/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files'
vcf_sbatch="../output/vcf_all_chromosomes_qc_$(date +'%Y-%m-%d').sh"
ref_hg38='/home/dmc2245/software/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa'

# Analysis thresholds (separate values for SNPs and indels)
DP_snp='10'
DP_indel='10'
GQ='20'
AB_snp='0.15'
AB_indel='0.2'

# First try (kept for reference, disabled); results located in
# /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/plink_geno_mind
#geno_filter=0.1
#mind_filter=0.1
#maf_filter=0.0

# Second try (active); results located in
# /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/
geno_filter='0.1'
mind_filter='0.0'
maf_filter='0.0'

# Singularity images
container_lmm='/mnt/mfs/statgen/containers/lmm.sif'
container_marp='/mnt/mfs/statgen/containers/marp.sif'

## Pipeline including snps and indels

In [2]:
# Argument string for the `qc` workflow, built from the variables set in the
# previous cell. Plain double quotes are used: in bash a leading/trailing
# `"""` is only an empty string glued to a normal double-quoted string, so
# the stored value is unchanged. Each backslash-newline inside the quotes is
# removed by bash; the newline after `qc` and the final newline are kept.
vcf_args="qc
    --cwd $vcf_output \
    --vcfs $vcfs \
    --ref_hg38 $ref_hg38 \
    --DP_snp $DP_snp \
    --DP_indel $DP_indel \
    --GQ $GQ \
    --AB_snp $AB_snp \
    --AB_indel $AB_indel \
    --geno_filter $geno_filter \
    --mind_filter $mind_filter \
    --maf_filter $maf_filter \
    --container_lmm $container_lmm
"

# Ask the `csg` workflow of Get_Job_Script.ipynb to write the job script to
# $vcf_sbatch. Expansions are quoted so each path reaches sos as exactly one
# argument, with no word splitting or pathname expansion.
sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/vcf_all_chromosomes_qc_2021-05-20.sh
INFO: Workflow csg (ID=wa11eef0f3befea48) is executed successfully with 1 completed step.


## Run 07/27/21 including snps and indels

In [14]:
# Job-script generation for the 07/27/21 run of the `qc` workflow.

# Input VCFs: the glob is kept as a literal pattern (an assignment does not
# expand it, and it is only ever expanded inside double quotes below).
vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/ukb23156_c*
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/VCF_QC_pipeline.ipynb

# Directories
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/072721_run
vcf_sbatch=./../output/vcf_all_chromosomes_qc_$(date +"%Y-%m-%d").sh
# Alternative job-script name (disabled)
#vcf_sbatch=./../output/vcf_chr21_b0_qc_$(date +"%Y-%m-%d").sh
ref_hg38=/home/dmc2245/software/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa

# Variables for analysis
DP_snp=10
DP_indel=10
GQ=20
AB_snp=0.15
AB_indel=0.2

# Files for annotation of known/novel snps
# Note: the tab file is created with the 1-based coordinate system that numbers nucleotides directly (https://www.biostars.org/p/84686/)
anno_file=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/dbSPN_hg38/00-All.renamechrs.indels_snps.tab.gz
anno_header=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/dbSPN_hg38/00-All.renamechrs.indels_snps.header.txt
bins="0,5E-5"

# Plink filtering variables
geno_filter=0.1
mind_filter=0.0
maf_filter=0.0

# Containers (container_marp is set but not referenced by the command in this cell)
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
container_marp=/mnt/mfs/statgen/containers/marp.sif

# Argument string for the workflow. Plain double quotes replace the
# Python-style `"""`, which bash reads as an empty string next to an ordinary
# double-quoted string; the stored value is identical.
vcf_args="qc
    --cwd $vcf_output \
    --vcfs $vcfs \
    --ref_hg38 $ref_hg38 \
    --DP_snp $DP_snp \
    --DP_indel $DP_indel \
    --GQ $GQ \
    --AB_snp $AB_snp \
    --AB_indel $AB_indel \
    --geno_filter $geno_filter \
    --mind_filter $mind_filter \
    --maf_filter $maf_filter \
    --anno_file $anno_file \
    --anno_header $anno_header \
    --bins $bins \
    --container_lmm $container_lmm
"

# Write the job script to $vcf_sbatch. Expansions are quoted so each path is
# passed as a single argument.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/vcf_all_chromosomes_qc_2021-07-29.sh
INFO: Workflow csg (ID=we5349cf471e2992f) is executed successfully with 1 completed step.


## Test run with modified pipeline (stats at the end)

In [2]:
# Job-script generation for the modified pipeline (stats at the end).
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/VCF_QC_pipeline.ipynb

# Directories
## Run for chr5_b12 as a test (disabled; uncomment these three lines and
## comment out the "all chromosomes" assignments below to use it)
#vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/ukb23156_c5_b12_v1.vcf.gz
#vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/080221_testrun
#vcf_sbatch=./../output/vcf_test_chr5b12_qc_$(date +"%Y-%m-%d").sh
## Run for all chromosomes (active). The glob stays a literal pattern: it is
## not expanded by the assignment or inside the double quotes below.
vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/ukb23156_c*
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/072721_run
vcf_sbatch=./../output/vcf_allchr_qc_$(date +"%Y-%m-%d").sh
ref_hg38=/home/dmc2245/software/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa

# Variables for analysis
DP_snp=10
DP_indel=10
GQ=20
AB_snp=0.15
AB_indel=0.2

# Files for annotation of known/novel snps
# Note: the tab file is created with the 1-based coordinate system that numbers nucleotides directly (https://www.biostars.org/p/84686/)
anno_file=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/dbSPN_hg38/00-All.renamechrs.indels_snps.tab.gz
anno_header=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/dbSPN_hg38/00-All.renamechrs.indels_snps.header.txt
bins="0,5E-5"

# Plink filtering variables
geno_filter=0.1
mind_filter=0.0
maf_filter=0.0

# Containers (container_marp is set but not referenced by the command in this cell)
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
container_marp=/mnt/mfs/statgen/containers/marp.sif

# Argument string for the workflow. Plain double quotes replace the
# Python-style `"""`, which bash reads as an empty string next to an ordinary
# double-quoted string; the stored value is identical.
vcf_args="qc
    --cwd $vcf_output \
    --vcfs $vcfs \
    --ref_hg38 $ref_hg38 \
    --DP_snp $DP_snp \
    --DP_indel $DP_indel \
    --GQ $GQ \
    --AB_snp $AB_snp \
    --AB_indel $AB_indel \
    --geno_filter $geno_filter \
    --mind_filter $mind_filter \
    --maf_filter $maf_filter \
    --anno_file $anno_file \
    --anno_header $anno_header \
    --bins $bins \
    --container_lmm $container_lmm
"

# Write the job script to $vcf_sbatch. Expansions are quoted so each path is
# passed as a single argument.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/vcf_allchr_qc_2021-08-02.sh
INFO: Workflow csg (ID=w748b28473a86c84f) is executed successfully with 1 completed step.


## dryrun

In [None]:
# Put the miniconda binaries first on PATH and load the Singularity module
# before calling sos.
export PATH="$HOME/miniconda3/bin:$PATH"
module load Singularity/3.5.3

# Run `qc:1` of the QC workflow directly on the 10-sample chr3 subset, with
# every option written out literally.
# NOTE(review): the heading calls this a dry run, but no dry-run option is
# passed to `sos run` here -- confirm whether one was intended.
sos run /home/dmc2245/project/UKBB_GWAS_dev/workflow/VCF_QC_pipeline.ipynb \
    qc:1 \
    --cwd /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/testing_bcftools \
    --vcfs /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/testing_bcftools/chr3_mwe.subset_10samples.vcf.gz \
    --ref_hg38 /home/dmc2245/software/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa \
    --DP_snp 10 \
    --DP_indel 10 \
    --GQ 20 \
    --AB_snp 0.15 \
    --AB_indel 0.2 \
    --geno_filter 0.1 \
    --mind_filter 0.0 \
    --maf_filter 0.0 \
    --anno_file /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/dbSPN_hg38/00-All.renamechrs.indels_snps.tab.gz \
    --anno_header /mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/dbSPN_hg38/00-All.renamechrs.indels_snps.header.txt \
    --bins 0,5e-05 \
    --container_lmm /mnt/mfs/statgen/containers/lmm.sif

INFO: Running qc_1: Split multiallelic sites and create unique variant annotation


## Pipeline filtering only bi-allelic SNPs 

This is useful for calculating VCF stats more accurately, specifically the Ti/Tv ratio, which gives weird results when multiallelic variants are included.

In [1]:
# Job-script generation for the bi-allelic pipeline (VCF_QC_pipeline_biallelic.ipynb).

# Input VCFs: the glob is kept as a literal pattern (an assignment does not
# expand it, and it is only ever expanded inside double quotes below).
vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/ukb23156_c*
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/VCF_QC_pipeline_biallelic.ipynb

# Directories
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/biallelic_analysis
vcf_sbatch=../output/vcf_all_chromosomes_qc_biallelic_$(date +"%Y-%m-%d").sh

# Variables for analysis (SNP thresholds only; no indel values are set here)
DP_snp=10
GQ=20
AB_snp=0.15

# Plink filtering variables
geno_filter=0.1
mind_filter=0.0
maf_filter=0.0

# Containers (container_marp is set but not referenced by the command in this cell)
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
container_marp=/mnt/mfs/statgen/containers/marp.sif

# Argument string for `qc:1-2`. Plain double quotes replace the Python-style
# `"""`, which bash reads as an empty string next to an ordinary
# double-quoted string; the stored value is identical.
vcf_args="qc:1-2
    --cwd $vcf_output \
    --vcfs $vcfs \
    --DP_snp $DP_snp \
    --GQ $GQ \
    --AB_snp $AB_snp \
    --geno_filter $geno_filter \
    --mind_filter $mind_filter \
    --maf_filter $maf_filter \
    --container_lmm $container_lmm
"

# Write the job script to $vcf_sbatch. Expansions are quoted so each path is
# passed as a single argument.
sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/vcf_all_chromosomes_qc_biallelic2021-06-21.sh
INFO: Workflow csg (ID=w0c1e53ecc6f61f78) is executed successfully with 1 completed step.


## Test normal conditions with stats verbose mode and per sample counts

In [1]:
# Job-script generation for the verbose-stats test on a single chr22 batch.

# Common variables
vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/testing_bcftools/ukb23156_c22_b0_v1.vcf.gz
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/VCF_QC_pipeline.ipynb

# Directories
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/testing_bcftools/verbose_stats
vcf_sbatch=../output/test_verbose_titv_chr22_b0_$(date +"%Y-%m-%d").sh
ref_hg38=/home/dmc2245/software/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa

# Variables for analysis
DP_snp=7
DP_indel=10
GQ=20
AB_snp=0.15
AB_indel=0.2

# Plink filtering variables (results of this cell go to $vcf_output above)
geno_filter=0.1
mind_filter=0.0
maf_filter=0.0

# Containers (container_marp is set but not referenced by the command in this cell)
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
container_marp=/mnt/mfs/statgen/containers/marp.sif

# Argument string for `qc:1-2`. Plain double quotes replace the Python-style
# `"""`, which bash reads as an empty string next to an ordinary
# double-quoted string; the stored value is identical.
vcf_args="qc:1-2
    --cwd $vcf_output \
    --vcfs $vcfs \
    --ref_hg38 $ref_hg38 \
    --DP_snp $DP_snp \
    --DP_indel $DP_indel \
    --GQ $GQ \
    --AB_snp $AB_snp \
    --AB_indel $AB_indel \
    --geno_filter $geno_filter \
    --mind_filter $mind_filter \
    --maf_filter $maf_filter \
    --container_lmm $container_lmm
"

# Write the job script to $vcf_sbatch. Expansions are quoted so each path is
# passed as a single argument.
sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/test_verbose_titv_chr22_b0_2021-06-03.sh
INFO: Workflow csg (ID=w62c8f868f0750244) is executed successfully with 1 completed step.


## Test scripts for ti/tv problem

In [1]:
# Job-script generation for the ti/tv test on the chr22_b0 intersect VCF.

# Common variables
vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/testing_bcftools/chr22_b0_test-intersect.vcf.gz
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/VCF_QC_pipeline.ipynb

# Directories
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/testing_bcftools/
vcf_sbatch=../output/test_titv_chr22_b0_$(date +"%Y-%m-%d").sh
ref_hg38=/home/dmc2245/software/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa

# Variables for analysis
DP_snp=7
DP_indel=10
GQ=20
AB_snp=0.15
AB_indel=0.2

# Plink filtering variables (results of this cell go to $vcf_output above)
geno_filter=0.1
mind_filter=0.0
maf_filter=0.0

# Containers (container_marp is set but not referenced by the command in this cell)
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
container_marp=/mnt/mfs/statgen/containers/marp.sif

# Argument string for `qc:1-2`. Plain double quotes replace the Python-style
# `"""`, which bash reads as an empty string next to an ordinary
# double-quoted string; the stored value is identical.
vcf_args="qc:1-2
    --cwd $vcf_output \
    --vcfs $vcfs \
    --ref_hg38 $ref_hg38 \
    --DP_snp $DP_snp \
    --DP_indel $DP_indel \
    --GQ $GQ \
    --AB_snp $AB_snp \
    --AB_indel $AB_indel \
    --geno_filter $geno_filter \
    --mind_filter $mind_filter \
    --maf_filter $maf_filter \
    --container_lmm $container_lmm
"

# Write the job script to $vcf_sbatch. Expansions are quoted so each path is
# passed as a single argument.
sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/test_titv_chr22_b0_2021-06-01.sh
INFO: Workflow csg (ID=w55bb966ff77a4c03) is executed successfully with 1 completed step.


## Apply harsh filtering

In [2]:
# Job-script generation for the harsher-filter test (DP_snp=14, DP_indel=20)
# on the chr22_b0 intersect VCF.

# Common variables
vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/testing_bcftools/chr22_b0_test-intersect.vcf.gz
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/VCF_QC_pipeline.ipynb

# Directories
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/testing_bcftools/harsh_filter
vcf_sbatch=../output/harsh_test_titv_chr22_b0_$(date +"%Y-%m-%d").sh
ref_hg38=/home/dmc2245/software/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa

# Variables for analysis
DP_snp=14
DP_indel=20
GQ=20
AB_snp=0.15
AB_indel=0.2

# Plink filtering variables (results of this cell go to $vcf_output above)
geno_filter=0.1
mind_filter=0.0
maf_filter=0.0

# Containers (container_marp is set but not referenced by the command in this cell)
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
container_marp=/mnt/mfs/statgen/containers/marp.sif

# Argument string for `qc:1-2`. Plain double quotes replace the Python-style
# `"""`, which bash reads as an empty string next to an ordinary
# double-quoted string; the stored value is identical.
vcf_args="qc:1-2
    --cwd $vcf_output \
    --vcfs $vcfs \
    --ref_hg38 $ref_hg38 \
    --DP_snp $DP_snp \
    --DP_indel $DP_indel \
    --GQ $GQ \
    --AB_snp $AB_snp \
    --AB_indel $AB_indel \
    --geno_filter $geno_filter \
    --mind_filter $mind_filter \
    --maf_filter $maf_filter \
    --container_lmm $container_lmm
"

# Write the job script to $vcf_sbatch. Expansions are quoted so each path is
# passed as a single argument.
sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/harsh_test_titv_chr22_b0_2021-06-01.sh
INFO: Workflow csg (ID=w716ad32a92ca69dd) is executed successfully with 1 completed step.


## Use 150K samples (exclude those exome-sequenced with a different capture array)

In [1]:
# Job-script generation for the test on the chr22_b0 intersect VCF with the
# "50Kremoved" sample set.

# Common variables
vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/testing_bcftools/chr22_b0_test-intersect_50Kremoved.vcf.gz
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/VCF_QC_pipeline.ipynb

# Directories
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/testing_bcftools/150K_samples
vcf_sbatch=../output/samples150k_test_titv_chr22_b0_$(date +"%Y-%m-%d").sh
ref_hg38=/home/dmc2245/software/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa

# Variables for analysis
DP_snp=7
DP_indel=10
GQ=20
AB_snp=0.15
AB_indel=0.2

# Plink filtering variables (results of this cell go to $vcf_output above)
geno_filter=0.1
mind_filter=0.0
maf_filter=0.0

# Containers (container_marp is set but not referenced by the command in this cell)
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
container_marp=/mnt/mfs/statgen/containers/marp.sif

# Argument string for `qc:1-2`. Plain double quotes replace the Python-style
# `"""`, which bash reads as an empty string next to an ordinary
# double-quoted string; the stored value is identical.
vcf_args="qc:1-2
    --cwd $vcf_output \
    --vcfs $vcfs \
    --ref_hg38 $ref_hg38 \
    --DP_snp $DP_snp \
    --DP_indel $DP_indel \
    --GQ $GQ \
    --AB_snp $AB_snp \
    --AB_indel $AB_indel \
    --geno_filter $geno_filter \
    --mind_filter $mind_filter \
    --maf_filter $maf_filter \
    --container_lmm $container_lmm
"

# Write the job script to $vcf_sbatch. Expansions are quoted so each path is
# passed as a single argument.
sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/samples150k_test_titv_chr22_b0_2021-06-02.sh
INFO: Workflow csg (ID=w0e466be5a0bcb429) is executed successfully with 1 completed step.


## Separate snps from indels using new pipeline

In [None]:
# Job-script generation for the snps/indels test on the chr22_b0 intersect
# VCF with the "50Kremoved" sample set.

# Common variables
vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/testing_bcftools/chr22_b0_test-intersect_50Kremoved.vcf.gz
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/VCF_QC_pipeline.ipynb

# Directories
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/testing_bcftools/snps_indels
vcf_sbatch=../output/samples150k_snps_indels_test_titv_chr22_b0_$(date +"%Y-%m-%d").sh
ref_hg38=/home/dmc2245/software/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa

# Variables for analysis
DP_snp=7
DP_indel=10
GQ=20
AB_snp=0.15
AB_indel=0.2

# Plink filtering variables (results of this cell go to $vcf_output above)
geno_filter=0.1
mind_filter=0.0
maf_filter=0.0

# Containers (container_marp is set but not referenced by the command in this cell)
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
container_marp=/mnt/mfs/statgen/containers/marp.sif

# Argument string for `qc:1-2`. Plain double quotes replace the Python-style
# `"""`, which bash reads as an empty string next to an ordinary
# double-quoted string; the stored value is identical.
vcf_args="qc:1-2
    --cwd $vcf_output \
    --vcfs $vcfs \
    --ref_hg38 $ref_hg38 \
    --DP_snp $DP_snp \
    --DP_indel $DP_indel \
    --GQ $GQ \
    --AB_snp $AB_snp \
    --AB_indel $AB_indel \
    --geno_filter $geno_filter \
    --mind_filter $mind_filter \
    --maf_filter $maf_filter \
    --container_lmm $container_lmm
"

# Write the job script to $vcf_sbatch. Expansions are quoted so each path is
# passed as a single argument.
sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"

## Patch to change name using bcftools

In [3]:
# Job-script generation for the `reheader` workflow of patch_vcf_files.ipynb
# on chromosome 2 batches 63-70.
# File sets noted for chromosome 2:
#chromosome 2: ukb23156_c2_b{0..57}_v1.leftnorm.filtered.vcf.gz 
#chromosome 2: ukb23156_c2_b{63..70}_v1.leftnorm.filtered.vcf.gz 
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/patch_vcf_files.ipynb
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/cache
# The {63..70} range is stored literally: bash does not brace-expand the
# right-hand side of an assignment, so the pattern is passed on unchanged.
vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/cache/ukb23156_c2_b{63..70}_v1.leftnorm.filtered.vcf.gz
samples_name=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/samples_to_rename_vcf.txt
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
vcf_sbatch=../output/rename_chr2_63_70_vcfs_$(date +"%Y-%m-%d").sh

# Argument string for the workflow. Plain double quotes replace the
# Python-style `"""`, which bash reads as an empty string next to an ordinary
# double-quoted string; the stored value is identical.
vcf_args="reheader
    --cwd $vcf_output \
    --vcfs $vcfs \
    --samples_name $samples_name \
    --container_lmm $container_lmm
"

# Write the job script to $vcf_sbatch. Expansions are quoted so each path is
# passed as a single argument.
sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"


INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/rename_chr2_63_70_vcfs_2021-05-17.sh
INFO: Workflow csg (ID=we63ba3d399799dfd) is executed successfully with 1 completed step.


In [6]:
### Chromosome 20
# Job-script generation for the `reheader` workflow of patch_vcf_files.ipynb;
# this cell targets the single batch b9.
# File sets noted for chromosome 20:
#chromosome 20: ukb23156_c20_b{0..2}_v1.leftnorm.filtered.vcf.gz 
#chromosome 20: ukb23156_c20_b{11..24}_v1.leftnorm.filtered.vcf.gz
#chromosome 20: ukb23156_c20_b4_v1.leftnorm.filtered.vcf.gz 
#chromosome 20: ukb23156_c20_b9_v1.leftnorm.filtered.vcf.gz 
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/patch_vcf_files.ipynb
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/cache
vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/cache/ukb23156_c20_b9_v1.leftnorm.filtered.vcf.gz
samples_name=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/samples_to_rename_vcf.txt
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
vcf_sbatch=../output/rename_chr20_9_vcfs_$(date +"%Y-%m-%d").sh

# Argument string for the workflow. Plain double quotes replace the
# Python-style `"""`, which bash reads as an empty string next to an ordinary
# double-quoted string; the stored value is identical.
vcf_args="reheader
    --cwd $vcf_output \
    --vcfs $vcfs \
    --samples_name $samples_name \
    --container_lmm $container_lmm
"

# Write the job script to $vcf_sbatch. Expansions are quoted so each path is
# passed as a single argument.
sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/rename_chr20_9_vcfs_2021-05-17.sh
INFO: Workflow csg (ID=w2ae776a28d0c444d) is executed successfully with 1 completed step.
