## Scripts for pVCF QC UKBB

In [1]:
# Common variables
# Pattern for the input pVCF files. Pathname expansion is not performed on the
# right-hand side of an assignment, so the "*" is stored literally here.
vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/ukb23156_c*
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/VCF_QC_pipeline.ipynb

# Directories
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files
# Job script to generate; the name is stamped with today's date (YYYY-MM-DD).
vcf_sbatch=../output/vcf_all_chromosomes_qc_$(date +"%Y-%m-%d").sh
# Built from $user_path like the other user-specific paths (same value as before).
ref_hg38=$user_path/software/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa

# Variables for analysis
# Each value is forwarded to the `qc` workflow through the option of the same
# name (--DP_snp, --DP_indel, --GQ, --AB_snp, --AB_indel, --geno_filter,
# --mind_filter, --maf_filter).
DP_snp=7
DP_indel=10
GQ=20
AB_snp=0.15
AB_indel=0.2
geno_filter=0.1
mind_filter=0.1
maf_filter=0.0

# Containers
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
# NOTE(review): container_marp is not used by the `qc` command in this notebook
# section; confirm whether another cell still needs it.
container_marp=/mnt/mfs/statgen/containers/marp.sif

In [2]:
# Arguments for the `qc` workflow, assembled as ONE string for --args below.
# - Inside double quotes $vcfs is expanded but its "*" is NOT globbed here, so
#   the pattern reaches the generated script unchanged.
# - A backslash-newline inside double quotes is removed, so the option lines are
#   joined into a single line.
# NOTE(review): the newline right after "qc" (no trailing backslash) and the
# final newline are kept in the string, as in the original; confirm that
# Get_Job_Script tolerates them.
vcf_args="qc
    --cwd $vcf_output \
    --vcfs $vcfs \
    --ref_hg38 $ref_hg38 \
    --DP_snp $DP_snp \
    --DP_indel $DP_indel \
    --GQ $GQ \
    --AB_snp $AB_snp \
    --AB_indel $AB_indel \
    --geno_filter $geno_filter \
    --mind_filter $mind_filter \
    --maf_filter $maf_filter \
    --container_lmm $container_lmm
"

# Run the `csg` step of Get_Job_Script to write the job script to $vcf_sbatch.
# Every expansion is quoted so each value is passed as exactly one argument,
# with no word splitting or globbing.
sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/vcf_all_chromosomes_qc_2021-05-05.sh
INFO: Workflow csg (ID=w16bcb76fc7b630d9) is executed successfully with 1 completed step.


## Patch to rename samples in the VCF files

In [3]:
# Chromosome 2 file sets noted for this patch (this run targets b{63..70}):
#   ukb23156_c2_b{0..57}_v1.leftnorm.filtered.vcf.gz
#   ukb23156_c2_b{63..70}_v1.leftnorm.filtered.vcf.gz
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/patch_vcf_files.ipynb
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/cache
# Quoted so the {63..70} pattern is stored literally and passed on unexpanded.
vcfs="/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/cache/ukb23156_c2_b{63..70}_v1.leftnorm.filtered.vcf.gz"
samples_name=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/samples_to_rename_vcf.txt
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
# Job script to generate; the name is stamped with today's date (YYYY-MM-DD).
vcf_sbatch=../output/rename_chr2_63_70_vcfs_$(date +"%Y-%m-%d").sh

# Arguments for the `reheader` workflow, assembled as ONE string for --args.
# A backslash-newline inside double quotes is removed, so the option lines are
# joined; the newline after "reheader" and the final newline are kept.
vcf_args="reheader
    --cwd $vcf_output \
    --vcfs $vcfs \
    --samples_name $samples_name \
    --container_lmm $container_lmm
"

# Run the `csg` step of Get_Job_Script to write the job script to $vcf_sbatch.
# Every expansion is quoted so each value is passed as exactly one argument.
sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"


INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/rename_chr2_63_70_vcfs_2021-05-17.sh
INFO: Workflow csg (ID=we63ba3d399799dfd) is executed successfully with 1 completed step.


In [6]:
### Chromosome 20
# Chromosome 20 file sets noted for this patch (this run targets b9):
#   ukb23156_c20_b{0..2}_v1.leftnorm.filtered.vcf.gz
#   ukb23156_c20_b{11..24}_v1.leftnorm.filtered.vcf.gz
#   ukb23156_c20_b4_v1.leftnorm.filtered.vcf.gz
#   ukb23156_c20_b9_v1.leftnorm.filtered.vcf.gz
user_path=/home/dmc2245
tpl_file=$user_path/project/bioworkflows/admin/csg.yml
vcf_sos=$user_path/project/UKBB_GWAS_dev/workflow/patch_vcf_files.ipynb
vcf_output=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/cache
vcfs=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/cache/ukb23156_c20_b9_v1.leftnorm.filtered.vcf.gz
samples_name=/mnt/mfs/statgen/UKBiobank/data/exome_files/project_VCF/plink_files/samples_to_rename_vcf.txt
container_lmm=/mnt/mfs/statgen/containers/lmm.sif
# Job script to generate; the name is stamped with today's date (YYYY-MM-DD).
vcf_sbatch=../output/rename_chr20_9_vcfs_$(date +"%Y-%m-%d").sh

# Arguments for the `reheader` workflow, assembled as ONE string for --args.
# A backslash-newline inside double quotes is removed, so the option lines are
# joined; the newline after "reheader" and the final newline are kept.
vcf_args="reheader
    --cwd $vcf_output \
    --vcfs $vcfs \
    --samples_name $samples_name \
    --container_lmm $container_lmm
"

# Run the `csg` step of Get_Job_Script to write the job script to $vcf_sbatch.
# Every expansion is quoted so each value is passed as exactly one argument.
sos run ~/project/bioworkflows/admin/Get_Job_Script.ipynb csg \
    --template-file "$tpl_file" \
    --workflow-file "$vcf_sos" \
    --to-script "$vcf_sbatch" \
    --args "$vcf_args"

INFO: Running csg: Configuration for Columbia csg partition cluster
INFO: csg is completed.
INFO: csg output:   ../output/rename_chr20_9_vcfs_2021-05-17.sh
INFO: Workflow csg (ID=w2ae776a28d0c444d) is executed successfully with 1 completed step.
