## Understand filtering strategy for the RAP 500K WES data

In [2]:
## Load the BCFTOOLS/1.16 environment module so the bcftools commands below are on the PATH
module load BCFTOOLS/1.16




In [None]:
## Create a random sample of the vcf file (vcfrandomsample rate 0.012).
## The sample is bgzip-compressed and written as chr3_mwe.subset.vcf.gz because that
## is the file every later cell reads; the plain .vcf written previously was never
## the file the downstream commands opened.
bcftools view /mnt/vast/hpc/csg/UKBiobank/data/exome_files/project_VCF/ukb23156_c3_b12_v1.vcf.gz | \
    vcfrandomsample -r 0.012 | \
    bcftools view -O z -o ~/test/bcftools/chr3_mwe.subset.vcf.gz -

[MetaKernelApp] ERROR | KeyboardInterrupt caught in kernel.


In [4]:
## Extract ten specific samples from the random subset into a new bgzipped VCF.
## --force-samples lets the command continue if a listed sample is absent from the input.
bcftools view \
    --force-samples \
    --samples 1434748,5523981,5023838,4023729,4442146,5654789,4515669,1129683,5327043,4744741 \
    --output-type z \
    --output ~/test/bcftools/chr3_mwe.subset_10samples.vcf.gz \
    ~/test/bcftools/chr3_mwe.subset.vcf.gz

In [4]:
## Move into the working directory and list its contents
## (note: this changes the current directory for all following cells)
cd ~/test/bcftools/ && ls

[0m[01;31mchr3_mwe.subset.vcf.gz[0m      chr3_mwe.subset.vcf.gz.tbi
chr3_mwe.subset.vcf.gz.csi  [01;31mchr3_mwe.subset_10samples.vcf.gz[0m



In [2]:
## Count the records in the subset VCF; the header is suppressed so only variant lines are counted
bcftools view --no-header ~/test/bcftools/chr3_mwe.subset.vcf.gz \
    | wc -l

206



In [5]:
## Build a tabix (.tbi) index for the 10-sample VCF, overwriting any index that already exists
bcftools index --tbi --force \
    ~/test/bcftools/chr3_mwe.subset_10samples.vcf.gz




In [6]:
## Move into the working directory and list its contents to confirm the new index file
## (note: this changes the current directory for all following cells)
cd ~/test/bcftools/ && ls

[0m[01;31mchr3_mwe.subset.vcf.gz[0m      [01;31mchr3_mwe.subset_10samples.vcf.gz[0m
chr3_mwe.subset.vcf.gz.csi  chr3_mwe.subset_10samples.vcf.gz.tbi
chr3_mwe.subset.vcf.gz.tbi



In [7]:
## List the sample IDs contained in the 10-sample VCF
bcftools query --list-samples \
    ~/test/bcftools/chr3_mwe.subset_10samples.vcf.gz

1434748
5523981
5023838
4023729
4442146
5654789
4515669
1129683
5327043
4744741



In [8]:
## Report the number of variant records, as recorded in the index of the 10-sample VCF
bcftools index --nrecords \
    ~/test/bcftools/chr3_mwe.subset_10samples.vcf.gz

206



In [9]:
## Print CHROM, POS, REF, ALT followed by one GT/AD/DP field per sample.
## The tab separator sits at the start of the per-sample block ([\t...]) so that
## rows no longer end with a stray trailing tab.
bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\tGT=%GT:AD=%AD:DP=%DP]\n' ~/test/bcftools/chr3_mwe.subset_10samples.vcf.gz

chr3	43343244	C	T	GT=0/0:AD=12,0:DP=12	GT=0/0:AD=30,0:DP=30	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=14,0:DP=14	GT=0/0:AD=19,0:DP=19	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=19,0:DP=19	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=16,0:DP=16	
chr3	43343506	G	GT	GT=0/0:AD=25,0:DP=25	GT=0/0:AD=30,0:DP=30	GT=./.:AD=24,0:DP=24	GT=0/0:AD=25,0:DP=25	GT=0/0:AD=19,0:DP=19	GT=0/0:AD=30,0:DP=30	GT=0/0:AD=19,0:DP=19	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=21,0:DP=21	GT=0/0:AD=26,0:DP=26	
chr3	43347372	G	A	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=22,0:DP=22	GT=0/0:AD=25,0:DP=25	GT=0/0:AD=21,0:DP=21	GT=0/0:AD=18,0:DP=18	GT=0/0:AD=17,0:DP=17	GT=0/0:AD=24,0:DP=24	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=28,0:DP=28	GT=0/0:AD=25,0:DP=25	
chr3	43348524	A	T	GT=0/0:AD=23,0:DP=23	GT=0/0:AD=19,0:DP=19	GT=0/0:AD=21,0:DP=21	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=17,0:DP=17	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=24,0:DP=24	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=16,0:DP=16	
chr3	43366896	C	T	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=21,0:DP=21	GT

In [10]:
## Left normalize and split multiallelic variants, then set ID to CHROM:POS:REF:ALT.
##  1. norm -m-any        : split multiallelic records into one record per ALT allele
##  2. norm --check-ref w : left-align against the reference FASTA, warning on REF mismatches
##  3. annotate -x ID -I  : drop existing IDs and assign CHROM:POS:REF:ALT
## Intermediate stages stream uncompressed BCF (-Ou) so nothing is compressed just to be
## decompressed by the next command. The result is written to an explicit path under
## ~/test/bcftools/ (where the following cells read it) rather than to whatever the
## current directory happens to be.
bcftools norm -m-any -Ou ~/test/bcftools/chr3_mwe.subset_10samples.vcf.gz | \
    bcftools norm --check-ref w -f ~/test/bcftools/GRCh38_full_analysis_set_plus_decoy_hla.fa -Ou | \
    bcftools annotate -x ID -I +'%CHROM:%POS:%REF:%ALT' -Oz \
        -o ~/test/bcftools/chr3_mwe.subset_10samples.leftnorm.vcf.gz

Lines   total/split/realigned/skipped:	206/23/0/0
Lines   total/split/realigned/skipped:	234/0/12/0



In [22]:
## Build a tabix (.tbi) index for the normalized VCF, overwriting any index that already exists
bcftools index --tbi --force \
    ~/test/bcftools/chr3_mwe.subset_10samples.leftnorm.vcf.gz




In [12]:
## The normalization log reported 23 multiallelic records that had to be split and
## 12 indels that had to be realigned; count the records in the normalized VCF
bcftools view --no-header ~/test/bcftools/chr3_mwe.subset_10samples.leftnorm.vcf.gz \
    | wc -l

234



In [13]:
## Print CHROM, POS, REF, ALT followed by one GT/AD/DP field per sample for the normalized VCF.
## The tab separator sits at the start of the per-sample block ([\t...]) so that
## rows no longer end with a stray trailing tab.
bcftools query -f '%CHROM\t%POS\t%REF\t%ALT[\tGT=%GT:AD=%AD:DP=%DP]\n' ~/test/bcftools/chr3_mwe.subset_10samples.leftnorm.vcf.gz

chr3	43343244	C	T	GT=0/0:AD=12,0:DP=12	GT=0/0:AD=30,0:DP=30	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=14,0:DP=14	GT=0/0:AD=19,0:DP=19	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=19,0:DP=19	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=16,0:DP=16	
chr3	43343506	G	GT	GT=0/0:AD=25,0:DP=25	GT=0/0:AD=30,0:DP=30	GT=./.:AD=24,0:DP=24	GT=0/0:AD=25,0:DP=25	GT=0/0:AD=19,0:DP=19	GT=0/0:AD=30,0:DP=30	GT=0/0:AD=19,0:DP=19	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=21,0:DP=21	GT=0/0:AD=26,0:DP=26	
chr3	43347372	G	A	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=22,0:DP=22	GT=0/0:AD=25,0:DP=25	GT=0/0:AD=21,0:DP=21	GT=0/0:AD=18,0:DP=18	GT=0/0:AD=17,0:DP=17	GT=0/0:AD=24,0:DP=24	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=28,0:DP=28	GT=0/0:AD=25,0:DP=25	
chr3	43348524	A	T	GT=0/0:AD=23,0:DP=23	GT=0/0:AD=19,0:DP=19	GT=0/0:AD=21,0:DP=21	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=17,0:DP=17	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=24,0:DP=24	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=16,0:DP=16	
chr3	43366896	C	T	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=16,0:DP=16	GT=0/0:AD=21,0:DP=21	GT

In [11]:
## Keep only sites with an allele count of at least 1, stream them as uncompressed BCF,
## and write a tab-separated site table (CHROM, POS, REF, ALT, AC, AN with '.' placeholders)
bcftools view --min-ac 1 --output-type u ~/test/bcftools/chr3_mwe.subset.vcf.gz \
    | bcftools query \
        --format '%CHROM\t%POS\t.\t%REF\t%ALT\t.\t.\t.\t%AC\t%AN\n' \
        --output ~/test/bcftools/test




In [None]:
## Keep only records whose FILTER is PASS in the ukb23156_c17_b51 VCF and write a
## tab-separated site table (CHROM, POS, REF, ALT, AC, AN with '.' placeholders).
## The table goes to its own file: writing to ~/test/bcftools/test would overwrite
## the table produced from chr3_mwe.subset.vcf.gz by the cell before this one.
bcftools view -f PASS -O u /mnt/vast/hpc/csg/UKBiobank/data/exome_files/project_VCF/ukb23156_c17_b51_v1.vcf.gz | \
    bcftools query \
        -f '%CHROM\t%POS\t.\t%REF\t%ALT\t.\t.\t.\t%AC\t%AN\n' \
        -o ~/test/bcftools/ukb23156_c17_b51_v1.PASS.sites.tsv