## Test code for bcftools

## Load bcftools on the cluster

In [36]:
module load BCFTOOLS/1.16

## Generated a random sample of the vcfFile

In [1]:
bcftools view /mnt/vast/hpc/csg/UKBiobank/data/exome_files/project_VCF/ukb23156_c3_b12_v1.vcf.gz| vcfrandomsample -r 0.012 > ~/test/bcftools/chr3_mwe.subset.vcf




## Look at how many variants are left

In [2]:
bcftools view -H ~/test/bcftools/chr3_mwe.subset.vcf.gz | wc -l

206


## Compress and index VCF test file

In [None]:
# compress vcf
bgzip ~/test/bcftools/chr3_mwe.subset.vcf
# index vcf
bcftools index ~/test/bcftools/chr3_mwe.subset.vcf.gz

## Subset only 10 samples

In [None]:
bcftools view --force-samples -o chr3_mwe.subset_10samples.vcf.gz -O z -s 1434748,5523981,5023838,4023729,4442146,5654789,4515669,1129683,5327043,4744741 ~/test/bcftools/chr3_mwe.subset.vcf.gz

## Ref genome Hg38

In [None]:
~/test/bcftools/GRCh38_full_analysis_set_plus_decoy_hla.fa

## 1. Split multiallelic variants and leftnormalize indels 

In [19]:
bcftools norm -m-any ~/test/bcftools/chr3_mwe.subset_10samples.vcf.gz |\
bcftools norm --check-ref w -f ~/test/bcftools/GRCh38_full_analysis_set_plus_decoy_hla.fa -Oz > ~/test/bcftools/test.norm.vcf.gz

Lines   total/split/realigned/skipped:	206/23/0/0
Lines   total/split/realigned/skipped:	234/0/12/0


The following command line is from https://github.com/mrcepid-rap/mrcepid-filterbcf

In [11]:
bcftools norm --threads 4 -Oz -o ~/test/bcftools/test.mrcepid.norm.vcf.gz -m - -f ~/test/bcftools/GRCh38_full_analysis_set_plus_decoy_hla.fa \
~/test/bcftools/chr3_mwe.subset_10samples.vcf.gz 

Lines   total/split/realigned/skipped:	206/23/12/0


Both commands produce the same result

In [18]:
bcftools query -f'%CHROM\t%POS\t%REF\t%ALT\tAC=%AC\tAN=%AN\t%INFO\n' ~/test/bcftools/test.norm.vcf.gz | grep -v "^#" | wc -l

234


In [17]:
bcftools query -f'%CHROM\t%POS\t%REF\t%ALT\tAC=%AC\tAN=%AN\t%INFO\n' ~/test/bcftools/test.mrcepid.norm.vcf.gz | grep -v "^#" | wc -l

234


## 2. Filter variants

### Check per-sample genotypes

In [65]:
bcftools filter -Oz -o ~/test/bcftools/test.norm.filtered.snp_indel.vcf.gz -S . \
          -i '(TYPE="snp" & FMT/DP >= 7 & ((FMT/GT="RR" & FMT/GQ >= 20) |  
          (FMT/GT="RA" & FMT/GQ >= 20 & ((FORMAT/AD[*:1])/(FORMAT/DP))>=0.15) | 
          (FMT/GT="AA" & FMT/GQ >= 20))) | 
          (TYPE="indel" & FMT/DP >= 10 & FMT/GQ >= 20 & ((FMT/GT="RR") | 
          (FMT/GT="RA" & ((FORMAT/AD[*:1])/(FORMAT/DP))>=0.20) | 
          (FMT/GT="AA")))' ~/test/bcftools/test.norm.vcf.gz

The command line below is the one available at the mrcepid github repo

In [74]:
bcftools filter -Oz -o ~/test/bcftools/test.mrcepid.norm.filtered.snp_indel.vcf.gz -S . \
          -i '(TYPE="snp" & FMT/DP >= 7 & ((FMT/GT="RR" & FMT/GQ >= 20) |  
          (FMT/GT="RA" & FMT/GQ >= 20 & binom(FMT/AD) > 0.001) | 
          (FMT/GT="AA"))) | 
          (TYPE="indel" & FMT/DP >= 10 & FMT/GQ >= 20)' ~/test/bcftools/test.norm.vcf.gz

In [48]:
bcftools filter -Oz -o ~/test/bcftools/test.mrcepid.norm.filtered.vcf.gz -S . -i 'TYPE="snp" & FMT/DP >= 7 & (FMT/GT="RA" & FMT/GQ >= 20 & binom(FMT/AD) > 0.001)' ~/test/bcftools/test.mrcepid.norm.vcf.gz

In [58]:
bcftools filter -Oz -o ~/test/bcftools/test.mrcepid.norm.filtered.AB.vcf.gz -S . -i 'TYPE="snp" & FMT/DP >= 7 & (FMT/GT="RA" & FMT/GQ >= 20 & ((FORMAT/AD[*:1])/(FORMAT/DP))>=0.15)' ~/test/bcftools/test.mrcepid.norm.vcf.gz

In [50]:
bcftools query -f'%CHROM\t%POS\t%REF\t%ALT\t[AD=%AD DP=%DP GT=%GT GQ=%GQ ]\n' ~/test/bcftools/test.mrcepid.norm.vcf.gz

chr3	43343244	C	T	AD=12,0 DP=12 GT=0/0 GQ=36 AD=30,0 DP=30 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=14,0 DP=14 GT=0/0 GQ=42 AD=19,0 DP=19 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=19,0 DP=19 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=16,0 DP=16 GT=0/0 GQ=48 AD=16,0 DP=16 GT=0/0 GQ=48 
chr3	43343506	G	GT	AD=25,0 DP=25 GT=0/0 GQ=48 AD=30,0 DP=30 GT=0/0 GQ=50 AD=24,0 DP=24 GT=./. GQ=0 AD=25,0 DP=25 GT=0/0 GQ=48 AD=19,0 DP=19 GT=0/0 GQ=50 AD=30,0 DP=30 GT=0/0 GQ=30 AD=19,0 DP=19 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=21,0 DP=21 GT=0/0 GQ=48 AD=26,0 DP=26 GT=0/0 GQ=18 
chr3	43347372	G	A	AD=16,0 DP=16 GT=0/0 GQ=48 AD=22,0 DP=22 GT=0/0 GQ=50 AD=25,0 DP=25 GT=0/0 GQ=50 AD=21,0 DP=21 GT=0/0 GQ=50 AD=18,0 DP=18 GT=0/0 GQ=50 AD=17,0 DP=17 GT=0/0 GQ=50 AD=24,0 DP=24 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=28,0 DP=28 GT=0/0 GQ=50 AD=25,0 DP=25 GT=0/0 GQ=50 
chr3	43348524	A	T	AD=23,0 DP=23 GT=0/0 GQ=50 AD=19,0 DP=19 GT=0/0 GQ=50 AD=21,0 DP=21 GT=0/0 GQ=48 AD=16,0 DP=16 GT=0/0 GQ=48 AD=17,0

In [51]:
bcftools query -f'%CHROM\t%POS\t%REF\t%ALT\t[AD=%AD DP=%DP GT=%GT GQ=%GQ ]\n' ~/test/bcftools/test.mrcepid.norm.filtered.vcf.gz

chr3	43343244	C	T	AD=12,0 DP=12 GT=./. GQ=36 AD=30,0 DP=30 GT=./. GQ=50 AD=16,0 DP=16 GT=./. GQ=48 AD=14,0 DP=14 GT=./. GQ=42 AD=19,0 DP=19 GT=./. GQ=50 AD=16,0 DP=16 GT=./. GQ=48 AD=19,0 DP=19 GT=./. GQ=50 AD=16,0 DP=16 GT=./. GQ=48 AD=16,0 DP=16 GT=./. GQ=48 AD=16,0 DP=16 GT=./. GQ=48 
chr3	43343506	G	GT	AD=25,0 DP=25 GT=./. GQ=48 AD=30,0 DP=30 GT=./. GQ=50 AD=24,0 DP=24 GT=./. GQ=0 AD=25,0 DP=25 GT=./. GQ=48 AD=19,0 DP=19 GT=./. GQ=50 AD=30,0 DP=30 GT=./. GQ=30 AD=19,0 DP=19 GT=./. GQ=50 AD=16,0 DP=16 GT=./. GQ=48 AD=21,0 DP=21 GT=./. GQ=48 AD=26,0 DP=26 GT=./. GQ=18 
chr3	43347372	G	A	AD=16,0 DP=16 GT=./. GQ=48 AD=22,0 DP=22 GT=./. GQ=50 AD=25,0 DP=25 GT=./. GQ=50 AD=21,0 DP=21 GT=./. GQ=50 AD=18,0 DP=18 GT=./. GQ=50 AD=17,0 DP=17 GT=./. GQ=50 AD=24,0 DP=24 GT=./. GQ=50 AD=16,0 DP=16 GT=./. GQ=48 AD=28,0 DP=28 GT=./. GQ=50 AD=25,0 DP=25 GT=./. GQ=50 
chr3	43348524	A	T	AD=23,0 DP=23 GT=./. GQ=50 AD=19,0 DP=19 GT=./. GQ=50 AD=21,0 DP=21 GT=./. GQ=48 AD=16,0 DP=16 GT=./. GQ=48 AD=17,0

In [57]:
bcftools query -f'%CHROM\t%POS\t%REF\t%ALT\t[AD=%AD DP=%DP GT=%GT GQ=%GQ ]\n' ~/test/bcftools/test.mrcepid.norm.filtered.AB.vcf.gz

chr3	43343244	C	T	AD=12,0 DP=12 GT=./. GQ=36 AD=30,0 DP=30 GT=./. GQ=50 AD=16,0 DP=16 GT=./. GQ=48 AD=14,0 DP=14 GT=./. GQ=42 AD=19,0 DP=19 GT=./. GQ=50 AD=16,0 DP=16 GT=./. GQ=48 AD=19,0 DP=19 GT=./. GQ=50 AD=16,0 DP=16 GT=./. GQ=48 AD=16,0 DP=16 GT=./. GQ=48 AD=16,0 DP=16 GT=./. GQ=48 
chr3	43343506	G	GT	AD=25,0 DP=25 GT=./. GQ=48 AD=30,0 DP=30 GT=./. GQ=50 AD=24,0 DP=24 GT=./. GQ=0 AD=25,0 DP=25 GT=./. GQ=48 AD=19,0 DP=19 GT=./. GQ=50 AD=30,0 DP=30 GT=./. GQ=30 AD=19,0 DP=19 GT=./. GQ=50 AD=16,0 DP=16 GT=./. GQ=48 AD=21,0 DP=21 GT=./. GQ=48 AD=26,0 DP=26 GT=./. GQ=18 
chr3	43347372	G	A	AD=16,0 DP=16 GT=./. GQ=48 AD=22,0 DP=22 GT=./. GQ=50 AD=25,0 DP=25 GT=./. GQ=50 AD=21,0 DP=21 GT=./. GQ=50 AD=18,0 DP=18 GT=./. GQ=50 AD=17,0 DP=17 GT=./. GQ=50 AD=24,0 DP=24 GT=./. GQ=50 AD=16,0 DP=16 GT=./. GQ=48 AD=28,0 DP=28 GT=./. GQ=50 AD=25,0 DP=25 GT=./. GQ=50 
chr3	43348524	A	T	AD=23,0 DP=23 GT=./. GQ=50 AD=19,0 DP=19 GT=./. GQ=50 AD=21,0 DP=21 GT=./. GQ=48 AD=16,0 DP=16 GT=./. GQ=48 AD=17,0

In [66]:
bcftools query -f'%CHROM\t%POS\t%REF\t%ALT\t[AD=%AD DP=%DP GT=%GT GQ=%GQ ]\n' ~/test/bcftools/test.norm.filtered.snp_indel.vcf.gz

chr3	43343244	C	T	AD=12,0 DP=12 GT=0/0 GQ=36 AD=30,0 DP=30 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=14,0 DP=14 GT=0/0 GQ=42 AD=19,0 DP=19 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=19,0 DP=19 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=16,0 DP=16 GT=0/0 GQ=48 AD=16,0 DP=16 GT=0/0 GQ=48 
chr3	43343506	G	GT	AD=25,0 DP=25 GT=0/0 GQ=48 AD=30,0 DP=30 GT=0/0 GQ=50 AD=24,0 DP=24 GT=./. GQ=0 AD=25,0 DP=25 GT=0/0 GQ=48 AD=19,0 DP=19 GT=0/0 GQ=50 AD=30,0 DP=30 GT=0/0 GQ=30 AD=19,0 DP=19 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=21,0 DP=21 GT=0/0 GQ=48 AD=26,0 DP=26 GT=./. GQ=18 
chr3	43347372	G	A	AD=16,0 DP=16 GT=0/0 GQ=48 AD=22,0 DP=22 GT=0/0 GQ=50 AD=25,0 DP=25 GT=0/0 GQ=50 AD=21,0 DP=21 GT=0/0 GQ=50 AD=18,0 DP=18 GT=0/0 GQ=50 AD=17,0 DP=17 GT=0/0 GQ=50 AD=24,0 DP=24 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=28,0 DP=28 GT=0/0 GQ=50 AD=25,0 DP=25 GT=0/0 GQ=50 
chr3	43348524	A	T	AD=23,0 DP=23 GT=0/0 GQ=50 AD=19,0 DP=19 GT=0/0 GQ=50 AD=21,0 DP=21 GT=0/0 GQ=48 AD=16,0 DP=16 GT=0/0 GQ=48 AD=17,0

In [73]:
bcftools query -f'%CHROM\t%POS\t%REF\t%ALT\t[AD=%AD DP=%DP GT=%GT GQ=%GQ ]\n' ~/test/bcftools/test.norm.filltag.vcf.gz

chr3	43343244	C	T	AD=12,0 DP=12 GT=0/0 GQ=36 AD=30,0 DP=30 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=14,0 DP=14 GT=0/0 GQ=42 AD=19,0 DP=19 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=19,0 DP=19 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=16,0 DP=16 GT=0/0 GQ=48 AD=16,0 DP=16 GT=0/0 GQ=48 
chr3	43343506	G	GT	AD=25,0 DP=25 GT=0/0 GQ=48 AD=30,0 DP=30 GT=0/0 GQ=50 AD=24,0 DP=24 GT=./. GQ=0 AD=25,0 DP=25 GT=0/0 GQ=48 AD=19,0 DP=19 GT=0/0 GQ=50 AD=30,0 DP=30 GT=0/0 GQ=30 AD=19,0 DP=19 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=21,0 DP=21 GT=0/0 GQ=48 AD=26,0 DP=26 GT=0/0 GQ=18 
chr3	43347372	G	A	AD=16,0 DP=16 GT=0/0 GQ=48 AD=22,0 DP=22 GT=0/0 GQ=50 AD=25,0 DP=25 GT=0/0 GQ=50 AD=21,0 DP=21 GT=0/0 GQ=50 AD=18,0 DP=18 GT=0/0 GQ=50 AD=17,0 DP=17 GT=0/0 GQ=50 AD=24,0 DP=24 GT=0/0 GQ=50 AD=16,0 DP=16 GT=0/0 GQ=48 AD=28,0 DP=28 GT=0/0 GQ=50 AD=25,0 DP=25 GT=0/0 GQ=50 
chr3	43348524	A	T	AD=23,0 DP=23 GT=0/0 GQ=50 AD=19,0 DP=19 GT=0/0 GQ=50 AD=21,0 DP=21 GT=0/0 GQ=48 AD=16,0 DP=16 GT=0/0 GQ=48 AD=17,0

### 2. Check the proportion of missing genotypes per each variant

First we calculate per-allele missingness using the bcftools plugin `bcftools +fill-tags`

In [2]:
bcftools +fill-tags ~/test/bcftools/test.norm.filtered.snp_indel.vcf.gz -Oz -o \
          ~/test/bcftools/test.norm.filtered.tagged.snp_indel.vcf.gz -- -t F_MISSING,AC,AF,AN

In [3]:
bcftools query -f'%CHROM\t%POS\t%REF\t%ALT\tF_MISSING=%F_MISSING\tAC=%AC\tAF=%AF\n' ~/test/bcftools/test.norm.filtered.tagged.snp_indel.vcf.gz

chr3	43343244	C	T	F_MISSING=0	AC=0	AF=0
chr3	43343506	G	GT	F_MISSING=0.2	AC=0	AF=0
chr3	43347372	G	A	F_MISSING=0	AC=0	AF=0
chr3	43348524	A	T	F_MISSING=0	AC=0	AF=0
chr3	43366896	C	T	F_MISSING=0	AC=0	AF=0
chr3	43366925	C	T	F_MISSING=0	AC=0	AF=0
chr3	43366971	T	C	F_MISSING=0	AC=0	AF=0
chr3	43432603	G	C	F_MISSING=0	AC=0	AF=0
chr3	43432603	G	GCT	F_MISSING=0	AC=0	AF=0
chr3	43555211	T	C	F_MISSING=0	AC=0	AF=0
chr3	43576722	G	A	F_MISSING=0	AC=0	AF=0
chr3	43577344	T	C	F_MISSING=0.1	AC=0	AF=0
chr3	43598749	T	C	F_MISSING=0.5	AC=0	AF=0
chr3	43605787	T	C	F_MISSING=0	AC=0	AF=0
chr3	43690963	A	G	F_MISSING=0	AC=0	AF=0
chr3	43690965	A	G	F_MISSING=0	AC=0	AF=0
chr3	43690986	G	A	F_MISSING=0	AC=0	AF=0
chr3	43715040	G	A	F_MISSING=0	AC=0	AF=0
chr3	43718347	C	G	F_MISSING=0.5	AC=0	AF=0
chr3	43718432	T	C	F_MISSING=0	AC=0	AF=0
chr3	44242040	C	G	F_MISSING=0	AC=0	AF=0
chr3	44242048	G	C	F_MISSING=0	AC=0	AF=0
chr3	44242048	G	A	F_MISSING=0	AC=0	AF=0
chr3	44242318	G	A	F_MISSING=0	AC=0	AF=0
chr3	44243738	A	G	F_MISSING=0

## Investigate known issue with genotype quality distribution

In [4]:
cp ~/UKBiobank/data/exome_files/project_VCF/ukb23156_c1_b23_v1.vcf.gz ~/test/bcftools/

In [6]:
# Create index of test file
bcftools index ~/test/bcftools/ukb23156_c1_b23_v1.vcf.gz

In [7]:
bcftools view -r chr1:32191128-32210310  -v snps -Ou -f PASS,. ~/test/bcftools/ukb23156_c1_b23_v1.vcf.gz | bcftools query -f '[%GT %GQ\n]' | gzip -c > ~/test/bcftools/ukbb-snp-gqs.txt.gz

In [8]:
bcftools view -r chr1:32191128-32210310  -v indels -Ou -f PASS,. ~/test/bcftools/ukb23156_c1_b23_v1.vcf.gz | bcftools query -f '[%GT %GQ\n]' | gzip -c > ~/test/bcftools/ukbb-indel-gqs.txt.gz

In [9]:
zcat ~/test/bcftools/ukbb-snp-gqs.txt.gz | sort | uniq -c > ~/test/bcftools/snp.counts.txt
zcat ~/test/bcftools/ukbb-indel-gqs.txt.gz | sort | uniq -c > ~/test/bcftools/indel.counts.txt

In [None]:
library(ggplot2)
read.counts <- function(fname,variant) {
    dat <- read.table(fname,as.is=TRUE,col.names=c('count','gt','gq'),na.strings='.')
    denominator <- with(dat,tapply(count,gt,sum))
    dat$frequency <- with(dat,count/denominator[gt])
    dat$variant <- variant
    dat
}
dat <- rbind(read.counts('snp.counts.txt','snp'),read.counts('indel.counts.txt','indel'))
dat$variant <- factor(dat$variant,levels=c('snp','indel'))
ggplot(subset(dat,gt%in%c('0/0','0/1','1/1')),aes(x=gq,y=frequency,col=gt))  + geom_line()+facet_wrap(~variant)
ggsave("ukbb-gq.png",width=8,height=4)

In [10]:
cp ~/UKBiobank/data/exome_files/project_VCF/ukb23156_c22_b20_v1.vcf.gz ~/test/bcftools/

In [2]:
bcftools index ~/test/bcftools/ukb23156_c22_b20_v1.vcf.gz

[E::main_vcfindex] the index file exists. Please use '-f' to overwrite /home/dmc2245/test/bcftools/ukb23156_c22_b20_v1.vcf.gz.csi


: 1

In [4]:
bcftools view -r chr22:49920058-49940058  -v snps -Ou -f PASS,. ~/test/bcftools/ukb23156_c22_b20_v1.vcf.gz  | bcftools query -f '[%GT %GQ\n]' | gzip -c > ~/test/bcftools/ukbb-snp-gqs22.txt.gz 

In [5]:
bcftools view -r chr22:49920058-49940058  -v indels -Ou -f PASS,. ~/test/bcftools/ukb23156_c22_b20_v1.vcf.gz | bcftools query -f '[%GT %GQ\n]' | gzip -c > ~/test/bcftools/ukbb-indel-gqs22.txt.gz

In [6]:
zcat ~/test/bcftools/ukbb-snp-gqs22.txt.gz | sort | uniq -c > ~/test/bcftools/snp.counts.22.txt
zcat ~/test/bcftools/ukbb-indel-gqs22.txt.gz | sort | uniq -c > ~/test/bcftools/indel.counts.22.txt

In [3]:
bcftools query -f'%POS\n' ~/test/bcftools/ukb23156_c22_b20_v1.vcf.gz

49920058
49920061
49920063
49920064
49920075
49920079
49920082
49920083
49920085
49920086
49920088
49920094
49920098
49920099
49920100
49920102
49920107
49920114
49920115
49920125
49920127
49920136
49920137
49920143
49920146
49920148
49920160
49920161
49920162
49920163
49920173
49920177
49920178
49920180
49920181
49920186
49920191
49920193
49920198
49920204
49920209
49920220
49920228
49920231
49920232
49920236
49920237
49920238
49920246
49920250
49920251
49920252
49920261
49920271
49920273
49920274
49920281
49920282
49920292
49920299
49920304
49920314
49920319
49920323
49920329
49920333
49920340
49920343
49920345
49921488
49921489
49921490
49921491
49921492
49921496
49921497
49921511
49921513
49921520
49921521
49921526
49921530
49921533
49921534
49921535
49921538
49921539
49921545
49921554
49921564
49921565
49921577
49921580
49921586
49921588
49921595
49921596
49921599
49921600
49921614
49921616
49921617
49921619
49921621
49921625
49921627
49921634
49921635
49921640
49921655
49921656
4

In [6]:
bcftools view -r chr1:32191128-32210310  -v snps -Ou -f PASS,. ~/test/bcftools/ukb23156_c1_b23_v1.vcf.gz | bcftools query -f  '%CHROM\t%POS\t%REF\t%ALT\tAF=%AF\n' | head -n 50

chr1	32192212	G	T	AF=2e-06
chr1	32192215	G	A	AF=7e-06
chr1	32192220	G	A	AF=2e-06
chr1	32192221	C	A	AF=3e-05
chr1	32192222	T	C	AF=7e-06
chr1	32192223	G	A	AF=1e-05
chr1	32192224	G	A	AF=1.2e-05
chr1	32192226	C	G	AF=5e-06
chr1	32192228	A	T	AF=2e-06
chr1	32192232	G	T	AF=2e-06
chr1	32192233	C	T	AF=2e-06
chr1	32192248	G	A,C	AF=7e-06,2e-06
chr1	32192255	T	C	AF=2e-06
chr1	32192256	A	G	AF=2e-06
chr1	32192272	C	T	AF=1e-05
chr1	32192273	G	A	AF=2.5e-05
chr1	32192284	C	T	AF=4.2e-05
chr1	32192285	G	A	AF=0.001176
chr1	32192289	G	A	AF=0.001224
chr1	32192290	C	A	AF=1e-05
chr1	32192298	A	G	AF=2e-06
chr1	32192300	C	G	AF=5e-06
chr1	32192306	T	C	AF=2e-06
chr1	32192313	T	C	AF=7e-06
chr1	32192316	C	G,T	AF=0.000237,5e-06
chr1	32192331	A	C	AF=2e-06
chr1	32192352	G	A	AF=2e-06
chr1	32192366	A	G	AF=2e-06
chr1	32192368	C	T	AF=1.5e-05
chr1	32192382	G	A	AF=2e-06
chr1	32192407	C	T	AF=2e-06
chr1	32192408	G	A	AF=9.2e-05
chr1	32192417	G	A	AF=1.2e-05
chr1	32192421	G	T	AF=7e-06
chr1	32192422	A	G	AF=2e-06
chr1	32192425	G	T	

# Normalize vcf to be able to count number of genotypes

In [7]:
bcftools norm -m-any ~/test/bcftools/ukb23156_c1_b23_v1.vcf.gz |\
bcftools norm --check-ref w -f ~/test/bcftools/GRCh38_full_analysis_set_plus_decoy_hla.fa -Oz > ~/test/bcftools/ukb23156_c1_b23_v1.norm.vcf.gz

Lines   total/split/realigned/skipped:	16836/1789/0/0
Lines   total/split/realigned/skipped:	19096/0/903/0


In [9]:
bcftools +fill-tags ~/test/bcftools/ukb23156_c1_b23_v1.norm.vcf.gz -Oz -o \
          ~/test/bcftools/ukb23156_c1_b23_v1.norm.tagged.vcf.gz -- -t F_MISSING,AC,AF,AN,MAF

In [12]:
bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\tAF=%AF\t[\t%SAMPLE=%GT]\n' ~/test/bcftools/ukb23156_c1_b23_v1.norm.vcf.gz > ~/test/bcftools/ukb23156_c1_b23_v1.norm.genotypes.txt

In [26]:
bcftools view -v snps -Ou -f PASS,. ~/test/bcftools/ukb23156_c1_b23_v1.norm.vcf.gz | bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\tAF=%AF\n' > ~/test/bcftools/ukb23156_c1_b23_v1.norm.AF.txt

In [13]:
## Proportion of alternative alleles with frequency > 0.5
cat ~/test/bcftools/ukb23156_c1_b23_v1.norm.AF.txt | awk -F'=' '{if($2>0.5) print $2}'

# As an example
# chr1	32651470	T	G	AF=0.997528
# T is the genome reference allele but G is the major allele (based on frequency)

0.997528
0.994224
0.77216
0.952747
0.959667
0.768405
0.959169
0.959244
0.939173
0.572103
0.998669
0.692843
0.88035
0.998642
0.98211
0.938164
0.761901
0.965917
0.998251
0.75831


In [8]:
bcftools index --tbi ~/test/bcftools/ukb23156_c1_b23_v1.norm.vcf.gz

In [9]:
bcftools view -v snps -Ou -f PASS,. ~/test/bcftools/ukb23156_c1_b23_v1.norm.vcf.gz |  bcftools view -i 'GT="het"' | bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\tAF=%AF\t[%GT\t]\n' | gzip -c > ~/test/bcftools/ukbb-snp-gqs1.txt.gz 




In [4]:
grep -o "[0-9][0-9][0-9][0-9][0-9][0-9][0-9]=1/0" ~/test/bcftools/ukb23156_c1_b23_v1.norm.genotypes.txt | head

2632443=1/0
2055237=1/0
5272852=1/0
1601225=1/0
3491682=1/0
2482697=1/0
4062215=1/0
6011202=1/0
1213249=1/0
4478081=1/0



In [5]:
grep -o "[0-9][0-9][0-9][0-9][0-9][0-9][0-9]=0/1" ~/test/bcftools/ukb23156_c1_b23_v1.norm.genotypes.txt | head

2631560=0/1
1733193=0/1
1679765=0/1
3871923=0/1
5624029=0/1
3836230=0/1
4562872=0/1
5260056=0/1
3746218=0/1
2774104=0/1
grep: write error: Broken pipe


In [11]:
bcftools view -v indels -Ou -f PASS,. -s 2632443,2055237,5272852,1601225,2631560,1733193,1679765,3871923 ~/test/bcftools/ukb23156_c1_b23_v1.norm.tagged.vcf.gz | bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\tAF=%AF\t[%SAMPLE=%GT\t]\n'

chr1	32166005	AAAAC	A	AF=0.000450305	2632443=0/0	2055237=0/0	5272852=0/0	1601225=0/0	2631560=0/1	1733193=0/1	1679765=0/1	3871923=0/1	
chr1	32166006	AAAC	A	AF=0.00553578	2632443=1/0	2055237=1/0	5272852=1/0	1601225=1/0	2631560=0/0	1733193=0/0	1679765=0/0	3871923=0/0	
chr1	32166007	AAC	A	AF=0.00247802	2632443=0/1	2055237=0/1	5272852=0/1	1601225=0/1	2631560=1/0	1733193=0/0	1679765=0/0	3871923=0/0	
chr1	32166008	AC	A	AF=0.0258507	2632443=0/0	2055237=0/0	5272852=0/0	1601225=0/0	2631560=0/0	1733193=0/0	1679765=0/0	3871923=0/0	
chr1	32166008	A	AT	AF=5.39287e-06	2632443=0/0	2055237=0/0	5272852=0/0	1601225=0/0	2631560=0/0	1733193=0/0	1679765=0/0	3871923=0/0	
chr1	32166011	AAAACAAC	A	AF=1.88297e-05	2632443=0/0	2055237=0/0	5272852=0/0	1601225=0/0	2631560=0/0	1733193=0/0	1679765=0/0	3871923=0/0	
chr1	32166012	AAAC	A	AF=0.0124437	2632443=0/0	2055237=0/0	5272852=0/0	1601225=0/0	2631560=0/0	1733193=0/0	1679765=0/0	3871923=0/0	
chr1	32166012	A	AAAC	AF=0.00382511	2632443=0/0	2055237=0/0	5272852=0/0	1601