In [1]:
from hail import *
# Create the HailContext; `hc` is the handle the later cells call to import the VCF
# (hc.import_vcf) and to read the written dataset back (hc.read).
hc = HailContext()

In [3]:
# Input files for the walkthrough, given as bare file names (no directory component).
vcf, sample_annotations, pruned_variants = (
    # Genotype data; passed to hc.import_vcf.
    '1000Genomes.ALL.coreExome10K-v1.vcf.bgz',
    # Per-sample table; passed to annotate_samples_table.
    '1000Genomes.ALL.coreExome10K-v1.sample_annotations',
    # Interval list; not referenced by the cells shown here.
    'purcell5k.interval_list',
)

In [6]:
# Where the annotated dataset is stored; the next cell reads it back from this path.
out_path = '1kg.vds'

# Import the VCF, run split_multi on it, and attach the sample table under sa.pheno.
# NOTE(review): presumably sample_expr='Sample' names the table column holding the
# sample ID and impute=True asks Hail to infer column types -- confirm in the Hail docs.
vds = (
    hc.import_vcf(vcf)
    .split_multi()
    .annotate_samples_table(
        sample_annotations,
        root='sa.pheno',
        sample_expr='Sample',
        config=TextTableConfig(impute=True)
    )
)
vds.write(out_path)

In [8]:
# Re-bind `vds` to the dataset read back from out_path, so the following cells
# work from the written copy rather than the object built from the VCF import.
vds = hc.read(out_path)
# Last expression of the cell, so the counts are displayed; with genotypes=True the
# result also carries genotype-level fields (nGenotypes, nCalled, callRate).
vds.count(genotypes=True)

{u'nGenotypes': 27786135L, u'nVariants': 10961L, u'nSamples': 2535, u'nCalled': 27417806L, u'callRate': 98.6744144156789}

In [9]:
# Genotype filter on allele balance: `ab` is g.ad[1] divided by the sum of g.ad.
# NOTE(review): presumably g.ad holds per-allele read depths with the alternate
# allele at index 1 -- confirm against the Hail genotype schema.
# A genotype satisfies the condition when its call agrees with ab:
#   hom-ref: ab <= 0.1;  het: 0.25 <= ab <= 0.75;  hom-var: ab >= 0.9.
# The whitespace inside the triple-quoted string is part of the expression text
# handed to Hail, so it is left exactly as written.
filter_condition = '''let ab = g.ad[1] / g.ad.sum in
                          ((g.isHomRef && ab <= 0.1) || 
                           (g.isHet && ab >= 0.25 && ab <= 0.75) || 
                           (g.isHomVar && ab >= 0.9))'''
filtered_vds = vds.filter_genotypes(filter_condition)
# Last expression of the cell, so the genotype-level counts after filtering are displayed.
filtered_vds.count(genotypes=True)

In [12]:
# Keep only variants whose fraction of called genotypes exceeds 0.95,
# then run sample_qc on what is left.
well_called_vds = filtered_vds.filter_variants_expr('gs.fraction(g => g.isCalled) > 0.95')
filtered_vds_2 = well_called_vds.sample_qc()

# Print the sample-annotation schema, then export the sample ID plus every sa.qc
# field to sampleqc.txt (previewed by the shell cell that follows).
filtered_vds_2.print_schema(sa=True)
filtered_vds_2.export_samples('sampleqc.txt', 'Sample = s.id, sa.qc.*')

<hail.dataset.VariantDataset at 0x7f60e01d0550>

In [13]:
%%sh
# Preview the first ten lines of the export, limited to its first ten tab-separated columns.
head -n 10 sampleqc.txt | cut -f 1-10

Sample	callRate	nCalled	nNotCalled	nHomRef	nHet	nHomVar	nSNP	nInsertion	nDeletion
HG00096	9.65210e-01	5410	195	4072	682	656	1994	0	0
HG00097	9.81980e-01	5504	101	4053	835	616	2067	0	0
HG00099	9.78591e-01	5485	120	4088	770	627	2024	0	0
HG00100	9.88582e-01	5541	64	4076	902	563	2028	0	0
HG00101	9.69313e-01	5433	172	4063	744	626	1996	0	0
HG00102	9.75022e-01	5465	140	4086	753	626	2005	0	0
HG00103	9.60036e-01	5381	224	4094	615	672	1959	0	0
HG00105	9.74844e-01	5464	141	4098	765	601	1967	0	0
HG00106	9.75022e-01	5465	140	4078	778	609	1996	0	0
