2 changes: 2 additions & 0 deletions bio2zarr/__init__.py
@@ -0,0 +1,2 @@

from .vcf import explode as explode_vcf
65 changes: 51 additions & 14 deletions bio2zarr/vcf.py
@@ -1009,7 +1009,11 @@ def fixed_field_spec(
                shape.append(n)
                chunks.append(chunk_width),
                dimensions.append("samples")
            if field.summary.max_number > 1:
            if field.category == "FORMAT" and field.vcf_type == "String":
                # FIXME not handling format string values very well right now
                # as the max_number value is just the number of samples
                pass
            elif field.summary.max_number > 1:
                shape.append(field.summary.max_number)
                dimensions.append(field.name)
            variable_name = prefix + field.name
@@ -1414,6 +1418,31 @@ def flush_chunk(start, stop):
return futures


def generate_spec(columnarised, out):
    pcvcf = PickleChunkedVcf.load(columnarised)
    spec = ZarrConversionSpec.generate(pcvcf)
    json.dump(spec.asdict(), out, indent=4)


def to_zarr(
    columnarised, zarr_path, conversion_spec, worker_processes=1, show_progress=False
):
    pcvcf = PickleChunkedVcf.load(columnarised)
    if conversion_spec is None:
        spec = ZarrConversionSpec.generate(pcvcf)
    else:
        with open(conversion_spec, "r") as f:
            d = json.load(f)
            spec = ZarrConversionSpec.fromdict(d)
    SgvcfZarr.convert(
        pcvcf,
        zarr_path,
        conversion_spec=spec,
        worker_processes=worker_processes,
        show_progress=show_progress,
    )


def convert_vcf(
    vcfs,
    out_path,
@@ -1488,15 +1517,17 @@ def encode_bed_partition_genotypes(bed_path, zarr_path, start_variant, end_varia
flush_futures(futures)


def validate(vcf_path, zarr_path, show_progress):
def validate(vcf_path, zarr_path, show_progress=False):
    store = zarr.DirectoryStore(zarr_path)

    root = zarr.group(store=store)
    pos = root["variant_position"][:]
    allele = root["variant_allele"][:]
    chrom = root["contig_id"][:][root["variant_contig"][:]]
    vid = root["variant_id"][:]
    call_genotype = iter(root["call_genotype"])
    call_genotype = None
    if "call_genotype" in root:
        call_genotype = iter(root["call_genotype"])

    vcf = cyvcf2.VCF(vcf_path)
    format_headers = {}
@@ -1526,7 +1557,10 @@ def validate(vcf_path, zarr_path, show_progress):
    start_index = np.searchsorted(pos, first_pos)
    assert pos[start_index] == first_pos
    vcf = cyvcf2.VCF(vcf_path)
    iterator = tqdm.tqdm(vcf, total=vcf.num_records)
    if show_progress:
        iterator = tqdm.tqdm(vcf, total=vcf.num_records)
    else:
        iterator = vcf
    for j, row in enumerate(iterator, start_index):
        assert chrom[j] == row.CHROM
        assert pos[j] == row.POS
@@ -1537,16 +1571,19 @@
        assert np.all(allele[j, k + 1 :] == "")
        # TODO FILTERS

        gt = row.genotype.array()
        gt_zarr = next(call_genotype)
        gt_vcf = gt[:, :-1]
        # NOTE weirdly cyvcf2 seems to remap genotypes automatically
        # into the same missing/pad encoding that sgkit uses.
        # if np.any(gt_zarr < 0):
        # print("MISSING")
        # print(gt_zarr)
        # print(gt_vcf)
        nt.assert_array_equal(gt_zarr, gt_vcf)
        if call_genotype is None:
            assert row.genotype is None
        else:
            gt = row.genotype.array()
            gt_zarr = next(call_genotype)
            gt_vcf = gt[:, :-1]
            # NOTE weirdly cyvcf2 seems to remap genotypes automatically
            # into the same missing/pad encoding that sgkit uses.
            # if np.any(gt_zarr < 0):
            # print("MISSING")
            # print(gt_zarr)
            # print(gt_vcf)
            nt.assert_array_equal(gt_zarr, gt_vcf)

        # TODO this is basically right, but the details about float padding
        # need to be worked out in particular. Need to find examples of
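Taken together, the additions to bio2zarr/vcf.py expose the conversion pipeline as plain module-level functions. A minimal sketch of how they might be chained, with call signatures taken from the additions above; the scratch paths are illustrative and not part of this PR:

import bio2zarr as bz
from bio2zarr import vcf

source = "tests/data/vcf/sample.vcf.gz"  # test data added in this PR
exploded = "/tmp/sample.exploded"        # hypothetical scratch paths
spec_path = "/tmp/sample.spec"
zarr_path = "/tmp/sample.zarr"

# 1. Columnarise the VCF into the intermediate pickle-chunked form.
bz.explode_vcf([source], exploded, worker_processes=1, show_progress=False)

# 2. Optionally write a JSON conversion spec that can be hand-edited.
with open(spec_path, "w") as out:
    vcf.generate_spec(exploded, out)

# 3. Convert to Zarr, using the spec file (or pass conversion_spec=None
#    to fall back to the generated default).
vcf.to_zarr(exploded, zarr_path, conversion_spec=spec_path, worker_processes=1)

# 4. Check the Zarr output against the original VCF.
vcf.validate(source, zarr_path, show_progress=True)

This mirrors what validation.py below does for the larger validation datasets.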
Binary file added tests/data/vcf/sample.vcf.gz
Binary file added tests/data/vcf/sample.vcf.gz.tbi
Binary file added tests/data/vcf/sample_no_genotypes.vcf.gz
Binary file added tests/data/vcf/sample_no_genotypes.vcf.gz.csi
26 changes: 26 additions & 0 deletions tests/test_vcf.py
@@ -1,5 +1,6 @@
import numpy as np
import numpy.testing as nt
import xarray.testing as xt
import pytest
import sgkit as sg

@@ -218,3 +219,28 @@ def test_call_HQ(self, ds):
            [[-1, -1], [-1, -1], [-1, -1]],
        ]
        nt.assert_array_equal(ds["call_HQ"], call_HQ)

    def test_no_genotypes(self, ds, tmp_path):
        path = "tests/data/vcf/sample_no_genotypes.vcf.gz"
        out = tmp_path / "example.vcf.zarr"
        vcf.convert_vcf([path], out)
        ds2 = sg.load_dataset(out)
        assert len(ds2["sample_id"]) == 0
        for col in ds:
            if col != "sample_id" and not col.startswith("call_"):
                xt.assert_equal(ds[col], ds2[col])


@pytest.mark.parametrize(
    "name",
    [
        "sample.vcf.gz",
        "sample_no_genotypes.vcf.gz",
        # "info_field_type_combos.vcf.gz",
    ],
)
def test_by_validating(name, tmp_path):
    path = f"tests/data/vcf/{name}"
    out = tmp_path / "test.zarr"
    vcf.convert_vcf([path], out)
    vcf.validate(path, out)
57 changes: 57 additions & 0 deletions validation-data/Makefile
@@ -0,0 +1,57 @@
all: 1kg_2020_chr20.bcf.csi \
    1kg_2020_chr20.vcf.gz.tbi \
    1kg_2020_chr20_annotations.vcf.gz.tbi \
    1kg_2020_chr20_annotations.bcf.csi \
    1kg_2020_others.bcf.csi \
    1kg_2020_others.vcf.gz.tbi \
    1kg_p3_all_chr1.bcf.csi \
    1kg_p3_all_chr1.vcf.gz.tbi \
    1kg_p1_all_chr6.bcf.csi \
    1kg_p1_all_chr6.vcf.gz.tbi

.PRECIOUS: %.bcf
.PRECIOUS: %.vcf.gz

# Modern 1000 genomes data
1KG_2020_BASE=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_
1KG_2020_GENOTYPES_URL="${1KG_2020_BASE}chr20.recalibrated_variants.vcf.gz"
1KG_2020_ANNOTATIONS_URL="${1KG_2020_BASE}chr20.recalibrated_variants.annotated.vcf.gz"
1KG_2020_OTHERS_URL="${1KG_2020_BASE}others.recalibrated_variants.vcf.gz"

# 1000 genomes phase 3
1KG_P3_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
1KG_P3_SNP_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.chr1.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz

# 1000 genomes phase 1
1KG_P1_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/integrated_call_sets/ALL.chr6.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz


%.vcf.gz.tbi: %.vcf.gz
	tabix $<

%.bcf.csi: %.bcf
	bcftools index $<

%.bcf: %.vcf.gz
	bcftools view -O b $< > $@

1kg_2020_chr20.vcf.gz:
	bcftools view ${1KG_2020_GENOTYPES_URL} | head -n 10000 | bcftools view -O z -o $@

1kg_2020_others.vcf.gz:
	bcftools view ${1KG_2020_OTHERS_URL} | head -n 10000 | bcftools view -O z -o $@

1kg_2020_chr20_annotations.vcf.gz:
	bcftools view ${1KG_2020_ANNOTATIONS_URL} | head -n 100000 | bcftools view -O z -o $@

1kg_p3_all_chr1.vcf.gz:
	bcftools view ${1KG_P3_ALL_URL} | head -n 500000 | bcftools view -O z -o $@

1kg_p3_snp_chr1.vcf.gz:
	bcftools view ${1KG_P3_SNP_URL} | head -n 500000 | bcftools view -O z -o $@

1kg_p1_all_chr6.vcf.gz:
	bcftools view ${1KG_P1_ALL_URL} | head -n 10000 | bcftools view -O z -o $@

clean:
	rm -f *.vcf.gz *.tbi *.bcf *.csi
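Each dataset here is simply the first N lines of the upstream VCF (header included), recompressed to bgzipped VCF with bcftools. For anyone scripting this without make, a rough Python equivalent of the 1kg_2020_chr20.vcf.gz rule might look like the following sketch (assumes bcftools is on PATH and network access to the EBI FTP mirror; not part of this PR):

import subprocess

URL = (
    "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/"
    "1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/"
    "20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.vcf.gz"
)

# Stream the remote VCF, keep the first 10 000 lines, and recompress,
# mirroring the bcftools view | head | bcftools view -O z pipeline above.
subprocess.run(
    f"bcftools view {URL} | head -n 10000 | bcftools view -O z -o 1kg_2020_chr20.vcf.gz",
    shell=True,
    check=True,
)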
55 changes: 55 additions & 0 deletions validation.py
@@ -0,0 +1,55 @@
# Development script to automate running the validation tests.
# These are large-scale tests that are not possible to run
# under unit-test conditions.
import pathlib
import click

import bio2zarr as bz

# tmp, come up with better names and flatten hierarchy
from bio2zarr import vcf


@click.command
@click.argument("vcfs", nargs=-1)
@click.option("-p", "--worker-processes", type=int, default=1)
@click.option("-f", "--force", is_flag=True, default=False)
def cli(vcfs, worker_processes, force):

    data_path = pathlib.Path("validation-data")
    if len(vcfs) == 0:
        vcfs = list(data_path.glob("*.vcf.gz")) + list(data_path.glob("*.bcf"))
    else:
        vcfs = [pathlib.Path(f) for f in vcfs]
    tmp_path = pathlib.Path("validation-tmp")
    tmp_path.mkdir(exist_ok=True)
    for f in vcfs:
        print(f)
        exploded = tmp_path / (f.name + ".exploded")
        if force or not exploded.exists():
            bz.explode_vcf(
                [f],
                exploded,
                worker_processes=worker_processes,
                show_progress=True,
            )
        spec = tmp_path / (f.name + ".spec")
        if force or not spec.exists():
            with open(spec, "w") as specfile:
                vcf.generate_spec(exploded, specfile)

        zarr = tmp_path / (f.name + ".zarr")
        if force or not zarr.exists():
            vcf.to_zarr(
                exploded,
                zarr,
                conversion_spec=spec,
                worker_processes=worker_processes,
                show_progress=True,
            )

        vcf.validate(f, zarr, show_progress=True)


if __name__ == "__main__":
    cli()
22 changes: 5 additions & 17 deletions vcf2zarr.py
@@ -34,11 +34,8 @@ def summarise(columnarised):
@click.argument("columnarised", type=click.Path())
# @click.argument("specfile", type=click.Path())
def genspec(columnarised):
    pcvcf = cnv.PickleChunkedVcf.load(columnarised)
    spec = cnv.ZarrConversionSpec.generate(pcvcf)
    # with open(specfile, "w") as f:
    stream = click.get_text_stream("stdout")
    json.dump(spec.asdict(), stream, indent=4)
    cnv.generate_spec(columnarised, stream)


@click.command
@@ -47,21 +44,12 @@ def genspec(columnarised):
@click.option("-s", "--conversion-spec", default=None)
@click.option("-p", "--worker-processes", type=int, default=1)
def to_zarr(columnarised, zarr_path, conversion_spec, worker_processes):
    pcvcf = cnv.PickleChunkedVcf.load(columnarised)
    if conversion_spec is None:
        spec = cnv.ZarrConversionSpec.generate(pcvcf)
    else:
        with open(conversion_spec, "r") as f:
            d = json.load(f)
            spec = cnv.ZarrConversionSpec.fromdict(d)

    cnv.SgvcfZarr.convert(
        pcvcf,
    cnv.to_zarr(
        columnarised,
        zarr_path,
        conversion_spec=spec,
        conversion_spec,
        worker_processes=worker_processes,
        show_progress=True,
    )
        show_progress=True)


@click.command