diff --git a/bio2zarr/__init__.py b/bio2zarr/__init__.py
index e69de29b..dc8a8e5b 100644
--- a/bio2zarr/__init__.py
+++ b/bio2zarr/__init__.py
@@ -0,0 +1,2 @@
+
+from .vcf import explode as explode_vcf
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
index 2bff6ab2..8dae8d5a 100644
--- a/bio2zarr/vcf.py
+++ b/bio2zarr/vcf.py
@@ -1009,7 +1009,11 @@ def fixed_field_spec(
                 shape.append(n)
                 chunks.append(chunk_width),
                 dimensions.append("samples")
-            if field.summary.max_number > 1:
+            if field.category == "FORMAT" and field.vcf_type == "String":
+                # FIXME not handling format string values very well right now
+                # as the max_number value is just the number of samples
+                pass
+            elif field.summary.max_number > 1:
                 shape.append(field.summary.max_number)
                 dimensions.append(field.name)
             variable_name = prefix + field.name
@@ -1414,6 +1418,31 @@ def flush_chunk(start, stop):
     return futures


+def generate_spec(columnarised, out):
+    pcvcf = PickleChunkedVcf.load(columnarised)
+    spec = ZarrConversionSpec.generate(pcvcf)
+    json.dump(spec.asdict(), out, indent=4)
+
+
+def to_zarr(
+    columnarised, zarr_path, conversion_spec, worker_processes=1, show_progress=False
+):
+    pcvcf = PickleChunkedVcf.load(columnarised)
+    if conversion_spec is None:
+        spec = ZarrConversionSpec.generate(pcvcf)
+    else:
+        with open(conversion_spec, "r") as f:
+            d = json.load(f)
+        spec = ZarrConversionSpec.fromdict(d)
+    SgvcfZarr.convert(
+        pcvcf,
+        zarr_path,
+        conversion_spec=spec,
+        worker_processes=worker_processes,
+        show_progress=True,
+    )
+
+
 def convert_vcf(
     vcfs,
     out_path,
@@ -1488,7 +1517,7 @@ def encode_bed_partition_genotypes(bed_path, zarr_path, start_variant, end_varia
     flush_futures(futures)


-def validate(vcf_path, zarr_path, show_progress):
+def validate(vcf_path, zarr_path, show_progress=False):
     store = zarr.DirectoryStore(zarr_path)
     root = zarr.group(store=store)

@@ -1496,7 +1525,9 @@ def validate(vcf_path, zarr_path, show_progress):
     allele = root["variant_allele"][:]
     chrom = root["contig_id"][:][root["variant_contig"][:]]
     vid = root["variant_id"][:]
-    call_genotype = iter(root["call_genotype"])
+    call_genotype = None
+    if "call_genotype" in root:
+        call_genotype = iter(root["call_genotype"])

     vcf = cyvcf2.VCF(vcf_path)
     format_headers = {}
@@ -1526,7 +1557,10 @@ def validate(vcf_path, zarr_path, show_progress):
     start_index = np.searchsorted(pos, first_pos)
     assert pos[start_index] == first_pos
     vcf = cyvcf2.VCF(vcf_path)
-    iterator = tqdm.tqdm(vcf, total=vcf.num_records)
+    if show_progress:
+        iterator = tqdm.tqdm(vcf, total=vcf.num_records)
+    else:
+        iterator = vcf
     for j, row in enumerate(iterator, start_index):
         assert chrom[j] == row.CHROM
         assert pos[j] == row.POS
@@ -1537,16 +1571,19 @@ def validate(vcf_path, zarr_path, show_progress):
         assert np.all(allele[j, k + 1 :] == "")
         # TODO FILTERS

-        gt = row.genotype.array()
-        gt_zarr = next(call_genotype)
-        gt_vcf = gt[:, :-1]
-        # NOTE weirdly cyvcf2 seems to remap genotypes automatically
-        # into the same missing/pad encoding that sgkit uses.
-        # if np.any(gt_zarr < 0):
-        # print("MISSING")
-        # print(gt_zarr)
-        # print(gt_vcf)
-        nt.assert_array_equal(gt_zarr, gt_vcf)
+        if call_genotype is None:
+            assert row.genotype is None
+        else:
+            gt = row.genotype.array()
+            gt_zarr = next(call_genotype)
+            gt_vcf = gt[:, :-1]
+            # NOTE weirdly cyvcf2 seems to remap genotypes automatically
+            # into the same missing/pad encoding that sgkit uses.
+            # if np.any(gt_zarr < 0):
+            # print("MISSING")
+            # print(gt_zarr)
+            # print(gt_vcf)
+            nt.assert_array_equal(gt_zarr, gt_vcf)

         # TODO this is basically right, but the details about float padding
         # need to be worked out in particular. Need to find examples of
diff --git a/tests/data/vcf/sample.vcf.gz b/tests/data/vcf/sample.vcf.gz
new file mode 100644
index 00000000..00f8a725
Binary files /dev/null and b/tests/data/vcf/sample.vcf.gz differ
diff --git a/tests/data/vcf/sample.vcf.gz.tbi b/tests/data/vcf/sample.vcf.gz.tbi
new file mode 100644
index 00000000..1a63a1ad
Binary files /dev/null and b/tests/data/vcf/sample.vcf.gz.tbi differ
diff --git a/tests/data/vcf/sample_no_genotypes.vcf.gz b/tests/data/vcf/sample_no_genotypes.vcf.gz
new file mode 100644
index 00000000..ae7abe10
Binary files /dev/null and b/tests/data/vcf/sample_no_genotypes.vcf.gz differ
diff --git a/tests/data/vcf/sample_no_genotypes.vcf.gz.csi b/tests/data/vcf/sample_no_genotypes.vcf.gz.csi
new file mode 100644
index 00000000..44ad65ce
Binary files /dev/null and b/tests/data/vcf/sample_no_genotypes.vcf.gz.csi differ
diff --git a/tests/test_vcf.py b/tests/test_vcf.py
index d2964c86..240b270d 100644
--- a/tests/test_vcf.py
+++ b/tests/test_vcf.py
@@ -1,5 +1,6 @@
 import numpy as np
 import numpy.testing as nt
+import xarray.testing as xt
 import pytest
 import sgkit as sg

@@ -218,3 +219,28 @@ def test_call_HQ(self, ds):
             [[-1, -1], [-1, -1], [-1, -1]],
         ]
         nt.assert_array_equal(ds["call_HQ"], call_HQ)
+
+    def test_no_genotypes(self, ds, tmp_path):
+        path = "tests/data/vcf/sample_no_genotypes.vcf.gz"
+        out = tmp_path / "example.vcf.zarr"
+        vcf.convert_vcf([path], out)
+        ds2 = sg.load_dataset(out)
+        assert len(ds2["sample_id"]) == 0
+        for col in ds:
+            if col != "sample_id" and not col.startswith("call_"):
+                xt.assert_equal(ds[col], ds2[col])
+
+
+@pytest.mark.parametrize(
+    "name",
+    [
+        "sample.vcf.gz",
+        "sample_no_genotypes.vcf.gz",
+        # "info_field_type_combos.vcf.gz",
+    ],
+)
+def test_by_validating(name, tmp_path):
+    path = f"tests/data/vcf/{name}"
+    out = tmp_path / "test.zarr"
+    vcf.convert_vcf([path], out)
+    vcf.validate(path, out)
diff --git a/validation-data/Makefile b/validation-data/Makefile
new file mode 100644
index 00000000..7d540d78
--- /dev/null
+++ b/validation-data/Makefile
@@ -0,0 +1,57 @@
+all: 1kg_2020_chr20.bcf.csi \
+	1kg_2020_chr20.vcf.gz.tbi \
+	1kg_2020_chr20_annotations.vcf.gz.tbi \
+	1kg_2020_chr20_annotations.bcf.csi\
+	1kg_2020_others.bcf.csi \
+	1kg_2020_others.vcf.gz.tbi \
+	1kg_p3_all_chr1.bcf.csi \
+	1kg_p3_all_chr1.vcf.gz.tbi\
+	1kg_p1_all_chr6.vcf.bcf.csi\
+	1kg_p1_all_chr6.vcf.gz.tbi
+
+.PRECIOUS: %.bcf
+.PRECIOUS: %.vcf.gz
+
+# Modern 1000 genomes data
+1KG_2020_BASE=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_
+1KG_2020_GENOTYPES_URL="${1KG_2020_BASE}chr20.recalibrated_variants.vcf.gz"
+1KG_2020_ANNOTATIONS_URL="${1KG_2020_BASE}chr20.recalibrated_variants.annotated.vcf.gz"
+1KG_2020_OTHERS_URL="${1KG_2020_BASE}others.recalibrated_variants.vcf.gz"
+
+# 1000 genomes phase 3
+1KG_P3_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
+1KG_P3_SNP_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.chr1.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz
+
+# 1000 genomes phase 1
+1KG_P1_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/integrated_call_sets/ALL.chr6.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz
+
+
+%.vcf.gz.tbi: %.vcf.gz
+	tabix $<
+
+%.bcf.csi: %.bcf
+	bcftools index $<
+
+%.bcf: %.vcf.gz
+	bcftools view -O b $< > $@
+
+1kg_2020_chr20.vcf.gz:
+	bcftools view ${1KG_2020_GENOTYPES_URL} | head -n 10000 | bcftools view -O z -o $@
+
+1kg_2020_others.vcf.gz:
+	bcftools view ${1KG_2020_OTHERS_URL} | head -n 10000 | bcftools view -O z -o $@
+
+1kg_2020_chr20_annotations.vcf.gz:
+	bcftools view ${1KG_2020_ANNOTATIONS_URL} | head -n 100000 | bcftools view -O z -o $@
+
+1kg_p3_all_chr1.vcf.gz:
+	bcftools view ${1KG_P3_ALL_URL} | head -n 500000 | bcftools view -O z -o $@
+
+1kg_p3_snp_chr1.vcf.gz:
+	bcftools view ${1KG_P3_SNP_URL} | head -n 500000 | bcftools view -O z -o $@
+
+1kg_p1_all_chr6.vcf.gz:
+	bcftools view ${1KG_P1_ALL_URL} | head -n 10000 | bcftools view -O z -o $@
+
+clean:
+	rm -f *.vcf.gz *.tbi *.bcf *.csi
diff --git a/validation.py b/validation.py
new file mode 100644
index 00000000..0d8e071b
--- /dev/null
+++ b/validation.py
@@ -0,0 +1,55 @@
+# Development script to automate running the validation tests.
+# These are large-scale tests that are not possible to run
+# under unit-test conditions.
+import pathlib
+import click
+
+import bio2zarr as bz
+
+# tmp, come up with better names and flatten hierarchy
+from bio2zarr import vcf
+
+
+@click.command
+@click.argument("vcfs", nargs=-1)
+@click.option("-p", "--worker-processes", type=int, default=1)
+@click.option("-f", "--force", is_flag=True, default=False)
+def cli(vcfs, worker_processes, force):
+
+    data_path = pathlib.Path("validation-data")
+    if len(vcfs) == 0:
+        vcfs = list(data_path.glob("*.vcf.gz")) + list(data_path.glob("*.bcf"))
+    else:
+        vcfs = [pathlib.Path(f) for f in vcfs]
+    tmp_path = pathlib.Path("validation-tmp")
+    tmp_path.mkdir(exist_ok=True)
+    for f in vcfs:
+        print(f)
+        exploded = tmp_path / (f.name + ".exploded")
+        if force or not exploded.exists():
+            bz.explode_vcf(
+                [f],
+                exploded,
+                worker_processes=worker_processes,
+                show_progress=True,
+            )
+        spec = tmp_path / (f.name + ".spec")
+        if force or not spec.exists():
+            with open(spec, "w") as specfile:
+                vcf.generate_spec(exploded, specfile)
+
+        zarr = tmp_path / (f.name + ".zarr")
+        if force or not zarr.exists():
+            vcf.to_zarr(
+                exploded,
+                zarr,
+                conversion_spec=spec,
+                worker_processes=worker_processes,
+                show_progress=True,
+            )
+
+        vcf.validate(f, zarr, show_progress=True)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/vcf2zarr.py b/vcf2zarr.py
index 145badf1..3ed3287d 100644
--- a/vcf2zarr.py
+++ b/vcf2zarr.py
@@ -34,11 +34,8 @@ def summarise(columnarised):
 @click.argument("columnarised", type=click.Path())
 # @click.argument("specfile", type=click.Path())
 def genspec(columnarised):
-    pcvcf = cnv.PickleChunkedVcf.load(columnarised)
-    spec = cnv.ZarrConversionSpec.generate(pcvcf)
-    # with open(specfile, "w") as f:
     stream = click.get_text_stream("stdout")
-    json.dump(spec.asdict(), stream, indent=4)
+    cnv.generate_spec(columnarised, stream)


 @click.command
@@ -47,21 +44,12 @@ def genspec(columnarised):
 @click.option("-s", "--conversion-spec", default=None)
 @click.option("-p", "--worker-processes", type=int, default=1)
 def to_zarr(columnarised, zarr_path, conversion_spec, worker_processes):
-    pcvcf = cnv.PickleChunkedVcf.load(columnarised)
-    if conversion_spec is None:
-        spec = cnv.ZarrConversionSpec.generate(pcvcf)
-    else:
-        with open(conversion_spec, "r") as f:
-            d = json.load(f)
-        spec = cnv.ZarrConversionSpec.fromdict(d)
-
-    cnv.SgvcfZarr.convert(
-        pcvcf,
+    cnv.to_zarr(
+        columnarised,
         zarr_path,
-        conversion_spec=spec,
+        conversion_spec,
         worker_processes=worker_processes,
-        show_progress=True,
-    )
+        show_progress=True)


 @click.command
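
For reference, a minimal sketch of how the programmatic entry points introduced in this patch (bio2zarr.explode_vcf, vcf.generate_spec, vcf.to_zarr, vcf.validate) chain together, mirroring the loop in validation.py. The input path, working directory, and worker count below are hypothetical placeholders, not files shipped by the patch:

    import pathlib
    import bio2zarr as bz
    from bio2zarr import vcf

    # Hypothetical input VCF and scratch directory.
    source = pathlib.Path("validation-data/sample.vcf.gz")
    workdir = pathlib.Path("validation-tmp")
    workdir.mkdir(exist_ok=True)

    exploded = workdir / (source.name + ".exploded")
    spec_path = workdir / (source.name + ".spec")
    zarr_path = workdir / (source.name + ".zarr")

    # 1. Explode the VCF into the intermediate columnar format.
    bz.explode_vcf([source], exploded, worker_processes=4, show_progress=True)
    # 2. Write a JSON conversion spec; generate_spec takes a writable file object.
    with open(spec_path, "w") as specfile:
        vcf.generate_spec(exploded, specfile)
    # 3. Convert to Zarr using that spec (conversion_spec=None regenerates a default one).
    vcf.to_zarr(exploded, zarr_path, conversion_spec=spec_path, worker_processes=4)
    # 4. Check the Zarr output against the original VCF.
    vcf.validate(source, zarr_path, show_progress=True)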