2 changes: 2 additions & 0 deletions bio2zarr/__init__.py
@@ -0,0 +1,2 @@

from .vcf import explode as explode_vcf
65 changes: 51 additions & 14 deletions bio2zarr/vcf.py
@@ -1009,7 +1009,11 @@ def fixed_field_spec(
                shape.append(n)
                chunks.append(chunk_width),
                dimensions.append("samples")
            if field.summary.max_number > 1:
            if field.category == "FORMAT" and field.vcf_type == "String":
                # FIXME not handling format string values very well right now
                # as the max_number value is just the number of samples
                pass
            elif field.summary.max_number > 1:
                shape.append(field.summary.max_number)
                dimensions.append(field.name)
            variable_name = prefix + field.name
@@ -1414,6 +1418,31 @@ def flush_chunk(start, stop):
return futures


def generate_spec(columnarised, out):
    pcvcf = PickleChunkedVcf.load(columnarised)
    spec = ZarrConversionSpec.generate(pcvcf)
    json.dump(spec.asdict(), out, indent=4)


def to_zarr(
    columnarised, zarr_path, conversion_spec, worker_processes=1, show_progress=False
):
    pcvcf = PickleChunkedVcf.load(columnarised)
    if conversion_spec is None:
        spec = ZarrConversionSpec.generate(pcvcf)
    else:
        with open(conversion_spec, "r") as f:
            d = json.load(f)
            spec = ZarrConversionSpec.fromdict(d)
    SgvcfZarr.convert(
        pcvcf,
        zarr_path,
        conversion_spec=spec,
        worker_processes=worker_processes,
        show_progress=show_progress,
    )


def convert_vcf(
    vcfs,
    out_path,
@@ -1488,15 +1517,17 @@ def encode_bed_partition_genotypes(bed_path, zarr_path, start_variant, end_varia
flush_futures(futures)


def validate(vcf_path, zarr_path, show_progress):
def validate(vcf_path, zarr_path, show_progress=False):
    store = zarr.DirectoryStore(zarr_path)

    root = zarr.group(store=store)
    pos = root["variant_position"][:]
    allele = root["variant_allele"][:]
    chrom = root["contig_id"][:][root["variant_contig"][:]]
    vid = root["variant_id"][:]
    call_genotype = iter(root["call_genotype"])
    call_genotype = None
    if "call_genotype" in root:
        call_genotype = iter(root["call_genotype"])

    vcf = cyvcf2.VCF(vcf_path)
    format_headers = {}
@@ -1526,7 +1557,10 @@ def validate(vcf_path, zarr_path, show_progress):
    start_index = np.searchsorted(pos, first_pos)
    assert pos[start_index] == first_pos
    vcf = cyvcf2.VCF(vcf_path)
    iterator = tqdm.tqdm(vcf, total=vcf.num_records)
    if show_progress:
        iterator = tqdm.tqdm(vcf, total=vcf.num_records)
    else:
        iterator = vcf
    for j, row in enumerate(iterator, start_index):
        assert chrom[j] == row.CHROM
        assert pos[j] == row.POS
@@ -1537,16 +1571,19 @@
        assert np.all(allele[j, k + 1 :] == "")
        # TODO FILTERS

        gt = row.genotype.array()
        gt_zarr = next(call_genotype)
        gt_vcf = gt[:, :-1]
        # NOTE weirdly cyvcf2 seems to remap genotypes automatically
        # into the same missing/pad encoding that sgkit uses.
        # if np.any(gt_zarr < 0):
        # print("MISSING")
        # print(gt_zarr)
        # print(gt_vcf)
        nt.assert_array_equal(gt_zarr, gt_vcf)
        if call_genotype is None:
            assert row.genotype is None
        else:
            gt = row.genotype.array()
            gt_zarr = next(call_genotype)
            gt_vcf = gt[:, :-1]
            # NOTE weirdly cyvcf2 seems to remap genotypes automatically
            # into the same missing/pad encoding that sgkit uses.
            # if np.any(gt_zarr < 0):
            # print("MISSING")
            # print(gt_zarr)
            # print(gt_vcf)
            nt.assert_array_equal(gt_zarr, gt_vcf)

        # TODO this is basically right, but the details about float padding
        # need to be worked out in particular. Need to find examples of
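Taken together, the additions to bio2zarr/vcf.py expose the conversion pipeline as plain module-level functions. A minimal sketch of how they might be chained, with call signatures taken from the additions above; the scratch paths are illustrative and not part of this PR:

import bio2zarr as bz
from bio2zarr import vcf

source = "tests/data/vcf/sample.vcf.gz"  # test data added in this PR
exploded = "/tmp/sample.exploded"        # hypothetical scratch paths
spec_path = "/tmp/sample.spec"
zarr_path = "/tmp/sample.zarr"

# 1. Columnarise the VCF into the intermediate pickle-chunked form.
bz.explode_vcf([source], exploded, worker_processes=1, show_progress=False)

# 2. Optionally write a JSON conversion spec that can be hand-edited.
with open(spec_path, "w") as out:
    vcf.generate_spec(exploded, out)

# 3. Convert to Zarr, using the spec file (or pass conversion_spec=None
#    to fall back to the generated default).
vcf.to_zarr(exploded, zarr_path, conversion_spec=spec_path, worker_processes=1)

# 4. Check the Zarr output against the original VCF.
vcf.validate(source, zarr_path, show_progress=True)

This mirrors what validation.py below does for the larger validation datasets.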
Binary file added tests/data/vcf/sample.vcf.gz
Binary file added tests/data/vcf/sample.vcf.gz.tbi
Binary file added tests/data/vcf/sample_no_genotypes.vcf.gz
Binary file added tests/data/vcf/sample_no_genotypes.vcf.gz.csi
26 changes: 26 additions & 0 deletions tests/test_vcf.py
@@ -1,5 +1,6 @@
import numpy as np
import numpy.testing as nt
import xarray.testing as xt
import pytest
import sgkit as sg

@@ -218,3 +219,28 @@ def test_call_HQ(self, ds):
            [[-1, -1], [-1, -1], [-1, -1]],
        ]
        nt.assert_array_equal(ds["call_HQ"], call_HQ)

    def test_no_genotypes(self, ds, tmp_path):
        path = "tests/data/vcf/sample_no_genotypes.vcf.gz"
        out = tmp_path / "example.vcf.zarr"
        vcf.convert_vcf([path], out)
        ds2 = sg.load_dataset(out)
        assert len(ds2["sample_id"]) == 0
        for col in ds:
            if col != "sample_id" and not col.startswith("call_"):
                xt.assert_equal(ds[col], ds2[col])


@pytest.mark.parametrize(
    "name",
    [
        "sample.vcf.gz",
        "sample_no_genotypes.vcf.gz",
        # "info_field_type_combos.vcf.gz",
    ],
)
def test_by_validating(name, tmp_path):
    path = f"tests/data/vcf/{name}"
    out = tmp_path / "test.zarr"
    vcf.convert_vcf([path], out)
    vcf.validate(path, out)
57 changes: 57 additions & 0 deletions validation-data/Makefile
@@ -0,0 +1,57 @@
all: 1kg_2020_chr20.bcf.csi \
    1kg_2020_chr20.vcf.gz.tbi \
    1kg_2020_chr20_annotations.vcf.gz.tbi \
    1kg_2020_chr20_annotations.bcf.csi \
    1kg_2020_others.bcf.csi \
    1kg_2020_others.vcf.gz.tbi \
    1kg_p3_all_chr1.bcf.csi \
    1kg_p3_all_chr1.vcf.gz.tbi \
    1kg_p1_all_chr6.bcf.csi \
    1kg_p1_all_chr6.vcf.gz.tbi

.PRECIOUS: %.bcf
.PRECIOUS: %.vcf.gz

# Modern 1000 genomes data
1KG_2020_BASE=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_
1KG_2020_GENOTYPES_URL="${1KG_2020_BASE}chr20.recalibrated_variants.vcf.gz"
1KG_2020_ANNOTATIONS_URL="${1KG_2020_BASE}chr20.recalibrated_variants.annotated.vcf.gz"
1KG_2020_OTHERS_URL="${1KG_2020_BASE}others.recalibrated_variants.vcf.gz"

# 1000 genomes phase 3
1KG_P3_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
1KG_P3_SNP_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.chr1.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz

# 1000 genomes phase 1
1KG_P1_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/integrated_call_sets/ALL.chr6.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz


%.vcf.gz.tbi: %.vcf.gz
	tabix $<

%.bcf.csi: %.bcf
	bcftools index $<

%.bcf: %.vcf.gz
	bcftools view -O b $< > $@

1kg_2020_chr20.vcf.gz:
	bcftools view ${1KG_2020_GENOTYPES_URL} | head -n 10000 | bcftools view -O z -o $@

1kg_2020_others.vcf.gz:
	bcftools view ${1KG_2020_OTHERS_URL} | head -n 10000 | bcftools view -O z -o $@

1kg_2020_chr20_annotations.vcf.gz:
	bcftools view ${1KG_2020_ANNOTATIONS_URL} | head -n 100000 | bcftools view -O z -o $@

1kg_p3_all_chr1.vcf.gz:
	bcftools view ${1KG_P3_ALL_URL} | head -n 500000 | bcftools view -O z -o $@

1kg_p3_snp_chr1.vcf.gz:
	bcftools view ${1KG_P3_SNP_URL} | head -n 500000 | bcftools view -O z -o $@

1kg_p1_all_chr6.vcf.gz:
	bcftools view ${1KG_P1_ALL_URL} | head -n 10000 | bcftools view -O z -o $@

clean:
	rm -f *.vcf.gz *.tbi *.bcf *.csi
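Each dataset here is simply the first N lines of the upstream VCF (header included), recompressed to bgzipped VCF with bcftools. For anyone scripting this without make, a rough Python equivalent of the 1kg_2020_chr20.vcf.gz rule might look like the following sketch (assumes bcftools is on PATH and network access to the EBI FTP mirror; not part of this PR):

import subprocess

URL = (
    "http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/"
    "1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/"
    "20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.vcf.gz"
)

# Stream the remote VCF, keep the first 10 000 lines, and recompress,
# mirroring the bcftools view | head | bcftools view -O z pipeline above.
subprocess.run(
    f"bcftools view {URL} | head -n 10000 | bcftools view -O z -o 1kg_2020_chr20.vcf.gz",
    shell=True,
    check=True,
)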
55 changes: 55 additions & 0 deletions validation.py
@@ -0,0 +1,55 @@
# Development script to automate running the validation tests.
# These are large-scale tests that are not possible to run
# under unit-test conditions.
import pathlib
import click

import bio2zarr as bz

# tmp, come up with better names and flatten hierarchy
from bio2zarr import vcf


@click.command
@click.argument("vcfs", nargs=-1)
@click.option("-p", "--worker-processes", type=int, default=1)
@click.option("-f", "--force", is_flag=True, default=False)
def cli(vcfs, worker_processes, force):

    data_path = pathlib.Path("validation-data")
    if len(vcfs) == 0:
        vcfs = list(data_path.glob("*.vcf.gz")) + list(data_path.glob("*.bcf"))
    else:
        vcfs = [pathlib.Path(f) for f in vcfs]
    tmp_path = pathlib.Path("validation-tmp")
    tmp_path.mkdir(exist_ok=True)
    for f in vcfs:
        print(f)
        exploded = tmp_path / (f.name + ".exploded")
        if force or not exploded.exists():
            bz.explode_vcf(
                [f],
                exploded,
                worker_processes=worker_processes,
                show_progress=True,
            )
        spec = tmp_path / (f.name + ".spec")
        if force or not spec.exists():
            with open(spec, "w") as specfile:
                vcf.generate_spec(exploded, specfile)

        zarr = tmp_path / (f.name + ".zarr")
        if force or not zarr.exists():
            vcf.to_zarr(
                exploded,
                zarr,
                conversion_spec=spec,
                worker_processes=worker_processes,
                show_progress=True,
            )

        vcf.validate(f, zarr, show_progress=True)


if __name__ == "__main__":
    cli()
22 changes: 5 additions & 17 deletions vcf2zarr.py
@@ -34,11 +34,8 @@ def summarise(columnarised):
@click.argument("columnarised", type=click.Path())
# @click.argument("specfile", type=click.Path())
def genspec(columnarised):
    pcvcf = cnv.PickleChunkedVcf.load(columnarised)
    spec = cnv.ZarrConversionSpec.generate(pcvcf)
    # with open(specfile, "w") as f:
    stream = click.get_text_stream("stdout")
    json.dump(spec.asdict(), stream, indent=4)
    cnv.generate_spec(columnarised, stream)


@click.command
@@ -47,21 +44,12 @@ def genspec(columnarised):
@click.option("-s", "--conversion-spec", default=None)
@click.option("-p", "--worker-processes", type=int, default=1)
def to_zarr(columnarised, zarr_path, conversion_spec, worker_processes):
    pcvcf = cnv.PickleChunkedVcf.load(columnarised)
    if conversion_spec is None:
        spec = cnv.ZarrConversionSpec.generate(pcvcf)
    else:
        with open(conversion_spec, "r") as f:
            d = json.load(f)
            spec = cnv.ZarrConversionSpec.fromdict(d)

    cnv.SgvcfZarr.convert(
        pcvcf,
    cnv.to_zarr(
        columnarised,
        zarr_path,
        conversion_spec=spec,
        conversion_spec,
        worker_processes=worker_processes,
        show_progress=True,
    )
        show_progress=True)


@click.command