diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
index 67108852..032da3ae 100644
--- a/bio2zarr/vcf.py
+++ b/bio2zarr/vcf.py
@@ -1544,8 +1544,13 @@ def init_array(self, variable):
             object_codec=object_codec,
             dimension_separator=self.dimension_separator,
         )
-        # Dimension names are part of the spec in Zarr v3
-        a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
+        a.attrs.update(
+            {
+                "description": variable.description,
+                # Dimension names are part of the spec in Zarr v3
+                "_ARRAY_DIMENSIONS": variable.dimensions,
+            }
+        )
 
     def get_array(self, name):
         return self.root["wip_" + name]
diff --git a/tests/test_vcf.py b/tests/test_vcf.py
index 5d4b50b0..0f7f0238 100644
--- a/tests/test_vcf.py
+++ b/tests/test_vcf.py
@@ -319,3 +319,39 @@ def test_check_overlap(regions):
     ]
     with pytest.raises(ValueError, match="Multiple VCFs have the region"):
         vcf.check_overlap(partitions)
+
+
+class TestVcfDescriptions:
+    @pytest.mark.parametrize(
+        ("field", "description"),
+        [
+            ("variant_NS", "Number of Samples With Data"),
+            ("variant_AN", "Total number of alleles in called genotypes"),
+            (
+                "variant_AC",
+                "Allele count in genotypes, for each ALT allele, "
+                "in the same order as listed",
+            ),
+            ("variant_DP", "Total Depth"),
+            ("variant_AF", "Allele Frequency"),
+            ("variant_AA", "Ancestral Allele"),
+            ("variant_DB", "dbSNP membership, build 129"),
+            ("variant_H2", "HapMap2 membership"),
+            ("call_GQ", "Genotype Quality"),
+            ("call_DP", "Read Depth"),
+            ("call_HQ", "Haplotype Quality"),
+        ],
+    )
+    def test_fields(self, schema, field, description):
+        assert schema["columns"][field]["description"] == description
+
+    # TODO: filter descriptions are not in the schema yet, see
+    # https://github.com/sgkit-dev/bio2zarr/issues/123
+    # @pytest.mark.parametrize(
+    #     ("filt", "description"),
+    #     [
+    #         ("s50", "Less than 50% of samples have data"),
+    #         ("q10", "Quality below 10"),
+    #     ])
+    # def test_filters(self, schema, filt, description):
+    #     assert schema["filters"][filt]["description"] == description
diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py
index 0a7b4f8d..3667287e 100644
--- a/tests/test_vcf_examples.py
+++ b/tests/test_vcf_examples.py
@@ -400,6 +400,29 @@ def test_vcf_dimensions(self, ds):
         assert ds.variant_H2.dims == ("variants",)
         assert ds.variant_position.dims == ("variants",)
 
+    @pytest.mark.parametrize(
+        ("field", "description"),
+        [
+            ("variant_NS", "Number of Samples With Data"),
+            ("variant_AN", "Total number of alleles in called genotypes"),
+            (
+                "variant_AC",
+                "Allele count in genotypes, for each ALT allele, "
+                "in the same order as listed",
+            ),
+            ("variant_DP", "Total Depth"),
+            ("variant_AF", "Allele Frequency"),
+            ("variant_AA", "Ancestral Allele"),
+            ("variant_DB", "dbSNP membership, build 129"),
+            ("variant_H2", "HapMap2 membership"),
+            ("call_GQ", "Genotype Quality"),
+            ("call_DP", "Read Depth"),
+            ("call_HQ", "Haplotype Quality"),
+        ],
+    )
+    def test_vcf_field_description(self, ds, field, description):
+        assert ds[field].attrs["description"] == description
+
 
 class Test1000G2020Example:
     data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"