From 9d43297d3846e60c2c9c511fe3329d4db263d2e7 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 15 Feb 2024 22:34:56 +0000 Subject: [PATCH 1/6] Initial addition of VCF data and validation test --- bio2zarr/vcf.py | 7 +++++-- tests/data/vcf/sample.vcf.gz | Bin 0 -> 954 bytes tests/data/vcf/sample.vcf.gz.tbi | Bin 0 -> 185 bytes tests/test_vcf.py | 8 ++++++++ 4 files changed, 13 insertions(+), 2 deletions(-) create mode 100644 tests/data/vcf/sample.vcf.gz create mode 100644 tests/data/vcf/sample.vcf.gz.tbi diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py index 2bff6ab2..aa67cbae 100644 --- a/bio2zarr/vcf.py +++ b/bio2zarr/vcf.py @@ -1488,7 +1488,7 @@ def encode_bed_partition_genotypes(bed_path, zarr_path, start_variant, end_varia flush_futures(futures) -def validate(vcf_path, zarr_path, show_progress): +def validate(vcf_path, zarr_path, show_progress=False): store = zarr.DirectoryStore(zarr_path) root = zarr.group(store=store) @@ -1526,7 +1526,10 @@ def validate(vcf_path, zarr_path, show_progress): start_index = np.searchsorted(pos, first_pos) assert pos[start_index] == first_pos vcf = cyvcf2.VCF(vcf_path) - iterator = tqdm.tqdm(vcf, total=vcf.num_records) + if show_progress: + iterator = tqdm.tqdm(vcf, total=vcf.num_records) + else: + iterator = vcf for j, row in enumerate(iterator, start_index): assert chrom[j] == row.CHROM assert pos[j] == row.POS diff --git a/tests/data/vcf/sample.vcf.gz b/tests/data/vcf/sample.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..00f8a725f71596daf33d8ad1a948c47bce28eccf GIT binary patch literal 954 zcmV;r14aBFiwFb&00000{{{d;LjnMu1C3N$Z{j!ooE7OlUn?&;jFW6%$}&oKvpjvzW%9~R3I(}(Q@Pqf4JCvGYL_Oem_>0~ z{yptKj=2Lh*;W?O?%m5|UPdy8lks%;;{7uoc+;gZ-6t!Rd(;RInexWFQoXC(7^q^M zN0}D%zSJc+eZxzcWU(snx2W7=Fe|_Odrs$1Z=nm*QpR|9B9(EhG-3Wguhadm)SM?W1H;U*lItz@C*e`n&?V(o;X)<~r}+jv$pXhwQL4?Q z{9tx_cLSA`+m`!b%l$A;W%*0*!(4se!>t}#41UYNvdrOPTNDt6qRhcVP1fq+v7sQ1 z3E3jptd`RmPL!@Hwozu_)joEXjm z4BJHQ$_s3N`g#fKr`e(TraxpJ$ra={SMTsr1N)=J%QKq2EYWy?7EeP2L}>oipP>38 zkxmt=8$?qd{$+$7pX~VLceINrrN~4+>Z7jdz{*5HC`NNcei8vOp!R7+5P6`28%G$c zU;%>jYZUnfiV;LD?h=$24zVp}yMXo)>!O*zT%zgHIf=qYSDkTUaM$#tGwY)tQMbw>3Xr30(DM176sU;gDvG)^TpSGe_%v z@>@2vkwoh}Eo4dIo&w(p{62!AT!?kGR?zdOp40RD9yjPox4L&zSnyD_-iT6qz|&liETZ}?$g^z~t7p{~~|fM9q31C*=?O#}!403VA8 c1ONa4009360763o02=@U000000000007AFV^Z)<= literal 0 HcmV?d00001 diff --git a/tests/data/vcf/sample.vcf.gz.tbi b/tests/data/vcf/sample.vcf.gz.tbi new file mode 100644 index 0000000000000000000000000000000000000000..1a63a1ad1c043db7996911fb629b865210cddeff GIT binary patch literal 185 zcmb2|=3rp}f&Xj_PR>jWa~QY_Kc%FkB_t#;CAn$vGMx&T#oWQGE7`*%p>&{Vo|8c7 zv^4@hQv+^g3%Tg;vN&TS;c)GASBjj^UJmIiHY+T8;#q{-Z!rhg_sTkLUU>5|^Hu)K z-`TwU8@mqwF!*a1amT=`|B^z0iz2Ug_KBbD)80u)JZsz}Zxg$?+t_FUQ^;Nc28Lx} T^;t)l85rcz94E~H6odi*0~|b8 literal 0 HcmV?d00001 diff --git a/tests/test_vcf.py b/tests/test_vcf.py index d2964c86..465ad697 100644 --- a/tests/test_vcf.py +++ b/tests/test_vcf.py @@ -218,3 +218,11 @@ def test_call_HQ(self, ds): [[-1, -1], [-1, -1], [-1, -1]], ] nt.assert_array_equal(ds["call_HQ"], call_HQ) + +class TestByValidating: + + def test_sample(self, tmp_path): + path = "tests/data/vcf/sample.vcf.gz" + out = tmp_path / "example.vcf.zarr" + vcf.convert_vcf([path], out) + vcf.validate(path, out) From 478407fe678d8795e3431ef7cb6feb38e8fbf43c Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 15 Feb 2024 22:36:11 +0000 Subject: [PATCH 2/6] Initial addition of validation data makefile --- validation-data/Makefile | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 validation-data/Makefile diff --git a/validation-data/Makefile b/validation-data/Makefile new file mode 100644 index 00000000..b9df917f --- /dev/null +++ b/validation-data/Makefile @@ -0,0 +1,19 @@ +all: 1kg_2020_chr20.bcf.csi 1kg_2020_chr20.vcf.gz.tbi + +%.vcf.gz.tbi: %.vcf.gz + tabix $< + +%.bcf.csi: %.bcf + bcftools index $< + +.PRECIOUS: %.bcf +%.bcf: %.vcf.gz + bcftools view -O b $< > $@ + +.PRECIOUS: %.vcf.gz +1kg_2020_chr20.vcf.gz: + # TODO replace with URL + bcftools view /home/jk/work/github/sgkit/big-data/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.vcf.gz | head -n 10000 | bcftools view -O z > $@ + +clean: + rm -f *.vcf.gz *.tbi *.bcf *.csi From 98fb67cda4e5c1d6aa2e0e12e1347853dd05756a Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 15 Feb 2024 23:19:59 +0000 Subject: [PATCH 3/6] Initial validation infrastructure --- bio2zarr/__init__.py | 2 ++ bio2zarr/vcf.py | 25 ++++++++++++++++++++++ validation.py | 50 ++++++++++++++++++++++++++++++++++++++++++++ vcf2zarr.py | 22 +++++-------------- 4 files changed, 82 insertions(+), 17 deletions(-) create mode 100644 validation.py diff --git a/bio2zarr/__init__.py b/bio2zarr/__init__.py index e69de29b..dc8a8e5b 100644 --- a/bio2zarr/__init__.py +++ b/bio2zarr/__init__.py @@ -0,0 +1,2 @@ + +from .vcf import explode as explode_vcf diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py index aa67cbae..b97d267e 100644 --- a/bio2zarr/vcf.py +++ b/bio2zarr/vcf.py @@ -1414,6 +1414,31 @@ def flush_chunk(start, stop): return futures +def generate_spec(columnarised, out): + pcvcf = PickleChunkedVcf.load(columnarised) + spec = ZarrConversionSpec.generate(pcvcf) + json.dump(spec.asdict(), out, indent=4) + + +def to_zarr( + columnarised, zarr_path, conversion_spec, worker_processes=1, show_progress=False +): + pcvcf = PickleChunkedVcf.load(columnarised) + if conversion_spec is None: + spec = ZarrConversionSpec.generate(pcvcf) + else: + with open(conversion_spec, "r") as f: + d = json.load(f) + spec = ZarrConversionSpec.fromdict(d) + SgvcfZarr.convert( + pcvcf, + zarr_path, + conversion_spec=spec, + worker_processes=worker_processes, + show_progress=True, + ) + + def convert_vcf( vcfs, out_path, diff --git a/validation.py b/validation.py new file mode 100644 index 00000000..2198657f --- /dev/null +++ b/validation.py @@ -0,0 +1,50 @@ +# Development script to automate running the validation tests. +# These are large-scale tests that are not possible to run +# under unit-test conditions. +import pathlib +import click + +import bio2zarr as bz + +# tmp, come up with better names and flatten hierarchy +from bio2zarr import vcf + + +@click.command +@click.option("-p", "--worker-processes", type=int, default=1) +@click.option("-f", "--force", is_flag=True, default=False) +def cli(worker_processes, force): + data_path = pathlib.Path("validation-data") + files = list(data_path.glob("*.vcf.gz")) + list(data_path.glob("*.bcf")) + tmp_path = pathlib.Path("validation-tmp") + tmp_path.mkdir(exist_ok=True) + for f in files: + print(f) + exploded = tmp_path / (f.name + ".exploded") + if force or not exploded.exists(): + bz.explode_vcf( + [f], + exploded, + worker_processes=worker_processes, + show_progress=True, + ) + spec = tmp_path / (f.name + ".spec") + if force or not spec.exists(): + with open(spec, "w") as specfile: + vcf.generate_spec(exploded, specfile) + + zarr = tmp_path / (f.name + ".zarr") + if force or not zarr.exists(): + vcf.to_zarr( + exploded, + zarr, + conversion_spec=spec, + worker_processes=worker_processes, + show_progress=True, + ) + + vcf.validate(f, zarr, show_progress=True) + + +if __name__ == "__main__": + cli() diff --git a/vcf2zarr.py b/vcf2zarr.py index 145badf1..3ed3287d 100644 --- a/vcf2zarr.py +++ b/vcf2zarr.py @@ -34,11 +34,8 @@ def summarise(columnarised): @click.argument("columnarised", type=click.Path()) # @click.argument("specfile", type=click.Path()) def genspec(columnarised): - pcvcf = cnv.PickleChunkedVcf.load(columnarised) - spec = cnv.ZarrConversionSpec.generate(pcvcf) - # with open(specfile, "w") as f: stream = click.get_text_stream("stdout") - json.dump(spec.asdict(), stream, indent=4) + cnv.generate_spec(columnarised, stream) @click.command @@ -47,21 +44,12 @@ def genspec(columnarised): @click.option("-s", "--conversion-spec", default=None) @click.option("-p", "--worker-processes", type=int, default=1) def to_zarr(columnarised, zarr_path, conversion_spec, worker_processes): - pcvcf = cnv.PickleChunkedVcf.load(columnarised) - if conversion_spec is None: - spec = cnv.ZarrConversionSpec.generate(pcvcf) - else: - with open(conversion_spec, "r") as f: - d = json.load(f) - spec = cnv.ZarrConversionSpec.fromdict(d) - - cnv.SgvcfZarr.convert( - pcvcf, + cnv.to_zarr( + columnarised, zarr_path, - conversion_spec=spec, + conversion_spec, worker_processes=worker_processes, - show_progress=True, - ) + show_progress=True) @click.command From 6fdd2960ccbc0efc49270fd410c78e9978bebc9c Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 16 Feb 2024 12:57:34 +0000 Subject: [PATCH 4/6] Update validation to include annotation VCF Add testing coverage of no-genotypes case --- bio2zarr/vcf.py | 27 +++++++++++------- tests/data/vcf/sample_no_genotypes.vcf.gz | Bin 0 -> 868 bytes tests/data/vcf/sample_no_genotypes.vcf.gz.csi | Bin 0 -> 187 bytes tests/test_vcf.py | 19 +++++++++++- validation-data/Makefile | 20 +++++++++---- validation.py | 11 +++++-- 6 files changed, 57 insertions(+), 20 deletions(-) create mode 100644 tests/data/vcf/sample_no_genotypes.vcf.gz create mode 100644 tests/data/vcf/sample_no_genotypes.vcf.gz.csi diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py index b97d267e..a0ca3feb 100644 --- a/bio2zarr/vcf.py +++ b/bio2zarr/vcf.py @@ -1521,7 +1521,9 @@ def validate(vcf_path, zarr_path, show_progress=False): allele = root["variant_allele"][:] chrom = root["contig_id"][:][root["variant_contig"][:]] vid = root["variant_id"][:] - call_genotype = iter(root["call_genotype"]) + call_genotype = None + if "call_genotype" in root: + call_genotype = iter(root["call_genotype"]) vcf = cyvcf2.VCF(vcf_path) format_headers = {} @@ -1565,16 +1567,19 @@ def validate(vcf_path, zarr_path, show_progress=False): assert np.all(allele[j, k + 1 :] == "") # TODO FILTERS - gt = row.genotype.array() - gt_zarr = next(call_genotype) - gt_vcf = gt[:, :-1] - # NOTE weirdly cyvcf2 seems to remap genotypes automatically - # into the same missing/pad encoding that sgkit uses. - # if np.any(gt_zarr < 0): - # print("MISSING") - # print(gt_zarr) - # print(gt_vcf) - nt.assert_array_equal(gt_zarr, gt_vcf) + if call_genotype is None: + assert row.genotype is None + else: + gt = row.genotype.array() + gt_zarr = next(call_genotype) + gt_vcf = gt[:, :-1] + # NOTE weirdly cyvcf2 seems to remap genotypes automatically + # into the same missing/pad encoding that sgkit uses. + # if np.any(gt_zarr < 0): + # print("MISSING") + # print(gt_zarr) + # print(gt_vcf) + nt.assert_array_equal(gt_zarr, gt_vcf) # TODO this is basically right, but the details about float padding # need to be worked out in particular. Need to find examples of diff --git a/tests/data/vcf/sample_no_genotypes.vcf.gz b/tests/data/vcf/sample_no_genotypes.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..ae7abe10c8adc0e8bf60ba322de65a42f9ae13ab GIT binary patch literal 868 zcmV-q1DpIGiwFb&00000{{{d;LjnLt1C5kjkJ~5|hOe1lfjZi&bu92#Hk0*Aj6Kdo z%6QU=JKMVvaY$mxHZG7<>8~%4k4~(rt$YKA_kGTR1HQdoS2bI6(NGn?CfPo4(ar5` zHk}m@FY&+AG@g^i!bzDdMb)W_xA6y3*HF6@6B2e*O1Ap&pY|9~N)?MeghuEe6zZ7d zJ5jQ@IZT^wr>ODI1>XqTy!tNI0>RcyuvTj@LTJKT-Y_|@YOV%(azFJWt?9N@R_!M4 zs8AKHb!3`n&$hUH;pDq!$wZ8u;?S{p+A6kT;=16Y1oAap(59=Ig#TA+3w5#7R(`>e z{7-NNSG0!h1xo9h8EDnQ(u`MdqxYe-U-t0H8SnlAPb?0WeAg<&yuci&`ve&+w~)+= zQ(DJJ)RsX>8wR;p>6)m7btToQFPHS`{LgonvQBM#)n~oxvzk-&)7mq^zU*{Y4?P0u z6|hi3H@1h+mq99_YdFGsd$>O%(76cO1FV*dd=49CRB~H&4qWc4dIhmJx@J9kzhynr z?umBZJDNqjdmb`G*OnQRQa6OQFhqZu>mg5fZcFzJuFiMv9pVe_AwKSCU8w_HvYLN| z7+HgE`SAoEW_M2ycVzbI+LY?EG9!~xFG4+SSgWqQN&b3mN_clT-{d_N6!gZRU7NYqh7#6y})NPUzTgdc`e`e75nEh$UzzZxq3tR* zV>p<=!Sgpb;HR^DYx&#d6}bCy?QXuEmQJ%Vw11vVR4`*ph$G$Yy8zw!7<$Ol*^nC^2^hJN9=XS$XM+=fk&Sr>;J_au zAY_EXzzc_ZjR0>1%CWWI@)~ScaR%J!!aRop%Yq&=OfP_+I%huhQ0E9t33&W zP@|iw6kJk;!30c*Ga(5G(Y0%2Op`Im<7iBh*mv*=5dO$yznjnIxP&b9dj=Xr98yf2 u!~`wSz4;E>E_|JS1^@sbiwFb&00000{{{d;LjnLB00RI3000000001E`J!k5 literal 0 HcmV?d00001 diff --git a/tests/data/vcf/sample_no_genotypes.vcf.gz.csi b/tests/data/vcf/sample_no_genotypes.vcf.gz.csi new file mode 100644 index 0000000000000000000000000000000000000000..44ad65ced329442b5cb82d88fe0c033ded1d7853 GIT binary patch literal 187 zcmb2|=3rp}f&Xj_PR>jW^B9WxzNI`#PDn^#P7+l}V@h(<;AJ`$FpIf^S68x!M?y(r z#==kYCQC^Ch*12bU~90=a>}<^M-DMC`$RmMDG|V^eZ{9(#W?Ye&5oHIX3CR(l*p7F z`f_}eV}==9$OU80h0_I3&an=tI;FxgThLP`O-@H9p~g+cl8uc&ARuuEXY9O*jMfYc X-@b7~%wc3;kVkW#Gy^l(wIBijY8E(U literal 0 HcmV?d00001 diff --git a/tests/test_vcf.py b/tests/test_vcf.py index 465ad697..b3845b84 100644 --- a/tests/test_vcf.py +++ b/tests/test_vcf.py @@ -1,5 +1,6 @@ import numpy as np import numpy.testing as nt +import xarray.testing as xt import pytest import sgkit as sg @@ -219,10 +220,26 @@ def test_call_HQ(self, ds): ] nt.assert_array_equal(ds["call_HQ"], call_HQ) -class TestByValidating: + def test_no_genotypes(self, ds, tmp_path): + path = "tests/data/vcf/sample_no_genotypes.vcf.gz" + out = tmp_path / "example.vcf.zarr" + vcf.convert_vcf([path], out) + ds2 = sg.load_dataset(out) + assert len(ds2["sample_id"]) == 0 + for col in ds: + if col != "sample_id" and not col.startswith("call_"): + xt.assert_equal(ds[col], ds2[col]) + +class TestByValidating: def test_sample(self, tmp_path): path = "tests/data/vcf/sample.vcf.gz" out = tmp_path / "example.vcf.zarr" vcf.convert_vcf([path], out) vcf.validate(path, out) + + def test_sample_no_genotypes(self, tmp_path): + path = "tests/data/vcf/sample_no_genotypes.vcf.gz" + out = tmp_path / "example.vcf.zarr" + vcf.convert_vcf([path], out) + vcf.validate(path, out) diff --git a/validation-data/Makefile b/validation-data/Makefile index b9df917f..bb8bf3ad 100644 --- a/validation-data/Makefile +++ b/validation-data/Makefile @@ -1,4 +1,14 @@ -all: 1kg_2020_chr20.bcf.csi 1kg_2020_chr20.vcf.gz.tbi +all: 1kg_2020_chr20.bcf.csi \ + 1kg_2020_chr20.vcf.gz.tbi \ + 1kg_2020_chr20_annotations.vcf.gz.tbi \ + 1kg_2020_chr20_annotations.bcf.csi + +.PRECIOUS: %.bcf +.PRECIOUS: %.vcf.gz + +1KG_2020_GENOTYPES_URL="http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.vcf.gz" +1KG_2020_ANNOTATIONS_URL="http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.annotated.vcf.gz" + %.vcf.gz.tbi: %.vcf.gz tabix $< @@ -6,14 +16,14 @@ all: 1kg_2020_chr20.bcf.csi 1kg_2020_chr20.vcf.gz.tbi %.bcf.csi: %.bcf bcftools index $< -.PRECIOUS: %.bcf %.bcf: %.vcf.gz bcftools view -O b $< > $@ -.PRECIOUS: %.vcf.gz 1kg_2020_chr20.vcf.gz: - # TODO replace with URL - bcftools view /home/jk/work/github/sgkit/big-data/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.vcf.gz | head -n 10000 | bcftools view -O z > $@ + bcftools view ${1KG_2020_GENOTYPES_URL} | head -n 10000 | bcftools view -O z -o $@ + +1kg_2020_chr20_annotations.vcf.gz: + bcftools view ${1KG_2020_ANNOTATIONS_URL} | head -n 100000 | bcftools view -O z -o $@ clean: rm -f *.vcf.gz *.tbi *.bcf *.csi diff --git a/validation.py b/validation.py index 2198657f..0d8e071b 100644 --- a/validation.py +++ b/validation.py @@ -11,14 +11,19 @@ @click.command +@click.argument("vcfs", nargs=-1) @click.option("-p", "--worker-processes", type=int, default=1) @click.option("-f", "--force", is_flag=True, default=False) -def cli(worker_processes, force): +def cli(vcfs, worker_processes, force): + data_path = pathlib.Path("validation-data") - files = list(data_path.glob("*.vcf.gz")) + list(data_path.glob("*.bcf")) + if len(vcfs) == 0: + vcfs = list(data_path.glob("*.vcf.gz")) + list(data_path.glob("*.bcf")) + else: + vcfs = [pathlib.Path(f) for f in vcfs] tmp_path = pathlib.Path("validation-tmp") tmp_path.mkdir(exist_ok=True) - for f in files: + for f in vcfs: print(f) exploded = tmp_path / (f.name + ".exploded") if force or not exploded.exists(): From ff6822be862cb77a1f32b19f13357e1f4c9db70a Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 16 Feb 2024 14:13:23 +0000 Subject: [PATCH 5/6] Fix bug in format string handling --- bio2zarr/vcf.py | 6 +++++- tests/test_vcf.py | 25 +++++++++++++------------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py index a0ca3feb..8dae8d5a 100644 --- a/bio2zarr/vcf.py +++ b/bio2zarr/vcf.py @@ -1009,7 +1009,11 @@ def fixed_field_spec( shape.append(n) chunks.append(chunk_width), dimensions.append("samples") - if field.summary.max_number > 1: + if field.category == "FORMAT" and field.vcf_type == "String": + # FIXME not handling format string values very well right now + # as the max_number value is just the number of samples + pass + elif field.summary.max_number > 1: shape.append(field.summary.max_number) dimensions.append(field.name) variable_name = prefix + field.name diff --git a/tests/test_vcf.py b/tests/test_vcf.py index b3845b84..240b270d 100644 --- a/tests/test_vcf.py +++ b/tests/test_vcf.py @@ -231,15 +231,16 @@ def test_no_genotypes(self, ds, tmp_path): xt.assert_equal(ds[col], ds2[col]) -class TestByValidating: - def test_sample(self, tmp_path): - path = "tests/data/vcf/sample.vcf.gz" - out = tmp_path / "example.vcf.zarr" - vcf.convert_vcf([path], out) - vcf.validate(path, out) - - def test_sample_no_genotypes(self, tmp_path): - path = "tests/data/vcf/sample_no_genotypes.vcf.gz" - out = tmp_path / "example.vcf.zarr" - vcf.convert_vcf([path], out) - vcf.validate(path, out) +@pytest.mark.parametrize( + "name", + [ + "sample.vcf.gz", + "sample_no_genotypes.vcf.gz", + # "info_field_type_combos.vcf.gz", + ], +) +def test_by_validating(name, tmp_path): + path = f"tests/data/vcf/{name}" + out = tmp_path / "test.zarr" + vcf.convert_vcf([path], out) + vcf.validate(path, out) From e473d2adc4e6a933041f2ba1ff180f56ddbd9681 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 16 Feb 2024 14:46:49 +0000 Subject: [PATCH 6/6] Add more 1000 genomes data --- validation-data/Makefile | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/validation-data/Makefile b/validation-data/Makefile index bb8bf3ad..7d540d78 100644 --- a/validation-data/Makefile +++ b/validation-data/Makefile @@ -1,13 +1,29 @@ all: 1kg_2020_chr20.bcf.csi \ 1kg_2020_chr20.vcf.gz.tbi \ 1kg_2020_chr20_annotations.vcf.gz.tbi \ - 1kg_2020_chr20_annotations.bcf.csi + 1kg_2020_chr20_annotations.bcf.csi\ + 1kg_2020_others.bcf.csi \ + 1kg_2020_others.vcf.gz.tbi \ + 1kg_p3_all_chr1.bcf.csi \ + 1kg_p3_all_chr1.vcf.gz.tbi\ + 1kg_p1_all_chr6.vcf.bcf.csi\ + 1kg_p1_all_chr6.vcf.gz.tbi .PRECIOUS: %.bcf .PRECIOUS: %.vcf.gz -1KG_2020_GENOTYPES_URL="http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.vcf.gz" -1KG_2020_ANNOTATIONS_URL="http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chr20.recalibrated_variants.annotated.vcf.gz" +# Modern 1000 genomes data +1KG_2020_BASE=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_ +1KG_2020_GENOTYPES_URL="${1KG_2020_BASE}chr20.recalibrated_variants.vcf.gz" +1KG_2020_ANNOTATIONS_URL="${1KG_2020_BASE}chr20.recalibrated_variants.annotated.vcf.gz" +1KG_2020_OTHERS_URL="${1KG_2020_BASE}others.recalibrated_variants.vcf.gz" + +# 1000 genomes phase 3 +1KG_P3_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz +1KG_P3_SNP_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.chr1.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz + +# 1000 genomes phase 1 +1KG_P1_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/integrated_call_sets/ALL.chr6.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz %.vcf.gz.tbi: %.vcf.gz @@ -22,8 +38,20 @@ all: 1kg_2020_chr20.bcf.csi \ 1kg_2020_chr20.vcf.gz: bcftools view ${1KG_2020_GENOTYPES_URL} | head -n 10000 | bcftools view -O z -o $@ +1kg_2020_others.vcf.gz: + bcftools view ${1KG_2020_OTHERS_URL} | head -n 10000 | bcftools view -O z -o $@ + 1kg_2020_chr20_annotations.vcf.gz: bcftools view ${1KG_2020_ANNOTATIONS_URL} | head -n 100000 | bcftools view -O z -o $@ +1kg_p3_all_chr1.vcf.gz: + bcftools view ${1KG_P3_ALL_URL} | head -n 500000 | bcftools view -O z -o $@ + +1kg_p3_snp_chr1.vcf.gz: + bcftools view ${1KG_P3_SNP_URL} | head -n 500000 | bcftools view -O z -o $@ + +1kg_p1_all_chr6.vcf.gz: + bcftools view ${1KG_P1_ALL_URL} | head -n 10000 | bcftools view -O z -o $@ + clean: rm -f *.vcf.gz *.tbi *.bcf *.csi