In [1]:
import glob
import os
import humanfriendly
import pandas as pd
from pathlib import Path
from bio2zarr.vcf2zarr import vcz

#### Data sources

In [10]:
# Base locations are read from the environment; an unset variable raises
# KeyError here rather than failing later with a confusing path error.
WORKDIR = Path(os.environ["WORKDIR"])
VCFDIR = Path(os.environ["VCFDIR"])

# Glob pattern (not a concrete path) selecting the compressed VCF inputs.
VCF_FILE_PATTERN = VCFDIR.joinpath("PA_chr*.vcf.gz")

# Conversion outputs, all rooted at WORKDIR/results.
ICF_DIR = WORKDIR.joinpath("results", "icf", "spruce.all.vcf.gz.icf")
ZARR_DIR = WORKDIR.joinpath("results", "vcz", "spruce.all.vcf.gz.vcz")
ZARR_W_SCHEMA_DIR = WORKDIR.joinpath("results", "vcz", "spruce.all.vcf.gz.json.vcz")

#### VCF size and partitions

In [5]:
# Expand the VCF glob pattern into the list of matching input files.
files = glob.glob(str(VCF_FILE_PATTERN))

# Sum the on-disk (compressed) size of every matched file, in bytes.
total_size = sum(map(os.path.getsize, files))

print(f"Total compressed VCF: {humanfriendly.format_size(total_size, binary=True)} across {len(files)} files")

Total compressed VCF: 7.63 TiB across 165 files


#### Inspect Zarr and ICF

##### Zarr output without schema

In [12]:
%%time
# Inspect the Zarr store that was written without an explicit schema.
# NOTE: this is expensive -- the recorded run took roughly 2.5 hours of wall
# time -- so the result is converted to a DataFrame and saved to CSV in the
# following cells rather than being recomputed.
zarrvcf_inspec = vcz.inspect(ZARR_DIR)

CPU times: user 2min 21s, sys: 31min 54s, total: 34min 15s
Wall time: 2h 26min 30s


In [14]:
# Wrap the inspection result in a DataFrame so it can be exported as a table.
# NOTE(review): presumably one row per stored array/field -- confirm against
# the bio2zarr documentation for `inspect`.
zarrdf = pd.DataFrame(zarrvcf_inspec)

In [15]:
# Persist the schema-less Zarr inspection table. The path is relative, so the
# file lands in the kernel's current working directory; index=False leaves the
# DataFrame's row index out of the CSV.
zarrdf.to_csv("spruce_zarr_inspect.csv", index=False)

##### Zarr output with schema

In [16]:
%%time
# Inspect the Zarr store that was written with an explicit schema.
# NOTE(review): this rebinds `zarrvcf_inspec`, discarding the schema-less
# result held under the same name; re-running cells out of order can therefore
# export the wrong table. The recorded run took about 2h 44min of wall time.
zarrvcf_inspec = vcz.inspect(ZARR_W_SCHEMA_DIR)

CPU times: user 2min 35s, sys: 36min 54s, total: 39min 29s
Wall time: 2h 44min 9s


In [17]:
# Tabulate the with-schema inspection result.
# NOTE(review): `zarrdf` is reused from the schema-less run, so after this
# cell it no longer refers to the table saved as spruce_zarr_inspect.csv.
zarrdf = pd.DataFrame(zarrvcf_inspec)

In [18]:
# Persist the with-schema Zarr inspection table to the current working
# directory, without the DataFrame's row index.
zarrdf.to_csv("spruce_zarr_json_inspect.csv", index=False)

##### ICF output

In [19]:
%%time
# Inspect the intermediate columnar format (ICF) directory. Unlike the Zarr
# inspections, the recorded run of this call finished in well under a second.
icf_inspec = vcz.inspect(ICF_DIR)

CPU times: user 4.16 ms, sys: 5.47 ms, total: 9.63 ms
Wall time: 55.9 ms


In [21]:
# Tabulate the ICF inspection result and save it next to the Zarr reports,
# omitting the DataFrame's row index from the CSV.
icfdf = pd.DataFrame(data=icf_inspec)
icfdf.to_csv(path_or_buf="spruce_icf_inspect.csv", index=False)

#### Count Zarr files

In [26]:
! find {ZARR_DIR} | wc -l

14980847


In [27]:
! find {ZARR_W_SCHEMA_DIR} | wc -l

16104404
