### Zarr Compression Tests

See http://alimanfoo.github.io/2016/09/21/genotype-compression-benchmark.html for zarr benchmarks on genotyping data.

In [1]:
from dask.distributed import Client, progress
import dask.array as da
import matplotlib.pyplot as plt
import numpy as np
# Start a local Dask cluster: four single-threaded worker processes,
# each capped at 8GB of memory.
client = Client(
    n_workers=4,
    threads_per_worker=1,
    processes=True,
    memory_limit='8GB',
)
client

0,1
Client  Scheduler: tcp://127.0.0.1:35329  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 32.00 GB


In [2]:
%run codecs.py
client.register_worker_plugin(CodecPlugin())
None

In [3]:
# Seed dask's global random state so the synthetic matrix is reproducible.
da.random.seed(1)

data_shape = (100000, 5000)
chunk_shape = (10000, 5000)  # full-width row blocks -> 10 chunks

# Draw random samples, threshold them at 0.2 and store the resulting
# booleans as uint8 (0/1) values.
x = (da.random.random(data_shape, chunks=chunk_shape) > .2).astype(np.uint8)
x

Unnamed: 0,Array,Chunk
Bytes,500.00 MB,50.00 MB
Shape,"(100000, 5000)","(10000, 5000)"
Count,30 Tasks,10 Chunks
Type,uint8,numpy.ndarray
"Array Chunk Bytes 500.00 MB 50.00 MB Shape (100000, 5000) (10000, 5000) Count 30 Tasks 10 Chunks Type uint8 numpy.ndarray",5000  100000,

Unnamed: 0,Array,Chunk
Bytes,500.00 MB,50.00 MB
Shape,"(100000, 5000)","(10000, 5000)"
Count,30 Tasks,10 Chunks
Type,uint8,numpy.ndarray


In [4]:
%%time
# Test different compressor settings to make sure they actually take effect
from numcodecs import Blosc, LZ4
da.to_zarr(x, url='/tmp/snp_zarr_small_1', overwrite=True, filters=[PackGeneticBits()])
da.to_zarr(x, url='/tmp/snp_zarr_small_2', overwrite=True, filters=[PackGeneticBits()], compressor=None)
da.to_zarr(x, url='/tmp/snp_zarr_small_3', overwrite=True, filters=[PackGeneticBits()], compressor=LZ4())
# NOTE: Default compressor is Blosc(cname='lz4', clevel=5, shuffle=0)
da.to_zarr(x, url='/tmp/snp_zarr_small_4', overwrite=True, filters=[PackGeneticBits()], compressor=Blosc(cname='lz4', clevel=5, shuffle=0))

CPU times: user 1.09 s, sys: 156 ms, total: 1.25 s
Wall time: 11.9 s


In [5]:
!du -c -b /tmp/snp_zarr_small_*

96015978	/tmp/snp_zarr_small_1
125004395	/tmp/snp_zarr_small_2
93100140	/tmp/snp_zarr_small_3
96015978	/tmp/snp_zarr_small_4
410136491	total
