In [6]:
import os
import zipfile

import soundfile as sf
import zarr

In [63]:
# Folder whose .wav recordings are converted by the loop further down.
# NOTE(review): both paths are absolute and machine-specific; adjust before re-running elsewhere.
input_folder = r'D:\2022_10_Mälaren\RTsys1\2022-10-01'
# Zip archive that backs the zarr database created from those recordings.
output_file = r'D:\Compression test\compressed_data_test_zipcompress.zarr.zip'

In [75]:
# Open (or create) the zip-backed zarr group that receives the converted recordings.
# DEFLATE is requested at the zip level, in addition to whatever compressor each
# zarr array uses itself.
# The constant comes straight from the stdlib `zipfile` module: `zarr.storage.zipfile`
# only resolves because zarr's storage module happens to import `zipfile` internally.
database = zarr.group(
    store=zarr.storage.ZipStore(output_file, compression=zipfile.ZIP_DEFLATED),
)

In [76]:
# Show zarr's summary (dtype, shape, chunking, compressor, store type) for one stored recording.
database['channelA_2022-10-01_00-10-56'].info

0,1
Name,/channelA_2022-10-01_00-10-56
Type,zarr.core.Array
Data type,float32
Shape,"(335609856,)"
Chunk shape,"(655488,)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.ZipStore
No. bytes,1342439424 (1.3G)


In [66]:
# Display the zip file object behind the store (its repr shows the filename and open mode).
database.store.zf

<zipfile.ZipFile filename='D:\\Compression test\\compressed_data_test.zarr.zip' mode='a'>

In [68]:
# Convert every WAV recording in `input_folder` into one zarr array per file,
# named after the file without its extension.
# `raw_bytes` accumulates the on-disk size of the files that were stored; it is
# used later to compute the end-to-end compression ratio.
raw_bytes = 0
# Sorted so files are processed and logged in a reproducible order
# (os.listdir returns entries in arbitrary order).
for file in sorted(os.listdir(input_folder)):
    base, ext = os.path.splitext(file)
    # Case-insensitive comparison so recordings named e.g. '*.WAV' are not skipped.
    if ext.lower() != '.wav':
        print(f'Skipping file {file}')
        continue
    path = os.path.join(input_folder, file)
    print(f'Reading file {file}')
    # Samples are decoded to float32 regardless of the on-disk sample format.
    # NOTE(review): `samplerate` is read but not persisted anywhere in this cell.
    raw_data, samplerate = sf.read(path, dtype='float32')
    print(f'Storing file {file}')
    database.array(name=base, data=raw_data)
    # Count the file only after it has been stored, so a failure while reading or
    # writing does not inflate the total used for the compression ratios.
    raw_bytes += os.path.getsize(path)
print('Done')

Reading file channelA_2022-10-01_00-10-56.wav
Storing file channelA_2022-10-01_00-10-56.wav
Reading file channelA_2022-10-01_00-54-38.wav
Storing file channelA_2022-10-01_00-54-38.wav
Reading file channelA_2022-10-01_01-38-20.wav
Storing file channelA_2022-10-01_01-38-20.wav
Reading file channelA_2022-10-01_02-22-02.wav
Storing file channelA_2022-10-01_02-22-02.wav
Reading file channelA_2022-10-01_03-05-44.wav
Storing file channelA_2022-10-01_03-05-44.wav
Reading file channelA_2022-10-01_03-49-26.wav
Storing file channelA_2022-10-01_03-49-26.wav
Reading file channelA_2022-10-01_04-33-07.wav
Storing file channelA_2022-10-01_04-33-07.wav
Reading file channelA_2022-10-01_05-16-49.wav
Storing file channelA_2022-10-01_05-16-49.wav
Reading file channelA_2022-10-01_06-00-31.wav
Storing file channelA_2022-10-01_06-00-31.wav
Reading file channelA_2022-10-01_06-44-12.wav
Storing file channelA_2022-10-01_06-44-12.wav
Reading file channelA_2022-10-01_07-27-54.wav
Storing file channelA_2022-10-01_0

In [17]:
# Display whatever `arr` is currently bound to in the kernel.
# NOTE(review): no cell above this one assigns `arr`; this relies on leftover kernel state.
arr

('channelA_2022-10-01_00-10-56',
 <zarr.core.Array '/channelA_2022-10-01_00-10-56' (335609856,) float32>)

In [69]:
# Size bookkeeping for the database: uncompressed array bytes versus bytes
# actually occupied in the store, compared against the size of the source files.
stored_arrays = [array for _name, array in database.arrays()]
read_bytes = sum(array.nbytes for array in stored_arrays)
stored_bytes = sum(array.nbytes_stored for array in stored_arrays)

# Sizes are reported in decimal gigabytes (1e9 bytes).
print(f'Originally {raw_bytes/1e9:.2f} GB')
print(f'Read as {read_bytes/1e9:.2f} GB')
print(f'Stored using {stored_bytes/1e9:.2f} GB')
# "Internal" compares against the decoded arrays, "end-to-end" against the source files.
print(f'Internal compression ratio: {read_bytes / stored_bytes}')
print(f'End-to-end compression ratio: {raw_bytes / stored_bytes}')


Originally 33.23 GB
Read as 44.30 GB
Stored using 14.45 GB
Internal compression ratio: 3.06516450551826
End-to-end compression ratio: 2.298874548177833


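Using `zarr.group(store=zarr.storage.ZipStore(output_file))` (default `compression=0`, i.e. no zip-level compression; only zarr's own array compressor is active):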

- Originally 33.23 GB
- Read as 44.30 GB
- Stored using 22.27 GB
- Internal compression ratio: 1.989059101644946
- End-to-end compression ratio: 1.4917950848513737

Using `zarr.group(store=zarr.storage.ZipStore(output_file, compression=zarr.storage.zipfile.ZIP_DEFLATED))` (zip-level DEFLATE applied on top of zarr's own array compressor):

- Originally 33.23 GB
- Read as 44.30 GB
- Stored using 14.45 GB
- Internal compression ratio: 3.06516450551826
- End-to-end compression ratio: 2.298874548177833

In [73]:
# Close the zip file behind the database store.
# NOTE(review): presumably needed so the archive is finalised before it is read elsewhere — confirm against the zarr ZipStore docs.
database.store.close()

In [8]:
# Load one recording as 16-bit integer samples; `fs` receives the second value returned by sf.read.
# NOTE(review): `input_file` is not defined in any visible cell — it must come from hidden kernel state.
input_data, fs = sf.read(input_file, dtype='int16')

In [134]:
import numcodecs

# Write `input_data` as a single zarr array into a zip-backed store, using a
# Blosc compressor configured for zstd at level 5 with the BITSHUFFLE filter.
# NOTE(review): this rebinds `output_file`, and the name ends in '.zarr' although
# a ZipStore is used.
output_file = 'compression_test_2.zarr'
compressor = numcodecs.Blosc(
    cname='zstd',
    clevel=5,
    shuffle=numcodecs.Blosc.BITSHUFFLE,
)
# Alternative backend tried earlier: zarr.storage.DirectoryStore(output_file)
storage = zarr.storage.ZipStore(output_file)
arr = zarr.array(
    data=input_data,
    store=storage,
    compressor=compressor,
)

In [31]:
# Interactive help lookup (IPython `?` syntax) for the ZipStore constructor and docstring.
zarr.storage.ZipStore?

Init signature:
zarr.storage.ZipStore(
    path,
    compression=0,
    allowZip64=True,
    mode='a',
    dimension_separator=None,
)
Docstring:
Storage class using a Zip file.

Parameters
----------
path : string
    Location of file.
compression : integer, optional
    Compression method to use when writing to the archive.
allowZip64 : bool, optional
    If True (the default) will create ZIP files that use the ZIP64
    extensions when the zipfile is larger than 2 GiB. If False
    will raise an exception when the ZIP file would require ZIP64
    extensions.
mode : string, optional
    One of 'r' to read an existing fil

In [48]:
# Close the zip file behind the database store.
# NOTE(review): presumably needed so the archive is finalised before it is read elsewhere — confirm against the zarr ZipStore docs.
database.store.close()

In [59]:
# Open an existing zarr zip archive for inspection.
# NOTE(review): the path is absolute and user-specific; it will not resolve on another machine.
calibration_path = r'C:\Users\carl4189\OneDrive - IVL Svenska Miljöinstitutet AB\Silent at Sea\Genomförande\WP4 Buller i vatten\Kalibrering\nidaq_compressed\2022-07-12_17-28-36.zarr.zip'
calibration_data = zarr.open(calibration_path)

In [60]:
# Summarise the opened calibration array (dtype, shape, chunks, compressor, stored size).
calibration_data.info

0,1
Type,zarr.core.Array
Data type,float64
Shape,"(2, 2200000)"
Chunk shape,"(1, 200000)"
Order,C
Read-only,False
Compressor,
Store type,zarr.storage.ZipStore
No. bytes,35200000 (33.6M)
No. bytes stored,2519154 (2.4M)


In [62]:
# Zip-level compression method configured on the store backing the calibration array.
calibration_data.store.compression

0

In [41]:
# Dump the '.zarray' metadata document stored at the root of the zip archive.
zarray_text = storage.zf.read('.zarray').decode()
print(zarray_text)

{
    "chunks": [
        1012403
    ],
    "compressor": {
        "blocksize": 0,
        "clevel": 5,
        "cname": "lz4",
        "id": "blosc",
        "shuffle": 1
    },
    "dtype": "<i2",
    "fill_value": 0,
    "filters": null,
    "order": "C",
    "shape": [
        1036700496
    ],
    "zarr_format": 2
}


In [42]:
import numcodecs
import numpy as np

In [106]:
# Round-trip every value of a signed `bits`-bit sample grid through a BitRound
# codec and report the largest absolute error it introduces.
np.set_printoptions(precision=None)
bits = 24
# All signed `bits`-bit integers, normalised by 2**(bits - 1) so they span [-1, 1).
x = np.arange(-2**(bits - 1), 2**(bits - 1)) / 2**(bits - 1)
# Alternative codec tried here: numcodecs.Quantize(digits=bits - 18, dtype='f8').
# It used to be assigned to `codec` and immediately overwritten by the line below,
# so the dead assignment was dropped.
codec = numcodecs.BitRound(keepbits=bits - 16)
y = codec.decode(codec.encode(x))
# Worst-case absolute round-trip error (displayed as the cell output).
np.max(np.abs(x - y))

0.0

In [131]:
# Round-trip the largest positive value of the normalised `bits`-bit grid through
# BitRound and express the result back in integer sample units.
bits = 24
scale = 1 << (bits - 1)
lsb = 1 / scale
# One least-significant step below 1.0.
value = 1 - lsb
codec = numcodecs.BitRound(keepbits=bits - 8)
encoded = codec.encode(np.array(value))
codec.decode(encoded) * scale

8388608.0

In [123]:
# Check the `keepbits` setting of whichever `codec` object is currently bound in the kernel.
codec.keepbits

0

In [97]:
# Second element of the round-tripped array `y`, multiplied by 2**23 to express it in integer sample units.
y[1] * 2**23

-8388608.0