In [42]:
import cupy as cp
import numpy as np
import pandas as pd
import time
import zarr

import kvikio.nvcomp


In [43]:
# conda install -c conda-forge zarr

In [44]:
HOST_LZ4_MAX = 2013929216 # 2113929216
sizes = list(map(lambda x: HOST_LZ4_MAX//(2**x), np.arange(20)))
print(sizes)

[2013929216, 1006964608, 503482304, 251741152, 125870576, 62935288, 31467644, 15733822, 7866911, 3933455, 1966727, 983363, 491681, 245840, 122920, 61460, 30730, 15365, 7682, 3841]


In [45]:
input_size = []
cascaded_size = []
cascaded_temp_size = []
cascaded_round_trip_time = []
lz4_gpu_size = []
lz4_gpu_temp_size = []
lz4_gpu_round_trip_time = []
bitcomp_gpu_size = []
bitcomp_gpu_temp_size = []
bitcomp_gpu_round_trip_time = []
lz4_size = []
lz4_round_trip_time = []

In [46]:
!wget http://textfiles.com/etext/NONFICTION/kjv10.txt

821.61s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


--2023-09-29 13:44:43--  http://textfiles.com/etext/NONFICTION/kjv10.txt
Resolving textfiles.com (textfiles.com)... 208.86.224.90
Connecting to textfiles.com (textfiles.com)|208.86.224.90|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4432803 (4.2M) [text/plain]
Saving to: ‘kjv10.txt.2’


2023-09-29 13:44:58 (304 KB/s) - ‘kjv10.txt.2’ saved [4432803/4432803]



In [47]:
text = open('kjv10.txt').read()
bib = np.frombuffer(bytes(text, 'utf-8'), dtype=np.int8)
data_buffer = np.tile(bib, 500)

In [48]:
# One of the three below keys, this will set the arrangement of test data for a full run of the notebook.
TARGET = "Ascending"
DTYPE = cp.int32

In [49]:
data = {
    "Ascending": np.arange(0, HOST_LZ4_MAX, dtype=np.int32),
    "Random": np.random.randint(0, 100, HOST_LZ4_MAX, dtype=np.int32),
    "Text": data_buffer
}

In [50]:
def get_host_data(offset, dtype):
    exemplar = np.array([1], dtype=dtype)
    print(offset)
    print(exemplar.itemsize)
    print(data[TARGET].itemsize)
    index = offset // data[TARGET].itemsize
    index = index - (index % exemplar.itemsize)
    print(index)
    return data[TARGET][0:index].view(dtype)

In [51]:
input_size = []
cascaded_size = []
cascaded_temp_size = []
cascaded_round_trip_time = []
lz4_gpu_size = []
lz4_gpu_temp_size = []
lz4_gpu_round_trip_time = []
lz4_size = []
lz4_round_trip_time = []
for size in sizes:
    data_host = get_host_data(size, DTYPE)
    data_gpu = cp.array(data_host)
    """Cascaded GPU"""
    t_gpu = time.time()
    compressor = kvikio.nvcomp.CascadedManager(dtype=data_gpu.dtype)
    compressed = compressor.compress(data_gpu)
    output_size = compressed.nbytes

    decompressed = compressor.decompress(compressed)
    decompressed_size = decompressed.size * decompressed.itemsize
    input_size.append(data_gpu.size * data_gpu.itemsize)
    cascaded_round_trip_time.append(time.time() - t_gpu)
    cascaded_size.append(output_size)
    print('-----')
    print('Input size: ', data_gpu.size * data_gpu.itemsize)
    print('Cascaded GPU compressor output size: ', output_size)
    print('Cascaded GPU decompressor output size: ', decompressed_size)
    print('Cascaded GPU compress/decompress round trip time: ',time.time() - t_gpu)
    
    del compressor
        
    """LZ4 Host"""
    lz4 = zarr.LZ4()
    t_host = time.time()
    host_compressed = lz4.encode(data_gpu.get())
    del data_gpu
    print(len(host_compressed))
    host_compressed = host_compressed[:2113929216]
    host_decompressed = lz4.decode(host_compressed)
    print('Lz4 zarr time: ', time.time() - t_host)
    print('Lz4 compressed size: ', len(host_compressed))
    lz4_size.append(len(host_compressed))
    lz4_round_trip_time.append(time.time() - t_host)

2013929216
4
4
503482304
-----
Input size:  2013929216
Cascaded GPU compressor output size:  33434464
Cascaded GPU decompressor output size:  2013929216
Cascaded GPU compress/decompress round trip time:  0.1076362133026123
2021826967
Lz4 zarr time:  4.681669235229492
Lz4 compressed size:  2021826967
1006964608
4
4
251741152
-----
Input size:  1006964608
Cascaded GPU compressor output size:  16717276
Cascaded GPU decompressor output size:  1006964608
Cascaded GPU compress/decompress round trip time:  0.11769247055053711
1010913478
Lz4 zarr time:  2.57978892326355
Lz4 compressed size:  1010913478
503482304
4
4
125870576
-----
Input size:  503482304
Cascaded GPU compressor output size:  8358716
Cascaded GPU decompressor output size:  503482304
Cascaded GPU compress/decompress round trip time:  0.05775332450866699
505456734
Lz4 zarr time:  1.2365527153015137
Lz4 compressed size:  505456734
251741152
4
4
62935288
-----
Input size:  251741152
Cascaded GPU compressor output size:  4179436
Cas

In [52]:
lz4_gpu_size = []
lz4_gpu_temp_size = []
lz4_gpu_round_trip_time = []
for size in sizes:
    data_host = get_host_data(size, DTYPE)
    data_gpu = cp.array(data_host)

    """LZ4 GPU"""
    data_gpu = cp.array(data_host)
    t_gpu = time.time()
    compressor = kvikio.nvcomp.LZ4Manager(dtype=data_gpu.dtype)
    compressed = compressor.compress(data_gpu)
    output_size = compressed.nbytes

    decompressed = compressor.decompress(compressed)
    decompressed_size = decompressed.size * decompressed.itemsize
    lz4_gpu_round_trip_time.append(time.time() - t_gpu)
    lz4_gpu_size.append(output_size)
    print('lz4 GPU compressor output size: ', output_size)
    print('lz4 GPU decompressor output size: ', decompressed_size)
    print('lz4 GPU compress/decompress round trip time: ',time.time() - t_gpu)

2013929216
4
4
503482304
lz4 GPU compressor output size:  2022340697
lz4 GPU decompressor output size:  2013929216
lz4 GPU compress/decompress round trip time:  0.7271463871002197
1006964608
4
4
251741152
lz4 GPU compressor output size:  1011170371
lz4 GPU decompressor output size:  1006964608
lz4 GPU compress/decompress round trip time:  0.36713171005249023
503482304
4
4
125870576
lz4 GPU compressor output size:  505585200
lz4 GPU decompressor output size:  503482304
lz4 GPU compress/decompress round trip time:  0.1900792121887207
251741152
4
4
62935288
lz4 GPU compressor output size:  252792621
lz4 GPU decompressor output size:  251741152
lz4 GPU compress/decompress round trip time:  0.09049177169799805
125870576
4
4
31467644
lz4 GPU compressor output size:  126396327
lz4 GPU decompressor output size:  125870576
lz4 GPU compress/decompress round trip time:  0.04643416404724121
62935288
4
4
15733820
lz4 GPU compressor output size:  63198181
lz4 GPU decompressor output size:  62935280


In [53]:
# zarr lz4 max buffer size is 264241152 int64s
# zarr lz4 max buffer size is 2113929216 bytes
# cascaded max buffer size is 2147483640 bytes
# cascaded max buffer size is 268435456 int64s

In [54]:
print(input_size)
print(cascaded_size)
print(cascaded_temp_size)
print(cascaded_round_trip_time)
print(lz4_gpu_size)
print(lz4_gpu_temp_size)
print(lz4_gpu_round_trip_time)
print(lz4_size)
print(lz4_round_trip_time)
df = pd.DataFrame({
    'Input Size (Bytes)': input_size,
    'cascaded_size': cascaded_size,
    'cascaded_round_trip_time': cascaded_round_trip_time,
    'lz4_gpu_size': lz4_gpu_size,
    'lz4_gpu_round_trip_time': lz4_gpu_round_trip_time,
    'lz4_size': lz4_size,
    'lz4_round_trip_time': lz4_round_trip_time
})

[2013929216, 1006964608, 503482304, 251741152, 125870576, 62935280, 31467632, 15733808, 7866896, 3933440, 1966720, 983360, 491680, 245840, 122912, 61456, 30720, 15360, 7680, 3840]
[33434464, 16717276, 8358716, 4179436, 2089796, 1044976, 522532, 261344, 130716, 65436, 32796, 16476, 8316, 4236, 2184, 1148, 632, 360, 224, 156]
[]
[0.10751104354858398, 0.11756682395935059, 0.05767321586608887, 0.028416156768798828, 0.014620304107666016, 0.007331132888793945, 0.004427194595336914, 0.0025060176849365234, 0.0017902851104736328, 0.0016641616821289062, 0.001974821090698242, 0.0013790130615234375, 0.0011060237884521484, 0.0014438629150390625, 0.0010533332824707031, 0.0008640289306640625, 0.001127481460571289, 0.0014081001281738281, 0.0011692047119140625, 0.0012063980102539062]
[2022340697, 1011170371, 505585200, 252792621, 126396327, 63198181, 31599109, 15799573, 7899801, 3949915, 1974981, 987514, 493774, 246904, 123459, 61745, 30907, 15498, 7787, 3940]
[]
[0.7270452976226807, 0.3670234680175781

In [55]:
### You'll need the following to display the upcoming plots. ###

# !conda install -c conda-forge plotly
# !npm install require

In [56]:
df['Cascaded Compression Ratio'] = df['Input Size (Bytes)'] / df['cascaded_size']
df['Lz4 Gpu Compression Ratio'] = df['Input Size (Bytes)'] / df['lz4_gpu_size']
df['Lz4 Host Compression Ratio'] = df['Input Size (Bytes)'] / df['lz4_size']
df['Cascaded Speedup'] = df['lz4_round_trip_time'] / df['cascaded_round_trip_time']
df['Lz4 Gpu Speedup'] = df['lz4_round_trip_time'] / df['lz4_gpu_round_trip_time']
print(df.columns)

Index(['Input Size (Bytes)', 'cascaded_size', 'cascaded_round_trip_time',
       'lz4_gpu_size', 'lz4_gpu_round_trip_time', 'lz4_size',
       'lz4_round_trip_time', 'Cascaded Compression Ratio',
       'Lz4 Gpu Compression Ratio', 'Lz4 Host Compression Ratio',
       'Cascaded Speedup', 'Lz4 Gpu Speedup'],
      dtype='object')


In [57]:
import plotly.express as px
title = 'Gpu Acceleration over Zarr Lz4 - ' + TARGET + " " + str(DTYPE)
subtitle = 'Includes host->gpu copy time'
fig = px.line(df, x='Input Size (Bytes)',
              y=['Cascaded Speedup', 'Lz4 Gpu Speedup'],
              labels={'value': 'Multiple Faster'},
              title=title)
fig.update_xaxes(type='category')
fig.show()

In [58]:
import plotly.express as px
title = 'Compression - ' + TARGET + " " + str(DTYPE)
fig = px.line(df, x='Input Size (Bytes)',
              y=[
                  'Lz4 Gpu Compression Ratio',
                  'Cascaded Compression Ratio',
                  'Lz4 Host Compression Ratio'
              ],
              labels={'value': 'Compression Factor'},
              title=title)
fig.update_xaxes(type='category')
fig.show()