In [10]:
from dask.distributed import Client, progress, get_worker
import os
import binascii
# Connect to the Dask scheduler. The address can be overridden through the
# DASK_SCHEDULER_ADDRESS environment variable so the notebook can be pointed at
# another cluster without editing this cell; the original address is the default.
client = Client(os.environ.get("DASK_SCHEDULER_ADDRESS", "tcp://131.180.106.138:8786"))

In [11]:
# Path of the input file that is read, split into chunks and compressed.
FILE_TO_BE_COMPRESSED = "sample.txt"
# Path the combined compressed output of all workers is written to.
FINAL_COMPRESSED_FILE = "compressed.gz"

In [12]:
def run_on_worker(file_bytes, index):
    """Compress one chunk of the input on a Dask worker using xfZlib.

    The chunk is written to ``_worker/temp-{index}.txt`` (``compress_file``
    works on file paths), compressed into ``_worker/temp-{index}.gz``, and the
    compressed bytes are read back and patched with a CRC32 of the
    uncompressed chunk. Both temporary files are left on disk.

    Args:
        file_bytes: Raw bytes of the chunk to compress.
        index: Position of the chunk in the original input; used to name the
            temporary files and echoed back in the result.

    Returns:
        dict with keys ``'index'`` (the ``index`` argument), ``'data'`` (the
        patched compressed bytes) and ``'size'`` (the value returned by
        ``compress_file``). Note that ``'data'`` is 5 bytes shorter than the
        compressed file that was read back, so ``'size'`` is not
        ``len(result['data'])``.
    """
    # Imported locally so the function is self-contained when it is shipped to
    # a worker process and does not rely on the notebook's global imports.
    import binascii
    import os

    from xfZlibWrapper import xfZlibWrapper

    # Write received data to a file, since xfZlib::compress_file expects a file path
    work_dir = "_worker"
    TEMP_FILE_ON_WORKER = f"{work_dir}/temp-{index}.txt"
    TEMP_COMPRESSED_FILE_ON_WORKER = f"{work_dir}/temp-{index}.gz"
    os.makedirs(work_dir, exist_ok=True)
    # Context manager guarantees the chunk is flushed and closed before the
    # compressor opens the same path.
    with open(TEMP_FILE_ON_WORKER, 'wb') as chunk_file:
        chunk_file.write(file_bytes)

    xfZlib = xfZlibWrapper(xclbin_path = b"./build/xclbin_xilinx_u50_gen3x16_xdma_201920_3_sw_emu/compress_decompress.xclbin", libzso_path = "build/libz.so")
    size = xfZlib.compress_file(TEMP_FILE_ON_WORKER, TEMP_COMPRESSED_FILE_ON_WORKER)
    print(f'Compressed from {os.path.getsize(TEMP_FILE_ON_WORKER)} to {size} bytes')

    # Add CRC32 since the library doesn't add one (https://xilinx.github.io/Vitis_Libraries/data_compression/2020.1/source/L2/gzip.html?highlight=crc)
    with open(TEMP_COMPRESSED_FILE_ON_WORKER, "rb") as compressed_file:
        data = compressed_file.read()
    crc_bytes = binascii.crc32(file_bytes).to_bytes(4, 'little')
    # Overwrite bytes [-13:-9] with the CRC, keep bytes [-9:-5] and drop the
    # final 5 bytes.
    # NOTE(review): presumably [-9:-5] is the gzip ISIZE field and the last
    # 5 bytes are extra output from the library -- confirm against the xfZlib
    # output format before changing these offsets.
    data = data[:-13] + crc_bytes + data[-9:-5]

    return {
        'index': index,
        'data': data,
        'size': size
    }

In [13]:
import time
t0 = time.time()

num_of_workers = len(client.scheduler_info()["workers"])
if num_of_workers == 0:
    # Without this guard the chunk-size division below fails with an
    # unhelpful ZeroDivisionError.
    raise RuntimeError("No Dask workers are connected to the scheduler")

# Split up the file into equal sized chunks based on number of available dask workers
print("Splitting input file into", num_of_workers, "chunk(s)")
with open(FILE_TO_BE_COMPRESSED, "rb") as ifile:
    total = ifile.read()

# Integer division keeps the chunk size exact for any file length.
chunk_size = len(total) // num_of_workers
# Chunk i covers [i * chunk_size, (i + 1) * chunk_size); the last chunk also
# takes the remainder. If the file has fewer bytes than there are workers,
# the leading chunks are empty.
boundaries = [i * chunk_size for i in range(num_of_workers)] + [len(total)]
data_split = [total[lo:hi] for lo, hi in zip(boundaries, boundaries[1:])]

# Scatter the data to the workers before calling run_on_worker on the workers
distributed_data = client.scatter(data_split)
futures = client.map(run_on_worker, distributed_data, range(num_of_workers))
results = client.gather(futures)
print("Received data from workers")

# Reorder the response based on original input order
results.sort(key = lambda result: result['index'])

# Each worker's output is written back-to-back into the final file, and also
# saved on its own as _part<index>.gz.
print("Writing combined (compressed) data to " + FINAL_COMPRESSED_FILE)
with open(FINAL_COMPRESSED_FILE, "wb") as f:
    for result in results:
        f.write(result['data'])
        with open('_part' + str(result['index']) + '.gz', 'wb') as fw:
            fw.write(result['data'])


t1 = time.time()
print("TOTAL EXECUTION TIME (in s): ", t1 - t0)

Splitting input file into 3 chunk(s)
Received data from workers
Writing combined (compressed) data to compressed.gz
TOTAL EXECUTION TIME (in s):  2.4881649017333984


In [14]:

import subprocess

# Validate the result: decompress the combined file with the system `gzip`
# tool and compare the output byte-for-byte with the original input.
FILE_COPY = FILE_TO_BE_COMPRESSED + ".copy"
# Shown to the user only; the equivalent command is executed below without a
# shell, so file names containing spaces or shell metacharacters are safe.
COMMAND_TO_RUN = "gzip -dc " + FINAL_COMPRESSED_FILE + " > " + FILE_COPY
print("Extracting", FINAL_COMPRESSED_FILE, "using command: ")
print(COMMAND_TO_RUN)
with open(FILE_COPY, "wb") as extracted:
    gzip_status = subprocess.run(["gzip", "-dc", FINAL_COMPRESSED_FILE], stdout=extracted).returncode
if gzip_status != 0:
    # Report instead of raising: the byte comparison below is the actual
    # verdict, and gzip can exit non-zero after writing usable output.
    print("gzip exited with status", gzip_status)
print("Comparing", FILE_COPY, "to", FILE_TO_BE_COMPRESSED)
with open(FILE_TO_BE_COMPRESSED, 'rb') as f1, open(FILE_COPY, 'rb') as f2:
    if f1.read() == f2.read():
        print("Validation succeeded !!")
    else:
        print("Validation failed !!")

Extracting compressed.gz using command: 
gzip -dc compressed.gz > sample.txt.copy
Comparing sample.txt.copy to sample.txt
Validation succeeded !!
