In [None]:
import pyarrow.parquet as pq, pandas as pd, numpy as np, gc, time
from pathlib import Path
import os
import tables
# Let Blosc use every available core for (de)compression; os.cpu_count() may
# return None on some platforms, so fall back to a single thread.
tables.parameters.MAX_BLOSC_THREADS = os.cpu_count() or 1

# Source / destination files.
IN_PARQUET = Path("data.parquet")
OUT_H5 = Path("datacomplevel1.h5")

# Number of row groups buffered in memory before each HDF5 append.
ROW_GROUPS_PER_WRITE = 4

# Open the Parquet file (memory-mapped) and find out how many row groups
# have to be streamed.
pq_file = pq.ParquetFile(IN_PARQUET, memory_map=True)
num_row_groups = pq_file.num_row_groups

# Stream the row groups into a single HDF5 table node named "train".
#   "w"          -> create / overwrite the HDF5 file
#   "blosc:zstd" -> Zstandard codec running inside Blosc
#   complevel=1  -> compression level (0-9): 1 is the fastest with the
#                   lightest compression, 9 the slowest with the heaviest
t0 = time.time()
buf = []
with pd.HDFStore(OUT_H5, "w", complib="blosc:zstd", complevel=1) as s:
    for i in range(num_row_groups):
        # Load one row group into a pandas DataFrame.
        df = pq_file.read_row_group(i).to_pandas()

        # Convert the raw bytes identifier into its hex string form.
        # NOTE(review): assumes every line_ID has the same byte length, so the
        # string column width fixed by the first append fits later batches.
        df["line_ID"] = df["line_ID"].apply(bytes.hex)

        # Force the label columns to signed int8, using -1 for "missing".
        for col in ["target", "test"]:
            df[col] = df[col].fillna(-1).astype(np.int8)

        # Stage the chunk.
        buf.append(df)

        # Flush once the buffer is full or the last row group was staged.
        if len(buf) == ROW_GROUPS_PER_WRITE or i == num_row_groups - 1:
            # Concatenate BEFORE appending so that every buffered row group is
            # written, not just the most recently read one. The batch is
            # passed inline so no reference to it outlives the write.
            s.append(
                "train",
                pd.concat(buf, ignore_index=True),
                data_columns=True,
                index=False,
            )
            buf.clear()  # release the staged frames

        elapsed = time.time() - t0
        print(f"DONE row-group {i+1}/{num_row_groups}  |  elapsed {elapsed/60:.1f} min")

# Report the total conversion time.
total = time.time() - t0
print(f"\nParquet → HDF5 completed in {total/60:.1f} minutes")

DONE row-group 1/169  |  elapsed 0.0 min
DONE row-group 2/169  |  elapsed 0.0 min
DONE row-group 3/169  |  elapsed 0.0 min
DONE row-group 4/169  |  elapsed 0.0 min
DONE row-group 5/169  |  elapsed 0.0 min
DONE row-group 6/169  |  elapsed 0.0 min
DONE row-group 7/169  |  elapsed 0.0 min
DONE row-group 8/169  |  elapsed 2.0 min
DONE row-group 9/169  |  elapsed 2.0 min
DONE row-group 10/169  |  elapsed 2.0 min
DONE row-group 11/169  |  elapsed 2.0 min
DONE row-group 12/169  |  elapsed 4.0 min
DONE row-group 13/169  |  elapsed 4.0 min
DONE row-group 14/169  |  elapsed 4.0 min
DONE row-group 15/169  |  elapsed 4.0 min
DONE row-group 16/169  |  elapsed 5.9 min
DONE row-group 17/169  |  elapsed 5.9 min
DONE row-group 18/169  |  elapsed 5.9 min
DONE row-group 19/169  |  elapsed 5.9 min
DONE row-group 20/169  |  elapsed 7.8 min
DONE row-group 21/169  |  elapsed 7.8 min
DONE row-group 22/169  |  elapsed 7.8 min
DONE row-group 23/169  |  elapsed 7.8 min
DONE row-group 24/169  |  elapsed 9.8 min
D