In [None]:
import tables, os
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import pyarrow.dataset as ds
from pathlib import Path
import time
# Source Parquet file and destination HDF5 file for the conversion.
IN_PARQUET = Path("data.parquet")
OUT_H5 = Path("data2.h5")

# Open the Parquet file memory-mapped and record how many row groups
# will be streamed through the loop below.
pq_file = pq.ParquetFile(IN_PARQUET, memory_map=True)
num_row_groups = pq_file.num_row_groups

# Hand PyTables the CPU count as the Blosc thread limit.
tables.parameters.MAX_BLOSC_THREADS = os.cpu_count()

t0 = time.time()
with pd.HDFStore(OUT_H5, "w", complib="blosc:zstd", complevel=4) as store:
    # Row groups accumulate here and are written out four at a time.
    buf = []
    for i in range(num_row_groups):
        table = pq_file.read_row_group(i)
        df = table.to_pandas()

        # Store line_ID as fixed-width 16-byte strings.
        df["line_ID"] = df["line_ID"].astype("S16")

        # Labels: missing values become -1, then everything is cast to int8.
        for col in ("target", "test"):
            filled = df[col].fillna(-1)
            df[col] = filled.astype(np.int8)

        buf.append(df)

        # Flush on a full batch of four row groups, or on the final row
        # group so that a short trailing batch is still written.
        is_last = i == num_row_groups - 1
        if is_last or len(buf) == 4:
            batch = pd.concat(buf)
            store.append("train", batch, index=False)
            buf.clear()

        elapsed = time.time() - t0
        print(f"DONE row-group {i+1}/{num_row_groups}  |  elapsed {elapsed/60:.1f} min")

DONE row-group 1/169  |  elapsed 0.0 min
DONE row-group 2/169  |  elapsed 0.0 min
DONE row-group 3/169  |  elapsed 0.0 min
DONE row-group 4/169  |  elapsed 0.0 min
DONE row-group 5/169  |  elapsed 0.0 min
DONE row-group 6/169  |  elapsed 0.0 min
DONE row-group 7/169  |  elapsed 0.0 min
DONE row-group 8/169  |  elapsed 0.0 min
DONE row-group 9/169  |  elapsed 0.0 min
DONE row-group 10/169  |  elapsed 0.0 min
DONE row-group 11/169  |  elapsed 0.0 min
DONE row-group 12/169  |  elapsed 0.1 min
DONE row-group 13/169  |  elapsed 0.1 min
DONE row-group 14/169  |  elapsed 0.1 min
DONE row-group 15/169  |  elapsed 0.1 min
DONE row-group 16/169  |  elapsed 0.1 min
DONE row-group 17/169  |  elapsed 0.1 min
DONE row-group 18/169  |  elapsed 0.1 min
DONE row-group 19/169  |  elapsed 0.1 min
DONE row-group 20/169  |  elapsed 0.1 min
DONE row-group 21/169  |  elapsed 0.1 min
DONE row-group 22/169  |  elapsed 0.1 min
DONE row-group 23/169  |  elapsed 0.1 min
DONE row-group 24/169  |  elapsed 0.1 min
D