#### Rohan Bhatt, Shubhang Srikoti 
##### MSML605 -  Investigating the Impact of Storage Formats

Problem statement: How does the choice of storage format (CSV, Parquet, HDF5) impact the overall performance of a machine learning pipeline and its processes (data ingestion, memory overhead, time-to-train, and more).

In [None]:
#all possible imports:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import pyarrow as pa
import tables # for hdf5
import time, datetime, os, psutil
import xgboost as xgb
from pathlib import Path, PureWindowsPath
import gc

#importing data
import kagglehub

# Download latest version
if not os.path.exists("data.parquet"):
    path = kagglehub.dataset_download("jtbontinck/amex-parquet-file")
# print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Checking schema and data types:

In [None]:
pq_file = pq.ParquetFile("data.parquet")
print("Rows in file:", pq_file.metadata.num_rows)
print("Columns in file:", pq_file.metadata.num_columns)
print("Schema:", pq_file.schema)

Rows in file: 16895213
Columns in file: 193
Schema: <pyarrow._parquet.ParquetSchema object at 0x00000217C25E7980>
required group field_id=-1 duckdb_schema {
  optional fixed_len_byte_array(16) field_id=-1 line_ID (UUID);
  optional binary field_id=-1 customer_ID (String);
  optional int64 field_id=-1 date (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
  optional float field_id=-1 P_2;
  optional float field_id=-1 D_39;
  optional float field_id=-1 B_1;
  optional float field_id=-1 B_2;
  optional float field_id=-1 R_1;
  optional float field_id=-1 S_3;
  optional float field_id=-1 D_41;
  optional float field_id=-1 B_3;
  optional float field_id=-1 D_42;
  optional float field_id=-1 D_43;
  optional float field_id=-1 D_44;
  optional float field_id=-1 B_4;
  optional float field_id=-1 D_45;
  optional float field_id=-1 B_5;
  optional float field_id=-1 R_2;
  optional float field_id=-1 D_46;
  optional float fiel

Converting Parquet -> CSV (row based)

In [None]:
# in/out file paths
IN_FILE  = Path(r"data.parquet")
OUT_CSV  = Path(r"E:\ML\data.csv") # final single file

#opening the parquet file
pq_file = pq.ParquetFile(IN_FILE, memory_map=True)
n_rg = pq_file.num_row_groups
print(f"Row groups in file: {n_rg}")

# write loop
first_chunk = True
for rg in range(n_rg):
    # load one row group into Arrow Table (stays off heap)
    table = pq_file.read_row_group(rg)
    # convert to pandas
    df = table.to_pandas(types_mapper=pd.ArrowDtype)
    # write / append
    if first_chunk:
        df.to_csv(OUT_CSV, index=False, mode="w", header=True)
        first_chunk = False
    else:
        df.to_csv(OUT_CSV, index=False, mode="a", header=False)
    
    # free memory
    del df, table
    gc.collect()
    print(f"row-group {rg+1}/{n_rg} appended")

print("All done →", OUT_CSV) #10 min - 23 rows 

Row groups in file: 169
✓ row-group 1/169 appended
✓ row-group 2/169 appended
✓ row-group 3/169 appended
✓ row-group 4/169 appended
✓ row-group 5/169 appended
✓ row-group 6/169 appended
✓ row-group 7/169 appended
✓ row-group 8/169 appended
✓ row-group 9/169 appended
✓ row-group 10/169 appended
✓ row-group 11/169 appended
✓ row-group 12/169 appended
✓ row-group 13/169 appended
✓ row-group 14/169 appended
✓ row-group 15/169 appended
✓ row-group 16/169 appended
✓ row-group 17/169 appended
✓ row-group 18/169 appended
✓ row-group 19/169 appended
✓ row-group 20/169 appended
✓ row-group 21/169 appended
✓ row-group 22/169 appended
✓ row-group 23/169 appended
✓ row-group 24/169 appended
✓ row-group 25/169 appended
✓ row-group 26/169 appended
✓ row-group 27/169 appended
✓ row-group 28/169 appended
✓ row-group 29/169 appended
✓ row-group 30/169 appended
✓ row-group 31/169 appended
✓ row-group 32/169 appended
✓ row-group 33/169 appended
✓ row-group 34/169 appended
✓ row-group 35/169 appended
✓ row

In [None]:
#sanity check that data.csv exists

df_head = pd.read_csv("E:/ML/data.csv", nrows=10)
print(df_head)

                                             line_ID  \
0  b'\xb6a\x82\x86f#F\x1d\x8c\x94\x7f\x8d\x944\xd...   
1        b'L\xa8+-\xa8\x8dM\xa9\x96g\xed0I\x95\x1e$'   
2        b']s_\x87\xaf B\xec\xbeEg\xb5\x1e\xb2\xaed'   
3     b'\xfb^\xd4{Q\xb5HO\xa8\xb6\xf6\xca\xb1]@\x99'   
4  b'`\xa5\x96\xf6\x1b\rG\x8d\xab\\\x16\x8d\xe1\x...   
5       b'uU\xc8Y+\xabKx\xa7\x1bO\x1d\xd4\x9f\t\xbc'   
6    b'\\\x8d$\xbc\xf8\xcaLG\xad\xf3\x8d\xb7I\xe1m.'   
7  b'\xee\xa7L\x9b\x16\x1cK\x90\xb2\xc69L\x8e\x16...   
8     b'\xaa\x96\x93=\xdejI\xe2\xa9Lm.\xe20\xed\xc1'   
9  b'\r\xc1\x99]\xbb\x0cA\xe0\x9cR\xac\n#\xe0\xbf...   

                                         customer_ID                 date  \
0  d00b98b2401d26197fa1d6102cdc1c9bbed7c066b8aaa9...  2018-03-06 00:00:00   
1  d00bc5e66e3aac9eae7c9e94621b36d196566d61ef7a32...  2018-03-25 00:00:00   
2  d00bd125cf6fa463a6c57b9959b8a4197f6f79fb154fee...  2018-03-28 00:00:00   
3  d00bfbdee3081206258a4b4fb2ef2eb311697f37056bfb...  2018-03-01 00:00:00  

In [None]:
import pyarrow.parquet as pq
import pandas as pd, gc, time
from pathlib import Path
from pandas.api.types import is_string_dtype

# ── paths ──────────────────────────────────────────────────────────
IN_PARQUET = Path("data.parquet")          # your AMEX file
OUT_H5     = Path(r"E:\ML\data.h5")
OUT_H5.parent.mkdir(exist_ok=True)

# ── open parquet ───────────────────────────────────────────────────
pq_file = pq.ParquetFile(IN_PARQUET, memory_map=True)
n_rg    = pq_file.num_row_groups
print(f"{n_rg} row groups found")

# ── open HDF5 store ────────────────────────────────────────────────
store = pd.HDFStore(OUT_H5, mode="w", complib="zlib", complevel=6)
KEY   = "train"

start = time.time()
for rg in range(n_rg):
    # 1) read row-group ➜ pandas
    df = pq_file.read_row_group(rg).to_pandas()

    # 2) bytes → hex strings
    for col in df.select_dtypes(["object"]).columns:
        sample = df[col].iloc[0]
        if isinstance(sample, (bytes, bytearray)):
            df[col] = df[col].apply(lambda b: b.hex() if isinstance(b, (bytes, bytearray)) else b)

    # 3) convert pandas StringDtype → object strings
    for col in df.columns:
        if is_string_dtype(df[col]):
            df[col] = df[col].astype(object)

    # 4) append
    store.append(
        KEY,
        df,
        data_columns=True,
        index=False,
        min_itemsize={"customer_ID": 36, "line_ID": 32},  # 32 hex chars for 16-byte ids
    )

    del df
    gc.collect()
    print(f"✓ row-group {rg+1}/{n_rg}")

store.close()
print(f"Done  → {OUT_H5}  | elapsed {time.time() - start:.1f}s")


169 row groups found
✓ row-group 1/169
✓ row-group 2/169
✓ row-group 3/169
✓ row-group 4/169
✓ row-group 5/169
✓ row-group 6/169
✓ row-group 7/169
✓ row-group 8/169
✓ row-group 9/169
✓ row-group 10/169
✓ row-group 11/169
✓ row-group 12/169
✓ row-group 13/169
✓ row-group 14/169
✓ row-group 15/169
✓ row-group 16/169
✓ row-group 17/169
✓ row-group 18/169
✓ row-group 19/169


In [24]:
store.close()