#### Rohan Bhatt, Shubhang Srikoti 
##### MSML605 -  Investigating the Impact of Storage Formats

Problem statement: How does the choice of storage format (CSV, Parquet, HDF5) impact the overall performance of a machine learning pipeline and its stages (data ingestion, memory overhead, time-to-train, and more)?

In [2]:
#all possible imports:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import pyarrow as pa
import tables # for hdf5
import time, datetime, os, psutil
import xgboost as xgb
from pathlib import Path, PureWindowsPath
import gc

#importing data
import kagglehub

# Fetch the latest version of the dataset from Kaggle, but only when there is
# no local "data.parquet" yet.
# NOTE(review): nothing visible here copies the download to "data.parquet", and
# `path` stays undefined when the file already exists — confirm how the local
# file is meant to be produced from the download.
parquet_missing = not Path("data.parquet").exists()
if parquet_missing:
    path = kagglehub.dataset_download("jtbontinck/amex-parquet-file")
# print("Path to dataset files:", path)

Checking schema and data types:

In [3]:
# Open the Parquet file and report its row count, column count and schema,
# all taken from the file-level metadata.
pq_file = pq.ParquetFile("data.parquet")
file_meta = pq_file.metadata
print(f"Rows in file: {file_meta.num_rows}")
print(f"Columns in file: {file_meta.num_columns}")
print(f"Schema: {pq_file.schema}")

Rows in file: 16895213
Columns in file: 193
Schema: <pyarrow._parquet.ParquetSchema object at 0x00000219CA392980>
required group field_id=-1 duckdb_schema {
  optional fixed_len_byte_array(16) field_id=-1 line_ID (UUID);
  optional binary field_id=-1 customer_ID (String);
  optional int64 field_id=-1 date (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
  optional float field_id=-1 P_2;
  optional float field_id=-1 D_39;
  optional float field_id=-1 B_1;
  optional float field_id=-1 B_2;
  optional float field_id=-1 R_1;
  optional float field_id=-1 S_3;
  optional float field_id=-1 D_41;
  optional float field_id=-1 B_3;
  optional float field_id=-1 D_42;
  optional float field_id=-1 D_43;
  optional float field_id=-1 D_44;
  optional float field_id=-1 B_4;
  optional float field_id=-1 D_45;
  optional float field_id=-1 B_5;
  optional float field_id=-1 R_2;
  optional float field_id=-1 D_46;
  optional float fiel

Converting Parquet -> CSV (row-based)

In [None]:
# Convert the Parquet file into a single CSV file, one row group at a time, so
# that only one row group is loaded as a DataFrame per iteration.

# in/out file paths
IN_FILE = Path(r"data.parquet")
OUT_CSV = Path(r"E:\ML\data.csv")  # final single file

# create the destination folder up front so a missing directory fails fast
# instead of after the first row group has already been read
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)

# opening the parquet file
pq_file = pq.ParquetFile(IN_FILE, memory_map=True)
n_rg = pq_file.num_row_groups
print(f"Row groups in file: {n_rg}")

# write loop: the CSV is opened once and every row group is streamed into the
# same handle, rather than reopening the file in append mode per row group.
# newline="" is what pandas expects for file objects passed to to_csv, and the
# `with` block guarantees the handle is closed even if a row group fails.
with open(OUT_CSV, "w", newline="", encoding="utf-8") as csv_out:
    for rg in range(n_rg):
        # load one row group into an Arrow Table
        table = pq_file.read_row_group(rg)
        # convert to pandas, keeping Arrow-backed dtypes
        df = table.to_pandas(types_mapper=pd.ArrowDtype)
        # only the first row group emits the header line
        df.to_csv(csv_out, index=False, header=(rg == 0))

        # free memory before the next row group is read
        del df, table
        gc.collect()
        print(f"row-group {rg+1}/{n_rg} appended")

# NOTE(review): original timing note read "10 min - 23 rows" — presumably
# about 10 minutes per 23 row groups; confirm.
print("All done →", OUT_CSV)

Row groups in file: 169
✓ row-group 1/169 appended
✓ row-group 2/169 appended
✓ row-group 3/169 appended
✓ row-group 4/169 appended
✓ row-group 5/169 appended
✓ row-group 6/169 appended
✓ row-group 7/169 appended
✓ row-group 8/169 appended
✓ row-group 9/169 appended
✓ row-group 10/169 appended
✓ row-group 11/169 appended
✓ row-group 12/169 appended
✓ row-group 13/169 appended
✓ row-group 14/169 appended
✓ row-group 15/169 appended
✓ row-group 16/169 appended
✓ row-group 17/169 appended
✓ row-group 18/169 appended
✓ row-group 19/169 appended
✓ row-group 20/169 appended
✓ row-group 21/169 appended
✓ row-group 22/169 appended
✓ row-group 23/169 appended
✓ row-group 24/169 appended
✓ row-group 25/169 appended
✓ row-group 26/169 appended
✓ row-group 27/169 appended
✓ row-group 28/169 appended
✓ row-group 29/169 appended
✓ row-group 30/169 appended
✓ row-group 31/169 appended
✓ row-group 32/169 appended
✓ row-group 33/169 appended
✓ row-group 34/169 appended
✓ row-group 35/169 appended
✓ row

In [None]:
df = pd.read

Wrote data to E:\ML\results.csv
