In [3]:
from pathlib import Path
import pyarrow.parquet as pq
import pandas as pd
import csv, itertools, time

# Files under comparison: the source Parquet file and the CSV produced from it.
PARQUET_FILE = Path("data.parquet")            # original
CSV_FILE     = Path("E:/ML/data.csv")  # merged CSV
# H5_FILE      = Path(r"E:\ML\data.h5")          # (optional) you can verify later

# Schema & metadata (row/column counts and schema come from the file's metadata)
pq_file = pq.ParquetFile(PARQUET_FILE)
print("Parquet rows :", pq_file.metadata.num_rows)
print("Parquet cols :", pq_file.metadata.num_columns)
print("\nSchema →")
print(pq_file.schema)

# Peek at first 5 rows.  The Arrow table is trimmed *before* the pandas
# conversion so only 5 rows (not the whole first row-group) become a DataFrame.
sample_df = pq_file.read_row_group(0).slice(0, 5).to_pandas()
print("\nSample rows →\n", sample_df)

# Read just the header line.  The encoding is pinned to UTF-8 so that this view
# of the header agrees with pd.read_csv below (which decodes as UTF-8 by
# default); without it, open() falls back to the platform's locale encoding.
with open(CSV_FILE, "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    columns_csv = next(reader)
print("CSV columns ({}):".format(len(columns_csv)))
print(columns_csv[:20], "...")

# Peek at first 5 rows with pandas (nrows keeps it tiny, so no extra .head() is needed)
df_csv_head = pd.read_csv(CSV_FILE, nrows=5)
print("\nCSV sample rows →\n", df_csv_head)

# Column names must match exactly, in the same order.
cols_parquet = pq_file.schema.names
assert list(cols_parquet) == columns_csv, "Column mismatch!"
print("\n✅ Columns match between Parquet and CSV.")



Parquet rows : 16895213
Parquet cols : 193

Schema →
<pyarrow._parquet.ParquetSchema object at 0x000001F32FD39B00>
required group field_id=-1 duckdb_schema {
  optional fixed_len_byte_array(16) field_id=-1 line_ID (UUID);
  optional binary field_id=-1 customer_ID (String);
  optional int64 field_id=-1 date (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
  optional float field_id=-1 P_2;
  optional float field_id=-1 D_39;
  optional float field_id=-1 B_1;
  optional float field_id=-1 B_2;
  optional float field_id=-1 R_1;
  optional float field_id=-1 S_3;
  optional float field_id=-1 D_41;
  optional float field_id=-1 B_3;
  optional float field_id=-1 D_42;
  optional float field_id=-1 D_43;
  optional float field_id=-1 D_44;
  optional float field_id=-1 B_4;
  optional float field_id=-1 D_45;
  optional float field_id=-1 B_5;
  optional float field_id=-1 R_2;
  optional float field_id=-1 D_46;
  optional float fie

In [None]:
import random
pq_file = pq.ParquetFile(PARQUET_FILE)

# Pick a random absolute row index.  No seed is set, so every run checks a
# different 5-row window; N is printed so a given window can be re-checked.
N = random.randint(0, pq_file.metadata.num_rows - 5)
print(f"Spot-checking rows {N} – {N+4}")

# ---- Parquet slice ----
# Map global rows N..N+4 onto row-groups using cumulative (exclusive) end offsets.
rows_per_rg = [pq_file.metadata.row_group(i).num_rows for i in range(pq_file.num_row_groups)]
rg_ends     = list(itertools.accumulate(rows_per_rg))
rg_idx      = next(i for i, end in enumerate(rg_ends) if end > N)      # row-group holding row N
rg_last     = next(i for i, end in enumerate(rg_ends) if end > N + 4)  # row-group holding row N+4
rg_row0     = rg_ends[rg_idx] - rows_per_rg[rg_idx]                    # first global row of rg_idx
# The window may straddle a row-group boundary, so read every row-group it
# touches, then cut the 5 rows out in Arrow before converting to pandas.
table_rg    = pq_file.read_row_groups(list(range(rg_idx, rg_last + 1)))
slice_df_pq = table_rg.slice(N - rg_row0, 5).to_pandas()

# ---- CSV slice ----
# With the default header inference, skiprows=N+1 would also skip the real
# header and promote data row N to column names (showing rows N+1..N+5 instead).
# So fetch the names from the real header first and pass them explicitly.
csv_columns  = pd.read_csv(CSV_FILE, nrows=0).columns.tolist()
slice_df_csv = pd.read_csv(
    CSV_FILE,
    skiprows=N + 1,   # the header line + the N data rows before the window
    nrows=5,
    header=None,      # the first line read is data, not a header
    names=csv_columns,
)

print("\nParquet slice\n", slice_df_pq)
print("\nCSV slice\n",    slice_df_csv)

Spot-checking rows 14422229 – 14422233
