In [None]:
# Root folder of the dataset to convert. The conversion cell below wraps this
# in a Path, reads the "Annotations" and "Frames" subfolders from it, and
# writes "my_data.parquet" into it.
datafolder = "/data/20181029_D4_0900_0930/20181029_D4_0900_0930"

In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path
from PIL import Image

def image_to_bytes(image_path):
    """Return the pixel data of the image at *image_path* as a bytes object.

    The file is opened with Pillow and the result of ``Image.tobytes()`` is
    returned. That is the image's pixel buffer, not the encoded file
    contents, and the returned value carries no size or mode information.

    Args:
        image_path: Location of the image; passed unchanged to ``Image.open``.

    Returns:
        The bytes produced by ``Image.tobytes()`` for the opened image.
    """
    # The context manager releases the underlying file once the pixels
    # have been copied out.
    with Image.open(image_path) as frame:
        raw_pixels = frame.tobytes()
    return raw_pixels


# Resolve the dataset layout relative to the configured data folder.
workdir = Path(datafolder)  # Update `datafolder` to point at your data directory
annotations_dir = workdir.joinpath("Annotations")
frames_dir = workdir.joinpath("Frames")
out_path = workdir.joinpath("my_data.parquet")

# Gather every annotation CSV, sorted by path so rows are emitted in a
# deterministic order.
annotation_files = list(annotations_dir.glob("*.csv"))
annotation_files.sort()

# Column-oriented accumulator: each list gains one entry per annotation file.
data = {"images": [], "metadata": []}

# NOTE(review): every image buffer stays in `data` until the table is written,
# so memory use grows with the number of frames.
# NOTE(review): presumably each annotation has a matching frame; if one is
# missing, the image load raises and nothing is written. Confirm that is the
# intended failure mode.
for annotation_file in annotation_files:
    # The frame shares the annotation's stem (e.g. '00001.csv' -> '00001.jpg').
    base_name = annotation_file.stem
    image_path = frames_dir.joinpath(f"{base_name}.jpg")

    # Parse the annotation first, then load the pixels of its frame.
    metadata = pd.read_csv(annotation_file)
    image_bytes = image_to_bytes(image_path)

    # One row per frame: the image bytes plus the CSV rows as a list of dicts.
    data["images"].append(image_bytes)
    data["metadata"].append(metadata.to_dict(orient="records"))

# pandas DataFrame -> Arrow table -> Parquet file on disk.
df = pd.DataFrame(data)
table = pa.Table.from_pandas(df)
pq.write_table(table, out_path)

print(f"Parquet file saved to {out_path}")