In [1]:
import os
import h5py
from PIL import Image
import numpy as np
from tqdm import tqdm
import time
import concurrent.futures

def process_image(args):
    jpg_file, input_folder = args
    img_path = os.path.join(input_folder, jpg_file)
    with Image.open(img_path) as img:
        img_array = np.array(img)
    return jpg_file, img_array

def convert_jpg_folder_to_hdf5(input_folder, output_file):
    print(f"Starting conversion process...")
    print(f"Input folder: {input_folder}")
    print(f"Output file: {output_file}")
    
    start_time = time.time()
    
    jpg_files = [f for f in os.listdir(input_folder) if f.lower().endswith('.jpg')]
    total_images = len(jpg_files)
    print(f"Found {total_images} JPG files to process")

    with h5py.File(output_file, 'w') as hf:
        print("Creating HDF5 file structure...")
        dt = h5py.special_dtype(vlen=str)
        image_names = hf.create_dataset('image_names', (total_images,), dtype=dt)

        print("Analyzing sample image for dataset configuration...")
        sample_img = Image.open(os.path.join(input_folder, jpg_files[0]))
        img_shape = np.array(sample_img).shape
        img_dtype = np.array(sample_img).dtype
        print(f"Image shape: {img_shape}, dtype: {img_dtype}")

        print("Creating main image dataset...")
        images = hf.create_dataset('images', (total_images, *img_shape), dtype=img_dtype, chunks=True)

        print("Starting image processing with concurrent.futures...")
        with concurrent.futures.ProcessPoolExecutor() as executor:
            futures = [executor.submit(process_image, (jpg_file, input_folder)) for jpg_file in jpg_files]
            for i, future in enumerate(tqdm(concurrent.futures.as_completed(futures), total=total_images, desc="Processing images")):
                jpg_file, img_array = future.result()
                image_names[i] = jpg_file
                images[i] = img_array
                
                if (i + 1) % 1000 == 0 or (i + 1) == total_images:
                    print(f"Processed {i + 1} out of {total_images} images")

    end_time = time.time()
    total_time = end_time - start_time
    print(f"\nConversion complete. HDF5 file saved as {output_file}")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Average time per image: {total_time/total_images:.4f} seconds")

def get_filenames_from_hdf5(file_path, dataset_name='image_names'):
    """
    Retrieves all filenames from a specific dataset in an HDF5 file.
    
    :param file_path: Path to the HDF5 file
    :param dataset_name: Name of the dataset containing the filenames
    :return: List of filenames
    """
    with h5py.File(file_path, 'r') as hf:
        if dataset_name in hf:
            filenames = hf[dataset_name][:]
            return filenames.tolist()
        else:
            raise KeyError(f"Dataset '{dataset_name}' not found in the HDF5 file.")

# Example usage
input_folder = '/Users/jimmyhe/Desktop/KaggleCompetitions/ISISCANCER/preprocessed_image'
output_file = 'preprocessed_images.h5'

# Convert JPG images to HDF5
convert_jpg_folder_to_hdf5(input_folder, output_file)

# Retrieve filenames from HDF5
filenames = get_filenames_from_hdf5(output_file)
print("Sample of filenames:", filenames[:10])  # Print first 10 filenames

Starting conversion process...
Input folder: /Users/jimmyhe/Desktop/KaggleCompetitions/ISISCANCER/preprocessed_image
Output file: preprocessed_images.h5
Found 401059 JPG files to process
Creating HDF5 file structure...
Analyzing sample image for dataset configuration...
Image shape: (224, 224, 3), dtype: uint8
Creating main image dataset...
Starting image processing with concurrent.futures...


Process SpawnProcess-1:
Process SpawnProcess-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/miniconda3/envs/MLEnv/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/miniconda3/envs/MLEnv/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/miniconda3/envs/MLEnv/lib/python3.9/concurrent/futures/process.py", line 240, in _process_worker
    call_item = call_queue.get(block=True)
  File "/opt/miniconda3/envs/MLEnv/lib/python3.9/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
  File "/opt/miniconda3/envs/MLEnv/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/miniconda3/envs/MLEnv/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/miniconda3/envs/MLEnv/lib/python3.9/concurrent/futures/process.py", 

BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore

In [3]:
def get_filenames_from_hdf5(file_path):
    """
    Retrieves all filenames (keys) from an HDF5 file.
    
    :param file_path: Path to the HDF5 file
    :return: List of filenames (keys)
    """
    with h5py.File(file_path, 'r') as hf:
        return list(hf.keys())

In [8]:
filenames = get_filenames_from_hdf5(output_file)