In [4]:
from datasets import load_dataset
import random
from PIL import Image
from pathlib import Path

class DatasetViewer:
    """
    View random samples from an image dataset with associated OCR text.

    The dataset is loaded through the Hugging Face ``imagefolder`` builder
    from ``<base_path>/<image_dir>``.
    """

    def __init__(self, base_path, metadata_dir="json", image_dir="image", split="train"):
        """
        Initialize the DatasetViewer and load the requested split.

        Args:
            base_path (str): The base directory where metadata and image directories are located.
            metadata_dir (str): The directory containing JSON metadata files.
            image_dir (str): The directory containing image files.
            split (str): The dataset split to load (e.g., "train", "test").
        """
        self.base_path = Path(base_path)
        # NOTE: metadata_path is stored for callers but is not read by this class.
        self.metadata_path = self.base_path / metadata_dir
        self.image_path = self.base_path / image_dir
        self.split = split
        # load_dataset documents data_dir as a string, so convert the Path explicitly.
        self.dataset = load_dataset(
            "imagefolder", data_dir=str(self.image_path), split=self.split
        )

    def view_random_sample(self):
        """
        View a random sample from the dataset, display its OCR text and corresponding image.

        Each record is expected to expose a 'text' entry and an 'image' entry
        whose value has a ``show()`` method (presumably a PIL image).

        Returns:
            tuple: ``(ocr_text, index)`` where ``ocr_text`` is the sample's
                'text' value and ``index`` is its position in the dataset.

        Raises:
            ValueError: If the dataset contains no samples.
        """
        sample_count = len(self.dataset)
        if sample_count == 0:
            # random.randint(0, -1) would raise an opaque "empty range" error.
            raise ValueError("Cannot sample from an empty dataset")

        random_sample_index = random.randint(0, sample_count - 1)
        random_sample = self.dataset[random_sample_index]
        ocr_text = random_sample['text']

        print(f"Random sample index: {random_sample_index}\nOCR text:\n{ocr_text}")
        random_sample['image'].show()

        # Return what the docstring has always promised, so callers can reuse it.
        return ocr_text, random_sample_index

# Demo entry point: runs only when this file is executed directly.
if __name__ == "__main__":
    # Root folder expected to contain the image (and metadata) sub-directories.
    base_directory = "/Users/shairawadhawan/Desktop/GiBots /To use"

    # Build the viewer with default sub-directory names and split, then
    # print one randomly chosen sample's OCR text and display its image.
    viewer = DatasetViewer(base_path=base_directory)
    viewer.view_random_sample()


Random sample index: 6
OCR text:
{"gt_parse": {"Invoice Number": "2121359", "Invoice Total Amount": "15245.02", "Invoice Date": "21 February 2020", "Vendor Name": "onata"}}
