# In [2]:
from pathlib import Path
import json
from datasets import load_dataset

class DonutMetadataGenerator:
    """
    Generate Donut-style metadata and save it to a JSONLines file.

    Every ``<stem>.json`` file in the metadata directory is paired with the
    image ``<stem><image_ext>`` in the image directory. For each pair one row
    of the form ``{"text": <JSON string>, "file_name": <image name>}`` is
    written, where the JSON string is ``{"gt_parse": <contents of the JSON
    file>}`` serialized with ``json.dumps``.
    """

    def __init__(self, base_path, metadata_dir="json", image_dir="image"):
        """
        Initialize the DonutMetadataGenerator.

        Args:
            base_path (str): The base directory where metadata and image directories are located.
            metadata_dir (str): The directory (relative to ``base_path``) containing JSON metadata files.
            image_dir (str): The directory (relative to ``base_path``) containing image files.
        """
        self.base_path = Path(base_path)
        self.metadata_path = self.base_path.joinpath(metadata_dir)
        self.image_path = self.base_path.joinpath(image_dir)

    def generate_metadata(self, output_dir="output", image_ext=".png"):
        """
        Generate Donut-style metadata and save it to a JSONLines file.

        The result is written to ``<base_path>/<output_dir>/metadata.jsonl``,
        one JSON object per line. JSON files without a matching image are
        skipped without being parsed. Rows are emitted in sorted order of the
        JSON file paths so the output is reproducible.

        Args:
            output_dir (str): The directory (relative to ``base_path``) where
                the JSONLines file will be saved. Created if missing.
            image_ext (str): File extension, including the leading dot, of
                the image that must accompany each JSON file.

        Raises:
            json.JSONDecodeError: If a JSON file that has a matching image
                cannot be parsed.
            OSError: If a file cannot be read or the output cannot be written.
        """
        metadata_list = []

        # glob() yields entries in arbitrary, filesystem-dependent order;
        # sorting keeps the generated file stable between runs.
        for json_path in sorted(self.metadata_path.glob("*.json")):
            image_name = f"{json_path.stem}{image_ext}"

            # Check for the image first so unmatched JSON files cost nothing
            # and cannot abort the run if they happen to be malformed.
            if not self.image_path.joinpath(image_name).is_file():
                continue

            # Explicit UTF-8: the platform default encoding is locale
            # dependent and may fail on non-ASCII ground-truth text.
            with open(json_path, "r", encoding="utf-8") as json_file:
                data = json.load(json_file)

            # The "text" column holds the ground truth as a JSON string.
            text = json.dumps({"gt_parse": data})
            metadata_list.append({"text": text, "file_name": image_name})

        # Create the output directory if it doesn't exist
        output_dir_path = self.base_path.joinpath(output_dir)
        output_dir_path.mkdir(parents=True, exist_ok=True)

        # Write JSONLines file: one serialized entry per line
        output_file = output_dir_path.joinpath("metadata.jsonl")
        with open(output_file, "w", encoding="utf-8") as outfile:
            for entry in metadata_list:
                outfile.write(json.dumps(entry))
                outfile.write("\n")


# Example usage: build metadata.jsonl for a local dataset folder.
if __name__ == "__main__":
    # Root folder holding the "json" and "image" sub-directories
    base_directory = "/Users/shairawadhawan/Desktop/GiBots /To use"

    # Point the generator at that folder (default sub-directory names)
    generator = DonutMetadataGenerator(base_path=base_directory)

    # Write <base_directory>/jsonl/metadata.jsonl
    generator.generate_metadata("jsonl")
