In [1]:
# Standard library imports
import os
import sys
import json
import uuid

# Third-party imports
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import skimage.draw

# Custom module import
sys.path.append("../")  # Add parent directory to Python path
import LLMP as L

# ------------------- Configuration -------------------

# Root folder that receives everything this run produces
main_output_dir = "finetuning-EXP3-5000-10epochs-lora"
# How many samples are rendered for every task type
num_images_per_task = 5000

# Rendered charts and the dataset JSON live in sibling subfolders of the root
image_output_dir, json_output_dir = (
    os.path.join(main_output_dir, subfolder) for subfolder in ("images", "json")
)

# Make sure both folders exist before anything is written (no-op if present)
os.makedirs(image_output_dir, exist_ok=True)
os.makedirs(json_output_dir, exist_ok=True)

# Prompt attached to every sample of a task, keyed by the task identifier.
# NOTE(review): the type1-type3 prompts spell "explaination" and the type5
# prompt has no "No explanation." suffix. The strings are kept verbatim because
# they end up in the generated dataset -- confirm whether this is intended.
tasks = dict(
    type1="In the grouped bar chart, compare the heights of the two marked bars. Estimate the ratio of the height of the shorter marked bar to the height of the taller marked bar. Use a scale from 0 to 1, where 1 indicates that both marked bars are of equal height. No explaination.",
    type2="In the divided stacked bar chart, compare the heights of the two marked segments in the left and right bars. Estimate the ratio of the height of the shorter marked segment to the taller marked segment. Use a scale from 0 to 1, where 1 indicates that both segments are of equal height. No explaination.",
    type3="In the mixed grouped bar chart, compare the heights of the two marked bars. Estimate the ratio of the shorter marked bar’s height to the taller marked bar’s height. Use a scale from 0 to 1, where 1 indicates equal height. No explaination.",
    type4="In the divided stacked bars, compare the lengths of the two marked segments in the left and right bars. Estimate the ratio of the shorter marked segment’s length to the length of the taller marked segment. Use a scale from 0 to 1, where 1 indicates equal length. No explanation.",
    type5="In the complex divided stacked bar chart, compare the lengths of the two marked segments in the left bar. Estimate the ratio of the length of the shorter marked segment to the length of the taller marked segment. Use a scale from 0 to 1, where 1 indicates that both segments are of equal length.",
)


def _label_to_float_list(label):
    """Return *label* as a list of built-in floats rounded to two decimals.

    A non-iterable value (Python or NumPy number of any width, or a 0-d
    array) is wrapped in a one-element list; any iterable is converted
    element by element. Built-in floats keep the result JSON-serializable.
    """
    values = label if np.iterable(label) else [label]
    return [round(float(value), 2) for value in values]


# Entries for every generated image, across all tasks, in generation order
combined_dataset = []

# Generate `num_images_per_task` samples for each task and record their labels
for task, question in tasks.items():
    print(f"Generating images and dataset for task: {task}")

    for _ in range(num_images_per_task):
        # Generate the image and label for the task using GPImage
        image_array, label = L.GPImage.figure4(task)

        # Scalars and sequences alike become a list of rounded Python floats
        label = _label_to_float_list(label)

        # Scale to 0-255 and store as uint8 so PIL can save it.
        # NOTE(review): assumes image_array values lie in [0, 1]; values
        # outside that range would wrap around in the cast -- confirm against
        # GPImage.figure4.
        image_array_uint8 = (image_array * 255).astype(np.uint8)

        # Convert the NumPy array to a PIL image
        pil_image = Image.fromarray(image_array_uint8)

        # A random UUID names the file and doubles as the dataset entry id
        unique_id = str(uuid.uuid4())

        # Save the image under its unique ID
        image_filename = os.path.join(image_output_dir, f"{unique_id}.jpg")
        pil_image.save(image_filename)

        # The entry stores the bare file name, relative to the images folder
        json_entry = {
            'id': unique_id,
            'image': f"{unique_id}.jpg",
            'question': question,
            'value': label  # Label values as native Python floats
        }

        combined_dataset.append(json_entry)

# Save the combined dataset as a single JSON file in the JSON folder
combined_json_filename = "combined_dataset.json"
combined_json_filepath = os.path.join(json_output_dir, combined_json_filename)

# Explicit encoding keeps the written file independent of the platform default
with open(combined_json_filepath, 'w', encoding="utf-8") as json_file:
    json.dump(combined_dataset, json_file, indent=4)

print(f"Images saved in '{image_output_dir}' and combined dataset saved as '{combined_json_filename}' in '{json_output_dir}'")


Generating images and dataset for task: type1
Generating images and dataset for task: type2
Generating images and dataset for task: type3
Generating images and dataset for task: type4
Generating images and dataset for task: type5
Images saved in 'finetuning-EXP3-5000-10epochs-lora/images' and combined dataset saved as 'combined_dataset.json' in 'finetuning-EXP3-5000-10epochs-lora/json'
