In [1]:
# Standard library imports
import os
import sys
import json
import uuid

# Third-party imports
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import skimage.draw

# Custom module import
sys.path.append("../")  # Add parent directory to Python path
import LLMP as L

# ------------------- Configuration -------------------

# Root folder that holds everything produced by this run
main_output_dir = "finetuning-EXP2-5000-5epochs-prompt"

# Images and JSON metadata go into sibling subfolders of the root
image_output_dir, json_output_dir = (
    os.path.join(main_output_dir, leaf) for leaf in ("images", "json")
)

# Make sure both folders exist; exist_ok avoids an error on re-runs
os.makedirs(image_output_dir, exist_ok=True)
os.makedirs(json_output_dir, exist_ok=True)

# Maps each task name (passed to L.GPImage.figure3 below) to the prompt text
# stored as the 'question' field of every sample generated for that task.
# Each prompt is assembled by implicit concatenation of adjacent string
# literals, so every fragment except the last must end with a space.
# NOTE(review): if the same prompts are duplicated elsewhere (e.g. for
# evaluation), keep them in sync with this text.
tasks = {
    "pie": (
        "The pie chart you are looking at is created as follows: "
        "First, create a list of five values where each value is between 3 and 39, and all values add up to 100. "
        "Next, divide each value in the list by the largest value, so that the largest value becomes 1.0. "
        "Then, shift the items in the list so that the largest value (1.0) is the first item. "
        "Now, look at the pie chart again. "
        "Identify the largest segment, which is marked with a dot. "
        "Go counterclockwise around the pie starting from the largest segment, estimating the ratio of the other four values to the maximum. "
        "Format your answer as [1.0, x.x, x.x, x.x, x.x] (the first number must be 1.0, representing the largest segment). No explanation."
    ),

    "bar": (
        "The bar chart you are looking at is created as follows: "
        "First, create a list of five values where each value is between 3 and 39, and all values add up to 100. "
        "Next, divide each value in the list by the largest value, so that the largest value becomes 1.0. "
        "Then, shift the items in the list so that the largest value (1.0) is the first item. "
        "Now, look at the bar chart again. "
        "Identify the largest bar, which is marked with a dot. "
        "Move left to right along the bar chart starting from the largest bar, estimating the ratio of the other four values to the maximum. "
        "Format your answer as [1.0, x.x, x.x, x.x, x.x] (the first number must be 1.0, representing the largest bar). No explanation."
    ),
}


# Number of images to generate for each task
num_images_per_task = 5000

# Accumulates one metadata entry per generated image, across all tasks
combined_dataset = []

# For every task/prompt pair, generate the requested number of samples,
# save each image to disk and record its metadata.
for task, question in tasks.items():
    print(f"Generating images and dataset for task: {task}")

    # The loop index itself is not needed; only the repetition count matters
    for _sample_idx in range(num_images_per_task):
        # L.GPImage.figure3 returns the image array and its label values
        image_array, label = L.GPImage.figure3(task)

        # Convert labels to Python-native floats (rounded to 2 decimals) so
        # that the json module can serialize them
        label = [round(float(val), 2) for val in label]

        # Scale to 0-255 for saving as an image. Values are clipped to [0, 1]
        # first because casting an out-of-range float to uint8 does not
        # saturate (it wraps around or is undefined), which would corrupt
        # pixels. In-range input produces exactly the same result as before.
        # NOTE(review): assumes figure3 yields intensities nominally in
        # [0, 1] — confirm against LLMP.
        image_array_uint8 = (np.clip(image_array, 0.0, 1.0) * 255).astype(np.uint8)

        # Convert the NumPy array to a PIL image
        pil_image = Image.fromarray(image_array_uint8)

        # A random UUID serves as both the sample id and the file stem;
        # the file name is built once and reused for the path and the JSON
        unique_id = str(uuid.uuid4())
        image_name = f"{unique_id}.jpg"

        # Save the image into the images folder
        image_filename = os.path.join(image_output_dir, image_name)
        pil_image.save(image_filename)

        # Metadata entry: the image is referenced by file name only
        # (relative to the images folder), not by full path
        json_entry = {
            'id': unique_id,
            'image': image_name,
            'question': question,
            'value': label,  # Normalized label values as native Python floats
        }

        combined_dataset.append(json_entry)

# Write every collected entry into a single JSON file inside the JSON folder
combined_json_filename = "combined_dataset.json"
combined_json_filepath = os.path.join(json_output_dir, combined_json_filename)

# Serialize the whole list (4-space indentation) and write it in one call
with open(combined_json_filepath, 'w') as json_file:
    json_file.write(json.dumps(combined_dataset, indent=4))

# Report where the images and the dataset file ended up
print(
    f"Images saved in '{image_output_dir}' and combined dataset saved as "
    f"'{combined_json_filename}' in '{json_output_dir}'"
)


Generating images and dataset for task: pie
Generating images and dataset for task: bar
Images saved in 'finetuning-EXP2-5000-5epochs-prompt/images' and combined dataset saved as 'combined_dataset.json' in 'finetuning-EXP2-5000-5epochs-prompt/json'
