### Generate Dataset

In [12]:
# Standard library imports
import os
import sys
import json
import uuid

# Third-party imports
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import skimage.draw

# Custom module import
sys.path.append("../")  # Add parent directory to Python path
import LLMP as L

# ------------------- Configuration -------------------

# Root folder of this run; images and JSON files live in sub-folders of it.
main_output_dir = "finetuning-EXP4-5000-10epochs-test"
# Requested number of images per task.
num_images_per_task = 5000

# Sub-folders for the rendered images and for the JSON dataset files.
image_output_dir = os.path.join(main_output_dir, "images")
json_output_dir = os.path.join(main_output_dir, "json")

# Create both output folders up front (no error if they already exist).
for _output_dir in (image_output_dir, json_output_dir):
    os.makedirs(_output_dir, exist_ok=True)

# Task name -> question text stored with every sample of that task.
tasks = dict(
    framed="Estimate the lengths of the two bars with framing. Both lengths should fall between 49 and 60 pixels. No explanation. Format of the answer [xx, xx]",
    unframed="Estimate the lengths of the two bars without framing. Both lengths should fall between 49 and 60 pixels. No explanation. Format of the answer [xx, xx]",
)

# Placeholder list for the data of all tasks.
combined_dataset = list()

######################################

# Number of samples written so far to each split.
train_counter = val_counter = test_counter = 0

# Number of samples wanted in each split (train, validation, test).
train_target, val_target, test_target = 5000, 1000, 500

# JSON entries collected for each split (three independent lists).
combined_dataset_training, combined_dataset_validation, combined_dataset_testing = [], [], []

# Labels already claimed by each split, used to keep the splits label-disjoint.
train_labels, val_labels, test_labels = [], [], []
######################################

def _save_sample(image_array, label, question):
    """Add background noise to an image, save it as JPEG and build its JSON entry.

    Args:
        image_array: Image array as returned by ``L.GPImage.figure12``.
            Assumes values lie in [0, 1] so that ``* 255`` fits into uint8
            -- TODO confirm against the generator.
        label: Already-rounded label values stored under ``'value'``.
        question: Question text stored under ``'question'``.

    Returns:
        dict with the keys ``'id'``, ``'image'``, ``'question'`` and ``'value'``.
    """
    noisy = image_array.astype(np.float32)  # astype copies, the input array is left untouched

    # Add uniform noise in [0, 0.05) only where the pixel is exactly 0 (treated as background)
    background = (noisy == 0)
    noise = np.random.uniform(0, 0.05, noisy.shape)
    noisy[background] += noise[background]

    # Scale to 0-255 and write the image under a fresh UUID-based file name
    unique_id = str(uuid.uuid4())
    pil_image = Image.fromarray((noisy * 255).astype(np.uint8))
    pil_image.save(os.path.join(image_output_dir, f"{unique_id}.jpg"))

    return {
        'id': unique_id,
        'image': f"{unique_id}.jpg",
        'question': question,
        'value': label
    }


# Loop through each task
for task, question in tasks.items():
    print(f"Generating images and dataset for task: {task}")  # Inform user about the current task

    # Reset the per-split counters for every task. Without this the counters left
    # over from the first task already satisfy the targets, so every later task
    # would be skipped without generating a single sample.
    train_counter = val_counter = test_counter = 0

    # NOTE: a label stays bound to the first split that claimed it, and samples whose
    # split is already full are discarded. The loop therefore relies on every split
    # owning at least one label that the generator keeps producing.
    while train_counter < train_target or val_counter < val_target or test_counter < test_target:

        image_array, label = L.GPImage.figure12(task)  # Generate image and label using the custom module

        # Round first so that split membership is decided on exactly the value
        # that is written to the JSON files (and on plain Python floats).
        label = [round(float(val), 2) for val in label]

        pot = np.random.choice(3)  # Random candidate split: 0 = train, 1 = val, 2 = test

        # Ensure global label separation between datasets: a label that is already
        # owned by a split is always routed to that same split.
        if label in train_labels:
            pot = 0
        elif label in val_labels:
            pot = 1
        elif label in test_labels:
            pot = 2

        if pot == 0 and train_counter < train_target:
            if label not in train_labels:  # Claim the label for the training split
                train_labels.append(label)
            combined_dataset_training.append(_save_sample(image_array, label, question))
            train_counter += 1

        elif pot == 1 and val_counter < val_target:
            if label not in val_labels:  # Claim the label for the validation split
                val_labels.append(label)
            combined_dataset_validation.append(_save_sample(image_array, label, question))
            val_counter += 1

        elif pot == 2 and test_counter < test_target:
            if label not in test_labels:  # Claim the label for the test split
                test_labels.append(label)
            combined_dataset_testing.append(_save_sample(image_array, label, question))
            test_counter += 1

# Write each split to its own JSON file inside json_output_dir.
combined_json_training_filename = "train_dataset.json"
combined_json_validation_filename = "val_dataset.json"
combined_json_testing_filename = "test_dataset.json"

combined_json_training_filepath = os.path.join(json_output_dir, combined_json_training_filename)
combined_json_validation_filepath = os.path.join(json_output_dir, combined_json_validation_filename)
combined_json_testing_filepath = os.path.join(json_output_dir, combined_json_testing_filename)

# (target path, entries to dump, confirmation message) for every split, in write order.
_split_outputs = (
    (
        combined_json_training_filepath,
        combined_dataset_training,
        f"Training dataset saved as '{combined_json_training_filename}' in '{json_output_dir}'",
    ),
    (
        combined_json_validation_filepath,
        combined_dataset_validation,
        f"Validation dataset saved as '{combined_json_validation_filename}' in '{json_output_dir}'",
    ),
    (
        combined_json_testing_filepath,
        combined_dataset_testing,
        f"Test dataset saved as '{combined_json_testing_filename}' in '{json_output_dir}'",
    ),
)

for _filepath, _entries, _message in _split_outputs:
    with open(_filepath, 'w') as json_file:
        json.dump(_entries, json_file, indent=4)  # Pretty-printed list of sample entries
    print(_message)


I am curious why the test set ends up with more unique labels than the other datasets.

Generating images and dataset for task: framed
Generating images and dataset for task: unframed
Training dataset saved as 'train_dataset.json' in 'finetuning-EXP4-5000-10epochs-test/json'
Validation dataset saved as 'val_dataset.json' in 'finetuning-EXP4-5000-10epochs-test/json'
Test dataset saved as 'test_dataset.json' in 'finetuning-EXP4-5000-10epochs-test/json'


### Check overlap labels

In [20]:
# Script to count unique labels across datasets and check overlap
import os
import json

# Locate the three split files written by the generation step
json_output_dir = "finetuning-EXP4-5000-10epochs-test/json"

train_file = os.path.join(json_output_dir, "train_dataset.json")
val_file = os.path.join(json_output_dir, "val_dataset.json")
test_file = os.path.join(json_output_dir, "test_dataset.json")

# Parse the files in train / val / test order and unpack the results
_loaded_splits = []
for _split_path in (train_file, val_file, test_file):
    with open(_split_path, 'r') as f:
        _loaded_splits.append(json.load(f))

train_dataset, val_dataset, test_dataset = _loaded_splits

# Function to count unique labels in a dataset
def count_unique_labels(dataset):
    """Return the set of distinct labels found in *dataset*.

    Each entry's ``'value'`` sequence is converted to a tuple so it can be
    stored in a set. Despite the name, the set itself is returned, not its size.
    """
    return {tuple(sample['value']) for sample in dataset}

# Distinct labels per split
train_unique_labels = count_unique_labels(train_dataset)
val_unique_labels = count_unique_labels(val_dataset)
test_unique_labels = count_unique_labels(test_dataset)

# Labels shared by each pair of splits (an empty set means the pair is disjoint)
train_val_overlap = train_unique_labels.intersection(val_unique_labels)
train_test_overlap = train_unique_labels.intersection(test_unique_labels)
val_test_overlap = val_unique_labels.intersection(test_unique_labels)

# Summary table: one row per split with its label count and the sorted labels
_table_lines = (
    "| Dataset      | Total Unique Labels | Unique Labels",
    "|--------------|---------------------|---------------",
    f"| Training     | {len(train_unique_labels):<19} | {sorted(train_unique_labels)}",
    f"| Validation   | {len(val_unique_labels):<19} | {sorted(val_unique_labels)}",
    f"| Testing      | {len(test_unique_labels):<19} | {sorted(test_unique_labels)}",
)
for _table_line in _table_lines:
    print(_table_line)

# Pairwise overlap report: list the shared labels, or state that there are none
print("\nOverlap Information:")
print(
    f"- Overlap between Training and Validation: {sorted(train_val_overlap)}"
    if train_val_overlap
    else "- No overlap between Training and Validation."
)
print(
    f"- Overlap between Training and Testing: {sorted(train_test_overlap)}"
    if train_test_overlap
    else "- No overlap between Training and Testing."
)
print(
    f"- Overlap between Validation and Testing: {sorted(val_test_overlap)}"
    if val_test_overlap
    else "- No overlap between Validation and Testing."
)


| Dataset      | Total Unique Labels | Unique Labels
|--------------|---------------------|---------------
| Training     | 34                  | [(49.0, 52.0), (49.0, 53.0), (49.0, 57.0), (49.0, 59.0), (49.0, 60.0), (50.0, 55.0), (50.0, 56.0), (51.0, 52.0), (52.0, 51.0), (52.0, 55.0), (53.0, 49.0), (53.0, 50.0), (53.0, 51.0), (53.0, 56.0), (53.0, 58.0), (54.0, 49.0), (54.0, 50.0), (54.0, 57.0), (55.0, 51.0), (55.0, 54.0), (56.0, 51.0), (56.0, 54.0), (56.0, 55.0), (57.0, 49.0), (57.0, 51.0), (57.0, 54.0), (57.0, 55.0), (57.0, 58.0), (58.0, 59.0), (59.0, 55.0), (59.0, 57.0), (60.0, 52.0), (60.0, 56.0), (60.0, 59.0)]
| Validation   | 44                  | [(49.0, 51.0), (49.0, 55.0), (50.0, 51.0), (50.0, 53.0), (50.0, 54.0), (50.0, 59.0), (50.0, 60.0), (51.0, 49.0), (51.0, 50.0), (51.0, 53.0), (51.0, 54.0), (51.0, 55.0), (51.0, 56.0), (52.0, 50.0), (52.0, 53.0), (52.0, 54.0), (52.0, 56.0), (52.0, 59.0), (52.0, 60.0), (53.0, 52.0), (53.0, 59.0), (53.0, 60.0), (54.0, 51.0), (54.0, 53.0), (