### Generate Datasets

In [1]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import skimage.draw
import json

# Make the LLMP package importable from this notebook
import sys

sys.path.append("../")  # Adds the parent directory ("../") to the module search path; presumably LLMP lives there - confirm

import LLMP as L

import uuid

# Generate the fine-tuning dataset: for every task, render `num_images_per_task`
# stimuli with LLMP, save each one as an image file, and collect one JSON record
# per image. All records are written to a single combined JSON file at the end.

# Define the main output directory
main_output_dir = "finetuning-EXP1-5000-10epochs-backup"

# Subdirectories for images and JSON files
image_output_dir = os.path.join(main_output_dir, "images")
json_output_dir = os.path.join(main_output_dir, "json")

# Create directories if they don't exist
os.makedirs(image_output_dir, exist_ok=True)
os.makedirs(json_output_dir, exist_ok=True)

# Task name -> question text stored with every sample of that task.
# NOTE: "position_common_scale" and "position_non_aligned_scale" use the exact
# same question text, so the question alone cannot identify the task. Each
# record therefore also carries the task name (see `json_entry` below).
tasks = {
    "position_common_scale": "Please estimate the vertical position of the block relative to the line on the left (Top is 0, Bottom is 60). So the range is 0 - 60. No explanation.",
    "position_non_aligned_scale": "Please estimate the vertical position of the block relative to the line on the left (Top is 0, Bottom is 60). So the range is 0 - 60. No explanation.",
    "length": "Please estimate the length of the vertical line (from top to bottom). The height of the whole image is 100. No explanation.",
    "direction": "Please estimate the direction of the line relative to the starting dot in the range 0 - 359 degrees. No explanation.",
    "angle": "Please estimate the angle (0-90 degrees). No explanation.",
    "area": "Please estimate the area covered by the circle. The whole image is 100x100 with an area of 10000. No explanation.",
    "volume": "Please estimate the volume of the cube. The cube size is relative to the image size of 100x100. No explanation.",
    "curvature": "Please estimate the curvature of the line. (0 is no curvature - 1 is the maximum curvature) The more bend the line is, the higher the curvature. No explanation.",
    "shading": "Please estimate the shading density or texture density (range 0 to 100). No explanation."
}

# Number of images to generate for each task
num_images_per_task = 5000

# List to store all data from all tasks
combined_dataset = []

# Loop through each task
for task, question in tasks.items():
    print(f"Generating images and dataset for task: {task}")

    # Generate the requested number of (image, label) samples for this task
    for i in range(num_images_per_task):
        # Render one stimulus and its ground-truth label with LLMP
        image_array, label = L.GPImage.figure1(task)

        # Scale to 8-bit pixel values.
        # assumes image_array holds values in [0, 1] - TODO confirm
        image_array_uint8 = (image_array * 255).astype(np.uint8)

        # Convert the NumPy array to a PIL image
        pil_image = Image.fromarray(image_array_uint8)

        # Generate a unique ID for the image
        unique_id = str(uuid.uuid4())

        # Save the image with the unique ID (format is chosen from the ".jpg" extension)
        image_filename = os.path.join(image_output_dir, f"{unique_id}.jpg")
        pil_image.save(image_filename)

        # Create a JSON entry for the dataset. 'task' is stored in addition to
        # the question so that tasks sharing a question text stay distinguishable.
        json_entry = {
            'id': unique_id,
            'image': f"{unique_id}.jpg",
            'question': question,
            'value': label,
            'task': task
        }

        # Append the JSON entry to the combined dataset list
        combined_dataset.append(json_entry)

# Save the combined dataset as a single JSON file in the JSON folder
combined_json_filename = "combined_dataset.json"
combined_json_filepath = os.path.join(json_output_dir, combined_json_filename)

with open(combined_json_filepath, 'w') as json_file:
    json.dump(combined_dataset, json_file, indent=4)

print(f"Images saved in '{image_output_dir}' and combined dataset saved as '{combined_json_filename}' in '{json_output_dir}'")


Generating images and dataset for task: position_common_scale
Generating images and dataset for task: position_non_aligned_scale
Generating images and dataset for task: length
Generating images and dataset for task: direction
Generating images and dataset for task: angle
Generating images and dataset for task: area
Generating images and dataset for task: volume
Generating images and dataset for task: curvature
Generating images and dataset for task: shading
Images saved in 'finetuning-EXP1-5000-10epochs-backup/images' and combined dataset saved as 'combined_dataset.json' in 'finetuning-EXP1-5000-10epochs-backup/json'


### Distribution of each task

In [2]:
# Show the kernel's current working directory; the relative paths used in this notebook resolve against it
pwd



'/home/huuthanhvy.nguyen001/tmp/LLMP/EXP'

In [6]:
import pandas as pd

# Path to the combined dataset JSON file (relative, so it resolves against the
# kernel's current working directory)
json_file_path = './EXPs-5000-3pochs/finetune/outputEXP1-5000-3poch/json/combined_dataset.json'

# Load the JSON file into a DataFrame; `get_task_distribution` below reads its
# 'question' and 'value' columns
df = pd.read_json(json_file_path)

import pandas as pd

# Dictionary to map simplified task names to full questions
# NOTE(review): the "position_non_aligned_scale" text below differs from the
# question text the generation cell writes for that task (there it is identical
# to the "position_common_scale" question). If the loaded dataset uses the
# generation-cell wording, looking up "position_non_aligned_scale" by question
# matches no rows - confirm which wording the dataset contains.
task_descriptions = {
    "position_common_scale": "Please estimate the vertical position of the block relative to the line on the left (Top is 0, Bottom is 60). So the range is 0 - 60. No explanation.",
    "position_non_aligned_scale": "Please estimate the vertical position of the block relative to a misaligned line on the left. (Top is 0, Bottom is 60). No explanation.",
    "length": "Please estimate the length of the vertical line (from top to bottom). The height of the whole image is 100. No explanation.",
    "direction": "Please estimate the direction of the line relative to the starting dot in the range 0 - 359 degrees. No explanation.",
    "angle": "Please estimate the angle (0-90 degrees). No explanation.",
    "area": "Please estimate the area covered by the circle. The whole image is 100x100 with an area of 10000. No explanation.",
    "volume": "Please estimate the volume of the cube. The cube size is relative to the image size of 100x100. No explanation.",
    "curvature": "Please estimate the curvature of the line. (0 is no curvature - 1 is the maximum curvature) The more bend the line is, the higher the curvature. No explanation.",
    "shading": "Please estimate the shading density or texture density (range 0 to 100). No explanation."
}


def get_task_distribution(df, task_key):
    """Print how often each distinct 'value' occurs for one task.

    Args:
        df: DataFrame with at least the columns 'question' and 'value'. If it
            also has a 'task' column, that column is used to select rows when
            `task_key` is a simplified task name, which keeps tasks that share
            the same question text apart.
        task_key: Either a simplified task name (a key of `task_descriptions`)
            or a full question text (a value of `task_descriptions`).

    Returns:
        None. The distribution (columns 'value' and 'count', sorted by 'value'
        ascending) is printed in full. An unknown `task_key` prints a
        "Task not found" message instead.
    """
    # Resolve the key: simplified names map to their full question text,
    # full question texts are used as given.
    if task_key in task_descriptions:
        task_description = task_descriptions[task_key]
        is_short_name = True
    elif task_key in task_descriptions.values():
        task_description = task_key
        is_short_name = False
    else:
        print("Task not found. Please provide a valid task name or description.")
        return

    # Select the rows of the requested task. Prefer an explicit 'task' column
    # when the dataset has one; otherwise fall back to matching the question text.
    if is_short_name and 'task' in df.columns:
        task_df = df[df['task'] == task_key]
    else:
        task_df = df[df['question'] == task_description]

    # Count rows per distinct 'value' and order the result by 'value'
    distribution = (
        task_df.groupby(['value'])
        .size()
        .reset_index(name='count')
        .sort_values(by='value', ascending=True)
    )

    # Show every row/column untruncated, but only for this print: option_context
    # restores the previous display settings on exit instead of changing them
    # globally for the rest of the session.
    with pd.option_context(
        'display.max_rows', None,
        'display.max_columns', None,
        'display.max_colwidth', None,
    ):
        print(distribution)



FileNotFoundError: File ./EXPs-5000-3pochs/finetune/EXPs-5000-3pochs/json/combined_dataset.json does not exist

In [4]:
# Example usage: print the count of samples per distinct 'value' for the volume task
get_task_distribution(df, "volume")  # Looked up by its simplified task name

     value  count
0      1.0    252
1      8.0    249
2     27.0    248
3     64.0    227
4    125.0    243
5    216.0    260
6    343.0    247
7    512.0    261
8    729.0    264
9   1000.0    268
10  1331.0    237
11  1728.0    254
12  2197.0    252
13  2744.0    243
14  3375.0    261
15  4096.0    225
16  4913.0    247
17  5832.0    248
18  6859.0    258
19  8000.0    256


In [9]:
# NOTE(review): the generation cell gives "position_common_scale" and
# "position_non_aligned_scale" the same question text, and rows are matched by
# question text, so this output presumably mixes both tasks - confirm.
get_task_distribution(df, "position_common_scale")  # Using simplified task name

    value  count
0     0.0     87
1     1.0     72
2     2.0     89
3     3.0     86
4     4.0     91
5     5.0     89
6     6.0     76
7     7.0     81
8     8.0     84
9     9.0     88
10   10.0     82
11   11.0     85
12   12.0     92
13   13.0     77
14   14.0     80
15   15.0     77
16   16.0     99
17   17.0     82
18   18.0     70
19   19.0     95
20   20.0     94
21   21.0    341
22   22.0    332
23   23.0    336
24   24.0    309
25   25.0    345
26   26.0    336
27   27.0    352
28   28.0    315
29   29.0    348
30   30.0    328
31   31.0    322
32   32.0    330
33   33.0    310
34   34.0    351
35   35.0    329
36   36.0    346
37   37.0    342
38   38.0    349
39   39.0    335
40   40.0    333
41   41.0     81
42   42.0     78
43   43.0     82
44   44.0     73
45   45.0     71
46   46.0     83
47   47.0     78
48   48.0     86
49   49.0     59
50   50.0     88
51   51.0     74
52   52.0     84
53   53.0     89
54   54.0     90
55   55.0     88
56   56.0     77
57   57.0     

In [5]:
# Print the count of samples per distinct 'value' for the area task
get_task_distribution(df, "area")

          value  count
0      3.141593    142
1     12.566371    116
2     28.274334    139
3     50.265482    139
4     78.539816    136
5    113.097336    118
6    153.938040    127
7    201.061930    125
8    254.469005    126
9    314.159265    149
10   380.132711    109
11   452.389342    146
12   530.929158    139
13   615.752160    128
14   706.858347    119
15   804.247719    110
16   907.920277    114
17  1017.876020    113
18  1134.114948    144
19  1256.637061    125
20  1385.442360    114
21  1520.530844    121
22  1661.902514     86
23  1809.557368    103
24  1963.495408    129
25  2123.716634    100
26  2290.221044    139
27  2463.008640    134
28  2642.079422    141
29  2827.433388    105
30  3019.070540    126
31  3216.990877    138
32  3421.194400    131
33  3631.681108    130
34  3848.451001    112
35  4071.504079    127
36  4300.840343    126
37  4536.459792    119
38  4778.362426    134
39  5026.548246    121


In [7]:
# Print the count of samples per distinct 'value' for the curvature task
get_task_distribution(df, "curvature")

    value  count
0   0.000     53
1   0.001     59
2   0.002     49
3   0.003     52
4   0.004     48
5   0.006     69
6   0.007     78
7   0.008     50
8   0.009     69
9   0.010     56
10  0.011     58
11  0.012     53
12  0.013     76
13  0.014     58
14  0.016     58
15  0.017     62
16  0.018     55
17  0.019     84
18  0.020     71
19  0.021     54
20  0.022     59
21  0.023     62
22  0.024     62
23  0.026     63
24  0.027     86
25  0.028     65
26  0.029     57
27  0.030     68
28  0.031     74
29  0.032     67
30  0.033     70
31  0.034     51
32  0.036     65
33  0.037     58
34  0.038     54
35  0.039     62
36  0.040     49
37  0.041     59
38  0.042     64
39  0.043     75
40  0.044     59
41  0.046     58
42  0.047     57
43  0.048     69
44  0.049     63
45  0.050     63
46  0.051     53
47  0.052     60
48  0.053     70
49  0.054     75
50  0.056     61
51  0.057     61
52  0.058     73
53  0.059     68
54  0.060     75
55  0.061     67
56  0.062     65
57  0.063     

In [8]:
# Print the count of samples per distinct 'value' for the shading task
get_task_distribution(df, "shading")

    value  count
0     1.0     49
1     2.0     57
2     3.0     46
3     4.0     43
4     5.0     44
5     6.0     51
6     7.0     54
7     8.0     58
8     9.0     46
9    10.0     53
10   11.0     40
11   12.0     55
12   13.0     63
13   14.0     46
14   15.0     53
15   16.0     50
16   17.0     41
17   18.0     42
18   19.0     60
19   20.0     54
20   21.0     45
21   22.0     51
22   23.0     52
23   24.0     59
24   25.0     42
25   26.0     57
26   27.0     51
27   28.0     42
28   29.0     41
29   30.0     38
30   31.0     56
31   32.0     59
32   33.0     50
33   34.0     45
34   35.0     55
35   36.0     60
36   37.0     55
37   38.0     66
38   39.0     50
39   40.0     49
40   41.0     41
41   42.0     48
42   43.0     57
43   44.0     54
44   45.0     52
45   46.0     54
46   47.0     59
47   48.0     32
48   49.0     40
49   50.0     43
50   51.0     42
51   52.0     48
52   53.0     50
53   54.0     54
54   55.0     56
55   56.0     55
56   57.0     60
57   58.0     