# Generating Synthetic Data with LOGO-Programs, Descriptions and the respective Graphics

In [1]:
# load
from _1_logo_pseudo_code_generator import generateLOGOPseudoCode
from _2_sampler import LOGOProgramSampler
from _3_executable_logo_primitives import ReGALLOGOPrimitives
from _4_logo_graphic_generator_v1 import PseudoProgramInterpreter as PseudoProgramInterpreter_v1
from _4_logo_graphic_generator_v2 import PseudoProgramInterpreter as PseudoProgramInterpreter_v2
from _5_ascii_processor import ASCIIProcessor

# Pseudo-code generator instance; it is the first argument of LOGOProgramSampler
# in the (currently commented-out) sampling cell further down.
generator=generateLOGOPseudoCode()

import json
import pandas as pd
import os

Load the train and test data from the ReGAL paper, join both, and remove duplicate programs.

In [2]:
# LOGO datasets from the ReGAL paper (paths relative to ../external/dependencies/)
train_logo_data = "logo_data/python/train_200_dataset.jsonl"
test_logo_data = "logo_data/python/test_dataset.jsonl"

# Load train and test dataset (JSON Lines: one JSON object per line).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default, and blank lines are skipped because json.loads would raise on them.
with open(f"../external/dependencies/{train_logo_data}", 'r', encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]

with open(f"../external/dependencies/{test_logo_data}", 'r', encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

In [3]:
# Extract descriptions and programs from train_data and transform into pandas DataFrame
def extract_descriptions_and_programs(data):
    """Collect (description, program) pairs from conversation records.

    For every item, the 'value' of a message whose 'from' field is 'human' is
    taken as the description and the 'value' of a message whose 'from' field is
    'gpt' as the program. If a role occurs more than once, the last message of
    that role wins. Items without a truthy description or a truthy program are
    skipped.

    Parameters
    ----------
    data : iterable of dict
        Records that may carry a 'messages' list of dicts with 'from' and
        'value' keys.

    Returns
    -------
    pandas.DataFrame
        One row per kept item, with the columns 'Description' and 'Program'.
    """
    rows = []
    for record in data:
        human_text = gpt_text = None
        for msg in record.get('messages', []):
            sender = msg['from']
            if sender == 'human':
                human_text = msg['value']
            elif sender == 'gpt':
                gpt_text = msg['value']
        # Keep only complete pairs
        if not (human_text and gpt_text):
            continue
        rows.append([human_text, gpt_text])
    return pd.DataFrame(rows, columns=['Description', 'Program'])

# Build one DataFrame per split with the shared extractor (train first, then test)
df_train, df_test = (
    extract_descriptions_and_programs(split) for split in (train_data, test_data)
)

# Stack the test rows below the train rows under a fresh 0..n-1 index, then keep
# only the first occurrence of each program (the program text is the unique
# identifier). Surviving rows keep their labels from the concatenated frame.
df_all = pd.concat([df_train, df_test], ignore_index=True).drop_duplicates(subset=['Program'])

display(df_all)

Unnamed: 0,Description,Program
0,4 concentric square s,"for i in range(5):\n embed(""""""for j in rang..."
1,6 sided snowflake with a medium line and a med...,"for j in range(6):\n embed(""""""forward(8)\nl..."
2,5 sided snowflake with a medium line and a sma...,"for j in range(5):\n embed(""""""forward(8)\nl..."
3,6 short line s in a row,"for j in range(6):\n embed(""""""forward(2)\nl..."
4,a small triangle connected by a big line to a ...,for i in range(3):\n forward(2)\n left(1...
...,...,...
306,8 sided snowflake with a medium circle and a s...,"for j in range(8):\n embed(""""""penup()\nforw..."
307,5 sided snowflake with 2 small circle s as arms,"for j in range(5):\n embed(""""""penup()\nforw..."
308,3 sided snowflake with a small square and a sm...,"for j in range(3):\n embed(""""""penup()\nforw..."
309,5 sided snowflake with a small 5 gon and a sma...,"for j in range(5):\n embed(""""""penup()\nforw..."


## Generate Graphics for the ReGAL dataset and the Synthetic Data

- Initialize the sampler. If the train and test dataframes are provided, the newly generated programs will not duplicate the ones in these datasets.
- The data can be stored in JSON Lines format.
- The next step is to initialize the interpreter, which executes the programs and generates the graphics.
    - The interpreter generates .png files.
    - All graphics generated from one DataFrame have the same size. The aim is to keep size information relative to the other graphics, which is why there can be a lot of whitespace; this might be debatable.

In [2]:
# Sampling step (disabled: a previously generated file is loaded at the end of this cell instead).
# Passing train_df/test_df makes the sampler generate only new programs that are not in the train or test data.
#sampler=LOGOProgramSampler(generator, train_df=df_train, test_df=df_test)

# Synthetic data
#synthetic_data = sampler.sample(50) 
#synthetic_data = pd.DataFrame(synthetic_data, columns=['Description', 'Program'])

# Save the data as a JSON Lines file named with the current timestamp
#timestamp = pd.Timestamp.now().strftime("%Y%m%d%H%M%S")
#synthetic_data.to_json(f"data/synthetic_data_{timestamp}.jsonl", orient="records", lines=True)

# Load the stored synthetic data (JSON Lines: one record per line)
synthetic_data = pd.read_json("data/synthetic_data_20250115181421.jsonl", orient="records", lines=True)

In [None]:
# Generate graphics for the synthetic data with the v2 interpreter;
# the output is written below logo_graphic/synthetic_v2.
interpreter = PseudoProgramInterpreter_v2()
interpreter.process_and_save_graphics(synthetic_data, output_dir="logo_graphic/synthetic_v2")

In [3]:
# Generate graphics for the synthetic data with the v1 interpreter;
# the output is written below logo_graphic/synthetic_v1.
interpreter = PseudoProgramInterpreter_v1()
interpreter.process_and_save_graphics(synthetic_data, output_dir="logo_graphic/synthetic_v1") # here graphics are tightly cropped around the image

In [8]:
# Show the description followed by the program for two sample rows (positions 4 and 9)
for sample_pos in (4, 9):
    for column in ('Description', 'Program'):
        print(synthetic_data[column].iloc[sample_pos])

7 medium line in a row
for j in range(7):
    embed("""forward(4)""", locals())
    penup()
    forward(2)
    left(0.0)

    pendown()
connected sequence of shapes: a small 6-gon, a short line, a small 9-gon
for i in range(6):
    forward(2)
    left(60.0)
forward(2)
for i in range(9):
    forward(2)
    left(40.0)


In [4]:
# TEST
# Create a small test dataset from df_all
test_indices = [98, 44, 100, 99, 200, 212, 214, 201, 53, 54, 282] # examples representing different shapes and combinations for testing purposes
# NOTE(review): .loc selects by index label, and df_all still carries the labels it had
# before drop_duplicates. A label removed as a duplicate would raise a KeyError here;
# confirm that all listed labels survive deduplication.
df_test_subset = df_all.loc[test_indices].reset_index(drop=True)
#display(df_test_subset)

# Render the selected examples with the v2 interpreter
interpreter = PseudoProgramInterpreter_v2()
interpreter.process_and_save_graphics(df_test_subset, output_dir="logo_graphic/11testshapes")

In [5]:
# ReGAL DATA GRAPHICS
interpreter = PseudoProgramInterpreter_v2()
interpreter.process_and_save_graphics(df_all, output_dir="logo_graphic/train200_test")

## ASCII-Transformer

Based on the approach by Li and Ellis (2024). Similar to the authors, I ensure a square image size (here 525x525), which I divide into 35x35 blocks of 15x15 pixels each. In comparison, the authors cropped a 512x512 section around the center of the image and divided it into 32x32 blocks of 16x16 pixels each. Like the authors, I then calculate the density of black pixels in each block and quantize it into 10 levels, which are represented by the ASCII digits 0-9. Each digit represents one block, resulting in a string of 35 lines with 35 digits each. A low density maps to 0, and the higher the density of black pixels, the closer the digit is to 9.

In [4]:
# Synthetic data ASCII
# 35x35 blocks and 10 density levels, as described in the markdown text of this section
processor = ASCIIProcessor(n_blocks=35, m_blocks=35, levels=10)

# Directory that the PseudoProgramInterpreter_v1 cell wrote its graphics to
dir_images = "logo_graphic/synthetic_v1/"
# NOTE(review): judging by the displayed output, the result carries an additional
# 'ascii_input' column; confirm against ASCIIProcessor.store_ascii_input
synthetic_data_ascii = processor.store_ascii_input(synthetic_data, dir_images)
display(synthetic_data_ascii)

Unnamed: 0,Description,Program,ascii_input
0,separated sequence of shapes: a medium triangl...,for i in range(3):\n forward(4)\n left(1...,00000000000000000000000000000000000\n000000000...
1,a 4 sided snowflake with an arm of a short line,"for j in range(4):\n embed(""""""forward(2)\nl...",00000000000000000000000000000000000\n000000000...
2,4 small 7-gon in a row,"for j in range(4):\n embed(""""""for i in rang...",00000000000000000000000000000000000\n000000000...
3,"connected sequence of shapes: a medium 9-gon, ...",for i in range(9):\n forward(4)\n left(4...,00000000000000000000000000000000000\n000000000...
4,7 medium line in a row,"for j in range(7):\n embed(""""""forward(4)""""""...",00000000000000000000000000000000000\n000000000...
5,a star with 6 points,for i in range(3):\n forward(10)\n left(...,00000000000000000000000000000000000\n000000000...
6,7 medium 7-gon in a row,"for j in range(7):\n embed(""""""for i in rang...",00000000000000000000000000000000000\n000000000...
7,a 8 sided snowflake with an arm of a medium 9-gon,"for j in range(8):\n embed(""""""for i in rang...",00000000000000000000000000000000000\n000000000...
8,9 medium square in a row,"for j in range(9):\n embed(""""""for i in rang...",00000000000000000000000000000000000\n000000000...
9,"connected sequence of shapes: a small 6-gon, a...",for i in range(6):\n forward(2)\n left(6...,00000000000000000000000000000000000\n000000000...
