# Generating Synthetic Data with LOGO-Programs, Descriptions and the respective Graphics

In [1]:
# Project-local modules; the v1 and v2 graphic interpreters are imported side by side under distinct aliases
from _1_logo_pseudo_code_generator import generateLOGOPseudoCode
from _2_sampler import LOGOProgramSampler
from _3_executable_logo_primitives import ReGALLOGOPrimitives
from _4_logo_graphic_generator_v1 import PseudoProgramInterpreter as PseudoProgramInterpreter_v1
from _4_logo_graphic_generator_v2 import PseudoProgramInterpreter as PseudoProgramInterpreter_v2
from _5_ascii_processor import ASCIIProcessor

# Pseudo-code generator instance; it is handed to LOGOProgramSampler when the sampler is created further down
generator=generateLOGOPseudoCode()

# json: parsing of the JSON-lines files, pandas: tabular handling of descriptions and programs
import json
import pandas as pd
import os

Load the train, dev and test data from the ReGAL paper. Join them and remove duplicate programs.

In [2]:
# LOGO dataset files of the ReGAL data; the paths are relative to the
# "../external/dependencies/" folder that is prepended when the files are loaded
train_logo_data = "logo_data/python/train_200_dataset.jsonl"
dev_logo_data = "logo_data/python/dev_100.jsonl"
test_logo_data = "logo_data/python/test_dataset.jsonl"

# Load train, dev and test dataset
def load_data(data_path):
    """Read a JSON-lines file into a list of parsed records.

    Args:
        data_path: Path of a text file that holds one JSON document per line.

    Returns:
        list: The parsed JSON values in file order. Blank lines are ignored.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so that parsing does not depend on the platform's default encoding
    with open(data_path, 'r', encoding='utf-8') as f:
        # Skip whitespace-only lines (e.g. an extra empty line at the end of the file);
        # json.loads would otherwise raise on them
        return [json.loads(line) for line in f if line.strip()]

# Extract descriptions and programs from train_data and transform into pandas DataFrame
def extract_descriptions_and_programs(data):
    """Flatten raw dataset records into a two-column DataFrame.

    Two record layouts are understood:

    * chat layout (train/test files): a ``"messages"`` list whose entries carry
      ``"from"`` and ``"value"``; the ``human`` turn supplies the description and
      the ``gpt`` turn supplies the program (the last matching turn wins).
    * flat layout (dev file): a ``"program"`` string plus a ``"language"`` list
      whose entries are joined with single spaces to form the description.

    Records for which either part is missing or empty are left out.

    Args:
        data: Iterable of dict records in one of the layouts above.

    Returns:
        pandas.DataFrame with the columns ``Description`` and ``Program``.
    """
    def _description_and_program(record):
        # Returns (description, program); a part stays None when it is not found
        text, code = None, None
        if "messages" in record:  # chat layout used by the train and test data
            for turn in record.get('messages', []):
                speaker = turn['from']
                if speaker == 'human':
                    text = turn['value']
                elif speaker == 'gpt':
                    code = turn['value']
        elif "program" in record and "language" in record:  # flat layout used by the dev data
            code = record['program']
            text = " ".join(record['language'])
        return text, code

    pairs = (_description_and_program(record) for record in data)
    # Keep only complete pairs (both parts present and non-empty)
    rows = [[text, code] for text, code in pairs if text and code]
    return pd.DataFrame(rows, columns=['Description', 'Program'])

# All three splits live in the same dependency folder; load and flatten them in one pass
df_train, df_dev, df_test = [
    extract_descriptions_and_programs(load_data(f"../external/dependencies/{relative_path}"))
    for relative_path in (train_logo_data, dev_logo_data, test_logo_data)
]

# Show the last two rows of each split as a quick sanity check
display(df_train.tail(2), df_dev.tail(2), df_test.tail(2))

Unnamed: 0,Description,Program
198,8 sided snowflake with a short space and a sho...,"for j in range(8):\n embed(""""""penup()\nforw..."
199,8 sided snowflake with a medium triangle as arms,"for j in range(8):\n embed(""""""for i in rang..."


Unnamed: 0,Description,Program
98,5 sided snowflake with a short line and a smal...,"for j in range(5):\n embed(""""""forward(4)\nl..."
99,6 sided snowflake with a short space and a sho...,"for j in range(6):\n embed(""""""penup()\nforw..."


Unnamed: 0,Description,Program
109,5 sided snowflake with a small 5 gon and a sma...,"for j in range(5):\n embed(""""""penup()\nforw..."
110,6 sided snowflake with a small 5 gon and a sma...,"for j in range(6):\n embed(""""""penup()\nforw..."


In [3]:
# Stack the splits into two combined frames (train+test and train+dev+test);
# ignore_index=True gives each combined frame a fresh 0..n-1 index
df_train_test = pd.concat([df_train, df_test], ignore_index=True)
df_all = pd.concat([df_train, df_dev, df_test], ignore_index=True)
print("Dimensions of the combined train and test data: ", df_train_test.shape)
print("Dimensions of the combined train, dev and test data: ", df_all.shape)

print("\n")
#####################
# Drop rows whose program already occurred earlier in the frame (pandas keeps the first occurrence by default).
# The index is NOT reset afterwards, so the surviving rows keep their labels from the concatenation
# and the index has gaps; the label-based df_all.loc[...] lookup further down relies on these labels.
df_train_test = df_train_test.drop_duplicates(subset=['Program']) # the program text is the unique identifier
df_all = df_all.drop_duplicates(subset=['Program']) # the program text is the unique identifier
#####################
print("Dimensions of the combined train and test data without duplicate Programs: ", df_train_test.shape)
print("Dimensions of the combined train, dev and test data without duplicate Programs: ", df_all.shape)

Dimensions of the combined train and test data:  (311, 2)
Dimensions of the combined train, dev and test data:  (411, 2)


Dimensions of the combined train and test data without duplicate Programs:  (263, 2)
Dimensions of the combined train, dev and test data without duplicate Programs:  (357, 2)


In [4]:
# Bookkeeping derived from the frames themselves instead of hand-copied numbers,
# so the figures stay correct when the input data changes.
# Requires df_train_test and df_all to be deduplicated already.
TARGET_DATASET_SIZE = 10000  # total number of programs aimed for (ReGAL programs + synthetic programs)

# Number of duplicate programs removed from train + test
print(len(df_train) + len(df_test) - len(df_train_test))
# Number of duplicate programs removed from train + dev + test
print(len(df_train) + len(df_dev) + len(df_test) - len(df_all))
# Number of synthetic programs needed to reach the target size
print(TARGET_DATASET_SIZE - len(df_all))

48
54
9643


## Generate Graphics for the ReGAL dataset and the Synthetic Data

- Initialize the sampler. If the train and test dataframes are provided, the newly generated programs are guaranteed to differ from the ones in these datasets.
- The data can be stored in JSON-lines format.
- The next step is to initialize the interpreter, which executes the programs and generates the graphics.
    - The interpreter generates .png files.
    - All graphics generated from one dataframe have the same size. The aim is to keep the size information relative to the other graphics, which is why an image can contain a lot of whitespace (this choice is debatable).

In [4]:
sampler=LOGOProgramSampler(generator, df_all) # df_all is passed so that only new programs are generated, i.e. none that already occur in the train, dev or test data

# One-off generation of the synthetic data. Kept commented out so that re-running the notebook
# loads the stored file below instead of sampling a different set of programs.
#synthetic_data = sampler.sample(9643) # sample 9643 new programs + the 357 programs from the train, dev and test data = 10000 programs
#synthetic_data = pd.DataFrame(synthetic_data, columns=['Description', 'Program'])

# Save the data as a JSON-lines file named with the current timestamp
#timestamp = pd.Timestamp.now().strftime("%Y%m%d%H%M%S")
#synthetic_data.to_json(f"data/synthetic_data_{timestamp}.jsonl", orient="records", lines=True)

# Load the previously generated synthetic data (the file name carries the timestamp of its creation)
synthetic_data = pd.read_json("data/synthetic_data_20250120143151.jsonl", orient="records", lines=True)

In [10]:
# Generate graphics for the synthetic data with the v2 interpreter; output goes to logo_graphic/synthetic_v2
# NOTE(review): how v2 differs from v1 is not visible in this notebook - confirm in _4_logo_graphic_generator_v2
interpreter = PseudoProgramInterpreter_v2()
interpreter.process_and_save_graphics(synthetic_data, output_dir="logo_graphic/synthetic_v2")

In [6]:
# Generate graphics for the synthetic data with the v1 interpreter; output goes to logo_graphic/synthetic_v1,
# the directory the ASCII processing step reads from later on
interpreter = PseudoProgramInterpreter_v1()
interpreter.process_and_save_graphics(synthetic_data, output_dir="logo_graphic/synthetic_v1") # here the graphics are tightly cropped around the image
# Runtime (author's note): roughly 30 minutes for ca. 10,000 programs

In [8]:
# Print description and program of two example rows (positions 4 and 9)
for position in (4, 9):
    for column in ('Description', 'Program'):
        print(synthetic_data[column].iloc[position])

7 medium line in a row
for j in range(7):
    embed("""forward(4)""", locals())
    penup()
    forward(2)
    left(0.0)

    pendown()
connected sequence of shapes: a small 6-gon, a short line, a small 9-gon
for i in range(6):
    forward(2)
    left(60.0)
forward(2)
for i in range(9):
    forward(2)
    left(40.0)


In [4]:
# TEST
# Create a small test dataset from df_all
test_indices = [98, 44, 100, 99, 200, 212, 214, 201, 53, 54, 282] # examples representing different shapes and combinations for testing purposes
# .loc selects by index LABEL: df_all still carries the labels from the concatenation (its index was not
# reset after dropping duplicates), so every label listed above has to have survived the deduplication
df_test_subset = df_all.loc[test_indices].reset_index(drop=True)
#display(df_test_subset)

# Render the selected examples with the v2 interpreter
interpreter = PseudoProgramInterpreter_v2()
interpreter.process_and_save_graphics(df_test_subset, output_dir="logo_graphic/11testshapes")

In [12]:
# ReGAL DATA GRAPHICS
# Generate graphics for all deduplicated ReGAL programs (train + dev + test) with the v1 interpreter;
# output goes to logo_graphic/all_ReGAL, the directory the ASCII processing step reads from later on
interpreter = PseudoProgramInterpreter_v1()
interpreter.process_and_save_graphics(df_all, output_dir="logo_graphic/all_ReGAL")

## ASCII-Transformer

Based on the approach by Li and Ellis (2024). Similar to the authors, I ensure a square image size (here 525x525), which I divide into 35x35 blocks of 15x15 pixels each. In comparison, the authors crop a 512x512 section around the center of the image and divide it into 32x32 blocks of 16x16 pixels each. Like the authors, I then calculate the density of black pixels in each block and quantize it into 10 levels, which are represented by the ASCII digits 0-9. Each digit represents one block, resulting in a string of 35 lines with 35 digits per line. A low density maps to 0; the higher the density of black pixels, the closer the digit is to 9.

In [8]:
# Synthetic data ASCII
# 35 x 35 blocks and 10 density levels, matching the description in the markdown cell above
processor = ASCIIProcessor(n_blocks=35, m_blocks=35, levels=10)

# Directory into which the v1 interpreter wrote the graphics of the synthetic programs
dir_images = "logo_graphic/synthetic_v1/"
# Judging by the displayed result, the returned frame carries an additional 'ascii_input' column
synthetic_data_ascii = processor.store_ascii_input(synthetic_data, dir_images)
display(synthetic_data_ascii.head(2))

Unnamed: 0,Description,Program,ascii_input
0,"separated sequence of shapes: a small 8-gon, a...",for i in range(8):\n forward(2)\n left(4...,00000000000000000000000000000000000\n000000000...
1,a zigzag with 4 medium steps,for i in range(4):\n forward(4)\n left(9...,00000000000000000000000000000000000\n000000000...


In [9]:
# ReGAL data ASCII: same step as for the synthetic data; reuses the `processor` created in the previous cell
# and reads the graphics that the v1 interpreter wrote to logo_graphic/all_ReGAL
dir_images = "logo_graphic/all_ReGAL/"
df_all_ascii = processor.store_ascii_input(df_all, dir_images)
display(df_all_ascii.head(2))

Unnamed: 0,Description,Program,ascii_input
0,4 concentric square s,"for i in range(5):\n embed(""""""for j in rang...",00000000000000000000000000000000000\n012222222...
1,6 sided snowflake with a medium line and a med...,"for j in range(6):\n embed(""""""forward(8)\nl...",00000000000000000000000000000000000\n000000000...


## Design Splits in Train and Test given generalization aspect

- length generalization:
    - criterion: semantic length
- mix and match concepts
    - compose different concepts
    - switch concept order
- apply general principles
    - compose new operations
    - add operation functionality

In [13]:
# Combine the ReGAL data and the synthetic data (both including the ASCII column) into one frame
# with a fresh 0..n-1 index; the cell output shows the total number of rows
df_all_syn = pd.concat([df_all_ascii, synthetic_data_ascii], ignore_index=True)
len(df_all_syn)

10000

In [14]:
from _6_semantic_length import SemanticLength

sem_length = SemanticLength()

# Score every program and sort ascending by that score. After reset_index the index label of a row
# equals its position, which is what allows the mix of .loc (below) and .iloc (for the slicing).
df_all_syn['Semantic Length'] = df_all_syn['Program'].apply(sem_length.calc_semantic_length)
df_all_syn=df_all_syn.sort_values(by="Semantic Length").reset_index(drop=True)

# Determine the split index
test_start_id = int(len(df_all_syn)*0.9)        # 10% for the test set
print(test_start_id)
# Check and adjust the split index if both train and test would contain programs with the same semantic length:
# the boundary is moved forward (the test set shrinks) until the first test row has a different
# semantic length than the last train row
while (
    test_start_id < len(df_all_syn) and  # stop when the boundary reaches the end of the frame
    df_all_syn.loc[test_start_id, 'Semantic Length'] == df_all_syn.loc[test_start_id - 1, 'Semantic Length']
):
    test_start_id += 1
print(test_start_id)

# Create train and test splits: everything before the boundary is train, the rest (longest programs) is test
train_data = df_all_syn.iloc[:test_start_id]
test_data = df_all_syn.iloc[test_start_id:]

print("Train:")
display(train_data.head(2))
display(train_data.tail(2))
print("\nTest:")
display(test_data.head(2))
display(test_data.tail(2))

# Create a validation dataset from the training data with the same number of rows as the test data
rs = 42  # fixed seed so that the validation sample is reproducible
length = int(len(test_data))
validation_data = train_data.sample(n=length, random_state=rs)  # pandas samples without replacement by default
# Remove the sampled rows from the training data so that train and validation do not overlap
train_data = train_data.drop(validation_data.index)

print("\nTrain set size:", len(train_data))
print("\nValidation set size:", len(validation_data))
print("\nTest set size:", len(test_data))

9000
9003
Train:


Unnamed: 0,Description,Program,ascii_input,Semantic Length
0,a 5 pointed star,for i in range(5):\n forward(16)\n left(...,00000000000000000000000000000000000\n000000000...,5
1,a greek spiral with 5 turns,for i in range(6):\n forward(1 * i)\n le...,00000000000000000000000000000000000\n000000000...,6


Unnamed: 0,Description,Program,ascii_input,Semantic Length
9001,"separated sequence of shapes: a small 7-gon, a...",for i in range(7):\n forward(2)\n left(5...,00000000000000000000000000000000000\n000000000...,1363362
9002,"separated sequence of shapes: a small 7-gon, a...",for i in range(7):\n forward(2)\n left(5...,00000000000000000000000000000000000\n000000000...,1363362



Test:


Unnamed: 0,Description,Program,ascii_input,Semantic Length
9003,a 3 sided snowflake with arms of connected seq...,"for j in range(3):\n embed(""""""for i in rang...",00000000000000000000000000000000000\n000000000...,1368450
9004,"connected sequence of shapes: a small 7-gon, a...",for i in range(7):\n forward(2)\n left(5...,00000000000000000000000000000000000\n000000000...,1368451


Unnamed: 0,Description,Program,ascii_input,Semantic Length
9998,"separated sequence of shapes: a small circle, ...",for i in range(HALF_INF):\n forward(EPS_DIS...,00000000000000000000000000000000000\n000000000...,68407622226360
9999,"separated sequence of shapes: a small circle, ...",for i in range(HALF_INF):\n forward(EPS_DIS...,00000000000000000000000000000000000\n000000000...,68408672018760



Train set size: 8006

Validation set size: 997

Test set size: 997


In [17]:
#save data as json-line file with current timestamp
#timestamp = pd.Timestamp.now().strftime("%Y%m%d%H%M%S")
#train_data.to_json(f"data/length_train_data_ascii_{timestamp}.jsonl", orient="records", lines=True)
#validation_data.to_json(f"data/length_val_data_ascii_{timestamp}.jsonl", orient="records", lines=True)
#test_data.to_json(f"data/length_test_data_ascii_{timestamp}.jsonl", orient="records", lines=True)

In [15]:
# generate hf-hub dataset and push to hf-hub
from datasets import Dataset, DatasetDict

In [44]:
def preprocess_dataset(data, include_description=False, separator=""):
    """Build the model-ready frame consisting of an ``Input`` and a ``Program`` column.

    The caller's DataFrame is not modified.

    Args:
        data: DataFrame with the columns ``ascii_input`` and ``Program``; the column
            ``Description`` is needed as well when ``include_description`` is True.
        include_description: If True, the input is
            ``"description: " + Description + separator + "ascii-art: " + ascii_input``;
            otherwise it is ``"ascii-art: " + ascii_input``.
        separator: Text placed between the description and the ``"ascii-art: "`` tag.
            The default ``""`` reproduces the existing format, in which the description
            runs directly into the tag (``"description: a squareascii-art: ..."``).
            NOTE(review): this looks like a missing delimiter; pass e.g. ``"\\n"`` to
            separate the two fields. The default is left unchanged because data in the
            current format has already been pushed to the hub.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index and exactly the columns
        ``Input`` and ``Program`` (both converted to ``str``).

    Raises:
        KeyError: If a required column is missing from ``data``.
    """
    required = ['ascii_input', 'Program']
    if include_description:
        required.append('Description')
    missing = [column for column in required if column not in data.columns]
    if missing:
        raise KeyError(f"preprocess_dataset: missing required column(s): {missing}")

    # reset_index returns a new frame, so nothing below touches the caller's data
    prepared = data.reset_index(drop=True)

    # Ensure the text columns are strings before concatenating
    model_input = "ascii-art: " + prepared['ascii_input'].astype(str)
    if include_description:
        description = "description: " + prepared['Description'].astype(str)
        model_input = description + separator + model_input
    # optionally one could add start "<s> " and end tokens " </s>"

    # Only the two columns the model needs are returned
    return pd.DataFrame({'Input': model_input, 'Program': prepared['Program'].astype(str)})

In [45]:
# Variant 1: input consists of the ASCII art only.
# Convert each split to a Hugging Face Dataset, bundle the splits and upload them.
train_dataset, validation_dataset, test_dataset = [
    Dataset.from_pandas(preprocess_dataset(split_frame, include_description=False))
    for split_frame in (train_data, validation_data, test_data)
]
dataset_dict = DatasetDict(
    {"train": train_dataset, "validation": validation_dataset, "test": test_dataset}
)
dataset_dict.push_to_hub("ruthchy/semantic-length-generalization-logo-data-ascii", private=True)

# Variant 2: input consists of the description followed by the ASCII art.
train_dataset, validation_dataset, test_dataset = [
    Dataset.from_pandas(preprocess_dataset(split_frame, include_description=True))
    for split_frame in (train_data, validation_data, test_data)
]
dataset_dict = DatasetDict(
    {"train": train_dataset, "validation": validation_dataset, "test": test_dataset}
)
dataset_dict.push_to_hub("ruthchy/semantic-length-generalization-logo-data-ascii-desc", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/527 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/ruthchy/semantic-length-generalization-logo-data-ascii-desc/commit/30ee416a97395950587adfd7fc7c6f1cafe987bf', commit_message='Upload dataset', commit_description='', oid='30ee416a97395950587adfd7fc7c6f1cafe987bf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ruthchy/semantic-length-generalization-logo-data-ascii-desc', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ruthchy/semantic-length-generalization-logo-data-ascii-desc'), pr_revision=None, pr_num=None)