### Import Dependencies

In [9]:
import os
import json

### Load Data

In [10]:
# Root of the cloned repo that holds the source folders
dir_base = 'asymptote-exemples'

# Folder that receives the generated dataset; created if it is missing
dir_data = 'data'
os.makedirs(dir_data, exist_ok=True)

# Destination JSONL file inside the output folder
jsonl_path = os.path.join(
    dir_data,
    'asymptote_dataset_phase1.jsonl',
)

In [11]:
# Samples whose tag text contains any of these strings are skipped
DISCARD_TAGS = [
    'Animation',
    'Fractals',
    'L-System',
    'Tiling',
    'tube',
    'Opacity',
    'Morphing',
    'Recursion',
    'Physics',
    'Random',
    'Spherical harmonics',
]

### Remove Samples with Non-Relevant Tags

In [12]:
# Build the dataset: walk every category folder under dir_base, pair each
# .asy file with its optional .tag file, skip samples whose tags contain a
# DISCARD_TAGS entry, and write the remaining samples to jsonl_path as one
# JSON object per line.

# Counter for assigning unique IDs across all folders
global_id_counter = 1
samples = []

# os.listdir returns entries in arbitrary order; sorting keeps the assigned
# IDs (and the sample removed after the loop) stable from run to run.
for folder_name in sorted(os.listdir(dir_base)):
    folder_path = os.path.join(dir_base, folder_name)
    if not os.path.isdir(folder_path):
        continue

    # Category name is the text after the last '|' in category.txt. Fall back
    # to the folder name when the file is missing or yields an empty name.
    category = folder_name
    category_path = os.path.join(folder_path, 'category.txt')
    if os.path.exists(category_path):
        with open(category_path, 'r', encoding='utf-8') as f:
            category_line = f.read().strip()
        category = category_line.split('|')[-1].strip() or folder_name

    # Loop through .asy files in this category
    for file in sorted(os.listdir(folder_path)):

        if not file.endswith('.asy'):
            continue

        # Strip only the trailing extension: str.replace would also remove
        # '.asy' appearing earlier in the name and point at the wrong file.
        base_name = file[:-len('.asy')]
        asy_path = os.path.join(folder_path, file)
        tag_path = os.path.join(folder_path, base_name + '.tag')

        # Read .asy code; an unreadable file skips the sample, not the run
        try:
            with open(asy_path, 'r', encoding='utf-8') as f:
                asy_code = f.read().strip()
        except Exception as e:
            print('Skipping %s: %s'%(asy_path, e))
            continue

        # Read tags (flattened as string). A tag file that exists but cannot
        # be read skips the sample: without its tags the discard filter
        # below cannot be applied to it.
        tags = ''
        if os.path.exists(tag_path):
            try:
                with open(tag_path, 'r', encoding='utf-8') as f:
                    tags = f.read().strip().replace('\n', ' ')
            except Exception as e:
                print('Skipping %s: %s'%(tag_path, e))
                continue

        # Filter out discarded tags (case-sensitive substring match on the
        # flattened tag text)
        if any(discard_tag in tags for discard_tag in DISCARD_TAGS):
            continue

        # Assign new unique ID (zero-padded to four digits)
        assigned_id = str(global_id_counter).zfill(4)
        global_id_counter += 1

        # Compose sample
        samples.append({
            'id': assigned_id,
            'filename': base_name,
            'category': category,
            'tags': tags,
            'asy_code': asy_code
        })

# NOTE(review): this drops the first collected sample, so the saved IDs start
# at 0002. The reason is not visible in this notebook -- confirm it is
# intentional, and if so select the sample explicitly (e.g. by filename).
samples = samples[1:]

# Save all samples to .jsonl
with open(jsonl_path, 'w', encoding='utf-8') as f:
    for row in samples:
        json.dump(row, f)
        f.write('\n')

print('Saved %i samples to %s'%(len(samples), jsonl_path))


Saved 424 samples to data\asymptote_dataset_phase1.jsonl
