Install Dependencies

In [None]:
# Install the Asymptote package with apt; the rendering cells later in this
# notebook invoke its `asy` command through subprocess.
!apt-get install -y asymptote

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  asymptote-doc dvisvgm fonts-droid-fallback fonts-lato fonts-lmodern
  fonts-noto-mono fonts-urw-base35 freeglut3 ghostscript gsfonts imagemagick
  imagemagick-6-common imagemagick-6.q16 info install-info
  libalgorithm-c3-perl libauthen-sasl-perl libb-hooks-endofscope-perl
  libb-hooks-op-check-perl libclass-c3-perl libclass-c3-xs-perl
  libclass-data-inheritable-perl libclass-method-modifiers-perl
  libclass-xsaccessor-perl libcommon-sense-perl libdata-optlist-perl
  libdevel-callchecker-perl libdevel-caller-perl
  libdevel-globaldestruction-perl libdevel-lexalias-perl
  libdevel-stacktrace-perl libdist-checkconflicts-perl libdjvulibre-text
  libdjvulibre21 libdynaloader-functions-perl libemail-date-format-perl
  libeval-closure-perl libexception-class-perl libfftw3-double3
  libfile-homedir-perl libfile-which-perl libgs9 libgs9-common

In [None]:
import os
import json
import subprocess

In [None]:
# Mount Google Drive at /content/drive; every dataset path used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Directories

In [None]:
# Root folder of the project data on the mounted Drive.
dir_data = '/content/drive/MyDrive/asymptote_model/data'

# Sub-folders: Asymptote sources in, rendered PNGs out.
dir_asy = os.path.join(dir_data, 'asy_files')
dir_images = os.path.join(dir_data, 'images')

# Phase-1 dataset (one JSON object per line) that the rendering cells read.
jsonl_path = os.path.join(dir_data, 'asymptote_dataset_phase1_augmented.jsonl')

Render Asymptote Diagrams

In [None]:
# Load dataset: one JSON object per line. Blank lines are skipped so a stray
# empty line does not abort the load with a JSONDecodeError.
with open(jsonl_path, 'r', encoding='utf-8') as f:
    samples = [json.loads(line) for line in f if line.strip()]

# Make sure the output folder exists before asy is asked to write into it.
os.makedirs(dir_images, exist_ok=True)

# Log file (relative path: lands in the notebook's current working directory).
log_path = 'render_log.txt'

# Render each sample's .asy file to .png. The .asy sources are read from
# dir_asy; this cell does not create them.
# The log is opened with a context manager so it is flushed and closed even if
# the loop is interrupted (e.g. KeyboardInterrupt) or raises.
with open(log_path, 'w', encoding='utf-8') as log_file:
    for i, sample in enumerate(samples, 1):
        asy_id = sample['id']
        asy_path = os.path.join(dir_asy, '%s.asy' % asy_id)
        png_path = os.path.join(dir_images, '%s.png' % asy_id)

        print('[%d/%d] Rendering %s...' % (i, len(samples), asy_path))

        try:
            subprocess.run(
                ['asy', '-tex', 'pdflatex', '-f', 'png', '-o', png_path, asy_path],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                # Bound each render so one stuck diagram cannot hang the whole run.
                timeout=30
            )
        except subprocess.CalledProcessError as e:
            sample['image_path'] = 'FAILED'
            log_file.write('FAILED (error): %s\n' % asy_path)
            # errors='replace': stderr may contain non-UTF-8 bytes; never let
            # logging a failure raise a UnicodeDecodeError of its own.
            log_file.write(e.stderr.decode('utf-8', errors='replace') + '\n')
        except subprocess.TimeoutExpired:
            sample['image_path'] = 'FAILED'
            log_file.write('TIMEOUT: %s\n' % asy_path)
        else:
            # A zero exit status is not trusted on its own: confirm the file exists.
            if os.path.exists(png_path):
                sample['image_path'] = png_path
                log_file.write('SUCCESS: %s\n' % asy_path)
            else:
                sample['image_path'] = 'FAILED'
                log_file.write('FAILED (no png): %s\n' % asy_path)

# Save updated dataset (every sample now carries an 'image_path' field).
updated_jsonl_path = os.path.join(dir_data, 'asymptote_dataset_phase2_rendered.jsonl')
with open(updated_jsonl_path, 'w', encoding='utf-8') as f:
    for sample in samples:
        json.dump(sample, f)
        f.write('\n')

print('Rendering complete. Updated JSONL saved to: %s' % updated_jsonl_path)
print('Log written to: %s' % log_path)

[1/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0002.asy...
[2/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0003.asy...
[3/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0004.asy...
[4/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0005.asy...
[5/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0006.asy...
[6/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0007.asy...
[7/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0008.asy...
[8/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0009.asy...
[9/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0010.asy...
[10/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0011.asy...
[11/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0012.asy...
[12/2120] Rendering /content/drive/MyDrive/asymptote_model/data

KeyboardInterrupt: 

In [None]:
# Directories (same values as the earlier "Directories" cell).
dir_data = '/content/drive/MyDrive/asymptote_model/data'
jsonl_path = os.path.join(dir_data, 'asymptote_dataset_phase1_augmented.jsonl')
dir_images = os.path.join(dir_data, "images")
dir_asy = os.path.join(dir_data, "asy_files")

# Load dataset: one JSON object per line. Blank lines are skipped so a stray
# empty line does not abort the load with a JSONDecodeError.
with open(jsonl_path, 'r', encoding='utf-8') as f:
    samples = [json.loads(line) for line in f if line.strip()]

# Log file
log_path = os.path.join(dir_data, 'render_log.txt')

# Snapshot of the PNGs already on disk so a resumed run only renders what is
# missing. The folder is created first so listdir cannot fail on a fresh Drive.
os.makedirs(dir_images, exist_ok=True)
already_rendered = set(os.listdir(dir_images))

# Render each not-yet-rendered .asy file to .png. The .asy sources are read
# from dir_asy; this cell does not create them.
# The log is opened with a context manager so it is flushed and closed even if
# the loop is interrupted or raises.
with open(log_path, 'w', encoding='utf-8') as log_file:
    for i, sample in enumerate(samples, 1):
        asy_id = sample['id']
        png_name = '%s.png' % asy_id
        asy_path = os.path.join(dir_asy, '%s.asy' % asy_id)
        png_path = os.path.join(dir_images, png_name)

        if png_name in already_rendered:
            # Still record the path: otherwise samples rendered by a previous
            # run are written to the output JSONL without any 'image_path'.
            sample['image_path'] = png_path
            continue

        print('[%d/%d] Rendering %s...' % (i, len(samples), asy_path))

        try:
            subprocess.run(
                ['asy', '-tex', 'pdflatex', '-f', 'png', '-o', png_path, asy_path],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                # Bound each render so one stuck diagram cannot hang the whole run.
                timeout=30
            )
        except subprocess.CalledProcessError as e:
            sample['image_path'] = 'FAILED'
            log_file.write('FAILED (error): %s\n' % asy_path)
            # errors='replace': stderr may contain non-UTF-8 bytes; never let
            # logging a failure raise a UnicodeDecodeError of its own.
            log_file.write(e.stderr.decode('utf-8', errors='replace') + '\n')
        except subprocess.TimeoutExpired:
            sample['image_path'] = 'FAILED'
            log_file.write('TIMEOUT: %s\n' % asy_path)
        else:
            # A zero exit status is not trusted on its own: confirm the file exists.
            if os.path.exists(png_path):
                sample['image_path'] = png_path
                log_file.write('SUCCESS: %s\n' % asy_path)
            else:
                sample['image_path'] = 'FAILED'
                log_file.write('FAILED (no png): %s\n' % asy_path)

# Save updated dataset (every sample now carries an 'image_path' field).
updated_jsonl_path = os.path.join(dir_data, 'asymptote_dataset_phase2_rendered.jsonl')
with open(updated_jsonl_path, 'w', encoding='utf-8') as f:
    for sample in samples:
        json.dump(sample, f)
        f.write('\n')

print('Rendering complete. Updated JSONL saved to: %s' % updated_jsonl_path)
print('Log written to: %s' % log_path)

[74/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0075.asy...
[110/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0111.asy...
[161/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0162.asy...
[162/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0163.asy...
[212/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0213.asy...
[213/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0214.asy...
[323/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0324.asy...
[324/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0325.asy...
[325/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0326.asy...
[336/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0337.asy...
[348/2120] Rendering /content/drive/MyDrive/asymptote_model/data/asy_files/0349.asy...
[350/2120] Rendering /content/drive/MyDrive/

Add Image Paths to Dataset

In [None]:
import os
import json

# Locations on the mounted Drive: phase-1 input, rendered PNG folder, phase-2 output.
dir_data = '/content/drive/MyDrive/asymptote_model/data'
dir_images = os.path.join(dir_data, 'images')
jsonl_path = os.path.join(dir_data, 'asymptote_dataset_phase1_augmented.jsonl')
output_jsonl = os.path.join(dir_data, 'asymptote_dataset_phase2_rendered.jsonl')

# Parse the phase-1 JSONL, one record per line.
samples = []
with open(jsonl_path, 'r', encoding='utf-8') as src:
    for raw_line in src:
        samples.append(json.loads(raw_line.strip()))

# Tag every record with a path relative to dir_data when its PNG is on disk,
# or with the 'FAILED' marker when it is not.
for record in samples:
    png_filename = '%s.png' % record['id']
    is_rendered = os.path.exists(os.path.join(dir_images, png_filename))
    record['image_path'] = ('images/%s' % png_filename) if is_rendered else 'FAILED'

# Write the tagged records back out as JSONL.
with open(output_jsonl, 'w', encoding='utf-8') as dst:
    for record in samples:
        dst.write(json.dumps(record))
        dst.write('\n')

print('Updated %d samples with image paths.' % len(samples))
print('Output saved to: %s' % output_jsonl)


Updated 2120 samples with image paths.
Output saved to: /content/drive/MyDrive/asymptote_model/data/asymptote_dataset_phase2_rendered.jsonl


Remove Samples with no Rendered PNG

In [None]:
# Input: phase-2 dataset with an 'image_path' per sample. Output: phase-3 dataset
# restricted to samples whose PNG is actually present in dir_images.
updated_jsonl_path = os.path.join(dir_data, 'asymptote_dataset_phase2_rendered.jsonl')
final_jsonl_path = os.path.join(dir_data, 'asymptote_dataset_phase3.jsonl')

# Read every record of the phase-2 file.
with open(updated_jsonl_path, 'r', encoding='utf-8') as src:
    samples = [json.loads(row) for row in src]

# Keep a record only if its image_path names a .png AND <id>.png exists on disk.
valid_samples = []
for entry in samples:
    entry_id = entry['id']
    if not entry.get('image_path', '').endswith('.png'):
        continue  # marked as failed (or never rendered)
    if not os.path.exists(os.path.join(dir_images, '%s.png' % entry_id)):
        continue  # path recorded, but the file is gone
    valid_samples.append(entry)

# Persist the surviving records as JSONL.
with open(final_jsonl_path, 'w', encoding='utf-8') as dst:
    for entry in valid_samples:
        dst.write(json.dumps(entry))
        dst.write('\n')

print('Filtered dataset saved: %i valid samples written to %s' % (len(valid_samples), final_jsonl_path))

Filtered dataset saved: 1727 valid samples written to /content/drive/MyDrive/asymptote_model/data/asymptote_dataset_phase3.jsonl
