In [50]:
import os
import json

# Directories
dir_data = 'data'
input_jsonl = os.path.join(dir_data, 'asymptote_dataset_phase1.jsonl')
output_jsonl = os.path.join(dir_data, 'asymptote_dataset_mini.jsonl')

# Load full dataset
with open(input_jsonl, 'r', encoding='utf-8') as f:
    samples = [json.loads(line.strip()) for line in f]

# Take first 20 samples
samples_mini = samples[1:21]

# Add image path field
for sample in samples_mini:
    sample['image_path'] = os.path.join(dir_data, 'images', '%s.png' % sample['id'])
    
# Generate matching descriptions
descriptions = [
    "Draw a non-crossed polygon and its reverse, with directional arrows showing both clockwise and counterclockwise orientation.",
    "Draw a unit circle path in both clockwise and counterclockwise orientation using arrows.",
    "Plot several closed curved paths centered at different points, showing clockwise and counterclockwise directions with arrows.",
    "Draw a randomly generated closed path with clockwise and counterclockwise orientations, visualized with arrows.",
    "Draw two horizontal line segments: one from (0,0) to (2,0), and one from (4,0) to (4,2), using 2cm units.",
    "Draw two horizontal line segments: one from (0,0) to (2,0), and one from (4,0) to (4,2), with figure size 5cm.",
    "Draw the same horizontal line segments as before, but increase figure width to 10cm.",
    "Draw two short horizontal line segments with figure size 5cm by 3cm.",
    "Repeat the line segment drawing with size 10cm by 2.5cm.",
    "Draw the same horizontal segments with non-fixed scaling of size 10cm by 2.5cm.",
    "Change scaling to 3cm width and 10cm height while drawing the segments.",
    "Use physical units (cm) to define points and draw segments using coordinates like (3cm,0) and (6cm,4cm).",
    "Draw segments using unit size 1cm and integer coordinates.",
    "Use asymmetric scaling (2cm for x, 1cm for y) to draw segments with integer coordinates.",
    "Draw a large circle of radius 4 with two dots at the center and circumference, using fixed scaling and light grey fill.",
    "Draw a horizontal segment using a picture frame object, and shift the drawing 40 units down.",
    "Repeat the same using a picture object instead of frame, and draw the same scaled segment twice vertically stacked.",
    "Draw an open triangle using the points (0,0), (1,0), and (0,1).",
    "Draw a closed triangle using the same three points: (0,0), (1,0), and (0,1).",
    "Draw a square using an array of points, then draw both diagonals to complete the cross."
]

# Add descriptions to samples
for i, sample in enumerate(samples_mini):
    sample['description'] = descriptions[i]

# Save to new file
with open(output_jsonl, 'w', encoding='utf-8') as f:
    for row in samples_mini:
        json.dump(row, f)
        f.write('\n')

print('Saved mini dataset to:', output_jsonl)

Saved mini dataset to: data\asymptote_dataset_mini.jsonl


In [51]:
output_jsonl_hf = os.path.join(dir_data, 'hf_finetune_mini.jsonl')

with open(output_jsonl_hf, 'w', encoding='utf-8') as f:
    for sample in samples_mini:
        prompt = sample['description'].strip()
        code = sample['asy_code'].strip()
        json.dump({'prompt': prompt, 'completion': code}, f)
        f.write('\n')
        
print('Saved hugging face formatted mini dataset to:', output_jsonl)

Saved hugging face formatted mini dataset to: data\asymptote_dataset_mini.jsonl
