Install Dependencies

In [None]:
!pip install -q openai tqdm backoff

import os, json, openai
from tqdm import tqdm
import backoff
import time

Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by this notebook
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


OpenAI Client and Directories

In [None]:
from getpass import getpass

from openai import OpenAI

# Never paste the API key into the notebook: it would be saved with the file and
# leak when the notebook is shared. Read it from the OPENAI_API_KEY environment
# variable, and fall back to an interactive prompt whose input is not echoed.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: "))

# Chat model used for every description request.
model = "gpt-4o"

In [None]:
# --- Run configuration -------------------------------------------------------
# Index used when reporting which range of samples a run covers.
start_index = 0
# Pause, in seconds, between consecutive description requests.
sleep_time = 1.2

# JSONL file the samples are read from.
input_jsonl = "/content/drive/MyDrive/asymptote_model/data/asymptote_dataset_phase3.jsonl"
# JSONL file the samples (with their descriptions) are appended to.
output_jsonl = "/content/drive/MyDrive/asymptote_model/data/asymptote_dataset_phase3_descriptions.jsonl"

Prompt Template

In [None]:
def build_prompt(asy_code):
    """Return the user prompt asking for a description of what `asy_code` renders.

    The prompt consists of three instruction lines, a blank line, the Asymptote
    code under an "Asymptote code:" heading, and a trailing "Description:" cue.
    """
    instruction_lines = [
        "You are given a piece of Asymptote code used for generating mathematical diagrams.",
        "Your task is to write a detailed and clear math-style instruction or problem prompt "
        "that accurately matches what this code would visually render.",
        "Be specific: mention exact coordinates, object types, angles, or visual styles.",
    ]
    preamble = "\n".join(instruction_lines)
    return f"{preamble}\n\nAsymptote code:\n{asy_code}\n\nDescription:"

Generate Description

In [None]:
# Retry only transient failures (rate limits, connection problems/timeouts and
# 5xx responses) with exponential backoff. A single decorator is used on purpose:
# RateLimitError is a subclass of OpenAIError, so stacking one decorator per
# exception type nests the retry loops and multiplies the number of attempts,
# and a catch-all on OpenAIError also retries errors that can never succeed
# (e.g. an invalid key or a malformed request).
@backoff.on_exception(
    backoff.expo,
    (openai.RateLimitError, openai.APIConnectionError, openai.InternalServerError),
    max_tries=5,
)
def generate_description(asy_code):
    """Ask the chat model for a math-style description of a piece of Asymptote code.

    Args:
        asy_code: Asymptote source code to describe.

    Returns:
        The text of the first completion choice, stripped of surrounding whitespace.

    Raises:
        openai.OpenAIError: if the request fails with a non-transient error, or a
            transient error persists after all retry attempts.
        ValueError: if the response carries no text content.
    """
    prompt = build_prompt(asy_code)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that writes detailed math-style descriptions from Asymptote code."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.5,
        max_tokens=200
    )

    content = response.choices[0].message.content
    if content is None:
        # Fail with a clear message instead of an AttributeError on None.strip().
        raise ValueError("The model response contained no text content")
    return content.strip()

Process samples

In [None]:
# Load every sample from the input JSONL file (one JSON object per line).
with open(input_jsonl, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a stray empty line at the end of the file); passing
    # an empty string to json.loads would raise JSONDecodeError.
    samples = [json.loads(line) for line in f if line.strip()]

In [None]:
# Process everything from start_index to the end of the dataset.
# (min(start_index, len(samples)) evaluated to 0 for start_index = 0, so the
# progress messages reported an empty range "0 to -1" while all samples ran.)
end_index = len(samples)
# Apply start_index to the data as well, so the reported range matches what is
# actually processed; with start_index = 0 this is the full sample list.
samples_to_process = samples[start_index:end_index]
print(f"Processing samples {start_index} to {end_index - 1}...")

In [None]:
output_lines = []
failed_count = 0

# Append each record to the output file as soon as it is ready (and flush), so
# an interruption part-way through a multi-hour run does not discard the work
# already done. The file is opened in append mode: re-running this cell adds
# the records again rather than replacing earlier ones.
with open(output_jsonl, "a", encoding="utf-8") as out_file:
    for i, sample in enumerate(tqdm(samples_to_process, desc="Generating Descriptions")):
        # Samples that already carry a description are passed through untouched.
        needs_description = "description" not in sample

        if needs_description:
            try:
                sample["description"] = generate_description(sample["asy_code"])
            except Exception as e:
                # Best effort: record a failure marker and keep going. Use .get so
                # a sample without an "id" cannot crash the error handler itself.
                sample_id = sample.get("id", f"<position {i}>")
                print(f"Error on ID {sample_id}: {e}")
                sample["description"] = "DESCRIPTION_GENERATION_FAILED"
                failed_count += 1

        output_lines.append(sample)
        out_file.write(json.dumps(sample) + "\n")
        out_file.flush()

        if needs_description:
            # Throttle only after a real API attempt.
            time.sleep(sleep_time)

print(f"Finished batch: {start_index}–{end_index - 1}. Output saved to {output_jsonl}")
if failed_count:
    print(f"{failed_count} sample(s) were marked DESCRIPTION_GENERATION_FAILED.")


Generating Descriptions: 100%|██████████| 1727/1727 [2:43:32<00:00,  5.68s/it]

Finished batch: 0–9. Output saved to /content/drive/MyDrive/asymptote_model/data/asymptote_dataset_phase3_descriptions.jsonl





Save Final Dataset


*   Parametrized
*   Rendered Image Paths
*   OpenAI Descriptions



In [None]:
# Re-read the generated descriptions from disk.
with open(output_jsonl, "r", encoding="utf-8") as f:
    # Skip blank lines; passing an empty string to json.loads would raise.
    samples = [json.loads(line) for line in f if line.strip()]

# NOTE(review): the first 11 records of the descriptions file are discarded here.
# Presumably they are leftovers from an earlier run appended to the same file --
# confirm against the current file contents before relying on this offset.
n_leading_records_to_drop = 11
samples = samples[n_leading_records_to_drop:]

output_jsonl_2 = "/content/drive/MyDrive/asymptote_model/data/asymptote_dataset_final.jsonl"

# Write the final dataset, overwriting any previous version of the file.
with open(output_jsonl_2, "w", encoding="utf-8") as f:
    for record in samples:
        f.write(json.dumps(record) + "\n")