In [6]:
import os
import json
import re
import random

# Directories and paths
dir_data = 'data'
input_jsonl = os.path.join(dir_data, 'asymptote_dataset_phase1.jsonl')
output_jsonl = os.path.join(dir_data, 'asymptote_dataset_phase1_augmented.jsonl')

with open(input_jsonl, "r", encoding="utf-8") as f:
    original_samples = [json.loads(line.strip()) for line in f]

# Existing ID tracking
existing_ids = [int(sample["id"]) for sample in original_samples]
id_counter = max(existing_ids) + 1
augmented_samples = original_samples.copy()

# Updated parameter ranges for more variety
def random_radius(): return random.randint(1, 20)
def random_angle(): return random.choice([15, 30, 45, 60, 75, 90, 120, 135])
def random_tick(): return random.choice([0.25, 0.5, 1, 2, 5])
def random_unitsize(): return random.choice(["%scm" % random.randint(1, 4) for _ in range(5)])
def random_pen(): return round(random.uniform(0.2, 3.0), 2)

def build_variant(base_sample, new_code):
    global id_counter
    new_sample = base_sample.copy()
    new_sample["id"] = str(id_counter).zfill(4)
    new_sample["asy_code"] = new_code
    id_counter += 1
    return new_sample

def generate_variants(sample, n_variants=3):
    code = sample["asy_code"]
    variants = []

    for _ in range(n_variants):
        new_code = code

        if re.search(r'\bradius\b|\br\s*=', new_code):
            r = random_radius()
            new_code = re.sub(r'(\bradius\s*=\s*)[\d.]+', lambda m: m.group(1) + str(r), new_code)
            new_code = re.sub(r'(real\s+r\s*=\s*)[\d.]+', lambda m: m.group(1) + str(r), new_code)
            new_code = re.sub(r'(\br\s*=\s*)[\d.]+', lambda m: m.group(1) + str(r), new_code)

        if 'rotate(' in new_code or 'arc' in new_code:
            angle = random_angle()
            new_code = re.sub(r'rotate\s*\(\s*\d+(\.\d+)?\s*\)', f'rotate({angle})', new_code)
            new_code = re.sub(r'arc\(([^,]+),\s*([^,]+),\s*\d+,\s*\d+\)',
                              fr'arc(\1, \2, 0, {angle})', new_code)

        if 'xaxis' in new_code:
            step = random_tick()
            new_code = re.sub(r'xaxis\(([^)]*?)Ticks\(Step\s*=\s*\d+(\.\d+)?',
                              fr'xaxis(\1Ticks(Step={step}', new_code)

        if 'yaxis' in new_code:
            step = random_tick()
            new_code = re.sub(r'yaxis\(([^)]*?)Ticks\(Step\s*=\s*\d+(\.\d+)?',
                              fr'yaxis(\1Ticks(Step={step}', new_code)

        if 'unitsize' in new_code:
            size = random.choice(["1cm", "2cm", "3cm", "4cm"])
            new_code = re.sub(r'unitsize\s*\([^)]*\)', f'unitsize({size})', new_code)

        if 'draw(' in new_code and 'linewidth' in new_code:
            width = random_pen()
            new_code = re.sub(r'linewidth\s*\(\s*\d+(\.\d+)?\s*\)', f'linewidth({width})', new_code)

        variants.append(build_variant(sample, new_code))

    return variants

# Apply augmentation
for sample in original_samples:
    augmented_samples.extend(generate_variants(sample, n_variants=4))  # increase per-sample variants

# Save
with open(output_jsonl, "w", encoding="utf-8") as f:
    for sample in augmented_samples:
        json.dump(sample, f)
        f.write("\n")

print("Saved %d samples to %s" % (len(augmented_samples), output_jsonl))

Saved 2120 samples to data\asymptote_dataset_phase1_augmented.jsonl
