In [None]:
import os
import json
import random
from tqdm import tqdm
from google.colab import drive

# Mount Drive
# The input/output paths below live under the mounted Drive folder.
drive.mount('/content/drive')

# Paths & Config
input_path = "/content/drive/MyDrive/asymptote_model/data/asymptote_dataset_final.jsonl"
output_dir = "/content/drive/MyDrive/asymptote_model/data/data_splits"
# (train, val, test) fractions. Only the first two are read when splitting;
# the test split receives whatever remains after train and val are taken.
split_ratio = (0.8, 0.05, 0.15)
# Fixed seed so the shuffle (and therefore the split) is reproducible.
random.seed(42)

# exist_ok=True creates the directory when missing and is a no-op when it
# already exists, avoiding the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Load Original Data
# Each non-empty line of the JSONL file is parsed as one JSON record.
# Whitespace-only lines (e.g. a trailing blank line at end of file) are
# skipped, because json.loads would raise JSONDecodeError on them.
with open(input_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Randomize and Split
# Shuffle in place, then cut the list into three contiguous slices.
random.shuffle(data)
n = len(data)

# Train/val sizes are truncated to whole samples; test takes the remainder,
# so the three sizes always add up to n.
n_train, n_val = (int(n * fraction) for fraction in split_ratio[:2])
n_test = n - (n_train + n_val)

val_end = n_train + n_val
train_data, val_data, test_data = (
    data[:n_train],
    data[n_train:val_end],
    data[val_end:],
)

# Write Split Files (no formatting)
# Produces one <name>.jsonl file per split, with one JSON object per line.
splits = {"train": train_data, "val": val_data, "test": test_data}
for split_name, records in splits.items():
    out_path = os.path.join(output_dir, f"{split_name}.jsonl")
    with open(out_path, "w", encoding="utf-8") as out_file:
        for record in tqdm(records, desc=f"Writing {split_name} set"):
            # Serialize the record and its line terminator in a single write.
            out_file.write(json.dumps(record) + "\n")

# Report where the files went and how many samples landed in each split.
print(
    f"\nDataset split complete. Unformatted files saved in: {output_dir}",
    f" - Train: {len(train_data)}",
    f" - Val:   {len(val_data)}",
    f" - Test:  {len(test_data)}",
    sep="\n",
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Writing train set: 100%|██████████| 1381/1381 [00:00<00:00, 26168.33it/s]
Writing val set: 100%|██████████| 86/86 [00:00<00:00, 37441.37it/s]
Writing test set: 100%|██████████| 260/260 [00:00<00:00, 45332.52it/s]


Dataset split complete. Unformatted files saved in: /content/drive/MyDrive/asymptote_model/data/data_splits
 - Train: 1381
 - Val:   86
 - Test:  260



