Install Dependencies

In [None]:
!pip install -q transformers==4.47.0 datasets peft accelerate bitsandbytes fsspec==2025.3.2 scikit-image
!pip install -U datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m110.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m107.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the data and output paths used by the
# later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
import os
import torch
from datasets import (
    load_dataset,
    Dataset,
    DatasetDict
)
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorForLanguageModeling
)
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType

In [None]:
# Inputs: the train/validation JSONL splits stored on the mounted Drive.
dir_data = "/content/drive/MyDrive/asymptote_model/data/data_splits"
path_train, path_val = (
    os.path.join(dir_data, file_name) for file_name in ("train.jsonl", "val.jsonl")
)

# Outputs: where the trained adapter/checkpoints and the training logs are written.
dir_output = "/content/drive/MyDrive/asymptote_model/phi3_lora_trained"
dir_logs = "/content/drive/MyDrive/asymptote_model/logs"

# Make sure both output directories exist; already-existing ones are left as they are.
for _out_dir in (dir_output, dir_logs):
    os.makedirs(_out_dir, exist_ok=True)

Load Model and Tokenizer

In [None]:
# Base checkpoint to fine-tune.
model_id = "microsoft/phi-3-mini-4k-instruct"

# Reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Load the base weights in 4-bit. The compute dtype is bfloat16 so it agrees with the
# bf16=True setting in TrainingArguments (float16 here would mix two half-precision formats).
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="eager"
)
# PEFT helper that readies the quantized model for adapter training.
base_model = prepare_model_for_kbit_training(base_model)

# Phi-3 fuses its projections: attention exposes a single `qkv_proj` and the MLP a single
# `gate_up_proj`. Separate q_proj/k_proj/v_proj/gate_proj/up_proj modules do not exist in
# this architecture, so targeting them would silently restrict LoRA to o_proj and down_proj.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["qkv_proj", "o_proj", "gate_up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
# Wrap the quantized base model with the LoRA adapters; `model` is what gets trained and saved.
model = get_peft_model(base_model, lora_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Preprocess/Tokenize Datasets

In [None]:
# Helper: Load .jsonl into list of dicts
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines (e.g. a trailing empty line) are skipped
        # instead of being handed to json.loads, which would raise on them.
        return [json.loads(line) for line in f if line.strip()]

# Read both JSONL splits into one DatasetDict; rows are still unformatted at this point
raw_datasets = DatasetDict({
    split_name: Dataset.from_list(load_jsonl(split_path))
    for split_name, split_path in (("train", path_train), ("validation", path_val))
})

# Render one record as a Phi-3 style chat transcript
def format_example(example):
    """Build the training text for a single record.

    Args:
        example: Mapping with a ``description`` (the user request) and an
            ``asy_code`` (the expected assistant reply) entry.

    Returns:
        A dict with one key, ``"text"``, holding the system, user and assistant
        turns concatenated in that order.
    """
    system_turn = "<|system|>\nYou are a helpful assistant that writes Asymptote code.\n<|end|>\n"
    user_turn = f"<|user|>\n{example['description']}\n<|end|>\n"
    assistant_turn = f"<|assistant|>\n{example['asy_code']}<|end|>"
    return {"text": system_turn + user_turn + assistant_turn}

# Apply format_example to every row of both splits; its returned "text" entry
# becomes a column of the resulting datasets.
formatted_datasets = raw_datasets.map(format_example)

# Tokenize only; padding and labels are produced per batch by the data collator
def tokenize(example):
    """Tokenize one formatted example, truncating it to at most 1024 tokens.

    No padding is applied here: padding every example to the full 1024 tokens
    makes each training step pay for the longest possible sequence, whereas the
    DataCollatorForLanguageModeling used for training pads each batch only to
    its own longest member. No ``labels`` entry is written either, because that
    collator derives the labels from ``input_ids`` itself (and a pre-built,
    unpadded ``labels`` list could not be batched).

    Args:
        example: Mapping with a ``"text"`` entry holding the formatted prompt.

    Returns:
        The tokenizer output for ``example["text"]``.
    """
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=1024
    )

# Tokenize both splits, dropping every pre-existing column so that only the
# fields returned by tokenize() remain.
original_columns = formatted_datasets["train"].column_names
tokenized_datasets = formatted_datasets.map(tokenize, remove_columns=original_columns)

# Pull the two splits out under the names the Trainer cell expects.
train_data, val_data = tokenized_datasets["train"], tokenized_datasets["validation"]

# Short summary of the split sizes.
print("Datasets ready for training:")
print(f" - Train: {len(train_data)} examples")
print(f" - Val:   {len(val_data)} examples")

Map:   0%|          | 0/1381 [00:00<?, ? examples/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Map:   0%|          | 0/1381 [00:00<?, ? examples/s]

Map:   0%|          | 0/86 [00:00<?, ? examples/s]

Datasets ready for training:
 - Train: 1381 examples
 - Val:   86 examples


Training Arguments

In [None]:
# Hyperparameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs go; no external experiment tracker
    output_dir=dir_output,
    logging_dir=dir_logs,
    report_to="none",
    # Optimisation schedule
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,  # effective batch size = 2 * 16 = 32
    # Numeric precision
    bf16=True,  # Use bf16 on A100 GPU
    fp16=False,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Log the training loss every 50 steps
    logging_strategy="steps",
    logging_steps=50,
    # Data loading
    remove_unused_columns=False,
    dataloader_num_workers=2,
)

Training

In [None]:
def compute_metrics(eval_preds):
    """Placeholder metrics hook: ignores ``eval_preds`` and reports no extra metrics.

    Args:
        eval_preds: Evaluation predictions; unused.

    Returns:
        An empty dict.
    """
    no_metrics = {}
    return no_metrics

# Causal-LM collator (mlm=False): batches the tokenized examples and builds the labels.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# compute_metrics is intentionally not handed to the Trainer: the hook returns an empty
# dict, yet merely supplying one makes every evaluation accumulate the full logits of the
# whole validation set. Without it the Trainer evaluates in loss-only mode and still
# reports the validation loss each epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator
)

# Run the fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,No log,1.360006
2,1.449900,1.194454
3,1.208700,1.081748
4,0.997800,1.001288


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=215, training_loss=1.1691691553869912, metrics={'train_runtime': 2799.7507, 'train_samples_per_second': 2.466, 'train_steps_per_second': 0.077, 'total_flos': 1.5525680787750912e+17, 'train_loss': 1.1691691553869912, 'epoch': 4.903039073806078})

Save Model and Tokenizer

In [None]:
# Persist the trained PEFT-wrapped model and the tokenizer files side by side in dir_output.
# The tokenizer call is left as the last expression so the written file paths are displayed.
model.save_pretrained(dir_output)
tokenizer.save_pretrained(dir_output)

('/content/drive/MyDrive/asymptote_model/phi3_lora_trained/tokenizer_config.json',
 '/content/drive/MyDrive/asymptote_model/phi3_lora_trained/special_tokens_map.json',
 '/content/drive/MyDrive/asymptote_model/phi3_lora_trained/tokenizer.model',
 '/content/drive/MyDrive/asymptote_model/phi3_lora_trained/added_tokens.json',
 '/content/drive/MyDrive/asymptote_model/phi3_lora_trained/tokenizer.json')