<a href="https://colab.research.google.com/github/reinterpretcat/vrp-experiments/blob/master/Qwen3-4B-reVRP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [None]:
%%capture
import os
# The %%capture cell magic above must remain the first line of the cell; it hides the pip output.
# The session is treated as Colab when any environment variable name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    # Outside Colab: a plain install, letting pip resolve unsloth's dependencies.
    # NOTE(review): unpinned, so a fresh run may pull a different unsloth release than the one recorded in the outputs.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The packages are listed explicitly and installed with --no-deps
    # (presumably to leave Colab's preinstalled torch stack untouched — TODO confirm).
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

# Setup model

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options. These three names stay at module level because later cells
# (e.g. the trainer setup) read max_seq_length again.
max_seq_length = 10240  # Context length used for loading and, later, for training.
dtype = None            # None = auto detection; Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = False    # Set to True to use 4bit quantization and reduce memory usage.

# Collect the keyword arguments first so the load call itself stays a one-liner.
pretrained_options = {
    "model_name": "unsloth/Qwen3-4B-Base",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}

model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_options)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.8: Fast Qwen3 patching. Transformers: 4.53.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [None]:
lora_rank = 32  # Larger rank = smarter, but slower. Suggested values: 8, 16, 32, 64, 128.

# Attention projections plus the MLP projections receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_options = {
    "r": lora_rank,
    "target_modules": lora_target_modules,
    "lora_alpha": 16,
    "lora_dropout": 0,      # Any value is supported, but 0 is the optimized path.
    "bias": "none",         # Any value is supported, but "none" is the optimized path.
    # True or "unsloth"; "unsloth" is the variant recommended for very long context.
    "use_gradient_checkpointing": "unsloth",
    "random_state": 3407,
    "use_rslora": False,    # Rank-stabilized LoRA is available but not used here.
    "loftq_config": None,   # LoftQ is available but not used here.
}

# NOTE: this rebinds `model`, so re-running the cell wraps the already-wrapped model.
model = FastLanguageModel.get_peft_model(model, **lora_options)

Unsloth 2025.7.8 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


# Upload data

In [None]:
# --- 1. Upload data set (data.jsonl) ---
from google.colab import files

# files.upload() opens the browser file picker and returns a dict of the uploaded files.
uploaded = files.upload()
if not uploaded:
    # The dialog was cancelled: fail here instead of later inside load_dataset.
    raise ValueError("No file was uploaded; expected a JSONL data set such as data.jsonl.")

# Use the name the upload was actually stored under instead of assuming "data.jsonl".
# NOTE(review): assumes the dict keys are the stored file names (Colab may store a
# re-upload as e.g. "data (1).jsonl") — confirm against the google.colab version in use.
file_path = "data.jsonl" if "data.jsonl" in uploaded else next(iter(uploaded))

Saving data.jsonl to data.jsonl


In [None]:
# --- 2. Load data set ---
from datasets import load_dataset

# The "json" builder also reads .jsonl input, one record per line.
# Everything in the uploaded file goes into a single "train" split.
dataset = load_dataset("json", split="train", data_files=file_path)

# Bare last expression: display the dataset summary (features and row count).
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 620
})

In [None]:
# Show the first record to sanity-check the 'prompt' / 'completion' fields.
dataset[0]

{'prompt': 'Unassigned Job Reason: NO_REASON_FOUND\n\nExplain what is Unassigned Job Reason: NO_REASON_FOUND',
 'completion': '```markdown\nThe `NO_REASON_FOUND` code indicates that the solver could not identify a specific constraint violation as the primary cause for a job being unassigned. This can occur in complex, highly constrained problems where multiple factors subtly contribute to infeasibility, or when a job is simply "too expensive" to serve given the overall optimization objectives without a clear single culprit.\n\n**Possible action:** unknown\n\n**How to improve problem formulation:**\nWhen `NO_REASON_FOUND` appears, it often suggests a fundamental mismatch between the problem\'s scale/complexity and the available fleet\'s capabilities, or very aggressive objectives.\n1.  **Relax Objectives:** If you have strict objectives like `minimize-tours` or `minimize-arrival-time`, consider adding `minimize-unassigned` with a higher priority or a penalty to ensure all jobs are attem

## Prompt template

In [None]:
# Single template shared by training and inference.
# Placeholders: {instruction_content} is the user question, {response_content} is the answer
# (filled with the completion for training, left empty when prompting for generation).
prompt_template = """You are an expert in Vehicle Routing Problems (VRP) and the `reinterpretcat` VRP solver.
Your mission is to provide concise, accurate, and direct answers to user queries by understanding the complete context of the problem.
Strive for clarity, correctness, and adherence to the requested format in all your responses.
Below is a task related to Vehicle Routing Problems. Write a response that appropriately completes the request.

### Instruction:
{instruction_content}

### Response:
{response_content}"""

# End-of-sequence marker of the loaded tokenizer; it must be appended to every training text.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(examples):
    """Render a batch of prompt/completion rows into training texts.

    Args:
        examples: Batched mapping with parallel "prompt" and "completion" lists.

    Returns:
        A dict with one key, "text", holding one rendered string per input row:
        the filled-in prompt template followed by EOS_TOKEN.
    """
    rendered = [
        prompt_template.format(instruction_content=question, response_content=answer) + EOS_TOKEN
        for question, answer in zip(examples["prompt"], examples["completion"])
    ]
    return {"text": rendered}


# Adds the "text" column to the dataset; batched=True hands the function lists of rows.
dataset = dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/620 [00:00<?, ? examples/s]

# Config

In [None]:
# Build the supervised fine-tuning trainer over the rendered "text" column.
from trl import SFTConfig, SFTTrainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",  # column produced by formatting_prompts_func
    max_seq_length = max_seq_length,
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        # compute loss on both the prompt and the completion while still using a prompt-completion dataset,
        # this is equivalent to converting the dataset to a language modeling format.
        completion_only_loss=False,
        # Effective batch size per optimizer step on one GPU: 2 x 4 = 8.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # NOTE(review): max_steps below is also set and the run log reports "Total steps = 60";
        # transformers documents max_steps as overriding num_train_epochs, so this value is
        # not what bounds the run. Remove max_steps to train for exactly one full epoch.
        num_train_epochs = 1,
        max_steps = 60,
        learning_rate = 2e-4,
        logging_steps = 1,  # log the training loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/620 [00:00<?, ? examples/s]

## Train

In [None]:
# Run the fine-tuning loop; the returned value is kept in trainer_stats for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 620 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 66,060,288 of 4,088,528,384 (1.62% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.1924
2,1.6147
3,1.3673
4,1.518
5,1.8452
6,1.4135
7,1.1245
8,1.1189
9,1.0585
10,1.3836


# Inference

In [None]:
from transformers import TextStreamer

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Render the training template with an empty response so the model writes the answer.
inference_question = "I'm getting unassigned jobs with CAPACITY_CONSTRAINT code. How I can fix that?"
inference_prompt = prompt_template.format(
    instruction_content = inference_question,
    response_content = "",  # leave this blank for generation!
)
inputs = tokenizer([inference_prompt], return_tensors = "pt").to("cuda")

# Non-streaming alternative:
#   outputs = model.generate(**inputs, max_new_tokens = 500, use_cache = True)
#   tokenizer.batch_decode(outputs)

# Streaming: tokens are printed through the streamer while they are generated.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 500)

You are an expert in Vehicle Routing Problems (VRP) and the `reinterpretcat` VRP solver.
Your mission is to provide concise, accurate, and direct answers to user queries by understanding the complete context of the problem.
Strive for clarity, correctness, and adherence to the requested format in all your responses.
Below is a task related to Vehicle Routing Problems. Write a response that appropriately completes the request.

### Instruction:
I'm getting unassigned jobs with CAPACITY_CONSTRAINT code. How I can fix that?

### Response:
```markdown
The `CAPACITY_CONSTRAINT` code indicates that a job could not be assigned because the vehicle's capacity was exceeded at some point during its route. This typically happens when the vehicle's load exceeds its maximum capacity at any point, even if it's not the final job. To fix this:
1.  **Increase Vehicle Capacity**: If possible, increase the `capacity` of the vehicle to handle more demand.
2.  **Reduce Job Demand**: Decrease the `demand` of

# Saving

Before saving, report memory usage and release the training-only objects that are no longer needed.

In [None]:
import psutil

def mem_stat():
  """Print a snapshot of GPU and system RAM usage.

  All figures are bytes divided by 1024**3, rounded to three decimals and
  labelled "GB". The GPU section is skipped when CUDA is not available.

  The GPU line reports the memory *currently* reserved by PyTorch, so two
  calls made before and after a cleanup can actually differ. The peak value
  is printed separately because it never decreases during a session.
  """
  bytes_per_gb = 1024 ** 3

  # GPU memory statistics
  if torch.cuda.is_available():
    gpu_stats = torch.cuda.get_device_properties(0)
    max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
    reserved_now = round(torch.cuda.memory_reserved() / bytes_per_gb, 3)
    reserved_peak = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{reserved_now} GB of GPU memory reserved.")
    print(f"{reserved_peak} GB peak GPU memory reserved.")
  else:
    print("GPU = not available.")

  # RAM memory statistics
  ram_stats = psutil.virtual_memory()
  total_ram = round(ram_stats.total / bytes_per_gb, 3)
  available_ram = round(ram_stats.available / bytes_per_gb, 3)
  used_ram = round(ram_stats.used / bytes_per_gb, 3)
  print(f"RAM: Total = {total_ram} GB, Used = {used_ram} GB, Available = {available_ram} GB")

In [None]:
# Baseline memory report, taken before the training objects are released.
print("Before cleanup:")
mem_stat()

Before cleanup:
GPU = Tesla T4. Max memory = 14.741 GB.
10.428 GB of GPU memory reserved.
RAM: Total = 12.674 GB, Used = 6.945 GB, Available = 3.568 GB


In [None]:
import torch
import gc

# Drop the training-only objects. pop(..., None) is used instead of `del` so the
# cell can be re-run: `del` raises NameError once a name is gone, which aborts
# the cell before the garbage collection and cache clearing below ever run.
globals().pop("trainer", None)
globals().pop("dataset", None)

# Run garbage collection so the dropped objects are actually released
gc.collect()

# Return cached, unused GPU blocks to the driver
if torch.cuda.is_available():
    torch.cuda.empty_cache()


NameError: name 'dataset' is not defined

In [None]:
# Second memory report, for comparison with the "Before cleanup" figures.
print("\n\nAfter cleanup:")
mem_stat()



After cleanup:
GPU = Tesla T4. Max memory = 14.741 GB.
10.428 GB of GPU memory reserved.
RAM: Total = 12.674 GB, Used = 7.026 GB, Available = 3.485 GB


## Save locally

In [None]:
import os

# save_locally: write the model to the Colab disk (True) or push the merged model to the Hub (False).
# save_full:    when saving locally, write the merged 16-bit model (True) or only the LoRA adapters (False).
save_locally = True
save_full = True

if save_locally:
  if save_full:
    # Merge to 16bit
    model.save_pretrained_merged("Qwen3-4B-reVRP",
                                tokenizer,
                                save_method = "merged_16bit",
                                maximum_memory_usage = 0.2,   # avoid OOM
                                safe_serialization=None)      # save as safe tensors, not bin
  else:
    # Adapters only. Previously this flag combination saved nothing at all.
    model.save_pretrained("Qwen3-4B-reVRP-lora")
    tokenizer.save_pretrained("Qwen3-4B-reVRP-lora")
else:
  # Read the Hub token from the environment instead of hard-coding it in the notebook.
  # NOTE(review): when HF_TOKEN is unset this passes None; presumably the library then
  # falls back to its own stored credentials — confirm for the installed unsloth version.
  model.push_to_hub_merged("reinterpretcat/Qwen3-4B-reVRP",
                           tokenizer,
                           save_method = "merged_16bit",
                           maximum_memory_usage = 0.2,
                           safe_serialization=None,
                           token = os.environ.get("HF_TOKEN"))


Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Successfully copied all 2 files from cache to Qwen3-4B-reVRP.
Downloading safetensors index for unsloth/Qwen3-4B-Base...


Unsloth: Merging weights into 16bit:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from google.colab import files
import os
import shutil

if save_locally:
  # Download from the directory the save cell writes to: the merged model goes to
  # "Qwen3-4B-reVRP"; "Qwen3-4B-reVRP-lora" is the adapters-only directory.
  output_dir = "Qwen3-4B-reVRP" if save_full else "Qwen3-4B-reVRP-lora"
  if not os.path.isdir(output_dir):
    raise FileNotFoundError(
        f"Nothing to download: directory {output_dir!r} does not exist. Run the save cell first."
    )

  # Download each file. The loop variable is deliberately not called `file_path`,
  # because that global holds the data set path read by the dataset-loading cell.
  for file_name in sorted(os.listdir(output_dir)):
    download_path = os.path.join(output_dir, file_name)
    if os.path.isfile(download_path):  # files.download is meant for files, not sub-directories
      files.download(download_path)

  # Alternative: download a single archive instead of many files.
  # zip_file_name = "Qwen3-4B-reVRP.zip"
  # shutil.make_archive(zip_file_name.replace(".zip", ""), 'zip', output_dir)
  # files.download(zip_file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>