# XLAM 2.0 Function Calling Fine-tuning

Fine-tune Qwen 2.5 Instruct on a dataset in XLAM 2.0 (APIGen-MT) format to improve function calling.

**Model**: Qwen/Qwen2.5-7B-Instruct (loaded below as `unsloth/Qwen2.5-7B-Instruct`)  
**Method**: LoRA with 4-bit quantization  
**Framework**: Unsloth for 2x faster training

## 1. Setup and Installation

In [None]:
# Install dependencies
# NOTE(review): the next two lines both install unsloth (the PyPI release, then
# the "colab-new" extra from the GitHub repository). The second presumably
# supersedes the first — confirm whether both are still required.
!pip install -q unsloth
!pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# NOTE(review): these packages are installed without version pins, so the
# resolved versions may differ between runs — consider pinning once a working
# combination is known.
!pip install -q xformers trl peft accelerate bitsandbytes

In [None]:
import json
import torch
from datasets import load_dataset, Dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from pathlib import Path

## 2. Configuration

In [None]:
# ---------------------------------------------------------------------------
# Base model: which checkpoint to load and how.
# ---------------------------------------------------------------------------
MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct"
MAX_SEQ_LENGTH = 2048  # passed to both the model loader and the trainer
LOAD_IN_4BIT = True

# ---------------------------------------------------------------------------
# LoRA adapter settings (from APIGen-MT paper).
# ---------------------------------------------------------------------------
LORA_R = 16
LORA_ALPHA = 16
LORA_DROPOUT = 0.0
TARGET_MODULES = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# ---------------------------------------------------------------------------
# Optimisation schedule.
# Effective batch size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS.
# ---------------------------------------------------------------------------
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 2e-4
NUM_EPOCHS = 3
WARMUP_STEPS = 10
LOGGING_STEPS = 10
SAVE_STEPS = 100

# ---------------------------------------------------------------------------
# Training data: a HuggingFace repo takes precedence when set; otherwise the
# local JSONL file is read.
# ---------------------------------------------------------------------------
HF_DATASET_REPO = None  # e.g. "username/xlam-dataset"
LOCAL_DATASET_PATH = "xlam_v2_formatted.jsonl"  # XLAM 2.0 formatted dataset

# ---------------------------------------------------------------------------
# Where checkpoints and the final adapter are written.
# ---------------------------------------------------------------------------
OUTPUT_DIR = "./xlam_checkpoints"
FINAL_MODEL_NAME = "xlam-qwen2.5-7b-lora"

## 3. Load Model with Unsloth

In [None]:
# Load the base model and its tokenizer through Unsloth, using the values
# defined in the configuration cell. Both names are reused by every later
# cell (LoRA setup, dataset formatting, training, inference).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,  # Auto-detect
    load_in_4bit=LOAD_IN_4BIT,
)

# Echo the settings that were used so they are visible in the cell output.
print(f"Model loaded: {MODEL_NAME}")
print(f"Max sequence length: {MAX_SEQ_LENGTH}")
print(f"4-bit quantization: {LOAD_IN_4BIT}")

## 4. Configure LoRA

In [None]:
# Wrap the loaded model with LoRA adapters on the modules listed in
# TARGET_MODULES. `model` is rebound to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=TARGET_MODULES,
    bias="none",
    # Unsloth's own gradient-checkpointing mode (presumably to reduce memory
    # use — see the Unsloth documentation).
    use_gradient_checkpointing="unsloth",
    random_state=42,  # same value as the `seed` given to TrainingArguments
)

print(f"LoRA configured with rank={LORA_R}, alpha={LORA_ALPHA}")

## 5. Load and Prepare Dataset

In [None]:
# Load the raw XLAM 2.0 dataset, either from the HuggingFace Hub (when
# HF_DATASET_REPO is set) or from the local JSONL file.
if HF_DATASET_REPO:
    print(f"Loading dataset from HuggingFace: {HF_DATASET_REPO}")
    dataset = load_dataset(HF_DATASET_REPO, split="train")
else:
    print(f"Loading dataset from local file: {LOCAL_DATASET_PATH}")
    # JSONL: one JSON object per line. Decode as UTF-8 explicitly so the
    # result does not depend on the platform's default encoding, and skip
    # blank lines (a trailing newline at end of file is common) instead of
    # failing on them.
    data = []
    with open(LOCAL_DATASET_PATH, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as err:
                # JSONDecodeError is a ValueError subclass; re-raise with the
                # file position so the broken record can be located.
                raise ValueError(
                    f"{LOCAL_DATASET_PATH}:{line_no}: invalid JSON ({err})"
                ) from err
    dataset = Dataset.from_list(data)

print(f"Dataset loaded: {len(dataset)} samples")
if len(dataset) == 0:
    # Fail here with a clear message rather than with an IndexError on
    # dataset[0] below or an obscure error inside the trainer.
    raise ValueError(
        "Dataset is empty; check HF_DATASET_REPO / LOCAL_DATASET_PATH."
    )
print(f"Example sample: {dataset[0]}")

In [None]:
# Convert XLAM 2.0 format to chat messages for training
def xlam_to_chat_messages(sample):
    """
    Convert one XLAM 2.0 sample into a list of chat messages.

    Fields read from ``sample``:
    - conversations: list of turns, each a dict with "from" and "value"
      (required; a missing key raises KeyError)
    - system: domain policy text (optional)
    - tools: tool definitions, either as a JSON string or as an
      already-parsed list/dict (optional)

    The tool definitions are appended to the system message under an
    "Available tools:" heading. Without this the training text would contain
    function calls but never the schemas of the functions being called. When
    a sample has no (or empty) tools, the output is exactly the system
    message followed by the converted turns.

    Turn sources are mapped to chat roles as follows; any unrecognised
    source falls back to "user":
        human -> user, gpt -> assistant,
        function_call -> assistant, observation -> user

    Returns:
        dict with a single key "messages" holding a list of
        {"role": str, "content": ...} dicts.
    """
    role_mapping = {
        "human": "user",
        "gpt": "assistant",
        "function_call": "assistant",
        "observation": "user"
    }

    # Assemble the system message from the policy text and the tool schemas.
    system_parts = []
    if sample.get("system"):
        system_parts.append(sample["system"])

    tools = sample.get("tools")
    if isinstance(tools, str):
        tools_text = tools.strip()
    elif tools:
        # Already parsed (e.g. list of dicts): serialise back to JSON text.
        tools_text = json.dumps(tools, ensure_ascii=False)
    else:
        tools_text = ""
    # "[]" is an empty tool list encoded as a string; nothing to show.
    if tools_text and tools_text != "[]":
        system_parts.append(f"Available tools:\n{tools_text}")

    messages = []
    if system_parts:
        messages.append({
            "role": "system",
            "content": "\n\n".join(system_parts)
        })

    # Convert conversation turns to messages, preserving their order.
    messages.extend(
        {
            "role": role_mapping.get(turn["from"], "user"),
            "content": turn["value"]
        }
        for turn in sample["conversations"]
    )

    return {"messages": messages}

# Format messages into Qwen chat template
def format_chat_template(sample):
    """
    Render a sample's "messages" list into one training string.

    Uses the module-level ``tokenizer``. When the tokenizer has a chat
    template configured, that template is applied (without tokenising and
    without a trailing generation prompt). Otherwise the messages are
    rendered manually in the ChatML layout that Qwen uses:

    <|im_start|>system
    {system_message}<|im_end|>
    <|im_start|>user
    {user_message}<|im_end|>
    <|im_start|>assistant
    {assistant_message}<|im_end|>

    Args:
        sample: mapping with a "messages" key holding a list of
            {"role": ..., "content": ...} dicts.

    Returns:
        dict with a single key "text" containing the rendered conversation.
    """
    messages = sample["messages"]

    # Test for a configured template rather than for the method: the method
    # exists on every HuggingFace tokenizer, so checking for it would never
    # select the manual fallback. getattr also covers tokenizer objects that
    # have neither the attribute nor the method.
    if getattr(tokenizer, "chat_template", None):
        formatted = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )
    else:
        # Manual formatting
        formatted = "".join(
            f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
            for msg in messages
        )

    return {"text": formatted}

# Two passes over the dataset: XLAM sample -> "messages", then
# "messages" -> a single "text" string.
dataset = dataset.map(xlam_to_chat_messages)
# Drop every column that exists after the first pass, so only the "text"
# column returned by format_chat_template remains.
columns_to_drop = dataset.column_names
dataset = dataset.map(format_chat_template, remove_columns=columns_to_drop)
print("Dataset formatted for training")
text_preview = dataset[0]["text"][:500]
print(f"Example formatted text:\n{text_preview}...")

## 6. Training Configuration

In [None]:
# Probe bf16 support once; fp16 is used exactly when bf16 is unavailable.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    # Output and checkpointing
    output_dir=OUTPUT_DIR,
    save_steps=SAVE_STEPS,
    save_total_limit=3,
    # Batching
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # Schedule
    num_train_epochs=NUM_EPOCHS,
    warmup_steps=WARMUP_STEPS,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="linear",
    # Optimiser
    optim="adamw_8bit",
    weight_decay=0.01,
    # Mixed precision
    bf16=use_bf16,
    fp16=not use_bf16,
    # Logging and reproducibility
    logging_steps=LOGGING_STEPS,
    report_to="none",  # Change to "wandb" if you want logging
    seed=42,
)

# Summarise the settings that matter most when reading the training log.
effective_batch_size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
print("Training configuration:")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"  Effective batch size: {effective_batch_size}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  Epochs: {NUM_EPOCHS}")

## 7. Initialize Trainer

In [None]:
# Build the supervised fine-tuning trainer over the pre-rendered "text"
# column produced by format_chat_template.
# NOTE(review): passing tokenizer / dataset_text_field / max_seq_length /
# packing directly to SFTTrainer depends on the installed TRL version (newer
# releases expect these on SFTConfig) — confirm against the TRL version that
# the install cell resolves to.
# NOTE(review): no response-only masking is configured in this cell, so the
# loss is presumably computed over the whole rendered conversation (system
# and user turns included) — confirm this is intended.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    args=training_args,
    packing=False,  # Disable packing for function calling (needs clear boundaries)
)

print("Trainer initialized")

## 8. Train Model

In [None]:
def _bytes_to_gb(num_bytes):
    """Convert a byte count to gigabytes (1024**3 bytes), rounded to 3 places."""
    return round(num_bytes / (1024 ** 3), 3)


# Baseline: device properties and the memory already reserved (model weights
# and adapters) before any training step runs.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gb(gpu_stats.total_memory)
print(f"GPU: {gpu_stats.name}")
print(f"Memory: {start_gpu_memory} GB / {max_memory} GB reserved")
print("\nStarting training...\n")

# Run the fine-tuning loop; the returned stats object is kept for inspection.
trainer_stats = trainer.train()

# Peak reservation after training, and how much of it is attributable to
# training itself (peak minus the pre-training baseline).
used_memory = _bytes_to_gb(torch.cuda.max_memory_reserved())
used_memory_for_training = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
print("\nTraining complete!")
print(f"Peak memory reserved: {used_memory} GB ({used_percentage}%)")
print(f"Memory used for training: {used_memory_for_training} GB")

## 9. Save Model

In [None]:
# Save LoRA adapters locally
# The model and the tokenizer are written to the same directory,
# FINAL_MODEL_NAME (a path relative to the current working directory).
model.save_pretrained(FINAL_MODEL_NAME)
tokenizer.save_pretrained(FINAL_MODEL_NAME)
print(f"Model saved to {FINAL_MODEL_NAME}")

# Optionally push to HuggingFace Hub
# NOTE: if you enable the lines below, do not commit a literal access token
# into the notebook — read it from an environment variable or a secrets store.
# model.push_to_hub("your-username/xlam-qwen2.5-7b-lora", token="your_token")
# tokenizer.push_to_hub("your-username/xlam-qwen2.5-7b-lora", token="your_token")

## 10. Test Inference

In [None]:
# Enable inference mode
FastLanguageModel.for_inference(model)

# Test tools: a single weather-lookup function described with a JSON schema.
test_tools = [
    {
        "name": "get_weather",
        "description": "Get current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City name"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
            },
            "required": ["location"]
        }
    }
]

# Test message: the tool list is embedded in the user turn as JSON text.
test_messages = [
    {"role": "system", "content": "You are a helpful assistant with access to tools."},
    {"role": "user", "content": f"Available tools:\n{json.dumps(test_tools, indent=2)}\n\nUser: What's the weather in San Francisco?"}
]

# Format with chat template. return_dict=True yields both input_ids and the
# attention mask, so generate() does not have to guess the mask, and the
# result has the same shape regardless of the library's default for this flag.
# The tensors are moved to the device the model is on rather than to a
# hard-coded "cuda".
test_input = tokenizer.apply_chat_template(
    test_messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)

# Generate
outputs = model.generate(
    **test_input,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)

# generate() returns prompt + completion; slice off the prompt so only the
# model's own answer is printed.
prompt_length = test_input["input_ids"].shape[-1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print("\n=== Test Inference ===")
print(response)

## 11. Evaluation (Optional)

Compare the fine-tuned model against the base model on function-calling accuracy.

In [None]:
def _tool(name, description):
    """Build a minimal tool definition (name and description only)."""
    return {"name": name, "description": description}


# Evaluation test cases: each pairs a user query with the candidate tools
# shown to the model and the name of the tool it is expected to pick.
eval_cases = [
    {
        "query": "Book a flight from NYC to LAX for tomorrow",
        "expected_tool": "book_flight",
        "tools": [
            _tool("book_flight", "Book airline tickets"),
            _tool("get_weather", "Get weather info"),
        ],
    },
    {
        "query": "What's the temperature in Paris?",
        "expected_tool": "get_weather",
        "tools": [
            _tool("get_weather", "Get weather info"),
            _tool("book_hotel", "Book hotel rooms"),
        ],
    },
]

def evaluate_tool_calling(model, tokenizer, test_cases):
    """Evaluate model's tool calling accuracy.

    For each case the candidate tools and the query are placed in a user
    message, the model generates a completion, and the case counts as correct
    when the expected tool name appears in the *generated* text.

    Args:
        model: model exposing ``generate`` and ``device``.
        tokenizer: tokenizer exposing ``apply_chat_template`` and ``decode``.
        test_cases: sequence of dicts with keys "query", "expected_tool"
            and "tools".

    Returns:
        Accuracy as a percentage in [0, 100]; 0.0 when ``test_cases`` is
        empty.
    """
    total = len(test_cases)
    if total == 0:
        # Nothing to score; avoid dividing by zero below.
        print("\nAccuracy: n/a (no test cases)")
        return 0.0

    correct = 0
    for case in test_cases:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Available tools:\n{json.dumps(case['tools'])}\n\nUser: {case['query']}"}
        ]

        # return_dict=True provides input_ids plus the attention mask; the
        # tensors go to whichever device the model lives on.
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        ).to(model.device)

        # Greedy decoding keeps the base vs fine-tuned comparison
        # reproducible from run to run.
        outputs = model.generate(**inputs, max_new_tokens=128, do_sample=False)

        # generate() returns prompt + completion. The prompt lists every
        # candidate tool by name, so matching against the full sequence would
        # mark every case correct; score the completion only.
        prompt_length = inputs["input_ids"].shape[-1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # Check if expected tool is called
        if case['expected_tool'] in response:
            correct += 1
            print(f"✓ {case['query']} -> {case['expected_tool']}")
        else:
            print(f"✗ {case['query']} -> Expected {case['expected_tool']}, got: {response[:100]}")

    accuracy = (correct / total) * 100
    print(f"\nAccuracy: {accuracy:.1f}% ({correct}/{total})")
    return accuracy

# Score the fine-tuned model first, while it is the model in memory.
print("=== Evaluating Fine-tuned Model ===")
finetuned_accuracy = evaluate_tool_calling(model, tokenizer, eval_cases)

# Load an untouched copy of the base checkpoint with the same loader settings
# used for training, and switch it to inference mode.
print("\n=== Loading Base Model for Comparison ===")
base_load_kwargs = dict(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,
    load_in_4bit=LOAD_IN_4BIT,
)
base_model, base_tokenizer = FastLanguageModel.from_pretrained(**base_load_kwargs)
FastLanguageModel.for_inference(base_model)

print("\n=== Evaluating Base Model ===")
base_accuracy = evaluate_tool_calling(base_model, base_tokenizer, eval_cases)

# Side-by-side summary; the last line is the difference of the two
# percentages (fine-tuned minus base).
accuracy_delta = finetuned_accuracy - base_accuracy
print("\n=== Results ===")
print(f"Base Model Accuracy: {base_accuracy:.1f}%")
print(f"Fine-tuned Model Accuracy: {finetuned_accuracy:.1f}%")
print(f"Improvement: {accuracy_delta:.1f}%")

## 12. Export for Production

Merge the LoRA weights into the base model for deployment.

In [None]:
# Merge and save full model (larger but faster inference)
# The merged model is written to a sibling directory named
# "<FINAL_MODEL_NAME>-merged"; the tokenizer is passed along so it is saved
# with the weights.
model.save_pretrained_merged(
    f"{FINAL_MODEL_NAME}-merged",
    tokenizer,
    save_method="merged_16bit",  # or "merged_4bit" for smaller size
)
print(f"Merged model saved to {FINAL_MODEL_NAME}-merged")

# Save in GGUF format for llama.cpp (optional)
# Uncomment to additionally export a quantized GGUF file to
# "<FINAL_MODEL_NAME>-gguf".
# model.save_pretrained_gguf(
#     f"{FINAL_MODEL_NAME}-gguf",
#     tokenizer,
#     quantization_method="q4_k_m"
# )