In [None]:
%pip install peft wandb

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
import torch
from peft import LoraConfig, get_peft_model
import numpy as np
import os
from kaggle_secrets import UserSecretsClient
import wandb
from huggingface_hub import login

In [4]:
user_secrets = UserSecretsClient()
os.environ['hf_token'] = user_secrets.get_secret("HF_TOKEN")
os.environ['WANDB_API_KEY'] = user_secrets.get_secret("WANDB_TOKEN")
assert(os.getenv('hf_token'))
assert(os.getenv('WANDB_API_KEY'))

In [5]:
from datetime import datetime
import pytz

now_utc = datetime.now(pytz.utc)
now_colombo = now_utc.astimezone(pytz.timezone('Asia/Colombo'))
time_str = now_colombo.strftime('%Y-%b-%d--%H-%M-%S')
run_name = f'lora-{time_str}'
print(run_name)

lora-2025-May-16--13-53-57


In [6]:
wandb.login(key=os.getenv('WANDB_API_KEY'))
wandb.init(project="choreo-doc-ast-ft-lora", name=run_name)

login(token=os.getenv('hf_token'))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrtweera[0m ([33mrtw-rtweera[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [7]:
# Load tokenizer and model
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # Use mixed precision for efficiency
    device_map="auto"            # Automatically choose best device setup (NOTE: remove if causes problems)
)

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [None]:
# Configure model for LoRA fine-tuning (Parameter-Efficient Fine-Tuning)
lora_config = LoraConfig(
    r=8,                          # Rank
    lora_alpha=16,                # Alpha parameter for LoRA scaling
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Attention layers to fine-tune
    lora_dropout=0.10,
    bias="none",
    task_type="CAUSAL_LM"
)

In [9]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Shows percentage of parameters being trained

trainable params: 1,081,344 || all params: 495,114,112 || trainable%: 0.2184


In [11]:
# Load dataset
dataset = load_dataset("json", data_files="/kaggle/input/choreo-dataset/choreo_dataset.jsonl")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Proper formatting for Qwen models' chat template based on Ollama template
# def format_chat(instruction, input_text, output):
#     # Using Qwen's chat template structure from Ollama
#     if instruction:
#         # Use instruction as system message
#         formatted = f"<|im_start|>system\n{instruction}<|im_end|>\n"
#     else:
#         formatted = ""
    
#     # Add user input (if any)
#     if input_text:
#         formatted += f"<|im_start|>user\n{input_text}<|im_end|>\n"
    
#     # Add assistant response
#     formatted += f"<|im_start|>assistant\n{output}<|im_end|>"
    
#     return formatted

In [12]:
# Split the dataset into training and validation sets
dataset = dataset["train"].train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

In [13]:
# Preprocess and tokenize dataset
def preprocess_function(examples):
    # We are providing same instruction for every message, so tokenize only once (for optimization)
    system_instruction = examples['instruction'][0] # select 1st instruction

    # format each example
    formatted_texts = []
    for inp, out in zip(examples['input'], examples['output']):
        # Using consistent system message with varied user inputs and assistant outputs
        formatted = f"<|im_start|>system\n{system_instruction}<|im_end|>\n<|im_start|>user\n{inp}<|im_end|>\n<|im_start|>assistant\n{out}<|im_end|>"
        formatted_texts.append(formatted)
    
    tokenized = tokenizer(
        formatted_texts,
        padding="max_length",
        truncation=True,
        max_length=1024,
        return_tensors="pt"
    )
    
    # Create labels (for causal LM, typically identical to input_ids)
    tokenized["labels"] = tokenized["input_ids"].clone()
    
    return tokenized

In [14]:
# Apply preprocessing to datasets
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

tokenized_eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=eval_dataset.column_names
)

Map:   0%|          | 0/607 [00:00<?, ? examples/s]

Map:   0%|          | 0/68 [00:00<?, ? examples/s]

In [None]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./qwen_choreo_ft_lora",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=4,
    eval_accumulation_steps=4,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_steps=10,
    logging_first_step=True,
    eval_on_start=True,
    eval_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    fp16=True if torch.cuda.is_available() else False,
    logging_dir="./logs",
    load_best_model_at_end=True,
    save_total_limit=3,
    report_to="wandb",
    push_to_hub=True,
    run_name=run_name
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [17]:
# Start training
trainer.train()

Step,Training Loss,Validation Loss
0,No log,11.058711
20,9.615100,9.616743
40,7.998200,7.957623
60,6.320400,6.294807
80,4.811900,4.884067
100,4.144000,3.989768


TrainOutput(global_step=114, training_loss=6.505313045100162, metrics={'train_runtime': 485.708, 'train_samples_per_second': 3.749, 'train_steps_per_second': 0.235, 'total_flos': 4016342565126144.0, 'train_loss': 6.505313045100162, 'epoch': 3.0})

In [18]:
# Save the model
model.save_pretrained("./qwen_choreo_ft_final")
tokenizer.save_pretrained("./qwen_choreo_ft_final")

('./qwen_choreo_ft_final/tokenizer_config.json',
 './qwen_choreo_ft_final/special_tokens_map.json',
 './qwen_choreo_ft_final/vocab.json',
 './qwen_choreo_ft_final/merges.txt',
 './qwen_choreo_ft_final/added_tokens.json',
 './qwen_choreo_ft_final/tokenizer.json')

In [None]:
# Optionally, test the model with a sample
test_input = "How can I enable rate limiting for aan API in choreo?"
formatted_test = f"<|im_start|>user\n{test_input}<|im_end|>\n<|im_start|>assistant\n"
inputs = tokenizer(formatted_test, return_tensors="pt").to(model.device)

# Generate response
outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_new_tokens=1000,
    temperature=0.5,
    top_p=0.9,
    do_sample=True
)

# Decode and print response
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

<|im_start|>user
How can I enable rate limiting for aan API in choreo?<|im_end|>
<|im_start|>assistant
To enable rate limiting for an API using the Choreo framework, you typically need to configure your application to limit the number of requests that it makes per unit time (e.g., per second). This is usually done by setting a rate limit on your backend service.

Here are the general steps to achieve this:

### 1. Define Rate Limits

First, define how many requests you want to make within a given time frame. For example, if you want to allow up to 500 requests per minute, you would set the following rate limits:

```python
from choreo.api import RateLimitingAPI

# Create an instance of the Rate Limiting API
rate_limit = RateLimitingAPI()

# Set the maximum number of requests allowed per minute
rate_limit.set_max_requests_per_minute(500)

# Example usage:
response = rate_limit.request("GET", "/api/v1/data")
```

### 2. Implement Rate Limiting Logic

You also need to implement logic to h

[1;34mwandb[0m: 
[1;34mwandb[0m: 🚀 View run [33mlora-2025-May-16--13-53-57[0m at: [34mhttps://wandb.ai/rtw-rtweera/choreo-doc-ast-ft-lora/runs/4u0f0b63[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20250516_082406-4u0f0b63/logs[0m
