In [1]:
# %%
%pip install transformers datasets wandb accelerate

# %%
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
import torch
import numpy as np
import os
from kaggle_secrets import UserSecretsClient
import wandb
from huggingface_hub import login


Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=2.0.0->accelerate)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collectin

2025-05-18 09:02:46.611601: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747558967.068657      71 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747558967.221108      71 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:

# %%
# Set up environment variables for authentication
user_secrets = UserSecretsClient()
os.environ['HF_TOKEN'] = user_secrets.get_secret("HF_TOKEN")
os.environ['WANDB_API_KEY'] = user_secrets.get_secret("WANDB_TOKEN")
assert(os.getenv('HF_TOKEN'))
assert(os.getenv('WANDB_API_KEY'))


In [3]:

# %%
# Create a unique run name based on timestamp
from datetime import datetime
import pytz

now_utc = datetime.now(pytz.utc)
now_colombo = now_utc.astimezone(pytz.timezone('Asia/Colombo'))
time_str = now_colombo.strftime('%Y-%b-%d_%H-%M-%S')
run_name = f'full-ft-{time_str}'
print(run_name)


full-ft-2025-May-18_14-33-09


In [4]:

# %%
# Initialize wandb and login to HuggingFace
wandb.login(key=os.getenv('WANDB_API_KEY'))
wandb.init(project="choreo-doc-full-ft", name=run_name)

login(token=os.getenv('HF_TOKEN'))


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrtweera[0m ([33mrtw-rtweera[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:

# %%
# First set the default device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [6]:

# Load tokenizer and model
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [7]:

# Load model with explicit device placement to avoid mixed-device issues
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
)

# Explicitly move model to the device
model = model.to(device)


config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [8]:

# Make sure all parameters are trainable (unlike LoRA which only trains a subset)
for param in model.parameters():
    param.requires_grad = True

# %%
# Print trainable parameters info
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}% of total)")


Trainable parameters: 494,032,768 (100.00% of total)


In [9]:

# Check that all parameters are on the correct device
device_check = all(p.device == device for p in model.parameters())
print(f"All parameters on {device}: {device_check}")


All parameters on cuda: False


In [10]:

# %%
# Load dataset
dataset = load_dataset("json", data_files="/kaggle/input/choreo-dataset/choreo_dataset.jsonl")


Generating train split: 0 examples [00:00, ? examples/s]

In [11]:

# %%
# Split the dataset into training and validation sets
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
print(f"Training examples: {len(train_dataset)}, Evaluation examples: {len(eval_dataset)}")


Training examples: 607, Evaluation examples: 68


In [12]:

# %%
# Preprocess and tokenize dataset
def preprocess_function(examples):
    # Format conversations using the model's chat template
    conversations = []
    
    for instruction, inp, out in zip(examples['instruction'], examples['input'], examples['output']):
        # Create conversation with system instruction, user input, and assistant output
        messages = [
            {"role": "system", "content": instruction},
            {"role": "user", "content": inp},
            {"role": "assistant", "content": out}
        ]
        conversations.append(messages)
    
    # Apply the model's built-in chat template
    formatted_texts = [tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=False) 
                      for conv in conversations]
    
    # Tokenize the formatted texts
    tokenized = tokenizer(
        formatted_texts,
        padding="max_length",
        truncation=True,
        max_length=1024,
        return_tensors="pt"
    )
    
    # Create labels (for causal LM, typically identical to input_ids)
    tokenized["labels"] = tokenized["input_ids"].clone()
    
    return tokenized


In [13]:

# %%
# Apply preprocessing to datasets
print("Preprocessing training dataset...")
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

print("Preprocessing evaluation dataset...")
tokenized_eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=eval_dataset.column_names
)


Preprocessing training dataset...


Map:   0%|          | 0/607 [00:00<?, ? examples/s]

Preprocessing evaluation dataset...


Map:   0%|          | 0/68 [00:00<?, ? examples/s]

In [14]:

# %%
# Determine whether to use fp16 or bf16 based on hardware support
bf16_available = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
print(f"BF16 support available: {bf16_available}")


BF16 support available: True


In [15]:

# Set up training arguments - simplified without DeepSpeed
training_args = TrainingArguments(
    output_dir="./qwen_choreo_full_ft",
    learning_rate=1e-5,  # Slightly higher learning rate since we're not using DeepSpeed
    per_device_train_batch_size=1,  # Very small batch size to avoid OOM errors
    gradient_accumulation_steps=16,  # Increased accumulation to compensate for smaller batch
    per_device_eval_batch_size=1,
    num_train_epochs=3,  # Reduced epochs for faster training
    weight_decay=0.01,
    logging_steps=1,
    logging_first_step=True,
    eval_strategy="steps",
    eval_steps=5,
    save_strategy="steps",
    save_steps=20,
    # Precision options
    bf16=bf16_available,  # Use BF16 if supported
    fp16=not bf16_available and torch.cuda.is_available(),  # Otherwise use FP16 if on CUDA
    logging_dir="./logs",
    load_best_model_at_end=True,
    save_total_limit=4,
    report_to="wandb",
    push_to_hub=True,
    hub_model_id=f"qwen-choreo-full-ft-{time_str}",
    run_name=run_name,
    # Memory optimization
    gradient_checkpointing=True,
    optim="adamw_torch",
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    # Avoid device issues
    no_cuda=not torch.cuda.is_available(),  # Only disable CUDA if not available
    dataloader_num_workers=1,  # Reduced for stability
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [16]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
5,3.3714,5.053649
10,0.3546,0.740638
15,0.2975,0.659138
20,0.2908,0.610927
25,0.2768,0.589317
30,0.2957,0.577857
35,0.266,0.570947
40,0.205,0.567414
45,0.2939,0.565714
50,0.2903,0.565357


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=57, training_loss=0.7154842638655713, metrics={'train_runtime': 2484.0522, 'train_samples_per_second': 0.733, 'train_steps_per_second': 0.023, 'total_flos': 4004244246233088.0, 'train_loss': 0.7154842638655713, 'epoch': 3.0})

In [17]:
# Save the model locally
print("Saving model...")
model.save_pretrained("./qwen_choreo_full_ft_final")
tokenizer.save_pretrained("./qwen_choreo_full_ft_final")

print("Pushing model to Hub...")
model.push_to_hub(f"qwen-choreo-full-ft-{time_str}-final")
tokenizer.push_to_hub(f"qwen-choreo-full-ft-{time_str}-final")


Saving model...
Pushing model to Hub...


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rtweera/qwen-choreo-full-ft-2025-May-18_14-33-09-final/commit/72ff706d3ab8e529f89f02cfdaf310b8e8ecc39b', commit_message='Upload tokenizer', commit_description='', oid='72ff706d3ab8e529f89f02cfdaf310b8e8ecc39b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rtweera/qwen-choreo-full-ft-2025-May-18_14-33-09-final', endpoint='https://huggingface.co', repo_type='model', repo_id='rtweera/qwen-choreo-full-ft-2025-May-18_14-33-09-final'), pr_revision=None, pr_num=None)

In [18]:
# Test the model with a sample
print("Testing the fine-tuned model...")
test_input = "Whart is choreo?"

# Create conversation with the test input
test_messages = [
    {"role": "user", "content": test_input}
]
formatted_test = tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(formatted_test, return_tensors="pt").to(device)

# Generate response
outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_new_tokens=1000,
    temperature=0.5,
    top_p=0.9,
    do_sample=True
)

# Decode and print response
print("Model response:")
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
DynamicCache + torch.export is tested on torch 2.6.0+ and may not work on earlier versions.


Testing the fine-tuned model...
Model response:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Whart is choreo?<|im_end|>
<|im_start|>assistant
Choreo is an open-source, high-performance data processing framework developed and maintained by Alibaba Cloud. It provides a comprehensive set of tools for building scalable, efficient, and flexible data pipelines in various domains such as machine learning, natural language processing, IoT, and more.\n\nChoreo supports multiple programming languages including Python, Java, C++, and Go. With Choreo, you can easily build applications that process large volumes of data efficiently.\n\nKey features of Choreo include:\n- **High Performance**: Designed to handle massive datasets with low latency.\n- **Scalability**: Automatically scales your application based on the load and performance requirements.\n- **Flexibility**: Supports both batch and stream operations.\n- **Security**: Bu