In [None]:
%pip install transformers datasets wandb accelerate deepspeed

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
import torch
import numpy as np
import os
from kaggle_secrets import UserSecretsClient
import wandb
from huggingface_hub import login

In [None]:
# Copy the API tokens from Kaggle's secret store into environment variables
user_secrets = UserSecretsClient()
os.environ['HF_TOKEN'] = user_secrets.get_secret("HF_TOKEN")
# The Kaggle secret is named "WANDB_TOKEN"; it is exported as WANDB_API_KEY
os.environ['WANDB_API_KEY'] = user_secrets.get_secret("WANDB_TOKEN")

# Fail early if either token is missing or empty
assert os.environ.get('HF_TOKEN')
assert os.environ.get('WANDB_API_KEY')

In [None]:
# Derive a timestamped run name from the current time in the Asia/Colombo zone
from datetime import datetime
import pytz

colombo_tz = pytz.timezone('Asia/Colombo')
now_utc = datetime.now(pytz.utc)
now_colombo = now_utc.astimezone(colombo_tz)

# Example shape: 2025-Jan-31--14-05-09
time_str = now_colombo.strftime('%Y-%b-%d--%H-%M-%S')
run_name = f'full-ft-{time_str}'
print(run_name)

In [None]:
# Authenticate with Weights & Biases and open the run for this session
wandb.login(key=os.environ.get('WANDB_API_KEY'))
wandb.init(name=run_name, project="choreo-doc-full-ft")

# Authenticate with the Hugging Face Hub
login(token=os.environ.get('HF_TOKEN'))

In [None]:
# Load the base checkpoint and its tokenizer
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",           # Let the loader pick the device placement
    torch_dtype=torch.bfloat16,  # Load the weights in bfloat16
)
# NOTE(review): device_map="auto" combined with the DeepSpeed-backed Trainer used
# further down may conflict on multi-GPU machines - confirm on the target hardware.

# Full fine-tuning: every parameter is marked trainable (LoRA would train only a subset)
model.requires_grad_(True)

In [None]:
# Report how many of the model's parameters will receive gradients
total_params = 0
trainable_params = 0
for p in model.parameters():
    count = p.numel()
    total_params += count
    if p.requires_grad:
        trainable_params += count
print(f"Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.2f}% of total)")

In [None]:
# Read the JSON-lines file; load_dataset exposes it as the "train" split
dataset = load_dataset(
    "json",
    data_files="/kaggle/input/choreo-dataset/choreo_dataset.jsonl",
)

In [None]:
# Split the dataset into training (90%) and validation (10%) sets.
# A fixed seed makes the split reproducible across kernel restarts, and the result is
# bound to a new name so that re-running this cell always splits the full dataset
# rather than an already-reduced train split.
dataset_splits = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_splits["train"]
eval_dataset = dataset_splits["test"]

In [None]:
# Preprocess and tokenize dataset
def preprocess_function(examples):
    """Turn a batch of instruction/input/output rows into padded causal-LM features.

    Args:
        examples: Batch mapping with parallel 'instruction', 'input' and 'output'
            sequences (the layout `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. Labels
        equal "input_ids" except at padding positions, which are set to -100 so
        the loss ignores them.
    """
    # One system/user/assistant conversation per row
    conversations = [
        [
            {"role": "system", "content": instruction},
            {"role": "user", "content": inp},
            {"role": "assistant", "content": out},
        ]
        for instruction, inp, out in zip(examples['instruction'], examples['input'], examples['output'])
    ]

    # Render each conversation to text with the model's built-in chat template
    formatted_texts = [
        tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=False)
        for conv in conversations
    ]

    # Tokenize to a fixed length so every example has the same shape
    tokenized = tokenizer(
        formatted_texts,
        padding="max_length",
        truncation=True,
        max_length=1024,
        return_tensors="pt"
    )

    # Labels start as a copy of the inputs. Because every sequence is padded to
    # max_length, padding positions are masked with -100 (the index the loss
    # ignores); otherwise the model would be trained to predict pad tokens.
    labels = tokenized["input_ids"].clone()
    labels[tokenized["attention_mask"] == 0] = -100
    tokenized["labels"] = labels

    return tokenized

In [None]:
# %%

In [None]:
# Tokenize both splits with the same settings; the raw text columns are dropped
# so that only the tokenizer outputs and labels remain.
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_dataset, eval_dataset)
)

In [None]:
# DeepSpeed configuration for efficient full fine-tuning.
# Values set to "auto" are filled in by the Hugging Face Trainer from TrainingArguments.
deepspeed_config = {
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    # Without a bf16 section the Trainer has nothing to fill in, so bf16=True in
    # TrainingArguments would never reach DeepSpeed.
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    # No "scheduler" section on purpose: a DeepSpeed scheduler configured here takes
    # precedence over TrainingArguments, which would silently replace the requested
    # lr_scheduler_type (cosine) with WarmupLR. Leaving it out lets the Trainer build
    # the scheduler from lr_scheduler_type / warmup_ratio.
    "zero_optimization": {
        "stage": 2,
        # Keep optimizer state in pinned CPU memory to reduce GPU memory use
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True
        },
        "allgather_partitions": True,
        "allgather_bucket_size": 2e8,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": True
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 10,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": False
}

In [None]:
# %%

In [None]:
# Set up training arguments - reduced learning rate and batch size for full fine-tuning

# Pick exactly one reduced-precision mode. TrainingArguments raises a ValueError when
# fp16 and bf16 are both True, so fp16 is only the fallback for GPUs without bf16.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
use_fp16 = cuda_available and not use_bf16
# TrainingArguments also rejects tf32=True unless the GPU is Ampere or newer
# (compute capability major version >= 8), so only request it where it is usable.
use_tf32 = cuda_available and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    output_dir="./qwen_choreo_full_ft",
    # For full fine-tuning, we use a lower learning rate than in LoRA
    learning_rate=2e-6,
    per_device_train_batch_size=2,  # Smaller batch size due to memory constraints with full fine-tuning
    gradient_accumulation_steps=8,  # Increased to compensate for smaller batch size
    per_device_eval_batch_size=2,
    eval_accumulation_steps=8,
    num_train_epochs=4,  # Fewer epochs than LoRA, as full fine-tuning converges faster
    weight_decay=0.01,
    logging_steps=10,
    logging_first_step=True,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=200,  # Multiple of eval_steps, as load_best_model_at_end needs
    fp16=use_fp16,
    bf16=use_bf16,
    logging_dir="./logs",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="wandb",
    push_to_hub=True,
    hub_model_id=f"qwen-choreo-full-ft-{time_str}",  # Custom model ID for HF Hub
    hub_strategy="end",  # Push at the end of training
    run_name=run_name,
    # Additional parameters for full fine-tuning
    deepspeed=deepspeed_config,  # Enable DeepSpeed
    tf32=use_tf32,  # Enable TensorFloat-32 only where the GPU supports it
    ddp_find_unused_parameters=False,
    gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency
    optim="adamw_torch",
    warmup_ratio=0.1,  # Warmup ratio for learning rate scheduler
    lr_scheduler_type="cosine",  # Cosine scheduler works well for full fine-tuning
)

# Wire the model, training arguments, tokenized splits and tokenizer into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized_eval_dataset,
    train_dataset=tokenized_train_dataset,
)

# %%
# Run the fine-tuning loop
trainer.train()

# %%
# Save the model locally
final_model_dir = "./qwen_choreo_full_ft_final"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# Upload the final model to the Hub.
# With hub_strategy="end" the Trainer does not upload during training; it only pushes
# when asked to, so the upload must be triggered explicitly here. Previously the upload
# was skipped entirely whenever push_to_hub was True.
if training_args.push_to_hub:
    trainer.push_to_hub()
else:
    model.push_to_hub(f"qwen-choreo-full-ft-{time_str}")
    tokenizer.push_to_hub(f"qwen-choreo-full-ft-{time_str}")

# %%
# Smoke-test the fine-tuned model on one question
test_input = "How do I configure my applications to scale automatically?"

# Single-turn conversation holding only the user question
test_messages = [dict(role="user", content=test_input)]
formatted_test = tokenizer.apply_chat_template(
    test_messages,
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer(formatted_test, return_tensors="pt").to(model.device)

# Sample a completion of up to 1000 new tokens
sampling_options = {
    "max_new_tokens": 1000,
    "temperature": 0.5,
    "top_p": 0.9,
    "do_sample": True,
}
outputs = model.generate(input_ids=inputs["input_ids"], **sampling_options)

# Show the full sequence (prompt included) with special tokens left visible
print(tokenizer.decode(outputs[0], skip_special_tokens=False))