<a href="https://www.kaggle.com/code/pragnyanramtha/ai-math?scriptVersionId=282855868" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# Dependency bootstrap. The order matters: `uv` is installed first because
# every later command is run through `uv pip`.
! pip install uv
# PyTorch stack pulled from the CUDA 12.8 wheel index.
! uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
# Training libraries; only numpy is version-pinned (1.26.4), everything else is upgraded to latest.
! uv pip install -U peft datasets deepspeed scikit-learn accelerate numpy==1.26.4 scikit-learn transformers trl
# Prebuilt flash-attn 2.6.3 wheel; per its filename it targets cu128 / torch 2.9 / CPython 3.11.
# NOTE(review): torch above is unpinned, so this wheel can stop matching the installed torch — confirm.
! uv pip install https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.5.4/flash_attn-2.6.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl

[2mUsing Python 3.11.13 environment at: /usr[0m
[2mAudited [1m3 packages[0m [2min 73ms[0m[0m
[2mUsing Python 3.11.13 environment at: /usr[0m
[2K[2mResolved [1m80 packages[0m [2min 1.79s[0m[0m                                        [0m
[2mAudited [1m80 packages[0m [2min 1ms[0m[0m
[2mUsing Python 3.11.13 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 77ms[0m[0m


In [2]:
"""
SFT Training Script for Phi-4-Reasoning-Plus
Full fine-tuning with DeepSpeed ZeRO-3 - MAX GPU UTILIZATION
"""

import os
import time
import json
import torch
from pathlib import Path
from datetime import datetime, timedelta

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, TrainerCallback
from datasets import load_dataset
from trl import SFTTrainer

# Disable W&B
os.environ["WANDB_DISABLED"] = "true"


# === DeepSpeed ZeRO-3 Config - MAX GPU UTILIZATION ===
DEEPSPEED_CONFIG = {
    # Run the DeepSpeed engine in bfloat16.
    "bf16": {"enabled": True},

    "zero_optimization": {
        "stage": 3,

        # Optimizer state is offloaded to pinned CPU memory.
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": True,
            "buffer_count": 8,
            "fast_init": True,
        },

        # Parameters are not offloaded.
        "offload_param": {"device": "none"},

        # Communication / bucketing knobs.
        "overlap_comm": True,
        "contiguous_gradients": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 1e9,
        "stage3_prefetch_bucket_size": 1e9,
        "stage3_param_persistence_threshold": 1e6,

        # Memory-budget knobs (values are element counts written as floats).
        "sub_group_size": 1e12,
        "stage3_max_live_parameters": 3e9,
        "stage3_max_reuse_distance": 3e9,
        "stage3_gather_16bit_weights_on_model_save": True,
    },

    # "auto" entries are left for the Hugging Face Trainer integration to fill
    # in from TrainingArguments.
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",

    # Engine switches left at "off".
    "prescale_gradients": False,
    "wall_clock_breakdown": False,

    # Data type used for communication collectives.
    "communication_data_type": "bf16",
}


# === Real-time Logger with GPU Stats ===
class RealTimeLogger(TrainerCallback):
    """Trainer callback that prints a one-line progress report per loss log.

    Each report shows progress percentage, step counters, loss, learning rate,
    epoch, GPU memory (plus compute utilisation when ``nvidia-smi`` can be
    queried) and an ETA extrapolated from the average step rate so far.
    """

    def __init__(self):
        # Wall-clock timestamps, (re)set in on_train_begin. last_step_time is
        # initialised here too so on_log never hits a missing attribute.
        self.start_time = None
        self.last_step_time = None
        # Seconds elapsed between consecutive loss logs.
        self.step_times = []

    def on_train_begin(self, args, state, control, **kwargs):
        """Start the timers and print the opening banner."""
        now = time.time()
        self.start_time = now
        self.last_step_time = now
        print("\n" + "="*80)
        print("üöÄ TRAINING STARTED | DeepSpeed ZeRO-3 | MAX GPU UTILIZATION")
        print("="*80 + "\n")

    @staticmethod
    def _query_compute_util():
        """Return GPU compute utilisation in percent, or -1 when unavailable.

        ``nvidia-smi`` prints one line per GPU; only the first line is used so
        multi-GPU hosts do not break the integer parse.
        """
        import subprocess

        try:
            result = subprocess.run(
                ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits'],
                capture_output=True, text=True, timeout=1
            )
            return int(result.stdout.strip().splitlines()[0])
        except (OSError, subprocess.SubprocessError, ValueError, IndexError):
            # Best effort only: missing binary, timeout or unparsable output.
            return -1

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print a progress line whenever the logged metrics include a loss."""
        if not logs or "loss" not in logs:
            return

        now = time.time()
        # Tolerate being called before on_train_begin has started the timers.
        if self.start_time is None:
            self.start_time = now
        if self.last_step_time is None:
            self.last_step_time = now

        step = state.global_step
        total = state.max_steps
        pct = (step / total) * 100 if total > 0 else 0
        loss = logs.get("loss", 0)
        lr = logs.get("learning_rate", 0)
        epoch = logs.get("epoch", 0)

        # Time calculations
        elapsed = now - self.start_time
        self.step_times.append(now - self.last_step_time)
        self.last_step_time = now
        steps_per_sec = step / elapsed if elapsed > 0 else 0
        remaining = (total - step) / steps_per_sec if steps_per_sec > 0 else 0

        # GPU stats. The summary string is built inside each branch so the
        # CPU-only path never touches undefined memory figures.
        if torch.cuda.is_available():
            # Query the same device for both numbers so the ratio is meaningful.
            device = torch.cuda.current_device()
            mem_reserved = torch.cuda.memory_reserved(device) / 1e9
            mem_total = torch.cuda.get_device_properties(device).total_memory / 1e9
            gpu_util = (mem_reserved / mem_total) * 100
            gpu_str = f"Mem: {mem_reserved:.0f}/{mem_total:.0f}GB ({gpu_util:.0f}%)"

            compute_util = self._query_compute_util()
            if compute_util >= 0:
                gpu_str += f" | Compute: {compute_util}%"
        else:
            gpu_str = "Mem: n/a"

        print(f"[{pct:5.1f}%] Step {step:>5}/{total} | "
              f"Loss: {loss:.4f} | LR: {lr:.2e} | "
              f"Epoch: {epoch:.2f} | {gpu_str} | "
              f"ETA: {timedelta(seconds=int(remaining))}")

    def on_save(self, args, state, control, **kwargs):
        """Announce that a checkpoint was written."""
        print(f"\nüíæ Checkpoint saved at step {state.global_step}\n")

    def on_train_end(self, args, state, control, **kwargs):
        """Print the closing banner with the total wall-clock time."""
        # Report zero rather than crash if training ended before it began.
        elapsed = time.time() - self.start_time if self.start_time is not None else 0
        print("\n" + "="*80)
        print(f"‚úÖ TRAINING COMPLETE | Total time: {timedelta(seconds=int(elapsed))}")
        print("="*80 + "\n")


# === Configuration ===
CONFIG = {
    # --- model ---
    "model_name": "microsoft/Phi-4-reasoning-plus",
    "max_seq_length": 4096,

    # --- data ---
    "dataset_path": "/kaggle/input/aimath-train/data/sft_dataset.jsonl",
    "text_field": "text",

    # --- training loop ---
    "output_dir": "/kaggle/working/outputs/sft",
    "num_train_epochs": 2,
    # Micro batch 2 x accumulation 16 = 32 sequences per optimizer step per device.
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 16,

    # --- optimizer / schedule ---
    "learning_rate": 2e-5,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.05,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,

    # --- numeric precision ---
    "bf16": True,
    "tf32": True,

    # --- memory ---
    "gradient_checkpointing": True,

    # --- checkpointing and logging cadence (in optimizer steps) ---
    "save_steps": 200,
    "save_total_limit": 3,
    "logging_steps": 5,

    # --- reproducibility ---
    "seed": 42,
}


def save_deepspeed_config(config_path="/kaggle/working/ds_config.json", config=None):
    """Write a DeepSpeed config to disk as JSON and return the file path.

    Args:
        config_path: Destination file (``str`` or ``Path``). Defaults to the
            Kaggle working-directory location this script has always used.
        config: JSON-serialisable mapping to write. When ``None`` the
            module-level ``DEEPSPEED_CONFIG`` is used.

    Returns:
        The destination path as a ``str``, suitable for the ``deepspeed``
        argument of ``TrainingArguments``.
    """
    config_path = Path(config_path)
    payload = DEEPSPEED_CONFIG if config is None else config

    # Create missing parent directories so the write also works outside the
    # pre-made Kaggle working directory.
    config_path.parent.mkdir(parents=True, exist_ok=True)
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)
    print(f"   ‚úÖ DeepSpeed config saved to {config_path}")
    return str(config_path)


def load_model_and_tokenizer():
    """Load the tokenizer and causal-LM named by ``CONFIG["model_name"]``.

    The tokenizer gets a pad token (falling back to EOS) and right-side
    padding. The model is loaded in bfloat16 with the ``flash_attention_2``
    attention implementation, and gradient checkpointing is switched on when
    ``CONFIG["gradient_checkpointing"]`` is truthy.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    rule = "=" * 80
    model_id = CONFIG["model_name"]

    print("\n" + rule)
    print("üì• Loading Model (DeepSpeed ZeRO-3 - Max GPU)")
    print(rule)

    # ---- tokenizer ----
    print("\n   Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # Reuse EOS for padding when the tokenizer ships without a pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    print(f"   ‚úÖ Tokenizer loaded | Vocab size: {len(tokenizer)}")

    # ---- model ----
    # NOTE(review): the captured cell output shows "`torch_dtype` is deprecated!
    # Use `dtype` instead!" — confirm the installed transformers version before
    # renaming the keyword.
    print("\n   Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        attn_implementation="flash_attention_2",
    )

    if CONFIG["gradient_checkpointing"]:
        model.gradient_checkpointing_enable()
        # The KV cache is turned off alongside checkpointing.
        model.config.use_cache = False
        print("   ‚úÖ Gradient checkpointing enabled")

    # ---- parameter statistics (single pass over the parameters) ----
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    print(f"\n   üìä Model Stats:")
    print(f"      Total parameters:     {total_params / 1e9:.2f}B")
    print(f"      Trainable parameters: {trainable_params / 1e9:.2f}B")

    return model, tokenizer


def load_sft_dataset(path: str):
    """Read a JSONL file through ``datasets.load_dataset`` and return its train split.

    Args:
        path: Location of the JSONL file.

    Returns:
        The loaded ``train`` split.
    """
    print(f"\nüìÇ Loading dataset from {path}")
    train_split = load_dataset("json", data_files=path, split="train")
    example_count = len(train_split)
    print(f"   ‚úÖ Loaded {example_count:,} examples")
    return train_split


def create_training_arguments(deepspeed_config_path: str):
    """Build the ``TrainingArguments`` for the DeepSpeed SFT run.

    Creates ``CONFIG["output_dir"]`` if it does not exist yet.

    Args:
        deepspeed_config_path: Path to the DeepSpeed JSON config file.

    Returns:
        A ``TrainingArguments`` instance combining ``CONFIG`` values with the
        fixed settings below.
    """
    checkpoint_dir = Path(CONFIG["output_dir"])
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    # Hyper-parameters forwarded unchanged from CONFIG (key == argument name).
    forwarded_keys = (
        # duration and batch shape
        "num_train_epochs",
        "per_device_train_batch_size",
        "gradient_accumulation_steps",
        # optimizer / schedule
        "learning_rate",
        "lr_scheduler_type",
        "warmup_ratio",
        "weight_decay",
        "max_grad_norm",
        # precision and memory
        "bf16",
        "tf32",
        "gradient_checkpointing",
        # cadence and reproducibility
        "save_steps",
        "save_total_limit",
        "logging_steps",
        "seed",
    )
    from_config = {key: CONFIG[key] for key in forwarded_keys}

    # Settings that are fixed for this script.
    fixed = dict(
        output_dir=str(checkpoint_dir),
        gradient_checkpointing_kwargs={"use_reentrant": False},
        deepspeed=deepspeed_config_path,
        save_strategy="steps",
        logging_first_step=True,
        report_to="none",
        disable_tqdm=False,
        dataloader_num_workers=4,
        dataloader_pin_memory=True,
        dataloader_prefetch_factor=2,
        local_rank=-1,
    )

    return TrainingArguments(**fixed, **from_config)


def print_training_summary(dataset, training_args):
    """Print a human-readable overview of the upcoming training run.

    Step counts are a rough estimate: dataset length floor-divided by the
    per-device effective batch, times the epoch count. The time estimate
    assumes a flat 5 seconds per optimizer step.

    Args:
        dataset: Training dataset; only ``len(dataset)`` is used.
        training_args: Object exposing ``per_device_train_batch_size``,
            ``gradient_accumulation_steps``, ``num_train_epochs`` and
            ``learning_rate``.
    """
    rule = "=" * 80
    seconds_per_step = 5  # rough figure for batch_size=2

    effective_batch = (
        training_args.per_device_train_batch_size
        * training_args.gradient_accumulation_steps
    )
    steps_per_epoch = len(dataset) // effective_batch
    total_steps = steps_per_epoch * training_args.num_train_epochs
    estimated_time = total_steps * seconds_per_step

    report_lines = [
        "\n" + rule,
        "üìã Training Configuration - MAX GPU UTILIZATION",
        rule,
        f"   Model:              {CONFIG['model_name']}",
        f"   Mode:               Full Fine-Tuning + DeepSpeed ZeRO-3",
        f"   Dataset size:       {len(dataset):,}",
        f"   Max seq length:     {CONFIG['max_seq_length']}",
        f"   Epochs:             {training_args.num_train_epochs}",
        f"   Micro batch size:   {training_args.per_device_train_batch_size}",
        f"   Gradient accum:     {training_args.gradient_accumulation_steps}",
        f"   Effective batch:    {effective_batch}",
        f"   Steps per epoch:    {steps_per_epoch:,}",
        f"   Total steps:        {total_steps:,}",
        f"   Learning rate:      {training_args.learning_rate}",
        f"   Precision:          bf16 + tf32",
        f"   Param offload:      None (GPU)",
        f"   Optimizer offload:  CPU",
        f"   Grad checkpointing: {CONFIG['gradient_checkpointing']}",
        f"\n   ‚è±Ô∏è  Estimated time:  {timedelta(seconds=estimated_time)}",
        rule,
    ]
    for line in report_lines:
        print(line)


def _ensure_single_process_dist_env():
    """Emulate a one-process launcher so DeepSpeed can initialise in a notebook.

    DeepSpeed expects the rendezvous variables a distributed launcher would
    export. ``setdefault`` is used so values from a real launcher always win.
    """
    launcher_defaults = {
        "MASTER_ADDR": "localhost",
        "MASTER_PORT": "9994",
        "RANK": "0",
        "LOCAL_RANK": "0",
        "WORLD_SIZE": "1",
    }
    for name, value in launcher_defaults.items():
        os.environ.setdefault(name, value)


def main():
    """Run the full SFT pipeline and return the saved model directory.

    Steps, in order: print a run header, write the DeepSpeed config, load the
    model and tokenizer, load the dataset, build the training arguments, print
    a summary, train with ``SFTTrainer``, then save model and tokenizer under
    ``CONFIG["output_dir"]/final``.

    Returns:
        The final model directory as a ``str``.
    """
    start_time = time.time()

    print("\n" + "="*80)
    print("üéØ Phi-4 Math SFT Training - MAX GPU UTILIZATION")
    print("="*80)
    print(f"   Started at:  {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"   Platform:    Kaggle H100 80GB")
    print(f"   Mode:        Full Fine-Tuning (14B params)")
    print(f"   Strategy:    DeepSpeed ZeRO-3 | Params on GPU | Optimizer on CPU")

    # GPU info
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"   GPU:         {gpu_name} ({gpu_mem:.0f} GB)")

    # Save DeepSpeed config
    print("\n‚öôÔ∏è  Setting up DeepSpeed ZeRO-3...")
    # Must run before TrainingArguments is created: without a launcher the
    # distributed rendezvous variables are otherwise unset inside a notebook.
    _ensure_single_process_dist_env()
    ds_config_path = save_deepspeed_config()

    # Load model
    model, tokenizer = load_model_and_tokenizer()

    # Load dataset
    dataset = load_sft_dataset(CONFIG["dataset_path"])

    # Create training arguments
    training_args = create_training_arguments(ds_config_path)

    # Print summary
    print_training_summary(dataset, training_args)

    # Create trainer
    # NOTE(review): recent TRL releases moved `dataset_text_field`,
    # `max_seq_length` and `packing` onto SFTConfig and renamed `tokenizer` to
    # `processing_class` — confirm these keywords against the installed TRL.
    print("\nüèãÔ∏è Creating SFT Trainer...")
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        args=training_args,
        dataset_text_field=CONFIG["text_field"],
        max_seq_length=CONFIG["max_seq_length"],
        packing=True,
        callbacks=[RealTimeLogger()],
    )

    # Pre-training GPU check
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        mem_alloc = torch.cuda.memory_allocated() / 1e9
        mem_total = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"\nüìä GPU Memory before training: {mem_alloc:.1f} / {mem_total:.0f} GB")

    # Train
    trainer.train()

    # Save final model
    print("\nüíæ Saving final model...")
    final_path = Path(CONFIG["output_dir"]) / "final"
    trainer.save_model(str(final_path))
    tokenizer.save_pretrained(str(final_path))

    # Summary. The elapsed time is measured from the top of main(), so it also
    # includes model and dataset loading.
    elapsed = time.time() - start_time
    print(f"\n‚úÖ Model saved to: {final_path}")
    print(f"   Total training time: {timedelta(seconds=int(elapsed))}")

    return str(final_path)


# Entry point: run the SFT pipeline when the cell/script is executed directly
# and report where the final model was written.
if __name__ == "__main__":
    final_model_path = main()
    print(f"\nüéâ SFT Complete! Model at: {final_model_path}")
    print(f"   Next step: Run GRPO training")

2025-11-30 12:34:21.493193: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764506061.509285    2529 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764506061.514742    2529 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'




üéØ Phi-4 Math SFT Training - MAX GPU UTILIZATION
   Started at:  2025-11-30 12:34:25
   Platform:    Kaggle H100 80GB
   Mode:        Full Fine-Tuning (14B params)
   Strategy:    DeepSpeed ZeRO-3 | Params on GPU | Optimizer on CPU
   GPU:         NVIDIA H100 80GB HBM3 (85 GB)

‚öôÔ∏è  Setting up DeepSpeed ZeRO-3...
   ‚úÖ DeepSpeed config saved to /kaggle/working/ds_config.json

üì• Loading Model (DeepSpeed ZeRO-3 - Max GPU)

   Loading tokenizer...


`torch_dtype` is deprecated! Use `dtype` instead!


   ‚úÖ Tokenizer loaded | Vocab size: 100352

   Loading model...


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

   ‚úÖ Gradient checkpointing enabled

   üìä Model Stats:
      Total parameters:     14.66B
      Trainable parameters: 14.66B

üìÇ Loading dataset from /kaggle/working/data/sft_dataset.jsonl


FileNotFoundError: Unable to find '/kaggle/working/data/sft_dataset.jsonl'