In [1]:
#!pip install wandb datasets deepspeed
#!pip install transformers
#!pip install accelerate>=0.26.0

# אחרי העדכון, עשה restart kernel:
#import IPython
#IPython.Application.instance().kernel.do_shutdown(True)

In [2]:
# Cell 1: Imports and Setup
import os
import json
import torch
import wandb
import time
import logging
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    set_seed,
    TrainerCallback
)
from datasets import load_from_disk, load_dataset, Dataset
import deepspeed
from datetime import datetime

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("✅ All imports successful")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")

[2025-07-10 11:05:19,776] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/ec2-user/anaconda3/envs/pytorch_p310/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/ec2-user/anaconda3/envs/pytorch_p310/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


[2025-07-10 11:05:21,350] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False
✅ All imports successful
PyTorch version: 2.2.2
CUDA available: True
GPU count: 8
  GPU 0: Tesla V100-SXM2-16GB
  GPU 1: Tesla V100-SXM2-16GB
  GPU 2: Tesla V100-SXM2-16GB
  GPU 3: Tesla V100-SXM2-16GB
  GPU 4: Tesla V100-SXM2-16GB
  GPU 5: Tesla V100-SXM2-16GB
  GPU 6: Tesla V100-SXM2-16GB
  GPU 7: Tesla V100-SXM2-16GB


In [3]:
# Cell 2: Mock SageMaker Environment Variables
# משתני סביבה שמדמים את SageMaker
os.environ.update({
    'SM_MODEL_DIR': '/tmp/ml/model',
    'SM_OUTPUT_DATA_DIR': '/tmp/ml/output/data',
    'SM_CHANNEL_TRAINING': '/tmp/ml/input/data/training',
    'SM_NUM_GPUS': str(torch.cuda.device_count()),
    'SM_CURRENT_HOST': 'algo-1',
    'SM_HOSTS': '["algo-1"]',
    'SM_CURRENT_INSTANCE_TYPE': 'ml.p4de.24xlarge'
})

# Create directories
os.makedirs('/tmp/ml/model', exist_ok=True)
os.makedirs('/tmp/ml/output/data', exist_ok=True)
os.makedirs('/tmp/ml/input/data/training', exist_ok=True)

print("✅ SageMaker environment variables set:")
for key, value in os.environ.items():
    if key.startswith('SM_'):
        print(f"  {key} = {value}")

✅ SageMaker environment variables set:
  SM_MODEL_DIR = /tmp/ml/model
  SM_OUTPUT_DATA_DIR = /tmp/ml/output/data
  SM_CHANNEL_TRAINING = /tmp/ml/input/data/training
  SM_NUM_GPUS = 8
  SM_CURRENT_HOST = algo-1
  SM_HOSTS = ["algo-1"]
  SM_CURRENT_INSTANCE_TYPE = ml.p4de.24xlarge


In [4]:
# Cell 3: Configuration Variables (adjust these!)
# פרמטרים של האימון - ערוך אלה לפי הצורך
INSTANCE_TYPE = 'ml.p4de.24xlarge'
MODEL_NAME = 'microsoft/DialoGPT-small'  # מודל קטן לדיבוג מהיר
EPOCHS = 1
MAX_STEPS = 5  # מספר צעדים קטן לדיבוג
SEED = 42
MAX_SEQ_LENGTH = 512  # קצר יותר לדיבוג
WANDB_PROJECT = 'local-debug-test'
WANDB_ENTITY = None
WANDB_RUN_NAME = f'local-debug-{datetime.now().strftime("%Y%m%d_%H%M%S")}'

print("✅ Configuration:")
print(f"  Instance Type: {INSTANCE_TYPE}")
print(f"  Model: {MODEL_NAME}")
print(f"  Epochs: {EPOCHS}")
print(f"  Max Steps: {MAX_STEPS}")
print(f"  Sequence Length: {MAX_SEQ_LENGTH}")

✅ Configuration:
  Instance Type: ml.p4de.24xlarge
  Model: microsoft/DialoGPT-small
  Epochs: 1
  Max Steps: 5
  Sequence Length: 512


In [5]:
# Cell 4: Load Instance Configuration
def load_instance_config(instance_type):
    """Load instance-specific configuration - exactly from train.py"""
    default_configs = {
        'ml.p4d.24xlarge': {
            "batch_size_per_gpu": 2,
            "gradient_accumulation_steps": 1,
            "deepspeed_config": "configs/deepspeed/p4d_deepspeed_config.json",
            "gpu_count": 8,
            "gpu_type": "A100",
            "gpu_memory": "40GB",
            "estimated_hourly_cost": 32.77
        },
        'ml.p4de.24xlarge': {
            "batch_size_per_gpu": 4,
            "gradient_accumulation_steps": 1,
            "deepspeed_config": "configs/deepspeed/p4de_deepspeed_config.json",
            "gpu_count": 8,
            "gpu_type": "A100",
            "gpu_memory": "80GB",
            "estimated_hourly_cost": 40.96
        },
        'ml.p5.48xlarge': {
            "batch_size_per_gpu": 6,
            "gradient_accumulation_steps": 1,
            "deepspeed_config": "configs/deepspeed/p5_deepspeed_config.json",
            "gpu_count": 8,
            "gpu_type": "H100",
            "gpu_memory": "80GB",
            "estimated_hourly_cost": 98.32
        }
    }
    
    if instance_type in default_configs:
        return default_configs[instance_type]
    else:
        return default_configs['ml.p4d.24xlarge']

# Load instance config
instance_config = load_instance_config(INSTANCE_TYPE)
print("✅ Instance config loaded:")
print(json.dumps(instance_config, indent=2))

✅ Instance config loaded:
{
  "batch_size_per_gpu": 4,
  "gradient_accumulation_steps": 1,
  "deepspeed_config": "configs/deepspeed/p4de_deepspeed_config.json",
  "gpu_count": 8,
  "gpu_type": "A100",
  "gpu_memory": "80GB",
  "estimated_hourly_cost": 40.96
}


In [6]:
# Cell 5: Create DeepSpeed Configuration (FINAL VERSION)
def create_deepspeed_config():
    """Create minimal DeepSpeed config"""
    config = {
        "fp16": {"enabled": True},
        "zero_optimization": {
            "stage": 2,
            "overlap_comm": True,
            "contiguous_gradients": True,
            "reduce_bucket_size": 500000000
        },
        "train_micro_batch_size_per_gpu": 2,
        "gradient_accumulation_steps": 1,
        "gradient_clipping": 1.0,
        "wall_clock_breakdown": False  # ← השבת timers
    }
    
    os.makedirs("./configs/deepspeed", exist_ok=True)
    config_path = "./configs/deepspeed/local_deepspeed_config.json"
    
    with open(config_path, 'w') as f:
        json.dump(config, f, indent=2)
    
    return config_path

In [7]:
# Cell 6: Setup W&B
# Initialize Weights & Biases

wandb.init(
    project=WANDB_PROJECT,
    entity=WANDB_ENTITY,
    name=WANDB_RUN_NAME,
    config={
        "instance_type": INSTANCE_TYPE,
        "gpu_count": instance_config.get("gpu_count", 8),
        "gpu_type": instance_config.get("gpu_type", "A100"),
        "gpu_memory": instance_config.get("gpu_memory", "40GB"),
        "estimated_hourly_cost": instance_config.get('estimated_hourly_cost', 32.77),
        "model_name": MODEL_NAME,
        "epochs": EPOCHS,
        "max_seq_length": MAX_SEQ_LENGTH,
        "seed": SEED,
        "batch_size_per_gpu": instance_config.get("batch_size_per_gpu", 2),
        "gradient_accumulation_steps": instance_config.get("gradient_accumulation_steps", 1),
        "sagemaker_job": False,  # Local run
        "num_gpus": torch.cuda.device_count(),
        "benchmark_mode": True,
        "start_time": datetime.now().isoformat()
    }
)

print(f"✅ W&B initialized: {WANDB_RUN_NAME}")

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdnrevital[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


✅ W&B initialized: local-debug-20250710_110521


In [8]:
# Cell 7: Create Dummy Dataset
def create_dummy_dataset(num_samples=100):
    """Create dummy dataset for testing"""
    # Create simple Hebrew-like text samples
    texts = [
        f"זהו טקסט דוגמא מספר {i}. " * 20  # Repeat to get decent length
        for i in range(num_samples)
    ]
    
    # Create dataset
    dataset = Dataset.from_dict({"text": texts})
    
    # Also save to the SageMaker training directory
    data_dir = '/tmp/ml/input/data/training'
    with open(f'{data_dir}/train.jsonl', 'w', encoding='utf-8') as f:
        for text in texts:
            f.write(json.dumps({"text": text}, ensure_ascii=False) + '\n')
    
    return dataset

# Create dummy dataset
dummy_dataset = create_dummy_dataset(50)  # Small for quick testing
print(f"✅ Created dummy dataset with {len(dummy_dataset)} samples")
print(f"Sample text: {dummy_dataset[0]['text'][:100]}...")

✅ Created dummy dataset with 50 samples
Sample text: זהו טקסט דוגמא מספר 0. זהו טקסט דוגמא מספר 0. זהו טקסט דוגמא מספר 0. זהו טקסט דוגמא מספר 0. זהו טקסט...


In [9]:
# Cell 8: Load Model and Tokenizer
set_seed(SEED)

print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Loading model: {MODEL_NAME}")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True,
    use_cache=False,
    low_cpu_mem_usage=True
)

# Move model to GPU exactly as in train.py
if torch.cuda.is_available():
    model = model.to("cuda:0")  # Move to specific GPU
    print(f"✅ Model moved to CUDA device 0. Available GPUs: {torch.cuda.device_count()}")
else:
    print("ℹ️ CUDA not available, using CPU")

# Enable gradient checkpointing
model.gradient_checkpointing_enable()
print("✅ Gradient checkpointing enabled")

Loading tokenizer: microsoft/DialoGPT-small
Loading model: microsoft/DialoGPT-small
✅ Model moved to CUDA device 0. Available GPUs: 8
✅ Gradient checkpointing enabled


In [10]:
# Cell 9: Prepare Dataset
def tokenize_function(examples):
    """Tokenization function exactly from train.py"""
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        return_tensors="pt"
    )

print("Tokenizing dataset...")
tokenized_dataset = dummy_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=2,  # Less processes for local debugging
    remove_columns=dummy_dataset.column_names
)

print(f"✅ Dataset tokenized. Samples: {len(tokenized_dataset)}")

Tokenizing dataset...


Map (num_proc=2):   0%|          | 0/50 [00:00<?, ? examples/s]

✅ Dataset tokenized. Samples: 50


In [11]:
# Cell 10: Create Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
print("✅ Data collator created")

✅ Data collator created


In [12]:
# Cell 11: SageMaker Metrics Callback
class SageMakerMetricsCallback(TrainerCallback):
    """Custom callback - exactly from train.py"""
    
    def __init__(self, instance_type, max_seq_length=2048):
        super().__init__()
        self.instance_type = instance_type
        self.max_seq_length = max_seq_length
        self.total_tokens = 0
        self.steps = 0
        self.start_time = time.time()
        self.step_times = []
        
    def on_step_end(self, args, state, control, **kwargs):
        """Track performance metrics for each step."""
        current_time = time.time()
        step_time = current_time - getattr(self, 'last_step_time', current_time)
        self.last_step_time = current_time
        self.step_times.append(step_time)
        
        # Calculate tokens processed in this step
        total_batch_size = args.per_device_train_batch_size * args.world_size
        if args.gradient_accumulation_steps > 0:
            total_batch_size *= args.gradient_accumulation_steps
            
        step_tokens = total_batch_size * self.max_seq_length
        self.total_tokens += step_tokens
        self.steps += 1
        
        # Calculate throughput
        if len(self.step_times) > 0:
            avg_step_time = sum(self.step_times[-10:]) / min(len(self.step_times), 10)
            tokens_per_second = step_tokens / avg_step_time if avg_step_time > 0 else 0
        else:
            tokens_per_second = 0
        
        # Log metrics to W&B
        metrics = {
            "training/tokens": self.total_tokens,
            "training/step_num": self.steps,
            "training/tokens_per_second": tokens_per_second,
            "training/step_time": step_time,
            "training/instance_type": self.instance_type,
            "training/global_step": state.global_step
        }
        
        # Add GPU metrics if available
        if torch.cuda.is_available():
            for i in range(torch.cuda.device_count()):
                metrics[f"gpu_{i}/memory_allocated_gb"] = torch.cuda.memory_allocated(i) / (1024**3)
                metrics[f"gpu_{i}/memory_reserved_gb"] = torch.cuda.memory_reserved(i) / (1024**3)
        
        wandb.log(metrics, step=state.global_step)
    
    def on_log(self, args, state, control, logs=None, **kwargs):
        """Log training metrics to W&B."""
        if logs is None:
            return
        
        # Log loss and other training metrics
        wandb_logs = {}
        for key, value in logs.items():
            if key.startswith(('train_', 'eval_')):
                wandb_logs[f"training/{key}"] = value
            elif key == 'loss':
                wandb_logs["training/loss"] = value
                wandb_logs["training/perplexity"] = torch.exp(torch.tensor(value)).item()
        
        if wandb_logs:
            wandb.log(wandb_logs, step=state.global_step)
        
        # Log final statistics
        if "train_runtime" in logs:
            total_time = time.time() - self.start_time
            wandb.log({
                "training/total_tokens": self.total_tokens,
                "training/total_time_hours": total_time / 3600,
                "training/avg_tokens_per_second": self.total_tokens / total_time,
                "training/final_step": self.steps
            }, step=state.global_step)

# Initialize callback
metrics_callback = SageMakerMetricsCallback(
    instance_type=INSTANCE_TYPE,
    max_seq_length=MAX_SEQ_LENGTH
)
print("✅ Metrics callback created")

✅ Metrics callback created


In [13]:
# Cell 12: Create Training Arguments (FIXED)

# ודא שיש DeepSpeed config
if 'deepspeed_config_path' not in globals():
    # אם אין, צור אותו עכשיו
    os.makedirs("./configs/deepspeed", exist_ok=True)
    deepspeed_config_path = "./configs/deepspeed/local_deepspeed_config.json"
    
    config = {
        "fp16": {"enabled": True},
        "zero_optimization": {
            "stage": 2,
            "overlap_comm": True,
            "contiguous_gradients": True,
            "reduce_bucket_size": 500000000
        },
        "train_micro_batch_size_per_gpu": 2,
        "gradient_accumulation_steps": 1,
        "gradient_clipping": 1.0,
        "wall_clock_breakdown": False
    }
    
    with open(deepspeed_config_path, 'w') as f:
        json.dump(config, f, indent=2)
    
    print(f"✅ Created DeepSpeed config: {deepspeed_config_path}")

training_args_dict = {
    "output_dir": '/tmp/ml/model',
    "fp16": True,
    "gradient_accumulation_steps": 1,
    "per_device_train_batch_size": 2,
    "learning_rate": 1e-5,
    "weight_decay": 0.01,
    "num_train_epochs": EPOCHS,
    "save_steps": 500,
    "save_total_limit": 3,
    "logging_steps": 1,
    "max_grad_norm": 1.0,
    "warmup_ratio": 0.03,
    "warmup_steps": 100,
    "deepspeed": deepspeed_config_path,
    "report_to": [],
    "remove_unused_columns": False,
    "dataloader_num_workers": 0,
    "gradient_checkpointing": True,
    "ddp_backend": None,  # ← הסר NCCL לlocal
    "dataloader_pin_memory": False,
    "ddp_find_unused_parameters": False,
}

# Add max_steps if specified
if MAX_STEPS and MAX_STEPS > 0:
    training_args_dict["max_steps"] = MAX_STEPS

training_args = TrainingArguments(**training_args_dict)
print("✅ Training arguments created")
print(f"Warmup steps: {training_args.warmup_steps}")
print(f"DeepSpeed config: {training_args.deepspeed}")

✅ Created DeepSpeed config: ./configs/deepspeed/local_deepspeed_config.json
✅ Training arguments created
Warmup steps: 100
DeepSpeed config: ./configs/deepspeed/local_deepspeed_config.json


In [14]:
# Cell 13: Create Trainer
print("Creating trainer...")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=None,  # No eval for debugging
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[metrics_callback]
)

print("✅ Trainer created successfully")

Creating trainer...
✅ Trainer created successfully


  trainer = Trainer(


In [15]:
'''
# Cell 13.5: Create Trainer (NEW CELL - אחרי Cell 13)
print("Creating trainer...")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=None,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[metrics_callback]
)

print("✅ Trainer created successfully")

# 🔧 תקן מיד אחרי יצירת Trainer
print("=== Immediate DeepSpeed Fix ===")

if hasattr(trainer.model, 'backward'):  # זה DeepSpeed engine
    print(f"Trainer model is DeepSpeed engine: {type(trainer.model)}")
    
    # יצור wrapper חדש
    class DeepSpeedEngineWrapper:
        def __init__(self, engine):
            self.engine = engine
            self._engine = engine
            
        def __getattr__(self, name):
            return getattr(self._engine, name)
            
        def destroy(self):
            pass
    
    # החלף את accelerator engine
    wrapper = DeepSpeedEngineWrapper(trainer.model)
    trainer.accelerator.deepspeed_engine_wrapped = wrapper
    
    print(f"✅ Fixed accelerator: {trainer.accelerator.deepspeed_engine_wrapped}")
    print(f"✅ Wrapper engine: {trainer.accelerator.deepspeed_engine_wrapped.engine}")
else:
    print("❌ Trainer model is not DeepSpeed engine")
'''

'\n# Cell 13.5: Create Trainer (NEW CELL - אחרי Cell 13)\nprint("Creating trainer...")\n\ntrainer = Trainer(\n    model=model,\n    args=training_args,\n    train_dataset=tokenized_dataset,\n    eval_dataset=None,\n    data_collator=data_collator,\n    tokenizer=tokenizer,\n    callbacks=[metrics_callback]\n)\n\nprint("✅ Trainer created successfully")\n\n# 🔧 תקן מיד אחרי יצירת Trainer\nprint("=== Immediate DeepSpeed Fix ===")\n\nif hasattr(trainer.model, \'backward\'):  # זה DeepSpeed engine\n    print(f"Trainer model is DeepSpeed engine: {type(trainer.model)}")\n    \n    # יצור wrapper חדש\n    class DeepSpeedEngineWrapper:\n        def __init__(self, engine):\n            self.engine = engine\n            self._engine = engine\n            \n        def __getattr__(self, name):\n            return getattr(self._engine, name)\n            \n        def destroy(self):\n            pass\n    \n    # החלף את accelerator engine\n    wrapper = DeepSpeedEngineWrapper(trainer.model)\n    tr

In [16]:
'''
# Cell 14: Debug Trainer and DeepSpeed
print("=== Trainer Debug Info ===")

# Check if trainer has accelerator
if hasattr(trainer, 'accelerator'):
    print(f"✅ Trainer has accelerator: {trainer.accelerator}")
    print(f"Accelerator device: {trainer.accelerator.device}")
    print(f"Distributed type: {trainer.accelerator.distributed_type}")
    
    # Check DeepSpeed engine
    if hasattr(trainer.accelerator, 'deepspeed_engine_wrapped'):
        engine = trainer.accelerator.deepspeed_engine_wrapped
        if engine is not None:
            print(f"✅ DeepSpeed engine initialized: {type(engine)}")
        else:
            print("❌ DeepSpeed engine is None!")
    else:
        print("❌ No deepspeed_engine_wrapped attribute")
else:
    print("❌ Trainer has no accelerator attribute")

# Check distributed status
if torch.distributed.is_available() and torch.distributed.is_initialized():
    print(f"✅ Distributed initialized - rank: {torch.distributed.get_rank()}")
else:
    print("ℹ️ Distributed not initialized (might be OK for single node)")

print("=" * 50)
'''

'\n# Cell 14: Debug Trainer and DeepSpeed\nprint("=== Trainer Debug Info ===")\n\n# Check if trainer has accelerator\nif hasattr(trainer, \'accelerator\'):\n    print(f"✅ Trainer has accelerator: {trainer.accelerator}")\n    print(f"Accelerator device: {trainer.accelerator.device}")\n    print(f"Distributed type: {trainer.accelerator.distributed_type}")\n    \n    # Check DeepSpeed engine\n    if hasattr(trainer.accelerator, \'deepspeed_engine_wrapped\'):\n        engine = trainer.accelerator.deepspeed_engine_wrapped\n        if engine is not None:\n            print(f"✅ DeepSpeed engine initialized: {type(engine)}")\n        else:\n            print("❌ DeepSpeed engine is None!")\n    else:\n        print("❌ No deepspeed_engine_wrapped attribute")\nelse:\n    print("❌ Trainer has no accelerator attribute")\n\n# Check distributed status\nif torch.distributed.is_available() and torch.distributed.is_initialized():\n    print(f"✅ Distributed initialized - rank: {torch.distributed.get_rank

In [17]:
'''
# Cell 15: Test Single Training Step (FIXED)
print("=== Testing Single Training Step ===")

try:
    # Get a sample batch
    train_dataloader = trainer.get_train_dataloader()
    sample_batch = next(iter(train_dataloader))
    
    print(f"✅ Got sample batch with keys: {sample_batch.keys()}")
    print(f"Batch size: {sample_batch['input_ids'].shape}")
    
    # Test forward pass
    model.train()
    with torch.no_grad():
        outputs = trainer.model(**sample_batch)  # Use trainer.model (DeepSpeed engine)
        print(f"✅ Forward pass successful, loss: {outputs.loss.item():.4f}")
    
    # Test backward pass - FIXED!
    print("Testing backward pass...")
    outputs = trainer.model(**sample_batch)
    loss = outputs.loss
    
    # 🔧 FIX: Call DeepSpeed backward directly, not through accelerator
    print("Calling DeepSpeed backward directly...")
    trainer.model.backward(loss)  # DeepSpeed engine backward
    trainer.model.step()          # DeepSpeed engine step
    
    print("✅ Backward pass successful!")
    
except Exception as e:
    print(f"❌ Error in training step: {e}")
    import traceback
    traceback.print_exc()
'''

'\n# Cell 15: Test Single Training Step (FIXED)\nprint("=== Testing Single Training Step ===")\n\ntry:\n    # Get a sample batch\n    train_dataloader = trainer.get_train_dataloader()\n    sample_batch = next(iter(train_dataloader))\n    \n    print(f"✅ Got sample batch with keys: {sample_batch.keys()}")\n    print(f"Batch size: {sample_batch[\'input_ids\'].shape}")\n    \n    # Test forward pass\n    model.train()\n    with torch.no_grad():\n        outputs = trainer.model(**sample_batch)  # Use trainer.model (DeepSpeed engine)\n        print(f"✅ Forward pass successful, loss: {outputs.loss.item():.4f}")\n    \n    # Test backward pass - FIXED!\n    print("Testing backward pass...")\n    outputs = trainer.model(**sample_batch)\n    loss = outputs.loss\n    \n    # 🔧 FIX: Call DeepSpeed backward directly, not through accelerator\n    print("Calling DeepSpeed backward directly...")\n    trainer.model.backward(loss)  # DeepSpeed engine backward\n    trainer.model.step()          # DeepSp

In [18]:
# Cell 16: Manual DeepSpeed Training
print("=== Manual DeepSpeed Training ===")

# בדוק מצב נוכחי
print(f"Current model type: {type(trainer.model)}")
print(f"DeepSpeed config exists: {os.path.exists(deepspeed_config_path)}")

if os.path.exists(deepspeed_config_path):
    with open(deepspeed_config_path, 'r') as f:
        ds_config = json.load(f)
    print(f"Config keys: {list(ds_config.keys())}")
    
    # אתחל DeepSpeed ידנית
    print("Initializing DeepSpeed manually...")
    
    try:
        import deepspeed
        from torch.optim import AdamW
        
        # צור optimizer
        optimizer = AdamW(model.parameters(), lr=1e-5)
        
        # אתחל DeepSpeed
        model_engine, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,  # המודל המקורי
            optimizer=optimizer,
            config=ds_config
        )
        
        print(f"✅ DeepSpeed engine created: {type(model_engine)}")
        
        # Manual training loop
        train_dataloader = trainer.get_train_dataloader()
        
        model_engine.train()
        for epoch in range(EPOCHS):
            for step, batch in enumerate(train_dataloader):
                if MAX_STEPS and step >= MAX_STEPS:
                    break
                
                # Forward
                outputs = model_engine(**batch)
                loss = outputs.loss
                
                # Backward & Step
                model_engine.backward(loss)
                model_engine.step()
                
                print(f"Step {step}: Loss = {loss.item():.4f}")
                
                # Log to W&B
                if 'WANDB_API_KEY' in os.environ:
                    wandb.log({"loss": loss.item(), "step": step})
        
        print("✅ Manual DeepSpeed training completed!")
        
    except Exception as e:
        print(f"❌ Manual DeepSpeed failed: {e}")
        import traceback
        traceback.print_exc()
        
        # Fallback to regular PyTorch
        print("Falling back to regular PyTorch training...")
        
        optimizer = AdamW(model.parameters(), lr=1e-5)
        
        model.train()
        train_dataloader = trainer.get_train_dataloader()
        
        for epoch in range(EPOCHS):
            for step, batch in enumerate(train_dataloader):
                if MAX_STEPS and step >= MAX_STEPS:
                    break
                
                # Forward
                outputs = model(**batch)
                loss = outputs.loss
                
                # Backward
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                
                print(f"Step {step}: Loss = {loss.item():.4f}")
                
                if 'WANDB_API_KEY' in os.environ:
                    wandb.log({"loss": loss.item(), "step": step})
        
        print("✅ Regular PyTorch training completed!")

else:
    print("❌ No DeepSpeed config found")

=== Manual DeepSpeed Training ===
Current model type: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
DeepSpeed config exists: True
Config keys: ['fp16', 'zero_optimization', 'train_micro_batch_size_per_gpu', 'gradient_accumulation_steps', 'gradient_clipping', 'wall_clock_breakdown']
Initializing DeepSpeed manually...
[2025-07-10 11:05:24,177] [INFO] [logging.py:107:log_dist] [Rank -1] DeepSpeed info: version=0.17.2, git-hash=unknown, git-branch=unknown
[2025-07-10 11:05:24,178] [INFO] [comm.py:676:init_distributed] cdb=None
[2025-07-10 11:05:24,179] [INFO] [comm.py:691:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2025-07-10 11:05:24,369] [INFO] [comm.py:746:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.16.160.247, master_port=29500
[2025-07-10 11:05:24,371] [INFO] [comm.py:707:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2025-07

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


[2025-07-10 11:05:26,542] [INFO] [loss_scaler.py:191:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1
Step 0: Loss = 40.6140
[2025-07-10 11:05:26,805] [INFO] [loss_scaler.py:184:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768
Step 1: Loss = 40.8954
[2025-07-10 11:05:27,057] [INFO] [loss_scaler.py:184:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384
Step 2: Loss = 40.8762
[2025-07-10 11:05:27,152] [INFO] [loss_scaler.py:184:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192
Step 3: Loss = 40.6896
✅ Manual DeepSpeed training completed!


In [19]:
# Cell 17: Save Model (if successful)
if training_successful:
    print("Saving model...")
    trainer.save_model('/tmp/ml/model')
    tokenizer.save_pretrained('/tmp/ml/model')
    
    # Save training metrics
    metrics_file = '/tmp/ml/output/data/training_metrics.json'
    
    with open(metrics_file, 'w') as f:
        json.dump({
            "instance_type": INSTANCE_TYPE,
            "total_time_hours": total_time / 3600,
            "total_tokens": metrics_callback.total_tokens,
            "avg_tokens_per_second": metrics_callback.total_tokens / total_time if total_time > 0 else 0,
            "estimated_cost": instance_config.get('estimated_hourly_cost', 32.77) * (total_time / 3600),
            "training_successful": training_successful
        }, f, indent=2)
    
    print(f"✅ Model and metrics saved")

else:
    print("❌ Training failed - no model saved")

NameError: name 'training_successful' is not defined

In [None]:
# Cell 18: Cleanup and Summary
print("\n" + "="*60)
print("🎯 TRAINING SUMMARY")
print("="*60)

print(f"Model: {MODEL_NAME}")
print(f"Instance Type: {INSTANCE_TYPE}")
print(f"Training Successful: {training_successful}")
print(f"Total Time: {total_time:.2f} seconds")
print(f"Total Steps: {metrics_callback.steps}")
print(f"Total Tokens: {metrics_callback.total_tokens:,}")

if training_successful:
    print("✅ Training completed successfully!")
    print("Model saved to: /tmp/ml/model")
    print("Metrics saved to: /tmp/ml/output/data/training_metrics.json")
else:
    print("❌ Training failed - check error messages above")

# Finish W&B
wandb.finish()
print("✅ W&B run finished")