In [1]:
# %%
def run_training():
    """Pre-train a ModernBERT masked-language model with the Hugging Face Trainer.

    The function is self-contained (all imports are local) so it can be handed
    to a multi-process launcher. It reads the ``WORK`` environment variable,
    changes the current directory to it, loads a pre-tokenized dataset and a
    custom tokenizer from below that directory, builds a randomly initialised
    ``ModernBertForMaskedLM`` and trains it with ``Trainer``.

    Side effects:
        * sets ``HF_HOME`` and ``TRITON_HIP_LLD_PATH`` in ``os.environ``;
        * calls ``os.chdir`` to the ``WORK`` directory;
        * writes checkpoints under ``training/Modern/4.6`` (relative to ``WORK``).

    Raises:
        RuntimeError: if the ``WORK`` environment variable is unset or empty.

    Note:
        When the workers are created with the ``fork`` start method, the parent
        process must not have initialised CUDA beforehand; otherwise
        ``Accelerator()`` fails with "Cannot re-initialize CUDA in forked
        subprocess".
    """
    import os

    # --- 1. Setup Paths and Environment ---
    # Fail early with a readable message instead of a TypeError from
    # os.path.join(None, ...).
    WORK_DIR = os.getenv('WORK')
    if not WORK_DIR:
        raise RuntimeError(
            "The 'WORK' environment variable must point to the working directory."
        )
    DATA_FOLDER = os.path.join(WORK_DIR, "data")
    CACHED_DATA_FOLDER = os.path.join(WORK_DIR, "cached_data")

    # Hugging Face libraries resolve their cache location when they are first
    # imported, so the environment has to be prepared BEFORE the imports below.
    # NOTE(review): if the launching process already imported these libraries,
    # the value may still be ignored in forked workers -- exporting HF_HOME in
    # the shell is the most reliable option.
    os.environ['HF_HOME'] = CACHED_DATA_FOLDER
    os.environ['TRITON_HIP_LLD_PATH'] = '/opt/rocm-6.4.1/lib/llvm/bin/ld.lld'
    os.chdir(WORK_DIR)

    from datasets import load_from_disk
    from transformers import (
        ModernBertConfig,
        ModernBertForMaskedLM,
        DataCollatorForLanguageModeling,
        AutoTokenizer,
        Trainer,
        TrainingArguments,
    )
    from accelerate import Accelerator

    # Used here only for rank-aware printing.
    accelerator = Accelerator()
    accelerator.print(f"Working directory: {os.getcwd()}")

    # --- 2. Define Parameters ---
    vocabulary_size = 32_768
    context_size = 512
    # Relative path: resolved against WORK_DIR thanks to the chdir above.
    tokenizer_name = f"tokenizers/custom/{vocabulary_size:_}"
    model_name = "Modern/4.6"

    # --- 3. Load Datasets and Tokenizer ---
    tokenized_datasets_name = os.path.join(
        DATA_FOLDER,
        f"tokenized-for-training/custom/vocab_size:{vocabulary_size:_}/context_size:{context_size}",
    )
    tokenized_datasets = load_from_disk(tokenized_datasets_name)
    training_dataset = tokenized_datasets["train"]

    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_name,
        local_files_only=True,
        cache_dir=CACHED_DATA_FOLDER
    )

    # --- 4. Configure and Initialize Model ---
    # Start from the published base configuration, then override the fields
    # that depend on the custom tokenizer and context length.
    config = ModernBertConfig.from_pretrained(
        "answerdotai/ModernBERT-base",
        reference_compile=False,
        attn_implementation="flash_attention_2",
    )
    config.vocab_size = vocabulary_size
    # Keep the positional limit tied to the context size the data was
    # tokenized with instead of repeating the literal.
    config.max_position_embeddings = context_size
    config.local_attention = 128
    # Special-token ids; presumably they match the custom tokenizer -- verify.
    config.pad_token_id = 0
    config.bos_token_id = 2
    config.cls_token_id = 2
    config.eos_token_id = 3
    config.sep_token_id = 3

    # Built from the config only, i.e. without loading pretrained weights.
    model = ModernBertForMaskedLM(config=config)
    # NOTE: We do NOT call model.to("cuda") or model.half().
    # Device placement and mixed precision are left to the Trainer.

    # --- 5. Setup Collator and Training Arguments ---
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.3
    )

    training_args = TrainingArguments(
        output_dir=f'training/{model_name}',
        overwrite_output_dir=True,
        max_steps=500_000,
        per_device_train_batch_size=256,   # per GPU, not global
        gradient_accumulation_steps=1,
        dataloader_num_workers=64,         # data-loading workers per process
        logging_strategy="steps",
        logging_first_step=True,
        logging_steps=1_000,
        save_strategy="steps",
        save_steps=1_000,
        save_total_limit=5,
        fp16=True,                         # enable mixed precision
    )

    # --- 6. Initialize and Run Trainer ---
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=training_dataset,
        data_collator=data_collator,
    )

    accelerator.print("Starting training on all available GPUs...")
    trainer.train()
    accelerator.print("Training complete!")

In [2]:
# %%
from accelerate import notebook_launcher

# Start 4 worker processes (one per GPU), each executing `run_training`.
# The workers are forked from this kernel, and the call blocks this cell
# until all of them have exited.
notebook_launcher(
    function=run_training,
    args=(),
    num_processes=4,
)

Launching training on 4 CUDAs.


  backends.update(_get_backends("networkx.backends"))
  backends.update(_get_backends("networkx.backends"))
  backends.update(_get_backends("networkx.backends"))
  backends.update(_get_backends("networkx.backends"))
W0926 20:30:24.722184 1573209 torch/multiprocessing/spawn.py:175] Terminating process 1573303 via signal SIGTERM
W0926 20:30:24.724163 1573209 torch/multiprocessing/spawn.py:175] Terminating process 1573305 via signal SIGTERM
E0926 20:30:24.819444 1573209 torch/distributed/elastic/multiprocessing/api.py:742] failed (exitcode: 1) local_rank: 0 (pid: 1573301) of fn: run_training (start_method: fork)
E0926 20:30:24.819444 1573209 torch/distributed/elastic/multiprocessing/api.py:742] Traceback (most recent call last):
E0926 20:30:24.819444 1573209 torch/distributed/elastic/multiprocessing/api.py:742]   File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 697, in _poll
E0926 20:30:24.819444 1573209 torch/dis

ChildFailedError: 
============================================================
run_training FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-09-26_20:30:24
  host      : k005-009.hpcfund
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 1573301)
  error_file: /tmp/torchelastic_eenny7rl/none_v2l0bsfr/attempt_0/0/error.json
  traceback : Traceback (most recent call last):
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
      return f(*args, **kwargs)
    File "/tmp/ipykernel_1573209/2636644470.py", line 17, in run_training
      accelerator = Accelerator()
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/accelerate/accelerator.py", line 462, in __init__
      self.state = AcceleratorState(
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/accelerate/state.py", line 912, in __init__
      PartialState(cpu, **kwargs)
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/accelerate/state.py", line 301, in __init__
      self.set_device()
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/accelerate/state.py", line 838, in set_device
      device_module.set_device(self.device)
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/torch/cuda/__init__.py", line 567, in set_device
      torch._C._cuda_setDevice(device)
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/torch/cuda/__init__.py", line 398, in _lazy_init
      raise RuntimeError(
  RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
  
============================================================