# Model Training


In [1]:
!hostname

k005-009.hpcfund


In [2]:
!rocm-smi



Device  Node  IDs              Temp    Power  Partitions          SCLK    MCLK     Fan  Perf  PwrCap  VRAM%  GPU%  
[3m              (DID,     GUID)  (Edge)  (Avg)  (Mem, Compute, ID)                                                   [0m
0       5     0x740f,   36740  36.0°C  42.0W  N/A, N/A, 0         800Mhz  1600Mhz  0%   auto  300.0W  0%     0%    
1       4     0x740f,   22429  41.0°C  43.0W  N/A, N/A, 0         800Mhz  1600Mhz  0%   auto  300.0W  0%     0%    
2       3     0x740f,   32693  39.0°C  43.0W  N/A, N/A, 0         800Mhz  1600Mhz  0%   auto  300.0W  0%     0%    
3       2     0x740f,   42924  42.0°C  40.0W  N/A, N/A, 0         800Mhz  1600Mhz  0%   auto  300.0W  0%     0%    


In [3]:
import os

WORK_DIR = os.getenv('WORK')

DATA_FOLDER = os.path.join(WORK_DIR, "data")

CACHED_DATA_FOLDER = os.path.join(WORK_DIR, "cached_data")

# Salvamos o path do Cache par ao HuggingFace
os.environ['HF_HOME'] = CACHED_DATA_FOLDER
os.environ['TRITON_HIP_LLD_PATH'] = '/opt/rocm-6.4.1/lib/llvm/bin/ld.lld'


CACHED_DATA_FOLDER

'/work1/lgarcia/renneruan/cached_data'

In [4]:
os.chdir(WORK_DIR)
print(os.getcwd())

/work1/lgarcia/renneruan


In [5]:
# !pip install triton

In [6]:
import torch
print(torch.__version__)

2.9.0.dev20250825+rocm6.4


In [7]:
!pip list

Package                   Version
------------------------- --------------------------
accelerate                1.10.1
aiohappyeyeballs          2.6.1
aiohttp                   3.12.15
aiosignal                 1.4.0
anyio                     4.11.0
argon2-cffi               25.1.0
argon2-cffi-bindings      25.1.0
arrow                     1.3.0
asttokens                 3.0.0
async-lru                 2.0.5
async-timeout             5.0.1
attrs                     25.3.0
babel                     2.17.0
beautifulsoup4            4.13.5
bleach                    6.2.0
certifi                   2025.8.3
cffi                      2.0.0
charset-normalizer        3.4.3
click                     8.1.8
comm                      0.2.3
contourpy                 1.3.0
cycler                    0.12.1
datasets                  4.1.1
debugpy                   1.8.17
decorator                 5.2.1
defusedxml                0.7.1
dill                      0.4.0
einops                    0.8.1
exc

In [8]:
# !pip install einops

In [9]:
from datasets import load_from_disk

from transformers import ModernBertConfig
from transformers import ModernBertForMaskedLM
from transformers import DataCollatorForLanguageModeling


from transformers import AutoTokenizer

from transformers import Trainer, TrainingArguments
from transformers import get_wsd_schedule
from torch.optim import AdamW


  backends.update(_get_backends("networkx.backends"))


In [10]:
import torch
print(torch.__version__)

2.9.0.dev20250825+rocm6.4


In [11]:
vocabulary_size = 32_768
context_size = 512
tokenizer_name = f"tokenizers/custom/{vocabulary_size:_}"

In [12]:
tokenized_datasets_name = os.path.join(DATA_FOLDER, f"tokenized-for-training/custom/vocab_size:{vocabulary_size:_}/context_size:{context_size}")

tokenized_datasets = load_from_disk(tokenized_datasets_name)

print(tokenized_datasets["train"].features, '\n')

tokenized_datasets

Loading dataset from disk:   0%|          | 0/409 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/46 [00:00<?, ?it/s]

{'input_ids': List(Value('int32')), 'token_type_ids': List(Value('int8')), 'attention_mask': List(Value('int8')), 'special_tokens_mask': List(Value('int8'))} 



DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 56726693
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 6302966
    })
})

In [13]:
training_dataset = tokenized_datasets["train"] #.select([i for i in range( int(1 * len(tokenized_datasets["train"])) )])

training_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 56726693
})

In [14]:
evaluation_dataset = tokenized_datasets["test"] #.select([i for i in range( int(1 * len(tokenized_datasets["test"])) )])

evaluation_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 6302966
})

In [15]:
training_dataset.set_format(type="pt", columns=['input_ids', 'attention_mask', 'special_tokens_mask'])

evaluation_dataset.set_format(type="pt", columns=['input_ids', 'attention_mask', 'special_tokens_mask'])

training_dataset, evaluation_dataset

(Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
     num_rows: 56726693
 }),
 Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
     num_rows: 6302966
 }))

In [16]:
model_name = f"Modern/{4.6}"

In [17]:
config = ModernBertConfig.from_pretrained(
    "answerdotai/ModernBERT-base",
      reference_compile=False,
      attn_implementation="flash_attention_2",
    dtype=torch.float16)

config

ModernBertConfig {
  "architectures": [
    "ModernBertForMaskedLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 50281,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "mean",
  "cls_token_id": 50281,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "dtype": "float16",
  "embedding_dropout": 0.0,
  "eos_token_id": 50282,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000.0,
  "gradient_checkpointing": false,
  "hidden_activation": "gelu",
  "hidden_size": 768,
  "initializer_cutoff_factor": 2.0,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "layer_norm_eps": 1e-05,
  "local_attention": 128,
  "local_rope_theta": 10000.0,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "mlp_dropout": 0.0,
  "model_type": "modernbert",
  "norm_bias": false,
  "norm_eps": 1e-05,
  "num_attention_heads": 12,
  "num_hidden_layers": 22,
  "pad_token_id": 502

In [18]:
# diminish the specs

# {bert-small} with max_position_embedding = 128

config.vocab_size = vocabulary_size
config.max_position_embeddings = 512
config.local_attention = 128
config.pad_token_id = 0
config.bos_token_id = 2
config.cls_token_id = 2
config.eos_token_id = 3
config.sep_token_id = 3

config

ModernBertConfig {
  "architectures": [
    "ModernBertForMaskedLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "mean",
  "cls_token_id": 2,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "dtype": "float16",
  "embedding_dropout": 0.0,
  "eos_token_id": 3,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000.0,
  "gradient_checkpointing": false,
  "hidden_activation": "gelu",
  "hidden_size": 768,
  "initializer_cutoff_factor": 2.0,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "layer_norm_eps": 1e-05,
  "local_attention": 128,
  "local_rope_theta": 10000.0,
  "max_position_embeddings": 512,
  "mlp_bias": false,
  "mlp_dropout": 0.0,
  "model_type": "modernbert",
  "norm_bias": false,
  "norm_eps": 1e-05,
  "num_attention_heads": 12,
  "num_hidden_layers": 22,
  "pad_token_id": 0,
  "position_e

In [19]:
config.dtype = torch.float16

# Initialize model
model = ModernBertForMaskedLM(config=config)

# Convert all parameters to float16 (safety)
model = model.half()

# Move model to GPU
model = model.to("cuda")

In [20]:
model

ModernBertForMaskedLM(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(32768, 768, padding_idx=0)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertUnpaddedRotaryEmbedding(dim=64, base=160000.0, scale_base=None)
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, 

In [21]:
next(model.parameters()).dtype


torch.float16

In [22]:
#save model architecture and configurations to facilitate loading afterwards

model.save_pretrained(f"models/{model_name}")

In [23]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, 
                                          local_files_only=True, 
                                          cache_dir = CACHED_DATA_FOLDER
                                          )

In [24]:
# mask 30% of the tokens
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm = True,
    mlm_probability=0.3
) 

In [None]:
total_steps = 500_000

training_args = TrainingArguments(
    output_dir=f'training/{model_name}',
    overwrite_output_dir=True,
    
    # num_train_epochs=1,                     # number of training epochs
    max_steps=total_steps,
    # max_steps=100,

    gradient_accumulation_steps = 1,
    num_workers=64,
    # eval_accumulation_steps = 1,

    per_device_train_batch_size=256,          # batch size for training
    # per_device_eval_batch_size=32,           # batch size for evaluation

    
    logging_strategy="steps",
    logging_first_step=True, # output the initial loss
    logging_steps=1_000,
    logging_dir=f"training-logs/{model_name}",
    # report_to=["tensorboard"],

    save_strategy="steps",
    save_steps=1_000,                      # Save checkpoints every 100 steps
    save_total_limit=5,                  # Limit the total number of saved checkpoints

    fp16=True,                            # Enable mixed precision for faster training

    # learning_rate=8e-4,
    # weight_decay=1e-2,
    # adam_beta1=0.9,
    # adam_beta2=0.999,
    # adam_epsilon=1e-06,
    # lr_scheduler_type=
)

# # Create default optimizer
# optimizer = AdamW(
#     model.parameters(),
#     lr = 8e-4,
#     weight_decay=1e-2,
#     betas = (0.9, 0.999),
# )

# scheduler = get_wsd_schedule(
#     AdamW,                  # Your optimizer
#     num_warmup_steps=total_steps * 0.1,   # Number of warmup steps
#     num_decay_steps=total_steps * 0.1,   # Number of decay steps
#     # num_training_steps=total_steps,  # Total number of training steps
#     num_stable_steps=total_steps * 0.8,   # Number of stable steps
#     #warmup_type="linear",   # Warmup type
#     #decay_type="1-sqrt",    # Decay type
#     min_lr_ratio=0.0,       # Minimum learning rate ratio
#     # num_cycles=0.5,         # Number of cosine cycles
# )

    # lr_scheduler_type="warmup_stable_decay",
    # lr_scheduler_kwargs = {
    #     #"optimizer": "AdamW",
    #     #"num_warmup_steps": 25,l
    #     "num_decay_steps": 25,
    #     #"num_training_steps": 1000,
    #     "num_stable_steps": 950,
    #     # "warmup_type": "linear",
    #     #"decay_type": "1-sqrt",
    #     #"min_lr_ratio": 0,
    # },
    # # warmup_steps=25,

trainer = Trainer(
    model=model,                        # Model to train
    args=training_args,                 # Training arguments
    train_dataset=training_dataset,     # Training dataset
    # eval_dataset=evaluation_dataset,    # Evaluation dataset
    data_collator=data_collator,
    # optimizers=(optimizer, scheduler),
)

In [26]:
import gc

gc.collect()

76

In [27]:
torch.cuda

<module 'torch.cuda' from '/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/torch/cuda/__init__.py'>

In [28]:
torch.cuda.empty_cache()

In [None]:
# export TRITON_HIP_LLD_PATH="/opt/rocm-6.2.1/lib/llvm/bin/ld.lld"


In [29]:
!hipcc --version

HIP version: 6.4.43483-a187df25c
AMD clang version 19.0.0git (https://github.com/RadeonOpenCompute/llvm-project roc-6.4.1 25184 c87081df219c42dc27c5b6d86c0525bc7d01f727)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/rocm-6.4.1/lib/llvm/bin
Configuration file: /opt/rocm-6.4.1/lib/llvm/bin/clang++.cfg


In [30]:
# !pip install --upgrade fastai
# # ou, se for accelerate:
# !pip install --upgrade accelerate


In [29]:
next(model.parameters()).device

device(type='cuda', index=0)

In [30]:
import torch

# Is any GPU available?
print("CUDA available:", torch.cuda.is_available())

# Device of a new tensor
x = torch.randn(1)
print("Default tensor device:", x.device)

# Optional: check which GPU (if any) would be used
if torch.cuda.is_available():
    print("Current GPU:", torch.cuda.current_device())
    print("GPU name:", torch.cuda.get_device_name(torch.cuda.current_device()))



CUDA available: True
Default tensor device: cpu
Current GPU: 0
GPU name: AMD Instinct MI210


In [31]:
all(p.is_cuda for p in model.parameters())


True

In [32]:
next(model.parameters()).dtype


torch.float16

In [None]:
!rocminfo | grep 'Architecture:'

In [None]:
!rocminfo

[37mROCk module version 6.12.12 is loaded[0m
HSA System Attributes    
Runtime Version:         1.15
Runtime Ext Version:     1.7
System Timestamp Freq.:  1000.000000MHz
Sig. Max Wait Duration:  18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)
Machine Model:           LARGE                              
System Endianness:       LITTLE                             
Mwaitx:                  DISABLED
XNACK enabled:           NO
DMAbuf Support:          YES
VMM Support:             YES

HSA Agents               
*******                  
Agent 1                  
*******                  
  Name:                    AMD EPYC 7V13 64-Core Processor    
  Uuid:                    CPU-XX                             
  Marketing Name:          AMD EPYC 7V13 64-Core Processor    
  Vendor Name:             CPU                                
  Feature:                 None specified                     
  Profile:                 FULL_PROFILE                       
  Float Round Mode

In [33]:
trainer.train()

Step,Training Loss
1,2.7527


KeyboardInterrupt: 

In [None]:
!find /opt/rocm* -name ld.lld


/opt/rocm-6.1.2/lib/llvm/bin/ld.lld
/opt/rocm-6.2.1/lib/llvm/bin/ld.lld
/opt/rocm-6.3.1/lib/llvm/bin/ld.lld
/opt/rocm-6.4.1/lib/llvm/bin/ld.lld


In [None]:
!pip list

Package                   Version
------------------------- --------------
accelerate                1.2.1
aiohappyeyeballs          2.6.1
aiohttp                   3.11.14
aiosignal                 1.3.2
anyio                     4.9.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 3.0.0
async-lru                 2.0.5
async-timeout             5.0.1
attrs                     25.3.0
babel                     2.17.0
beautifulsoup4            4.13.3
bleach                    6.2.0
certifi                   2025.1.31
cffi                      1.17.1
charset-normalizer        3.4.1
click                     8.1.8
comm                      0.2.2
datasets                  3.2.0
debugpy                   1.8.13
decorator                 5.2.1
defusedxml                0.7.1
dill                      0.3.8
einops                    0.8.1
evaluate                  0.4.3
exceptiongroup            1.2.2
executing       

In [None]:
# %%
# All necessary imports
import torch
from torch.utils.data import TensorDataset, DataLoader
from accelerate import Accelerator, notebook_launcher

# ====================================================================
# STEP 1: Define your training logic inside a single function
# ====================================================================
def training_function():
    # Initialize the Accelerator
    # It automatically handles device placement (e.g., 'cuda:0', 'cuda:1')
    accelerator = Accelerator()
    
    # --- Create your model, optimizer, and dataloaders ---
    # (It's important to create them here, inside the function)
    model = torch.nn.Linear(50, 10)
    optimizer = torch.optim.AdamW(model.parameters())

    # Create some dummy data
    X = torch.randn(2048, 50)
    y = torch.randint(0, 10, (2048,))
    dataset = TensorDataset(X, y)
    # The dataloader will be automatically sharded across GPUs by accelerator.prepare()
    dataloader = DataLoader(dataset, batch_size=64)

    # --- Use accelerator.prepare() ---
    # This is the magic step! It wraps your objects to work in a distributed environment.
    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
    
    # --- Your Training Loop ---
    model.train()
    for epoch in range(3):
        for batch in dataloader:
            inputs, targets = batch
            
            # Forward pass
            outputs = model(inputs)
            loss = torch.nn.functional.cross_entropy(outputs, targets)
            
            # Backward pass - use accelerator.backward() instead of loss.backward()
            accelerator.backward(loss)
            
            optimizer.step()
            optimizer.zero_grad()
            
        # Use accelerator.print to avoid printing from every process
        accelerator.print(f"Epoch {epoch} | Loss: {loss.item()}")

# ====================================================================
# STEP 2: Launch the training function with notebook_launcher
# ====================================================================
# %%
# This will launch `training_function` on 2 GPUs.
# The code in this cell will wait until the training is complete.
notebook_launcher(training_function, num_processes=2)

## Trying using accelerator

Launching training on 4 CUDAs.


  backends.update(_get_backends("networkx.backends"))
  backends.update(_get_backends("networkx.backends"))
  backends.update(_get_backends("networkx.backends"))
  backends.update(_get_backends("networkx.backends"))
W0926 20:25:40.393144 1572853 torch/multiprocessing/spawn.py:175] Terminating process 1572941 via signal SIGTERM
W0926 20:25:40.394976 1572853 torch/multiprocessing/spawn.py:175] Terminating process 1572942 via signal SIGTERM
E0926 20:25:40.482525 1572853 torch/distributed/elastic/multiprocessing/api.py:742] failed (exitcode: 1) local_rank: 0 (pid: 1572939) of fn: run_training (start_method: fork)
E0926 20:25:40.482525 1572853 torch/distributed/elastic/multiprocessing/api.py:742] Traceback (most recent call last):
E0926 20:25:40.482525 1572853 torch/distributed/elastic/multiprocessing/api.py:742]   File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 697, in _poll
E0926 20:25:40.482525 1572853 torch/dis

ChildFailedError: 
============================================================
run_training FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-09-26_20:25:40
  host      : k005-009.hpcfund
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 1572939)
  error_file: /tmp/torchelastic_p_45ufg5/none__3lvwe1u/attempt_0/0/error.json
  traceback : Traceback (most recent call last):
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
      return f(*args, **kwargs)
    File "/tmp/ipykernel_1572853/2636644470.py", line 17, in run_training
      accelerator = Accelerator()
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/accelerate/accelerator.py", line 462, in __init__
      self.state = AcceleratorState(
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/accelerate/state.py", line 912, in __init__
      PartialState(cpu, **kwargs)
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/accelerate/state.py", line 301, in __init__
      self.set_device()
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/accelerate/state.py", line 838, in set_device
      device_module.set_device(self.device)
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/torch/cuda/__init__.py", line 567, in set_device
      torch._C._cuda_setDevice(device)
    File "/work1/lgarcia/renneruan/amd_200/lib64/python3.9/site-packages/torch/cuda/__init__.py", line 398, in _lazy_init
      raise RuntimeError(
  RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method
  
============================================================