DistilBERT Training code and GPU configs

In [None]:
# Install PyTorch with CUDA 12.4 support (replace 'cu124' with the correct tag if different)
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch
  Using cached https://download.pytorch.org/whl/cu121/torch-2.2.2%2Bcu121-cp312-cp312-win_amd64.whl (2454.8 MB)
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 2.2.2
    Uninstalling torch-2.2.2:
      Successfully uninstalled torch-2.2.2
Successfully installed torch-2.2.2+cu121


In [None]:
!pip install datasets
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting fsspec<=2024.2.0,>=2023.1.0 (from fsspec[http]<=2024.2.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.2.0-py3-none-any.whl.metadata (6.8 kB)
Using cached fsspec-2024.2.0-py3-none-any.whl (170 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.3.1
    Uninstalling fsspec-2024.3.1:
      Successfully uninstalled fsspec-2024.3.1
Successfully installed fsspec-2024.2.0
Defaulting to user installation because normal site-packages is not writeable


In [None]:

from datasets import load_dataset, Dataset, concatenate_datasets
import torch
from transformers import DistilBertTokenizer, DistilBertForMaskedLM, Trainer, TrainingArguments, logging

In [None]:
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("Device name:", torch.cuda.get_device_name(0))

PyTorch version: 2.2.2+cu121
CUDA available: True
CUDA version: 12.1
Device name: NVIDIA GeForce RTX 3060


In [None]:
from datasets import load_dataset, Dataset, concatenate_datasets

def load_data(file_path):
    with open(file_path, encoding="utf-8") as file:
        texts = [line.strip() for line in file if line.strip()]
    return Dataset.from_dict({"text": texts})

# Define paths to data files
data_files = {
    "open_subtitles": r"train_10m\open_subtitles.train",
    "bnc_spoken": r"train_10m\bnc_spoken.train",
    "gutenberg": r"train_10m\gutenberg.train",
    "childes": r"train_10m\childes.train",
    "simple_wiki": r"train_10m\simple_wiki.train",
    "switchboard": r"train_10m\switchboard.train"
}

# Load datasets and concatenate them
datasets = {name: load_data(path) for name, path in data_files.items()}
combined_dataset = concatenate_datasets(list(datasets.values()))

# Optionally, shuffle the dataset
combined_dataset = combined_dataset.shuffle(seed=42)


In [None]:
from transformers import DataCollatorForLanguageModeling

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,        # Enable Masked Language Modeling
    mlm_probability=0.15
)

def tokenize_and_mask(examples):
    # Tokenize the texts
    tokenized_inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512, return_tensors="np")
    # Manually adjust the length of each example if needed, or ensure your dataset doesn't exceed the max_length
    return tokenized_inputs

tokenized_datasets = combined_dataset.map(tokenize_and_mask, batched=True, remove_columns=["text"])
# Apply data collator in trainer, not in the map function



loading file vocab.txt from cache at C:\Users\rebec\.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\rebec\.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\tokenizer_config.json
loading file tokenizer.json from cache at C:\Users\rebec\.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\tokenizer.json
loading configuration file config.json from cache at C:\Users\rebec\.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_d

Map:   0%|          | 0/1165314 [00:00<?, ? examples/s]

In [None]:
!pip install transformers[torch] accelerate -U --force-reinstall

Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Using cached accelerate-0.29.2-py3-none-any.whl.metadata (18 kB)
Collecting transformers[torch]
  Using cached transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
Collecting filelock (from transformers[torch])
  Using cached filelock-3.13.4-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers[torch])
  Using cached huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Collecting numpy>=1.17 (from transformers[torch])
  Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
Collecting packaging>=20.0 (from transformers[torch])
  Using cached packaging-24.0-py3-none-any.whl.metadata (3.2 kB)
Collecting pyyaml>=5.1 (from transformers[torch])
  Using cached PyYAML-6.0.1-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers[torch])
  Using cached regex-2024.4.16-cp312-cp312-win_amd64.whl.metad

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 2.18.0 requires fsspec[http]<=2024.2.0,>=2023.1.0, but you have fsspec 2024.3.1 which is incompatible.
torchaudio 2.2.2+cu121 requires torch==2.2.2+cu121, but you have torch 2.2.2 which is incompatible.
torchvision 0.17.2+cu121 requires torch==2.2.2+cu121, but you have torch 2.2.2 which is incompatible.


In [None]:
!pip show accelerate transformers


Name: accelerate
Version: 0.29.2
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: C:\Users\rebec\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 
---
Name: transformers
Version: 4.39.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: C:\Users\rebec\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages
Requires: filelock, hugg

In [None]:
print(torch.__version__)

2.2.2+cu121


In [None]:
import sys
print(sys.executable)


C:\Users\rebec\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe


In [None]:
from accelerate import Accelerator

In [None]:
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')

# Set the logger to use the INFO level
logging.set_verbosity_info()

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=100,  # Log training info every 100 steps
    per_device_train_batch_size=8,  # Adjust based on your GPU memory
    save_steps=10_000,
    save_total_limit=2,
    report_to="all",  # Ensures logging to both TensorBoard and stdout
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator  # Pass the data collator here to handle dynamic masking
)


loading configuration file config.json from cache at C:\Users\rebec\.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.39.3",
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at C:\Users\rebec\.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411\model.safetensors
All model checkpoint weights were used when initializing DistilBertForMaskedLM.

All the weights of DistilBertFor

In [None]:
trainer.train()

***** Running training *****
  Num examples = 1,165,314
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 436,995
  Number of trainable parameters = 66,985,530


  0%|          | 0/436995 [00:00<?, ?it/s]

{'loss': 2.7525, 'grad_norm': 33.065460205078125, 'learning_rate': 4.998855822148995e-05, 'epoch': 0.0}
{'loss': 2.6615, 'grad_norm': 26.558879852294922, 'learning_rate': 4.99771164429799e-05, 'epoch': 0.0}
{'loss': 2.5155, 'grad_norm': 30.26500129699707, 'learning_rate': 4.9965674664469846e-05, 'epoch': 0.0}
{'loss': 2.5828, 'grad_norm': 23.92010498046875, 'learning_rate': 4.9954232885959794e-05, 'epoch': 0.0}
{'loss': 2.5406, 'grad_norm': 23.818222045898438, 'learning_rate': 4.994279110744974e-05, 'epoch': 0.0}
{'loss': 2.603, 'grad_norm': 25.47873306274414, 'learning_rate': 4.993134932893969e-05, 'epoch': 0.0}
{'loss': 2.2571, 'grad_norm': 24.167726516723633, 'learning_rate': 4.991990755042964e-05, 'epoch': 0.0}
{'loss': 2.3206, 'grad_norm': 17.598976135253906, 'learning_rate': 4.9908465771919586e-05, 'epoch': 0.01}
{'loss': 2.6281, 'grad_norm': 32.55040740966797, 'learning_rate': 4.9897023993409534e-05, 'epoch': 0.01}
{'loss': 2.5626, 'grad_norm': 21.74773597717285, 'learning_rate'

Saving model checkpoint to ./results\checkpoint-10000
Configuration saved in ./results\checkpoint-10000\config.json


{'loss': 2.3511, 'grad_norm': 18.42654037475586, 'learning_rate': 4.885582214899484e-05, 'epoch': 0.07}


Model weights saved in ./results\checkpoint-10000\model.safetensors


{'loss': 2.5217, 'grad_norm': 20.27106285095215, 'learning_rate': 4.8844380370484786e-05, 'epoch': 0.07}
{'loss': 2.1987, 'grad_norm': 24.488054275512695, 'learning_rate': 4.8832938591974734e-05, 'epoch': 0.07}
{'loss': 2.43, 'grad_norm': 42.24113464355469, 'learning_rate': 4.882149681346469e-05, 'epoch': 0.07}
{'loss': 2.3033, 'grad_norm': 21.58443832397461, 'learning_rate': 4.8810055034954636e-05, 'epoch': 0.07}
{'loss': 2.3751, 'grad_norm': 24.940134048461914, 'learning_rate': 4.8798613256444584e-05, 'epoch': 0.07}
{'loss': 2.2685, 'grad_norm': 15.479841232299805, 'learning_rate': 4.878717147793453e-05, 'epoch': 0.07}
{'loss': 2.427, 'grad_norm': 14.422588348388672, 'learning_rate': 4.877572969942448e-05, 'epoch': 0.07}
{'loss': 2.4561, 'grad_norm': 226.31683349609375, 'learning_rate': 4.876428792091443e-05, 'epoch': 0.07}
{'loss': 2.2733, 'grad_norm': 26.637144088745117, 'learning_rate': 4.8752846142404376e-05, 'epoch': 0.07}
{'loss': 2.5015, 'grad_norm': 32.898475646972656, 'learn

Saving model checkpoint to ./results\checkpoint-20000
Configuration saved in ./results\checkpoint-20000\config.json


{'loss': 2.4499, 'grad_norm': 12.825489044189453, 'learning_rate': 4.771164429798968e-05, 'epoch': 0.14}


Model weights saved in ./results\checkpoint-20000\model.safetensors


{'loss': 2.3763, 'grad_norm': 22.9476375579834, 'learning_rate': 4.770020251947963e-05, 'epoch': 0.14}
{'loss': 2.262, 'grad_norm': 17.723764419555664, 'learning_rate': 4.7688760740969576e-05, 'epoch': 0.14}
{'loss': 2.3604, 'grad_norm': 25.209335327148438, 'learning_rate': 4.767731896245953e-05, 'epoch': 0.14}
{'loss': 2.5617, 'grad_norm': 22.645828247070312, 'learning_rate': 4.766587718394948e-05, 'epoch': 0.14}
{'loss': 2.5523, 'grad_norm': 15.39207649230957, 'learning_rate': 4.7654435405439427e-05, 'epoch': 0.14}
{'loss': 2.5494, 'grad_norm': 36.612281799316406, 'learning_rate': 4.7642993626929374e-05, 'epoch': 0.14}
{'loss': 2.2229, 'grad_norm': 20.576290130615234, 'learning_rate': 4.763155184841932e-05, 'epoch': 0.14}
{'loss': 2.3852, 'grad_norm': 13.775464057922363, 'learning_rate': 4.762011006990927e-05, 'epoch': 0.14}
{'loss': 2.3027, 'grad_norm': 17.42525291442871, 'learning_rate': 4.760866829139922e-05, 'epoch': 0.14}
{'loss': 2.3201, 'grad_norm': 10.550214767456055, 'learni

Saving model checkpoint to ./results\checkpoint-30000
Configuration saved in ./results\checkpoint-30000\config.json


{'loss': 2.3447, 'grad_norm': 1.9742639064788818, 'learning_rate': 4.656746644698452e-05, 'epoch': 0.21}


Model weights saved in ./results\checkpoint-30000\model.safetensors
Deleting older checkpoint [results\checkpoint-10000] due to args.save_total_limit


{'loss': 2.2679, 'grad_norm': 35.62400436401367, 'learning_rate': 4.655602466847447e-05, 'epoch': 0.21}
{'loss': 2.5674, 'grad_norm': 34.27400588989258, 'learning_rate': 4.654458288996442e-05, 'epoch': 0.21}
{'loss': 2.2695, 'grad_norm': 27.223615646362305, 'learning_rate': 4.6533141111454366e-05, 'epoch': 0.21}
{'loss': 2.225, 'grad_norm': 23.860700607299805, 'learning_rate': 4.6521699332944314e-05, 'epoch': 0.21}
{'loss': 2.3187, 'grad_norm': 12.180253982543945, 'learning_rate': 4.651025755443426e-05, 'epoch': 0.21}
{'loss': 2.5209, 'grad_norm': 60.19098663330078, 'learning_rate': 4.649881577592421e-05, 'epoch': 0.21}
{'loss': 2.4301, 'grad_norm': 9.017581939697266, 'learning_rate': 4.648737399741416e-05, 'epoch': 0.21}
{'loss': 2.3435, 'grad_norm': 16.3916015625, 'learning_rate': 4.647593221890411e-05, 'epoch': 0.21}
{'loss': 2.3947, 'grad_norm': 18.78986930847168, 'learning_rate': 4.646449044039406e-05, 'epoch': 0.21}
{'loss': 2.2496, 'grad_norm': 12.570146560668945, 'learning_rate

Saving model checkpoint to ./results\checkpoint-40000
Configuration saved in ./results\checkpoint-40000\config.json


{'loss': 2.288, 'grad_norm': 21.88767433166504, 'learning_rate': 4.5423288595979364e-05, 'epoch': 0.27}


Model weights saved in ./results\checkpoint-40000\model.safetensors
Deleting older checkpoint [results\checkpoint-20000] due to args.save_total_limit


{'loss': 2.2604, 'grad_norm': 26.502038955688477, 'learning_rate': 4.541184681746931e-05, 'epoch': 0.28}
{'loss': 2.4197, 'grad_norm': 24.38302230834961, 'learning_rate': 4.540040503895926e-05, 'epoch': 0.28}
{'loss': 2.2132, 'grad_norm': 17.621612548828125, 'learning_rate': 4.538896326044921e-05, 'epoch': 0.28}
{'loss': 2.1784, 'grad_norm': 14.684526443481445, 'learning_rate': 4.5377521481939156e-05, 'epoch': 0.28}
{'loss': 2.2968, 'grad_norm': 15.43430233001709, 'learning_rate': 4.5366079703429104e-05, 'epoch': 0.28}
{'loss': 2.373, 'grad_norm': 20.629745483398438, 'learning_rate': 4.535463792491905e-05, 'epoch': 0.28}
{'loss': 2.251, 'grad_norm': 23.550716400146484, 'learning_rate': 4.5343196146409e-05, 'epoch': 0.28}
{'loss': 2.5783, 'grad_norm': 20.459680557250977, 'learning_rate': 4.533175436789895e-05, 'epoch': 0.28}
{'loss': 2.3767, 'grad_norm': 36.88876724243164, 'learning_rate': 4.5320312589388896e-05, 'epoch': 0.28}
{'loss': 2.2737, 'grad_norm': 62.68445587158203, 'learning_

Saving model checkpoint to ./results\checkpoint-50000
Configuration saved in ./results\checkpoint-50000\config.json


{'loss': 2.4078, 'grad_norm': 22.143726348876953, 'learning_rate': 4.42791107449742e-05, 'epoch': 0.34}


Model weights saved in ./results\checkpoint-50000\model.safetensors
Deleting older checkpoint [results\checkpoint-30000] due to args.save_total_limit


{'loss': 2.2172, 'grad_norm': 13.94355583190918, 'learning_rate': 4.426766896646415e-05, 'epoch': 0.34}
{'loss': 2.5069, 'grad_norm': 23.15069007873535, 'learning_rate': 4.4256227187954096e-05, 'epoch': 0.34}
{'loss': 2.3864, 'grad_norm': 11.392027854919434, 'learning_rate': 4.4244785409444044e-05, 'epoch': 0.35}
{'loss': 2.2973, 'grad_norm': 30.453794479370117, 'learning_rate': 4.423334363093399e-05, 'epoch': 0.35}
{'loss': 2.3194, 'grad_norm': 23.651243209838867, 'learning_rate': 4.422190185242394e-05, 'epoch': 0.35}
{'loss': 2.5151, 'grad_norm': 19.12358856201172, 'learning_rate': 4.421046007391389e-05, 'epoch': 0.35}
{'loss': 2.5285, 'grad_norm': 15.536113739013672, 'learning_rate': 4.4199018295403835e-05, 'epoch': 0.35}
{'loss': 2.6535, 'grad_norm': 9.626994132995605, 'learning_rate': 4.4187576516893783e-05, 'epoch': 0.35}
{'loss': 2.3781, 'grad_norm': 24.835887908935547, 'learning_rate': 4.417613473838373e-05, 'epoch': 0.35}
{'loss': 2.4121, 'grad_norm': 21.495031356811523, 'lear

Saving model checkpoint to ./results\checkpoint-60000
Configuration saved in ./results\checkpoint-60000\config.json


{'loss': 2.2345, 'grad_norm': 19.969345092773438, 'learning_rate': 4.313493289396904e-05, 'epoch': 0.41}


Model weights saved in ./results\checkpoint-60000\model.safetensors
Deleting older checkpoint [results\checkpoint-40000] due to args.save_total_limit


{'loss': 2.3129, 'grad_norm': 30.177066802978516, 'learning_rate': 4.312349111545899e-05, 'epoch': 0.41}
{'loss': 2.4015, 'grad_norm': 30.031641006469727, 'learning_rate': 4.311204933694894e-05, 'epoch': 0.41}
{'loss': 2.4228, 'grad_norm': 10.786165237426758, 'learning_rate': 4.3100607558438886e-05, 'epoch': 0.41}
{'loss': 2.3836, 'grad_norm': 27.11167335510254, 'learning_rate': 4.3089165779928834e-05, 'epoch': 0.41}
{'loss': 2.3369, 'grad_norm': 16.289880752563477, 'learning_rate': 4.307772400141878e-05, 'epoch': 0.42}
{'loss': 2.3414, 'grad_norm': 27.654699325561523, 'learning_rate': 4.306628222290873e-05, 'epoch': 0.42}
{'loss': 2.264, 'grad_norm': 24.686552047729492, 'learning_rate': 4.305484044439868e-05, 'epoch': 0.42}
{'loss': 2.371, 'grad_norm': 23.5666446685791, 'learning_rate': 4.304339866588863e-05, 'epoch': 0.42}
{'loss': 2.3929, 'grad_norm': 19.38857650756836, 'learning_rate': 4.303195688737858e-05, 'epoch': 0.42}
{'loss': 2.2866, 'grad_norm': 25.49618911743164, 'learning_

Saving model checkpoint to ./results\checkpoint-70000
Configuration saved in ./results\checkpoint-70000\config.json


{'loss': 2.2152, 'grad_norm': 19.282133102416992, 'learning_rate': 4.199075504296388e-05, 'epoch': 0.48}


Model weights saved in ./results\checkpoint-70000\model.safetensors
Deleting older checkpoint [results\checkpoint-50000] due to args.save_total_limit


{'loss': 2.2226, 'grad_norm': 28.67856216430664, 'learning_rate': 4.197931326445383e-05, 'epoch': 0.48}
{'loss': 2.2955, 'grad_norm': 21.713727951049805, 'learning_rate': 4.196787148594378e-05, 'epoch': 0.48}
{'loss': 2.5877, 'grad_norm': 15.277961730957031, 'learning_rate': 4.195642970743373e-05, 'epoch': 0.48}
{'loss': 2.6172, 'grad_norm': 20.88836669921875, 'learning_rate': 4.1944987928923676e-05, 'epoch': 0.48}
{'loss': 2.3038, 'grad_norm': 30.461252212524414, 'learning_rate': 4.1933546150413624e-05, 'epoch': 0.48}
{'loss': 2.5059, 'grad_norm': 14.753929138183594, 'learning_rate': 4.192210437190357e-05, 'epoch': 0.48}
{'loss': 2.2938, 'grad_norm': 17.142847061157227, 'learning_rate': 4.191066259339352e-05, 'epoch': 0.49}
{'loss': 2.4609, 'grad_norm': 28.606937408447266, 'learning_rate': 4.189922081488347e-05, 'epoch': 0.49}
{'loss': 2.3773, 'grad_norm': 30.099294662475586, 'learning_rate': 4.1887779036373416e-05, 'epoch': 0.49}
{'loss': 2.2911, 'grad_norm': 16.061447143554688, 'lea

Saving model checkpoint to ./results\checkpoint-80000
Configuration saved in ./results\checkpoint-80000\config.json


{'loss': 2.2192, 'grad_norm': 20.451555252075195, 'learning_rate': 4.084657719195872e-05, 'epoch': 0.55}


Model weights saved in ./results\checkpoint-80000\model.safetensors
Deleting older checkpoint [results\checkpoint-60000] due to args.save_total_limit


{'loss': 2.2982, 'grad_norm': 33.66252136230469, 'learning_rate': 4.083513541344867e-05, 'epoch': 0.55}
{'loss': 2.3881, 'grad_norm': 28.578916549682617, 'learning_rate': 4.0823693634938616e-05, 'epoch': 0.55}
{'loss': 2.2563, 'grad_norm': 19.886754989624023, 'learning_rate': 4.0812251856428564e-05, 'epoch': 0.55}
{'loss': 2.2952, 'grad_norm': 13.540472030639648, 'learning_rate': 4.080081007791851e-05, 'epoch': 0.55}
{'loss': 2.2928, 'grad_norm': 21.40454864501953, 'learning_rate': 4.0789368299408466e-05, 'epoch': 0.55}
{'loss': 2.1941, 'grad_norm': 19.65068244934082, 'learning_rate': 4.0777926520898414e-05, 'epoch': 0.55}
{'loss': 2.2828, 'grad_norm': 17.014646530151367, 'learning_rate': 4.076648474238836e-05, 'epoch': 0.55}
{'loss': 2.3032, 'grad_norm': 13.307753562927246, 'learning_rate': 4.075504296387831e-05, 'epoch': 0.55}
{'loss': 2.2701, 'grad_norm': 18.848337173461914, 'learning_rate': 4.074360118536826e-05, 'epoch': 0.56}
{'loss': 2.356, 'grad_norm': 20.838409423828125, 'lear

Saving model checkpoint to ./results\checkpoint-90000
Configuration saved in ./results\checkpoint-90000\config.json


{'loss': 2.4283, 'grad_norm': 23.322420120239258, 'learning_rate': 3.970239934095356e-05, 'epoch': 0.62}


Model weights saved in ./results\checkpoint-90000\model.safetensors
Deleting older checkpoint [results\checkpoint-70000] due to args.save_total_limit


{'loss': 2.0916, 'grad_norm': 18.865524291992188, 'learning_rate': 3.969095756244351e-05, 'epoch': 0.62}
{'loss': 2.2329, 'grad_norm': 14.925874710083008, 'learning_rate': 3.967951578393346e-05, 'epoch': 0.62}
{'loss': 2.2319, 'grad_norm': 16.601526260375977, 'learning_rate': 3.9668074005423406e-05, 'epoch': 0.62}
{'loss': 2.3012, 'grad_norm': 33.653785705566406, 'learning_rate': 3.9656632226913354e-05, 'epoch': 0.62}
{'loss': 2.3191, 'grad_norm': 18.877700805664062, 'learning_rate': 3.96451904484033e-05, 'epoch': 0.62}
{'loss': 2.3772, 'grad_norm': 27.408645629882812, 'learning_rate': 3.963374866989325e-05, 'epoch': 0.62}
{'loss': 2.3615, 'grad_norm': 22.194244384765625, 'learning_rate': 3.96223068913832e-05, 'epoch': 0.62}
{'loss': 2.333, 'grad_norm': 19.076171875, 'learning_rate': 3.9610865112873145e-05, 'epoch': 0.62}
{'loss': 2.1768, 'grad_norm': 18.08026123046875, 'learning_rate': 3.959942333436309e-05, 'epoch': 0.62}
{'loss': 2.5085, 'grad_norm': 21.617883682250977, 'learning_ra

Saving model checkpoint to ./results\checkpoint-100000
Configuration saved in ./results\checkpoint-100000\config.json


{'loss': 2.2686, 'grad_norm': 12.082313537597656, 'learning_rate': 3.85582214899484e-05, 'epoch': 0.69}


Model weights saved in ./results\checkpoint-100000\model.safetensors
Deleting older checkpoint [results\checkpoint-80000] due to args.save_total_limit


{'loss': 2.2716, 'grad_norm': 24.558748245239258, 'learning_rate': 3.8546779711438345e-05, 'epoch': 0.69}
{'loss': 2.2059, 'grad_norm': 24.816253662109375, 'learning_rate': 3.853533793292829e-05, 'epoch': 0.69}
{'loss': 2.2214, 'grad_norm': 32.84592819213867, 'learning_rate': 3.852389615441824e-05, 'epoch': 0.69}
{'loss': 2.4765, 'grad_norm': 55.68162155151367, 'learning_rate': 3.851245437590819e-05, 'epoch': 0.69}
{'loss': 2.5326, 'grad_norm': 16.502458572387695, 'learning_rate': 3.850101259739814e-05, 'epoch': 0.69}
{'loss': 2.2121, 'grad_norm': 22.986722946166992, 'learning_rate': 3.8489570818888085e-05, 'epoch': 0.69}
{'loss': 2.2016, 'grad_norm': 33.235843658447266, 'learning_rate': 3.847812904037803e-05, 'epoch': 0.69}
{'loss': 2.4837, 'grad_norm': 27.220115661621094, 'learning_rate': 3.846668726186798e-05, 'epoch': 0.69}
{'loss': 2.334, 'grad_norm': 15.577566146850586, 'learning_rate': 3.8455245483357936e-05, 'epoch': 0.69}
{'loss': 2.2671, 'grad_norm': 21.767030715942383, 'lear

Saving model checkpoint to ./results\checkpoint-110000
Configuration saved in ./results\checkpoint-110000\config.json


{'loss': 2.3309, 'grad_norm': 18.441768646240234, 'learning_rate': 3.741404363894324e-05, 'epoch': 0.76}


Model weights saved in ./results\checkpoint-110000\model.safetensors
Deleting older checkpoint [results\checkpoint-90000] due to args.save_total_limit


{'loss': 2.426, 'grad_norm': 17.611873626708984, 'learning_rate': 3.740260186043319e-05, 'epoch': 0.76}
{'loss': 2.3469, 'grad_norm': 16.897451400756836, 'learning_rate': 3.7391160081923135e-05, 'epoch': 0.76}
{'loss': 2.2318, 'grad_norm': 22.3885498046875, 'learning_rate': 3.737971830341308e-05, 'epoch': 0.76}
{'loss': 2.2664, 'grad_norm': 18.189071655273438, 'learning_rate': 3.736827652490303e-05, 'epoch': 0.76}
{'loss': 2.3488, 'grad_norm': 47.99972152709961, 'learning_rate': 3.735683474639298e-05, 'epoch': 0.76}
{'loss': 2.2017, 'grad_norm': 15.313207626342773, 'learning_rate': 3.7345392967882934e-05, 'epoch': 0.76}
{'loss': 2.2266, 'grad_norm': 25.382471084594727, 'learning_rate': 3.733395118937288e-05, 'epoch': 0.76}
{'loss': 2.3191, 'grad_norm': 28.779911041259766, 'learning_rate': 3.732250941086283e-05, 'epoch': 0.76}
{'loss': 2.288, 'grad_norm': 33.6932258605957, 'learning_rate': 3.731106763235278e-05, 'epoch': 0.76}
{'loss': 2.2757, 'grad_norm': 15.369185447692871, 'learning_

Saving model checkpoint to ./results\checkpoint-120000
Configuration saved in ./results\checkpoint-120000\config.json


{'loss': 2.3092, 'grad_norm': 13.579763412475586, 'learning_rate': 3.626986578793808e-05, 'epoch': 0.82}


Model weights saved in ./results\checkpoint-120000\model.safetensors
Deleting older checkpoint [results\checkpoint-100000] due to args.save_total_limit


{'loss': 2.3311, 'grad_norm': 35.064247131347656, 'learning_rate': 3.625842400942803e-05, 'epoch': 0.82}
{'loss': 2.5947, 'grad_norm': 18.79954719543457, 'learning_rate': 3.624698223091798e-05, 'epoch': 0.83}
{'loss': 2.304, 'grad_norm': 17.998693466186523, 'learning_rate': 3.6235540452407926e-05, 'epoch': 0.83}
{'loss': 2.2308, 'grad_norm': 8.217191696166992, 'learning_rate': 3.6224098673897873e-05, 'epoch': 0.83}
{'loss': 2.3096, 'grad_norm': 16.01431655883789, 'learning_rate': 3.621265689538782e-05, 'epoch': 0.83}
{'loss': 2.2957, 'grad_norm': 19.460071563720703, 'learning_rate': 3.620121511687777e-05, 'epoch': 0.83}
{'loss': 2.2721, 'grad_norm': 17.55293846130371, 'learning_rate': 3.618977333836772e-05, 'epoch': 0.83}
{'loss': 2.2505, 'grad_norm': 14.353286743164062, 'learning_rate': 3.6178331559857665e-05, 'epoch': 0.83}
{'loss': 2.2878, 'grad_norm': 16.414844512939453, 'learning_rate': 3.616688978134761e-05, 'epoch': 0.83}
{'loss': 2.4332, 'grad_norm': 24.762496948242188, 'learni

Saving model checkpoint to ./results\checkpoint-130000
Configuration saved in ./results\checkpoint-130000\config.json


{'loss': 2.1954, 'grad_norm': 19.051517486572266, 'learning_rate': 3.512568793693292e-05, 'epoch': 0.89}


Model weights saved in ./results\checkpoint-130000\model.safetensors
Deleting older checkpoint [results\checkpoint-110000] due to args.save_total_limit


{'loss': 2.3465, 'grad_norm': 25.76778793334961, 'learning_rate': 3.5114246158422865e-05, 'epoch': 0.89}
{'loss': 2.072, 'grad_norm': 25.93143081665039, 'learning_rate': 3.510280437991281e-05, 'epoch': 0.89}
{'loss': 2.3485, 'grad_norm': 24.203901290893555, 'learning_rate': 3.509136260140277e-05, 'epoch': 0.89}
{'loss': 2.4178, 'grad_norm': 10.340868949890137, 'learning_rate': 3.5079920822892716e-05, 'epoch': 0.9}
{'loss': 2.3458, 'grad_norm': 12.284414291381836, 'learning_rate': 3.5068479044382664e-05, 'epoch': 0.9}
{'loss': 2.4598, 'grad_norm': 25.634233474731445, 'learning_rate': 3.505703726587261e-05, 'epoch': 0.9}
{'loss': 2.1384, 'grad_norm': 20.546598434448242, 'learning_rate': 3.504559548736256e-05, 'epoch': 0.9}
{'loss': 2.3842, 'grad_norm': 25.78415298461914, 'learning_rate': 3.503415370885251e-05, 'epoch': 0.9}
{'loss': 2.209, 'grad_norm': 18.961816787719727, 'learning_rate': 3.5022711930342455e-05, 'epoch': 0.9}
{'loss': 2.1844, 'grad_norm': 19.97052574157715, 'learning_rat

Saving model checkpoint to ./results\checkpoint-140000
Configuration saved in ./results\checkpoint-140000\config.json


{'loss': 2.4051, 'grad_norm': 21.87112045288086, 'learning_rate': 3.398151008592776e-05, 'epoch': 0.96}


Model weights saved in ./results\checkpoint-140000\model.safetensors
Deleting older checkpoint [results\checkpoint-120000] due to args.save_total_limit


{'loss': 2.3548, 'grad_norm': 23.810209274291992, 'learning_rate': 3.397006830741771e-05, 'epoch': 0.96}
{'loss': 2.3107, 'grad_norm': 26.05113983154297, 'learning_rate': 3.3958626528907655e-05, 'epoch': 0.96}
{'loss': 2.3948, 'grad_norm': 15.769007682800293, 'learning_rate': 3.39471847503976e-05, 'epoch': 0.96}
{'loss': 2.3207, 'grad_norm': 26.784931182861328, 'learning_rate': 3.393574297188755e-05, 'epoch': 0.96}
{'loss': 2.2342, 'grad_norm': 9.820320129394531, 'learning_rate': 3.39243011933775e-05, 'epoch': 0.96}
{'loss': 2.1702, 'grad_norm': 18.160316467285156, 'learning_rate': 3.391285941486745e-05, 'epoch': 0.97}
{'loss': 2.2319, 'grad_norm': 16.23314666748047, 'learning_rate': 3.3901417636357395e-05, 'epoch': 0.97}
{'loss': 2.4866, 'grad_norm': 18.347402572631836, 'learning_rate': 3.388997585784734e-05, 'epoch': 0.97}
{'loss': 2.5763, 'grad_norm': 14.450797080993652, 'learning_rate': 3.387853407933729e-05, 'epoch': 0.97}
{'loss': 2.4365, 'grad_norm': 27.879634857177734, 'learnin

Saving model checkpoint to ./results\checkpoint-150000
Configuration saved in ./results\checkpoint-150000\config.json


{'loss': 2.1933, 'grad_norm': 20.20804786682129, 'learning_rate': 3.2837332234922595e-05, 'epoch': 1.03}


Model weights saved in ./results\checkpoint-150000\model.safetensors
Deleting older checkpoint [results\checkpoint-130000] due to args.save_total_limit


{'loss': 2.2121, 'grad_norm': 7.171145915985107, 'learning_rate': 3.282589045641254e-05, 'epoch': 1.03}
{'loss': 2.3493, 'grad_norm': 17.118303298950195, 'learning_rate': 3.281444867790249e-05, 'epoch': 1.03}
{'loss': 2.3117, 'grad_norm': 20.21784019470215, 'learning_rate': 3.280300689939244e-05, 'epoch': 1.03}
{'loss': 2.1837, 'grad_norm': 18.91415786743164, 'learning_rate': 3.2791565120882387e-05, 'epoch': 1.03}
{'loss': 2.2697, 'grad_norm': 16.6615047454834, 'learning_rate': 3.2780123342372334e-05, 'epoch': 1.03}
{'loss': 2.2056, 'grad_norm': 19.03372573852539, 'learning_rate': 3.276868156386228e-05, 'epoch': 1.03}
{'loss': 2.2742, 'grad_norm': 15.002570152282715, 'learning_rate': 3.275723978535224e-05, 'epoch': 1.03}
{'loss': 2.3774, 'grad_norm': 15.307174682617188, 'learning_rate': 3.2745798006842185e-05, 'epoch': 1.04}
{'loss': 2.2398, 'grad_norm': 28.82415199279785, 'learning_rate': 3.273435622833213e-05, 'epoch': 1.04}
{'loss': 2.2379, 'grad_norm': 18.586509704589844, 'learning

Saving model checkpoint to ./results\checkpoint-160000
Configuration saved in ./results\checkpoint-160000\config.json


{'loss': 2.2697, 'grad_norm': 17.643693923950195, 'learning_rate': 3.169315438391744e-05, 'epoch': 1.1}


Model weights saved in ./results\checkpoint-160000\model.safetensors
Deleting older checkpoint [results\checkpoint-140000] due to args.save_total_limit


{'loss': 2.2747, 'grad_norm': 14.608038902282715, 'learning_rate': 3.1681712605407385e-05, 'epoch': 1.1}
{'loss': 2.3446, 'grad_norm': 21.24964714050293, 'learning_rate': 3.167027082689733e-05, 'epoch': 1.1}
{'loss': 2.2706, 'grad_norm': 20.94800567626953, 'learning_rate': 3.165882904838728e-05, 'epoch': 1.1}
{'loss': 2.3798, 'grad_norm': 23.542587280273438, 'learning_rate': 3.1647387269877235e-05, 'epoch': 1.1}
{'loss': 2.2715, 'grad_norm': 8.877904891967773, 'learning_rate': 3.1635945491367183e-05, 'epoch': 1.1}
{'loss': 2.2344, 'grad_norm': 24.613683700561523, 'learning_rate': 3.162450371285713e-05, 'epoch': 1.1}
{'loss': 2.142, 'grad_norm': 19.858774185180664, 'learning_rate': 3.161306193434708e-05, 'epoch': 1.1}
{'loss': 2.3172, 'grad_norm': 24.107938766479492, 'learning_rate': 3.160162015583703e-05, 'epoch': 1.1}
{'loss': 2.1761, 'grad_norm': 19.063640594482422, 'learning_rate': 3.1590178377326975e-05, 'epoch': 1.1}
{'loss': 2.1948, 'grad_norm': 14.815779685974121, 'learning_rate

Saving model checkpoint to ./results\checkpoint-170000
Configuration saved in ./results\checkpoint-170000\config.json


{'loss': 2.2582, 'grad_norm': 17.827577590942383, 'learning_rate': 3.054897653291228e-05, 'epoch': 1.17}


Model weights saved in ./results\checkpoint-170000\model.safetensors
Deleting older checkpoint [results\checkpoint-150000] due to args.save_total_limit


{'loss': 2.3463, 'grad_norm': 15.653550148010254, 'learning_rate': 3.053753475440223e-05, 'epoch': 1.17}
{'loss': 2.2947, 'grad_norm': 18.381650924682617, 'learning_rate': 3.0526092975892175e-05, 'epoch': 1.17}
{'loss': 2.3399, 'grad_norm': 23.69552230834961, 'learning_rate': 3.0514651197382123e-05, 'epoch': 1.17}
{'loss': 2.3909, 'grad_norm': 18.472511291503906, 'learning_rate': 3.050320941887207e-05, 'epoch': 1.17}
{'loss': 2.3271, 'grad_norm': 15.841768264770508, 'learning_rate': 3.049176764036202e-05, 'epoch': 1.17}
{'loss': 2.2215, 'grad_norm': 13.286792755126953, 'learning_rate': 3.0480325861851967e-05, 'epoch': 1.17}
{'loss': 2.2508, 'grad_norm': 5.145831108093262, 'learning_rate': 3.0468884083341915e-05, 'epoch': 1.17}
{'loss': 2.3127, 'grad_norm': 20.724863052368164, 'learning_rate': 3.0457442304831863e-05, 'epoch': 1.17}
{'loss': 2.2621, 'grad_norm': 18.08901023864746, 'learning_rate': 3.044600052632181e-05, 'epoch': 1.17}
{'loss': 2.2866, 'grad_norm': 12.402469635009766, 'le

Saving model checkpoint to ./results\checkpoint-180000
Configuration saved in ./results\checkpoint-180000\config.json


{'loss': 2.3069, 'grad_norm': 27.239051818847656, 'learning_rate': 2.940479868190712e-05, 'epoch': 1.24}


Model weights saved in ./results\checkpoint-180000\model.safetensors
Deleting older checkpoint [results\checkpoint-160000] due to args.save_total_limit


{'loss': 2.3976, 'grad_norm': 21.548404693603516, 'learning_rate': 2.939335690339707e-05, 'epoch': 1.24}
{'loss': 2.3388, 'grad_norm': 31.186254501342773, 'learning_rate': 2.9381915124887017e-05, 'epoch': 1.24}
{'loss': 2.237, 'grad_norm': 14.089794158935547, 'learning_rate': 2.9370473346376965e-05, 'epoch': 1.24}
{'loss': 2.3865, 'grad_norm': 16.919790267944336, 'learning_rate': 2.9359031567866913e-05, 'epoch': 1.24}
{'loss': 2.2775, 'grad_norm': 19.453826904296875, 'learning_rate': 2.934758978935686e-05, 'epoch': 1.24}
{'loss': 2.3887, 'grad_norm': 19.807634353637695, 'learning_rate': 2.933614801084681e-05, 'epoch': 1.24}
{'loss': 2.4511, 'grad_norm': 8.677526473999023, 'learning_rate': 2.9324706232336757e-05, 'epoch': 1.24}
{'loss': 2.3892, 'grad_norm': 15.38620376586914, 'learning_rate': 2.9313264453826705e-05, 'epoch': 1.24}
{'loss': 2.2678, 'grad_norm': 17.485015869140625, 'learning_rate': 2.9301822675316653e-05, 'epoch': 1.24}
{'loss': 2.3347, 'grad_norm': 17.289783477783203, 'l

Saving model checkpoint to ./results\checkpoint-190000
Configuration saved in ./results\checkpoint-190000\config.json


{'loss': 2.1547, 'grad_norm': 27.233036041259766, 'learning_rate': 2.8260620830901957e-05, 'epoch': 1.3}


Model weights saved in ./results\checkpoint-190000\model.safetensors
Deleting older checkpoint [results\checkpoint-170000] due to args.save_total_limit


{'loss': 2.2559, 'grad_norm': 12.763753890991211, 'learning_rate': 2.8249179052391905e-05, 'epoch': 1.31}
{'loss': 2.2655, 'grad_norm': 29.5200252532959, 'learning_rate': 2.8237737273881853e-05, 'epoch': 1.31}
{'loss': 2.3103, 'grad_norm': 17.93013572692871, 'learning_rate': 2.82262954953718e-05, 'epoch': 1.31}
{'loss': 2.2641, 'grad_norm': 2.106800079345703, 'learning_rate': 2.821485371686175e-05, 'epoch': 1.31}
{'loss': 2.1609, 'grad_norm': 11.848029136657715, 'learning_rate': 2.8203411938351696e-05, 'epoch': 1.31}
{'loss': 2.2311, 'grad_norm': 20.682031631469727, 'learning_rate': 2.8191970159841648e-05, 'epoch': 1.31}
{'loss': 2.1233, 'grad_norm': 23.341171264648438, 'learning_rate': 2.8180528381331596e-05, 'epoch': 1.31}
{'loss': 2.4685, 'grad_norm': 19.579814910888672, 'learning_rate': 2.8169086602821544e-05, 'epoch': 1.31}
{'loss': 2.2308, 'grad_norm': 14.883835792541504, 'learning_rate': 2.815764482431149e-05, 'epoch': 1.31}
{'loss': 2.2964, 'grad_norm': 26.366409301757812, 'lea

Saving model checkpoint to ./results\checkpoint-200000
Configuration saved in ./results\checkpoint-200000\config.json


{'loss': 2.2528, 'grad_norm': 21.739784240722656, 'learning_rate': 2.7116442979896796e-05, 'epoch': 1.37}


Model weights saved in ./results\checkpoint-200000\model.safetensors
Deleting older checkpoint [results\checkpoint-180000] due to args.save_total_limit


{'loss': 2.2827, 'grad_norm': 16.249605178833008, 'learning_rate': 2.7105001201386744e-05, 'epoch': 1.37}
{'loss': 2.3694, 'grad_norm': 18.655359268188477, 'learning_rate': 2.709355942287669e-05, 'epoch': 1.37}
{'loss': 2.1478, 'grad_norm': 17.47219467163086, 'learning_rate': 2.708211764436664e-05, 'epoch': 1.38}
{'loss': 2.1509, 'grad_norm': 20.254663467407227, 'learning_rate': 2.7070675865856587e-05, 'epoch': 1.38}
{'loss': 2.3057, 'grad_norm': 32.57402420043945, 'learning_rate': 2.7059234087346542e-05, 'epoch': 1.38}
{'loss': 2.1582, 'grad_norm': 17.605045318603516, 'learning_rate': 2.704779230883649e-05, 'epoch': 1.38}
{'loss': 2.3205, 'grad_norm': 29.18224334716797, 'learning_rate': 2.7036350530326438e-05, 'epoch': 1.38}
{'loss': 2.423, 'grad_norm': 9.730202674865723, 'learning_rate': 2.7024908751816386e-05, 'epoch': 1.38}
{'loss': 2.3115, 'grad_norm': 10.223403930664062, 'learning_rate': 2.7013466973306334e-05, 'epoch': 1.38}
{'loss': 2.1789, 'grad_norm': 7.785402297973633, 'lear

Saving model checkpoint to ./results\checkpoint-210000
Configuration saved in ./results\checkpoint-210000\config.json


{'loss': 2.2431, 'grad_norm': 22.184059143066406, 'learning_rate': 2.5972265128891638e-05, 'epoch': 1.44}


Model weights saved in ./results\checkpoint-210000\model.safetensors
Deleting older checkpoint [results\checkpoint-190000] due to args.save_total_limit


{'loss': 2.0817, 'grad_norm': 15.351676940917969, 'learning_rate': 2.5960823350381586e-05, 'epoch': 1.44}
{'loss': 2.3938, 'grad_norm': 15.517776489257812, 'learning_rate': 2.5949381571871534e-05, 'epoch': 1.44}
{'loss': 2.1031, 'grad_norm': 20.1710262298584, 'learning_rate': 2.593793979336148e-05, 'epoch': 1.44}
{'loss': 2.3004, 'grad_norm': 13.038212776184082, 'learning_rate': 2.592649801485143e-05, 'epoch': 1.44}
{'loss': 2.4094, 'grad_norm': 29.541423797607422, 'learning_rate': 2.5915056236341377e-05, 'epoch': 1.45}
{'loss': 2.2615, 'grad_norm': 31.73858642578125, 'learning_rate': 2.5903614457831325e-05, 'epoch': 1.45}
{'loss': 2.1112, 'grad_norm': 16.852298736572266, 'learning_rate': 2.5892172679321273e-05, 'epoch': 1.45}
{'loss': 2.5019, 'grad_norm': 36.98277282714844, 'learning_rate': 2.5880730900811225e-05, 'epoch': 1.45}
{'loss': 2.2014, 'grad_norm': 20.009084701538086, 'learning_rate': 2.5869289122301173e-05, 'epoch': 1.45}
{'loss': 2.1973, 'grad_norm': 18.381633758544922, 'l

Saving model checkpoint to ./results\checkpoint-220000
Configuration saved in ./results\checkpoint-220000\config.json


{'loss': 2.155, 'grad_norm': 18.014034271240234, 'learning_rate': 2.4828087277886473e-05, 'epoch': 1.51}


Model weights saved in ./results\checkpoint-220000\model.safetensors
Deleting older checkpoint [results\checkpoint-200000] due to args.save_total_limit


{'loss': 2.4133, 'grad_norm': 19.73082160949707, 'learning_rate': 2.4816645499376425e-05, 'epoch': 1.51}
{'loss': 2.1514, 'grad_norm': 22.481510162353516, 'learning_rate': 2.4805203720866372e-05, 'epoch': 1.51}
{'loss': 2.1944, 'grad_norm': 21.44377326965332, 'learning_rate': 2.4793761942356324e-05, 'epoch': 1.51}
{'loss': 2.3999, 'grad_norm': 19.17879867553711, 'learning_rate': 2.4782320163846272e-05, 'epoch': 1.51}
{'loss': 2.224, 'grad_norm': 17.57513427734375, 'learning_rate': 2.477087838533622e-05, 'epoch': 1.51}
{'loss': 1.8244, 'grad_norm': 15.433242797851562, 'learning_rate': 2.4759436606826168e-05, 'epoch': 1.51}
{'loss': 2.1787, 'grad_norm': 21.447359085083008, 'learning_rate': 2.4747994828316116e-05, 'epoch': 1.52}
{'loss': 2.3802, 'grad_norm': 22.692604064941406, 'learning_rate': 2.4736553049806064e-05, 'epoch': 1.52}
{'loss': 2.2038, 'grad_norm': 17.96605110168457, 'learning_rate': 2.472511127129601e-05, 'epoch': 1.52}
{'loss': 2.2798, 'grad_norm': 21.5673828125, 'learning

Saving model checkpoint to ./results\checkpoint-230000
Configuration saved in ./results\checkpoint-230000\config.json


{'loss': 2.1648, 'grad_norm': 41.34564208984375, 'learning_rate': 2.3683909426881315e-05, 'epoch': 1.58}


Model weights saved in ./results\checkpoint-230000\model.safetensors
Deleting older checkpoint [results\checkpoint-210000] due to args.save_total_limit


{'loss': 2.2066, 'grad_norm': 19.441997528076172, 'learning_rate': 2.3672467648371263e-05, 'epoch': 1.58}
{'loss': 2.2961, 'grad_norm': 15.83720874786377, 'learning_rate': 2.366102586986121e-05, 'epoch': 1.58}
{'loss': 2.1883, 'grad_norm': 23.287343978881836, 'learning_rate': 2.364958409135116e-05, 'epoch': 1.58}
{'loss': 2.2566, 'grad_norm': 31.541776657104492, 'learning_rate': 2.3638142312841107e-05, 'epoch': 1.58}
{'loss': 2.0202, 'grad_norm': 12.64116096496582, 'learning_rate': 2.362670053433106e-05, 'epoch': 1.58}
{'loss': 2.147, 'grad_norm': 22.446489334106445, 'learning_rate': 2.3615258755821006e-05, 'epoch': 1.58}
{'loss': 2.2019, 'grad_norm': 20.79738998413086, 'learning_rate': 2.3603816977310954e-05, 'epoch': 1.58}
{'loss': 2.1733, 'grad_norm': 14.972790718078613, 'learning_rate': 2.3592375198800902e-05, 'epoch': 1.58}
{'loss': 2.3125, 'grad_norm': 17.14771270751953, 'learning_rate': 2.358093342029085e-05, 'epoch': 1.59}
{'loss': 2.2698, 'grad_norm': 19.97764778137207, 'learn

Saving model checkpoint to ./results\checkpoint-240000
Configuration saved in ./results\checkpoint-240000\config.json


{'loss': 2.2468, 'grad_norm': 28.29116439819336, 'learning_rate': 2.2539731575876154e-05, 'epoch': 1.65}


Model weights saved in ./results\checkpoint-240000\model.safetensors
Deleting older checkpoint [results\checkpoint-220000] due to args.save_total_limit


{'loss': 2.3228, 'grad_norm': 27.295787811279297, 'learning_rate': 2.2528289797366102e-05, 'epoch': 1.65}
{'loss': 2.2917, 'grad_norm': 0.3826369047164917, 'learning_rate': 2.251684801885605e-05, 'epoch': 1.65}
{'loss': 2.1553, 'grad_norm': 21.449371337890625, 'learning_rate': 2.2505406240346e-05, 'epoch': 1.65}
{'loss': 2.2176, 'grad_norm': 23.952022552490234, 'learning_rate': 2.249396446183595e-05, 'epoch': 1.65}
{'loss': 2.1592, 'grad_norm': 20.51802635192871, 'learning_rate': 2.2482522683325897e-05, 'epoch': 1.65}
{'loss': 2.2927, 'grad_norm': 18.562347412109375, 'learning_rate': 2.2471080904815845e-05, 'epoch': 1.65}
{'loss': 2.3011, 'grad_norm': 34.48555374145508, 'learning_rate': 2.2459639126305797e-05, 'epoch': 1.65}
{'loss': 2.1036, 'grad_norm': 10.020378112792969, 'learning_rate': 2.2448197347795745e-05, 'epoch': 1.65}
{'loss': 2.2044, 'grad_norm': 14.633383750915527, 'learning_rate': 2.2436755569285692e-05, 'epoch': 1.65}
{'loss': 2.1549, 'grad_norm': 17.81003761291504, 'lea

Saving model checkpoint to ./results\checkpoint-250000
Configuration saved in ./results\checkpoint-250000\config.json


{'loss': 2.3321, 'grad_norm': 23.588821411132812, 'learning_rate': 2.1395553724870996e-05, 'epoch': 1.72}


Model weights saved in ./results\checkpoint-250000\model.safetensors
Deleting older checkpoint [results\checkpoint-230000] due to args.save_total_limit


{'loss': 2.1146, 'grad_norm': 9.95433521270752, 'learning_rate': 2.1384111946360944e-05, 'epoch': 1.72}
{'loss': 2.2598, 'grad_norm': 19.34613037109375, 'learning_rate': 2.1372670167850892e-05, 'epoch': 1.72}
{'loss': 2.2823, 'grad_norm': 27.240711212158203, 'learning_rate': 2.136122838934084e-05, 'epoch': 1.72}
{'loss': 2.274, 'grad_norm': 17.166473388671875, 'learning_rate': 2.1349786610830788e-05, 'epoch': 1.72}
{'loss': 2.1447, 'grad_norm': 28.229679107666016, 'learning_rate': 2.1338344832320736e-05, 'epoch': 1.72}
{'loss': 2.0666, 'grad_norm': 25.877498626708984, 'learning_rate': 2.1326903053810684e-05, 'epoch': 1.72}
{'loss': 2.2084, 'grad_norm': 15.052375793457031, 'learning_rate': 2.1315461275300632e-05, 'epoch': 1.72}
{'loss': 2.2461, 'grad_norm': 15.99804401397705, 'learning_rate': 2.130401949679058e-05, 'epoch': 1.72}
{'loss': 2.3327, 'grad_norm': 24.13697624206543, 'learning_rate': 2.1292577718280528e-05, 'epoch': 1.72}
{'loss': 2.1516, 'grad_norm': 24.833051681518555, 'lea

Saving model checkpoint to ./results\checkpoint-260000
Configuration saved in ./results\checkpoint-260000\config.json


{'loss': 2.0706, 'grad_norm': 16.020021438598633, 'learning_rate': 2.0251375873865835e-05, 'epoch': 1.78}


Model weights saved in ./results\checkpoint-260000\model.safetensors
Deleting older checkpoint [results\checkpoint-240000] due to args.save_total_limit


{'loss': 2.3948, 'grad_norm': 16.992809295654297, 'learning_rate': 2.0239934095355783e-05, 'epoch': 1.79}
{'loss': 2.2451, 'grad_norm': 18.251070022583008, 'learning_rate': 2.022849231684573e-05, 'epoch': 1.79}
{'loss': 2.0317, 'grad_norm': 25.097911834716797, 'learning_rate': 2.021705053833568e-05, 'epoch': 1.79}
{'loss': 2.1571, 'grad_norm': 26.436595916748047, 'learning_rate': 2.0205608759825627e-05, 'epoch': 1.79}
{'loss': 2.2405, 'grad_norm': 22.243301391601562, 'learning_rate': 2.0194166981315575e-05, 'epoch': 1.79}
{'loss': 2.2102, 'grad_norm': 20.86471939086914, 'learning_rate': 2.0182725202805526e-05, 'epoch': 1.79}
{'loss': 2.1664, 'grad_norm': 19.093948364257812, 'learning_rate': 2.0171283424295474e-05, 'epoch': 1.79}
{'loss': 2.2139, 'grad_norm': 16.150436401367188, 'learning_rate': 2.0159841645785422e-05, 'epoch': 1.79}
{'loss': 2.2879, 'grad_norm': 32.7116813659668, 'learning_rate': 2.014839986727537e-05, 'epoch': 1.79}
{'loss': 2.2046, 'grad_norm': 17.3265380859375, 'lea

Saving model checkpoint to ./results\checkpoint-270000
Configuration saved in ./results\checkpoint-270000\config.json


{'loss': 2.2141, 'grad_norm': 15.870306968688965, 'learning_rate': 1.9107198022860674e-05, 'epoch': 1.85}


Model weights saved in ./results\checkpoint-270000\model.safetensors
Deleting older checkpoint [results\checkpoint-250000] due to args.save_total_limit


{'loss': 2.1878, 'grad_norm': 17.813364028930664, 'learning_rate': 1.9095756244350625e-05, 'epoch': 1.85}
{'loss': 2.2185, 'grad_norm': 31.615467071533203, 'learning_rate': 1.9084314465840573e-05, 'epoch': 1.85}
{'loss': 2.2176, 'grad_norm': 19.28611946105957, 'learning_rate': 1.907287268733052e-05, 'epoch': 1.86}
{'loss': 2.2479, 'grad_norm': 22.158031463623047, 'learning_rate': 1.906143090882047e-05, 'epoch': 1.86}
{'loss': 2.1864, 'grad_norm': 17.302051544189453, 'learning_rate': 1.9049989130310417e-05, 'epoch': 1.86}
{'loss': 2.1746, 'grad_norm': 21.177898406982422, 'learning_rate': 1.9038547351800365e-05, 'epoch': 1.86}
{'loss': 2.2924, 'grad_norm': 17.571247100830078, 'learning_rate': 1.9027105573290313e-05, 'epoch': 1.86}
{'loss': 2.1872, 'grad_norm': 27.287220001220703, 'learning_rate': 1.901566379478026e-05, 'epoch': 1.86}
{'loss': 2.2465, 'grad_norm': 21.369670867919922, 'learning_rate': 1.900422201627021e-05, 'epoch': 1.86}
{'loss': 2.2243, 'grad_norm': 22.18438148498535, 'l

Saving model checkpoint to ./results\checkpoint-280000
Configuration saved in ./results\checkpoint-280000\config.json


{'loss': 2.066, 'grad_norm': 16.819372177124023, 'learning_rate': 1.7963020171855513e-05, 'epoch': 1.92}


Model weights saved in ./results\checkpoint-280000\model.safetensors
Deleting older checkpoint [results\checkpoint-260000] due to args.save_total_limit


{'loss': 2.1958, 'grad_norm': 18.528175354003906, 'learning_rate': 1.795157839334546e-05, 'epoch': 1.92}
{'loss': 2.1934, 'grad_norm': 24.92327880859375, 'learning_rate': 1.7940136614835412e-05, 'epoch': 1.92}
{'loss': 2.2875, 'grad_norm': 23.71475601196289, 'learning_rate': 1.792869483632536e-05, 'epoch': 1.92}
{'loss': 2.2436, 'grad_norm': 21.95000457763672, 'learning_rate': 1.7917253057815308e-05, 'epoch': 1.92}
{'loss': 2.1494, 'grad_norm': 14.834589004516602, 'learning_rate': 1.7905811279305256e-05, 'epoch': 1.93}
{'loss': 1.9813, 'grad_norm': 20.623552322387695, 'learning_rate': 1.7894369500795204e-05, 'epoch': 1.93}
{'loss': 2.2357, 'grad_norm': 24.359678268432617, 'learning_rate': 1.7882927722285152e-05, 'epoch': 1.93}
{'loss': 2.2011, 'grad_norm': 14.52773380279541, 'learning_rate': 1.78714859437751e-05, 'epoch': 1.93}
{'loss': 2.3912, 'grad_norm': 11.946833610534668, 'learning_rate': 1.786004416526505e-05, 'epoch': 1.93}
{'loss': 2.3454, 'grad_norm': 24.160688400268555, 'lear

Saving model checkpoint to ./results\checkpoint-290000
Configuration saved in ./results\checkpoint-290000\config.json


{'loss': 2.2719, 'grad_norm': 26.636005401611328, 'learning_rate': 1.681884232085035e-05, 'epoch': 1.99}


Model weights saved in ./results\checkpoint-290000\model.safetensors
Deleting older checkpoint [results\checkpoint-270000] due to args.save_total_limit


{'loss': 2.0418, 'grad_norm': 13.830501556396484, 'learning_rate': 1.6807400542340303e-05, 'epoch': 1.99}
{'loss': 2.1563, 'grad_norm': 28.17438507080078, 'learning_rate': 1.679595876383025e-05, 'epoch': 1.99}
{'loss': 1.9682, 'grad_norm': 28.246871948242188, 'learning_rate': 1.67845169853202e-05, 'epoch': 1.99}
{'loss': 2.183, 'grad_norm': 20.1605224609375, 'learning_rate': 1.6773075206810147e-05, 'epoch': 1.99}
{'loss': 2.1891, 'grad_norm': 34.19792938232422, 'learning_rate': 1.6761633428300098e-05, 'epoch': 1.99}
{'loss': 2.2442, 'grad_norm': 17.59095001220703, 'learning_rate': 1.6750191649790046e-05, 'epoch': 1.99}
{'loss': 2.1556, 'grad_norm': 19.200855255126953, 'learning_rate': 1.6738749871279994e-05, 'epoch': 2.0}
{'loss': 2.2199, 'grad_norm': 22.0750675201416, 'learning_rate': 1.6727308092769942e-05, 'epoch': 2.0}
{'loss': 2.1905, 'grad_norm': 22.168277740478516, 'learning_rate': 1.671586631425989e-05, 'epoch': 2.0}
{'loss': 2.1626, 'grad_norm': 24.553722381591797, 'learning_r

Saving model checkpoint to ./results\checkpoint-300000
Configuration saved in ./results\checkpoint-300000\config.json


{'loss': 2.1142, 'grad_norm': 23.062583923339844, 'learning_rate': 1.5674664469845194e-05, 'epoch': 2.06}


Model weights saved in ./results\checkpoint-300000\model.safetensors
Deleting older checkpoint [results\checkpoint-280000] due to args.save_total_limit


{'loss': 2.165, 'grad_norm': 29.602869033813477, 'learning_rate': 1.5663222691335142e-05, 'epoch': 2.06}
{'loss': 2.2262, 'grad_norm': 11.295178413391113, 'learning_rate': 1.565178091282509e-05, 'epoch': 2.06}
{'loss': 2.1342, 'grad_norm': 13.038511276245117, 'learning_rate': 1.5640339134315038e-05, 'epoch': 2.06}
{'loss': 2.2387, 'grad_norm': 31.439672470092773, 'learning_rate': 1.5628897355804986e-05, 'epoch': 2.06}
{'loss': 2.0208, 'grad_norm': 24.37904930114746, 'learning_rate': 1.5617455577294934e-05, 'epoch': 2.06}
{'loss': 2.2834, 'grad_norm': 15.50900936126709, 'learning_rate': 1.560601379878488e-05, 'epoch': 2.06}
{'loss': 2.1812, 'grad_norm': 23.371826171875, 'learning_rate': 1.5594572020274833e-05, 'epoch': 2.06}
{'loss': 2.0473, 'grad_norm': 24.15560531616211, 'learning_rate': 1.558313024176478e-05, 'epoch': 2.07}
{'loss': 2.0719, 'grad_norm': 4.173125267028809, 'learning_rate': 1.557168846325473e-05, 'epoch': 2.07}
{'loss': 2.0295, 'grad_norm': 17.066932678222656, 'learnin

Saving model checkpoint to ./results\checkpoint-310000
Configuration saved in ./results\checkpoint-310000\config.json


{'loss': 2.0984, 'grad_norm': 1.6004258394241333, 'learning_rate': 1.4530486618840033e-05, 'epoch': 2.13}


Model weights saved in ./results\checkpoint-310000\model.safetensors
Deleting older checkpoint [results\checkpoint-290000] due to args.save_total_limit


{'loss': 2.0452, 'grad_norm': 19.630916595458984, 'learning_rate': 1.4519044840329982e-05, 'epoch': 2.13}
{'loss': 2.2597, 'grad_norm': 19.20631980895996, 'learning_rate': 1.450760306181993e-05, 'epoch': 2.13}
{'loss': 2.0731, 'grad_norm': 16.869508743286133, 'learning_rate': 1.4496161283309878e-05, 'epoch': 2.13}
{'loss': 2.1177, 'grad_norm': 40.95354461669922, 'learning_rate': 1.4484719504799826e-05, 'epoch': 2.13}
{'loss': 2.1122, 'grad_norm': 23.26268196105957, 'learning_rate': 1.4473277726289774e-05, 'epoch': 2.13}
{'loss': 2.1655, 'grad_norm': 28.054880142211914, 'learning_rate': 1.4461835947779722e-05, 'epoch': 2.13}
{'loss': 2.1643, 'grad_norm': 18.6295166015625, 'learning_rate': 1.4450394169269672e-05, 'epoch': 2.13}
{'loss': 2.0503, 'grad_norm': 5.719363212585449, 'learning_rate': 1.443895239075962e-05, 'epoch': 2.13}
{'loss': 2.2436, 'grad_norm': 22.957651138305664, 'learning_rate': 1.4427510612249571e-05, 'epoch': 2.13}
{'loss': 2.1737, 'grad_norm': 17.213607788085938, 'lea

Saving model checkpoint to ./results\checkpoint-320000
Configuration saved in ./results\checkpoint-320000\config.json


{'loss': 2.1342, 'grad_norm': 24.1651611328125, 'learning_rate': 1.3386308767834873e-05, 'epoch': 2.2}


Model weights saved in ./results\checkpoint-320000\model.safetensors
Deleting older checkpoint [results\checkpoint-300000] due to args.save_total_limit


{'loss': 2.3004, 'grad_norm': 21.536500930786133, 'learning_rate': 1.3374866989324821e-05, 'epoch': 2.2}
{'loss': 1.9838, 'grad_norm': 11.71746826171875, 'learning_rate': 1.336342521081477e-05, 'epoch': 2.2}
{'loss': 2.1762, 'grad_norm': 20.964494705200195, 'learning_rate': 1.3351983432304719e-05, 'epoch': 2.2}
{'loss': 2.2393, 'grad_norm': 20.570009231567383, 'learning_rate': 1.3340541653794667e-05, 'epoch': 2.2}
{'loss': 2.0767, 'grad_norm': 25.800928115844727, 'learning_rate': 1.3329099875284615e-05, 'epoch': 2.2}
{'loss': 2.2328, 'grad_norm': 26.51878547668457, 'learning_rate': 1.3317658096774563e-05, 'epoch': 2.2}
{'loss': 2.2871, 'grad_norm': 31.74241828918457, 'learning_rate': 1.330621631826451e-05, 'epoch': 2.2}
{'loss': 2.1039, 'grad_norm': 22.16261100769043, 'learning_rate': 1.329477453975446e-05, 'epoch': 2.2}
{'loss': 2.1167, 'grad_norm': 24.995805740356445, 'learning_rate': 1.3283332761244408e-05, 'epoch': 2.2}
{'loss': 2.0732, 'grad_norm': 19.610401153564453, 'learning_ra

Saving model checkpoint to ./results\checkpoint-330000
Configuration saved in ./results\checkpoint-330000\config.json


{'loss': 2.1842, 'grad_norm': 27.474178314208984, 'learning_rate': 1.2242130916829712e-05, 'epoch': 2.27}


Model weights saved in ./results\checkpoint-330000\model.safetensors
Deleting older checkpoint [results\checkpoint-310000] due to args.save_total_limit


{'loss': 2.0101, 'grad_norm': 14.189931869506836, 'learning_rate': 1.223068913831966e-05, 'epoch': 2.27}
{'loss': 1.9836, 'grad_norm': 16.486602783203125, 'learning_rate': 1.221924735980961e-05, 'epoch': 2.27}
{'loss': 2.1014, 'grad_norm': 17.22929573059082, 'learning_rate': 1.220780558129956e-05, 'epoch': 2.27}
{'loss': 2.1365, 'grad_norm': 13.840583801269531, 'learning_rate': 1.2196363802789507e-05, 'epoch': 2.27}
{'loss': 2.0227, 'grad_norm': 18.755983352661133, 'learning_rate': 1.2184922024279455e-05, 'epoch': 2.27}
{'loss': 1.9167, 'grad_norm': 20.624765396118164, 'learning_rate': 1.2173480245769403e-05, 'epoch': 2.27}
{'loss': 2.1865, 'grad_norm': 29.30479621887207, 'learning_rate': 1.2162038467259351e-05, 'epoch': 2.27}
{'loss': 1.9032, 'grad_norm': 24.05643653869629, 'learning_rate': 1.2150596688749299e-05, 'epoch': 2.27}
{'loss': 2.1038, 'grad_norm': 17.975505828857422, 'learning_rate': 1.2139154910239247e-05, 'epoch': 2.27}
{'loss': 2.0464, 'grad_norm': 19.220407485961914, 'l

Saving model checkpoint to ./results\checkpoint-340000
Configuration saved in ./results\checkpoint-340000\config.json


{'loss': 2.0968, 'grad_norm': 17.834943771362305, 'learning_rate': 1.1097953065824552e-05, 'epoch': 2.33}


Model weights saved in ./results\checkpoint-340000\model.safetensors
Deleting older checkpoint [results\checkpoint-320000] due to args.save_total_limit


{'loss': 1.9635, 'grad_norm': 37.21354293823242, 'learning_rate': 1.10865112873145e-05, 'epoch': 2.33}
{'loss': 1.9032, 'grad_norm': 11.13888168334961, 'learning_rate': 1.1075069508804448e-05, 'epoch': 2.34}
{'loss': 2.0317, 'grad_norm': 26.076839447021484, 'learning_rate': 1.1063627730294398e-05, 'epoch': 2.34}
{'loss': 2.1874, 'grad_norm': 22.882539749145508, 'learning_rate': 1.1052185951784346e-05, 'epoch': 2.34}
{'loss': 2.0473, 'grad_norm': 11.27231502532959, 'learning_rate': 1.1040744173274294e-05, 'epoch': 2.34}
{'loss': 2.0885, 'grad_norm': 31.956403732299805, 'learning_rate': 1.1029302394764244e-05, 'epoch': 2.34}
{'loss': 2.17, 'grad_norm': 17.824966430664062, 'learning_rate': 1.1017860616254191e-05, 'epoch': 2.34}
{'loss': 2.1755, 'grad_norm': 13.895376205444336, 'learning_rate': 1.100641883774414e-05, 'epoch': 2.34}
{'loss': 1.962, 'grad_norm': 55.76495361328125, 'learning_rate': 1.0994977059234087e-05, 'epoch': 2.34}
{'loss': 2.1744, 'grad_norm': 29.497596740722656, 'learn

Saving model checkpoint to ./results\checkpoint-350000
Configuration saved in ./results\checkpoint-350000\config.json


{'loss': 2.2182, 'grad_norm': 16.65206527709961, 'learning_rate': 9.953775214819391e-06, 'epoch': 2.4}


Model weights saved in ./results\checkpoint-350000\model.safetensors
Deleting older checkpoint [results\checkpoint-330000] due to args.save_total_limit


{'loss': 2.129, 'grad_norm': 14.990521430969238, 'learning_rate': 9.942333436309341e-06, 'epoch': 2.4}
{'loss': 2.0675, 'grad_norm': 24.474964141845703, 'learning_rate': 9.930891657799289e-06, 'epoch': 2.4}
{'loss': 2.0974, 'grad_norm': 18.944107055664062, 'learning_rate': 9.919449879289237e-06, 'epoch': 2.4}
{'loss': 2.0761, 'grad_norm': 20.22848892211914, 'learning_rate': 9.908008100779185e-06, 'epoch': 2.41}
{'loss': 2.0139, 'grad_norm': 17.82283592224121, 'learning_rate': 9.896566322269134e-06, 'epoch': 2.41}
{'loss': 1.9372, 'grad_norm': 24.701387405395508, 'learning_rate': 9.885124543759082e-06, 'epoch': 2.41}
{'loss': 2.1547, 'grad_norm': 26.941753387451172, 'learning_rate': 9.87368276524903e-06, 'epoch': 2.41}
{'loss': 2.105, 'grad_norm': 19.240766525268555, 'learning_rate': 9.86224098673898e-06, 'epoch': 2.41}
{'loss': 2.1669, 'grad_norm': 25.88138771057129, 'learning_rate': 9.850799208228928e-06, 'epoch': 2.41}
{'loss': 2.0995, 'grad_norm': 15.099312782287598, 'learning_rate'

Saving model checkpoint to ./results\checkpoint-360000
Configuration saved in ./results\checkpoint-360000\config.json


{'loss': 2.1668, 'grad_norm': 14.950942039489746, 'learning_rate': 8.809597363814232e-06, 'epoch': 2.47}


Model weights saved in ./results\checkpoint-360000\model.safetensors
Deleting older checkpoint [results\checkpoint-340000] due to args.save_total_limit


{'loss': 1.9714, 'grad_norm': 17.99595832824707, 'learning_rate': 8.79815558530418e-06, 'epoch': 2.47}
{'loss': 2.2852, 'grad_norm': 58.704803466796875, 'learning_rate': 8.786713806794128e-06, 'epoch': 2.47}
{'loss': 2.0753, 'grad_norm': 12.051910400390625, 'learning_rate': 8.775272028284077e-06, 'epoch': 2.47}
{'loss': 2.061, 'grad_norm': 27.3834228515625, 'learning_rate': 8.763830249774025e-06, 'epoch': 2.47}
{'loss': 2.1391, 'grad_norm': 21.722564697265625, 'learning_rate': 8.752388471263973e-06, 'epoch': 2.47}
{'loss': 1.9684, 'grad_norm': 19.13446044921875, 'learning_rate': 8.740946692753923e-06, 'epoch': 2.48}
{'loss': 2.1521, 'grad_norm': 18.342315673828125, 'learning_rate': 8.72950491424387e-06, 'epoch': 2.48}
{'loss': 2.1243, 'grad_norm': 20.58599281311035, 'learning_rate': 8.718063135733819e-06, 'epoch': 2.48}
{'loss': 2.247, 'grad_norm': 16.13445281982422, 'learning_rate': 8.706621357223767e-06, 'epoch': 2.48}
{'loss': 2.0499, 'grad_norm': 25.001644134521484, 'learning_rate'

Saving model checkpoint to ./results\checkpoint-370000
Configuration saved in ./results\checkpoint-370000\config.json


{'loss': 2.1081, 'grad_norm': 29.009735107421875, 'learning_rate': 7.665419512809072e-06, 'epoch': 2.54}


Model weights saved in ./results\checkpoint-370000\model.safetensors
Deleting older checkpoint [results\checkpoint-350000] due to args.save_total_limit


{'loss': 2.0615, 'grad_norm': 26.713163375854492, 'learning_rate': 7.65397773429902e-06, 'epoch': 2.54}
{'loss': 2.0489, 'grad_norm': 21.00713539123535, 'learning_rate': 7.642535955788968e-06, 'epoch': 2.54}
{'loss': 2.1683, 'grad_norm': 25.56529998779297, 'learning_rate': 7.631094177278916e-06, 'epoch': 2.54}
{'loss': 2.0859, 'grad_norm': 12.950047492980957, 'learning_rate': 7.619652398768865e-06, 'epoch': 2.54}
{'loss': 2.165, 'grad_norm': 18.621522903442383, 'learning_rate': 7.608210620258813e-06, 'epoch': 2.54}
{'loss': 2.0754, 'grad_norm': 18.375051498413086, 'learning_rate': 7.5967688417487625e-06, 'epoch': 2.54}
{'loss': 2.031, 'grad_norm': 28.461584091186523, 'learning_rate': 7.5853270632387105e-06, 'epoch': 2.54}
{'loss': 2.1645, 'grad_norm': 17.500341415405273, 'learning_rate': 7.573885284728658e-06, 'epoch': 2.55}
{'loss': 2.1081, 'grad_norm': 20.933547973632812, 'learning_rate': 7.562443506218607e-06, 'epoch': 2.55}
{'loss': 2.066, 'grad_norm': 40.26567077636719, 'learning_

Saving model checkpoint to ./results\checkpoint-380000
Configuration saved in ./results\checkpoint-380000\config.json


{'loss': 2.1648, 'grad_norm': 20.83769416809082, 'learning_rate': 6.52124166180391e-06, 'epoch': 2.61}


Model weights saved in ./results\checkpoint-380000\model.safetensors
Deleting older checkpoint [results\checkpoint-360000] due to args.save_total_limit


{'loss': 2.0784, 'grad_norm': 21.983633041381836, 'learning_rate': 6.50979988329386e-06, 'epoch': 2.61}
{'loss': 2.0601, 'grad_norm': 27.1513729095459, 'learning_rate': 6.498358104783808e-06, 'epoch': 2.61}
{'loss': 2.2364, 'grad_norm': 19.23900032043457, 'learning_rate': 6.486916326273757e-06, 'epoch': 2.61}
{'loss': 2.1231, 'grad_norm': 13.487345695495605, 'learning_rate': 6.475474547763705e-06, 'epoch': 2.61}
{'loss': 2.1229, 'grad_norm': 19.80852508544922, 'learning_rate': 6.4640327692536525e-06, 'epoch': 2.61}
{'loss': 2.0911, 'grad_norm': 26.257492065429688, 'learning_rate': 6.452590990743601e-06, 'epoch': 2.61}
{'loss': 2.3015, 'grad_norm': 18.213396072387695, 'learning_rate': 6.441149212233549e-06, 'epoch': 2.61}
{'loss': 2.0769, 'grad_norm': 15.041596412658691, 'learning_rate': 6.429707433723499e-06, 'epoch': 2.61}
{'loss': 2.1704, 'grad_norm': 18.132902145385742, 'learning_rate': 6.418265655213447e-06, 'epoch': 2.61}
{'loss': 2.0486, 'grad_norm': 24.96137237548828, 'learning_

Saving model checkpoint to ./results\checkpoint-390000
Configuration saved in ./results\checkpoint-390000\config.json


{'loss': 1.9546, 'grad_norm': 14.611321449279785, 'learning_rate': 5.377063810798751e-06, 'epoch': 2.68}


Model weights saved in ./results\checkpoint-390000\model.safetensors
Deleting older checkpoint [results\checkpoint-370000] due to args.save_total_limit


{'loss': 1.9894, 'grad_norm': 18.392377853393555, 'learning_rate': 5.3656220322886995e-06, 'epoch': 2.68}
{'loss': 2.0434, 'grad_norm': 28.463470458984375, 'learning_rate': 5.3541802537786475e-06, 'epoch': 2.68}
{'loss': 2.1024, 'grad_norm': 18.23686981201172, 'learning_rate': 5.342738475268596e-06, 'epoch': 2.68}
{'loss': 1.9735, 'grad_norm': 23.929216384887695, 'learning_rate': 5.331296696758544e-06, 'epoch': 2.68}
{'loss': 2.0748, 'grad_norm': 16.460153579711914, 'learning_rate': 5.319854918248493e-06, 'epoch': 2.68}
{'loss': 2.1293, 'grad_norm': 25.638952255249023, 'learning_rate': 5.308413139738441e-06, 'epoch': 2.68}
{'loss': 2.1744, 'grad_norm': 35.255123138427734, 'learning_rate': 5.29697136122839e-06, 'epoch': 2.68}
{'loss': 1.9215, 'grad_norm': 26.21482276916504, 'learning_rate': 5.285529582718338e-06, 'epoch': 2.68}
{'loss': 2.0737, 'grad_norm': 20.590198516845703, 'learning_rate': 5.2740878042082865e-06, 'epoch': 2.68}
{'loss': 2.2075, 'grad_norm': 36.60570526123047, 'learn

Saving model checkpoint to ./results\checkpoint-400000
Configuration saved in ./results\checkpoint-400000\config.json


{'loss': 2.1834, 'grad_norm': 20.124595642089844, 'learning_rate': 4.23288595979359e-06, 'epoch': 2.75}


Model weights saved in ./results\checkpoint-400000\model.safetensors
Deleting older checkpoint [results\checkpoint-380000] due to args.save_total_limit


{'loss': 2.1345, 'grad_norm': 32.49067306518555, 'learning_rate': 4.221444181283539e-06, 'epoch': 2.75}
{'loss': 2.1987, 'grad_norm': 16.752979278564453, 'learning_rate': 4.210002402773487e-06, 'epoch': 2.75}
{'loss': 2.1187, 'grad_norm': 21.879941940307617, 'learning_rate': 4.198560624263435e-06, 'epoch': 2.75}
{'loss': 2.0634, 'grad_norm': 25.541030883789062, 'learning_rate': 4.187118845753384e-06, 'epoch': 2.75}
{'loss': 2.0185, 'grad_norm': 17.56687355041504, 'learning_rate': 4.175677067243333e-06, 'epoch': 2.75}
{'loss': 2.1772, 'grad_norm': 12.832354545593262, 'learning_rate': 4.164235288733281e-06, 'epoch': 2.75}
{'loss': 2.1504, 'grad_norm': 17.800085067749023, 'learning_rate': 4.1527935102232294e-06, 'epoch': 2.75}
{'loss': 2.1578, 'grad_norm': 29.297401428222656, 'learning_rate': 4.141351731713178e-06, 'epoch': 2.75}
{'loss': 2.0672, 'grad_norm': 24.846702575683594, 'learning_rate': 4.129909953203126e-06, 'epoch': 2.75}
{'loss': 2.1938, 'grad_norm': 16.10456085205078, 'learni

Saving model checkpoint to ./results\checkpoint-410000
Configuration saved in ./results\checkpoint-410000\config.json


{'loss': 2.0171, 'grad_norm': 38.012550354003906, 'learning_rate': 3.08870810878843e-06, 'epoch': 2.81}


Model weights saved in ./results\checkpoint-410000\model.safetensors
Deleting older checkpoint [results\checkpoint-390000] due to args.save_total_limit


{'loss': 1.9753, 'grad_norm': 21.104280471801758, 'learning_rate': 3.077266330278379e-06, 'epoch': 2.82}
{'loss': 1.9815, 'grad_norm': 15.460317611694336, 'learning_rate': 3.065824551768327e-06, 'epoch': 2.82}
{'loss': 2.0282, 'grad_norm': 16.766674041748047, 'learning_rate': 3.0543827732582756e-06, 'epoch': 2.82}
{'loss': 2.1366, 'grad_norm': 26.046306610107422, 'learning_rate': 3.0429409947482236e-06, 'epoch': 2.82}
{'loss': 2.0759, 'grad_norm': 16.826993942260742, 'learning_rate': 3.0314992162381723e-06, 'epoch': 2.82}
{'loss': 2.1016, 'grad_norm': 13.473711013793945, 'learning_rate': 3.0200574377281207e-06, 'epoch': 2.82}
{'loss': 2.0658, 'grad_norm': 26.01637077331543, 'learning_rate': 3.008615659218069e-06, 'epoch': 2.82}
{'loss': 2.1116, 'grad_norm': 26.32892417907715, 'learning_rate': 2.9971738807080175e-06, 'epoch': 2.82}
{'loss': 2.0711, 'grad_norm': 26.364892959594727, 'learning_rate': 2.985732102197966e-06, 'epoch': 2.82}
{'loss': 2.0216, 'grad_norm': 27.031997680664062, 'l

Saving model checkpoint to ./results\checkpoint-420000
Configuration saved in ./results\checkpoint-420000\config.json


{'loss': 1.9681, 'grad_norm': 19.580829620361328, 'learning_rate': 1.94453025778327e-06, 'epoch': 2.88}


Model weights saved in ./results\checkpoint-420000\model.safetensors
Deleting older checkpoint [results\checkpoint-400000] due to args.save_total_limit


{'loss': 2.0654, 'grad_norm': 15.8417387008667, 'learning_rate': 1.933088479273218e-06, 'epoch': 2.88}
{'loss': 2.0531, 'grad_norm': 25.773788452148438, 'learning_rate': 1.921646700763167e-06, 'epoch': 2.88}
{'loss': 1.9648, 'grad_norm': 0.621884286403656, 'learning_rate': 1.910204922253115e-06, 'epoch': 2.89}
{'loss': 2.2149, 'grad_norm': 21.82547378540039, 'learning_rate': 1.8987631437430636e-06, 'epoch': 2.89}
{'loss': 2.088, 'grad_norm': 45.974796295166016, 'learning_rate': 1.8873213652330118e-06, 'epoch': 2.89}
{'loss': 2.2409, 'grad_norm': 25.54612159729004, 'learning_rate': 1.8758795867229604e-06, 'epoch': 2.89}
{'loss': 2.2173, 'grad_norm': 37.673770904541016, 'learning_rate': 1.8644378082129085e-06, 'epoch': 2.89}
{'loss': 2.04, 'grad_norm': 10.440642356872559, 'learning_rate': 1.8529960297028571e-06, 'epoch': 2.89}
{'loss': 1.979, 'grad_norm': 18.06093406677246, 'learning_rate': 1.8415542511928057e-06, 'epoch': 2.89}
{'loss': 2.1855, 'grad_norm': 23.13888931274414, 'learning_

Saving model checkpoint to ./results\checkpoint-430000
Configuration saved in ./results\checkpoint-430000\config.json


{'loss': 2.0039, 'grad_norm': 25.44904899597168, 'learning_rate': 8.003524067781097e-07, 'epoch': 2.95}


Model weights saved in ./results\checkpoint-430000\model.safetensors
Deleting older checkpoint [results\checkpoint-410000] due to args.save_total_limit


{'loss': 2.0743, 'grad_norm': 14.663187026977539, 'learning_rate': 7.889106282680581e-07, 'epoch': 2.95}
{'loss': 2.173, 'grad_norm': 29.198667526245117, 'learning_rate': 7.774688497580064e-07, 'epoch': 2.95}
{'loss': 2.2421, 'grad_norm': 14.411456108093262, 'learning_rate': 7.660270712479548e-07, 'epoch': 2.95}
{'loss': 2.0204, 'grad_norm': 12.599611282348633, 'learning_rate': 7.545852927379032e-07, 'epoch': 2.95}
{'loss': 2.1024, 'grad_norm': 16.336156845092773, 'learning_rate': 7.431435142278516e-07, 'epoch': 2.96}
{'loss': 2.2775, 'grad_norm': 18.42116355895996, 'learning_rate': 7.317017357178e-07, 'epoch': 2.96}
{'loss': 2.0329, 'grad_norm': 25.307931900024414, 'learning_rate': 7.202599572077484e-07, 'epoch': 2.96}
{'loss': 2.0906, 'grad_norm': 26.186702728271484, 'learning_rate': 7.088181786976968e-07, 'epoch': 2.96}
{'loss': 2.0312, 'grad_norm': 26.278783798217773, 'learning_rate': 6.973764001876451e-07, 'epoch': 2.96}
{'loss': 1.9955, 'grad_norm': 33.63789367675781, 'learning_r



Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 151672.5055, 'train_samples_per_second': 23.049, 'train_steps_per_second': 2.881, 'train_loss': 2.235039036472468, 'epoch': 3.0}


TrainOutput(global_step=436995, training_loss=2.235039036472468, metrics={'train_runtime': 151672.5055, 'train_samples_per_second': 23.049, 'train_steps_per_second': 2.881, 'train_loss': 2.235039036472468, 'epoch': 3.0})

In [None]:
model.save_pretrained("final model2")


Configuration saved in final model2\config.json


Model weights saved in final model2\model.safetensors


In [2]:
model.save_pretrained("final model 4/28", safe_serialization=False)

NameError: name 'model' is not defined