In [1]:
import warnings # 忽略警告
warnings.simplefilter("ignore")
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from utils import get_prompt, get_bnb_config

bin d:\Users\Ray Lee\anaconda3\envs\adl3\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


In [2]:
model_name = "yentinglin/Taiwan-LLM-7B-v2.0-chat"

new_model = "1129"

model_path = "./1129"  # 更改為您的路徑

In [4]:
# Quantized LLMs with Low-Rank Adapters (QLoRA) parameters
################################################################################
lora_r = 4
lora_alpha = 16
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters 輕量級封裝，專門用於CUDA自定義函數，特別是8位優化器、矩陣乘法和量化
################################################################################
use_4bit = True
bnb_4bit_compute_dtype = "bfloat16" # float16 or bfloat16
bnb_4bit_quant_type = "nf4" # fp4 or nf4
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################
output_dir = "./results"
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 5
logging_steps = 5

################################################################################
# Supervised finetuning (SFT) parameters
################################################################################
max_seq_length = None
packing = False
device_map = {"": 0} #{"": 0} or "auto"

In [5]:
train_dataset = load_dataset('json', data_files='G:/HW3/data/train_1.json', split="train")  # 從JSON文件中載入訓練數據集
valid_dataset = load_dataset('json', data_files='G:/HW3/data/valid_1.json', split="train")

Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1001.74it/s]
Generating train split: 3200 examples [00:00, 34114.77 examples/s]
Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1001.03it/s]
Generating train split: 800 examples [00:00, 160447.72 examples/s]


In [6]:
train_dataset = train_dataset.map(
    lambda examples: {
        'instruction': [get_prompt(instruction) for instruction in examples['instruction']],
        'output': examples['output']
    },
    batched=True
)

Map: 100%|██████████| 3200/3200 [00:00<00:00, 84409.95 examples/s]


In [7]:
valid_dataset = valid_dataset.map(
    lambda examples: {
        'instruction': [get_prompt(instruction) for instruction in examples['instruction']],
        'output': examples['output']
    },
    batched=True
)

Map: 100%|██████████| 800/800 [00:00<00:00, 50136.62 examples/s]


In [8]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# 檢查 GPU 是否與 bfloat16 相容
if compute_dtype == torch.bfloat16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


In [9]:
# 載入與模型對應的分詞器
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

In [10]:
# Example of freeing GPU memory
torch.cuda.empty_cache()

In [12]:
# 從預訓練模型中載入自動生成模型
model = AutoModelForCausalLM.from_pretrained(
    "../Taiwan-LLM-7B-v2.0-chat",
    quantization_config=get_bnb_config(),
    torch_dtype=torch.bfloat16,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.88s/it]
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at ../Taiwan-LLM-7B-v2.0-chat and are newly initialized: ['model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.la

In [13]:
# 定義 Prompt Engineering Fine-Tuning （PEFT）的相關設定
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [14]:
# 設置訓練參數
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard", #"all"
    evaluation_strategy="steps",
    eval_steps=5  # 每5步驗證
)

In [15]:
# 使用 SFTTrainer 進行監督式微調訓練
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset, # 在這裡傳入驗證集
    eval_dataset=valid_dataset,
    peft_config=peft_config,
    dataset_text_field="instruction",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# 開始訓練模型
trainer.train()

Map: 100%|██████████| 3200/3200 [00:00<00:00, 17154.81 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 16014.45 examples/s]
  0%|          | 0/800 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  1%|          | 5/800 [00:12<26:35,  2.01s/it]  

{'loss': 1.6367, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.01}


                                               
  1%|          | 5/800 [01:54<26:35,  2.01s/it]  

{'eval_loss': 1.3150025606155396, 'eval_runtime': 102.3725, 'eval_samples_per_second': 7.815, 'eval_steps_per_second': 0.977, 'epoch': 0.01}


  1%|▏         | 10/800 [02:02<2:01:03,  9.19s/it]

{'loss': 1.4276, 'learning_rate': 8.333333333333334e-05, 'epoch': 0.01}


                                                  
  1%|▏         | 10/800 [03:45<2:01:03,  9.19s/it]

{'eval_loss': 1.2876046895980835, 'eval_runtime': 102.6616, 'eval_samples_per_second': 7.793, 'eval_steps_per_second': 0.974, 'epoch': 0.01}


  2%|▏         | 15/800 [03:52<2:12:37, 10.14s/it]

{'loss': 1.3258, 'learning_rate': 0.000125, 'epoch': 0.02}


                                                  
  2%|▏         | 15/800 [05:35<2:12:37, 10.14s/it]

{'eval_loss': 1.2248637676239014, 'eval_runtime': 102.7002, 'eval_samples_per_second': 7.79, 'eval_steps_per_second': 0.974, 'epoch': 0.02}


  2%|▎         | 20/800 [05:42<2:12:56, 10.23s/it]

{'loss': 1.2384, 'learning_rate': 0.0001666666666666667, 'epoch': 0.03}


                                                  
  2%|▎         | 20/800 [07:26<2:12:56, 10.23s/it]

{'eval_loss': 1.1213972568511963, 'eval_runtime': 103.1801, 'eval_samples_per_second': 7.753, 'eval_steps_per_second': 0.969, 'epoch': 0.03}


  3%|▎         | 25/800 [07:32<2:12:22, 10.25s/it]

{'loss': 1.1081, 'learning_rate': 0.00019999918050612108, 'epoch': 0.03}


                                                  
  3%|▎         | 25/800 [09:14<2:12:22, 10.25s/it]

{'eval_loss': 1.0078315734863281, 'eval_runtime': 101.8322, 'eval_samples_per_second': 7.856, 'eval_steps_per_second': 0.982, 'epoch': 0.03}


  4%|▍         | 30/800 [09:21<2:09:54, 10.12s/it]

{'loss': 0.8821, 'learning_rate': 0.0001999704996306308, 'epoch': 0.04}


                                                  
  4%|▍         | 30/800 [11:04<2:09:54, 10.12s/it]

{'eval_loss': 0.9635260105133057, 'eval_runtime': 102.972, 'eval_samples_per_second': 7.769, 'eval_steps_per_second': 0.971, 'epoch': 0.04}


  4%|▍         | 35/800 [11:11<2:09:32, 10.16s/it]

{'loss': 0.7585, 'learning_rate': 0.00019990085749160822, 'epoch': 0.04}


                                                  
  4%|▍         | 35/800 [12:53<2:09:32, 10.16s/it]

{'eval_loss': 0.8985203504562378, 'eval_runtime': 102.7003, 'eval_samples_per_second': 7.79, 'eval_steps_per_second': 0.974, 'epoch': 0.04}


  5%|▌         | 40/800 [12:59<2:07:04, 10.03s/it]

{'loss': 0.6547, 'learning_rate': 0.00019979028262377118, 'epoch': 0.05}


                                                  
  5%|▌         | 40/800 [14:41<2:07:04, 10.03s/it]

{'eval_loss': 0.8877922892570496, 'eval_runtime': 101.0619, 'eval_samples_per_second': 7.916, 'eval_steps_per_second': 0.989, 'epoch': 0.05}


  6%|▌         | 45/800 [14:47<2:04:19,  9.88s/it]

{'loss': 0.6165, 'learning_rate': 0.00019963882033334826, 'epoch': 0.06}


                                                  
  6%|▌         | 45/800 [16:27<2:04:19,  9.88s/it]

{'eval_loss': 0.8703145384788513, 'eval_runtime': 100.8445, 'eval_samples_per_second': 7.933, 'eval_steps_per_second': 0.992, 'epoch': 0.06}


  6%|▋         | 50/800 [16:33<2:02:51,  9.83s/it]

{'loss': 0.4104, 'learning_rate': 0.00019944653267951504, 'epoch': 0.06}


                                                  
  6%|▋         | 50/800 [18:14<2:02:51,  9.83s/it]

{'eval_loss': 0.8788974285125732, 'eval_runtime': 100.7394, 'eval_samples_per_second': 7.941, 'eval_steps_per_second': 0.993, 'epoch': 0.06}


  7%|▋         | 55/800 [18:23<2:08:31, 10.35s/it]

{'loss': 1.3744, 'learning_rate': 0.00019921349844896654, 'epoch': 0.07}


                                                  
  7%|▋         | 55/800 [20:04<2:08:31, 10.35s/it]

{'eval_loss': 0.8612074255943298, 'eval_runtime': 100.573, 'eval_samples_per_second': 7.954, 'eval_steps_per_second': 0.994, 'epoch': 0.07}


  8%|▊         | 60/800 [20:12<2:06:04, 10.22s/it]

{'loss': 1.0881, 'learning_rate': 0.00019893981312363562, 'epoch': 0.07}


                                                  
  8%|▊         | 60/800 [21:55<2:06:04, 10.22s/it]

{'eval_loss': 0.8502552509307861, 'eval_runtime': 102.7, 'eval_samples_per_second': 7.79, 'eval_steps_per_second': 0.974, 'epoch': 0.07}


  8%|▊         | 65/800 [22:02<2:06:25, 10.32s/it]

{'loss': 1.0012, 'learning_rate': 0.00019862558884157068, 'epoch': 0.08}


                                                  
  8%|▊         | 65/800 [23:45<2:06:25, 10.32s/it]

{'eval_loss': 0.8476576209068298, 'eval_runtime': 102.4974, 'eval_samples_per_second': 7.805, 'eval_steps_per_second': 0.976, 'epoch': 0.08}


  9%|▉         | 70/800 [23:52<2:04:45, 10.25s/it]

{'loss': 0.9622, 'learning_rate': 0.00019827095435098925, 'epoch': 0.09}


                                                  
  9%|▉         | 70/800 [25:34<2:04:45, 10.25s/it]

{'eval_loss': 0.8366280198097229, 'eval_runtime': 101.7711, 'eval_samples_per_second': 7.861, 'eval_steps_per_second': 0.983, 'epoch': 0.09}


  9%|▉         | 75/800 [25:41<2:02:41, 10.15s/it]

{'loss': 0.8415, 'learning_rate': 0.00019787605495752528, 'epoch': 0.09}


                                                  
  9%|▉         | 75/800 [27:23<2:02:41, 10.15s/it]

{'eval_loss': 0.8314067721366882, 'eval_runtime': 102.5492, 'eval_samples_per_second': 7.801, 'eval_steps_per_second': 0.975, 'epoch': 0.09}


 10%|█         | 80/800 [27:31<2:02:30, 10.21s/it]

{'loss': 0.7398, 'learning_rate': 0.00019744105246469263, 'epoch': 0.1}


                                                  
 10%|█         | 80/800 [29:13<2:02:30, 10.21s/it]

{'eval_loss': 0.8302356004714966, 'eval_runtime': 102.0036, 'eval_samples_per_second': 7.843, 'eval_steps_per_second': 0.98, 'epoch': 0.1}


 11%|█         | 85/800 [29:20<2:00:43, 10.13s/it]

{'loss': 0.7069, 'learning_rate': 0.00019696612510758876, 'epoch': 0.11}


                                                  
 11%|█         | 85/800 [31:02<2:00:43, 10.13s/it]

{'eval_loss': 0.8283106088638306, 'eval_runtime': 101.9554, 'eval_samples_per_second': 7.847, 'eval_steps_per_second': 0.981, 'epoch': 0.11}


 11%|█▏        | 90/800 [31:08<1:59:29, 10.10s/it]

{'loss': 0.6479, 'learning_rate': 0.0001964514674798659, 'epoch': 0.11}


                                                  
 11%|█▏        | 90/800 [32:51<1:59:29, 10.10s/it]

{'eval_loss': 0.8335476517677307, 'eval_runtime': 102.668, 'eval_samples_per_second': 7.792, 'eval_steps_per_second': 0.974, 'epoch': 0.11}


 12%|█▏        | 95/800 [32:57<1:57:46, 10.02s/it]

{'loss': 0.5608, 'learning_rate': 0.00019589729045399934, 'epoch': 0.12}


                                                  
 12%|█▏        | 95/800 [34:40<1:57:46, 10.02s/it]

{'eval_loss': 0.8297709822654724, 'eval_runtime': 102.7761, 'eval_samples_per_second': 7.784, 'eval_steps_per_second': 0.973, 'epoch': 0.12}


 12%|█▎        | 100/800 [34:46<1:56:48, 10.01s/it]

{'loss': 0.4278, 'learning_rate': 0.0001953038210948861, 'epoch': 0.12}


                                                   
 12%|█▎        | 100/800 [36:28<1:56:48, 10.01s/it]

{'eval_loss': 0.8344895243644714, 'eval_runtime': 102.2225, 'eval_samples_per_second': 7.826, 'eval_steps_per_second': 0.978, 'epoch': 0.12}


 13%|█▎        | 105/800 [36:38<2:02:04, 10.54s/it]

{'loss': 1.2737, 'learning_rate': 0.00019467130256680868, 'epoch': 0.13}


                                                   
 13%|█▎        | 105/800 [38:20<2:02:04, 10.54s/it]

{'eval_loss': 0.8308849930763245, 'eval_runtime': 101.8472, 'eval_samples_per_second': 7.855, 'eval_steps_per_second': 0.982, 'epoch': 0.13}


 14%|█▍        | 110/800 [38:28<1:59:30, 10.39s/it]

{'loss': 1.0218, 'learning_rate': 0.00019399999403380266, 'epoch': 0.14}


                                                   
 14%|█▍        | 110/800 [40:10<1:59:30, 10.39s/it]

{'eval_loss': 0.8271040916442871, 'eval_runtime': 102.0048, 'eval_samples_per_second': 7.843, 'eval_steps_per_second': 0.98, 'epoch': 0.14}


 14%|█▍        | 115/800 [40:18<1:57:38, 10.30s/it]

{'loss': 0.9766, 'learning_rate': 0.0001932901705534683, 'epoch': 0.14}


                                                   
 14%|█▍        | 115/800 [41:59<1:57:38, 10.30s/it]

{'eval_loss': 0.8238651752471924, 'eval_runtime': 100.9511, 'eval_samples_per_second': 7.925, 'eval_steps_per_second': 0.991, 'epoch': 0.14}


 15%|█▌        | 120/800 [42:06<1:55:44, 10.21s/it]

{'loss': 0.9829, 'learning_rate': 0.00019254212296427044, 'epoch': 0.15}


                                                   
 15%|█▌        | 120/800 [43:48<1:55:44, 10.21s/it]

{'eval_loss': 0.8209121823310852, 'eval_runtime': 101.8053, 'eval_samples_per_second': 7.858, 'eval_steps_per_second': 0.982, 'epoch': 0.15}


 16%|█▌        | 125/800 [43:55<1:54:07, 10.14s/it]

{'loss': 0.7974, 'learning_rate': 0.0001917561577663721, 'epoch': 0.16}


                                                   
 16%|█▌        | 125/800 [45:38<1:54:07, 10.14s/it]

{'eval_loss': 0.8189890384674072, 'eval_runtime': 102.8716, 'eval_samples_per_second': 7.777, 'eval_steps_per_second': 0.972, 'epoch': 0.16}


 16%|█▋        | 130/800 [45:45<1:53:34, 10.17s/it]

{'loss': 0.6463, 'learning_rate': 0.00019093259699605125, 'epoch': 0.16}


                                                   
 16%|█▋        | 130/800 [47:27<1:53:34, 10.17s/it]

{'eval_loss': 0.8162444829940796, 'eval_runtime': 102.7989, 'eval_samples_per_second': 7.782, 'eval_steps_per_second': 0.973, 'epoch': 0.16}


 17%|█▋        | 135/800 [47:34<1:52:35, 10.16s/it]

{'loss': 0.6257, 'learning_rate': 0.0001900717780937514, 'epoch': 0.17}


                                                   
 17%|█▋        | 135/800 [49:17<1:52:35, 10.16s/it]

{'eval_loss': 0.8141389489173889, 'eval_runtime': 103.0528, 'eval_samples_per_second': 7.763, 'eval_steps_per_second': 0.97, 'epoch': 0.17}


 18%|█▊        | 140/800 [49:23<1:51:04, 10.10s/it]

{'loss': 0.5371, 'learning_rate': 0.00018917405376582145, 'epoch': 0.17}


                                                   
 18%|█▊        | 140/800 [51:04<1:51:04, 10.10s/it]

{'eval_loss': 0.8160005211830139, 'eval_runtime': 100.5909, 'eval_samples_per_second': 7.953, 'eval_steps_per_second': 0.994, 'epoch': 0.17}


 18%|█▊        | 145/800 [51:10<1:47:25,  9.84s/it]

{'loss': 0.5047, 'learning_rate': 0.00018823979183999964, 'epoch': 0.18}


                                                   
 18%|█▊        | 145/800 [52:50<1:47:25,  9.84s/it]

{'eval_loss': 0.8276729583740234, 'eval_runtime': 100.2239, 'eval_samples_per_second': 7.982, 'eval_steps_per_second': 0.998, 'epoch': 0.18}


 19%|█▉        | 150/800 [52:56<1:45:32,  9.74s/it]

{'loss': 0.3697, 'learning_rate': 0.00018726937511470246, 'epoch': 0.19}


                                                   
 19%|█▉        | 150/800 [54:36<1:45:32,  9.74s/it]

{'eval_loss': 0.8370651006698608, 'eval_runtime': 100.4093, 'eval_samples_per_second': 7.967, 'eval_steps_per_second': 0.996, 'epoch': 0.19}


 19%|█▉        | 155/800 [54:46<1:51:07, 10.34s/it]

{'loss': 1.2699, 'learning_rate': 0.00018626320120217923, 'epoch': 0.19}


                                                   
 19%|█▉        | 155/800 [56:26<1:51:07, 10.34s/it]

{'eval_loss': 0.8275873064994812, 'eval_runtime': 100.1685, 'eval_samples_per_second': 7.987, 'eval_steps_per_second': 0.998, 'epoch': 0.19}


 20%|██        | 160/800 [56:34<1:48:36, 10.18s/it]

{'loss': 1.1341, 'learning_rate': 0.00018522168236559695, 'epoch': 0.2}


                                                   
 20%|██        | 160/800 [58:14<1:48:36, 10.18s/it]

{'eval_loss': 0.8156430125236511, 'eval_runtime': 99.9316, 'eval_samples_per_second': 8.005, 'eval_steps_per_second': 1.001, 'epoch': 0.2}


 21%|██        | 165/800 [58:21<1:46:37, 10.08s/it]

{'loss': 0.9848, 'learning_rate': 0.00018414524535012244, 'epoch': 0.21}


                                                   
 21%|██        | 165/800 [1:00:03<1:46:37, 10.08s/it]

{'eval_loss': 0.8132601380348206, 'eval_runtime': 101.5898, 'eval_samples_per_second': 7.875, 'eval_steps_per_second': 0.984, 'epoch': 0.21}


 21%|██▏       | 170/800 [1:00:10<1:46:40, 10.16s/it]

{'loss': 0.8968, 'learning_rate': 0.0001830343312080704, 'epoch': 0.21}


                                                     
 21%|██▏       | 170/800 [1:01:54<1:46:40, 10.16s/it]

{'eval_loss': 0.8098770380020142, 'eval_runtime': 103.4731, 'eval_samples_per_second': 7.731, 'eval_steps_per_second': 0.966, 'epoch': 0.21}


 22%|██▏       | 175/800 [1:02:01<1:46:51, 10.26s/it]

{'loss': 0.8166, 'learning_rate': 0.00018188939511818965, 'epoch': 0.22}


                                                     
 22%|██▏       | 175/800 [1:03:43<1:46:51, 10.26s/it]

{'eval_loss': 0.807019054889679, 'eval_runtime': 102.7166, 'eval_samples_per_second': 7.788, 'eval_steps_per_second': 0.974, 'epoch': 0.22}


 22%|██▎       | 180/800 [1:03:50<1:45:15, 10.19s/it]

{'loss': 0.6852, 'learning_rate': 0.00018071090619916093, 'epoch': 0.23}


                                                     
 22%|██▎       | 180/800 [1:05:32<1:45:15, 10.19s/it]

{'eval_loss': 0.8059892058372498, 'eval_runtime': 101.6148, 'eval_samples_per_second': 7.873, 'eval_steps_per_second': 0.984, 'epoch': 0.23}


 23%|██▎       | 185/800 [1:05:38<1:43:20, 10.08s/it]

{'loss': 0.5989, 'learning_rate': 0.00017949934731738347, 'epoch': 0.23}


                                                     
 23%|██▎       | 185/800 [1:07:20<1:43:20, 10.08s/it]

{'eval_loss': 0.8048079013824463, 'eval_runtime': 101.4529, 'eval_samples_per_second': 7.885, 'eval_steps_per_second': 0.986, 'epoch': 0.23}


 24%|██▍       | 190/800 [1:07:27<1:42:13, 10.05s/it]

{'loss': 0.5524, 'learning_rate': 0.0001782552148891283, 'epoch': 0.24}


                                                     
 24%|██▍       | 190/800 [1:09:09<1:42:13, 10.05s/it]

{'eval_loss': 0.8054037690162659, 'eval_runtime': 102.424, 'eval_samples_per_second': 7.811, 'eval_steps_per_second': 0.976, 'epoch': 0.24}


 24%|██▍       | 195/800 [1:09:15<1:40:44,  9.99s/it]

{'loss': 0.4884, 'learning_rate': 0.00017697901867713995, 'epoch': 0.24}


                                                     
 24%|██▍       | 195/800 [1:10:57<1:40:44,  9.99s/it]

{'eval_loss': 0.8076191544532776, 'eval_runtime': 101.9076, 'eval_samples_per_second': 7.85, 'eval_steps_per_second': 0.981, 'epoch': 0.24}


 25%|██▌       | 200/800 [1:11:03<1:39:05,  9.91s/it]

{'loss': 0.4033, 'learning_rate': 0.00017567128158176953, 'epoch': 0.25}


                                                     
 25%|██▌       | 200/800 [1:12:46<1:39:05,  9.91s/it]

{'eval_loss': 0.8127384185791016, 'eval_runtime': 102.9499, 'eval_samples_per_second': 7.771, 'eval_steps_per_second': 0.971, 'epoch': 0.25}


 26%|██▌       | 205/800 [1:12:55<1:44:35, 10.55s/it]

{'loss': 1.1396, 'learning_rate': 0.00017433253942672496, 'epoch': 0.26}


                                                     
 26%|██▌       | 205/800 [1:14:36<1:44:35, 10.55s/it]

{'eval_loss': 0.8089140057563782, 'eval_runtime': 101.2569, 'eval_samples_per_second': 7.901, 'eval_steps_per_second': 0.988, 'epoch': 0.26}


 26%|██▋       | 210/800 [1:14:44<1:41:19, 10.30s/it]

{'loss': 1.0833, 'learning_rate': 0.00017296334073952605, 'epoch': 0.26}


                                                     
 26%|██▋       | 210/800 [1:16:26<1:41:19, 10.30s/it]

{'eval_loss': 0.8038191199302673, 'eval_runtime': 102.0033, 'eval_samples_per_second': 7.843, 'eval_steps_per_second': 0.98, 'epoch': 0.26}


 27%|██▋       | 215/800 [1:16:34<1:40:18, 10.29s/it]

{'loss': 0.9846, 'learning_rate': 0.0001715642465267543, 'epoch': 0.27}


                                                     
 27%|██▋       | 215/800 [1:18:16<1:40:18, 10.29s/it]

{'eval_loss': 0.8024136424064636, 'eval_runtime': 101.6268, 'eval_samples_per_second': 7.872, 'eval_steps_per_second': 0.984, 'epoch': 0.27}


 28%|██▊       | 220/800 [1:18:23<1:38:27, 10.19s/it]

{'loss': 0.8361, 'learning_rate': 0.00017013583004418993, 'epoch': 0.28}


                                                     
 28%|██▊       | 220/800 [1:20:07<1:38:27, 10.19s/it]

{'eval_loss': 0.8021200299263, 'eval_runtime': 103.57, 'eval_samples_per_second': 7.724, 'eval_steps_per_second': 0.966, 'epoch': 0.28}


 28%|██▊       | 225/800 [1:20:14<1:38:25, 10.27s/it]

{'loss': 0.7679, 'learning_rate': 0.00016867867656192946, 'epoch': 0.28}


                                                     
 28%|██▊       | 225/800 [1:21:57<1:38:25, 10.27s/it]

{'eval_loss': 0.8013540506362915, 'eval_runtime': 103.3534, 'eval_samples_per_second': 7.74, 'eval_steps_per_second': 0.968, 'epoch': 0.28}


 29%|██▉       | 230/800 [1:22:04<1:37:22, 10.25s/it]

{'loss': 0.7676, 'learning_rate': 0.00016719338312458124, 'epoch': 0.29}


                                                     
 29%|██▉       | 230/800 [1:23:46<1:37:22, 10.25s/it]

{'eval_loss': 0.8007920980453491, 'eval_runtime': 102.3736, 'eval_samples_per_second': 7.815, 'eval_steps_per_second': 0.977, 'epoch': 0.29}


 29%|██▉       | 235/800 [1:23:53<1:35:57, 10.19s/it]

{'loss': 0.7422, 'learning_rate': 0.0001656805583066361, 'epoch': 0.29}


                                                     
 29%|██▉       | 235/800 [1:25:36<1:35:57, 10.19s/it]

{'eval_loss': 0.8002366423606873, 'eval_runtime': 102.5149, 'eval_samples_per_second': 7.804, 'eval_steps_per_second': 0.975, 'epoch': 0.29}


 30%|███       | 240/800 [1:25:43<1:34:45, 10.15s/it]

{'loss': 0.5833, 'learning_rate': 0.000164140821963114, 'epoch': 0.3}


                                                     
 30%|███       | 240/800 [1:27:26<1:34:45, 10.15s/it]

{'eval_loss': 0.8017604351043701, 'eval_runtime': 102.8014, 'eval_samples_per_second': 7.782, 'eval_steps_per_second': 0.973, 'epoch': 0.3}


 31%|███       | 245/800 [1:27:32<1:33:09, 10.07s/it]

{'loss': 0.4494, 'learning_rate': 0.00016257480497558873, 'epoch': 0.31}


                                                     
 31%|███       | 245/800 [1:29:15<1:33:09, 10.07s/it]

{'eval_loss': 0.8005191683769226, 'eval_runtime': 103.2086, 'eval_samples_per_second': 7.751, 'eval_steps_per_second': 0.969, 'epoch': 0.31}


 31%|███▏      | 250/800 [1:29:21<1:32:02, 10.04s/it]

{'loss': 0.3795, 'learning_rate': 0.00016098314899369446, 'epoch': 0.31}


                                                     
 31%|███▏      | 250/800 [1:31:04<1:32:02, 10.04s/it]

{'eval_loss': 0.8022778034210205, 'eval_runtime': 103.4311, 'eval_samples_per_second': 7.735, 'eval_steps_per_second': 0.967, 'epoch': 0.31}


 32%|███▏      | 255/800 [1:31:15<1:37:06, 10.69s/it]

{'loss': 1.2485, 'learning_rate': 0.00015936650617222063, 'epoch': 0.32}


                                                     
 32%|███▏      | 255/800 [1:32:56<1:37:06, 10.69s/it]

{'eval_loss': 0.8009532690048218, 'eval_runtime': 101.0554, 'eval_samples_per_second': 7.916, 'eval_steps_per_second': 0.99, 'epoch': 0.32}


 32%|███▎      | 260/800 [1:33:04<1:32:51, 10.32s/it]

{'loss': 0.9992, 'learning_rate': 0.00015772553890390197, 'epoch': 0.33}


                                                     
 32%|███▎      | 260/800 [1:34:46<1:32:51, 10.32s/it]

{'eval_loss': 0.7981907725334167, 'eval_runtime': 102.0074, 'eval_samples_per_second': 7.843, 'eval_steps_per_second': 0.98, 'epoch': 0.33}


 33%|███▎      | 265/800 [1:34:54<1:31:42, 10.28s/it]

{'loss': 0.9427, 'learning_rate': 0.0001560609195480142, 'epoch': 0.33}


                                                     
 33%|███▎      | 265/800 [1:36:35<1:31:42, 10.28s/it]

{'eval_loss': 0.7969557046890259, 'eval_runtime': 101.0807, 'eval_samples_per_second': 7.914, 'eval_steps_per_second': 0.989, 'epoch': 0.33}


 34%|███▍      | 270/800 [1:36:42<1:29:09, 10.09s/it]

{'loss': 0.8201, 'learning_rate': 0.00015437333015488587, 'epoch': 0.34}


                                                     
 34%|███▍      | 270/800 [1:38:23<1:29:09, 10.09s/it]

{'eval_loss': 0.7975451946258545, 'eval_runtime': 100.6547, 'eval_samples_per_second': 7.948, 'eval_steps_per_second': 0.993, 'epoch': 0.34}


 34%|███▍      | 275/800 [1:38:29<1:27:31, 10.00s/it]

{'loss': 0.7802, 'learning_rate': 0.00015266346218643947, 'epoch': 0.34}


                                                     
 34%|███▍      | 275/800 [1:40:10<1:27:31, 10.00s/it]

{'eval_loss': 0.797583818435669, 'eval_runtime': 100.5774, 'eval_samples_per_second': 7.954, 'eval_steps_per_second': 0.994, 'epoch': 0.34}


 35%|███▌      | 280/800 [1:40:17<1:26:19,  9.96s/it]

{'loss': 0.7407, 'learning_rate': 0.00015093201623287631, 'epoch': 0.35}


                                                     
 35%|███▌      | 280/800 [1:41:57<1:26:19,  9.96s/it]

{'eval_loss': 0.7959440350532532, 'eval_runtime': 100.6837, 'eval_samples_per_second': 7.946, 'eval_steps_per_second': 0.993, 'epoch': 0.35}


 36%|███▌      | 285/800 [1:42:04<1:25:24,  9.95s/it]

{'loss': 0.7011, 'learning_rate': 0.0001491797017256212, 'epoch': 0.36}


                                                     
 36%|███▌      | 285/800 [1:43:44<1:25:24,  9.95s/it]

{'eval_loss': 0.7945469617843628, 'eval_runtime': 100.5957, 'eval_samples_per_second': 7.953, 'eval_steps_per_second': 0.994, 'epoch': 0.36}


 36%|███▋      | 290/800 [1:43:51<1:24:07,  9.90s/it]

{'loss': 0.5959, 'learning_rate': 0.0001474072366466448, 'epoch': 0.36}


                                                     
 36%|███▋      | 290/800 [1:45:32<1:24:07,  9.90s/it]

{'eval_loss': 0.7945230007171631, 'eval_runtime': 101.2992, 'eval_samples_per_second': 7.897, 'eval_steps_per_second': 0.987, 'epoch': 0.36}


 37%|███▋      | 295/800 [1:45:38<1:23:09,  9.88s/it]

{'loss': 0.5385, 'learning_rate': 0.00014561534723428205, 'epoch': 0.37}


                                                     
 37%|███▋      | 295/800 [1:47:19<1:23:09,  9.88s/it]

{'eval_loss': 0.7945309281349182, 'eval_runtime': 100.7781, 'eval_samples_per_second': 7.938, 'eval_steps_per_second': 0.992, 'epoch': 0.37}


 38%|███▊      | 300/800 [1:47:25<1:21:53,  9.83s/it]

{'loss': 0.4152, 'learning_rate': 0.00014380476768566824, 'epoch': 0.38}


                                                     
 38%|███▊      | 300/800 [1:49:06<1:21:53,  9.83s/it]

{'eval_loss': 0.7964293956756592, 'eval_runtime': 100.8395, 'eval_samples_per_second': 7.933, 'eval_steps_per_second': 0.992, 'epoch': 0.38}


 38%|███▊      | 305/800 [1:49:15<1:25:19, 10.34s/it]

{'loss': 1.1837, 'learning_rate': 0.00014197623985591373, 'epoch': 0.38}


                                                     
 38%|███▊      | 305/800 [1:50:55<1:25:19, 10.34s/it]

{'eval_loss': 0.7957618832588196, 'eval_runtime': 100.334, 'eval_samples_per_second': 7.973, 'eval_steps_per_second': 0.997, 'epoch': 0.38}


 39%|███▉      | 310/800 [1:51:03<1:23:32, 10.23s/it]

{'loss': 1.1192, 'learning_rate': 0.00014013051295414108, 'epoch': 0.39}


                                                     
 39%|███▉      | 310/800 [1:52:44<1:23:32, 10.23s/it]

{'eval_loss': 0.7954627871513367, 'eval_runtime': 100.2976, 'eval_samples_per_second': 7.976, 'eval_steps_per_second': 0.997, 'epoch': 0.39}


 39%|███▉      | 315/800 [1:52:51<1:21:56, 10.14s/it]

{'loss': 0.9902, 'learning_rate': 0.000138268343236509, 'epoch': 0.39}


                                                     
 39%|███▉      | 315/800 [1:54:32<1:21:56, 10.14s/it]

{'eval_loss': 0.7940285205841064, 'eval_runtime': 100.8393, 'eval_samples_per_second': 7.933, 'eval_steps_per_second': 0.992, 'epoch': 0.39}


 40%|████      | 320/800 [1:54:40<1:21:05, 10.14s/it]

{'loss': 0.8809, 'learning_rate': 0.00013639049369634876, 'epoch': 0.4}


                                                     
 40%|████      | 320/800 [1:56:21<1:21:05, 10.14s/it]

{'eval_loss': 0.7917087078094482, 'eval_runtime': 101.4622, 'eval_samples_per_second': 7.885, 'eval_steps_per_second': 0.986, 'epoch': 0.4}


 41%|████      | 325/800 [1:56:28<1:20:06, 10.12s/it]

{'loss': 0.7829, 'learning_rate': 0.0001344977337515404, 'epoch': 0.41}


                                                     
 41%|████      | 325/800 [1:58:09<1:20:06, 10.12s/it]

{'eval_loss': 0.7918438911437988, 'eval_runtime': 100.5647, 'eval_samples_per_second': 7.955, 'eval_steps_per_second': 0.994, 'epoch': 0.41}


 41%|████▏     | 330/800 [1:58:16<1:18:32, 10.03s/it]

{'loss': 0.6959, 'learning_rate': 0.00013259083892925633, 'epoch': 0.41}


                                                     
 41%|████▏     | 330/800 [1:59:57<1:18:32, 10.03s/it]

{'eval_loss': 0.7906479835510254, 'eval_runtime': 101.6174, 'eval_samples_per_second': 7.873, 'eval_steps_per_second': 0.984, 'epoch': 0.41}


 42%|████▏     | 335/800 [2:00:04<1:17:52, 10.05s/it]

{'loss': 0.6734, 'learning_rate': 0.00013067059054820183, 'epoch': 0.42}


                                                     
 42%|████▏     | 335/800 [2:01:45<1:17:52, 10.05s/it]

{'eval_loss': 0.7897329926490784, 'eval_runtime': 100.8589, 'eval_samples_per_second': 7.932, 'eval_steps_per_second': 0.991, 'epoch': 0.42}


 42%|████▎     | 340/800 [2:01:52<1:16:29,  9.98s/it]

{'loss': 0.5093, 'learning_rate': 0.00012873777539848283, 'epoch': 0.42}


                                                     
 42%|████▎     | 340/800 [2:03:32<1:16:29,  9.98s/it]

{'eval_loss': 0.7893052101135254, 'eval_runtime': 100.7221, 'eval_samples_per_second': 7.943, 'eval_steps_per_second': 0.993, 'epoch': 0.42}


 43%|████▎     | 345/800 [2:03:38<1:14:33,  9.83s/it]

{'loss': 0.4735, 'learning_rate': 0.0001267931854192313, 'epoch': 0.43}


                                                     
 43%|████▎     | 345/800 [2:05:19<1:14:33,  9.83s/it]

{'eval_loss': 0.7890584468841553, 'eval_runtime': 100.7308, 'eval_samples_per_second': 7.942, 'eval_steps_per_second': 0.993, 'epoch': 0.43}


 44%|████▍     | 350/800 [2:05:25<1:13:38,  9.82s/it]

{'loss': 0.3934, 'learning_rate': 0.0001248376173741215, 'epoch': 0.44}


                                                     
 44%|████▍     | 350/800 [2:07:06<1:13:38,  9.82s/it]

{'eval_loss': 0.7911199927330017, 'eval_runtime': 100.9267, 'eval_samples_per_second': 7.927, 'eval_steps_per_second': 0.991, 'epoch': 0.44}


 44%|████▍     | 355/800 [2:07:16<1:17:19, 10.43s/it]

{'loss': 1.1273, 'learning_rate': 0.00012287187252490913, 'epoch': 0.44}


                                                     
 44%|████▍     | 355/800 [2:08:59<1:17:19, 10.43s/it]

{'eval_loss': 0.7905482649803162, 'eval_runtime': 102.6017, 'eval_samples_per_second': 7.797, 'eval_steps_per_second': 0.975, 'epoch': 0.44}


 45%|████▌     | 360/800 [2:09:07<1:16:15, 10.40s/it]

{'loss': 1.0183, 'learning_rate': 0.00012089675630312754, 'epoch': 0.45}


                                                     
 45%|████▌     | 360/800 [2:10:49<1:16:15, 10.40s/it]

{'eval_loss': 0.7888903021812439, 'eval_runtime': 102.5745, 'eval_samples_per_second': 7.799, 'eval_steps_per_second': 0.975, 'epoch': 0.45}


 46%|████▌     | 365/800 [2:10:57<1:14:57, 10.34s/it]

{'loss': 0.9323, 'learning_rate': 0.00011891307798007536, 'epoch': 0.46}


                                                     
 46%|████▌     | 365/800 [2:12:39<1:14:57, 10.34s/it]

{'eval_loss': 0.7890087962150574, 'eval_runtime': 101.7387, 'eval_samples_per_second': 7.863, 'eval_steps_per_second': 0.983, 'epoch': 0.46}


 46%|████▋     | 370/800 [2:12:46<1:13:07, 10.20s/it]

{'loss': 0.869, 'learning_rate': 0.00011692165033523117, 'epoch': 0.46}


                                                     
 46%|████▋     | 370/800 [2:14:28<1:13:07, 10.20s/it]

{'eval_loss': 0.7875843644142151, 'eval_runtime': 102.183, 'eval_samples_per_second': 7.829, 'eval_steps_per_second': 0.979, 'epoch': 0.46}


 47%|████▋     | 375/800 [2:14:35<1:11:49, 10.14s/it]

{'loss': 0.8257, 'learning_rate': 0.00011492328932323022, 'epoch': 0.47}


                                                     
 47%|████▋     | 375/800 [2:16:16<1:11:49, 10.14s/it]

{'eval_loss': 0.7868843078613281, 'eval_runtime': 100.9032, 'eval_samples_per_second': 7.928, 'eval_steps_per_second': 0.991, 'epoch': 0.47}


 48%|████▊     | 380/800 [2:16:23<1:10:07, 10.02s/it]

{'loss': 0.7503, 'learning_rate': 0.00011291881373954065, 'epoch': 0.47}


                                                     
 48%|████▊     | 380/800 [2:18:04<1:10:07, 10.02s/it]

{'eval_loss': 0.7863669395446777, 'eval_runtime': 101.351, 'eval_samples_per_second': 7.893, 'eval_steps_per_second': 0.987, 'epoch': 0.47}


 48%|████▊     | 385/800 [2:18:11<1:09:20, 10.03s/it]

{'loss': 0.7369, 'learning_rate': 0.00011090904488497549, 'epoch': 0.48}


                                                     
 48%|████▊     | 385/800 [2:19:53<1:09:20, 10.03s/it]

{'eval_loss': 0.7865879535675049, 'eval_runtime': 102.7309, 'eval_samples_per_second': 7.787, 'eval_steps_per_second': 0.973, 'epoch': 0.48}


 49%|████▉     | 390/800 [2:20:00<1:09:13, 10.13s/it]

{'loss': 0.6129, 'learning_rate': 0.0001088948062291783, 'epoch': 0.49}


                                                     
 49%|████▉     | 390/800 [2:21:42<1:09:13, 10.13s/it]

{'eval_loss': 0.7866897583007812, 'eval_runtime': 102.4841, 'eval_samples_per_second': 7.806, 'eval_steps_per_second': 0.976, 'epoch': 0.49}


 49%|████▉     | 395/800 [2:21:49<1:07:45, 10.04s/it]

{'loss': 0.4386, 'learning_rate': 0.00010687692307321984, 'epoch': 0.49}


                                                     
 49%|████▉     | 395/800 [2:23:32<1:07:45, 10.04s/it]

{'eval_loss': 0.787234365940094, 'eval_runtime': 103.0974, 'eval_samples_per_second': 7.76, 'eval_steps_per_second': 0.97, 'epoch': 0.49}


 50%|█████     | 400/800 [2:23:38<1:06:47, 10.02s/it]

{'loss': 0.3796, 'learning_rate': 0.00010485622221144484, 'epoch': 0.5}


                                                     
 50%|█████     | 400/800 [2:25:20<1:06:47, 10.02s/it]

{'eval_loss': 0.7893497347831726, 'eval_runtime': 102.5933, 'eval_samples_per_second': 7.798, 'eval_steps_per_second': 0.975, 'epoch': 0.5}


 51%|█████     | 405/800 [2:25:32<1:10:50, 10.76s/it]

{'loss': 1.1811, 'learning_rate': 0.00010283353159270643, 'epoch': 0.51}


                                                     
 51%|█████     | 405/800 [2:27:15<1:10:50, 10.76s/it]

{'eval_loss': 0.7894497513771057, 'eval_runtime': 102.5588, 'eval_samples_per_second': 7.8, 'eval_steps_per_second': 0.975, 'epoch': 0.51}


 51%|█████▏    | 410/800 [2:27:23<1:07:55, 10.45s/it]

{'loss': 1.0916, 'learning_rate': 0.00010080967998112787, 'epoch': 0.51}


                                                     
 51%|█████▏    | 410/800 [2:29:04<1:07:55, 10.45s/it]

{'eval_loss': 0.7881640791893005, 'eval_runtime': 101.0184, 'eval_samples_per_second': 7.919, 'eval_steps_per_second': 0.99, 'epoch': 0.51}


 52%|█████▏    | 415/800 [2:29:12<1:05:40, 10.24s/it]

{'loss': 0.888, 'learning_rate': 9.878549661653012e-05, 'epoch': 0.52}


                                                     
 52%|█████▏    | 415/800 [2:30:53<1:05:40, 10.24s/it]

{'eval_loss': 0.7869178652763367, 'eval_runtime': 101.2535, 'eval_samples_per_second': 7.901, 'eval_steps_per_second': 0.988, 'epoch': 0.52}


 52%|█████▎    | 420/800 [2:31:00<1:04:11, 10.14s/it]

{'loss': 0.8257, 'learning_rate': 9.676181087466444e-05, 'epoch': 0.53}


                                                     
 52%|█████▎    | 420/800 [2:32:43<1:04:11, 10.14s/it]

{'eval_loss': 0.7862952351570129, 'eval_runtime': 103.06, 'eval_samples_per_second': 7.762, 'eval_steps_per_second': 0.97, 'epoch': 0.53}


 53%|█████▎    | 425/800 [2:32:50<1:03:53, 10.22s/it]

{'loss': 0.7224, 'learning_rate': 9.473945192738933e-05, 'epoch': 0.53}


                                                     
 53%|█████▎    | 425/800 [2:34:32<1:03:53, 10.22s/it]

{'eval_loss': 0.7857365608215332, 'eval_runtime': 102.148, 'eval_samples_per_second': 7.832, 'eval_steps_per_second': 0.979, 'epoch': 0.53}


 54%|█████▍    | 430/800 [2:34:39<1:02:28, 10.13s/it]

{'loss': 0.7182, 'learning_rate': 9.27192484029312e-05, 'epoch': 0.54}


                                                     
 54%|█████▍    | 430/800 [2:36:23<1:02:28, 10.13s/it]

{'eval_loss': 0.7852199077606201, 'eval_runtime': 103.7311, 'eval_samples_per_second': 7.712, 'eval_steps_per_second': 0.964, 'epoch': 0.54}


 54%|█████▍    | 435/800 [2:36:29<1:02:11, 10.22s/it]

{'loss': 0.6271, 'learning_rate': 9.070202804636745e-05, 'epoch': 0.54}


                                                     
 54%|█████▍    | 435/800 [2:38:12<1:02:11, 10.22s/it]

{'eval_loss': 0.7855300307273865, 'eval_runtime': 103.0219, 'eval_samples_per_second': 7.765, 'eval_steps_per_second': 0.971, 'epoch': 0.54}


 55%|█████▌    | 440/800 [2:38:19<1:00:54, 10.15s/it]

{'loss': 0.5327, 'learning_rate': 8.868861738047158e-05, 'epoch': 0.55}


                                                     
 55%|█████▌    | 440/800 [2:40:02<1:00:54, 10.15s/it]

{'eval_loss': 0.7855960130691528, 'eval_runtime': 103.2647, 'eval_samples_per_second': 7.747, 'eval_steps_per_second': 0.968, 'epoch': 0.55}


 56%|█████▌    | 445/800 [2:40:08<59:40, 10.09s/it]  

{'loss': 0.4658, 'learning_rate': 8.667984136705928e-05, 'epoch': 0.56}


                                                   
 56%|█████▌    | 445/800 [2:41:50<59:40, 10.09s/it]

{'eval_loss': 0.7853729128837585, 'eval_runtime': 101.7892, 'eval_samples_per_second': 7.859, 'eval_steps_per_second': 0.982, 'epoch': 0.56}


 56%|█████▋    | 450/800 [2:41:56<57:56,  9.93s/it]  

{'loss': 0.417, 'learning_rate': 8.467652306897369e-05, 'epoch': 0.56}


                                                   
 56%|█████▋    | 450/800 [2:43:39<57:56,  9.93s/it]

{'eval_loss': 0.7863684296607971, 'eval_runtime': 103.2332, 'eval_samples_per_second': 7.749, 'eval_steps_per_second': 0.969, 'epoch': 0.56}


 57%|█████▋    | 455/800 [2:43:50<1:01:31, 10.70s/it]

{'loss': 1.2268, 'learning_rate': 8.267948331284923e-05, 'epoch': 0.57}


                                                     
 57%|█████▋    | 455/800 [2:45:32<1:01:31, 10.70s/it]

{'eval_loss': 0.7868205308914185, 'eval_runtime': 102.5066, 'eval_samples_per_second': 7.804, 'eval_steps_per_second': 0.976, 'epoch': 0.57}


 57%|█████▊    | 460/800 [2:45:41<59:43, 10.54s/it]  

{'loss': 1.053, 'learning_rate': 8.068954035279121e-05, 'epoch': 0.57}


                                                   
 57%|█████▊    | 460/800 [2:47:24<59:43, 10.54s/it]

{'eval_loss': 0.7853949069976807, 'eval_runtime': 102.984, 'eval_samples_per_second': 7.768, 'eval_steps_per_second': 0.971, 'epoch': 0.57}


 58%|█████▊    | 465/800 [2:47:32<58:22, 10.45s/it]  

{'loss': 0.9431, 'learning_rate': 7.870750953510984e-05, 'epoch': 0.58}


                                                   
 58%|█████▊    | 465/800 [2:49:14<58:22, 10.45s/it]

{'eval_loss': 0.7846952676773071, 'eval_runtime': 102.3017, 'eval_samples_per_second': 7.82, 'eval_steps_per_second': 0.978, 'epoch': 0.58}


 59%|█████▉    | 470/800 [2:49:22<56:36, 10.29s/it]  

{'loss': 0.9613, 'learning_rate': 7.673420296424541e-05, 'epoch': 0.59}


                                                   
 59%|█████▉    | 470/800 [2:51:02<56:36, 10.29s/it]

{'eval_loss': 0.7845063209533691, 'eval_runtime': 100.2113, 'eval_samples_per_second': 7.983, 'eval_steps_per_second': 0.998, 'epoch': 0.59}


 59%|█████▉    | 475/800 [2:51:09<54:23, 10.04s/it]  

{'loss': 0.8468, 'learning_rate': 7.4770429170022e-05, 'epoch': 0.59}


                                                   
 59%|█████▉    | 475/800 [2:52:49<54:23, 10.04s/it]

{'eval_loss': 0.7841472029685974, 'eval_runtime': 100.4831, 'eval_samples_per_second': 7.962, 'eval_steps_per_second': 0.995, 'epoch': 0.59}


 60%|██████    | 480/800 [2:52:56<53:11,  9.97s/it]  

{'loss': 0.7619, 'learning_rate': 7.281699277636572e-05, 'epoch': 0.6}


                                                   
 60%|██████    | 480/800 [2:54:37<53:11,  9.97s/it]

{'eval_loss': 0.7835143208503723, 'eval_runtime': 100.383, 'eval_samples_per_second': 7.969, 'eval_steps_per_second': 0.996, 'epoch': 0.6}


 61%|██████    | 485/800 [2:54:43<52:11,  9.94s/it]  

{'loss': 0.7496, 'learning_rate': 7.08746941716232e-05, 'epoch': 0.61}


                                                   
 61%|██████    | 485/800 [2:56:24<52:11,  9.94s/it]

{'eval_loss': 0.7833383083343506, 'eval_runtime': 100.4661, 'eval_samples_per_second': 7.963, 'eval_steps_per_second': 0.995, 'epoch': 0.61}


 61%|██████▏   | 490/800 [2:56:30<51:17,  9.93s/it]  

{'loss': 0.5705, 'learning_rate': 6.894432918061579e-05, 'epoch': 0.61}


                                                   
 61%|██████▏   | 490/800 [2:58:11<51:17,  9.93s/it]

{'eval_loss': 0.7830208539962769, 'eval_runtime': 100.4881, 'eval_samples_per_second': 7.961, 'eval_steps_per_second': 0.995, 'epoch': 0.61}


 62%|██████▏   | 495/800 [2:58:17<50:13,  9.88s/it]  

{'loss': 0.5498, 'learning_rate': 6.702668873856338e-05, 'epoch': 0.62}


                                                   
 62%|██████▏   | 495/800 [2:59:58<50:13,  9.88s/it]

{'eval_loss': 0.7835330367088318, 'eval_runtime': 100.6595, 'eval_samples_per_second': 7.948, 'eval_steps_per_second': 0.993, 'epoch': 0.62}


 62%|██████▎   | 500/800 [3:00:04<49:01,  9.80s/it]  

{'loss': 0.3808, 'learning_rate': 6.512255856701177e-05, 'epoch': 0.62}


                                                   
 62%|██████▎   | 500/800 [3:01:45<49:01,  9.80s/it]

{'eval_loss': 0.78450608253479, 'eval_runtime': 101.5109, 'eval_samples_per_second': 7.881, 'eval_steps_per_second': 0.985, 'epoch': 0.62}


 63%|██████▎   | 505/800 [3:01:56<51:26, 10.46s/it]  

{'loss': 1.2161, 'learning_rate': 6.323271885189635e-05, 'epoch': 0.63}


                                                   
 63%|██████▎   | 505/800 [3:03:37<51:26, 10.46s/it]

{'eval_loss': 0.786203920841217, 'eval_runtime': 100.8553, 'eval_samples_per_second': 7.932, 'eval_steps_per_second': 0.992, 'epoch': 0.63}


 64%|██████▍   | 510/800 [3:03:45<49:35, 10.26s/it]  

{'loss': 1.0667, 'learning_rate': 6.135794392387353e-05, 'epoch': 0.64}


                                                   
 64%|██████▍   | 510/800 [3:05:26<49:35, 10.26s/it]

{'eval_loss': 0.7848932147026062, 'eval_runtime': 100.798, 'eval_samples_per_second': 7.937, 'eval_steps_per_second': 0.992, 'epoch': 0.64}


 64%|██████▍   | 515/800 [3:05:33<48:22, 10.18s/it]  

{'loss': 0.9419, 'learning_rate': 5.949900194105167e-05, 'epoch': 0.64}


                                                   
 64%|██████▍   | 515/800 [3:07:13<48:22, 10.18s/it]

{'eval_loss': 0.7825599908828735, 'eval_runtime': 100.1176, 'eval_samples_per_second': 7.991, 'eval_steps_per_second': 0.999, 'epoch': 0.64}


 65%|██████▌   | 520/800 [3:07:20<46:52, 10.04s/it]  

{'loss': 0.9523, 'learning_rate': 5.765665457425102e-05, 'epoch': 0.65}


                                                   
 65%|██████▌   | 520/800 [3:09:02<46:52, 10.04s/it]

{'eval_loss': 0.7820336222648621, 'eval_runtime': 101.0483, 'eval_samples_per_second': 7.917, 'eval_steps_per_second': 0.99, 'epoch': 0.65}


 66%|██████▌   | 525/800 [3:09:09<46:13, 10.08s/it]  

{'loss': 0.8507, 'learning_rate': 5.5831656694921465e-05, 'epoch': 0.66}


                                                   
 66%|██████▌   | 525/800 [3:10:50<46:13, 10.08s/it]

{'eval_loss': 0.7820698618888855, 'eval_runtime': 101.7433, 'eval_samples_per_second': 7.863, 'eval_steps_per_second': 0.983, 'epoch': 0.66}


 66%|██████▋   | 530/800 [3:10:57<45:14, 10.05s/it]  

{'loss': 0.707, 'learning_rate': 5.402475606584669e-05, 'epoch': 0.66}


                                                   
 66%|██████▋   | 530/800 [3:12:37<45:14, 10.05s/it]

{'eval_loss': 0.7820502519607544, 'eval_runtime': 99.6322, 'eval_samples_per_second': 8.03, 'eval_steps_per_second': 1.004, 'epoch': 0.66}


 67%|██████▋   | 535/800 [3:12:43<43:40,  9.89s/it]  

{'loss': 0.6104, 'learning_rate': 5.223669303476041e-05, 'epoch': 0.67}


                                                   
 67%|██████▋   | 535/800 [3:14:23<43:40,  9.89s/it]

{'eval_loss': 0.7815704345703125, 'eval_runtime': 99.9899, 'eval_samples_per_second': 8.001, 'eval_steps_per_second': 1.0, 'epoch': 0.67}


 68%|██████▊   | 540/800 [3:14:29<42:27,  9.80s/it]  

{'loss': 0.516, 'learning_rate': 5.0468200231001286e-05, 'epoch': 0.68}


                                                   
 68%|██████▊   | 540/800 [3:16:11<42:27,  9.80s/it]

{'eval_loss': 0.7813812494277954, 'eval_runtime': 101.3445, 'eval_samples_per_second': 7.894, 'eval_steps_per_second': 0.987, 'epoch': 0.68}


 68%|██████▊   | 545/800 [3:16:17<41:58,  9.88s/it]  

{'loss': 0.4356, 'learning_rate': 4.8720002265330015e-05, 'epoch': 0.68}


                                                   
 68%|██████▊   | 545/800 [3:18:00<41:58,  9.88s/it]

{'eval_loss': 0.7813186645507812, 'eval_runtime': 103.217, 'eval_samples_per_second': 7.751, 'eval_steps_per_second': 0.969, 'epoch': 0.68}


 69%|██████▉   | 550/800 [3:18:06<41:44, 10.02s/it]  

{'loss': 0.378, 'learning_rate': 4.699281543303222e-05, 'epoch': 0.69}


                                                   
 69%|██████▉   | 550/800 [3:19:49<41:44, 10.02s/it]

{'eval_loss': 0.7818038463592529, 'eval_runtime': 102.8424, 'eval_samples_per_second': 7.779, 'eval_steps_per_second': 0.972, 'epoch': 0.69}


 69%|██████▉   | 555/800 [3:19:59<43:08, 10.56s/it]  

{'loss': 1.1699, 'learning_rate': 4.528734742042803e-05, 'epoch': 0.69}


                                                   
 69%|██████▉   | 555/800 [3:21:40<43:08, 10.56s/it]

{'eval_loss': 0.7824380397796631, 'eval_runtime': 101.37, 'eval_samples_per_second': 7.892, 'eval_steps_per_second': 0.986, 'epoch': 0.69}


 70%|███████   | 560/800 [3:21:48<41:20, 10.33s/it]  

{'loss': 1.0437, 'learning_rate': 4.360429701490934e-05, 'epoch': 0.7}


                                                   
 70%|███████   | 560/800 [3:23:29<41:20, 10.33s/it]

{'eval_loss': 0.7821084856987, 'eval_runtime': 100.5988, 'eval_samples_per_second': 7.952, 'eval_steps_per_second': 0.994, 'epoch': 0.7}


 71%|███████   | 565/800 [3:23:37<39:56, 10.20s/it]  

{'loss': 0.8649, 'learning_rate': 4.1944353818623424e-05, 'epoch': 0.71}


                                                   
 71%|███████   | 565/800 [3:25:18<39:56, 10.20s/it]

{'eval_loss': 0.7813282012939453, 'eval_runtime': 101.0143, 'eval_samples_per_second': 7.92, 'eval_steps_per_second': 0.99, 'epoch': 0.71}


 71%|███████▏  | 570/800 [3:25:25<38:43, 10.10s/it]  

{'loss': 0.872, 'learning_rate': 4.030819796591949e-05, 'epoch': 0.71}


                                                   
 71%|███████▏  | 570/800 [3:27:06<38:43, 10.10s/it]

{'eval_loss': 0.7809504866600037, 'eval_runtime': 101.1261, 'eval_samples_per_second': 7.911, 'eval_steps_per_second': 0.989, 'epoch': 0.71}


 72%|███████▏  | 575/800 [3:27:13<37:40, 10.05s/it]  

{'loss': 0.8034, 'learning_rate': 3.869649984467504e-05, 'epoch': 0.72}


                                                   
 72%|███████▏  | 575/800 [3:28:54<37:40, 10.05s/it]

{'eval_loss': 0.7806559801101685, 'eval_runtime': 100.5703, 'eval_samples_per_second': 7.955, 'eval_steps_per_second': 0.994, 'epoch': 0.72}


 72%|███████▎  | 580/800 [3:29:00<36:34,  9.98s/it]  

{'loss': 0.7692, 'learning_rate': 3.710991982161555e-05, 'epoch': 0.72}


                                                   
 72%|███████▎  | 580/800 [3:30:41<36:34,  9.98s/it]

{'eval_loss': 0.7803995609283447, 'eval_runtime': 100.5881, 'eval_samples_per_second': 7.953, 'eval_steps_per_second': 0.994, 'epoch': 0.72}


 73%|███████▎  | 585/800 [3:30:47<35:35,  9.93s/it]  

{'loss': 0.6357, 'learning_rate': 3.55491079717399e-05, 'epoch': 0.73}


                                                   
 73%|███████▎  | 585/800 [3:32:28<35:35,  9.93s/it]

{'eval_loss': 0.7803628444671631, 'eval_runtime': 100.5473, 'eval_samples_per_second': 7.956, 'eval_steps_per_second': 0.995, 'epoch': 0.73}


 74%|███████▍  | 590/800 [3:32:34<34:45,  9.93s/it]  

{'loss': 0.5802, 'learning_rate': 3.4014703811963025e-05, 'epoch': 0.74}


                                                   
 74%|███████▍  | 590/800 [3:34:14<34:45,  9.93s/it]

{'eval_loss': 0.7803699970245361, 'eval_runtime': 99.9141, 'eval_samples_per_second': 8.007, 'eval_steps_per_second': 1.001, 'epoch': 0.74}


 74%|███████▍  | 595/800 [3:34:20<33:19,  9.76s/it]  

{'loss': 0.5121, 'learning_rate': 3.2507336039084314e-05, 'epoch': 0.74}


                                                   
 74%|███████▍  | 595/800 [3:36:01<33:19,  9.76s/it]

{'eval_loss': 0.7801821827888489, 'eval_runtime': 100.8533, 'eval_samples_per_second': 7.932, 'eval_steps_per_second': 0.992, 'epoch': 0.74}


 75%|███████▌  | 600/800 [3:36:06<32:37,  9.79s/it]  

{'loss': 0.4136, 'learning_rate': 3.102762227218957e-05, 'epoch': 0.75}


                                                   
 75%|███████▌  | 600/800 [3:37:47<32:37,  9.79s/it]

{'eval_loss': 0.7804909348487854, 'eval_runtime': 100.816, 'eval_samples_per_second': 7.935, 'eval_steps_per_second': 0.992, 'epoch': 0.75}


 76%|███████▌  | 605/800 [3:37:57<33:39, 10.35s/it]  

{'loss': 1.1691, 'learning_rate': 2.9576168799591664e-05, 'epoch': 0.76}


                                                   
 76%|███████▌  | 605/800 [3:39:38<33:39, 10.35s/it]

{'eval_loss': 0.7808462381362915, 'eval_runtime': 100.6945, 'eval_samples_per_second': 7.945, 'eval_steps_per_second': 0.993, 'epoch': 0.76}


 76%|███████▋  | 610/800 [3:39:46<32:17, 10.20s/it]  

{'loss': 0.9957, 'learning_rate': 2.8153570330413925e-05, 'epoch': 0.76}


                                                   
 76%|███████▋  | 610/800 [3:41:27<32:17, 10.20s/it]

{'eval_loss': 0.7807842493057251, 'eval_runtime': 101.07, 'eval_samples_per_second': 7.915, 'eval_steps_per_second': 0.989, 'epoch': 0.76}


 77%|███████▋  | 615/800 [3:41:34<31:11, 10.12s/it]  

{'loss': 0.9462, 'learning_rate': 2.6760409750917927e-05, 'epoch': 0.77}


                                                   
 77%|███████▋  | 615/800 [3:43:15<31:11, 10.12s/it]

{'eval_loss': 0.7804591655731201, 'eval_runtime': 100.6569, 'eval_samples_per_second': 7.948, 'eval_steps_per_second': 0.993, 'epoch': 0.77}


 78%|███████▊  | 620/800 [3:43:22<30:08, 10.05s/it]  

{'loss': 0.7852, 'learning_rate': 2.5397257885675397e-05, 'epoch': 0.78}


                                                   
 78%|███████▊  | 620/800 [3:45:03<30:08, 10.05s/it]

{'eval_loss': 0.7802379131317139, 'eval_runtime': 101.123, 'eval_samples_per_second': 7.911, 'eval_steps_per_second': 0.989, 'epoch': 0.78}


 78%|███████▊  | 625/800 [3:45:10<29:21, 10.07s/it]  

{'loss': 0.7535, 'learning_rate': 2.406467326368237e-05, 'epoch': 0.78}


                                                   
 78%|███████▊  | 625/800 [3:46:51<29:21, 10.07s/it]

{'eval_loss': 0.7800135016441345, 'eval_runtime': 101.2371, 'eval_samples_per_second': 7.902, 'eval_steps_per_second': 0.988, 'epoch': 0.78}


 79%|███████▉  | 630/800 [3:46:58<28:21, 10.01s/it]  

{'loss': 0.6338, 'learning_rate': 2.2763201889510987e-05, 'epoch': 0.79}


                                                   
 79%|███████▉  | 630/800 [3:48:39<28:21, 10.01s/it]

{'eval_loss': 0.7797491550445557, 'eval_runtime': 101.236, 'eval_samples_per_second': 7.902, 'eval_steps_per_second': 0.988, 'epoch': 0.79}


 79%|███████▉  | 635/800 [3:48:45<27:29, 10.00s/it]  

{'loss': 0.6053, 'learning_rate': 2.149337701959325e-05, 'epoch': 0.79}


                                                   
 79%|███████▉  | 635/800 [3:50:26<27:29, 10.00s/it]

{'eval_loss': 0.7795661687850952, 'eval_runtime': 100.9966, 'eval_samples_per_second': 7.921, 'eval_steps_per_second': 0.99, 'epoch': 0.79}


 80%|████████  | 640/800 [3:50:33<26:28,  9.93s/it]  

{'loss': 0.5469, 'learning_rate': 2.025571894372794e-05, 'epoch': 0.8}


                                                   
 80%|████████  | 640/800 [3:52:13<26:28,  9.93s/it]

{'eval_loss': 0.7794932723045349, 'eval_runtime': 100.5037, 'eval_samples_per_second': 7.96, 'eval_steps_per_second': 0.995, 'epoch': 0.8}


 81%|████████  | 645/800 [3:52:19<25:23,  9.83s/it]  

{'loss': 0.4325, 'learning_rate': 1.9050734771900413e-05, 'epoch': 0.81}


                                                   
 81%|████████  | 645/800 [3:54:00<25:23,  9.83s/it]

{'eval_loss': 0.7794876098632812, 'eval_runtime': 100.8512, 'eval_samples_per_second': 7.932, 'eval_steps_per_second': 0.992, 'epoch': 0.81}


 81%|████████▏ | 650/800 [3:54:06<24:31,  9.81s/it]  

{'loss': 0.3656, 'learning_rate': 1.7878918226502816e-05, 'epoch': 0.81}


                                                   
 81%|████████▏ | 650/800 [3:55:47<24:31,  9.81s/it]

{'eval_loss': 0.779486894607544, 'eval_runtime': 100.9532, 'eval_samples_per_second': 7.924, 'eval_steps_per_second': 0.991, 'epoch': 0.81}


 82%|████████▏ | 655/800 [3:55:56<24:57, 10.33s/it]  

{'loss': 1.1371, 'learning_rate': 1.6740749440039262e-05, 'epoch': 0.82}


                                                   
 82%|████████▏ | 655/800 [3:57:37<24:57, 10.33s/it]

{'eval_loss': 0.7794232368469238, 'eval_runtime': 101.1737, 'eval_samples_per_second': 7.907, 'eval_steps_per_second': 0.988, 'epoch': 0.82}


 82%|████████▎ | 660/800 [3:57:45<23:59, 10.28s/it]  

{'loss': 1.0603, 'learning_rate': 1.563669475839956e-05, 'epoch': 0.82}


                                                   
 82%|████████▎ | 660/800 [3:59:27<23:59, 10.28s/it]

{'eval_loss': 0.7793282866477966, 'eval_runtime': 101.2677, 'eval_samples_per_second': 7.9, 'eval_steps_per_second': 0.987, 'epoch': 0.82}


 83%|████████▎ | 665/800 [3:59:34<23:01, 10.23s/it]  

{'loss': 0.9488, 'learning_rate': 1.4567206549781698e-05, 'epoch': 0.83}


                                                   
 83%|████████▎ | 665/800 [4:01:17<23:01, 10.23s/it]

{'eval_loss': 0.7792463898658752, 'eval_runtime': 102.3269, 'eval_samples_per_second': 7.818, 'eval_steps_per_second': 0.977, 'epoch': 0.83}


 84%|████████▍ | 670/800 [4:01:24<22:09, 10.23s/it]  

{'loss': 0.7932, 'learning_rate': 1.3532723019341375e-05, 'epoch': 0.84}


                                                   
 84%|████████▍ | 670/800 [4:03:07<22:09, 10.23s/it]

{'eval_loss': 0.7791444659233093, 'eval_runtime': 102.465, 'eval_samples_per_second': 7.808, 'eval_steps_per_second': 0.976, 'epoch': 0.84}


 84%|████████▍ | 675/800 [4:03:14<21:15, 10.21s/it]  

{'loss': 0.7589, 'learning_rate': 1.2533668029644751e-05, 'epoch': 0.84}


                                                   
 84%|████████▍ | 675/800 [4:04:56<21:15, 10.21s/it]

{'eval_loss': 0.779018223285675, 'eval_runtime': 102.7656, 'eval_samples_per_second': 7.785, 'eval_steps_per_second': 0.973, 'epoch': 0.84}


 85%|████████▌ | 680/800 [4:05:03<20:20, 10.17s/it]  

{'loss': 0.72, 'learning_rate': 1.1570450926997655e-05, 'epoch': 0.85}


                                                   
 85%|████████▌ | 680/800 [4:06:46<20:20, 10.17s/it]

{'eval_loss': 0.7789266705513, 'eval_runtime': 102.9325, 'eval_samples_per_second': 7.772, 'eval_steps_per_second': 0.972, 'epoch': 0.85}


 86%|████████▌ | 685/800 [4:06:53<19:31, 10.19s/it]  

{'loss': 0.6049, 'learning_rate': 1.0643466373722711e-05, 'epoch': 0.86}


                                                   
 86%|████████▌ | 685/800 [4:08:36<19:31, 10.19s/it]

{'eval_loss': 0.7788793444633484, 'eval_runtime': 102.7523, 'eval_samples_per_second': 7.786, 'eval_steps_per_second': 0.973, 'epoch': 0.86}


 86%|████████▋ | 690/800 [4:08:42<18:37, 10.16s/it]  

{'loss': 0.6083, 'learning_rate': 9.753094186453026e-06, 'epoch': 0.86}


                                                   
 86%|████████▋ | 690/800 [4:10:26<18:37, 10.16s/it]

{'eval_loss': 0.7788457274436951, 'eval_runtime': 103.4249, 'eval_samples_per_second': 7.735, 'eval_steps_per_second': 0.967, 'epoch': 0.86}


 87%|████████▋ | 695/800 [4:10:32<17:38, 10.08s/it]  

{'loss': 0.5312, 'learning_rate': 8.89969918050847e-06, 'epoch': 0.87}


                                                   
 87%|████████▋ | 695/800 [4:12:15<17:38, 10.08s/it]

{'eval_loss': 0.7788212299346924, 'eval_runtime': 103.0332, 'eval_samples_per_second': 7.764, 'eval_steps_per_second': 0.971, 'epoch': 0.87}


 88%|████████▊ | 700/800 [4:12:21<16:40, 10.00s/it]  

{'loss': 0.3584, 'learning_rate': 8.083631020418791e-06, 'epoch': 0.88}


                                                   
 88%|████████▊ | 700/800 [4:14:03<16:40, 10.00s/it]

{'eval_loss': 0.7788112163543701, 'eval_runtime': 102.5843, 'eval_samples_per_second': 7.798, 'eval_steps_per_second': 0.975, 'epoch': 0.88}


 88%|████████▊ | 705/800 [4:14:13<16:41, 10.55s/it]  

{'loss': 1.2337, 'learning_rate': 7.305224076654127e-06, 'epoch': 0.88}


                                                   
 88%|████████▊ | 705/800 [4:15:54<16:41, 10.55s/it]

{'eval_loss': 0.7787989974021912, 'eval_runtime': 101.0002, 'eval_samples_per_second': 7.921, 'eval_steps_per_second': 0.99, 'epoch': 0.88}


 89%|████████▉ | 710/800 [4:16:01<15:25, 10.29s/it]  

{'loss': 1.0267, 'learning_rate': 6.564797288622371e-06, 'epoch': 0.89}


                                                   
 89%|████████▉ | 710/800 [4:17:42<15:25, 10.29s/it]

{'eval_loss': 0.7787727117538452, 'eval_runtime': 100.5437, 'eval_samples_per_second': 7.957, 'eval_steps_per_second': 0.995, 'epoch': 0.89}


 89%|████████▉ | 715/800 [4:17:50<14:23, 10.15s/it]

{'loss': 0.9082, 'learning_rate': 5.86265403398899e-06, 'epoch': 0.89}


                                                   
 89%|████████▉ | 715/800 [4:19:30<14:23, 10.15s/it]

{'eval_loss': 0.7787339091300964, 'eval_runtime': 100.2531, 'eval_samples_per_second': 7.98, 'eval_steps_per_second': 0.997, 'epoch': 0.89}


 90%|█████████ | 720/800 [4:19:37<13:24, 10.05s/it]

{'loss': 0.8431, 'learning_rate': 5.199082004372957e-06, 'epoch': 0.9}


                                                   
 90%|█████████ | 720/800 [4:21:18<13:24, 10.05s/it]

{'eval_loss': 0.7786949276924133, 'eval_runtime': 100.765, 'eval_samples_per_second': 7.939, 'eval_steps_per_second': 0.992, 'epoch': 0.9}


 91%|█████████ | 725/800 [4:21:25<12:32, 10.03s/it]

{'loss': 0.7747, 'learning_rate': 4.574353087469929e-06, 'epoch': 0.91}


                                                   
 91%|█████████ | 725/800 [4:23:06<12:32, 10.03s/it]

{'eval_loss': 0.7786573767662048, 'eval_runtime': 100.5998, 'eval_samples_per_second': 7.952, 'eval_steps_per_second': 0.994, 'epoch': 0.91}


 91%|█████████▏| 730/800 [4:23:13<11:39,  9.99s/it]

{'loss': 0.7157, 'learning_rate': 3.988723255650728e-06, 'epoch': 0.91}


                                                   
 91%|█████████▏| 730/800 [4:24:53<11:39,  9.99s/it]

{'eval_loss': 0.7786319255828857, 'eval_runtime': 100.5851, 'eval_samples_per_second': 7.953, 'eval_steps_per_second': 0.994, 'epoch': 0.91}


 92%|█████████▏| 735/800 [4:25:00<10:46,  9.95s/it]

{'loss': 0.6714, 'learning_rate': 3.442432461080858e-06, 'epoch': 0.92}


                                                   
 92%|█████████▏| 735/800 [4:26:40<10:46,  9.95s/it]

{'eval_loss': 0.7786187529563904, 'eval_runtime': 100.6809, 'eval_samples_per_second': 7.946, 'eval_steps_per_second': 0.993, 'epoch': 0.92}


 92%|█████████▎| 740/800 [4:26:47<09:52,  9.87s/it]

{'loss': 0.4661, 'learning_rate': 2.9357045374040825e-06, 'epoch': 0.93}


                                                   
 92%|█████████▎| 740/800 [4:28:27<09:52,  9.87s/it]

{'eval_loss': 0.7786157727241516, 'eval_runtime': 100.1857, 'eval_samples_per_second': 7.985, 'eval_steps_per_second': 0.998, 'epoch': 0.93}


 93%|█████████▎| 745/800 [4:28:33<08:58,  9.79s/it]

{'loss': 0.4835, 'learning_rate': 2.4687471080302894e-06, 'epoch': 0.93}


                                                   
 93%|█████████▎| 745/800 [4:30:13<08:58,  9.79s/it]

{'eval_loss': 0.7786147594451904, 'eval_runtime': 100.281, 'eval_samples_per_second': 7.978, 'eval_steps_per_second': 0.997, 'epoch': 0.93}


 94%|█████████▍| 750/800 [4:30:19<08:07,  9.75s/it]

{'loss': 0.3469, 'learning_rate': 2.041751501065203e-06, 'epoch': 0.94}


                                                   
 94%|█████████▍| 750/800 [4:31:59<08:07,  9.75s/it]

{'eval_loss': 0.7786192297935486, 'eval_runtime': 100.2747, 'eval_samples_per_second': 7.978, 'eval_steps_per_second': 0.997, 'epoch': 0.94}


 94%|█████████▍| 755/800 [4:32:09<07:44, 10.32s/it]

{'loss': 1.1102, 'learning_rate': 1.6548926709168633e-06, 'epoch': 0.94}


                                                   
 94%|█████████▍| 755/800 [4:33:50<07:44, 10.32s/it]

{'eval_loss': 0.7786183953285217, 'eval_runtime': 100.8016, 'eval_samples_per_second': 7.936, 'eval_steps_per_second': 0.992, 'epoch': 0.94}


 95%|█████████▌| 760/800 [4:33:58<06:50, 10.26s/it]

{'loss': 0.9896, 'learning_rate': 1.30832912661093e-06, 'epoch': 0.95}


                                                   
 95%|█████████▌| 760/800 [4:35:40<06:50, 10.26s/it]

{'eval_loss': 0.7786121964454651, 'eval_runtime': 101.6369, 'eval_samples_per_second': 7.871, 'eval_steps_per_second': 0.984, 'epoch': 0.95}


 96%|█████████▌| 765/800 [4:35:47<05:57, 10.23s/it]

{'loss': 0.8873, 'learning_rate': 1.0022028668442375e-06, 'epoch': 0.96}


                                                   
 96%|█████████▌| 765/800 [4:37:28<05:57, 10.23s/it]

{'eval_loss': 0.7786041498184204, 'eval_runtime': 101.1966, 'eval_samples_per_second': 7.905, 'eval_steps_per_second': 0.988, 'epoch': 0.96}


 96%|█████████▋| 770/800 [4:37:36<05:03, 10.12s/it]

{'loss': 0.7968, 'learning_rate': 7.366393218031564e-07, 'epoch': 0.96}


                                                   
 96%|█████████▋| 770/800 [4:39:16<05:03, 10.12s/it]

{'eval_loss': 0.7785984873771667, 'eval_runtime': 100.3234, 'eval_samples_per_second': 7.974, 'eval_steps_per_second': 0.997, 'epoch': 0.96}


 97%|█████████▋| 775/800 [4:39:23<04:09, 10.00s/it]

{'loss': 0.7981, 'learning_rate': 5.117473017706486e-07, 'epoch': 0.97}


                                                   
 97%|█████████▋| 775/800 [4:41:03<04:09, 10.00s/it]

{'eval_loss': 0.7785946726799011, 'eval_runtime': 100.4583, 'eval_samples_per_second': 7.964, 'eval_steps_per_second': 0.995, 'epoch': 0.97}


 98%|█████████▊| 780/800 [4:41:10<03:18,  9.92s/it]

{'loss': 0.696, 'learning_rate': 3.2761895254306287e-07, 'epoch': 0.97}


                                                   
 98%|█████████▊| 780/800 [4:42:50<03:18,  9.92s/it]

{'eval_loss': 0.7785922288894653, 'eval_runtime': 100.3069, 'eval_samples_per_second': 7.976, 'eval_steps_per_second': 0.997, 'epoch': 0.97}


 98%|█████████▊| 785/800 [4:42:57<02:28,  9.93s/it]

{'loss': 0.6225, 'learning_rate': 1.843297176748804e-07, 'epoch': 0.98}


                                                   
 98%|█████████▊| 785/800 [4:44:39<02:28,  9.93s/it]

{'eval_loss': 0.7785908579826355, 'eval_runtime': 102.5756, 'eval_samples_per_second': 7.799, 'eval_steps_per_second': 0.975, 'epoch': 0.98}


 99%|█████████▉| 790/800 [4:44:45<01:40, 10.03s/it]

{'loss': 0.5381, 'learning_rate': 8.193830756699772e-08, 'epoch': 0.99}


                                                   
 99%|█████████▉| 790/800 [4:46:28<01:40, 10.03s/it]

{'eval_loss': 0.7785903811454773, 'eval_runtime': 102.1885, 'eval_samples_per_second': 7.829, 'eval_steps_per_second': 0.979, 'epoch': 0.99}


 99%|█████████▉| 795/800 [4:46:34<00:49,  9.97s/it]

{'loss': 0.4773, 'learning_rate': 2.0486675411102163e-08, 'epoch': 0.99}


                                                   
 99%|█████████▉| 795/800 [4:48:16<00:49,  9.97s/it]

{'eval_loss': 0.7785902619361877, 'eval_runtime': 102.7215, 'eval_samples_per_second': 7.788, 'eval_steps_per_second': 0.974, 'epoch': 0.99}


100%|██████████| 800/800 [4:48:22<00:00, 10.00s/it]

{'loss': 0.415, 'learning_rate': 0.0, 'epoch': 1.0}


                                                   
100%|██████████| 800/800 [4:50:06<00:00, 10.00s/it]

{'eval_loss': 0.7785902619361877, 'eval_runtime': 103.4597, 'eval_samples_per_second': 7.732, 'eval_steps_per_second': 0.967, 'epoch': 1.0}


100%|██████████| 800/800 [4:50:06<00:00, 21.76s/it]

{'train_runtime': 17406.9139, 'train_samples_per_second': 0.184, 'train_steps_per_second': 0.046, 'train_loss': 0.783082422465086, 'epoch': 1.0}





TrainOutput(global_step=800, training_loss=0.783082422465086, metrics={'train_runtime': 17406.9139, 'train_samples_per_second': 0.184, 'train_steps_per_second': 0.046, 'train_loss': 0.783082422465086, 'epoch': 1.0})

In [16]:
trainer.model.save_pretrained(new_model)

In [17]:
# 以FP16重新載入模型並將其與LoRA權重合併
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# 重新載入分詞器以進行保存
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 儲存合併後的模型
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.98s/it]
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at yentinglin/Taiwan-LLM-7B-v2.0-chat and are newly initialized: ['model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.27.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', '

('./1129\\tokenizer_config.json',
 './1129\\special_tokens_map.json',
 './1129\\tokenizer.model',
 './1129\\added_tokens.json',
 './1129\\tokenizer.json')