In [36]:
import os

In [37]:
%pwd

'/Users/siddharth/Documents/EndtoEndProjects/TextSummarizerProject'

In [38]:
# Run everything from the project root so relative paths such as
# config/config.yaml, params.yaml and artifacts/ resolve correctly.
# The default is the original author's checkout; set TEXT_SUMMARIZER_ROOT
# to point at your own checkout instead of editing this cell.
PROJECT_ROOT = os.environ.get(
    "TEXT_SUMMARIZER_ROOT",
    "/Users/siddharth/Documents/EndtoEndProjects/TextSummarizerProject",
)
os.chdir(PROJECT_ROOT)

In [39]:
%pwd

'/Users/siddharth/Documents/EndtoEndProjects/TextSummarizerProject'

In [40]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable bundle of settings for the model-training stage.

    ``ConfigurationManager.get_model_trainer_config`` fills the three path-like
    fields from the ``model_trainer`` section of the config YAML and the
    remaining fields from the ``TrainingArguments`` section of the params YAML.
    ``ModelTrainer.train`` then forwards the hyperparameters to
    ``transformers.TrainingArguments``.

    The annotations are informational only: dataclasses do not enforce them and
    the values are stored exactly as they were read, with no conversion.
    """
    # Passed as ``output_dir`` to TrainingArguments; also the parent directory
    # of the saved model and tokenizer.
    root_dir: Path
    # Location handed to ``datasets.load_from_disk``.
    data_path: Path
    # Identifier/path given to ``from_pretrained`` for both tokenizer and model.
    model_ckpt: Path
    # The fields below are forwarded to the TrainingArguments parameter of the
    # same name.
    num_train_epochs: int
    warmup_steps: int
    # Also reused as ``per_device_eval_batch_size`` by ModelTrainer.
    per_device_train_batch_size: int
    weight_decay: float
    logging_steps: int
    eval_strategy: str
    eval_steps: int
    save_steps: float
    gradient_accumulation_steps: int

In [41]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [42]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    # Fields copied from the ``model_trainer`` section of the config file.
    _TRAINER_PATH_FIELDS = ("root_dir", "data_path", "model_ckpt")

    # Fields copied from the ``TrainingArguments`` section of the params file.
    _TRAINER_HYPERPARAMETER_FIELDS = (
        "num_train_epochs",
        "warmup_steps",
        "per_device_train_batch_size",
        "weight_decay",
        "logging_steps",
        "eval_strategy",
        "eval_steps",
        "save_steps",
        "gradient_accumulation_steps",
    )

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and ensure the artifacts root directory exists.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the hyperparameter YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the training-stage config and create its output directory.

        Returns:
            A ``ModelTrainerConfig`` populated from ``config.model_trainer``
            and ``params.TrainingArguments``.
        """
        trainer_section = self.config.model_trainer
        training_params = self.params.TrainingArguments

        create_directories([trainer_section.root_dir])

        # Same attribute reads as spelling each keyword out by hand, in the
        # same order: path settings first, then hyperparameters.
        path_settings = {
            field: getattr(trainer_section, field)
            for field in self._TRAINER_PATH_FIELDS
        }
        hyperparameters = {
            field: getattr(training_params, field)
            for field in self._TRAINER_HYPERPARAMETER_FIELDS
        }

        return ModelTrainerConfig(**path_settings, **hyperparameters)

In [43]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch

In [44]:
class ModelTrainer:
    """Fine-tunes a seq2seq checkpoint on a tokenized dataset stored on disk."""

    def __init__(self, config: ModelTrainerConfig):
        """Keep the training settings for later use by ``train``.

        Args:
            config: Paths and hyperparameters for this training run.
        """
        self.config = config

    def train(self, train_split: str = "test", eval_split: str = "validation"):
        """Fine-tune the configured checkpoint on CPU and save the result.

        The model and tokenizer are loaded from ``config.model_ckpt``, trained
        with ``transformers.Trainer`` and then written to
        ``<root_dir>/t5-small`` and ``<root_dir>/tokenizer``. The model
        directory name is fixed and does not follow ``model_ckpt``.

        Args:
            train_split: Key of the dataset split to train on. The default
                ``"test"`` keeps the notebook's existing behaviour.
                NOTE(review): fitting on the test split leaks the held-out
                data; it looks like a shortcut for a quick run. Pass
                ``"train"`` for a real fine-tuning run.
            eval_split: Key of the dataset split used for periodic evaluation.

        Raises:
            KeyError: If a requested split is not present in the dataset
                (presumably a ``DatasetDict`` - confirm against the data
                transformation stage).
        """
        # This variable only enables CPU fallback for operators that lack an
        # MPS kernel; it does not disable MPS. It is kept so the process
        # environment is unchanged for any later cells.
        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
        device = "cpu"

        tokenizer = AutoTokenizer.from_pretrained(self.config.model_ckpt)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_ckpt).to(device)
        data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

        # Load tokenized dataset
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        trainer_args = TrainingArguments(
            output_dir=self.config.root_dir,
            num_train_epochs=self.config.num_train_epochs,
            warmup_steps=self.config.warmup_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            # Evaluation reuses the training batch size; there is no separate setting.
            per_device_eval_batch_size=self.config.per_device_train_batch_size,
            weight_decay=self.config.weight_decay,
            logging_steps=self.config.logging_steps,
            eval_strategy=self.config.eval_strategy,
            eval_steps=self.config.eval_steps,
            save_steps=self.config.save_steps,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            fp16=False,                  # no mixed precision on CPU
            dataloader_num_workers=0,    # load batches in the main process
            save_total_limit=1,          # keep only the most recent checkpoint
            report_to=[],                # no external logging integrations
            # Supported way to pin the Trainer to the CPU. Replaces the earlier
            # monkey-patch of torch.backends.mps.is_available, which changed
            # global state for the whole kernel and still let the Trainer
            # pick a CUDA device when one was present.
            use_cpu=True,
        )

        trainer = Trainer(
            model=model,
            args=trainer_args,
            tokenizer=tokenizer,
            data_collator=data_collator,
            train_dataset=dataset_samsum_pt[train_split],
            eval_dataset=dataset_samsum_pt[eval_split]
        )

        trainer.train()

        # Save model and tokenizer
        model.save_pretrained(os.path.join(self.config.root_dir, "t5-small"))
        tokenizer.save_pretrained(os.path.join(self.config.root_dir, "tokenizer"))


In [45]:
# Run the training stage end to end: load settings, build the trainer, train.
# Errors propagate on their own, so the former `except Exception as e: raise e`
# wrapper (a no-op) is gone. The trainer also gets its own name instead of
# rebinding `model_trainer_config` to an object of a different type.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2025-07-19 23:23:09,832: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-07-19 23:23:09,834: INFO: common: yaml file: params.yaml loaded successfully]
[2025-07-19 23:23:09,835: INFO: common: created directory at: artifacts]
[2025-07-19 23:23:09,835: INFO: common: created directory at: artifacts/model_trainer]


  0%|          | 20/14732 [02:49<34:36:49,  8.47s/it]
  1%|          | 5/819 [00:02<06:53,  1.97it/s]

[A[A                                          


                                                 
[A

  1%|          | 5/819 [00:02<06:53,  1.97it/s]

{'loss': 2.9257, 'grad_norm': 8.752639770507812, 'learning_rate': 5e-05, 'epoch': 0.01}


  1%|          | 10/819 [00:03<04:46,  2.83it/s]

[A[A                                          


                                                 
[A

  1%|          | 10/819 [00:03<04:46,  2.83it/s]

{'loss': 3.1741, 'grad_norm': 15.644474983215332, 'learning_rate': 4.9692874692874694e-05, 'epoch': 0.01}



[A                                              

[A[A                                          


                                                 
[A

  1%|          | 10/819 [00:35<04:46,  2.83it/s]

{'eval_loss': 2.8033080101013184, 'eval_runtime': 31.605, 'eval_samples_per_second': 25.882, 'eval_steps_per_second': 25.882, 'epoch': 0.01}


  2%|▏         | 15/819 [00:37<34:47,  2.60s/it]  

[A[A                                          


                                                 
[A

  2%|▏         | 15/819 [00:37<34:47,  2.60s/it]

{'loss': 2.7978, 'grad_norm': 11.219767570495605, 'learning_rate': 4.9385749385749387e-05, 'epoch': 0.02}


  2%|▏         | 20/819 [00:38<08:19,  1.60it/s]

[A[A                                          


                                                 
[A

  2%|▏         | 20/819 [00:38<08:19,  1.60it/s]

{'loss': 2.9572, 'grad_norm': 33.99694061279297, 'learning_rate': 4.907862407862408e-05, 'epoch': 0.02}



[A                                              

[A[A                                          


                                                 
[A

  2%|▏         | 20/819 [01:09<08:19,  1.60it/s]

{'eval_loss': 2.563750743865967, 'eval_runtime': 31.155, 'eval_samples_per_second': 26.256, 'eval_steps_per_second': 26.256, 'epoch': 0.02}


  3%|▎         | 25/819 [01:11<34:17,  2.59s/it]  

[A[A                                          


                                                 
[A

  3%|▎         | 25/819 [01:11<34:17,  2.59s/it]

{'loss': 3.0927, 'grad_norm': 18.805370330810547, 'learning_rate': 4.877149877149878e-05, 'epoch': 0.03}


  4%|▎         | 30/819 [01:12<08:54,  1.47it/s]

[A[A                                          


                                                 
[A

  4%|▎         | 30/819 [01:12<08:54,  1.47it/s]

{'loss': 2.8111, 'grad_norm': 20.137855529785156, 'learning_rate': 4.846437346437347e-05, 'epoch': 0.04}



[A                                              

[A[A                                          


                                                 
[A

  4%|▎         | 30/819 [01:43<08:54,  1.47it/s]

{'eval_loss': 2.4350433349609375, 'eval_runtime': 31.0463, 'eval_samples_per_second': 26.348, 'eval_steps_per_second': 26.348, 'epoch': 0.04}


  4%|▍         | 35/819 [01:45<33:43,  2.58s/it]  

[A[A                                          


                                                 
[A

  4%|▍         | 35/819 [01:45<33:43,  2.58s/it]

{'loss': 2.167, 'grad_norm': 7.514581680297852, 'learning_rate': 4.8157248157248155e-05, 'epoch': 0.04}


  5%|▍         | 40/819 [01:46<08:13,  1.58it/s]

[A[A                                          


                                                 
[A

  5%|▍         | 40/819 [01:46<08:13,  1.58it/s]

{'loss': 2.4353, 'grad_norm': 12.948577880859375, 'learning_rate': 4.7850122850122854e-05, 'epoch': 0.05}



[A                                              

[A[A                                          


                                                 
[A

  5%|▍         | 40/819 [02:17<08:13,  1.58it/s]

{'eval_loss': 2.354356050491333, 'eval_runtime': 30.9192, 'eval_samples_per_second': 26.456, 'eval_steps_per_second': 26.456, 'epoch': 0.05}


  5%|▌         | 45/819 [02:19<33:14,  2.58s/it]  

[A[A                                          


                                                 
[A

  5%|▌         | 45/819 [02:19<33:14,  2.58s/it]

{'loss': 2.6914, 'grad_norm': 313.9160461425781, 'learning_rate': 4.7542997542997546e-05, 'epoch': 0.05}


  6%|▌         | 50/819 [02:20<07:58,  1.61it/s]

[A[A                                          


                                                 
[A

  6%|▌         | 50/819 [02:20<07:58,  1.61it/s]

{'loss': 2.6096, 'grad_norm': 38.288551330566406, 'learning_rate': 4.723587223587224e-05, 'epoch': 0.06}



[A                                              

[A[A                                          


                                                 
[A

  6%|▌         | 50/819 [02:51<07:58,  1.61it/s]

{'eval_loss': 2.2963218688964844, 'eval_runtime': 30.9463, 'eval_samples_per_second': 26.433, 'eval_steps_per_second': 26.433, 'epoch': 0.06}


  7%|▋         | 55/819 [02:52<32:39,  2.56s/it]  

[A[A                                          


                                                 
[A

  7%|▋         | 55/819 [02:52<32:39,  2.56s/it]

{'loss': 1.7961, 'grad_norm': 11.095928192138672, 'learning_rate': 4.692874692874693e-05, 'epoch': 0.07}


  7%|▋         | 60/819 [02:54<08:10,  1.55it/s]

[A[A                                          


                                                 
[A

  7%|▋         | 60/819 [02:54<08:10,  1.55it/s]

{'loss': 2.3012, 'grad_norm': 15.308932304382324, 'learning_rate': 4.662162162162162e-05, 'epoch': 0.07}



[A                                              

[A[A                                          


                                                 
[A

  7%|▋         | 60/819 [03:25<08:10,  1.55it/s]

{'eval_loss': 2.256046772003174, 'eval_runtime': 31.1937, 'eval_samples_per_second': 26.223, 'eval_steps_per_second': 26.223, 'epoch': 0.07}


  8%|▊         | 65/819 [03:26<32:21,  2.57s/it]  

[A[A                                          


                                                 
[A

  8%|▊         | 65/819 [03:26<32:21,  2.57s/it]

{'loss': 2.7841, 'grad_norm': 24.003433227539062, 'learning_rate': 4.6314496314496314e-05, 'epoch': 0.08}


  9%|▊         | 70/819 [03:28<07:52,  1.59it/s]

[A[A                                          


                                                 
[A

  9%|▊         | 70/819 [03:28<07:52,  1.59it/s]

{'loss': 2.8363, 'grad_norm': 8.146971702575684, 'learning_rate': 4.6007371007371006e-05, 'epoch': 0.09}



[A                                              

[A[A                                          


                                                 
[A

  9%|▊         | 70/819 [03:59<07:52,  1.59it/s]

{'eval_loss': 2.2232377529144287, 'eval_runtime': 31.3431, 'eval_samples_per_second': 26.098, 'eval_steps_per_second': 26.098, 'epoch': 0.09}


  9%|▉         | 75/819 [04:01<32:36,  2.63s/it]  

[A[A                                          


                                                 
[A

  9%|▉         | 75/819 [04:01<32:36,  2.63s/it]

{'loss': 3.0912, 'grad_norm': 6.686858177185059, 'learning_rate': 4.5700245700245705e-05, 'epoch': 0.09}


 10%|▉         | 80/819 [04:02<08:59,  1.37it/s]

[A[A                                          


                                                 
[A

 10%|▉         | 80/819 [04:02<08:59,  1.37it/s]

{'loss': 2.4171, 'grad_norm': 8.955682754516602, 'learning_rate': 4.53931203931204e-05, 'epoch': 0.1}



[A                                              

[A[A                                          


                                                 
[A

 10%|▉         | 80/819 [04:34<08:59,  1.37it/s]

{'eval_loss': 2.1922214031219482, 'eval_runtime': 31.5945, 'eval_samples_per_second': 25.891, 'eval_steps_per_second': 25.891, 'epoch': 0.1}


 10%|█         | 85/819 [04:36<32:25,  2.65s/it]  

[A[A                                          


                                                 
[A

 10%|█         | 85/819 [04:36<32:25,  2.65s/it]

{'loss': 2.0388, 'grad_norm': 10.51115608215332, 'learning_rate': 4.508599508599509e-05, 'epoch': 0.1}


 11%|█         | 90/819 [04:37<08:03,  1.51it/s]

[A[A                                          


                                                 
[A

 11%|█         | 90/819 [04:37<08:03,  1.51it/s]

{'loss': 2.5287, 'grad_norm': 9.742125511169434, 'learning_rate': 4.477886977886978e-05, 'epoch': 0.11}



[A                                              

[A[A                                          


                                                 
[A

 11%|█         | 90/819 [05:08<08:03,  1.51it/s]

{'eval_loss': 2.165886163711548, 'eval_runtime': 31.1777, 'eval_samples_per_second': 26.237, 'eval_steps_per_second': 26.237, 'epoch': 0.11}


 12%|█▏        | 95/819 [05:10<31:34,  2.62s/it]  

[A[A                                          


                                                 
[A

 12%|█▏        | 95/819 [05:10<31:34,  2.62s/it]

{'loss': 2.1736, 'grad_norm': 7.523964881896973, 'learning_rate': 4.447174447174447e-05, 'epoch': 0.12}


 12%|█▏        | 100/819 [05:11<07:50,  1.53it/s]

[A[A                                          


                                                 
[A

 12%|█▏        | 100/819 [05:11<07:50,  1.53it/s]

{'loss': 2.1904, 'grad_norm': 12.105571746826172, 'learning_rate': 4.4164619164619165e-05, 'epoch': 0.12}



[A                                              

[A[A                                          


                                                 
[A

 12%|█▏        | 100/819 [05:42<07:50,  1.53it/s]

{'eval_loss': 2.144035577774048, 'eval_runtime': 30.9587, 'eval_samples_per_second': 26.422, 'eval_steps_per_second': 26.422, 'epoch': 0.12}


 13%|█▎        | 105/819 [05:44<30:36,  2.57s/it]  

[A[A                                          


                                                 
[A

 13%|█▎        | 105/819 [05:44<30:36,  2.57s/it]

{'loss': 3.0205, 'grad_norm': 13.574424743652344, 'learning_rate': 4.385749385749386e-05, 'epoch': 0.13}


 13%|█▎        | 110/819 [05:45<08:10,  1.45it/s]

[A[A                                          


                                                 
[A

 13%|█▎        | 110/819 [05:45<08:10,  1.45it/s]

{'loss': 2.6662, 'grad_norm': 22.642019271850586, 'learning_rate': 4.3550368550368556e-05, 'epoch': 0.13}



[A                                              

[A[A                                          


                                                 
[A

 13%|█▎        | 110/819 [06:16<08:10,  1.45it/s]

{'eval_loss': 2.128634452819824, 'eval_runtime': 30.9688, 'eval_samples_per_second': 26.414, 'eval_steps_per_second': 26.414, 'epoch': 0.13}


 14%|█▍        | 115/819 [06:18<31:25,  2.68s/it]  

[A[A                                          


                                                 
[A

 14%|█▍        | 115/819 [06:18<31:25,  2.68s/it]

{'loss': 2.2818, 'grad_norm': 4.512773036956787, 'learning_rate': 4.324324324324325e-05, 'epoch': 0.14}


 15%|█▍        | 120/819 [06:19<08:11,  1.42it/s]

[A[A                                          


                                                 
[A

 15%|█▍        | 120/819 [06:19<08:11,  1.42it/s]

{'loss': 2.4075, 'grad_norm': 7.490412712097168, 'learning_rate': 4.2936117936117934e-05, 'epoch': 0.15}



[A                                              

[A[A                                          


                                                 
[A

 15%|█▍        | 120/819 [06:50<08:11,  1.42it/s]

{'eval_loss': 2.114285469055176, 'eval_runtime': 30.8651, 'eval_samples_per_second': 26.502, 'eval_steps_per_second': 26.502, 'epoch': 0.15}


 15%|█▌        | 125/819 [06:52<29:56,  2.59s/it]  

[A[A                                          


                                                 
[A

 15%|█▌        | 125/819 [06:52<29:56,  2.59s/it]

{'loss': 2.819, 'grad_norm': 10.266819953918457, 'learning_rate': 4.262899262899263e-05, 'epoch': 0.15}


 16%|█▌        | 130/819 [06:53<07:57,  1.44it/s]

[A[A                                          


                                                 
[A

 16%|█▌        | 130/819 [06:53<07:57,  1.44it/s]

{'loss': 2.6191, 'grad_norm': 7.143156051635742, 'learning_rate': 4.2321867321867324e-05, 'epoch': 0.16}



[A                                              

[A[A                                          


                                                 
[A

 16%|█▌        | 130/819 [07:24<07:57,  1.44it/s]

{'eval_loss': 2.1022462844848633, 'eval_runtime': 30.9733, 'eval_samples_per_second': 26.41, 'eval_steps_per_second': 26.41, 'epoch': 0.16}


 16%|█▋        | 135/819 [07:26<29:28,  2.59s/it]  

[A[A                                          


                                                 
[A

 16%|█▋        | 135/819 [07:26<29:28,  2.59s/it]

{'loss': 2.0213, 'grad_norm': 8.848861694335938, 'learning_rate': 4.2014742014742017e-05, 'epoch': 0.16}


 17%|█▋        | 140/819 [07:28<08:13,  1.37it/s]

[A[A                                          


                                                 
[A

 17%|█▋        | 140/819 [07:28<08:13,  1.37it/s]

{'loss': 2.3319, 'grad_norm': 18.582489013671875, 'learning_rate': 4.170761670761671e-05, 'epoch': 0.17}



[A                                              

[A[A                                          


                                                 
[A

 17%|█▋        | 140/819 [07:58<08:13,  1.37it/s]

{'eval_loss': 2.0901639461517334, 'eval_runtime': 30.9553, 'eval_samples_per_second': 26.425, 'eval_steps_per_second': 26.425, 'epoch': 0.17}


 18%|█▊        | 145/819 [08:00<29:23,  2.62s/it]  

[A[A                                          


                                                 
[A

 18%|█▊        | 145/819 [08:00<29:23,  2.62s/it]

{'loss': 2.4963, 'grad_norm': 6.829893589019775, 'learning_rate': 4.14004914004914e-05, 'epoch': 0.18}


 18%|█▊        | 150/819 [08:02<08:02,  1.39it/s]

[A[A                                          


                                                 
[A

 18%|█▊        | 150/819 [08:02<08:02,  1.39it/s]

{'loss': 2.1244, 'grad_norm': 5.752482891082764, 'learning_rate': 4.10933660933661e-05, 'epoch': 0.18}



[A                                              

[A[A                                          


                                                 
[A

 18%|█▊        | 150/819 [08:33<08:02,  1.39it/s]

{'eval_loss': 2.076676368713379, 'eval_runtime': 31.2587, 'eval_samples_per_second': 26.169, 'eval_steps_per_second': 26.169, 'epoch': 0.18}


 19%|█▉        | 155/819 [08:34<28:56,  2.61s/it]  

[A[A                                          


                                                 
[A

 19%|█▉        | 155/819 [08:34<28:56,  2.61s/it]

{'loss': 2.6111, 'grad_norm': 24.79416275024414, 'learning_rate': 4.0786240786240785e-05, 'epoch': 0.19}


 20%|█▉        | 160/819 [08:36<07:29,  1.47it/s]

[A[A                                          


                                                 
[A

 20%|█▉        | 160/819 [08:36<07:29,  1.47it/s]

{'loss': 2.9491, 'grad_norm': 8.609511375427246, 'learning_rate': 4.0479115479115484e-05, 'epoch': 0.2}



[A                                              

[A[A                                          


                                                 
[A

 20%|█▉        | 160/819 [09:07<07:29,  1.47it/s]

{'eval_loss': 2.070887565612793, 'eval_runtime': 31.0417, 'eval_samples_per_second': 26.352, 'eval_steps_per_second': 26.352, 'epoch': 0.2}


 20%|██        | 165/819 [09:09<28:27,  2.61s/it]  

[A[A                                          


                                                 
[A

 20%|██        | 165/819 [09:09<28:27,  2.61s/it]

{'loss': 2.6391, 'grad_norm': 7.9498610496521, 'learning_rate': 4.0171990171990176e-05, 'epoch': 0.2}


 21%|██        | 170/819 [09:10<06:57,  1.55it/s]

[A[A                                          


                                                 
[A

 21%|██        | 170/819 [09:10<06:57,  1.55it/s]

{'loss': 2.3743, 'grad_norm': 31.92779541015625, 'learning_rate': 3.986486486486487e-05, 'epoch': 0.21}



[A                                              

[A[A                                          


                                                 
[A

 21%|██        | 170/819 [09:41<06:57,  1.55it/s]

{'eval_loss': 2.060485363006592, 'eval_runtime': 30.9431, 'eval_samples_per_second': 26.436, 'eval_steps_per_second': 26.436, 'epoch': 0.21}


 21%|██▏       | 175/819 [09:43<27:40,  2.58s/it]  

[A[A                                          


                                                 
[A

 21%|██▏       | 175/819 [09:43<27:40,  2.58s/it]

{'loss': 1.8884, 'grad_norm': 6.065046310424805, 'learning_rate': 3.955773955773956e-05, 'epoch': 0.21}


 22%|██▏       | 180/819 [09:45<07:48,  1.36it/s]

[A[A                                          


                                                 
[A

 22%|██▏       | 180/819 [09:45<07:48,  1.36it/s]

{'loss': 2.526, 'grad_norm': 8.884674072265625, 'learning_rate': 3.925061425061425e-05, 'epoch': 0.22}



[A                                              

[A[A                                          


                                                 
[A

 22%|██▏       | 180/819 [10:15<07:48,  1.36it/s]

{'eval_loss': 2.0546488761901855, 'eval_runtime': 30.8614, 'eval_samples_per_second': 26.506, 'eval_steps_per_second': 26.506, 'epoch': 0.22}


 23%|██▎       | 185/819 [10:17<27:32,  2.61s/it]  

[A[A                                          


                                                 
[A

 23%|██▎       | 185/819 [10:17<27:32,  2.61s/it]

{'loss': 2.4033, 'grad_norm': 14.88188362121582, 'learning_rate': 3.8943488943488944e-05, 'epoch': 0.23}


 23%|██▎       | 190/819 [10:19<07:38,  1.37it/s]

[A[A                                          


                                                 
[A

 23%|██▎       | 190/819 [10:19<07:38,  1.37it/s]

{'loss': 2.2778, 'grad_norm': 8.858638763427734, 'learning_rate': 3.8636363636363636e-05, 'epoch': 0.23}



[A                                              

[A[A                                          


                                                 
[A

 23%|██▎       | 190/819 [10:50<07:38,  1.37it/s]

{'eval_loss': 2.053133964538574, 'eval_runtime': 31.0199, 'eval_samples_per_second': 26.37, 'eval_steps_per_second': 26.37, 'epoch': 0.23}


 24%|██▍       | 195/819 [10:52<27:11,  2.61s/it]  

[A[A                                          


                                                 
[A

 24%|██▍       | 195/819 [10:52<27:11,  2.61s/it]

{'loss': 2.2707, 'grad_norm': 12.922744750976562, 'learning_rate': 3.8329238329238335e-05, 'epoch': 0.24}


 24%|██▍       | 200/819 [10:53<06:45,  1.53it/s]

[A[A                                          


                                                 
[A

 24%|██▍       | 200/819 [10:53<06:45,  1.53it/s]

{'loss': 2.005, 'grad_norm': 7.071416854858398, 'learning_rate': 3.802211302211303e-05, 'epoch': 0.24}



[A                                              

[A[A                                          


                                                 
[A

 24%|██▍       | 200/819 [11:24<06:45,  1.53it/s]

{'eval_loss': 2.045300245285034, 'eval_runtime': 30.9347, 'eval_samples_per_second': 26.443, 'eval_steps_per_second': 26.443, 'epoch': 0.24}


 25%|██▌       | 205/819 [11:26<26:26,  2.58s/it]  

[A[A                                          


                                                 
[A

 25%|██▌       | 205/819 [11:26<26:26,  2.58s/it]

{'loss': 2.106, 'grad_norm': 10.709415435791016, 'learning_rate': 3.771498771498771e-05, 'epoch': 0.25}


 26%|██▌       | 210/819 [11:27<06:46,  1.50it/s]

[A[A                                          


                                                 
[A

 26%|██▌       | 210/819 [11:27<06:46,  1.50it/s]

{'loss': 2.1061, 'grad_norm': 19.02802085876465, 'learning_rate': 3.740786240786241e-05, 'epoch': 0.26}



[A                                              

[A[A                                          


                                                 
[A

 26%|██▌       | 210/819 [11:58<06:46,  1.50it/s]

{'eval_loss': 2.0392725467681885, 'eval_runtime': 30.9043, 'eval_samples_per_second': 26.469, 'eval_steps_per_second': 26.469, 'epoch': 0.26}


 26%|██▋       | 215/819 [11:59<25:44,  2.56s/it]  

[A[A                                          


                                                 
[A

 26%|██▋       | 215/819 [11:59<25:44,  2.56s/it]

{'loss': 2.2383, 'grad_norm': 6.963653087615967, 'learning_rate': 3.71007371007371e-05, 'epoch': 0.26}


 27%|██▋       | 220/819 [12:01<06:14,  1.60it/s]

[A[A                                          


                                                 
[A

 27%|██▋       | 220/819 [12:01<06:14,  1.60it/s]

{'loss': 2.2347, 'grad_norm': 16.556501388549805, 'learning_rate': 3.6793611793611795e-05, 'epoch': 0.27}



[A                                              

[A[A                                          


                                                 
[A

 27%|██▋       | 220/819 [12:32<06:14,  1.60it/s]

{'eval_loss': 2.0394093990325928, 'eval_runtime': 30.9728, 'eval_samples_per_second': 26.41, 'eval_steps_per_second': 26.41, 'epoch': 0.27}


 27%|██▋       | 225/819 [12:33<25:45,  2.60s/it]  

[A[A                                          


                                                 
[A

 27%|██▋       | 225/819 [12:33<25:45,  2.60s/it]

{'loss': 2.1157, 'grad_norm': 10.504711151123047, 'learning_rate': 3.648648648648649e-05, 'epoch': 0.27}


 28%|██▊       | 230/819 [12:35<06:27,  1.52it/s]

[A[A                                          


                                                 
[A

 28%|██▊       | 230/819 [12:35<06:27,  1.52it/s]

{'loss': 2.8282, 'grad_norm': 11.279829978942871, 'learning_rate': 3.617936117936118e-05, 'epoch': 0.28}



[A                                              

[A[A                                          


                                                 
[A

 28%|██▊       | 230/819 [13:06<06:27,  1.52it/s]

{'eval_loss': 2.035468339920044, 'eval_runtime': 31.1064, 'eval_samples_per_second': 26.297, 'eval_steps_per_second': 26.297, 'epoch': 0.28}


 29%|██▊       | 235/819 [13:07<25:27,  2.62s/it]  

[A[A                                          


                                                 
[A

 29%|██▊       | 235/819 [13:07<25:27,  2.62s/it]

{'loss': 2.2784, 'grad_norm': 6.967667579650879, 'learning_rate': 3.587223587223588e-05, 'epoch': 0.29}


 29%|██▉       | 240/819 [13:09<06:28,  1.49it/s]

[A[A                                          


                                                 
[A

 29%|██▉       | 240/819 [13:09<06:28,  1.49it/s]

{'loss': 2.4664, 'grad_norm': 9.964985847473145, 'learning_rate': 3.5565110565110564e-05, 'epoch': 0.29}



[A                                              

[A[A                                          


                                                 
[A

 29%|██▉       | 240/819 [13:40<06:28,  1.49it/s]

{'eval_loss': 2.0259926319122314, 'eval_runtime': 31.6911, 'eval_samples_per_second': 25.812, 'eval_steps_per_second': 25.812, 'epoch': 0.29}


 30%|██▉       | 245/819 [13:42<25:12,  2.63s/it]  

[A[A                                          


                                                 
[A

 30%|██▉       | 245/819 [13:42<25:12,  2.63s/it]

{'loss': 2.7844, 'grad_norm': 11.47446060180664, 'learning_rate': 3.525798525798526e-05, 'epoch': 0.3}


 31%|███       | 250/819 [13:44<06:39,  1.42it/s]

[A[A                                          


                                                 
[A

 31%|███       | 250/819 [13:44<06:39,  1.42it/s]

{'loss': 2.4429, 'grad_norm': 18.097824096679688, 'learning_rate': 3.4950859950859954e-05, 'epoch': 0.31}



[A                                              

[A[A                                          


                                                 
[A

 31%|███       | 250/819 [14:15<06:39,  1.42it/s]

{'eval_loss': 2.021775484085083, 'eval_runtime': 31.0332, 'eval_samples_per_second': 26.359, 'eval_steps_per_second': 26.359, 'epoch': 0.31}


 31%|███       | 255/819 [14:16<24:18,  2.59s/it]  

[A[A                                          


                                                 
[A

 31%|███       | 255/819 [14:16<24:18,  2.59s/it]

{'loss': 2.3337, 'grad_norm': 18.916950225830078, 'learning_rate': 3.4643734643734647e-05, 'epoch': 0.31}


 32%|███▏      | 260/819 [14:17<05:53,  1.58it/s]

[A[A                                          


                                                 
[A

 32%|███▏      | 260/819 [14:17<05:53,  1.58it/s]

{'loss': 2.1403, 'grad_norm': 15.15397834777832, 'learning_rate': 3.433660933660934e-05, 'epoch': 0.32}



[A                                              

[A[A                                          


                                                 
[A

 32%|███▏      | 260/819 [14:49<05:53,  1.58it/s]

{'eval_loss': 2.019946336746216, 'eval_runtime': 31.0197, 'eval_samples_per_second': 26.37, 'eval_steps_per_second': 26.37, 'epoch': 0.32}


 32%|███▏      | 265/819 [14:50<23:39,  2.56s/it]  

[A[A                                          


                                                 
[A

 32%|███▏      | 265/819 [14:50<23:39,  2.56s/it]

{'loss': 2.4897, 'grad_norm': 7.9902729988098145, 'learning_rate': 3.402948402948403e-05, 'epoch': 0.32}


 33%|███▎      | 270/819 [14:52<06:36,  1.39it/s]

[A[A                                          


                                                 
[A

 33%|███▎      | 270/819 [14:52<06:36,  1.39it/s]

{'loss': 2.7647, 'grad_norm': 6.979297161102295, 'learning_rate': 3.372235872235873e-05, 'epoch': 0.33}



[A                                              

[A[A                                          


                                                 
[A

 33%|███▎      | 270/819 [15:23<06:36,  1.39it/s]

{'eval_loss': 2.018894672393799, 'eval_runtime': 31.0241, 'eval_samples_per_second': 26.367, 'eval_steps_per_second': 26.367, 'epoch': 0.33}


 34%|███▎      | 275/819 [15:24<23:41,  2.61s/it]  

[A[A                                          


                                                 
[A

 34%|███▎      | 275/819 [15:24<23:41,  2.61s/it]

{'loss': 2.7717, 'grad_norm': 10.892004013061523, 'learning_rate': 3.3415233415233415e-05, 'epoch': 0.34}


 34%|███▍      | 280/819 [15:26<05:45,  1.56it/s]

[A[A                                          


                                                 
[A

 34%|███▍      | 280/819 [15:26<05:45,  1.56it/s]

{'loss': 2.0545, 'grad_norm': 7.408411026000977, 'learning_rate': 3.310810810810811e-05, 'epoch': 0.34}



[A                                              

[A[A                                          


                                                 
[A

 34%|███▍      | 280/819 [15:57<05:45,  1.56it/s]

{'eval_loss': 2.014774799346924, 'eval_runtime': 31.0342, 'eval_samples_per_second': 26.358, 'eval_steps_per_second': 26.358, 'epoch': 0.34}


 35%|███▍      | 285/819 [15:58<23:12,  2.61s/it]  

[A[A                                          


                                                 
[A

 35%|███▍      | 285/819 [15:58<23:12,  2.61s/it]

{'loss': 1.9387, 'grad_norm': 7.316982269287109, 'learning_rate': 3.2800982800982806e-05, 'epoch': 0.35}


 35%|███▌      | 290/819 [16:00<05:57,  1.48it/s]

[A[A                                          


                                                 
[A

 35%|███▌      | 290/819 [16:00<05:57,  1.48it/s]

{'loss': 2.0829, 'grad_norm': 7.699914455413818, 'learning_rate': 3.249385749385749e-05, 'epoch': 0.35}



[A                                              

[A[A                                          


                                                 
[A

 35%|███▌      | 290/819 [16:31<05:57,  1.48it/s]

{'eval_loss': 2.009809970855713, 'eval_runtime': 30.9494, 'eval_samples_per_second': 26.43, 'eval_steps_per_second': 26.43, 'epoch': 0.35}


 36%|███▌      | 295/819 [16:33<23:01,  2.64s/it]  

[A[A                                          


                                                 
[A

 36%|███▌      | 295/819 [16:33<23:01,  2.64s/it]

{'loss': 2.4571, 'grad_norm': 7.2054219245910645, 'learning_rate': 3.218673218673219e-05, 'epoch': 0.36}


 37%|███▋      | 300/819 [16:34<06:36,  1.31it/s]

[A[A                                          


                                                 
[A

 37%|███▋      | 300/819 [16:34<06:36,  1.31it/s]

{'loss': 2.7823, 'grad_norm': 16.942790985107422, 'learning_rate': 3.187960687960688e-05, 'epoch': 0.37}



[A                                              

[A[A                                          


                                                 
[A

 37%|███▋      | 300/819 [17:05<06:36,  1.31it/s]

{'eval_loss': 2.006753921508789, 'eval_runtime': 30.9552, 'eval_samples_per_second': 26.425, 'eval_steps_per_second': 26.425, 'epoch': 0.37}


 37%|███▋      | 305/819 [17:07<22:05,  2.58s/it]  

[A[A                                          


                                                 
[A

 37%|███▋      | 305/819 [17:07<22:05,  2.58s/it]

{'loss': 2.3838, 'grad_norm': 11.531720161437988, 'learning_rate': 3.1572481572481574e-05, 'epoch': 0.37}


 38%|███▊      | 310/819 [17:08<05:24,  1.57it/s]

[A[A                                          


                                                 
[A

 38%|███▊      | 310/819 [17:08<05:24,  1.57it/s]

{'loss': 2.5337, 'grad_norm': 10.845064163208008, 'learning_rate': 3.1265356265356266e-05, 'epoch': 0.38}



[A                                              

[A[A                                          


                                                 
[A

 38%|███▊      | 310/819 [17:39<05:24,  1.57it/s]

{'eval_loss': 2.0034689903259277, 'eval_runtime': 30.9356, 'eval_samples_per_second': 26.442, 'eval_steps_per_second': 26.442, 'epoch': 0.38}


 38%|███▊      | 315/819 [17:41<21:38,  2.58s/it]  

[A[A                                          


                                                 
[A

 38%|███▊      | 315/819 [17:41<21:38,  2.58s/it]

{'loss': 2.4196, 'grad_norm': 8.326079368591309, 'learning_rate': 3.095823095823096e-05, 'epoch': 0.38}


 39%|███▉      | 320/819 [17:42<05:15,  1.58it/s]

[A[A                                          


                                                 
[A

 39%|███▉      | 320/819 [17:42<05:15,  1.58it/s]

{'loss': 2.6667, 'grad_norm': 9.053651809692383, 'learning_rate': 3.065110565110566e-05, 'epoch': 0.39}



[A                                              

[A[A                                          


                                                 
[A

 39%|███▉      | 320/819 [18:13<05:15,  1.58it/s]

{'eval_loss': 2.000458240509033, 'eval_runtime': 30.9624, 'eval_samples_per_second': 26.419, 'eval_steps_per_second': 26.419, 'epoch': 0.39}


 40%|███▉      | 325/819 [18:15<21:25,  2.60s/it]  

[A[A                                          


                                                 
[A

 40%|███▉      | 325/819 [18:15<21:25,  2.60s/it]

{'loss': 1.8609, 'grad_norm': 6.23404598236084, 'learning_rate': 3.0343980343980342e-05, 'epoch': 0.4}


 40%|████      | 330/819 [18:16<05:53,  1.38it/s]

[A[A                                          


                                                 
[A

 40%|████      | 330/819 [18:16<05:53,  1.38it/s]

{'loss': 2.6589, 'grad_norm': 8.996383666992188, 'learning_rate': 3.0036855036855038e-05, 'epoch': 0.4}



[A                                              

[A[A                                          


                                                 
[A

 40%|████      | 330/819 [18:48<05:53,  1.38it/s]

{'eval_loss': 1.9981937408447266, 'eval_runtime': 31.5391, 'eval_samples_per_second': 25.936, 'eval_steps_per_second': 25.936, 'epoch': 0.4}


 41%|████      | 335/819 [18:50<21:23,  2.65s/it]  

[A[A                                          


                                                 
[A

 41%|████      | 335/819 [18:50<21:23,  2.65s/it]

{'loss': 2.3116, 'grad_norm': 9.394037246704102, 'learning_rate': 2.9729729729729733e-05, 'epoch': 0.41}


 42%|████▏     | 340/819 [18:51<06:09,  1.30it/s]

[A[A                                          


                                                 
[A

 42%|████▏     | 340/819 [18:51<06:09,  1.30it/s]

{'loss': 2.7154, 'grad_norm': 11.562793731689453, 'learning_rate': 2.9422604422604422e-05, 'epoch': 0.42}



[A                                              

[A[A                                          


                                                 
[A

 42%|████▏     | 340/819 [19:23<06:09,  1.30it/s]

{'eval_loss': 1.9969381093978882, 'eval_runtime': 31.6105, 'eval_samples_per_second': 25.877, 'eval_steps_per_second': 25.877, 'epoch': 0.42}


 42%|████▏     | 345/819 [19:24<20:52,  2.64s/it]  

[A[A                                          


                                                 
[A

 42%|████▏     | 345/819 [19:24<20:52,  2.64s/it]

{'loss': 2.0493, 'grad_norm': 10.660293579101562, 'learning_rate': 2.9115479115479117e-05, 'epoch': 0.42}


 43%|████▎     | 350/819 [19:26<05:26,  1.44it/s]

[A[A                                          


                                                 
[A

 43%|████▎     | 350/819 [19:26<05:26,  1.44it/s]

{'loss': 2.117, 'grad_norm': 8.984894752502441, 'learning_rate': 2.880835380835381e-05, 'epoch': 0.43}



[A                                              

[A[A                                          


                                                 
[A

 43%|████▎     | 350/819 [19:57<05:26,  1.44it/s]

{'eval_loss': 1.9955344200134277, 'eval_runtime': 30.9909, 'eval_samples_per_second': 26.395, 'eval_steps_per_second': 26.395, 'epoch': 0.43}


 43%|████▎     | 355/819 [19:58<19:55,  2.58s/it]  

[A[A                                          


                                                 
[A

 43%|████▎     | 355/819 [19:58<19:55,  2.58s/it]

{'loss': 2.2069, 'grad_norm': 7.790976524353027, 'learning_rate': 2.8501228501228505e-05, 'epoch': 0.43}


 44%|████▍     | 360/819 [20:00<04:52,  1.57it/s]

[A[A                                          


                                                 
[A

 44%|████▍     | 360/819 [20:00<04:52,  1.57it/s]

{'loss': 2.3053, 'grad_norm': 19.605728149414062, 'learning_rate': 2.8194103194103194e-05, 'epoch': 0.44}



[A                                              

[A[A                                          


                                                 
[A

 44%|████▍     | 360/819 [20:30<04:52,  1.57it/s]

{'eval_loss': 1.993568778038025, 'eval_runtime': 30.6921, 'eval_samples_per_second': 26.652, 'eval_steps_per_second': 26.652, 'epoch': 0.44}


 45%|████▍     | 365/819 [20:32<19:10,  2.53s/it]  

[A[A                                          


                                                 
[A

 45%|████▍     | 365/819 [20:32<19:10,  2.53s/it]

{'loss': 2.5382, 'grad_norm': 11.116211891174316, 'learning_rate': 2.788697788697789e-05, 'epoch': 0.45}


 45%|████▌     | 370/819 [20:33<04:58,  1.50it/s]

[A[A                                          


                                                 
[A

 45%|████▌     | 370/819 [20:33<04:58,  1.50it/s]

{'loss': 2.5693, 'grad_norm': 7.207502365112305, 'learning_rate': 2.7579852579852584e-05, 'epoch': 0.45}



[A                                              

[A[A                                          


                                                 
[A

 45%|████▌     | 370/819 [21:04<04:58,  1.50it/s]

{'eval_loss': 1.9905420541763306, 'eval_runtime': 30.7332, 'eval_samples_per_second': 26.616, 'eval_steps_per_second': 26.616, 'epoch': 0.45}


 46%|████▌     | 375/819 [21:06<19:04,  2.58s/it]  

[A[A                                          


                                                 
[A

 46%|████▌     | 375/819 [21:06<19:04,  2.58s/it]

{'loss': 2.1394, 'grad_norm': 8.51257610321045, 'learning_rate': 2.7272727272727273e-05, 'epoch': 0.46}


 46%|████▋     | 380/819 [21:08<05:33,  1.32it/s]

[A[A                                          


                                                 
[A

 46%|████▋     | 380/819 [21:08<05:33,  1.32it/s]

{'loss': 2.6487, 'grad_norm': 6.387150764465332, 'learning_rate': 2.6965601965601965e-05, 'epoch': 0.46}



[A                                              

[A[A                                          


                                                 
[A

 46%|████▋     | 380/819 [21:39<05:33,  1.32it/s]

{'eval_loss': 1.9880770444869995, 'eval_runtime': 30.8891, 'eval_samples_per_second': 26.482, 'eval_steps_per_second': 26.482, 'epoch': 0.46}


 47%|████▋     | 385/819 [21:40<18:35,  2.57s/it]  

[A[A                                          


                                                 
[A

 47%|████▋     | 385/819 [21:40<18:35,  2.57s/it]

{'loss': 1.7101, 'grad_norm': 7.174516201019287, 'learning_rate': 2.665847665847666e-05, 'epoch': 0.47}


 48%|████▊     | 390/819 [21:41<04:56,  1.45it/s]

[A[A                                          


                                                 
[A

 48%|████▊     | 390/819 [21:41<04:56,  1.45it/s]

{'loss': 2.4771, 'grad_norm': 5.188896656036377, 'learning_rate': 2.635135135135135e-05, 'epoch': 0.48}



[A                                              

[A[A                                          


                                                 
[A

 48%|████▊     | 390/819 [22:12<04:56,  1.45it/s]

{'eval_loss': 1.985023856163025, 'eval_runtime': 30.9757, 'eval_samples_per_second': 26.408, 'eval_steps_per_second': 26.408, 'epoch': 0.48}


 48%|████▊     | 395/819 [22:14<18:16,  2.59s/it]  

[A[A                                          


                                                 
[A

 48%|████▊     | 395/819 [22:14<18:16,  2.59s/it]

{'loss': 1.9848, 'grad_norm': 7.498081684112549, 'learning_rate': 2.6044226044226045e-05, 'epoch': 0.48}


 49%|████▉     | 400/819 [22:15<04:33,  1.53it/s]

[A[A                                          


                                                 
[A

 49%|████▉     | 400/819 [22:15<04:33,  1.53it/s]

{'loss': 2.2338, 'grad_norm': 8.284555435180664, 'learning_rate': 2.573710073710074e-05, 'epoch': 0.49}



[A                                              

[A[A                                          


                                                 
[A

 49%|████▉     | 400/819 [22:47<04:33,  1.53it/s]

{'eval_loss': 1.9822962284088135, 'eval_runtime': 31.7062, 'eval_samples_per_second': 25.799, 'eval_steps_per_second': 25.799, 'epoch': 0.49}


 49%|████▉     | 405/819 [22:49<18:35,  2.69s/it]  

[A[A                                          


                                                 
[A

 49%|████▉     | 405/819 [22:49<18:35,  2.69s/it]

{'loss': 2.5291, 'grad_norm': 7.399516582489014, 'learning_rate': 2.5429975429975432e-05, 'epoch': 0.49}


 50%|█████     | 410/819 [22:50<04:29,  1.52it/s]

[A[A                                          


                                                 
[A

 50%|█████     | 410/819 [22:50<04:29,  1.52it/s]

{'loss': 2.2309, 'grad_norm': 9.677757263183594, 'learning_rate': 2.512285012285012e-05, 'epoch': 0.5}



[A                                              

[A[A                                          


                                                 
[A

 50%|█████     | 410/819 [23:21<04:29,  1.52it/s]

{'eval_loss': 1.9808675050735474, 'eval_runtime': 30.8933, 'eval_samples_per_second': 26.478, 'eval_steps_per_second': 26.478, 'epoch': 0.5}


 51%|█████     | 415/819 [23:22<17:08,  2.55s/it]  

[A[A                                          


                                                 
[A

 51%|█████     | 415/819 [23:22<17:08,  2.55s/it]

{'loss': 1.8086, 'grad_norm': 9.058269500732422, 'learning_rate': 2.4815724815724816e-05, 'epoch': 0.51}


 51%|█████▏    | 420/819 [23:24<04:11,  1.59it/s]

[A[A                                          


                                                 
[A

 51%|█████▏    | 420/819 [23:24<04:11,  1.59it/s]

{'loss': 1.9584, 'grad_norm': 7.704722881317139, 'learning_rate': 2.450859950859951e-05, 'epoch': 0.51}



[A                                              

[A[A                                          


                                                 
[A

 51%|█████▏    | 420/819 [23:55<04:11,  1.59it/s]

{'eval_loss': 1.9796634912490845, 'eval_runtime': 31.2509, 'eval_samples_per_second': 26.175, 'eval_steps_per_second': 26.175, 'epoch': 0.51}


 52%|█████▏    | 425/819 [23:56<17:00,  2.59s/it]  

[A[A                                          


                                                 
[A

 52%|█████▏    | 425/819 [23:56<17:00,  2.59s/it]

{'loss': 2.5522, 'grad_norm': 7.88394832611084, 'learning_rate': 2.4201474201474204e-05, 'epoch': 0.52}


 53%|█████▎    | 430/819 [23:58<04:42,  1.38it/s]

[A[A                                          


                                                 
[A

 53%|█████▎    | 430/819 [23:58<04:42,  1.38it/s]

{'loss': 1.9836, 'grad_norm': 6.0188751220703125, 'learning_rate': 2.3894348894348896e-05, 'epoch': 0.53}



[A                                              

[A[A                                          


                                                 
[A

 53%|█████▎    | 430/819 [24:29<04:42,  1.38it/s]

{'eval_loss': 1.9786337614059448, 'eval_runtime': 31.216, 'eval_samples_per_second': 26.205, 'eval_steps_per_second': 26.205, 'epoch': 0.53}


 53%|█████▎    | 435/819 [24:31<16:57,  2.65s/it]  

[A[A                                          


                                                 
[A

 53%|█████▎    | 435/819 [24:31<16:57,  2.65s/it]

{'loss': 2.4873, 'grad_norm': 5.90641450881958, 'learning_rate': 2.3587223587223588e-05, 'epoch': 0.53}


 54%|█████▎    | 440/819 [24:33<04:20,  1.45it/s]

[A[A                                          


                                                 
[A

 54%|█████▎    | 440/819 [24:33<04:20,  1.45it/s]

{'loss': 2.2805, 'grad_norm': 10.7733793258667, 'learning_rate': 2.328009828009828e-05, 'epoch': 0.54}



[A                                              

[A[A                                          


                                                 
[A

 54%|█████▎    | 440/819 [25:08<04:20,  1.45it/s]

{'eval_loss': 1.9771970510482788, 'eval_runtime': 35.5527, 'eval_samples_per_second': 23.008, 'eval_steps_per_second': 23.008, 'epoch': 0.54}


 54%|█████▍    | 445/819 [25:10<18:05,  2.90s/it]  

[A[A                                          


                                                 
[A

 54%|█████▍    | 445/819 [25:10<18:05,  2.90s/it]

{'loss': 1.7247, 'grad_norm': 9.40220832824707, 'learning_rate': 2.2972972972972976e-05, 'epoch': 0.54}


 55%|█████▍    | 450/819 [25:11<04:16,  1.44it/s]

[A[A                                          


                                                 
[A

 55%|█████▍    | 450/819 [25:11<04:16,  1.44it/s]

{'loss': 2.3616, 'grad_norm': 10.62214183807373, 'learning_rate': 2.2665847665847668e-05, 'epoch': 0.55}



[A                                              

[A[A                                          


                                                 
[A

 55%|█████▍    | 450/819 [25:48<04:16,  1.44it/s]

{'eval_loss': 1.9773898124694824, 'eval_runtime': 36.5348, 'eval_samples_per_second': 22.39, 'eval_steps_per_second': 22.39, 'epoch': 0.55}


 56%|█████▌    | 455/819 [25:49<18:04,  2.98s/it]  

[A[A                                          


                                                 
[A

 56%|█████▌    | 455/819 [25:49<18:04,  2.98s/it]

{'loss': 2.3758, 'grad_norm': 13.188456535339355, 'learning_rate': 2.235872235872236e-05, 'epoch': 0.56}


 56%|█████▌    | 460/819 [25:51<04:20,  1.38it/s]

[A[A                                          


                                                 
[A

 56%|█████▌    | 460/819 [25:51<04:20,  1.38it/s]

{'loss': 2.2218, 'grad_norm': 10.165526390075684, 'learning_rate': 2.2051597051597052e-05, 'epoch': 0.56}



[A                                              

[A[A                                          


                                                 
[A

 56%|█████▌    | 460/819 [26:27<04:20,  1.38it/s]

{'eval_loss': 1.9759711027145386, 'eval_runtime': 36.2346, 'eval_samples_per_second': 22.575, 'eval_steps_per_second': 22.575, 'epoch': 0.56}


 57%|█████▋    | 465/819 [26:29<17:37,  2.99s/it]  

[A[A                                          


                                                 
[A

 57%|█████▋    | 465/819 [26:29<17:37,  2.99s/it]

{'loss': 2.1714, 'grad_norm': 11.757250785827637, 'learning_rate': 2.1744471744471744e-05, 'epoch': 0.57}


 57%|█████▋    | 470/819 [26:30<04:17,  1.35it/s]

[A[A                                          


                                                 
[A

 57%|█████▋    | 470/819 [26:30<04:17,  1.35it/s]

{'loss': 2.0511, 'grad_norm': 7.788796901702881, 'learning_rate': 2.143734643734644e-05, 'epoch': 0.57}



[A                                              

[A[A                                          


                                                 
[A

 57%|█████▋    | 470/819 [27:07<04:17,  1.35it/s]

{'eval_loss': 1.976244330406189, 'eval_runtime': 36.7255, 'eval_samples_per_second': 22.273, 'eval_steps_per_second': 22.273, 'epoch': 0.57}


 58%|█████▊    | 475/819 [27:09<17:33,  3.06s/it]  

[A[A                                          


                                                 
[A

 58%|█████▊    | 475/819 [27:09<17:33,  3.06s/it]

{'loss': 2.142, 'grad_norm': 5.8161091804504395, 'learning_rate': 2.113022113022113e-05, 'epoch': 0.58}


 59%|█████▊    | 480/819 [27:10<04:26,  1.27it/s]

[A[A                                          


                                                 
[A

 59%|█████▊    | 480/819 [27:10<04:26,  1.27it/s]

{'loss': 2.4194, 'grad_norm': 12.464340209960938, 'learning_rate': 2.0823095823095824e-05, 'epoch': 0.59}



[A                                              

[A[A                                          


                                                 
[A

 59%|█████▊    | 480/819 [27:46<04:26,  1.27it/s]

{'eval_loss': 1.9735041856765747, 'eval_runtime': 35.233, 'eval_samples_per_second': 23.217, 'eval_steps_per_second': 23.217, 'epoch': 0.59}


 59%|█████▉    | 485/819 [27:47<16:11,  2.91s/it]  

[A[A                                          


                                                 
[A

 59%|█████▉    | 485/819 [27:47<16:11,  2.91s/it]

{'loss': 1.5846, 'grad_norm': 6.573406219482422, 'learning_rate': 2.051597051597052e-05, 'epoch': 0.59}


 60%|█████▉    | 490/819 [27:49<04:00,  1.37it/s]

[A[A                                          


                                                 
[A

 60%|█████▉    | 490/819 [27:49<04:00,  1.37it/s]

{'loss': 2.3952, 'grad_norm': 7.918684959411621, 'learning_rate': 2.0208845208845208e-05, 'epoch': 0.6}



[A                                              

[A[A                                          


                                                 
[A

 60%|█████▉    | 490/819 [28:24<04:00,  1.37it/s]

{'eval_loss': 1.9716018438339233, 'eval_runtime': 35.3321, 'eval_samples_per_second': 23.152, 'eval_steps_per_second': 23.152, 'epoch': 0.6}


 60%|██████    | 495/819 [28:26<15:39,  2.90s/it]  

[A[A                                          


                                                 
[A

 60%|██████    | 495/819 [28:26<15:39,  2.90s/it]

{'loss': 1.8595, 'grad_norm': 11.311070442199707, 'learning_rate': 1.9901719901719903e-05, 'epoch': 0.6}


 61%|██████    | 500/819 [28:27<03:51,  1.38it/s]

[A[A                                          


                                                 
[A

 61%|██████    | 500/819 [28:27<03:51,  1.38it/s]

{'loss': 2.5807, 'grad_norm': 8.78870964050293, 'learning_rate': 1.9594594594594595e-05, 'epoch': 0.61}



[A                                              

[A[A                                          


                                                 
[A

 61%|██████    | 500/819 [29:03<03:51,  1.38it/s]

{'eval_loss': 1.9697597026824951, 'eval_runtime': 35.6692, 'eval_samples_per_second': 22.933, 'eval_steps_per_second': 22.933, 'epoch': 0.61}


 62%|██████▏   | 505/819 [29:04<15:21,  2.93s/it]  

[A[A                                          


                                                 
[A

 62%|██████▏   | 505/819 [29:04<15:21,  2.93s/it]

{'loss': 2.4849, 'grad_norm': 12.029952049255371, 'learning_rate': 1.928746928746929e-05, 'epoch': 0.62}


 62%|██████▏   | 510/819 [29:06<03:51,  1.33it/s]

[A[A                                          


                                                 
[A

 62%|██████▏   | 510/819 [29:06<03:51,  1.33it/s]

{'loss': 1.9412, 'grad_norm': 5.191678047180176, 'learning_rate': 1.8980343980343983e-05, 'epoch': 0.62}



[A                                              

[A[A                                          


                                                 
[A

 62%|██████▏   | 510/819 [29:41<03:51,  1.33it/s]

{'eval_loss': 1.9682918787002563, 'eval_runtime': 35.628, 'eval_samples_per_second': 22.959, 'eval_steps_per_second': 22.959, 'epoch': 0.62}


 63%|██████▎   | 515/819 [29:43<14:50,  2.93s/it]

[A[A                                          


                                                 
[A

 63%|██████▎   | 515/819 [29:43<14:50,  2.93s/it]

{'loss': 2.4851, 'grad_norm': 14.023566246032715, 'learning_rate': 1.8673218673218675e-05, 'epoch': 0.63}


 63%|██████▎   | 520/819 [29:44<03:31,  1.42it/s]

[A[A                                          


                                                 
[A

 63%|██████▎   | 520/819 [29:44<03:31,  1.42it/s]

{'loss': 2.2508, 'grad_norm': 12.671850204467773, 'learning_rate': 1.8366093366093367e-05, 'epoch': 0.63}



[A                                              

[A[A                                          


                                                 
[A

 63%|██████▎   | 520/819 [30:20<03:31,  1.42it/s]

{'eval_loss': 1.9660453796386719, 'eval_runtime': 35.1271, 'eval_samples_per_second': 23.287, 'eval_steps_per_second': 23.287, 'epoch': 0.63}


 64%|██████▍   | 525/819 [30:21<14:09,  2.89s/it]

[A[A                                          


                                                 
[A

 64%|██████▍   | 525/819 [30:21<14:09,  2.89s/it]

{'loss': 2.6385, 'grad_norm': 7.603978157043457, 'learning_rate': 1.805896805896806e-05, 'epoch': 0.64}


 65%|██████▍   | 530/819 [30:23<03:31,  1.36it/s]

[A[A                                          


                                                 
[A

 65%|██████▍   | 530/819 [30:23<03:31,  1.36it/s]

{'loss': 2.2365, 'grad_norm': 8.9651517868042, 'learning_rate': 1.7751842751842754e-05, 'epoch': 0.65}



[A                                              

[A[A                                          


                                                 
[A

 65%|██████▍   | 530/819 [30:58<03:31,  1.36it/s]

{'eval_loss': 1.9650315046310425, 'eval_runtime': 35.4965, 'eval_samples_per_second': 23.045, 'eval_steps_per_second': 23.045, 'epoch': 0.65}


 65%|██████▌   | 535/819 [31:00<13:47,  2.91s/it]

[A[A                                          


                                                 
[A

 65%|██████▌   | 535/819 [31:00<13:47,  2.91s/it]

{'loss': 2.2963, 'grad_norm': 7.621071815490723, 'learning_rate': 1.7444717444717446e-05, 'epoch': 0.65}


 66%|██████▌   | 540/819 [31:01<03:14,  1.43it/s]

[A[A                                          


                                                 
[A

 66%|██████▌   | 540/819 [31:01<03:14,  1.43it/s]

{'loss': 2.7895, 'grad_norm': 9.647500991821289, 'learning_rate': 1.713759213759214e-05, 'epoch': 0.66}



[A                                              

[A[A                                          


                                                 
[A

 66%|██████▌   | 540/819 [31:36<03:14,  1.43it/s]

{'eval_loss': 1.964350938796997, 'eval_runtime': 35.1204, 'eval_samples_per_second': 23.291, 'eval_steps_per_second': 23.291, 'epoch': 0.66}


 67%|██████▋   | 545/819 [31:37<13:02,  2.85s/it]

[A[A                                          


                                                 
[A

 67%|██████▋   | 545/819 [31:37<13:02,  2.85s/it]

{'loss': 1.8977, 'grad_norm': 11.054890632629395, 'learning_rate': 1.683046683046683e-05, 'epoch': 0.67}


 67%|██████▋   | 550/819 [31:39<03:07,  1.43it/s]

[A[A                                          


                                                 
[A

 67%|██████▋   | 550/819 [31:39<03:07,  1.43it/s]

{'loss': 2.2698, 'grad_norm': 6.9383931159973145, 'learning_rate': 1.6523341523341523e-05, 'epoch': 0.67}



[A                                              

[A[A                                          


                                                 
[A

 67%|██████▋   | 550/819 [32:14<03:07,  1.43it/s]

{'eval_loss': 1.9630240201950073, 'eval_runtime': 35.6311, 'eval_samples_per_second': 22.957, 'eval_steps_per_second': 22.957, 'epoch': 0.67}


 68%|██████▊   | 555/819 [32:16<12:47,  2.91s/it]

[A[A                                          


                                                 
[A

 68%|██████▊   | 555/819 [32:16<12:47,  2.91s/it]

{'loss': 2.2296, 'grad_norm': 7.828383922576904, 'learning_rate': 1.6216216216216218e-05, 'epoch': 0.68}


 68%|██████▊   | 560/819 [32:17<03:02,  1.42it/s]

[A[A                                          


                                                 
[A

 68%|██████▊   | 560/819 [32:17<03:02,  1.42it/s]

{'loss': 2.1914, 'grad_norm': 5.412710666656494, 'learning_rate': 1.590909090909091e-05, 'epoch': 0.68}



[A                                              

[A[A                                          


                                                 
[A

 68%|██████▊   | 560/819 [32:54<03:02,  1.42it/s]

{'eval_loss': 1.961602807044983, 'eval_runtime': 36.5372, 'eval_samples_per_second': 22.388, 'eval_steps_per_second': 22.388, 'epoch': 0.68}


 69%|██████▉   | 565/819 [32:55<12:46,  3.02s/it]

[A[A                                          


                                                 
[A

 69%|██████▉   | 565/819 [32:55<12:46,  3.02s/it]

{'loss': 1.9975, 'grad_norm': 15.164763450622559, 'learning_rate': 1.5601965601965606e-05, 'epoch': 0.69}


 70%|██████▉   | 570/819 [32:57<02:58,  1.39it/s]

[A[A                                          


                                                 
[A

 70%|██████▉   | 570/819 [32:57<02:58,  1.39it/s]

{'loss': 2.0883, 'grad_norm': 14.086338996887207, 'learning_rate': 1.5294840294840294e-05, 'epoch': 0.7}



[A                                              

[A[A                                          


                                                 
[A

 70%|██████▉   | 570/819 [33:38<02:58,  1.39it/s]

{'eval_loss': 1.9600647687911987, 'eval_runtime': 41.6863, 'eval_samples_per_second': 19.623, 'eval_steps_per_second': 19.623, 'epoch': 0.7}


 70%|███████   | 575/819 [33:40<13:37,  3.35s/it]

[A[A                                          


                                                 
[A

 70%|███████   | 575/819 [33:40<13:37,  3.35s/it]

{'loss': 2.1918, 'grad_norm': 8.274762153625488, 'learning_rate': 1.4987714987714988e-05, 'epoch': 0.7}


 71%|███████   | 580/819 [33:41<03:09,  1.26it/s]

[A[A                                          


                                                 
[A

 71%|███████   | 580/819 [33:41<03:09,  1.26it/s]

{'loss': 2.5788, 'grad_norm': 11.815483093261719, 'learning_rate': 1.4680589680589682e-05, 'epoch': 0.71}



[A                                              

[A[A                                          


                                                 
[A

 71%|███████   | 580/819 [34:22<03:09,  1.26it/s]

{'eval_loss': 1.958867073059082, 'eval_runtime': 40.6962, 'eval_samples_per_second': 20.1, 'eval_steps_per_second': 20.1, 'epoch': 0.71}


 71%|███████▏  | 585/819 [34:23<12:51,  3.30s/it]

[A[A                                          


                                                 
[A

 71%|███████▏  | 585/819 [34:23<12:51,  3.30s/it]

{'loss': 2.1645, 'grad_norm': 5.315128803253174, 'learning_rate': 1.4373464373464374e-05, 'epoch': 0.71}


 72%|███████▏  | 590/819 [34:25<02:54,  1.31it/s]

[A[A                                          


                                                 
[A

 72%|███████▏  | 590/819 [34:25<02:54,  1.31it/s]

{'loss': 1.7839, 'grad_norm': 12.131937980651855, 'learning_rate': 1.4066339066339068e-05, 'epoch': 0.72}



[A                                              

[A[A                                          


                                                 
[A

 72%|███████▏  | 590/819 [35:06<02:54,  1.31it/s]

{'eval_loss': 1.958220362663269, 'eval_runtime': 41.2041, 'eval_samples_per_second': 19.852, 'eval_steps_per_second': 19.852, 'epoch': 0.72}


 73%|███████▎  | 595/819 [35:08<12:41,  3.40s/it]

[A[A                                          


                                                 
[A

 73%|███████▎  | 595/819 [35:08<12:41,  3.40s/it]

{'loss': 2.7273, 'grad_norm': 16.73533821105957, 'learning_rate': 1.375921375921376e-05, 'epoch': 0.73}


 73%|███████▎  | 600/819 [35:10<03:02,  1.20it/s]

[A[A                                          


                                                 
[A

 73%|███████▎  | 600/819 [35:10<03:02,  1.20it/s]

{'loss': 1.8889, 'grad_norm': 9.529058456420898, 'learning_rate': 1.3452088452088452e-05, 'epoch': 0.73}



[A                                              

[A[A                                          


                                                 
[A

 73%|███████▎  | 600/819 [35:52<03:02,  1.20it/s]

{'eval_loss': 1.957782506942749, 'eval_runtime': 42.5414, 'eval_samples_per_second': 19.228, 'eval_steps_per_second': 19.228, 'epoch': 0.73}


 74%|███████▍  | 605/819 [35:54<12:12,  3.42s/it]

[A[A                                          


                                                 
[A

 74%|███████▍  | 605/819 [35:54<12:12,  3.42s/it]

{'loss': 2.027, 'grad_norm': 6.993647575378418, 'learning_rate': 1.3144963144963146e-05, 'epoch': 0.74}


 74%|███████▍  | 610/819 [35:55<02:49,  1.23it/s]

[A[A                                          


                                                 
[A

 74%|███████▍  | 610/819 [35:55<02:49,  1.23it/s]

{'loss': 2.4586, 'grad_norm': 13.997492790222168, 'learning_rate': 1.2837837837837838e-05, 'epoch': 0.74}



[A                                              

[A[A                                          


                                                 
[A

 74%|███████▍  | 610/819 [36:40<02:49,  1.23it/s]

{'eval_loss': 1.9568665027618408, 'eval_runtime': 45.1424, 'eval_samples_per_second': 18.12, 'eval_steps_per_second': 18.12, 'epoch': 0.74}


 75%|███████▌  | 615/819 [36:42<12:22,  3.64s/it]

[A[A                                          


                                                 
[A

 75%|███████▌  | 615/819 [36:42<12:22,  3.64s/it]

{'loss': 1.8805, 'grad_norm': 21.116897583007812, 'learning_rate': 1.2530712530712533e-05, 'epoch': 0.75}


 76%|███████▌  | 620/819 [36:43<02:42,  1.23it/s]

[A[A                                          


                                                 
[A

 76%|███████▌  | 620/819 [36:43<02:42,  1.23it/s]

{'loss': 2.1932, 'grad_norm': 9.931144714355469, 'learning_rate': 1.2223587223587223e-05, 'epoch': 0.76}



[A                                              

[A[A                                          


                                                 
[A

 76%|███████▌  | 620/819 [37:26<02:42,  1.23it/s]

{'eval_loss': 1.9562160968780518, 'eval_runtime': 42.6345, 'eval_samples_per_second': 19.186, 'eval_steps_per_second': 19.186, 'epoch': 0.76}


 76%|███████▋  | 625/819 [37:27<11:10,  3.45s/it]

[A[A                                          


                                                 
[A

 76%|███████▋  | 625/819 [37:27<11:10,  3.45s/it]

{'loss': 2.306, 'grad_norm': 11.258722305297852, 'learning_rate': 1.1916461916461917e-05, 'epoch': 0.76}


 77%|███████▋  | 630/819 [37:29<02:34,  1.22it/s]

[A[A                                          


                                                 
[A

 77%|███████▋  | 630/819 [37:29<02:34,  1.22it/s]

{'loss': 2.0747, 'grad_norm': 5.706875801086426, 'learning_rate': 1.1609336609336611e-05, 'epoch': 0.77}



[A                                              

[A[A                                          


                                                 
[A

 77%|███████▋  | 630/819 [38:11<02:34,  1.22it/s]

{'eval_loss': 1.956640601158142, 'eval_runtime': 42.5057, 'eval_samples_per_second': 19.244, 'eval_steps_per_second': 19.244, 'epoch': 0.77}


 78%|███████▊  | 635/819 [38:13<10:29,  3.42s/it]

[A[A                                          


                                                 
[A

 78%|███████▊  | 635/819 [38:13<10:29,  3.42s/it]

{'loss': 1.8277, 'grad_norm': 8.352355003356934, 'learning_rate': 1.1302211302211303e-05, 'epoch': 0.78}


 78%|███████▊  | 640/819 [38:14<02:27,  1.21it/s]

[A[A                                          


                                                 
[A

 78%|███████▊  | 640/819 [38:14<02:27,  1.21it/s]

{'loss': 2.1778, 'grad_norm': 7.27571439743042, 'learning_rate': 1.0995085995085995e-05, 'epoch': 0.78}



[A                                              

[A[A                                          


                                                 
[A

 78%|███████▊  | 640/819 [38:57<02:27,  1.21it/s]

{'eval_loss': 1.956787109375, 'eval_runtime': 42.4782, 'eval_samples_per_second': 19.257, 'eval_steps_per_second': 19.257, 'epoch': 0.78}


 79%|███████▉  | 645/819 [38:58<10:04,  3.47s/it]

[A[A                                          


                                                 
[A

 79%|███████▉  | 645/819 [38:58<10:04,  3.47s/it]

{'loss': 1.7532, 'grad_norm': 12.437250137329102, 'learning_rate': 1.0687960687960689e-05, 'epoch': 0.79}


 79%|███████▉  | 650/819 [39:00<02:19,  1.21it/s]

[A[A                                          


                                                 
[A

 79%|███████▉  | 650/819 [39:00<02:19,  1.21it/s]

{'loss': 1.8617, 'grad_norm': 10.853907585144043, 'learning_rate': 1.0380835380835381e-05, 'epoch': 0.79}



[A                                              

[A[A                                          


                                                 
[A

 79%|███████▉  | 650/819 [39:42<02:19,  1.21it/s]

{'eval_loss': 1.956416130065918, 'eval_runtime': 42.1057, 'eval_samples_per_second': 19.427, 'eval_steps_per_second': 19.427, 'epoch': 0.79}


 80%|███████▉  | 655/819 [39:44<09:22,  3.43s/it]

[A[A                                          


                                                 
[A

 80%|███████▉  | 655/819 [39:44<09:22,  3.43s/it]

{'loss': 2.3124, 'grad_norm': 13.205765724182129, 'learning_rate': 1.0073710073710075e-05, 'epoch': 0.8}


 81%|████████  | 660/819 [39:45<02:05,  1.27it/s]

[A[A                                          


                                                 
[A

 81%|████████  | 660/819 [39:45<02:05,  1.27it/s]

{'loss': 1.6864, 'grad_norm': 7.588732719421387, 'learning_rate': 9.766584766584767e-06, 'epoch': 0.81}



[A                                              

[A[A                                          


                                                 
[A

 81%|████████  | 660/819 [40:27<02:05,  1.27it/s]

{'eval_loss': 1.956027626991272, 'eval_runtime': 41.9158, 'eval_samples_per_second': 19.515, 'eval_steps_per_second': 19.515, 'epoch': 0.81}


 81%|████████  | 665/819 [40:29<08:46,  3.42s/it]

[A[A                                          


                                                 
[A

 81%|████████  | 665/819 [40:29<08:46,  3.42s/it]

{'loss': 2.0364, 'grad_norm': 7.080205917358398, 'learning_rate': 9.45945945945946e-06, 'epoch': 0.81}


 82%|████████▏ | 670/819 [40:30<01:57,  1.27it/s]

[A[A                                          


                                                 
[A

 82%|████████▏ | 670/819 [40:30<01:57,  1.27it/s]

{'loss': 1.7866, 'grad_norm': 5.4350266456604, 'learning_rate': 9.152334152334153e-06, 'epoch': 0.82}



[A                                              

[A[A                                          


                                                 
[A

 82%|████████▏ | 670/819 [41:12<01:57,  1.27it/s]

{'eval_loss': 1.9552792310714722, 'eval_runtime': 41.9697, 'eval_samples_per_second': 19.49, 'eval_steps_per_second': 19.49, 'epoch': 0.82}


 82%|████████▏ | 675/819 [41:14<08:09,  3.40s/it]

[A[A                                          


                                                 
[A

 82%|████████▏ | 675/819 [41:14<08:09,  3.40s/it]

{'loss': 1.8883, 'grad_norm': 9.35812759399414, 'learning_rate': 8.845208845208845e-06, 'epoch': 0.82}


 83%|████████▎ | 680/819 [41:15<01:48,  1.28it/s]

[A[A                                          


                                                 
[A

 83%|████████▎ | 680/819 [41:15<01:48,  1.28it/s]

{'loss': 1.6734, 'grad_norm': 9.876590728759766, 'learning_rate': 8.538083538083538e-06, 'epoch': 0.83}



[A                                              

[A[A                                          


                                                 
[A

 83%|████████▎ | 680/819 [41:58<01:48,  1.28it/s]

{'eval_loss': 1.9547841548919678, 'eval_runtime': 42.6542, 'eval_samples_per_second': 19.177, 'eval_steps_per_second': 19.177, 'epoch': 0.83}


 84%|████████▎ | 685/819 [41:59<07:43,  3.46s/it]

[A[A                                          


                                                 
[A

 84%|████████▎ | 685/819 [41:59<07:43,  3.46s/it]

{'loss': 1.9785, 'grad_norm': 6.777307510375977, 'learning_rate': 8.230958230958232e-06, 'epoch': 0.84}


 84%|████████▍ | 690/819 [42:01<01:45,  1.23it/s]

[A[A                                          


                                                 
[A

 84%|████████▍ | 690/819 [42:01<01:45,  1.23it/s]

{'loss': 2.2114, 'grad_norm': 10.508621215820312, 'learning_rate': 7.923832923832924e-06, 'epoch': 0.84}



[A                                              

[A[A                                          


                                                 
[A

 84%|████████▍ | 690/819 [42:44<01:45,  1.23it/s]

{'eval_loss': 1.9544352293014526, 'eval_runtime': 43.5104, 'eval_samples_per_second': 18.8, 'eval_steps_per_second': 18.8, 'epoch': 0.84}


 85%|████████▍ | 695/819 [42:46<07:13,  3.50s/it]

[A[A                                          


                                                 
[A

 85%|████████▍ | 695/819 [42:46<07:13,  3.50s/it]

{'loss': 1.9933, 'grad_norm': 21.598085403442383, 'learning_rate': 7.616707616707617e-06, 'epoch': 0.85}


 85%|████████▌ | 700/819 [42:48<01:47,  1.10it/s]

[A[A                                          


                                                 
[A

 85%|████████▌ | 700/819 [42:48<01:47,  1.10it/s]

{'loss': 2.5694, 'grad_norm': 6.872961521148682, 'learning_rate': 7.309582309582309e-06, 'epoch': 0.85}



[A                                              

[A[A                                          


                                                 
[A

 85%|████████▌ | 700/819 [43:33<01:47,  1.10it/s]

{'eval_loss': 1.9541192054748535, 'eval_runtime': 45.5462, 'eval_samples_per_second': 17.96, 'eval_steps_per_second': 17.96, 'epoch': 0.85}


 86%|████████▌ | 705/819 [43:35<07:09,  3.77s/it]

[A[A                                          


                                                 
[A

 86%|████████▌ | 705/819 [43:35<07:09,  3.77s/it]

{'loss': 1.9397, 'grad_norm': 8.632119178771973, 'learning_rate': 7.002457002457002e-06, 'epoch': 0.86}


 87%|████████▋ | 710/819 [43:37<01:33,  1.17it/s]

[A[A                                          


                                                 
[A

 87%|████████▋ | 710/819 [43:37<01:33,  1.17it/s]

{'loss': 2.6157, 'grad_norm': 9.270471572875977, 'learning_rate': 6.695331695331695e-06, 'epoch': 0.87}



[A                                              

[A[A                                          


                                                 
[A

 87%|████████▋ | 710/819 [44:22<01:33,  1.17it/s]

{'eval_loss': 1.9540554285049438, 'eval_runtime': 45.5603, 'eval_samples_per_second': 17.954, 'eval_steps_per_second': 17.954, 'epoch': 0.87}


 87%|████████▋ | 715/819 [44:24<06:22,  3.67s/it]

[A[A                                          


                                                 
[A

 87%|████████▋ | 715/819 [44:24<06:22,  3.67s/it]

{'loss': 2.1875, 'grad_norm': 8.178705215454102, 'learning_rate': 6.388206388206389e-06, 'epoch': 0.87}


 88%|████████▊ | 720/819 [44:26<01:23,  1.18it/s]

[A[A                                          


                                                 
[A

 88%|████████▊ | 720/819 [44:26<01:23,  1.18it/s]

{'loss': 2.4536, 'grad_norm': 114.71417236328125, 'learning_rate': 6.081081081081082e-06, 'epoch': 0.88}



[A                                              

[A[A                                          


                                                 
[A

 88%|████████▊ | 720/819 [45:07<01:23,  1.18it/s]

{'eval_loss': 1.953810453414917, 'eval_runtime': 41.603, 'eval_samples_per_second': 19.662, 'eval_steps_per_second': 19.662, 'epoch': 0.88}


 89%|████████▊ | 725/819 [45:09<05:16,  3.37s/it]

[A[A                                          


                                                 
[A

 89%|████████▊ | 725/819 [45:09<05:16,  3.37s/it]

{'loss': 1.9662, 'grad_norm': 8.91751766204834, 'learning_rate': 5.773955773955774e-06, 'epoch': 0.89}


 89%|████████▉ | 730/819 [45:10<01:08,  1.29it/s]

[A[A                                          


                                                 
[A

 89%|████████▉ | 730/819 [45:10<01:08,  1.29it/s]

{'loss': 1.6489, 'grad_norm': 12.331920623779297, 'learning_rate': 5.466830466830468e-06, 'epoch': 0.89}



[A                                              

[A[A                                          


                                                 
[A

 89%|████████▉ | 730/819 [45:45<01:08,  1.29it/s]

{'eval_loss': 1.953546404838562, 'eval_runtime': 35.4502, 'eval_samples_per_second': 23.075, 'eval_steps_per_second': 23.075, 'epoch': 0.89}


 90%|████████▉ | 735/819 [45:47<04:07,  2.95s/it]

[A[A                                          


                                                 
[A

 90%|████████▉ | 735/819 [45:47<04:07,  2.95s/it]

{'loss': 1.8128, 'grad_norm': 6.650791645050049, 'learning_rate': 5.15970515970516e-06, 'epoch': 0.9}


 90%|█████████ | 740/819 [45:48<00:58,  1.36it/s]

[A[A                                          


                                                 
[A

 90%|█████████ | 740/819 [45:48<00:58,  1.36it/s]

{'loss': 2.1134, 'grad_norm': 5.779880523681641, 'learning_rate': 4.852579852579853e-06, 'epoch': 0.9}



[A                                              

[A[A                                          


                                                 
[A

 90%|█████████ | 740/819 [46:24<00:58,  1.36it/s]

{'eval_loss': 1.953550100326538, 'eval_runtime': 35.8368, 'eval_samples_per_second': 22.826, 'eval_steps_per_second': 22.826, 'epoch': 0.9}


 91%|█████████ | 745/819 [46:26<03:39,  2.97s/it]

[A[A                                          


                                                 
[A

 91%|█████████ | 745/819 [46:26<03:39,  2.97s/it]

{'loss': 1.8196, 'grad_norm': 7.084177017211914, 'learning_rate': 4.5454545454545455e-06, 'epoch': 0.91}


 92%|█████████▏| 750/819 [46:27<00:48,  1.41it/s]

[A[A                                          


                                                 
[A

 92%|█████████▏| 750/819 [46:27<00:48,  1.41it/s]

{'loss': 2.1014, 'grad_norm': 10.375741958618164, 'learning_rate': 4.2383292383292384e-06, 'epoch': 0.92}



[A                                              

[A[A                                          


                                                 
[A

 92%|█████████▏| 750/819 [47:03<00:48,  1.41it/s]

{'eval_loss': 1.9534965753555298, 'eval_runtime': 35.1895, 'eval_samples_per_second': 23.246, 'eval_steps_per_second': 23.246, 'epoch': 0.92}


 92%|█████████▏| 755/819 [47:04<03:05,  2.89s/it]

[A[A                                          


                                                 
[A

 92%|█████████▏| 755/819 [47:04<03:05,  2.89s/it]

{'loss': 2.0888, 'grad_norm': 12.561498641967773, 'learning_rate': 3.931203931203931e-06, 'epoch': 0.92}


 93%|█████████▎| 760/819 [47:06<00:42,  1.39it/s]

[A[A                                          


                                                 
[A

 93%|█████████▎| 760/819 [47:06<00:42,  1.39it/s]

{'loss': 1.7441, 'grad_norm': 6.4378886222839355, 'learning_rate': 3.6240786240786243e-06, 'epoch': 0.93}



[A                                              

[A[A                                          


                                                 
[A

 93%|█████████▎| 760/819 [47:41<00:42,  1.39it/s]

{'eval_loss': 1.9533798694610596, 'eval_runtime': 35.4241, 'eval_samples_per_second': 23.092, 'eval_steps_per_second': 23.092, 'epoch': 0.93}


 93%|█████████▎| 765/819 [47:43<02:37,  2.92s/it]

[A[A                                          


                                                 
[A

 93%|█████████▎| 765/819 [47:43<02:37,  2.92s/it]

{'loss': 2.7457, 'grad_norm': 13.198820114135742, 'learning_rate': 3.3169533169533168e-06, 'epoch': 0.93}


 94%|█████████▍| 770/819 [47:44<00:34,  1.43it/s]

[A[A                                          


                                                 
[A

 94%|█████████▍| 770/819 [47:44<00:34,  1.43it/s]

{'loss': 2.4306, 'grad_norm': 20.70522117614746, 'learning_rate': 3.0098280098280097e-06, 'epoch': 0.94}



[A                                              

[A[A                                          


                                                 
[A

 94%|█████████▍| 770/819 [48:17<00:34,  1.43it/s]

{'eval_loss': 1.953180193901062, 'eval_runtime': 33.2151, 'eval_samples_per_second': 24.627, 'eval_steps_per_second': 24.627, 'epoch': 0.94}


 95%|█████████▍| 775/819 [48:19<02:03,  2.80s/it]

[A[A                                          


                                                 
[A

 95%|█████████▍| 775/819 [48:19<02:03,  2.80s/it]

{'loss': 1.9831, 'grad_norm': 7.841094970703125, 'learning_rate': 2.702702702702703e-06, 'epoch': 0.95}


 95%|█████████▌| 780/819 [48:20<00:25,  1.50it/s]

[A[A                                          


                                                 
[A

 95%|█████████▌| 780/819 [48:20<00:25,  1.50it/s]

{'loss': 2.686, 'grad_norm': 16.949844360351562, 'learning_rate': 2.395577395577396e-06, 'epoch': 0.95}



[A                                              

[A[A                                          


                                                 
[A

 95%|█████████▌| 780/819 [48:55<00:25,  1.50it/s]

{'eval_loss': 1.9529021978378296, 'eval_runtime': 34.7038, 'eval_samples_per_second': 23.571, 'eval_steps_per_second': 23.571, 'epoch': 0.95}


 96%|█████████▌| 785/819 [48:57<01:40,  2.97s/it]

[A[A                                          


                                                 
[A

 96%|█████████▌| 785/819 [48:57<01:40,  2.97s/it]

{'loss': 2.4104, 'grad_norm': 7.275155544281006, 'learning_rate': 2.0884520884520884e-06, 'epoch': 0.96}


 96%|█████████▋| 790/819 [48:58<00:21,  1.34it/s]

[A[A                                          


                                                 
[A

 96%|█████████▋| 790/819 [48:58<00:21,  1.34it/s]

{'loss': 2.3174, 'grad_norm': 8.590594291687012, 'learning_rate': 1.7813267813267816e-06, 'epoch': 0.96}



[A                                              

[A[A                                          


                                                 
[A

 96%|█████████▋| 790/819 [49:35<00:21,  1.34it/s]

{'eval_loss': 1.95270836353302, 'eval_runtime': 36.356, 'eval_samples_per_second': 22.5, 'eval_steps_per_second': 22.5, 'epoch': 0.96}


 97%|█████████▋| 795/819 [49:37<01:15,  3.13s/it]

[A[A                                          


                                                 
[A

 97%|█████████▋| 795/819 [49:37<01:15,  3.13s/it]

{'loss': 2.4716, 'grad_norm': 60.954429626464844, 'learning_rate': 1.4742014742014743e-06, 'epoch': 0.97}


 98%|█████████▊| 800/819 [49:39<00:15,  1.27it/s]

[A[A                                          


                                                 
[A

 98%|█████████▊| 800/819 [49:39<00:15,  1.27it/s]

{'loss': 1.5528, 'grad_norm': 6.455740928649902, 'learning_rate': 1.1670761670761672e-06, 'epoch': 0.98}



[A                                              

[A[A                                          


                                                 
[A

 98%|█████████▊| 800/819 [50:14<00:15,  1.27it/s]

{'eval_loss': 1.9525432586669922, 'eval_runtime': 35.2634, 'eval_samples_per_second': 23.197, 'eval_steps_per_second': 23.197, 'epoch': 0.98}


 98%|█████████▊| 805/819 [50:16<00:40,  2.92s/it]

[A[A                                          


                                                 
[A

 98%|█████████▊| 805/819 [50:16<00:40,  2.92s/it]

{'loss': 2.1126, 'grad_norm': 14.630450248718262, 'learning_rate': 8.5995085995086e-07, 'epoch': 0.98}


 99%|█████████▉| 810/819 [50:17<00:06,  1.46it/s]

[A[A                                          


                                                 
[A

 99%|█████████▉| 810/819 [50:17<00:06,  1.46it/s]

{'loss': 1.7185, 'grad_norm': 15.586721420288086, 'learning_rate': 5.528255528255528e-07, 'epoch': 0.99}



[A                                              

[A[A                                          


                                                 
[A

 99%|█████████▉| 810/819 [50:52<00:06,  1.46it/s]

{'eval_loss': 1.9525151252746582, 'eval_runtime': 35.2959, 'eval_samples_per_second': 23.175, 'eval_steps_per_second': 23.175, 'epoch': 0.99}


100%|█████████▉| 815/819 [50:54<00:11,  2.94s/it]

[A[A                                          


                                                 
[A

100%|█████████▉| 815/819 [50:54<00:11,  2.94s/it]

{'loss': 2.1544, 'grad_norm': 5.987986087799072, 'learning_rate': 2.457002457002457e-07, 'epoch': 1.0}


100%|██████████| 819/819 [50:55<00:00,  1.10it/s]

[A[A                                          


                                                 
[A

100%|██████████| 819/819 [50:55<00:00,  3.73s/it]


{'train_runtime': 3055.8511, 'train_samples_per_second': 0.268, 'train_steps_per_second': 0.268, 'train_loss': 2.2896759018065436, 'epoch': 1.0}
