In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
import torch
from torch.utils.data import Dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_df = pd.read_csv('/Users/shreyanakum/Downloads/Sophomore Year/Summer 2024/Aristocrat-Model/data_collection/caesar_cipher_output.csv')

print(data_df.head())

# BART tokenizer
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# Custom Dataset Class
class CaesarCipherDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=None):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        item = self.df.iloc[idx]
        input_ids = self.tokenizer(item['Ciphertext'], padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt").input_ids.squeeze()
        labels = self.tokenizer(item['Plaintext'], padding='max_length', truncation=True, max_length=self.max_length, return_tensors="pt").input_ids.squeeze()
        
        return {'input_ids': input_ids, 'labels': labels}

# Set a max length for sequences --> change later to longer to see what happens
max_length = 128

# Create Dataset
dataset = CaesarCipherDataset(data_df, bart_tokenizer, max_length=max_length)

                                           Plaintext  \
0  His playing has taken him across the United Ki...   
1  In recent years Rino has developed his Conduct...   
2  Since arriving in Milan orchestrated Santa Mes...   
3  Recently at the Garden of Forgiveness, the two...   
4  Their shaded colonnades became busy markets on...   

                                          Ciphertext  
0  DEO LHWUEJC DWO PWGAJ DEI WYNKOO PDA QJEPAZ GE...  
1  EJ NAYAJP UAWNO NEJK DWO ZARAHKLAZ DEO YKJZQYP...  
2  OEJYA WNNEREJC EJ IEHWJ KNYDAOPNWPAZ OWJPW IAO...  
3  NAYAJPHU WP PDA CWNZAJ KB BKNCERAJAOO, PDA PSK...  
4  PDAEN ODWZAZ YKHKJJWZAO XAYWIA XQOU IWNGAPO KJ...  




In [3]:
# Bart for sequence-to-sequence tasks
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
data_collator = DataCollatorForSeq2Seq(tokenizer=bart_tokenizer, model=model)

if torch.cuda.is_available():
    model.to('cuda')
    print(f"Using GPU: {torch.cuda.get_device_name(0)}") # for nvidia gpu in case its used
else:
    print("Using CPU")


training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir='./logs', 
    logging_strategy="steps",  # log every N steps
    logging_steps=10, # adjust to bigger
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    data_collator=data_collator,
)

trainer.train()
trainer.save_model("./caesar_cipher_decoder")

Using CPU


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  0%|          | 10/25428 [00:08<4:56:44,  1.43it/s]

{'loss': 11.4532, 'grad_norm': 26.63015365600586, 'learning_rate': 1.9992134654711343e-05, 'epoch': 0.0}


  0%|          | 20/25428 [00:14<4:41:29,  1.50it/s]

{'loss': 8.4901, 'grad_norm': 48.14887619018555, 'learning_rate': 1.9984269309422687e-05, 'epoch': 0.0}


  0%|          | 30/25428 [00:21<4:34:23,  1.54it/s]

{'loss': 6.1919, 'grad_norm': 43.32392883300781, 'learning_rate': 1.9976403964134028e-05, 'epoch': 0.0}


  0%|          | 40/25428 [00:27<4:36:04,  1.53it/s]

{'loss': 5.1757, 'grad_norm': 43.00865173339844, 'learning_rate': 1.996853861884537e-05, 'epoch': 0.0}


  0%|          | 50/25428 [00:34<4:37:03,  1.53it/s]

{'loss': 4.4844, 'grad_norm': 43.127418518066406, 'learning_rate': 1.996067327355671e-05, 'epoch': 0.01}


  0%|          | 60/25428 [00:40<4:29:06,  1.57it/s]

{'loss': 3.9691, 'grad_norm': 43.737606048583984, 'learning_rate': 1.995280792826805e-05, 'epoch': 0.01}


  0%|          | 70/25428 [00:47<4:38:16,  1.52it/s]

{'loss': 3.5448, 'grad_norm': 43.24372100830078, 'learning_rate': 1.9944942582979395e-05, 'epoch': 0.01}


  0%|          | 80/25428 [00:53<4:35:59,  1.53it/s]

{'loss': 3.218, 'grad_norm': 41.49054718017578, 'learning_rate': 1.9937077237690736e-05, 'epoch': 0.01}


  0%|          | 90/25428 [01:00<4:30:22,  1.56it/s]

{'loss': 2.7923, 'grad_norm': 39.59723663330078, 'learning_rate': 1.9929211892402077e-05, 'epoch': 0.01}


  0%|          | 100/25428 [01:07<4:38:59,  1.51it/s]

{'loss': 2.433, 'grad_norm': 36.586448669433594, 'learning_rate': 1.992134654711342e-05, 'epoch': 0.01}


  0%|          | 110/25428 [01:13<4:51:27,  1.45it/s]

{'loss': 2.066, 'grad_norm': 32.983306884765625, 'learning_rate': 1.9913481201824763e-05, 'epoch': 0.01}


  0%|          | 120/25428 [01:20<4:36:22,  1.53it/s]

{'loss': 1.8024, 'grad_norm': 27.39020347595215, 'learning_rate': 1.9905615856536104e-05, 'epoch': 0.01}


  1%|          | 130/25428 [01:26<4:31:53,  1.55it/s]

{'loss': 1.5359, 'grad_norm': 22.560914993286133, 'learning_rate': 1.9897750511247445e-05, 'epoch': 0.02}


  1%|          | 140/25428 [01:33<4:41:38,  1.50it/s]

{'loss': 1.3687, 'grad_norm': 17.36453628540039, 'learning_rate': 1.9889885165958786e-05, 'epoch': 0.02}


  1%|          | 150/25428 [01:40<4:32:19,  1.55it/s]

{'loss': 1.2404, 'grad_norm': 13.225412368774414, 'learning_rate': 1.988201982067013e-05, 'epoch': 0.02}


  1%|          | 160/25428 [01:46<4:39:10,  1.51it/s]

{'loss': 1.0146, 'grad_norm': 8.527945518493652, 'learning_rate': 1.987415447538147e-05, 'epoch': 0.02}


  1%|          | 170/25428 [01:53<4:30:11,  1.56it/s]

{'loss': 0.9516, 'grad_norm': 6.309955596923828, 'learning_rate': 1.9866289130092812e-05, 'epoch': 0.02}


  1%|          | 180/25428 [01:59<4:37:04,  1.52it/s]

{'loss': 0.8965, 'grad_norm': 4.058046817779541, 'learning_rate': 1.9858423784804156e-05, 'epoch': 0.02}


  1%|          | 190/25428 [02:06<4:35:01,  1.53it/s]

{'loss': 0.9236, 'grad_norm': 3.2001566886901855, 'learning_rate': 1.9850558439515497e-05, 'epoch': 0.02}


  1%|          | 200/25428 [02:12<4:36:52,  1.52it/s]

{'loss': 0.8432, 'grad_norm': 3.058065176010132, 'learning_rate': 1.984269309422684e-05, 'epoch': 0.02}


  1%|          | 210/25428 [02:19<4:30:08,  1.56it/s]

{'loss': 0.9059, 'grad_norm': 2.8480641841888428, 'learning_rate': 1.983482774893818e-05, 'epoch': 0.02}


  1%|          | 220/25428 [02:26<4:35:26,  1.53it/s]

{'loss': 0.8332, 'grad_norm': 1.9986951351165771, 'learning_rate': 1.982696240364952e-05, 'epoch': 0.03}


  1%|          | 230/25428 [02:32<4:35:12,  1.53it/s]

{'loss': 0.8424, 'grad_norm': 1.9060739278793335, 'learning_rate': 1.9819097058360865e-05, 'epoch': 0.03}


  1%|          | 240/25428 [02:38<4:28:16,  1.56it/s]

{'loss': 0.8123, 'grad_norm': 2.1517012119293213, 'learning_rate': 1.9811231713072206e-05, 'epoch': 0.03}


  1%|          | 250/25428 [02:45<4:40:36,  1.50it/s]

{'loss': 0.8339, 'grad_norm': 2.152052879333496, 'learning_rate': 1.9803366367783547e-05, 'epoch': 0.03}


  1%|          | 260/25428 [02:52<4:33:22,  1.53it/s]

{'loss': 0.7594, 'grad_norm': 2.2537624835968018, 'learning_rate': 1.979550102249489e-05, 'epoch': 0.03}


  1%|          | 270/25428 [02:58<4:34:46,  1.53it/s]

{'loss': 0.9093, 'grad_norm': 1.5688626766204834, 'learning_rate': 1.9787635677206232e-05, 'epoch': 0.03}


  1%|          | 280/25428 [03:05<4:28:23,  1.56it/s]

{'loss': 0.7935, 'grad_norm': 1.6935049295425415, 'learning_rate': 1.9779770331917573e-05, 'epoch': 0.03}


  1%|          | 290/25428 [03:11<4:31:52,  1.54it/s]

{'loss': 0.8063, 'grad_norm': 2.2442891597747803, 'learning_rate': 1.9771904986628914e-05, 'epoch': 0.03}


  1%|          | 300/25428 [03:18<4:38:23,  1.50it/s]

{'loss': 0.8603, 'grad_norm': 1.419424057006836, 'learning_rate': 1.9764039641340255e-05, 'epoch': 0.04}


  1%|          | 310/25428 [03:24<4:28:02,  1.56it/s]

{'loss': 0.8988, 'grad_norm': 1.6694315671920776, 'learning_rate': 1.97561742960516e-05, 'epoch': 0.04}


  1%|▏         | 320/25428 [03:31<4:28:20,  1.56it/s]

{'loss': 0.6907, 'grad_norm': 1.5229085683822632, 'learning_rate': 1.974830895076294e-05, 'epoch': 0.04}


  1%|▏         | 330/25428 [03:37<4:35:05,  1.52it/s]

{'loss': 0.7562, 'grad_norm': 1.2943518161773682, 'learning_rate': 1.974044360547428e-05, 'epoch': 0.04}


  1%|▏         | 340/25428 [03:44<4:30:57,  1.54it/s]

{'loss': 0.7676, 'grad_norm': 1.8365917205810547, 'learning_rate': 1.9732578260185626e-05, 'epoch': 0.04}


  1%|▏         | 350/25428 [03:50<4:37:08,  1.51it/s]

{'loss': 0.6925, 'grad_norm': 1.9045078754425049, 'learning_rate': 1.9724712914896967e-05, 'epoch': 0.04}


  1%|▏         | 360/25428 [03:57<4:31:10,  1.54it/s]

{'loss': 0.7341, 'grad_norm': 2.254157543182373, 'learning_rate': 1.9716847569608308e-05, 'epoch': 0.04}


  1%|▏         | 370/25428 [04:04<4:33:11,  1.53it/s]

{'loss': 0.7271, 'grad_norm': 1.7609038352966309, 'learning_rate': 1.970898222431965e-05, 'epoch': 0.04}


  1%|▏         | 380/25428 [04:10<4:31:53,  1.54it/s]

{'loss': 0.773, 'grad_norm': 1.6173704862594604, 'learning_rate': 1.970111687903099e-05, 'epoch': 0.04}


  2%|▏         | 390/25428 [04:17<4:32:29,  1.53it/s]

{'loss': 0.6302, 'grad_norm': 1.5482022762298584, 'learning_rate': 1.9693251533742334e-05, 'epoch': 0.05}


  2%|▏         | 400/25428 [04:23<4:33:43,  1.52it/s]

{'loss': 0.8037, 'grad_norm': 2.0825533866882324, 'learning_rate': 1.9685386188453675e-05, 'epoch': 0.05}


  2%|▏         | 410/25428 [04:30<4:27:33,  1.56it/s]

{'loss': 0.6967, 'grad_norm': 1.5516234636306763, 'learning_rate': 1.9677520843165016e-05, 'epoch': 0.05}


  2%|▏         | 420/25428 [04:36<4:28:39,  1.55it/s]

{'loss': 0.7658, 'grad_norm': 1.6503534317016602, 'learning_rate': 1.966965549787636e-05, 'epoch': 0.05}


  2%|▏         | 430/25428 [04:43<4:33:48,  1.52it/s]

{'loss': 0.7133, 'grad_norm': 1.9401118755340576, 'learning_rate': 1.96617901525877e-05, 'epoch': 0.05}


  2%|▏         | 440/25428 [04:49<4:30:19,  1.54it/s]

{'loss': 0.8756, 'grad_norm': 3.0511980056762695, 'learning_rate': 1.9653924807299042e-05, 'epoch': 0.05}


  2%|▏         | 450/25428 [04:56<4:27:40,  1.56it/s]

{'loss': 0.6794, 'grad_norm': 1.5044806003570557, 'learning_rate': 1.9646059462010383e-05, 'epoch': 0.05}


  2%|▏         | 460/25428 [05:02<4:32:03,  1.53it/s]

{'loss': 0.732, 'grad_norm': 1.8606864213943481, 'learning_rate': 1.9638194116721724e-05, 'epoch': 0.05}


  2%|▏         | 470/25428 [05:09<4:29:25,  1.54it/s]

{'loss': 0.7374, 'grad_norm': 1.5327504873275757, 'learning_rate': 1.963032877143307e-05, 'epoch': 0.06}


  2%|▏         | 480/25428 [05:15<4:30:39,  1.54it/s]

{'loss': 0.7641, 'grad_norm': 1.857775330543518, 'learning_rate': 1.962246342614441e-05, 'epoch': 0.06}


  2%|▏         | 490/25428 [05:22<4:27:47,  1.55it/s]

{'loss': 0.7336, 'grad_norm': 1.7392195463180542, 'learning_rate': 1.961459808085575e-05, 'epoch': 0.06}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.7378, 'grad_norm': 1.6414406299591064, 'learning_rate': 1.9606732735567095e-05, 'epoch': 0.06}


  2%|▏         | 510/25428 [05:38<4:43:33,  1.46it/s] 

{'loss': 0.6517, 'grad_norm': 1.9727591276168823, 'learning_rate': 1.9598867390278436e-05, 'epoch': 0.06}


  2%|▏         | 520/25428 [05:44<4:29:09,  1.54it/s]

{'loss': 0.7596, 'grad_norm': 1.92246413230896, 'learning_rate': 1.9591002044989777e-05, 'epoch': 0.06}


  2%|▏         | 530/25428 [05:51<4:27:37,  1.55it/s]

{'loss': 0.6279, 'grad_norm': 1.753363013267517, 'learning_rate': 1.9583136699701118e-05, 'epoch': 0.06}


  2%|▏         | 540/25428 [05:57<4:29:23,  1.54it/s]

{'loss': 0.6941, 'grad_norm': 1.5024470090866089, 'learning_rate': 1.957527135441246e-05, 'epoch': 0.06}


  2%|▏         | 550/25428 [06:04<4:31:00,  1.53it/s]

{'loss': 0.6394, 'grad_norm': 1.6527196168899536, 'learning_rate': 1.95674060091238e-05, 'epoch': 0.06}


  2%|▏         | 560/25428 [06:10<4:28:24,  1.54it/s]

{'loss': 0.7037, 'grad_norm': 1.7771046161651611, 'learning_rate': 1.9559540663835144e-05, 'epoch': 0.07}


  2%|▏         | 570/25428 [06:17<4:28:53,  1.54it/s]

{'loss': 0.7996, 'grad_norm': 1.750868320465088, 'learning_rate': 1.9551675318546485e-05, 'epoch': 0.07}


  2%|▏         | 580/25428 [06:23<4:28:08,  1.54it/s]

{'loss': 0.655, 'grad_norm': 1.4697009325027466, 'learning_rate': 1.9543809973257826e-05, 'epoch': 0.07}


  2%|▏         | 590/25428 [06:30<4:32:19,  1.52it/s]

{'loss': 0.7621, 'grad_norm': 1.861336588859558, 'learning_rate': 1.953594462796917e-05, 'epoch': 0.07}


  2%|▏         | 600/25428 [06:36<4:23:07,  1.57it/s]

{'loss': 0.7612, 'grad_norm': 2.174323558807373, 'learning_rate': 1.952807928268051e-05, 'epoch': 0.07}


  2%|▏         | 610/25428 [06:43<4:23:37,  1.57it/s]

{'loss': 0.7392, 'grad_norm': 1.6383507251739502, 'learning_rate': 1.9520213937391853e-05, 'epoch': 0.07}


  2%|▏         | 620/25428 [06:49<4:26:10,  1.55it/s]

{'loss': 0.7258, 'grad_norm': 1.7511736154556274, 'learning_rate': 1.9512348592103194e-05, 'epoch': 0.07}


  2%|▏         | 630/25428 [06:56<4:32:54,  1.51it/s]

{'loss': 0.7034, 'grad_norm': 1.7525389194488525, 'learning_rate': 1.9504483246814535e-05, 'epoch': 0.07}


  3%|▎         | 640/25428 [07:02<4:27:42,  1.54it/s]

{'loss': 0.6445, 'grad_norm': 2.1227669715881348, 'learning_rate': 1.949661790152588e-05, 'epoch': 0.08}


  3%|▎         | 650/25428 [07:08<4:25:27,  1.56it/s]

{'loss': 0.6673, 'grad_norm': 2.0057902336120605, 'learning_rate': 1.948875255623722e-05, 'epoch': 0.08}


  3%|▎         | 660/25428 [07:15<4:30:01,  1.53it/s]

{'loss': 0.7697, 'grad_norm': 2.2731335163116455, 'learning_rate': 1.948088721094856e-05, 'epoch': 0.08}


  3%|▎         | 670/25428 [07:22<4:31:46,  1.52it/s]

{'loss': 0.7626, 'grad_norm': 1.8892476558685303, 'learning_rate': 1.9473021865659905e-05, 'epoch': 0.08}


  3%|▎         | 680/25428 [07:28<4:25:09,  1.56it/s]

{'loss': 0.6477, 'grad_norm': 1.64429771900177, 'learning_rate': 1.9465156520371246e-05, 'epoch': 0.08}


  3%|▎         | 690/25428 [07:35<4:23:32,  1.56it/s]

{'loss': 0.6506, 'grad_norm': 1.7069448232650757, 'learning_rate': 1.9457291175082587e-05, 'epoch': 0.08}


  3%|▎         | 700/25428 [07:41<4:28:43,  1.53it/s]

{'loss': 0.7787, 'grad_norm': 1.8887261152267456, 'learning_rate': 1.9449425829793928e-05, 'epoch': 0.08}


  3%|▎         | 710/25428 [07:47<4:24:07,  1.56it/s]

{'loss': 0.7894, 'grad_norm': 2.286461114883423, 'learning_rate': 1.944156048450527e-05, 'epoch': 0.08}


  3%|▎         | 720/25428 [07:54<4:27:02,  1.54it/s]

{'loss': 0.6759, 'grad_norm': 1.9545013904571533, 'learning_rate': 1.9433695139216614e-05, 'epoch': 0.08}


  3%|▎         | 730/25428 [08:00<4:25:40,  1.55it/s]

{'loss': 0.6259, 'grad_norm': 2.0941853523254395, 'learning_rate': 1.9425829793927955e-05, 'epoch': 0.09}


  3%|▎         | 740/25428 [08:07<4:22:48,  1.57it/s]

{'loss': 0.7275, 'grad_norm': 1.8918927907943726, 'learning_rate': 1.9417964448639296e-05, 'epoch': 0.09}


  3%|▎         | 750/25428 [08:13<4:35:15,  1.49it/s]

{'loss': 0.6408, 'grad_norm': 1.6909617185592651, 'learning_rate': 1.941009910335064e-05, 'epoch': 0.09}


  3%|▎         | 760/25428 [08:20<4:30:27,  1.52it/s]

{'loss': 0.6765, 'grad_norm': 1.7125245332717896, 'learning_rate': 1.940223375806198e-05, 'epoch': 0.09}


  3%|▎         | 770/25428 [08:26<4:31:01,  1.52it/s]

{'loss': 0.7025, 'grad_norm': 1.8063983917236328, 'learning_rate': 1.9394368412773322e-05, 'epoch': 0.09}


  3%|▎         | 780/25428 [08:33<4:25:26,  1.55it/s]

{'loss': 0.6746, 'grad_norm': 1.9726288318634033, 'learning_rate': 1.9386503067484663e-05, 'epoch': 0.09}


  3%|▎         | 790/25428 [08:40<4:28:05,  1.53it/s]

{'loss': 0.6615, 'grad_norm': 3.7935781478881836, 'learning_rate': 1.9378637722196004e-05, 'epoch': 0.09}


  3%|▎         | 800/25428 [08:46<4:20:48,  1.57it/s]

{'loss': 0.725, 'grad_norm': 1.5887795686721802, 'learning_rate': 1.937077237690735e-05, 'epoch': 0.09}


  3%|▎         | 810/25428 [08:52<4:27:17,  1.54it/s]

{'loss': 0.6651, 'grad_norm': 2.2821877002716064, 'learning_rate': 1.936290703161869e-05, 'epoch': 0.1}


  3%|▎         | 820/25428 [08:59<4:25:32,  1.54it/s]

{'loss': 0.6254, 'grad_norm': 1.7657681703567505, 'learning_rate': 1.935504168633003e-05, 'epoch': 0.1}


  3%|▎         | 830/25428 [09:05<4:30:08,  1.52it/s]

{'loss': 0.7009, 'grad_norm': 1.904114007949829, 'learning_rate': 1.9347176341041375e-05, 'epoch': 0.1}


  3%|▎         | 840/25428 [09:12<4:29:35,  1.52it/s]

{'loss': 0.7171, 'grad_norm': 3.469884157180786, 'learning_rate': 1.9339310995752716e-05, 'epoch': 0.1}


  3%|▎         | 850/25428 [09:19<4:26:17,  1.54it/s]

{'loss': 0.7493, 'grad_norm': 2.5359132289886475, 'learning_rate': 1.9331445650464057e-05, 'epoch': 0.1}


  3%|▎         | 860/25428 [09:25<4:22:30,  1.56it/s]

{'loss': 0.6183, 'grad_norm': 1.5241295099258423, 'learning_rate': 1.9323580305175398e-05, 'epoch': 0.1}


  3%|▎         | 870/25428 [09:31<4:21:22,  1.57it/s]

{'loss': 0.6735, 'grad_norm': 2.058004856109619, 'learning_rate': 1.931571495988674e-05, 'epoch': 0.1}


  3%|▎         | 880/25428 [09:38<4:28:35,  1.52it/s]

{'loss': 0.6379, 'grad_norm': 1.8549836874008179, 'learning_rate': 1.9307849614598083e-05, 'epoch': 0.1}


  4%|▎         | 890/25428 [09:45<4:27:00,  1.53it/s]

{'loss': 0.6928, 'grad_norm': 2.209618330001831, 'learning_rate': 1.9299984269309424e-05, 'epoch': 0.11}


  4%|▎         | 900/25428 [09:51<4:27:05,  1.53it/s]

{'loss': 0.5788, 'grad_norm': 1.3763363361358643, 'learning_rate': 1.9292118924020765e-05, 'epoch': 0.11}


  4%|▎         | 910/25428 [09:58<4:30:24,  1.51it/s]

{'loss': 0.6864, 'grad_norm': 2.1937856674194336, 'learning_rate': 1.928425357873211e-05, 'epoch': 0.11}


  4%|▎         | 920/25428 [10:04<4:30:08,  1.51it/s]

{'loss': 0.6588, 'grad_norm': 2.2298521995544434, 'learning_rate': 1.927638823344345e-05, 'epoch': 0.11}


  4%|▎         | 930/25428 [10:11<4:30:23,  1.51it/s]

{'loss': 0.6994, 'grad_norm': 1.7789764404296875, 'learning_rate': 1.926852288815479e-05, 'epoch': 0.11}


  4%|▎         | 940/25428 [10:17<4:28:49,  1.52it/s]

{'loss': 0.6007, 'grad_norm': 2.095271110534668, 'learning_rate': 1.9260657542866132e-05, 'epoch': 0.11}


  4%|▎         | 950/25428 [10:24<4:26:43,  1.53it/s]

{'loss': 0.6031, 'grad_norm': 2.5035626888275146, 'learning_rate': 1.9252792197577473e-05, 'epoch': 0.11}


  4%|▍         | 960/25428 [10:30<4:25:34,  1.54it/s]

{'loss': 0.6746, 'grad_norm': 2.0775704383850098, 'learning_rate': 1.9244926852288818e-05, 'epoch': 0.11}


  4%|▍         | 970/25428 [10:37<4:34:20,  1.49it/s]

{'loss': 0.6716, 'grad_norm': 2.4171602725982666, 'learning_rate': 1.923706150700016e-05, 'epoch': 0.11}


  4%|▍         | 980/25428 [10:44<4:24:34,  1.54it/s]

{'loss': 0.6595, 'grad_norm': 1.8119983673095703, 'learning_rate': 1.92291961617115e-05, 'epoch': 0.12}


  4%|▍         | 990/25428 [10:50<4:27:14,  1.52it/s]

{'loss': 0.596, 'grad_norm': 1.7350788116455078, 'learning_rate': 1.9221330816422844e-05, 'epoch': 0.12}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.6518, 'grad_norm': 2.553313732147217, 'learning_rate': 1.9213465471134185e-05, 'epoch': 0.12}


  4%|▍         | 1010/25428 [11:06<4:42:09,  1.44it/s] 

{'loss': 0.5577, 'grad_norm': 1.7018957138061523, 'learning_rate': 1.9205600125845526e-05, 'epoch': 0.12}


  4%|▍         | 1020/25428 [11:13<4:25:09,  1.53it/s]

{'loss': 0.593, 'grad_norm': 1.7378764152526855, 'learning_rate': 1.9197734780556867e-05, 'epoch': 0.12}


  4%|▍         | 1030/25428 [11:19<4:22:37,  1.55it/s]

{'loss': 0.6155, 'grad_norm': 2.1643669605255127, 'learning_rate': 1.9189869435268208e-05, 'epoch': 0.12}


  4%|▍         | 1040/25428 [11:26<4:27:19,  1.52it/s]

{'loss': 0.5673, 'grad_norm': 2.7845261096954346, 'learning_rate': 1.9182004089979552e-05, 'epoch': 0.12}


  4%|▍         | 1050/25428 [11:32<4:22:42,  1.55it/s]

{'loss': 0.6588, 'grad_norm': 2.2498233318328857, 'learning_rate': 1.9174138744690893e-05, 'epoch': 0.12}


  4%|▍         | 1060/25428 [11:39<4:23:38,  1.54it/s]

{'loss': 0.5682, 'grad_norm': 2.2848596572875977, 'learning_rate': 1.9166273399402234e-05, 'epoch': 0.13}


  4%|▍         | 1070/25428 [11:45<4:29:27,  1.51it/s]

{'loss': 0.6007, 'grad_norm': 1.7496601343154907, 'learning_rate': 1.915840805411358e-05, 'epoch': 0.13}


  4%|▍         | 1080/25428 [11:52<4:32:23,  1.49it/s]

{'loss': 0.5388, 'grad_norm': 2.4691383838653564, 'learning_rate': 1.915054270882492e-05, 'epoch': 0.13}


  4%|▍         | 1090/25428 [11:59<4:27:30,  1.52it/s]

{'loss': 0.6117, 'grad_norm': 2.699200391769409, 'learning_rate': 1.914267736353626e-05, 'epoch': 0.13}


  4%|▍         | 1100/25428 [12:05<4:19:35,  1.56it/s]

{'loss': 0.6091, 'grad_norm': 1.9332337379455566, 'learning_rate': 1.91348120182476e-05, 'epoch': 0.13}


  4%|▍         | 1110/25428 [12:12<4:28:10,  1.51it/s]

{'loss': 0.6234, 'grad_norm': 1.9963701963424683, 'learning_rate': 1.9126946672958943e-05, 'epoch': 0.13}


  4%|▍         | 1120/25428 [12:18<4:26:09,  1.52it/s]

{'loss': 0.5021, 'grad_norm': 2.0044896602630615, 'learning_rate': 1.9119081327670287e-05, 'epoch': 0.13}


  4%|▍         | 1130/25428 [12:25<4:22:53,  1.54it/s]

{'loss': 0.5966, 'grad_norm': 1.4936367273330688, 'learning_rate': 1.9111215982381628e-05, 'epoch': 0.13}


  4%|▍         | 1140/25428 [12:31<4:20:56,  1.55it/s]

{'loss': 0.6056, 'grad_norm': 2.9206361770629883, 'learning_rate': 1.910335063709297e-05, 'epoch': 0.13}


  5%|▍         | 1150/25428 [12:38<4:32:31,  1.48it/s]

{'loss': 0.4519, 'grad_norm': 1.8358268737792969, 'learning_rate': 1.9095485291804313e-05, 'epoch': 0.14}


  5%|▍         | 1160/25428 [12:44<4:20:04,  1.56it/s]

{'loss': 0.5382, 'grad_norm': 2.1516010761260986, 'learning_rate': 1.9087619946515654e-05, 'epoch': 0.14}


  5%|▍         | 1170/25428 [12:51<4:23:17,  1.54it/s]

{'loss': 0.6829, 'grad_norm': 1.765152096748352, 'learning_rate': 1.9079754601226995e-05, 'epoch': 0.14}


  5%|▍         | 1180/25428 [12:57<4:24:30,  1.53it/s]

{'loss': 0.6159, 'grad_norm': 1.8553811311721802, 'learning_rate': 1.9071889255938336e-05, 'epoch': 0.14}


  5%|▍         | 1190/25428 [13:04<4:21:09,  1.55it/s]

{'loss': 0.6729, 'grad_norm': 2.051631450653076, 'learning_rate': 1.9064023910649677e-05, 'epoch': 0.14}


  5%|▍         | 1200/25428 [13:10<4:18:55,  1.56it/s]

{'loss': 0.5148, 'grad_norm': 1.69899582862854, 'learning_rate': 1.905615856536102e-05, 'epoch': 0.14}


  5%|▍         | 1210/25428 [13:17<4:25:09,  1.52it/s]

{'loss': 0.6193, 'grad_norm': 2.18174409866333, 'learning_rate': 1.9048293220072363e-05, 'epoch': 0.14}


  5%|▍         | 1220/25428 [13:24<4:24:07,  1.53it/s]

{'loss': 0.6013, 'grad_norm': 1.7056037187576294, 'learning_rate': 1.9040427874783704e-05, 'epoch': 0.14}


  5%|▍         | 1230/25428 [13:30<4:25:48,  1.52it/s]

{'loss': 0.684, 'grad_norm': 2.072906017303467, 'learning_rate': 1.9032562529495048e-05, 'epoch': 0.15}


  5%|▍         | 1240/25428 [13:37<4:24:00,  1.53it/s]

{'loss': 0.5298, 'grad_norm': 1.940177321434021, 'learning_rate': 1.902469718420639e-05, 'epoch': 0.15}


  5%|▍         | 1250/25428 [13:43<4:21:52,  1.54it/s]

{'loss': 0.5999, 'grad_norm': 1.873260498046875, 'learning_rate': 1.901683183891773e-05, 'epoch': 0.15}


  5%|▍         | 1260/25428 [13:50<4:18:53,  1.56it/s]

{'loss': 0.5661, 'grad_norm': 1.6981970071792603, 'learning_rate': 1.9008966493629074e-05, 'epoch': 0.15}


  5%|▍         | 1270/25428 [13:56<4:22:02,  1.54it/s]

{'loss': 0.5615, 'grad_norm': 1.7696911096572876, 'learning_rate': 1.9001101148340412e-05, 'epoch': 0.15}


  5%|▌         | 1280/25428 [14:03<4:19:26,  1.55it/s]

{'loss': 0.5367, 'grad_norm': 1.795861840248108, 'learning_rate': 1.8993235803051756e-05, 'epoch': 0.15}


  5%|▌         | 1290/25428 [14:09<4:21:36,  1.54it/s]

{'loss': 0.514, 'grad_norm': 1.9430660009384155, 'learning_rate': 1.8985370457763097e-05, 'epoch': 0.15}


  5%|▌         | 1300/25428 [14:16<4:28:12,  1.50it/s]

{'loss': 0.494, 'grad_norm': 1.856080174446106, 'learning_rate': 1.8977505112474438e-05, 'epoch': 0.15}


  5%|▌         | 1310/25428 [14:22<4:17:44,  1.56it/s]

{'loss': 0.5515, 'grad_norm': 2.2341747283935547, 'learning_rate': 1.8969639767185783e-05, 'epoch': 0.15}


  5%|▌         | 1320/25428 [14:29<4:19:07,  1.55it/s]

{'loss': 0.4889, 'grad_norm': 1.8600579500198364, 'learning_rate': 1.8961774421897124e-05, 'epoch': 0.16}


  5%|▌         | 1330/25428 [14:35<4:28:07,  1.50it/s]

{'loss': 0.5864, 'grad_norm': 1.8487517833709717, 'learning_rate': 1.8953909076608465e-05, 'epoch': 0.16}


  5%|▌         | 1340/25428 [14:42<4:22:00,  1.53it/s]

{'loss': 0.5226, 'grad_norm': 2.272958278656006, 'learning_rate': 1.894604373131981e-05, 'epoch': 0.16}


  5%|▌         | 1350/25428 [14:49<4:26:16,  1.51it/s]

{'loss': 0.4932, 'grad_norm': 2.017077922821045, 'learning_rate': 1.8938178386031147e-05, 'epoch': 0.16}


  5%|▌         | 1360/25428 [14:55<4:25:04,  1.51it/s]

{'loss': 0.5422, 'grad_norm': 2.1622314453125, 'learning_rate': 1.893031304074249e-05, 'epoch': 0.16}


  5%|▌         | 1370/25428 [15:02<4:22:55,  1.53it/s]

{'loss': 0.545, 'grad_norm': 1.4881837368011475, 'learning_rate': 1.8922447695453832e-05, 'epoch': 0.16}


  5%|▌         | 1380/25428 [15:08<4:21:17,  1.53it/s]

{'loss': 0.5149, 'grad_norm': 1.6845539808273315, 'learning_rate': 1.8914582350165173e-05, 'epoch': 0.16}


  5%|▌         | 1390/25428 [15:15<4:24:19,  1.52it/s]

{'loss': 0.5381, 'grad_norm': 1.9046807289123535, 'learning_rate': 1.8906717004876517e-05, 'epoch': 0.16}


  6%|▌         | 1400/25428 [15:21<4:22:21,  1.53it/s]

{'loss': 0.536, 'grad_norm': 1.5437939167022705, 'learning_rate': 1.889885165958786e-05, 'epoch': 0.17}


  6%|▌         | 1410/25428 [15:28<4:18:15,  1.55it/s]

{'loss': 0.5187, 'grad_norm': 2.429102659225464, 'learning_rate': 1.88909863142992e-05, 'epoch': 0.17}


  6%|▌         | 1420/25428 [15:34<4:24:26,  1.51it/s]

{'loss': 0.4816, 'grad_norm': 2.2617027759552, 'learning_rate': 1.8883120969010544e-05, 'epoch': 0.17}


  6%|▌         | 1430/25428 [15:41<4:22:10,  1.53it/s]

{'loss': 0.5869, 'grad_norm': 2.3780786991119385, 'learning_rate': 1.887525562372188e-05, 'epoch': 0.17}


  6%|▌         | 1440/25428 [15:48<4:25:40,  1.50it/s]

{'loss': 0.4863, 'grad_norm': 1.5688199996948242, 'learning_rate': 1.8867390278433226e-05, 'epoch': 0.17}


  6%|▌         | 1450/25428 [15:54<4:18:57,  1.54it/s]

{'loss': 0.4568, 'grad_norm': 1.4812264442443848, 'learning_rate': 1.8859524933144567e-05, 'epoch': 0.17}


  6%|▌         | 1460/25428 [16:01<4:23:02,  1.52it/s]

{'loss': 0.5717, 'grad_norm': 1.9570976495742798, 'learning_rate': 1.8851659587855908e-05, 'epoch': 0.17}


  6%|▌         | 1470/25428 [16:07<4:18:30,  1.54it/s]

{'loss': 0.5217, 'grad_norm': 1.7059444189071655, 'learning_rate': 1.8843794242567252e-05, 'epoch': 0.17}


  6%|▌         | 1480/25428 [16:14<4:16:24,  1.56it/s]

{'loss': 0.4833, 'grad_norm': 2.1089890003204346, 'learning_rate': 1.8835928897278593e-05, 'epoch': 0.17}


  6%|▌         | 1490/25428 [16:20<4:24:23,  1.51it/s]

{'loss': 0.4356, 'grad_norm': 2.369671106338501, 'learning_rate': 1.8828063551989934e-05, 'epoch': 0.18}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.5227, 'grad_norm': 2.352579116821289, 'learning_rate': 1.882019820670128e-05, 'epoch': 0.18}


  6%|▌         | 1510/25428 [16:36<4:40:47,  1.42it/s] 

{'loss': 0.4428, 'grad_norm': 1.7636373043060303, 'learning_rate': 1.8812332861412616e-05, 'epoch': 0.18}


  6%|▌         | 1520/25428 [16:43<4:16:43,  1.55it/s]

{'loss': 0.5008, 'grad_norm': 2.4363365173339844, 'learning_rate': 1.880446751612396e-05, 'epoch': 0.18}


  6%|▌         | 1530/25428 [16:49<4:26:17,  1.50it/s]

{'loss': 0.591, 'grad_norm': 1.9021697044372559, 'learning_rate': 1.87966021708353e-05, 'epoch': 0.18}


  6%|▌         | 1540/25428 [16:56<4:18:22,  1.54it/s]

{'loss': 0.4223, 'grad_norm': 1.7701691389083862, 'learning_rate': 1.8788736825546642e-05, 'epoch': 0.18}


  6%|▌         | 1550/25428 [17:02<4:15:29,  1.56it/s]

{'loss': 0.4391, 'grad_norm': 1.5915342569351196, 'learning_rate': 1.8780871480257987e-05, 'epoch': 0.18}


  6%|▌         | 1560/25428 [17:09<4:18:21,  1.54it/s]

{'loss': 0.4846, 'grad_norm': 1.7553907632827759, 'learning_rate': 1.8773006134969328e-05, 'epoch': 0.18}


  6%|▌         | 1570/25428 [17:15<4:23:03,  1.51it/s]

{'loss': 0.55, 'grad_norm': 2.605618953704834, 'learning_rate': 1.876514078968067e-05, 'epoch': 0.19}


  6%|▌         | 1580/25428 [17:22<4:21:24,  1.52it/s]

{'loss': 0.4713, 'grad_norm': 1.929904580116272, 'learning_rate': 1.875727544439201e-05, 'epoch': 0.19}


  6%|▋         | 1590/25428 [17:28<4:18:48,  1.54it/s]

{'loss': 0.4164, 'grad_norm': 1.7590810060501099, 'learning_rate': 1.874941009910335e-05, 'epoch': 0.19}


  6%|▋         | 1600/25428 [17:35<4:19:58,  1.53it/s]

{'loss': 0.5683, 'grad_norm': 2.7713379859924316, 'learning_rate': 1.874154475381469e-05, 'epoch': 0.19}


  6%|▋         | 1610/25428 [17:42<4:21:48,  1.52it/s]

{'loss': 0.4787, 'grad_norm': 1.5445966720581055, 'learning_rate': 1.8733679408526036e-05, 'epoch': 0.19}


  6%|▋         | 1620/25428 [17:48<4:19:11,  1.53it/s]

{'loss': 0.4683, 'grad_norm': 2.1357898712158203, 'learning_rate': 1.8725814063237377e-05, 'epoch': 0.19}


  6%|▋         | 1630/25428 [17:55<4:23:25,  1.51it/s]

{'loss': 0.4388, 'grad_norm': 2.169038772583008, 'learning_rate': 1.8717948717948718e-05, 'epoch': 0.19}


  6%|▋         | 1640/25428 [18:01<4:16:06,  1.55it/s]

{'loss': 0.4534, 'grad_norm': 1.505839467048645, 'learning_rate': 1.8710083372660062e-05, 'epoch': 0.19}


  6%|▋         | 1650/25428 [18:08<4:12:41,  1.57it/s]

{'loss': 0.4517, 'grad_norm': 1.9824267625808716, 'learning_rate': 1.8702218027371403e-05, 'epoch': 0.19}


  7%|▋         | 1660/25428 [18:14<4:20:36,  1.52it/s]

{'loss': 0.4397, 'grad_norm': 1.902003526687622, 'learning_rate': 1.8694352682082744e-05, 'epoch': 0.2}


  7%|▋         | 1670/25428 [18:21<4:20:20,  1.52it/s]

{'loss': 0.5175, 'grad_norm': 2.2190005779266357, 'learning_rate': 1.8686487336794085e-05, 'epoch': 0.2}


  7%|▋         | 1680/25428 [18:27<4:16:13,  1.54it/s]

{'loss': 0.4641, 'grad_norm': 1.680714726448059, 'learning_rate': 1.8678621991505426e-05, 'epoch': 0.2}


  7%|▋         | 1690/25428 [18:34<4:21:01,  1.52it/s]

{'loss': 0.4719, 'grad_norm': 1.9320698976516724, 'learning_rate': 1.867075664621677e-05, 'epoch': 0.2}


  7%|▋         | 1700/25428 [18:41<4:24:47,  1.49it/s]

{'loss': 0.4108, 'grad_norm': 2.34079647064209, 'learning_rate': 1.866289130092811e-05, 'epoch': 0.2}


  7%|▋         | 1710/25428 [18:47<4:16:59,  1.54it/s]

{'loss': 0.4436, 'grad_norm': 1.200344443321228, 'learning_rate': 1.8655025955639453e-05, 'epoch': 0.2}


  7%|▋         | 1720/25428 [18:54<4:20:49,  1.51it/s]

{'loss': 0.4821, 'grad_norm': 2.0211751461029053, 'learning_rate': 1.8647160610350797e-05, 'epoch': 0.2}


  7%|▋         | 1730/25428 [19:00<4:17:56,  1.53it/s]

{'loss': 0.4824, 'grad_norm': 1.6639387607574463, 'learning_rate': 1.8639295265062138e-05, 'epoch': 0.2}


  7%|▋         | 1740/25428 [19:07<4:17:31,  1.53it/s]

{'loss': 0.4663, 'grad_norm': 2.200623035430908, 'learning_rate': 1.863142991977348e-05, 'epoch': 0.21}


  7%|▋         | 1750/25428 [19:13<4:21:22,  1.51it/s]

{'loss': 0.5198, 'grad_norm': 2.1271252632141113, 'learning_rate': 1.862356457448482e-05, 'epoch': 0.21}


  7%|▋         | 1760/25428 [19:20<4:21:15,  1.51it/s]

{'loss': 0.4129, 'grad_norm': 1.8367490768432617, 'learning_rate': 1.861569922919616e-05, 'epoch': 0.21}


  7%|▋         | 1770/25428 [19:27<4:17:09,  1.53it/s]

{'loss': 0.4781, 'grad_norm': 1.693590521812439, 'learning_rate': 1.8607833883907505e-05, 'epoch': 0.21}


  7%|▋         | 1780/25428 [19:33<4:15:29,  1.54it/s]

{'loss': 0.5184, 'grad_norm': 1.8079575300216675, 'learning_rate': 1.8599968538618846e-05, 'epoch': 0.21}


  7%|▋         | 1790/25428 [19:40<4:12:42,  1.56it/s]

{'loss': 0.4135, 'grad_norm': 1.6858552694320679, 'learning_rate': 1.8592103193330187e-05, 'epoch': 0.21}


  7%|▋         | 1800/25428 [19:46<4:17:21,  1.53it/s]

{'loss': 0.3908, 'grad_norm': 1.9564982652664185, 'learning_rate': 1.858423784804153e-05, 'epoch': 0.21}


  7%|▋         | 1810/25428 [19:53<4:20:46,  1.51it/s]

{'loss': 0.4237, 'grad_norm': 1.738982915878296, 'learning_rate': 1.8576372502752873e-05, 'epoch': 0.21}


  7%|▋         | 1820/25428 [19:59<4:21:01,  1.51it/s]

{'loss': 0.4562, 'grad_norm': 2.576148748397827, 'learning_rate': 1.8568507157464214e-05, 'epoch': 0.21}


  7%|▋         | 1830/25428 [20:06<4:25:03,  1.48it/s]

{'loss': 0.3784, 'grad_norm': 1.7805805206298828, 'learning_rate': 1.8560641812175555e-05, 'epoch': 0.22}


  7%|▋         | 1840/25428 [20:13<4:39:03,  1.41it/s]

{'loss': 0.4402, 'grad_norm': 1.9022846221923828, 'learning_rate': 1.8552776466886896e-05, 'epoch': 0.22}


  7%|▋         | 1850/25428 [20:19<4:14:31,  1.54it/s]

{'loss': 0.4866, 'grad_norm': 1.9240821599960327, 'learning_rate': 1.854491112159824e-05, 'epoch': 0.22}


  7%|▋         | 1860/25428 [20:26<4:18:53,  1.52it/s]

{'loss': 0.4018, 'grad_norm': 1.8798253536224365, 'learning_rate': 1.853704577630958e-05, 'epoch': 0.22}


  7%|▋         | 1870/25428 [20:33<4:20:00,  1.51it/s]

{'loss': 0.4599, 'grad_norm': 2.344599485397339, 'learning_rate': 1.8529180431020922e-05, 'epoch': 0.22}


  7%|▋         | 1880/25428 [20:39<4:17:36,  1.52it/s]

{'loss': 0.468, 'grad_norm': 1.8890923261642456, 'learning_rate': 1.8521315085732266e-05, 'epoch': 0.22}


  7%|▋         | 1890/25428 [20:46<4:20:13,  1.51it/s]

{'loss': 0.4247, 'grad_norm': 1.2691529989242554, 'learning_rate': 1.8513449740443607e-05, 'epoch': 0.22}


  7%|▋         | 1900/25428 [20:52<4:22:23,  1.49it/s]

{'loss': 0.354, 'grad_norm': 1.9741358757019043, 'learning_rate': 1.850558439515495e-05, 'epoch': 0.22}


  8%|▊         | 1910/25428 [20:59<4:20:23,  1.51it/s]

{'loss': 0.3835, 'grad_norm': 2.155892848968506, 'learning_rate': 1.8497719049866293e-05, 'epoch': 0.23}


  8%|▊         | 1920/25428 [21:06<4:17:28,  1.52it/s]

{'loss': 0.4579, 'grad_norm': 2.099381923675537, 'learning_rate': 1.848985370457763e-05, 'epoch': 0.23}


  8%|▊         | 1930/25428 [21:12<4:18:30,  1.52it/s]

{'loss': 0.3833, 'grad_norm': 1.2643448114395142, 'learning_rate': 1.8481988359288975e-05, 'epoch': 0.23}


  8%|▊         | 1940/25428 [21:19<4:17:25,  1.52it/s]

{'loss': 0.4655, 'grad_norm': 3.8991429805755615, 'learning_rate': 1.8474123014000316e-05, 'epoch': 0.23}


  8%|▊         | 1950/25428 [21:25<4:12:33,  1.55it/s]

{'loss': 0.3714, 'grad_norm': 1.22169828414917, 'learning_rate': 1.8466257668711657e-05, 'epoch': 0.23}


  8%|▊         | 1960/25428 [21:32<4:14:23,  1.54it/s]

{'loss': 0.476, 'grad_norm': 1.6717907190322876, 'learning_rate': 1.8458392323423e-05, 'epoch': 0.23}


  8%|▊         | 1970/25428 [21:38<4:18:57,  1.51it/s]

{'loss': 0.3408, 'grad_norm': 1.628191351890564, 'learning_rate': 1.8450526978134342e-05, 'epoch': 0.23}


  8%|▊         | 1980/25428 [21:45<4:19:41,  1.50it/s]

{'loss': 0.3597, 'grad_norm': 1.5557277202606201, 'learning_rate': 1.8442661632845683e-05, 'epoch': 0.23}


  8%|▊         | 1990/25428 [21:51<4:13:51,  1.54it/s]

{'loss': 0.4772, 'grad_norm': 2.51204514503479, 'learning_rate': 1.8434796287557027e-05, 'epoch': 0.23}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.4007, 'grad_norm': 1.752272129058838, 'learning_rate': 1.8426930942268365e-05, 'epoch': 0.24}


  8%|▊         | 2010/25428 [22:08<4:27:35,  1.46it/s] 

{'loss': 0.4595, 'grad_norm': 1.8259121179580688, 'learning_rate': 1.841906559697971e-05, 'epoch': 0.24}


  8%|▊         | 2020/25428 [22:14<4:16:14,  1.52it/s]

{'loss': 0.4238, 'grad_norm': 3.6293556690216064, 'learning_rate': 1.841120025169105e-05, 'epoch': 0.24}


  8%|▊         | 2030/25428 [22:21<4:13:02,  1.54it/s]

{'loss': 0.3692, 'grad_norm': 1.3827120065689087, 'learning_rate': 1.840333490640239e-05, 'epoch': 0.24}


  8%|▊         | 2040/25428 [22:27<4:06:55,  1.58it/s]

{'loss': 0.4299, 'grad_norm': 1.5403465032577515, 'learning_rate': 1.8395469561113736e-05, 'epoch': 0.24}


  8%|▊         | 2050/25428 [22:33<4:06:39,  1.58it/s]

{'loss': 0.3979, 'grad_norm': 2.105191707611084, 'learning_rate': 1.8387604215825077e-05, 'epoch': 0.24}


  8%|▊         | 2060/25428 [22:40<4:15:01,  1.53it/s]

{'loss': 0.4438, 'grad_norm': 1.9838488101959229, 'learning_rate': 1.8379738870536418e-05, 'epoch': 0.24}


  8%|▊         | 2070/25428 [22:46<4:10:34,  1.55it/s]

{'loss': 0.3258, 'grad_norm': 1.7551934719085693, 'learning_rate': 1.8371873525247762e-05, 'epoch': 0.24}


  8%|▊         | 2080/25428 [22:53<4:09:03,  1.56it/s]

{'loss': 0.3959, 'grad_norm': 1.7559826374053955, 'learning_rate': 1.83640081799591e-05, 'epoch': 0.25}


  8%|▊         | 2090/25428 [22:59<4:08:09,  1.57it/s]

{'loss': 0.4238, 'grad_norm': 1.8637858629226685, 'learning_rate': 1.8356142834670444e-05, 'epoch': 0.25}


  8%|▊         | 2100/25428 [23:05<4:10:18,  1.55it/s]

{'loss': 0.4423, 'grad_norm': 1.597534418106079, 'learning_rate': 1.8348277489381785e-05, 'epoch': 0.25}


  8%|▊         | 2110/25428 [23:12<4:09:19,  1.56it/s]

{'loss': 0.454, 'grad_norm': 1.971672534942627, 'learning_rate': 1.8340412144093126e-05, 'epoch': 0.25}


  8%|▊         | 2120/25428 [23:18<4:10:05,  1.55it/s]

{'loss': 0.3776, 'grad_norm': 1.3862111568450928, 'learning_rate': 1.833254679880447e-05, 'epoch': 0.25}


  8%|▊         | 2130/25428 [23:25<4:07:41,  1.57it/s]

{'loss': 0.41, 'grad_norm': 2.0123825073242188, 'learning_rate': 1.832468145351581e-05, 'epoch': 0.25}


  8%|▊         | 2140/25428 [23:31<4:10:48,  1.55it/s]

{'loss': 0.3841, 'grad_norm': 1.7248586416244507, 'learning_rate': 1.8316816108227152e-05, 'epoch': 0.25}


  8%|▊         | 2150/25428 [23:38<4:07:20,  1.57it/s]

{'loss': 0.4325, 'grad_norm': 1.829662799835205, 'learning_rate': 1.8308950762938497e-05, 'epoch': 0.25}


  8%|▊         | 2160/25428 [23:44<4:04:29,  1.59it/s]

{'loss': 0.4206, 'grad_norm': 1.7734323740005493, 'learning_rate': 1.8301085417649834e-05, 'epoch': 0.25}


  9%|▊         | 2170/25428 [23:50<4:04:36,  1.58it/s]

{'loss': 0.3822, 'grad_norm': 1.5209476947784424, 'learning_rate': 1.829322007236118e-05, 'epoch': 0.26}


  9%|▊         | 2180/25428 [23:57<4:29:09,  1.44it/s]

{'loss': 0.3705, 'grad_norm': 2.1131715774536133, 'learning_rate': 1.828535472707252e-05, 'epoch': 0.26}


  9%|▊         | 2190/25428 [24:03<4:06:32,  1.57it/s]

{'loss': 0.3911, 'grad_norm': 1.7312195301055908, 'learning_rate': 1.827748938178386e-05, 'epoch': 0.26}


  9%|▊         | 2200/25428 [24:10<4:06:56,  1.57it/s]

{'loss': 0.3786, 'grad_norm': 2.362537384033203, 'learning_rate': 1.8269624036495205e-05, 'epoch': 0.26}


  9%|▊         | 2210/25428 [24:16<4:11:22,  1.54it/s]

{'loss': 0.3582, 'grad_norm': 1.4307972192764282, 'learning_rate': 1.8261758691206546e-05, 'epoch': 0.26}


  9%|▊         | 2220/25428 [24:23<4:09:23,  1.55it/s]

{'loss': 0.4311, 'grad_norm': 3.0856292247772217, 'learning_rate': 1.8253893345917887e-05, 'epoch': 0.26}


  9%|▉         | 2230/25428 [24:29<4:10:13,  1.55it/s]

{'loss': 0.471, 'grad_norm': 2.5731897354125977, 'learning_rate': 1.824602800062923e-05, 'epoch': 0.26}


  9%|▉         | 2240/25428 [24:36<4:11:44,  1.54it/s]

{'loss': 0.3647, 'grad_norm': 1.6629995107650757, 'learning_rate': 1.823816265534057e-05, 'epoch': 0.26}


  9%|▉         | 2250/25428 [24:42<4:06:32,  1.57it/s]

{'loss': 0.3981, 'grad_norm': 1.4541523456573486, 'learning_rate': 1.8230297310051913e-05, 'epoch': 0.27}


  9%|▉         | 2260/25428 [24:48<4:03:54,  1.58it/s]

{'loss': 0.4122, 'grad_norm': 2.009045124053955, 'learning_rate': 1.8222431964763254e-05, 'epoch': 0.27}


  9%|▉         | 2270/25428 [24:55<4:14:52,  1.51it/s]

{'loss': 0.3872, 'grad_norm': 2.445441484451294, 'learning_rate': 1.8214566619474595e-05, 'epoch': 0.27}


  9%|▉         | 2280/25428 [25:01<4:07:21,  1.56it/s]

{'loss': 0.3679, 'grad_norm': 1.6833040714263916, 'learning_rate': 1.820670127418594e-05, 'epoch': 0.27}


  9%|▉         | 2290/25428 [25:08<4:09:42,  1.54it/s]

{'loss': 0.3787, 'grad_norm': 2.031039237976074, 'learning_rate': 1.819883592889728e-05, 'epoch': 0.27}


  9%|▉         | 2300/25428 [25:14<4:06:35,  1.56it/s]

{'loss': 0.4004, 'grad_norm': 1.9727914333343506, 'learning_rate': 1.819097058360862e-05, 'epoch': 0.27}


  9%|▉         | 2310/25428 [25:20<4:04:27,  1.58it/s]

{'loss': 0.342, 'grad_norm': 2.1747000217437744, 'learning_rate': 1.8183105238319966e-05, 'epoch': 0.27}


  9%|▉         | 2320/25428 [25:27<4:04:41,  1.57it/s]

{'loss': 0.3832, 'grad_norm': 1.6740913391113281, 'learning_rate': 1.8175239893031304e-05, 'epoch': 0.27}


  9%|▉         | 2330/25428 [25:33<4:05:41,  1.57it/s]

{'loss': 0.4336, 'grad_norm': 1.9424808025360107, 'learning_rate': 1.8167374547742648e-05, 'epoch': 0.27}


  9%|▉         | 2340/25428 [25:40<4:04:31,  1.57it/s]

{'loss': 0.4101, 'grad_norm': 1.534128189086914, 'learning_rate': 1.815950920245399e-05, 'epoch': 0.28}


  9%|▉         | 2350/25428 [25:46<4:07:25,  1.55it/s]

{'loss': 0.3371, 'grad_norm': 1.5735127925872803, 'learning_rate': 1.815164385716533e-05, 'epoch': 0.28}


  9%|▉         | 2360/25428 [25:53<4:24:04,  1.46it/s]

{'loss': 0.3065, 'grad_norm': 1.630873441696167, 'learning_rate': 1.8143778511876674e-05, 'epoch': 0.28}


  9%|▉         | 2370/25428 [25:59<4:10:32,  1.53it/s]

{'loss': 0.3437, 'grad_norm': 2.6872756481170654, 'learning_rate': 1.8135913166588015e-05, 'epoch': 0.28}


  9%|▉         | 2380/25428 [26:06<4:20:25,  1.47it/s]

{'loss': 0.3425, 'grad_norm': 1.9052855968475342, 'learning_rate': 1.8128047821299356e-05, 'epoch': 0.28}


  9%|▉         | 2390/25428 [26:12<4:11:05,  1.53it/s]

{'loss': 0.3474, 'grad_norm': 2.2089171409606934, 'learning_rate': 1.81201824760107e-05, 'epoch': 0.28}


  9%|▉         | 2400/25428 [26:19<4:11:19,  1.53it/s]

{'loss': 0.3624, 'grad_norm': 2.308232307434082, 'learning_rate': 1.8112317130722038e-05, 'epoch': 0.28}


  9%|▉         | 2410/25428 [26:25<4:07:14,  1.55it/s]

{'loss': 0.3482, 'grad_norm': 2.222531795501709, 'learning_rate': 1.8104451785433383e-05, 'epoch': 0.28}


 10%|▉         | 2420/25428 [26:32<4:05:37,  1.56it/s]

{'loss': 0.3118, 'grad_norm': 1.9472569227218628, 'learning_rate': 1.8096586440144724e-05, 'epoch': 0.29}


 10%|▉         | 2430/25428 [26:38<4:00:43,  1.59it/s]

{'loss': 0.3501, 'grad_norm': 1.9129315614700317, 'learning_rate': 1.8088721094856065e-05, 'epoch': 0.29}


 10%|▉         | 2440/25428 [26:45<4:09:11,  1.54it/s]

{'loss': 0.4038, 'grad_norm': 1.994889259338379, 'learning_rate': 1.808085574956741e-05, 'epoch': 0.29}


 10%|▉         | 2450/25428 [26:51<4:06:42,  1.55it/s]

{'loss': 0.325, 'grad_norm': 3.4520411491394043, 'learning_rate': 1.807299040427875e-05, 'epoch': 0.29}


 10%|▉         | 2460/25428 [26:57<4:05:36,  1.56it/s]

{'loss': 0.3788, 'grad_norm': 1.7639503479003906, 'learning_rate': 1.806512505899009e-05, 'epoch': 0.29}


 10%|▉         | 2470/25428 [27:04<4:00:51,  1.59it/s]

{'loss': 0.3262, 'grad_norm': 1.5859787464141846, 'learning_rate': 1.8057259713701435e-05, 'epoch': 0.29}


 10%|▉         | 2480/25428 [27:11<4:12:14,  1.52it/s]

{'loss': 0.3862, 'grad_norm': 2.0515191555023193, 'learning_rate': 1.8049394368412773e-05, 'epoch': 0.29}


 10%|▉         | 2490/25428 [27:18<4:06:36,  1.55it/s]

{'loss': 0.4681, 'grad_norm': 2.2406582832336426, 'learning_rate': 1.8041529023124117e-05, 'epoch': 0.29}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.2981, 'grad_norm': 1.8746100664138794, 'learning_rate': 1.803366367783546e-05, 'epoch': 0.29}


 10%|▉         | 2510/25428 [27:33<4:20:38,  1.47it/s]

{'loss': 0.3288, 'grad_norm': 1.6125723123550415, 'learning_rate': 1.80257983325468e-05, 'epoch': 0.3}


 10%|▉         | 2520/25428 [27:40<4:07:27,  1.54it/s]

{'loss': 0.3282, 'grad_norm': 1.830498456954956, 'learning_rate': 1.8017932987258144e-05, 'epoch': 0.3}


 10%|▉         | 2530/25428 [27:46<4:09:17,  1.53it/s]

{'loss': 0.31, 'grad_norm': 1.5094871520996094, 'learning_rate': 1.8010067641969485e-05, 'epoch': 0.3}


 10%|▉         | 2540/25428 [27:53<4:07:06,  1.54it/s]

{'loss': 0.3705, 'grad_norm': 1.7948079109191895, 'learning_rate': 1.8002202296680826e-05, 'epoch': 0.3}


 10%|█         | 2550/25428 [27:59<4:07:28,  1.54it/s]

{'loss': 0.4161, 'grad_norm': 2.3838486671447754, 'learning_rate': 1.799433695139217e-05, 'epoch': 0.3}


 10%|█         | 2560/25428 [28:06<4:11:54,  1.51it/s]

{'loss': 0.3686, 'grad_norm': 1.8651052713394165, 'learning_rate': 1.798647160610351e-05, 'epoch': 0.3}


 10%|█         | 2570/25428 [28:12<4:11:00,  1.52it/s]

{'loss': 0.3181, 'grad_norm': 2.337707757949829, 'learning_rate': 1.7978606260814852e-05, 'epoch': 0.3}


 10%|█         | 2580/25428 [28:19<4:11:08,  1.52it/s]

{'loss': 0.3869, 'grad_norm': 2.6222615242004395, 'learning_rate': 1.7970740915526193e-05, 'epoch': 0.3}


 10%|█         | 2590/25428 [28:26<4:07:51,  1.54it/s]

{'loss': 0.3534, 'grad_norm': 1.4056310653686523, 'learning_rate': 1.7962875570237534e-05, 'epoch': 0.31}


 10%|█         | 2600/25428 [28:32<4:12:13,  1.51it/s]

{'loss': 0.3364, 'grad_norm': 1.5148757696151733, 'learning_rate': 1.795501022494888e-05, 'epoch': 0.31}


 10%|█         | 2610/25428 [28:39<4:01:36,  1.57it/s]

{'loss': 0.3788, 'grad_norm': 1.2241559028625488, 'learning_rate': 1.794714487966022e-05, 'epoch': 0.31}


 10%|█         | 2620/25428 [28:45<4:15:48,  1.49it/s]

{'loss': 0.2998, 'grad_norm': 1.8938400745391846, 'learning_rate': 1.793927953437156e-05, 'epoch': 0.31}


 10%|█         | 2630/25428 [28:52<4:07:50,  1.53it/s]

{'loss': 0.3248, 'grad_norm': 2.4788548946380615, 'learning_rate': 1.79314141890829e-05, 'epoch': 0.31}


 10%|█         | 2640/25428 [28:58<4:08:21,  1.53it/s]

{'loss': 0.3775, 'grad_norm': 1.643777847290039, 'learning_rate': 1.7923548843794246e-05, 'epoch': 0.31}


 10%|█         | 2650/25428 [29:04<4:04:22,  1.55it/s]

{'loss': 0.3457, 'grad_norm': 1.87484872341156, 'learning_rate': 1.7915683498505583e-05, 'epoch': 0.31}


 10%|█         | 2660/25428 [29:11<4:04:53,  1.55it/s]

{'loss': 0.3411, 'grad_norm': 1.4398374557495117, 'learning_rate': 1.7907818153216928e-05, 'epoch': 0.31}


 11%|█         | 2670/25428 [29:17<4:04:38,  1.55it/s]

{'loss': 0.3179, 'grad_norm': 3.2485272884368896, 'learning_rate': 1.789995280792827e-05, 'epoch': 0.32}


 11%|█         | 2680/25428 [29:24<4:07:27,  1.53it/s]

{'loss': 0.341, 'grad_norm': 1.7482348680496216, 'learning_rate': 1.789208746263961e-05, 'epoch': 0.32}


 11%|█         | 2690/25428 [29:30<4:03:59,  1.55it/s]

{'loss': 0.351, 'grad_norm': 2.466318130493164, 'learning_rate': 1.7884222117350954e-05, 'epoch': 0.32}


 11%|█         | 2700/25428 [29:37<4:04:34,  1.55it/s]

{'loss': 0.3355, 'grad_norm': 2.186796188354492, 'learning_rate': 1.7876356772062295e-05, 'epoch': 0.32}


 11%|█         | 2710/25428 [29:43<4:08:00,  1.53it/s]

{'loss': 0.2834, 'grad_norm': 1.8934547901153564, 'learning_rate': 1.7868491426773636e-05, 'epoch': 0.32}


 11%|█         | 2720/25428 [29:50<4:03:57,  1.55it/s]

{'loss': 0.3179, 'grad_norm': 1.9381047487258911, 'learning_rate': 1.786062608148498e-05, 'epoch': 0.32}


 11%|█         | 2730/25428 [29:56<4:07:23,  1.53it/s]

{'loss': 0.3402, 'grad_norm': 1.6898212432861328, 'learning_rate': 1.7852760736196318e-05, 'epoch': 0.32}


 11%|█         | 2740/25428 [30:03<3:59:05,  1.58it/s]

{'loss': 0.3118, 'grad_norm': 1.5655375719070435, 'learning_rate': 1.7844895390907662e-05, 'epoch': 0.32}


 11%|█         | 2750/25428 [30:09<4:03:57,  1.55it/s]

{'loss': 0.3207, 'grad_norm': 1.9875462055206299, 'learning_rate': 1.7837030045619003e-05, 'epoch': 0.32}


 11%|█         | 2760/25428 [30:16<4:07:40,  1.53it/s]

{'loss': 0.2934, 'grad_norm': 1.6777257919311523, 'learning_rate': 1.7829164700330344e-05, 'epoch': 0.33}


 11%|█         | 2770/25428 [30:22<4:00:52,  1.57it/s]

{'loss': 0.3648, 'grad_norm': 1.8123663663864136, 'learning_rate': 1.782129935504169e-05, 'epoch': 0.33}


 11%|█         | 2780/25428 [30:29<4:03:42,  1.55it/s]

{'loss': 0.3136, 'grad_norm': 1.5491911172866821, 'learning_rate': 1.781343400975303e-05, 'epoch': 0.33}


 11%|█         | 2790/25428 [30:35<4:05:03,  1.54it/s]

{'loss': 0.3576, 'grad_norm': 1.2858027219772339, 'learning_rate': 1.780556866446437e-05, 'epoch': 0.33}


 11%|█         | 2800/25428 [30:42<4:06:13,  1.53it/s]

{'loss': 0.3111, 'grad_norm': 3.1887149810791016, 'learning_rate': 1.7797703319175715e-05, 'epoch': 0.33}


 11%|█         | 2810/25428 [30:48<4:06:06,  1.53it/s]

{'loss': 0.3485, 'grad_norm': 2.2815799713134766, 'learning_rate': 1.7789837973887053e-05, 'epoch': 0.33}


 11%|█         | 2820/25428 [30:54<4:02:43,  1.55it/s]

{'loss': 0.2958, 'grad_norm': 1.7654955387115479, 'learning_rate': 1.7781972628598397e-05, 'epoch': 0.33}


 11%|█         | 2830/25428 [31:01<4:04:53,  1.54it/s]

{'loss': 0.2778, 'grad_norm': 2.0074706077575684, 'learning_rate': 1.7774107283309738e-05, 'epoch': 0.33}


 11%|█         | 2840/25428 [31:07<4:04:47,  1.54it/s]

{'loss': 0.3871, 'grad_norm': 2.5515620708465576, 'learning_rate': 1.776624193802108e-05, 'epoch': 0.34}


 11%|█         | 2850/25428 [31:14<4:25:20,  1.42it/s]

{'loss': 0.3052, 'grad_norm': 1.3760279417037964, 'learning_rate': 1.7758376592732423e-05, 'epoch': 0.34}


 11%|█         | 2860/25428 [31:21<4:02:10,  1.55it/s]

{'loss': 0.2828, 'grad_norm': 1.7793996334075928, 'learning_rate': 1.7750511247443764e-05, 'epoch': 0.34}


 11%|█▏        | 2870/25428 [31:28<4:14:53,  1.48it/s]

{'loss': 0.2924, 'grad_norm': 1.8042017221450806, 'learning_rate': 1.7742645902155105e-05, 'epoch': 0.34}


 11%|█▏        | 2880/25428 [31:34<4:01:43,  1.55it/s]

{'loss': 0.3183, 'grad_norm': 1.9547176361083984, 'learning_rate': 1.773478055686645e-05, 'epoch': 0.34}


 11%|█▏        | 2890/25428 [31:41<4:04:28,  1.54it/s]

{'loss': 0.3001, 'grad_norm': 1.9837539196014404, 'learning_rate': 1.7726915211577787e-05, 'epoch': 0.34}


 11%|█▏        | 2900/25428 [31:47<4:00:06,  1.56it/s]

{'loss': 0.2867, 'grad_norm': 1.5544558763504028, 'learning_rate': 1.771904986628913e-05, 'epoch': 0.34}


 11%|█▏        | 2910/25428 [31:53<3:58:29,  1.57it/s]

{'loss': 0.2799, 'grad_norm': 2.40439772605896, 'learning_rate': 1.7711184521000473e-05, 'epoch': 0.34}


 11%|█▏        | 2920/25428 [32:00<4:01:33,  1.55it/s]

{'loss': 0.3089, 'grad_norm': 1.5099858045578003, 'learning_rate': 1.7703319175711814e-05, 'epoch': 0.34}


 12%|█▏        | 2930/25428 [32:07<4:08:17,  1.51it/s]

{'loss': 0.3599, 'grad_norm': 2.8391733169555664, 'learning_rate': 1.7695453830423158e-05, 'epoch': 0.35}


 12%|█▏        | 2940/25428 [32:13<4:15:22,  1.47it/s]

{'loss': 0.3351, 'grad_norm': 1.679990530014038, 'learning_rate': 1.76875884851345e-05, 'epoch': 0.35}


 12%|█▏        | 2950/25428 [32:20<4:11:11,  1.49it/s]

{'loss': 0.3293, 'grad_norm': 1.987377405166626, 'learning_rate': 1.767972313984584e-05, 'epoch': 0.35}


 12%|█▏        | 2960/25428 [32:27<3:59:01,  1.57it/s]

{'loss': 0.2812, 'grad_norm': 1.6089274883270264, 'learning_rate': 1.7671857794557184e-05, 'epoch': 0.35}


 12%|█▏        | 2970/25428 [32:33<3:59:08,  1.57it/s]

{'loss': 0.3313, 'grad_norm': 1.7523436546325684, 'learning_rate': 1.7663992449268522e-05, 'epoch': 0.35}


 12%|█▏        | 2980/25428 [32:40<4:02:22,  1.54it/s]

{'loss': 0.3077, 'grad_norm': 1.7509424686431885, 'learning_rate': 1.7656127103979866e-05, 'epoch': 0.35}


 12%|█▏        | 2990/25428 [32:46<4:12:25,  1.48it/s]

{'loss': 0.294, 'grad_norm': 2.234713077545166, 'learning_rate': 1.7648261758691207e-05, 'epoch': 0.35}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.301, 'grad_norm': 1.6810575723648071, 'learning_rate': 1.7640396413402548e-05, 'epoch': 0.35}


 12%|█▏        | 3010/25428 [33:02<4:16:18,  1.46it/s]

{'loss': 0.2879, 'grad_norm': 1.6451627016067505, 'learning_rate': 1.7632531068113893e-05, 'epoch': 0.36}


 12%|█▏        | 3020/25428 [33:09<4:06:32,  1.51it/s]

{'loss': 0.2946, 'grad_norm': 1.818813681602478, 'learning_rate': 1.7624665722825234e-05, 'epoch': 0.36}


 12%|█▏        | 3030/25428 [33:16<4:07:15,  1.51it/s]

{'loss': 0.3031, 'grad_norm': 2.5505781173706055, 'learning_rate': 1.7616800377536575e-05, 'epoch': 0.36}


 12%|█▏        | 3040/25428 [33:22<3:52:44,  1.60it/s]

{'loss': 0.2925, 'grad_norm': 1.9406025409698486, 'learning_rate': 1.760893503224792e-05, 'epoch': 0.36}


 12%|█▏        | 3050/25428 [33:28<4:00:12,  1.55it/s]

{'loss': 0.265, 'grad_norm': 1.6013768911361694, 'learning_rate': 1.7601069686959257e-05, 'epoch': 0.36}


 12%|█▏        | 3060/25428 [33:35<3:56:47,  1.57it/s]

{'loss': 0.3023, 'grad_norm': 2.279545783996582, 'learning_rate': 1.75932043416706e-05, 'epoch': 0.36}


 12%|█▏        | 3070/25428 [33:41<3:55:39,  1.58it/s]

{'loss': 0.3117, 'grad_norm': 1.9711257219314575, 'learning_rate': 1.7585338996381942e-05, 'epoch': 0.36}


 12%|█▏        | 3080/25428 [33:48<3:57:11,  1.57it/s]

{'loss': 0.3208, 'grad_norm': 2.107658624649048, 'learning_rate': 1.7577473651093283e-05, 'epoch': 0.36}


 12%|█▏        | 3090/25428 [33:54<4:00:33,  1.55it/s]

{'loss': 0.2595, 'grad_norm': 1.7701809406280518, 'learning_rate': 1.7569608305804627e-05, 'epoch': 0.36}


 12%|█▏        | 3100/25428 [34:01<4:02:43,  1.53it/s]

{'loss': 0.2988, 'grad_norm': 1.8272268772125244, 'learning_rate': 1.756174296051597e-05, 'epoch': 0.37}


 12%|█▏        | 3110/25428 [34:07<4:00:26,  1.55it/s]

{'loss': 0.2899, 'grad_norm': 1.740945816040039, 'learning_rate': 1.755387761522731e-05, 'epoch': 0.37}


 12%|█▏        | 3120/25428 [34:14<3:57:00,  1.57it/s]

{'loss': 0.2819, 'grad_norm': 2.054994821548462, 'learning_rate': 1.7546012269938654e-05, 'epoch': 0.37}


 12%|█▏        | 3130/25428 [34:20<3:54:24,  1.59it/s]

{'loss': 0.2808, 'grad_norm': 1.996476650238037, 'learning_rate': 1.753814692464999e-05, 'epoch': 0.37}


 12%|█▏        | 3140/25428 [34:26<4:03:40,  1.52it/s]

{'loss': 0.3001, 'grad_norm': 1.4827048778533936, 'learning_rate': 1.7530281579361336e-05, 'epoch': 0.37}


 12%|█▏        | 3150/25428 [34:33<4:02:36,  1.53it/s]

{'loss': 0.2796, 'grad_norm': 2.0111899375915527, 'learning_rate': 1.7522416234072677e-05, 'epoch': 0.37}


 12%|█▏        | 3160/25428 [34:39<4:02:46,  1.53it/s]

{'loss': 0.3242, 'grad_norm': 1.6408615112304688, 'learning_rate': 1.7514550888784018e-05, 'epoch': 0.37}


 12%|█▏        | 3170/25428 [34:46<3:57:04,  1.56it/s]

{'loss': 0.2685, 'grad_norm': 1.5558295249938965, 'learning_rate': 1.7506685543495362e-05, 'epoch': 0.37}


 13%|█▎        | 3180/25428 [34:52<4:00:40,  1.54it/s]

{'loss': 0.2937, 'grad_norm': 3.3455207347869873, 'learning_rate': 1.7498820198206703e-05, 'epoch': 0.38}


 13%|█▎        | 3190/25428 [34:59<3:59:51,  1.55it/s]

{'loss': 0.2865, 'grad_norm': 1.9588239192962646, 'learning_rate': 1.7490954852918044e-05, 'epoch': 0.38}


 13%|█▎        | 3200/25428 [35:05<3:57:33,  1.56it/s]

{'loss': 0.3506, 'grad_norm': 1.574201226234436, 'learning_rate': 1.748308950762939e-05, 'epoch': 0.38}


 13%|█▎        | 3210/25428 [35:12<4:01:24,  1.53it/s]

{'loss': 0.2865, 'grad_norm': 1.722315788269043, 'learning_rate': 1.747522416234073e-05, 'epoch': 0.38}


 13%|█▎        | 3220/25428 [35:18<3:52:15,  1.59it/s]

{'loss': 0.2316, 'grad_norm': 1.4433928728103638, 'learning_rate': 1.746735881705207e-05, 'epoch': 0.38}


 13%|█▎        | 3230/25428 [35:24<3:56:05,  1.57it/s]

{'loss': 0.3282, 'grad_norm': 1.8638951778411865, 'learning_rate': 1.745949347176341e-05, 'epoch': 0.38}


 13%|█▎        | 3240/25428 [35:31<3:54:01,  1.58it/s]

{'loss': 0.2906, 'grad_norm': 1.4597910642623901, 'learning_rate': 1.7451628126474752e-05, 'epoch': 0.38}


 13%|█▎        | 3250/25428 [35:37<3:56:45,  1.56it/s]

{'loss': 0.2985, 'grad_norm': 1.5136173963546753, 'learning_rate': 1.7443762781186097e-05, 'epoch': 0.38}


 13%|█▎        | 3260/25428 [35:44<4:01:04,  1.53it/s]

{'loss': 0.2389, 'grad_norm': 2.154050588607788, 'learning_rate': 1.7435897435897438e-05, 'epoch': 0.38}


 13%|█▎        | 3270/25428 [35:50<3:58:26,  1.55it/s]

{'loss': 0.243, 'grad_norm': 1.9258486032485962, 'learning_rate': 1.742803209060878e-05, 'epoch': 0.39}


 13%|█▎        | 3280/25428 [35:56<3:56:14,  1.56it/s]

{'loss': 0.2938, 'grad_norm': 1.991110920906067, 'learning_rate': 1.7420166745320123e-05, 'epoch': 0.39}


 13%|█▎        | 3290/25428 [36:03<3:58:32,  1.55it/s]

{'loss': 0.2708, 'grad_norm': 1.5152428150177002, 'learning_rate': 1.7412301400031464e-05, 'epoch': 0.39}


 13%|█▎        | 3300/25428 [36:09<3:54:55,  1.57it/s]

{'loss': 0.2863, 'grad_norm': 2.065391778945923, 'learning_rate': 1.7404436054742805e-05, 'epoch': 0.39}


 13%|█▎        | 3310/25428 [36:16<3:53:24,  1.58it/s]

{'loss': 0.2916, 'grad_norm': 1.473317265510559, 'learning_rate': 1.7396570709454146e-05, 'epoch': 0.39}


 13%|█▎        | 3320/25428 [36:22<3:48:05,  1.62it/s]

{'loss': 0.2967, 'grad_norm': 1.6021209955215454, 'learning_rate': 1.7388705364165487e-05, 'epoch': 0.39}


 13%|█▎        | 3330/25428 [36:28<3:47:12,  1.62it/s]

{'loss': 0.2731, 'grad_norm': 1.5734593868255615, 'learning_rate': 1.738084001887683e-05, 'epoch': 0.39}


 13%|█▎        | 3340/25428 [36:35<3:55:35,  1.56it/s]

{'loss': 0.3324, 'grad_norm': 1.4282376766204834, 'learning_rate': 1.7372974673588172e-05, 'epoch': 0.39}


 13%|█▎        | 3350/25428 [36:41<3:59:57,  1.53it/s]

{'loss': 0.2899, 'grad_norm': 1.451597809791565, 'learning_rate': 1.7365109328299513e-05, 'epoch': 0.4}


 13%|█▎        | 3360/25428 [36:48<4:00:39,  1.53it/s]

{'loss': 0.2741, 'grad_norm': 2.0398900508880615, 'learning_rate': 1.7357243983010858e-05, 'epoch': 0.4}


 13%|█▎        | 3370/25428 [36:54<3:51:09,  1.59it/s]

{'loss': 0.2274, 'grad_norm': 4.692226409912109, 'learning_rate': 1.73493786377222e-05, 'epoch': 0.4}


 13%|█▎        | 3380/25428 [37:00<4:01:33,  1.52it/s]

{'loss': 0.2216, 'grad_norm': 2.167623996734619, 'learning_rate': 1.734151329243354e-05, 'epoch': 0.4}


 13%|█▎        | 3390/25428 [37:07<3:51:50,  1.58it/s]

{'loss': 0.2908, 'grad_norm': 1.872090220451355, 'learning_rate': 1.733364794714488e-05, 'epoch': 0.4}


 13%|█▎        | 3400/25428 [37:13<3:54:27,  1.57it/s]

{'loss': 0.3117, 'grad_norm': 1.8368602991104126, 'learning_rate': 1.732578260185622e-05, 'epoch': 0.4}


 13%|█▎        | 3410/25428 [37:20<3:51:07,  1.59it/s]

{'loss': 0.2745, 'grad_norm': 2.1331586837768555, 'learning_rate': 1.7317917256567566e-05, 'epoch': 0.4}


 13%|█▎        | 3420/25428 [37:26<3:54:17,  1.57it/s]

{'loss': 0.3075, 'grad_norm': 1.2419673204421997, 'learning_rate': 1.7310051911278907e-05, 'epoch': 0.4}


 13%|█▎        | 3430/25428 [37:33<3:59:42,  1.53it/s]

{'loss': 0.2609, 'grad_norm': 1.4908356666564941, 'learning_rate': 1.7302186565990248e-05, 'epoch': 0.4}


 14%|█▎        | 3440/25428 [37:39<3:57:23,  1.54it/s]

{'loss': 0.2807, 'grad_norm': 1.8440812826156616, 'learning_rate': 1.7294321220701592e-05, 'epoch': 0.41}


 14%|█▎        | 3450/25428 [37:46<3:58:44,  1.53it/s]

{'loss': 0.2529, 'grad_norm': 1.8304743766784668, 'learning_rate': 1.7286455875412933e-05, 'epoch': 0.41}


 14%|█▎        | 3460/25428 [37:52<3:54:36,  1.56it/s]

{'loss': 0.2951, 'grad_norm': 1.4598222970962524, 'learning_rate': 1.7278590530124274e-05, 'epoch': 0.41}


 14%|█▎        | 3470/25428 [37:59<3:54:51,  1.56it/s]

{'loss': 0.2849, 'grad_norm': 3.4088845252990723, 'learning_rate': 1.7270725184835615e-05, 'epoch': 0.41}


 14%|█▎        | 3480/25428 [38:05<3:54:30,  1.56it/s]

{'loss': 0.2749, 'grad_norm': 2.168897867202759, 'learning_rate': 1.7262859839546956e-05, 'epoch': 0.41}


 14%|█▎        | 3490/25428 [38:11<3:56:51,  1.54it/s]

{'loss': 0.2634, 'grad_norm': 1.8947802782058716, 'learning_rate': 1.72549944942583e-05, 'epoch': 0.41}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.2586, 'grad_norm': 1.6372995376586914, 'learning_rate': 1.724712914896964e-05, 'epoch': 0.41}


 14%|█▍        | 3510/25428 [38:27<4:02:32,  1.51it/s]

{'loss': 0.2342, 'grad_norm': 2.2260305881500244, 'learning_rate': 1.7239263803680983e-05, 'epoch': 0.41}


 14%|█▍        | 3520/25428 [38:33<3:55:49,  1.55it/s]

{'loss': 0.2445, 'grad_norm': 1.4121695756912231, 'learning_rate': 1.7231398458392327e-05, 'epoch': 0.42}


 14%|█▍        | 3530/25428 [38:40<3:49:32,  1.59it/s]

{'loss': 0.2873, 'grad_norm': 1.9357120990753174, 'learning_rate': 1.7223533113103668e-05, 'epoch': 0.42}


 14%|█▍        | 3540/25428 [38:46<3:59:19,  1.52it/s]

{'loss': 0.2711, 'grad_norm': 1.7438864707946777, 'learning_rate': 1.721566776781501e-05, 'epoch': 0.42}


 14%|█▍        | 3550/25428 [38:53<3:56:10,  1.54it/s]

{'loss': 0.2815, 'grad_norm': 1.9202144145965576, 'learning_rate': 1.720780242252635e-05, 'epoch': 0.42}


 14%|█▍        | 3560/25428 [38:59<3:53:36,  1.56it/s]

{'loss': 0.2113, 'grad_norm': 1.9716111421585083, 'learning_rate': 1.719993707723769e-05, 'epoch': 0.42}


 14%|█▍        | 3570/25428 [39:06<3:51:41,  1.57it/s]

{'loss': 0.3277, 'grad_norm': 1.4281857013702393, 'learning_rate': 1.7192071731949035e-05, 'epoch': 0.42}


 14%|█▍        | 3580/25428 [39:12<3:49:28,  1.59it/s]

{'loss': 0.2493, 'grad_norm': 2.283196449279785, 'learning_rate': 1.7184206386660376e-05, 'epoch': 0.42}


 14%|█▍        | 3590/25428 [39:18<3:47:11,  1.60it/s]

{'loss': 0.2645, 'grad_norm': 2.20784330368042, 'learning_rate': 1.7176341041371717e-05, 'epoch': 0.42}


 14%|█▍        | 3600/25428 [39:24<3:44:04,  1.62it/s]

{'loss': 0.2662, 'grad_norm': 1.2725645303726196, 'learning_rate': 1.7168475696083062e-05, 'epoch': 0.42}


 14%|█▍        | 3610/25428 [39:31<3:46:03,  1.61it/s]

{'loss': 0.2587, 'grad_norm': 2.0612757205963135, 'learning_rate': 1.7160610350794403e-05, 'epoch': 0.43}


 14%|█▍        | 3620/25428 [39:37<3:50:24,  1.58it/s]

{'loss': 0.294, 'grad_norm': 1.3650915622711182, 'learning_rate': 1.7152745005505744e-05, 'epoch': 0.43}


 14%|█▍        | 3630/25428 [39:43<3:53:18,  1.56it/s]

{'loss': 0.2338, 'grad_norm': 1.2542983293533325, 'learning_rate': 1.7144879660217085e-05, 'epoch': 0.43}


 14%|█▍        | 3640/25428 [39:50<3:50:09,  1.58it/s]

{'loss': 0.3366, 'grad_norm': 2.0641517639160156, 'learning_rate': 1.7137014314928426e-05, 'epoch': 0.43}


 14%|█▍        | 3650/25428 [39:56<3:54:30,  1.55it/s]

{'loss': 0.2604, 'grad_norm': 1.3980251550674438, 'learning_rate': 1.712914896963977e-05, 'epoch': 0.43}


 14%|█▍        | 3660/25428 [40:03<3:53:37,  1.55it/s]

{'loss': 0.2525, 'grad_norm': 1.9046601057052612, 'learning_rate': 1.712128362435111e-05, 'epoch': 0.43}


 14%|█▍        | 3670/25428 [40:09<3:56:04,  1.54it/s]

{'loss': 0.2625, 'grad_norm': 1.173039436340332, 'learning_rate': 1.7113418279062452e-05, 'epoch': 0.43}


 14%|█▍        | 3680/25428 [40:16<3:53:43,  1.55it/s]

{'loss': 0.27, 'grad_norm': 1.0053507089614868, 'learning_rate': 1.7105552933773793e-05, 'epoch': 0.43}


 15%|█▍        | 3690/25428 [40:22<3:57:43,  1.52it/s]

{'loss': 0.2447, 'grad_norm': 1.2319380044937134, 'learning_rate': 1.7097687588485137e-05, 'epoch': 0.44}


 15%|█▍        | 3700/25428 [40:29<3:53:35,  1.55it/s]

{'loss': 0.2528, 'grad_norm': 1.71501886844635, 'learning_rate': 1.7089822243196475e-05, 'epoch': 0.44}


 15%|█▍        | 3710/25428 [40:35<3:50:39,  1.57it/s]

{'loss': 0.298, 'grad_norm': 0.8440806269645691, 'learning_rate': 1.708195689790782e-05, 'epoch': 0.44}


 15%|█▍        | 3720/25428 [40:41<3:54:32,  1.54it/s]

{'loss': 0.2645, 'grad_norm': 2.631239175796509, 'learning_rate': 1.707409155261916e-05, 'epoch': 0.44}


 15%|█▍        | 3730/25428 [40:48<3:53:52,  1.55it/s]

{'loss': 0.2489, 'grad_norm': 1.3271675109863281, 'learning_rate': 1.70662262073305e-05, 'epoch': 0.44}


 15%|█▍        | 3740/25428 [40:54<3:51:08,  1.56it/s]

{'loss': 0.2424, 'grad_norm': 1.641237735748291, 'learning_rate': 1.7058360862041846e-05, 'epoch': 0.44}


 15%|█▍        | 3750/25428 [41:00<3:49:11,  1.58it/s]

{'loss': 0.2973, 'grad_norm': 1.1156928539276123, 'learning_rate': 1.7050495516753187e-05, 'epoch': 0.44}


 15%|█▍        | 3760/25428 [41:07<3:47:36,  1.59it/s]

{'loss': 0.314, 'grad_norm': 1.7245891094207764, 'learning_rate': 1.7042630171464528e-05, 'epoch': 0.44}


 15%|█▍        | 3770/25428 [41:13<3:55:28,  1.53it/s]

{'loss': 0.2764, 'grad_norm': 2.064669132232666, 'learning_rate': 1.7034764826175872e-05, 'epoch': 0.44}


 15%|█▍        | 3780/25428 [41:20<3:53:37,  1.54it/s]

{'loss': 0.2977, 'grad_norm': 1.6402554512023926, 'learning_rate': 1.7026899480887213e-05, 'epoch': 0.45}


 15%|█▍        | 3790/25428 [41:26<3:51:24,  1.56it/s]

{'loss': 0.2585, 'grad_norm': 1.9595099687576294, 'learning_rate': 1.7019034135598554e-05, 'epoch': 0.45}


 15%|█▍        | 3800/25428 [41:32<3:49:10,  1.57it/s]

{'loss': 0.2373, 'grad_norm': 1.4116414785385132, 'learning_rate': 1.7011168790309895e-05, 'epoch': 0.45}


 15%|█▍        | 3810/25428 [41:39<3:46:30,  1.59it/s]

{'loss': 0.2459, 'grad_norm': 1.5933082103729248, 'learning_rate': 1.7003303445021236e-05, 'epoch': 0.45}


 15%|█▌        | 3820/25428 [41:45<3:51:49,  1.55it/s]

{'loss': 0.2108, 'grad_norm': 1.6468805074691772, 'learning_rate': 1.699543809973258e-05, 'epoch': 0.45}


 15%|█▌        | 3830/25428 [41:52<3:50:16,  1.56it/s]

{'loss': 0.2386, 'grad_norm': 2.9508373737335205, 'learning_rate': 1.698757275444392e-05, 'epoch': 0.45}


 15%|█▌        | 3840/25428 [41:58<3:48:31,  1.57it/s]

{'loss': 0.2048, 'grad_norm': 2.310420513153076, 'learning_rate': 1.6979707409155262e-05, 'epoch': 0.45}


 15%|█▌        | 3850/25428 [42:04<3:49:07,  1.57it/s]

{'loss': 0.2664, 'grad_norm': 1.2525008916854858, 'learning_rate': 1.6971842063866607e-05, 'epoch': 0.45}


 15%|█▌        | 3860/25428 [42:11<3:52:09,  1.55it/s]

{'loss': 0.2846, 'grad_norm': 1.1416842937469482, 'learning_rate': 1.6963976718577948e-05, 'epoch': 0.46}


 15%|█▌        | 3870/25428 [42:17<3:46:50,  1.58it/s]

{'loss': 0.2876, 'grad_norm': 1.3567225933074951, 'learning_rate': 1.695611137328929e-05, 'epoch': 0.46}


 15%|█▌        | 3880/25428 [42:24<3:54:20,  1.53it/s]

{'loss': 0.2668, 'grad_norm': 1.7331091165542603, 'learning_rate': 1.694824602800063e-05, 'epoch': 0.46}


 15%|█▌        | 3890/25428 [42:30<3:52:50,  1.54it/s]

{'loss': 0.235, 'grad_norm': 1.1949797868728638, 'learning_rate': 1.694038068271197e-05, 'epoch': 0.46}


 15%|█▌        | 3900/25428 [42:37<3:51:12,  1.55it/s]

{'loss': 0.2436, 'grad_norm': 1.4714205265045166, 'learning_rate': 1.6932515337423315e-05, 'epoch': 0.46}


 15%|█▌        | 3910/25428 [42:43<3:51:29,  1.55it/s]

{'loss': 0.2861, 'grad_norm': 2.437880516052246, 'learning_rate': 1.6924649992134656e-05, 'epoch': 0.46}


 15%|█▌        | 3920/25428 [42:50<3:48:12,  1.57it/s]

{'loss': 0.2359, 'grad_norm': 1.1350525617599487, 'learning_rate': 1.6916784646845997e-05, 'epoch': 0.46}


 15%|█▌        | 3930/25428 [42:56<3:52:39,  1.54it/s]

{'loss': 0.2292, 'grad_norm': 1.313677191734314, 'learning_rate': 1.690891930155734e-05, 'epoch': 0.46}


 15%|█▌        | 3940/25428 [43:02<3:52:05,  1.54it/s]

{'loss': 0.2471, 'grad_norm': 1.8003315925598145, 'learning_rate': 1.6901053956268682e-05, 'epoch': 0.46}


 16%|█▌        | 3950/25428 [43:09<3:52:17,  1.54it/s]

{'loss': 0.2252, 'grad_norm': 1.7054541110992432, 'learning_rate': 1.6893188610980023e-05, 'epoch': 0.47}


 16%|█▌        | 3960/25428 [43:15<3:48:51,  1.56it/s]

{'loss': 0.2281, 'grad_norm': 1.2176865339279175, 'learning_rate': 1.6885323265691364e-05, 'epoch': 0.47}


 16%|█▌        | 3970/25428 [43:22<3:49:36,  1.56it/s]

{'loss': 0.2456, 'grad_norm': 1.3352171182632446, 'learning_rate': 1.6877457920402705e-05, 'epoch': 0.47}


 16%|█▌        | 3980/25428 [43:28<3:47:44,  1.57it/s]

{'loss': 0.2995, 'grad_norm': 1.3623955249786377, 'learning_rate': 1.686959257511405e-05, 'epoch': 0.47}


 16%|█▌        | 3990/25428 [43:34<3:44:28,  1.59it/s]

{'loss': 0.2657, 'grad_norm': 2.6261627674102783, 'learning_rate': 1.686172722982539e-05, 'epoch': 0.47}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.2755, 'grad_norm': 1.8288124799728394, 'learning_rate': 1.685386188453673e-05, 'epoch': 0.47}


 16%|█▌        | 4010/25428 [43:50<4:00:35,  1.48it/s]

{'loss': 0.2527, 'grad_norm': 1.530177354812622, 'learning_rate': 1.6845996539248076e-05, 'epoch': 0.47}


 16%|█▌        | 4020/25428 [43:56<3:54:34,  1.52it/s]

{'loss': 0.2757, 'grad_norm': 1.5006786584854126, 'learning_rate': 1.6838131193959417e-05, 'epoch': 0.47}


 16%|█▌        | 4030/25428 [44:03<3:51:06,  1.54it/s]

{'loss': 0.2365, 'grad_norm': 1.6451419591903687, 'learning_rate': 1.6830265848670758e-05, 'epoch': 0.48}


 16%|█▌        | 4040/25428 [44:09<3:48:53,  1.56it/s]

{'loss': 0.2577, 'grad_norm': 1.2161741256713867, 'learning_rate': 1.68224005033821e-05, 'epoch': 0.48}


 16%|█▌        | 4050/25428 [44:16<3:52:33,  1.53it/s]

{'loss': 0.2302, 'grad_norm': 1.285126805305481, 'learning_rate': 1.681453515809344e-05, 'epoch': 0.48}


 16%|█▌        | 4060/25428 [44:22<3:46:39,  1.57it/s]

{'loss': 0.2236, 'grad_norm': 2.5039665699005127, 'learning_rate': 1.6806669812804784e-05, 'epoch': 0.48}


 16%|█▌        | 4070/25428 [44:29<3:45:37,  1.58it/s]

{'loss': 0.2355, 'grad_norm': 1.2963447570800781, 'learning_rate': 1.6798804467516125e-05, 'epoch': 0.48}


 16%|█▌        | 4080/25428 [44:35<3:52:40,  1.53it/s]

{'loss': 0.2199, 'grad_norm': 1.291719913482666, 'learning_rate': 1.6790939122227466e-05, 'epoch': 0.48}


 16%|█▌        | 4090/25428 [44:42<3:42:37,  1.60it/s]

{'loss': 0.2378, 'grad_norm': 1.7201766967773438, 'learning_rate': 1.678307377693881e-05, 'epoch': 0.48}


 16%|█▌        | 4100/25428 [44:48<3:47:06,  1.57it/s]

{'loss': 0.2299, 'grad_norm': 1.1477062702178955, 'learning_rate': 1.677520843165015e-05, 'epoch': 0.48}


 16%|█▌        | 4110/25428 [44:55<3:49:41,  1.55it/s]

{'loss': 0.2543, 'grad_norm': 1.3793569803237915, 'learning_rate': 1.6767343086361493e-05, 'epoch': 0.48}


 16%|█▌        | 4120/25428 [45:01<3:47:08,  1.56it/s]

{'loss': 0.2523, 'grad_norm': 1.3404663801193237, 'learning_rate': 1.6759477741072834e-05, 'epoch': 0.49}


 16%|█▌        | 4130/25428 [45:07<3:48:52,  1.55it/s]

{'loss': 0.2273, 'grad_norm': 1.1456165313720703, 'learning_rate': 1.6751612395784175e-05, 'epoch': 0.49}


 16%|█▋        | 4140/25428 [45:14<3:52:45,  1.52it/s]

{'loss': 0.2183, 'grad_norm': 1.4165632724761963, 'learning_rate': 1.674374705049552e-05, 'epoch': 0.49}


 16%|█▋        | 4150/25428 [45:20<3:51:12,  1.53it/s]

{'loss': 0.2877, 'grad_norm': 2.0445642471313477, 'learning_rate': 1.673588170520686e-05, 'epoch': 0.49}


 16%|█▋        | 4160/25428 [45:27<3:46:15,  1.57it/s]

{'loss': 0.2287, 'grad_norm': 1.6635613441467285, 'learning_rate': 1.67280163599182e-05, 'epoch': 0.49}


 16%|█▋        | 4170/25428 [45:33<3:52:29,  1.52it/s]

{'loss': 0.2011, 'grad_norm': 1.8632540702819824, 'learning_rate': 1.6720151014629545e-05, 'epoch': 0.49}


 16%|█▋        | 4180/25428 [45:40<3:51:29,  1.53it/s]

{'loss': 0.2228, 'grad_norm': 1.0455511808395386, 'learning_rate': 1.6712285669340886e-05, 'epoch': 0.49}


 16%|█▋        | 4190/25428 [45:47<4:14:28,  1.39it/s]

{'loss': 0.2488, 'grad_norm': 3.395798444747925, 'learning_rate': 1.6704420324052227e-05, 'epoch': 0.49}


 17%|█▋        | 4200/25428 [45:53<3:47:24,  1.56it/s]

{'loss': 0.2286, 'grad_norm': 1.980700135231018, 'learning_rate': 1.669655497876357e-05, 'epoch': 0.5}


 17%|█▋        | 4210/25428 [46:00<3:55:51,  1.50it/s]

{'loss': 0.2401, 'grad_norm': 1.0108678340911865, 'learning_rate': 1.668868963347491e-05, 'epoch': 0.5}


 17%|█▋        | 4220/25428 [46:07<3:48:34,  1.55it/s]

{'loss': 0.2409, 'grad_norm': 1.7131543159484863, 'learning_rate': 1.6680824288186254e-05, 'epoch': 0.5}


 17%|█▋        | 4230/25428 [46:13<3:48:45,  1.54it/s]

{'loss': 0.2329, 'grad_norm': 1.4051927328109741, 'learning_rate': 1.6672958942897595e-05, 'epoch': 0.5}


 17%|█▋        | 4240/25428 [46:19<3:42:24,  1.59it/s]

{'loss': 0.2009, 'grad_norm': 30.73165512084961, 'learning_rate': 1.6665093597608936e-05, 'epoch': 0.5}


 17%|█▋        | 4250/25428 [46:26<3:45:57,  1.56it/s]

{'loss': 0.1969, 'grad_norm': 1.2564961910247803, 'learning_rate': 1.665722825232028e-05, 'epoch': 0.5}


 17%|█▋        | 4260/25428 [46:32<3:47:03,  1.55it/s]

{'loss': 0.2322, 'grad_norm': 1.8831901550292969, 'learning_rate': 1.664936290703162e-05, 'epoch': 0.5}


 17%|█▋        | 4270/25428 [46:38<3:39:54,  1.60it/s]

{'loss': 0.2042, 'grad_norm': 1.7946513891220093, 'learning_rate': 1.6641497561742962e-05, 'epoch': 0.5}


 17%|█▋        | 4280/25428 [46:44<3:40:46,  1.60it/s]

{'loss': 0.2352, 'grad_norm': 0.9326739311218262, 'learning_rate': 1.6633632216454303e-05, 'epoch': 0.5}


 17%|█▋        | 4290/25428 [46:51<3:39:38,  1.60it/s]

{'loss': 0.2243, 'grad_norm': 1.300723910331726, 'learning_rate': 1.6625766871165644e-05, 'epoch': 0.51}


 17%|█▋        | 4300/25428 [46:57<3:41:28,  1.59it/s]

{'loss': 0.2398, 'grad_norm': 1.342769980430603, 'learning_rate': 1.661790152587699e-05, 'epoch': 0.51}


 17%|█▋        | 4310/25428 [47:04<3:48:00,  1.54it/s]

{'loss': 0.2083, 'grad_norm': 1.260559320449829, 'learning_rate': 1.661003618058833e-05, 'epoch': 0.51}


 17%|█▋        | 4320/25428 [47:10<3:41:12,  1.59it/s]

{'loss': 0.1993, 'grad_norm': 0.9170747995376587, 'learning_rate': 1.660217083529967e-05, 'epoch': 0.51}


 17%|█▋        | 4330/25428 [47:16<3:45:47,  1.56it/s]

{'loss': 0.2759, 'grad_norm': 1.628874659538269, 'learning_rate': 1.6594305490011015e-05, 'epoch': 0.51}


 17%|█▋        | 4340/25428 [47:22<3:41:47,  1.58it/s]

{'loss': 0.2726, 'grad_norm': 1.5285574197769165, 'learning_rate': 1.6586440144722356e-05, 'epoch': 0.51}


 17%|█▋        | 4350/25428 [47:29<3:44:25,  1.57it/s]

{'loss': 0.2774, 'grad_norm': 1.3027883768081665, 'learning_rate': 1.6578574799433697e-05, 'epoch': 0.51}


 17%|█▋        | 4360/25428 [47:35<3:43:23,  1.57it/s]

{'loss': 0.2331, 'grad_norm': 1.5113712549209595, 'learning_rate': 1.6570709454145038e-05, 'epoch': 0.51}


 17%|█▋        | 4370/25428 [47:42<3:40:18,  1.59it/s]

{'loss': 0.2306, 'grad_norm': 1.4390058517456055, 'learning_rate': 1.656284410885638e-05, 'epoch': 0.52}


 17%|█▋        | 4380/25428 [47:48<3:48:11,  1.54it/s]

{'loss': 0.1993, 'grad_norm': 1.5915603637695312, 'learning_rate': 1.6554978763567723e-05, 'epoch': 0.52}


 17%|█▋        | 4390/25428 [47:54<3:35:01,  1.63it/s]

{'loss': 0.2372, 'grad_norm': 1.8538892269134521, 'learning_rate': 1.6547113418279064e-05, 'epoch': 0.52}


 17%|█▋        | 4400/25428 [48:01<3:37:11,  1.61it/s]

{'loss': 0.2348, 'grad_norm': 2.0954947471618652, 'learning_rate': 1.6539248072990405e-05, 'epoch': 0.52}


 17%|█▋        | 4410/25428 [48:07<3:44:53,  1.56it/s]

{'loss': 0.2424, 'grad_norm': 2.8057289123535156, 'learning_rate': 1.653138272770175e-05, 'epoch': 0.52}


 17%|█▋        | 4420/25428 [48:13<3:46:02,  1.55it/s]

{'loss': 0.3116, 'grad_norm': 2.0863289833068848, 'learning_rate': 1.652351738241309e-05, 'epoch': 0.52}


 17%|█▋        | 4430/25428 [48:20<3:46:37,  1.54it/s]

{'loss': 0.2117, 'grad_norm': 1.765661358833313, 'learning_rate': 1.651565203712443e-05, 'epoch': 0.52}


 17%|█▋        | 4440/25428 [48:26<3:44:49,  1.56it/s]

{'loss': 0.2217, 'grad_norm': 1.1950348615646362, 'learning_rate': 1.6507786691835772e-05, 'epoch': 0.52}


 18%|█▊        | 4450/25428 [48:32<3:37:57,  1.60it/s]

{'loss': 0.2583, 'grad_norm': 1.7509405612945557, 'learning_rate': 1.6499921346547113e-05, 'epoch': 0.53}


 18%|█▊        | 4460/25428 [48:39<3:37:13,  1.61it/s]

{'loss': 0.2286, 'grad_norm': 1.7548490762710571, 'learning_rate': 1.6492056001258458e-05, 'epoch': 0.53}


 18%|█▊        | 4470/25428 [48:45<3:41:21,  1.58it/s]

{'loss': 0.261, 'grad_norm': 1.6015205383300781, 'learning_rate': 1.64841906559698e-05, 'epoch': 0.53}


 18%|█▊        | 4480/25428 [48:51<3:38:22,  1.60it/s]

{'loss': 0.2322, 'grad_norm': 2.401576042175293, 'learning_rate': 1.647632531068114e-05, 'epoch': 0.53}


 18%|█▊        | 4490/25428 [48:58<3:36:21,  1.61it/s]

{'loss': 0.2185, 'grad_norm': 1.70829439163208, 'learning_rate': 1.6468459965392484e-05, 'epoch': 0.53}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.2621, 'grad_norm': 1.196478009223938, 'learning_rate': 1.6460594620103825e-05, 'epoch': 0.53}


 18%|█▊        | 4510/25428 [49:13<3:49:48,  1.52it/s]

{'loss': 0.2437, 'grad_norm': 1.462343454360962, 'learning_rate': 1.6452729274815166e-05, 'epoch': 0.53}


 18%|█▊        | 4520/25428 [49:19<3:39:59,  1.58it/s]

{'loss': 0.2163, 'grad_norm': 1.5481212139129639, 'learning_rate': 1.6444863929526507e-05, 'epoch': 0.53}


 18%|█▊        | 4530/25428 [49:26<3:41:15,  1.57it/s]

{'loss': 0.244, 'grad_norm': 1.47034752368927, 'learning_rate': 1.6436998584237848e-05, 'epoch': 0.53}


 18%|█▊        | 4540/25428 [49:32<3:37:13,  1.60it/s]

{'loss': 0.2544, 'grad_norm': 1.5209802389144897, 'learning_rate': 1.6429133238949192e-05, 'epoch': 0.54}


 18%|█▊        | 4550/25428 [49:38<3:40:00,  1.58it/s]

{'loss': 0.2255, 'grad_norm': 1.6519882678985596, 'learning_rate': 1.6421267893660533e-05, 'epoch': 0.54}


 18%|█▊        | 4560/25428 [49:45<3:38:08,  1.59it/s]

{'loss': 0.2079, 'grad_norm': 1.9579695463180542, 'learning_rate': 1.6413402548371874e-05, 'epoch': 0.54}


 18%|█▊        | 4570/25428 [49:51<3:41:47,  1.57it/s]

{'loss': 0.2213, 'grad_norm': 1.5848608016967773, 'learning_rate': 1.640553720308322e-05, 'epoch': 0.54}


 18%|█▊        | 4580/25428 [49:57<3:40:01,  1.58it/s]

{'loss': 0.2772, 'grad_norm': 1.3382900953292847, 'learning_rate': 1.639767185779456e-05, 'epoch': 0.54}


 18%|█▊        | 4590/25428 [50:03<3:36:14,  1.61it/s]

{'loss': 0.2339, 'grad_norm': 1.7791544198989868, 'learning_rate': 1.63898065125059e-05, 'epoch': 0.54}


 18%|█▊        | 4600/25428 [50:10<3:35:31,  1.61it/s]

{'loss': 0.2291, 'grad_norm': 1.1809070110321045, 'learning_rate': 1.638194116721724e-05, 'epoch': 0.54}


 18%|█▊        | 4610/25428 [50:16<3:36:09,  1.61it/s]

{'loss': 0.1655, 'grad_norm': 2.9357447624206543, 'learning_rate': 1.6374075821928583e-05, 'epoch': 0.54}


 18%|█▊        | 4620/25428 [50:22<3:40:01,  1.58it/s]

{'loss': 0.2428, 'grad_norm': 1.5715688467025757, 'learning_rate': 1.6366210476639927e-05, 'epoch': 0.55}


 18%|█▊        | 4630/25428 [50:28<3:33:08,  1.63it/s]

{'loss': 0.2367, 'grad_norm': 1.4232065677642822, 'learning_rate': 1.6358345131351268e-05, 'epoch': 0.55}


 18%|█▊        | 4640/25428 [50:35<3:38:45,  1.58it/s]

{'loss': 0.2271, 'grad_norm': 1.8445587158203125, 'learning_rate': 1.635047978606261e-05, 'epoch': 0.55}


 18%|█▊        | 4650/25428 [50:41<3:33:40,  1.62it/s]

{'loss': 0.2446, 'grad_norm': 1.9053007364273071, 'learning_rate': 1.6342614440773953e-05, 'epoch': 0.55}


 18%|█▊        | 4660/25428 [50:47<3:35:24,  1.61it/s]

{'loss': 0.211, 'grad_norm': 1.675244927406311, 'learning_rate': 1.6334749095485294e-05, 'epoch': 0.55}


 18%|█▊        | 4670/25428 [50:54<3:37:22,  1.59it/s]

{'loss': 0.2208, 'grad_norm': 1.4916090965270996, 'learning_rate': 1.6326883750196635e-05, 'epoch': 0.55}


 18%|█▊        | 4680/25428 [51:00<3:36:30,  1.60it/s]

{'loss': 0.2094, 'grad_norm': 1.344569206237793, 'learning_rate': 1.6319018404907976e-05, 'epoch': 0.55}


 18%|█▊        | 4690/25428 [51:06<3:42:07,  1.56it/s]

{'loss': 0.2014, 'grad_norm': 1.200935959815979, 'learning_rate': 1.6311153059619317e-05, 'epoch': 0.55}


 18%|█▊        | 4700/25428 [51:13<3:43:55,  1.54it/s]

{'loss': 0.2669, 'grad_norm': 2.010979413986206, 'learning_rate': 1.6303287714330662e-05, 'epoch': 0.55}


 19%|█▊        | 4710/25428 [51:19<3:35:39,  1.60it/s]

{'loss': 0.2348, 'grad_norm': 1.181030511856079, 'learning_rate': 1.6295422369042003e-05, 'epoch': 0.56}


 19%|█▊        | 4720/25428 [51:25<3:37:12,  1.59it/s]

{'loss': 0.2584, 'grad_norm': 1.3446991443634033, 'learning_rate': 1.6287557023753344e-05, 'epoch': 0.56}


 19%|█▊        | 4730/25428 [51:31<3:35:23,  1.60it/s]

{'loss': 0.2787, 'grad_norm': 1.6264110803604126, 'learning_rate': 1.6279691678464685e-05, 'epoch': 0.56}


 19%|█▊        | 4740/25428 [51:38<3:31:45,  1.63it/s]

{'loss': 0.2169, 'grad_norm': 1.4453284740447998, 'learning_rate': 1.627182633317603e-05, 'epoch': 0.56}


 19%|█▊        | 4750/25428 [51:44<3:36:03,  1.60it/s]

{'loss': 0.2174, 'grad_norm': 1.1754869222640991, 'learning_rate': 1.626396098788737e-05, 'epoch': 0.56}


 19%|█▊        | 4760/25428 [51:51<3:59:28,  1.44it/s]

{'loss': 0.2074, 'grad_norm': 1.2425963878631592, 'learning_rate': 1.625609564259871e-05, 'epoch': 0.56}


 19%|█▉        | 4770/25428 [51:57<3:39:32,  1.57it/s]

{'loss': 0.2093, 'grad_norm': 1.9758800268173218, 'learning_rate': 1.6248230297310052e-05, 'epoch': 0.56}


 19%|█▉        | 4780/25428 [52:03<3:35:25,  1.60it/s]

{'loss': 0.2106, 'grad_norm': 1.632088541984558, 'learning_rate': 1.6240364952021393e-05, 'epoch': 0.56}


 19%|█▉        | 4790/25428 [52:09<3:34:50,  1.60it/s]

{'loss': 0.2237, 'grad_norm': 1.156794786453247, 'learning_rate': 1.6232499606732737e-05, 'epoch': 0.57}


 19%|█▉        | 4800/25428 [52:16<3:31:32,  1.63it/s]

{'loss': 0.2218, 'grad_norm': 1.9002596139907837, 'learning_rate': 1.622463426144408e-05, 'epoch': 0.57}


 19%|█▉        | 4810/25428 [52:22<3:33:44,  1.61it/s]

{'loss': 0.2118, 'grad_norm': 2.142975330352783, 'learning_rate': 1.621676891615542e-05, 'epoch': 0.57}


 19%|█▉        | 4820/25428 [52:28<3:33:24,  1.61it/s]

{'loss': 0.2351, 'grad_norm': 1.4585943222045898, 'learning_rate': 1.6208903570866764e-05, 'epoch': 0.57}


 19%|█▉        | 4830/25428 [52:34<3:30:02,  1.63it/s]

{'loss': 0.1829, 'grad_norm': 1.7168748378753662, 'learning_rate': 1.6201038225578105e-05, 'epoch': 0.57}


 19%|█▉        | 4840/25428 [52:41<3:30:23,  1.63it/s]

{'loss': 0.2024, 'grad_norm': 1.5028489828109741, 'learning_rate': 1.6193172880289446e-05, 'epoch': 0.57}


 19%|█▉        | 4850/25428 [52:47<4:23:12,  1.30it/s]

{'loss': 0.2572, 'grad_norm': 1.4992727041244507, 'learning_rate': 1.6185307535000787e-05, 'epoch': 0.57}


 19%|█▉        | 4860/25428 [52:54<3:36:35,  1.58it/s]

{'loss': 0.2191, 'grad_norm': 2.0975358486175537, 'learning_rate': 1.6177442189712128e-05, 'epoch': 0.57}


 19%|█▉        | 4870/25428 [53:00<3:40:06,  1.56it/s]

{'loss': 0.2665, 'grad_norm': 1.5152747631072998, 'learning_rate': 1.6169576844423472e-05, 'epoch': 0.57}


 19%|█▉        | 4880/25428 [53:07<3:41:23,  1.55it/s]

{'loss': 0.2268, 'grad_norm': 1.6928553581237793, 'learning_rate': 1.6161711499134813e-05, 'epoch': 0.58}


 19%|█▉        | 4890/25428 [53:13<3:40:55,  1.55it/s]

{'loss': 0.2185, 'grad_norm': 1.2864502668380737, 'learning_rate': 1.6153846153846154e-05, 'epoch': 0.58}


 19%|█▉        | 4900/25428 [53:20<3:43:09,  1.53it/s]

{'loss': 0.2151, 'grad_norm': 1.235600471496582, 'learning_rate': 1.61459808085575e-05, 'epoch': 0.58}


 19%|█▉        | 4910/25428 [53:26<3:39:15,  1.56it/s]

{'loss': 0.2058, 'grad_norm': 1.658403754234314, 'learning_rate': 1.613811546326884e-05, 'epoch': 0.58}


 19%|█▉        | 4920/25428 [53:33<3:43:01,  1.53it/s]

{'loss': 0.2591, 'grad_norm': 2.0506184101104736, 'learning_rate': 1.613025011798018e-05, 'epoch': 0.58}


 19%|█▉        | 4930/25428 [53:39<3:37:50,  1.57it/s]

{'loss': 0.2435, 'grad_norm': 2.0765132904052734, 'learning_rate': 1.612238477269152e-05, 'epoch': 0.58}


 19%|█▉        | 4940/25428 [53:46<3:42:10,  1.54it/s]

{'loss': 0.1823, 'grad_norm': 1.3945555686950684, 'learning_rate': 1.6114519427402862e-05, 'epoch': 0.58}


 19%|█▉        | 4950/25428 [53:52<3:43:13,  1.53it/s]

{'loss': 0.2276, 'grad_norm': 1.6582895517349243, 'learning_rate': 1.6106654082114207e-05, 'epoch': 0.58}


 20%|█▉        | 4960/25428 [53:59<3:40:59,  1.54it/s]

{'loss': 0.2323, 'grad_norm': 1.0287268161773682, 'learning_rate': 1.6098788736825548e-05, 'epoch': 0.59}


 20%|█▉        | 4970/25428 [54:05<3:41:41,  1.54it/s]

{'loss': 0.175, 'grad_norm': 1.31304931640625, 'learning_rate': 1.609092339153689e-05, 'epoch': 0.59}


 20%|█▉        | 4980/25428 [54:11<3:34:39,  1.59it/s]

{'loss': 0.2376, 'grad_norm': 1.738951563835144, 'learning_rate': 1.6083058046248233e-05, 'epoch': 0.59}


 20%|█▉        | 4990/25428 [54:18<3:43:19,  1.53it/s]

{'loss': 0.2152, 'grad_norm': 1.3722938299179077, 'learning_rate': 1.6075192700959574e-05, 'epoch': 0.59}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.2569, 'grad_norm': 1.3265080451965332, 'learning_rate': 1.6067327355670915e-05, 'epoch': 0.59}


 20%|█▉        | 5010/25428 [54:34<3:50:24,  1.48it/s]

{'loss': 0.2277, 'grad_norm': 2.20387601852417, 'learning_rate': 1.6059462010382256e-05, 'epoch': 0.59}


 20%|█▉        | 5020/25428 [54:40<3:40:38,  1.54it/s]

{'loss': 0.2233, 'grad_norm': 1.2193996906280518, 'learning_rate': 1.6051596665093597e-05, 'epoch': 0.59}


 20%|█▉        | 5030/25428 [54:47<3:38:41,  1.55it/s]

{'loss': 0.1985, 'grad_norm': 1.1707779169082642, 'learning_rate': 1.604373131980494e-05, 'epoch': 0.59}


 20%|█▉        | 5040/25428 [54:53<3:38:35,  1.55it/s]

{'loss': 0.1989, 'grad_norm': 1.769138216972351, 'learning_rate': 1.6035865974516282e-05, 'epoch': 0.59}


 20%|█▉        | 5050/25428 [54:59<3:40:23,  1.54it/s]

{'loss': 0.2145, 'grad_norm': 1.64349365234375, 'learning_rate': 1.6028000629227623e-05, 'epoch': 0.6}


 20%|█▉        | 5060/25428 [55:06<3:38:52,  1.55it/s]

{'loss': 0.2392, 'grad_norm': 1.2348153591156006, 'learning_rate': 1.6020135283938968e-05, 'epoch': 0.6}


 20%|█▉        | 5070/25428 [55:12<3:36:28,  1.57it/s]

{'loss': 0.1751, 'grad_norm': 0.9289231300354004, 'learning_rate': 1.601226993865031e-05, 'epoch': 0.6}


 20%|█▉        | 5080/25428 [55:19<3:39:40,  1.54it/s]

{'loss': 0.2003, 'grad_norm': 1.4698984622955322, 'learning_rate': 1.600440459336165e-05, 'epoch': 0.6}


 20%|██        | 5090/25428 [55:25<3:36:11,  1.57it/s]

{'loss': 0.1928, 'grad_norm': 1.7454278469085693, 'learning_rate': 1.599653924807299e-05, 'epoch': 0.6}


 20%|██        | 5100/25428 [55:32<3:35:49,  1.57it/s]

{'loss': 0.184, 'grad_norm': 1.0091969966888428, 'learning_rate': 1.598867390278433e-05, 'epoch': 0.6}


 20%|██        | 5110/25428 [55:38<3:35:55,  1.57it/s]

{'loss': 0.2044, 'grad_norm': 1.7066799402236938, 'learning_rate': 1.5980808557495676e-05, 'epoch': 0.6}


 20%|██        | 5120/25428 [55:44<3:42:07,  1.52it/s]

{'loss': 0.1817, 'grad_norm': 1.1348191499710083, 'learning_rate': 1.5972943212207017e-05, 'epoch': 0.6}


 20%|██        | 5130/25428 [55:51<3:35:05,  1.57it/s]

{'loss': 0.2026, 'grad_norm': 1.3458852767944336, 'learning_rate': 1.5965077866918358e-05, 'epoch': 0.61}


 20%|██        | 5140/25428 [55:57<3:39:29,  1.54it/s]

{'loss': 0.214, 'grad_norm': 1.4620411396026611, 'learning_rate': 1.5957212521629702e-05, 'epoch': 0.61}


 20%|██        | 5150/25428 [56:04<3:36:52,  1.56it/s]

{'loss': 0.2318, 'grad_norm': 2.0838615894317627, 'learning_rate': 1.5949347176341043e-05, 'epoch': 0.61}


 20%|██        | 5160/25428 [56:10<3:40:22,  1.53it/s]

{'loss': 0.2356, 'grad_norm': 1.513757586479187, 'learning_rate': 1.5941481831052384e-05, 'epoch': 0.61}


 20%|██        | 5170/25428 [56:17<3:38:46,  1.54it/s]

{'loss': 0.2074, 'grad_norm': 1.8770371675491333, 'learning_rate': 1.5933616485763725e-05, 'epoch': 0.61}


 20%|██        | 5180/25428 [56:23<3:39:12,  1.54it/s]

{'loss': 0.3083, 'grad_norm': 2.595712423324585, 'learning_rate': 1.5925751140475066e-05, 'epoch': 0.61}


 20%|██        | 5190/25428 [56:30<3:36:10,  1.56it/s]

{'loss': 0.2499, 'grad_norm': 1.2862005233764648, 'learning_rate': 1.591788579518641e-05, 'epoch': 0.61}


 20%|██        | 5200/25428 [56:36<3:34:52,  1.57it/s]

{'loss': 0.184, 'grad_norm': 1.533463716506958, 'learning_rate': 1.591002044989775e-05, 'epoch': 0.61}


 20%|██        | 5210/25428 [56:42<3:34:01,  1.57it/s]

{'loss': 0.2408, 'grad_norm': 1.3641904592514038, 'learning_rate': 1.5902155104609093e-05, 'epoch': 0.61}


 21%|██        | 5220/25428 [56:49<3:34:43,  1.57it/s]

{'loss': 0.264, 'grad_norm': 1.5790947675704956, 'learning_rate': 1.5894289759320437e-05, 'epoch': 0.62}


 21%|██        | 5230/25428 [56:55<3:32:50,  1.58it/s]

{'loss': 0.2288, 'grad_norm': 1.9995810985565186, 'learning_rate': 1.5886424414031778e-05, 'epoch': 0.62}


 21%|██        | 5240/25428 [57:02<3:36:08,  1.56it/s]

{'loss': 0.1998, 'grad_norm': 1.8463528156280518, 'learning_rate': 1.587855906874312e-05, 'epoch': 0.62}


 21%|██        | 5250/25428 [57:08<3:35:55,  1.56it/s]

{'loss': 0.2361, 'grad_norm': 1.0449941158294678, 'learning_rate': 1.587069372345446e-05, 'epoch': 0.62}


 21%|██        | 5260/25428 [57:15<3:39:17,  1.53it/s]

{'loss': 0.2014, 'grad_norm': 1.7771131992340088, 'learning_rate': 1.58628283781658e-05, 'epoch': 0.62}


 21%|██        | 5270/25428 [57:21<3:38:47,  1.54it/s]

{'loss': 0.1957, 'grad_norm': 1.9856524467468262, 'learning_rate': 1.5854963032877145e-05, 'epoch': 0.62}


 21%|██        | 5280/25428 [57:27<3:38:32,  1.54it/s]

{'loss': 0.2039, 'grad_norm': 1.6325229406356812, 'learning_rate': 1.5847097687588486e-05, 'epoch': 0.62}


 21%|██        | 5290/25428 [57:34<3:37:30,  1.54it/s]

{'loss': 0.1767, 'grad_norm': 1.8160114288330078, 'learning_rate': 1.5839232342299827e-05, 'epoch': 0.62}


 21%|██        | 5300/25428 [57:40<3:34:02,  1.57it/s]

{'loss': 0.2351, 'grad_norm': 1.453467607498169, 'learning_rate': 1.5831366997011172e-05, 'epoch': 0.63}


 21%|██        | 5310/25428 [57:47<3:34:47,  1.56it/s]

{'loss': 0.2005, 'grad_norm': 1.5929796695709229, 'learning_rate': 1.5823501651722513e-05, 'epoch': 0.63}


 21%|██        | 5320/25428 [57:53<3:34:47,  1.56it/s]

{'loss': 0.1877, 'grad_norm': 1.353338360786438, 'learning_rate': 1.5815636306433854e-05, 'epoch': 0.63}


 21%|██        | 5330/25428 [58:00<3:34:25,  1.56it/s]

{'loss': 0.2664, 'grad_norm': 1.095125675201416, 'learning_rate': 1.5807770961145195e-05, 'epoch': 0.63}


 21%|██        | 5340/25428 [58:06<3:32:57,  1.57it/s]

{'loss': 0.2493, 'grad_norm': 1.5277791023254395, 'learning_rate': 1.5799905615856536e-05, 'epoch': 0.63}


 21%|██        | 5350/25428 [58:12<3:37:39,  1.54it/s]

{'loss': 0.171, 'grad_norm': 1.5642518997192383, 'learning_rate': 1.579204027056788e-05, 'epoch': 0.63}


 21%|██        | 5360/25428 [58:19<3:33:30,  1.57it/s]

{'loss': 0.2189, 'grad_norm': 3.275583028793335, 'learning_rate': 1.578417492527922e-05, 'epoch': 0.63}


 21%|██        | 5370/25428 [58:25<3:37:07,  1.54it/s]

{'loss': 0.1795, 'grad_norm': 1.3799735307693481, 'learning_rate': 1.5776309579990562e-05, 'epoch': 0.63}


 21%|██        | 5380/25428 [58:32<3:34:56,  1.55it/s]

{'loss': 0.2453, 'grad_norm': 1.2584203481674194, 'learning_rate': 1.5768444234701906e-05, 'epoch': 0.63}


 21%|██        | 5390/25428 [58:38<3:35:27,  1.55it/s]

{'loss': 0.1852, 'grad_norm': 1.1553696393966675, 'learning_rate': 1.5760578889413247e-05, 'epoch': 0.64}


 21%|██        | 5400/25428 [58:45<3:38:03,  1.53it/s]

{'loss': 0.1684, 'grad_norm': 1.4330971240997314, 'learning_rate': 1.575271354412459e-05, 'epoch': 0.64}


 21%|██▏       | 5410/25428 [58:51<3:38:32,  1.53it/s]

{'loss': 0.2133, 'grad_norm': 1.6620090007781982, 'learning_rate': 1.574484819883593e-05, 'epoch': 0.64}


 21%|██▏       | 5420/25428 [58:58<3:35:12,  1.55it/s]

{'loss': 0.1837, 'grad_norm': 2.6545023918151855, 'learning_rate': 1.573698285354727e-05, 'epoch': 0.64}


 21%|██▏       | 5430/25428 [59:04<3:37:13,  1.53it/s]

{'loss': 0.1966, 'grad_norm': 1.4440255165100098, 'learning_rate': 1.5729117508258615e-05, 'epoch': 0.64}


 21%|██▏       | 5440/25428 [59:10<3:31:06,  1.58it/s]

{'loss': 0.1923, 'grad_norm': 1.114902138710022, 'learning_rate': 1.5721252162969956e-05, 'epoch': 0.64}


 21%|██▏       | 5450/25428 [59:17<3:33:03,  1.56it/s]

{'loss': 0.1842, 'grad_norm': 1.82525634765625, 'learning_rate': 1.5713386817681297e-05, 'epoch': 0.64}


 21%|██▏       | 5460/25428 [59:23<3:32:26,  1.57it/s]

{'loss': 0.1877, 'grad_norm': 1.8931673765182495, 'learning_rate': 1.570552147239264e-05, 'epoch': 0.64}


 22%|██▏       | 5470/25428 [59:30<3:33:36,  1.56it/s]

{'loss': 0.2391, 'grad_norm': 1.2317038774490356, 'learning_rate': 1.5697656127103982e-05, 'epoch': 0.65}


 22%|██▏       | 5480/25428 [59:36<3:31:43,  1.57it/s]

{'loss': 0.1953, 'grad_norm': 2.478795051574707, 'learning_rate': 1.5689790781815323e-05, 'epoch': 0.65}


 22%|██▏       | 5490/25428 [59:42<3:30:01,  1.58it/s]

{'loss': 0.2125, 'grad_norm': 1.2581701278686523, 'learning_rate': 1.5681925436526664e-05, 'epoch': 0.65}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.201, 'grad_norm': 1.6442193984985352, 'learning_rate': 1.5674060091238005e-05, 'epoch': 0.65}


 22%|██▏       | 5510/25428 [59:58<3:43:25,  1.49it/s]

{'loss': 0.1904, 'grad_norm': 1.4132802486419678, 'learning_rate': 1.566619474594935e-05, 'epoch': 0.65}


 22%|██▏       | 5520/25428 [1:00:04<3:32:04,  1.56it/s]

{'loss': 0.2316, 'grad_norm': 1.4989012479782104, 'learning_rate': 1.565832940066069e-05, 'epoch': 0.65}


 22%|██▏       | 5530/25428 [1:00:10<3:30:26,  1.58it/s]

{'loss': 0.1666, 'grad_norm': 1.458390235900879, 'learning_rate': 1.565046405537203e-05, 'epoch': 0.65}


 22%|██▏       | 5540/25428 [1:00:17<3:32:10,  1.56it/s]

{'loss': 0.2013, 'grad_norm': 2.1267194747924805, 'learning_rate': 1.5642598710083376e-05, 'epoch': 0.65}


 22%|██▏       | 5550/25428 [1:00:23<3:33:09,  1.55it/s]

{'loss': 0.25, 'grad_norm': 5.0562567710876465, 'learning_rate': 1.5634733364794717e-05, 'epoch': 0.65}


 22%|██▏       | 5560/25428 [1:00:30<3:30:57,  1.57it/s]

{'loss': 0.1895, 'grad_norm': 0.948459267616272, 'learning_rate': 1.5626868019506058e-05, 'epoch': 0.66}


 22%|██▏       | 5570/25428 [1:00:36<3:30:55,  1.57it/s]

{'loss': 0.1829, 'grad_norm': 1.0555157661437988, 'learning_rate': 1.56190026742174e-05, 'epoch': 0.66}


 22%|██▏       | 5580/25428 [1:00:43<3:32:01,  1.56it/s]

{'loss': 0.1613, 'grad_norm': 1.436810851097107, 'learning_rate': 1.561113732892874e-05, 'epoch': 0.66}


 22%|██▏       | 5590/25428 [1:00:49<3:33:05,  1.55it/s]

{'loss': 0.196, 'grad_norm': 1.1554995775222778, 'learning_rate': 1.5603271983640084e-05, 'epoch': 0.66}


 22%|██▏       | 5600/25428 [1:00:56<3:34:52,  1.54it/s]

{'loss': 0.2504, 'grad_norm': 0.8913695812225342, 'learning_rate': 1.5595406638351425e-05, 'epoch': 0.66}


 22%|██▏       | 5610/25428 [1:01:02<3:31:38,  1.56it/s]

{'loss': 0.1803, 'grad_norm': 1.4347161054611206, 'learning_rate': 1.5587541293062766e-05, 'epoch': 0.66}


 22%|██▏       | 5620/25428 [1:01:08<3:28:13,  1.59it/s]

{'loss': 0.1982, 'grad_norm': 1.3451355695724487, 'learning_rate': 1.557967594777411e-05, 'epoch': 0.66}


 22%|██▏       | 5630/25428 [1:01:15<3:31:49,  1.56it/s]

{'loss': 0.2509, 'grad_norm': 1.227341651916504, 'learning_rate': 1.557181060248545e-05, 'epoch': 0.66}


 22%|██▏       | 5640/25428 [1:01:21<3:31:43,  1.56it/s]

{'loss': 0.2493, 'grad_norm': 1.2495207786560059, 'learning_rate': 1.5563945257196792e-05, 'epoch': 0.67}


 22%|██▏       | 5650/25428 [1:01:28<3:32:54,  1.55it/s]

{'loss': 0.2193, 'grad_norm': 1.4763354063034058, 'learning_rate': 1.5556079911908133e-05, 'epoch': 0.67}


 22%|██▏       | 5660/25428 [1:01:34<3:27:48,  1.59it/s]

{'loss': 0.212, 'grad_norm': 1.9775680303573608, 'learning_rate': 1.5548214566619474e-05, 'epoch': 0.67}


 22%|██▏       | 5670/25428 [1:01:41<3:31:20,  1.56it/s]

{'loss': 0.1686, 'grad_norm': 0.9367384910583496, 'learning_rate': 1.554034922133082e-05, 'epoch': 0.67}


 22%|██▏       | 5680/25428 [1:01:47<3:31:08,  1.56it/s]

{'loss': 0.259, 'grad_norm': 2.259913682937622, 'learning_rate': 1.553248387604216e-05, 'epoch': 0.67}


 22%|██▏       | 5690/25428 [1:01:54<3:34:29,  1.53it/s]

{'loss': 0.182, 'grad_norm': 1.508644461631775, 'learning_rate': 1.55246185307535e-05, 'epoch': 0.67}


 22%|██▏       | 5700/25428 [1:02:00<3:30:30,  1.56it/s]

{'loss': 0.1872, 'grad_norm': 1.209267258644104, 'learning_rate': 1.5516753185464845e-05, 'epoch': 0.67}


 22%|██▏       | 5710/25428 [1:02:06<3:31:52,  1.55it/s]

{'loss': 0.1748, 'grad_norm': 1.5229185819625854, 'learning_rate': 1.5508887840176186e-05, 'epoch': 0.67}


 22%|██▏       | 5720/25428 [1:02:13<3:32:38,  1.54it/s]

{'loss': 0.1884, 'grad_norm': 0.8536161184310913, 'learning_rate': 1.5501022494887527e-05, 'epoch': 0.67}


 23%|██▎       | 5730/25428 [1:02:19<3:28:54,  1.57it/s]

{'loss': 0.1852, 'grad_norm': 1.3092637062072754, 'learning_rate': 1.549315714959887e-05, 'epoch': 0.68}


 23%|██▎       | 5740/25428 [1:02:25<3:33:16,  1.54it/s]

{'loss': 0.2251, 'grad_norm': 1.2121435403823853, 'learning_rate': 1.548529180431021e-05, 'epoch': 0.68}


 23%|██▎       | 5750/25428 [1:02:32<3:28:32,  1.57it/s]

{'loss': 0.169, 'grad_norm': 1.2061972618103027, 'learning_rate': 1.5477426459021553e-05, 'epoch': 0.68}


 23%|██▎       | 5760/25428 [1:02:38<3:30:35,  1.56it/s]

{'loss': 0.2438, 'grad_norm': 2.4680185317993164, 'learning_rate': 1.5469561113732894e-05, 'epoch': 0.68}


 23%|██▎       | 5770/25428 [1:02:45<3:28:08,  1.57it/s]

{'loss': 0.1976, 'grad_norm': 1.293999433517456, 'learning_rate': 1.5461695768444235e-05, 'epoch': 0.68}


 23%|██▎       | 5780/25428 [1:02:51<3:28:15,  1.57it/s]

{'loss': 0.1765, 'grad_norm': 1.5817458629608154, 'learning_rate': 1.5453830423155576e-05, 'epoch': 0.68}


 23%|██▎       | 5790/25428 [1:02:57<3:29:06,  1.57it/s]

{'loss': 0.2063, 'grad_norm': 1.0947288274765015, 'learning_rate': 1.544596507786692e-05, 'epoch': 0.68}


 23%|██▎       | 5800/25428 [1:03:04<3:31:17,  1.55it/s]

{'loss': 0.1776, 'grad_norm': 2.8869943618774414, 'learning_rate': 1.543809973257826e-05, 'epoch': 0.68}


 23%|██▎       | 5810/25428 [1:03:10<3:30:02,  1.56it/s]

{'loss': 0.1703, 'grad_norm': 1.5937577486038208, 'learning_rate': 1.5430234387289603e-05, 'epoch': 0.69}


 23%|██▎       | 5820/25428 [1:03:17<3:31:35,  1.54it/s]

{'loss': 0.1738, 'grad_norm': 1.2150466442108154, 'learning_rate': 1.5422369042000944e-05, 'epoch': 0.69}


 23%|██▎       | 5830/25428 [1:03:23<3:28:37,  1.57it/s]

{'loss': 0.1567, 'grad_norm': 1.2269045114517212, 'learning_rate': 1.5414503696712285e-05, 'epoch': 0.69}


 23%|██▎       | 5840/25428 [1:03:30<3:32:43,  1.53it/s]

{'loss': 0.1959, 'grad_norm': 1.704012393951416, 'learning_rate': 1.540663835142363e-05, 'epoch': 0.69}


 23%|██▎       | 5850/25428 [1:03:36<3:31:00,  1.55it/s]

{'loss': 0.2006, 'grad_norm': 1.3942890167236328, 'learning_rate': 1.539877300613497e-05, 'epoch': 0.69}


 23%|██▎       | 5860/25428 [1:03:43<3:24:59,  1.59it/s]

{'loss': 0.163, 'grad_norm': 0.9305630326271057, 'learning_rate': 1.539090766084631e-05, 'epoch': 0.69}


 23%|██▎       | 5870/25428 [1:03:49<3:32:56,  1.53it/s]

{'loss': 0.1844, 'grad_norm': 1.5868688821792603, 'learning_rate': 1.5383042315557655e-05, 'epoch': 0.69}


 23%|██▎       | 5880/25428 [1:03:55<3:26:52,  1.57it/s]

{'loss': 0.1821, 'grad_norm': 1.4117295742034912, 'learning_rate': 1.5375176970268996e-05, 'epoch': 0.69}


 23%|██▎       | 5890/25428 [1:04:02<3:26:13,  1.58it/s]

{'loss': 0.2095, 'grad_norm': 2.152438163757324, 'learning_rate': 1.5367311624980337e-05, 'epoch': 0.69}


 23%|██▎       | 5900/25428 [1:04:08<3:30:08,  1.55it/s]

{'loss': 0.1713, 'grad_norm': 1.028319239616394, 'learning_rate': 1.535944627969168e-05, 'epoch': 0.7}


 23%|██▎       | 5910/25428 [1:04:15<3:36:24,  1.50it/s]

{'loss': 0.1916, 'grad_norm': 1.429560899734497, 'learning_rate': 1.535158093440302e-05, 'epoch': 0.7}


 23%|██▎       | 5920/25428 [1:04:21<3:24:47,  1.59it/s]

{'loss': 0.1807, 'grad_norm': 1.4034045934677124, 'learning_rate': 1.5343715589114364e-05, 'epoch': 0.7}


 23%|██▎       | 5930/25428 [1:04:28<3:30:32,  1.54it/s]

{'loss': 0.2114, 'grad_norm': 1.8406555652618408, 'learning_rate': 1.5335850243825705e-05, 'epoch': 0.7}


 23%|██▎       | 5940/25428 [1:04:34<3:29:48,  1.55it/s]

{'loss': 0.1773, 'grad_norm': 2.7774951457977295, 'learning_rate': 1.5327984898537046e-05, 'epoch': 0.7}


 23%|██▎       | 5950/25428 [1:04:41<3:30:53,  1.54it/s]

{'loss': 0.1932, 'grad_norm': 1.3929578065872192, 'learning_rate': 1.532011955324839e-05, 'epoch': 0.7}


 23%|██▎       | 5960/25428 [1:04:47<3:29:03,  1.55it/s]

{'loss': 0.1545, 'grad_norm': 0.9834649562835693, 'learning_rate': 1.531225420795973e-05, 'epoch': 0.7}


 23%|██▎       | 5970/25428 [1:04:53<3:27:10,  1.57it/s]

{'loss': 0.2028, 'grad_norm': 0.9455866813659668, 'learning_rate': 1.5304388862671072e-05, 'epoch': 0.7}


 24%|██▎       | 5980/25428 [1:05:00<3:30:29,  1.54it/s]

{'loss': 0.1789, 'grad_norm': 1.562206745147705, 'learning_rate': 1.5296523517382413e-05, 'epoch': 0.71}


 24%|██▎       | 5990/25428 [1:05:06<3:26:53,  1.57it/s]

{'loss': 0.1606, 'grad_norm': 1.0551098585128784, 'learning_rate': 1.5288658172093754e-05, 'epoch': 0.71}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.2253, 'grad_norm': 0.9201045632362366, 'learning_rate': 1.52807928268051e-05, 'epoch': 0.71}


 24%|██▎       | 6010/25428 [1:05:22<3:44:17,  1.44it/s]

{'loss': 0.172, 'grad_norm': 1.2719204425811768, 'learning_rate': 1.527292748151644e-05, 'epoch': 0.71}


 24%|██▎       | 6020/25428 [1:05:28<3:30:16,  1.54it/s]

{'loss': 0.1574, 'grad_norm': 1.6702470779418945, 'learning_rate': 1.526506213622778e-05, 'epoch': 0.71}


 24%|██▎       | 6030/25428 [1:05:35<3:30:55,  1.53it/s]

{'loss': 0.1941, 'grad_norm': 1.59438157081604, 'learning_rate': 1.5257196790939125e-05, 'epoch': 0.71}


 24%|██▍       | 6040/25428 [1:05:41<3:24:46,  1.58it/s]

{'loss': 0.1758, 'grad_norm': 1.6388360261917114, 'learning_rate': 1.5249331445650466e-05, 'epoch': 0.71}


 24%|██▍       | 6050/25428 [1:05:48<3:31:18,  1.53it/s]

{'loss': 0.1857, 'grad_norm': 1.7975802421569824, 'learning_rate': 1.5241466100361808e-05, 'epoch': 0.71}


 24%|██▍       | 6060/25428 [1:05:54<3:31:59,  1.52it/s]

{'loss': 0.1799, 'grad_norm': 1.2500510215759277, 'learning_rate': 1.5233600755073148e-05, 'epoch': 0.71}


 24%|██▍       | 6070/25428 [1:06:01<3:27:59,  1.55it/s]

{'loss': 0.2137, 'grad_norm': 0.8747636675834656, 'learning_rate': 1.522573540978449e-05, 'epoch': 0.72}


 24%|██▍       | 6080/25428 [1:06:07<3:29:53,  1.54it/s]

{'loss': 0.1585, 'grad_norm': 1.368818759918213, 'learning_rate': 1.5217870064495831e-05, 'epoch': 0.72}


 24%|██▍       | 6090/25428 [1:06:14<3:31:05,  1.53it/s]

{'loss': 0.2047, 'grad_norm': 1.2332061529159546, 'learning_rate': 1.5210004719207174e-05, 'epoch': 0.72}


 24%|██▍       | 6100/25428 [1:06:20<3:26:16,  1.56it/s]

{'loss': 0.1936, 'grad_norm': 1.713154673576355, 'learning_rate': 1.5202139373918517e-05, 'epoch': 0.72}


 24%|██▍       | 6110/25428 [1:06:27<3:28:24,  1.54it/s]

{'loss': 0.1711, 'grad_norm': 1.2060377597808838, 'learning_rate': 1.5194274028629858e-05, 'epoch': 0.72}


 24%|██▍       | 6120/25428 [1:06:33<3:31:40,  1.52it/s]

{'loss': 0.1738, 'grad_norm': 1.274855136871338, 'learning_rate': 1.51864086833412e-05, 'epoch': 0.72}


 24%|██▍       | 6130/25428 [1:06:40<3:31:02,  1.52it/s]

{'loss': 0.2217, 'grad_norm': 2.5580384731292725, 'learning_rate': 1.5178543338052543e-05, 'epoch': 0.72}


 24%|██▍       | 6140/25428 [1:06:46<3:27:40,  1.55it/s]

{'loss': 0.1989, 'grad_norm': 1.6517539024353027, 'learning_rate': 1.5170677992763882e-05, 'epoch': 0.72}


 24%|██▍       | 6150/25428 [1:06:53<3:30:51,  1.52it/s]

{'loss': 0.2111, 'grad_norm': 1.992491364479065, 'learning_rate': 1.5162812647475225e-05, 'epoch': 0.73}


 24%|██▍       | 6160/25428 [1:06:59<3:29:00,  1.54it/s]

{'loss': 0.1858, 'grad_norm': 1.896162509918213, 'learning_rate': 1.5154947302186566e-05, 'epoch': 0.73}


 24%|██▍       | 6170/25428 [1:07:06<3:29:06,  1.53it/s]

{'loss': 0.1964, 'grad_norm': 1.0974658727645874, 'learning_rate': 1.5147081956897909e-05, 'epoch': 0.73}


 24%|██▍       | 6180/25428 [1:07:13<3:40:27,  1.46it/s]

{'loss': 0.1903, 'grad_norm': 1.2830054759979248, 'learning_rate': 1.5139216611609251e-05, 'epoch': 0.73}


 24%|██▍       | 6190/25428 [1:07:19<3:31:05,  1.52it/s]

{'loss': 0.1804, 'grad_norm': 1.2947574853897095, 'learning_rate': 1.5131351266320592e-05, 'epoch': 0.73}


 24%|██▍       | 6200/25428 [1:07:26<3:26:57,  1.55it/s]

{'loss': 0.146, 'grad_norm': 1.4534142017364502, 'learning_rate': 1.5123485921031935e-05, 'epoch': 0.73}


 24%|██▍       | 6210/25428 [1:07:32<3:25:34,  1.56it/s]

{'loss': 0.1689, 'grad_norm': 1.2753100395202637, 'learning_rate': 1.5115620575743278e-05, 'epoch': 0.73}


 24%|██▍       | 6220/25428 [1:07:39<3:29:02,  1.53it/s]

{'loss': 0.1762, 'grad_norm': 1.5155901908874512, 'learning_rate': 1.5107755230454617e-05, 'epoch': 0.73}


 25%|██▍       | 6230/25428 [1:07:45<3:24:34,  1.56it/s]

{'loss': 0.1504, 'grad_norm': 1.5405616760253906, 'learning_rate': 1.509988988516596e-05, 'epoch': 0.74}


 25%|██▍       | 6240/25428 [1:07:51<3:28:39,  1.53it/s]

{'loss': 0.211, 'grad_norm': 1.4920873641967773, 'learning_rate': 1.50920245398773e-05, 'epoch': 0.74}


 25%|██▍       | 6250/25428 [1:07:58<3:27:05,  1.54it/s]

{'loss': 0.1677, 'grad_norm': 1.408515453338623, 'learning_rate': 1.5084159194588643e-05, 'epoch': 0.74}


 25%|██▍       | 6260/25428 [1:08:04<3:29:01,  1.53it/s]

{'loss': 0.166, 'grad_norm': 1.634600043296814, 'learning_rate': 1.5076293849299986e-05, 'epoch': 0.74}


 25%|██▍       | 6270/25428 [1:08:11<3:26:54,  1.54it/s]

{'loss': 0.1927, 'grad_norm': 0.9445279836654663, 'learning_rate': 1.5068428504011327e-05, 'epoch': 0.74}


 25%|██▍       | 6280/25428 [1:08:17<3:24:50,  1.56it/s]

{'loss': 0.2362, 'grad_norm': 2.5923850536346436, 'learning_rate': 1.506056315872267e-05, 'epoch': 0.74}


 25%|██▍       | 6290/25428 [1:08:24<3:27:41,  1.54it/s]

{'loss': 0.1706, 'grad_norm': 1.843762755393982, 'learning_rate': 1.5052697813434012e-05, 'epoch': 0.74}


 25%|██▍       | 6300/25428 [1:08:30<3:29:01,  1.53it/s]

{'loss': 0.1468, 'grad_norm': 1.224313735961914, 'learning_rate': 1.5044832468145353e-05, 'epoch': 0.74}


 25%|██▍       | 6310/25428 [1:08:37<3:20:53,  1.59it/s]

{'loss': 0.1737, 'grad_norm': 1.1676063537597656, 'learning_rate': 1.5036967122856694e-05, 'epoch': 0.74}


 25%|██▍       | 6320/25428 [1:08:43<3:24:18,  1.56it/s]

{'loss': 0.1595, 'grad_norm': 0.8827442526817322, 'learning_rate': 1.5029101777568035e-05, 'epoch': 0.75}


 25%|██▍       | 6330/25428 [1:08:50<3:27:50,  1.53it/s]

{'loss': 0.1447, 'grad_norm': 1.0650417804718018, 'learning_rate': 1.5021236432279378e-05, 'epoch': 0.75}


 25%|██▍       | 6340/25428 [1:08:56<3:27:19,  1.53it/s]

{'loss': 0.1768, 'grad_norm': 1.6820026636123657, 'learning_rate': 1.501337108699072e-05, 'epoch': 0.75}


 25%|██▍       | 6350/25428 [1:09:03<3:25:16,  1.55it/s]

{'loss': 0.2069, 'grad_norm': 2.1653566360473633, 'learning_rate': 1.5005505741702062e-05, 'epoch': 0.75}


 25%|██▌       | 6360/25428 [1:09:09<3:24:52,  1.55it/s]

{'loss': 0.1606, 'grad_norm': 1.7615803480148315, 'learning_rate': 1.4997640396413404e-05, 'epoch': 0.75}


 25%|██▌       | 6370/25428 [1:09:16<3:28:28,  1.52it/s]

{'loss': 0.1774, 'grad_norm': 1.099303960800171, 'learning_rate': 1.4989775051124747e-05, 'epoch': 0.75}


 25%|██▌       | 6380/25428 [1:09:22<3:25:39,  1.54it/s]

{'loss': 0.1479, 'grad_norm': 0.9266412258148193, 'learning_rate': 1.4981909705836088e-05, 'epoch': 0.75}


 25%|██▌       | 6390/25428 [1:09:29<3:23:56,  1.56it/s]

{'loss': 0.1777, 'grad_norm': 1.1443449258804321, 'learning_rate': 1.4974044360547429e-05, 'epoch': 0.75}


 25%|██▌       | 6400/25428 [1:09:35<3:26:19,  1.54it/s]

{'loss': 0.2478, 'grad_norm': 1.1544790267944336, 'learning_rate': 1.496617901525877e-05, 'epoch': 0.76}


 25%|██▌       | 6410/25428 [1:09:42<3:20:43,  1.58it/s]

{'loss': 0.1464, 'grad_norm': 1.9845664501190186, 'learning_rate': 1.4958313669970113e-05, 'epoch': 0.76}


 25%|██▌       | 6420/25428 [1:09:48<3:25:49,  1.54it/s]

{'loss': 0.1804, 'grad_norm': 1.1523339748382568, 'learning_rate': 1.4950448324681455e-05, 'epoch': 0.76}


 25%|██▌       | 6430/25428 [1:09:55<3:25:25,  1.54it/s]

{'loss': 0.1583, 'grad_norm': 1.9349355697631836, 'learning_rate': 1.4942582979392796e-05, 'epoch': 0.76}


 25%|██▌       | 6440/25428 [1:10:01<3:26:27,  1.53it/s]

{'loss': 0.1538, 'grad_norm': 1.2161431312561035, 'learning_rate': 1.4934717634104139e-05, 'epoch': 0.76}


 25%|██▌       | 6450/25428 [1:10:08<3:26:04,  1.53it/s]

{'loss': 0.1346, 'grad_norm': 1.4180597066879272, 'learning_rate': 1.4926852288815482e-05, 'epoch': 0.76}


 25%|██▌       | 6460/25428 [1:10:14<3:22:54,  1.56it/s]

{'loss': 0.1616, 'grad_norm': 1.492547631263733, 'learning_rate': 1.4918986943526823e-05, 'epoch': 0.76}


 25%|██▌       | 6470/25428 [1:10:20<3:21:35,  1.57it/s]

{'loss': 0.1739, 'grad_norm': 0.9291211366653442, 'learning_rate': 1.4911121598238164e-05, 'epoch': 0.76}


 25%|██▌       | 6480/25428 [1:10:27<3:27:04,  1.53it/s]

{'loss': 0.1368, 'grad_norm': 1.1532042026519775, 'learning_rate': 1.4903256252949505e-05, 'epoch': 0.76}


 26%|██▌       | 6490/25428 [1:10:33<3:24:02,  1.55it/s]

{'loss': 0.1719, 'grad_norm': 1.5846402645111084, 'learning_rate': 1.4895390907660847e-05, 'epoch': 0.77}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1699, 'grad_norm': 1.0840821266174316, 'learning_rate': 1.488752556237219e-05, 'epoch': 0.77}


 26%|██▌       | 6510/25428 [1:10:49<3:33:09,  1.48it/s]

{'loss': 0.1611, 'grad_norm': 1.670771598815918, 'learning_rate': 1.4879660217083531e-05, 'epoch': 0.77}


 26%|██▌       | 6520/25428 [1:10:55<3:23:40,  1.55it/s]

{'loss': 0.1789, 'grad_norm': 1.1543868780136108, 'learning_rate': 1.4871794871794874e-05, 'epoch': 0.77}


 26%|██▌       | 6530/25428 [1:11:02<3:25:43,  1.53it/s]

{'loss': 0.1358, 'grad_norm': 1.0900418758392334, 'learning_rate': 1.4863929526506216e-05, 'epoch': 0.77}


 26%|██▌       | 6540/25428 [1:11:08<3:25:05,  1.53it/s]

{'loss': 0.1628, 'grad_norm': 2.4581618309020996, 'learning_rate': 1.4856064181217557e-05, 'epoch': 0.77}


 26%|██▌       | 6550/25428 [1:11:15<3:17:56,  1.59it/s]

{'loss': 0.1542, 'grad_norm': 1.0568214654922485, 'learning_rate': 1.4848198835928898e-05, 'epoch': 0.77}


 26%|██▌       | 6560/25428 [1:11:21<3:18:15,  1.59it/s]

{'loss': 0.1451, 'grad_norm': 2.4371273517608643, 'learning_rate': 1.484033349064024e-05, 'epoch': 0.77}


 26%|██▌       | 6570/25428 [1:11:27<3:20:40,  1.57it/s]

{'loss': 0.2227, 'grad_norm': 1.0677694082260132, 'learning_rate': 1.4832468145351582e-05, 'epoch': 0.78}


 26%|██▌       | 6580/25428 [1:11:34<3:25:19,  1.53it/s]

{'loss': 0.1636, 'grad_norm': 0.9590996503829956, 'learning_rate': 1.4824602800062923e-05, 'epoch': 0.78}


 26%|██▌       | 6590/25428 [1:11:40<3:26:14,  1.52it/s]

{'loss': 0.1461, 'grad_norm': 1.2167713642120361, 'learning_rate': 1.4816737454774266e-05, 'epoch': 0.78}


 26%|██▌       | 6600/25428 [1:11:47<3:22:01,  1.55it/s]

{'loss': 0.1395, 'grad_norm': 1.8468157052993774, 'learning_rate': 1.4808872109485608e-05, 'epoch': 0.78}


 26%|██▌       | 6610/25428 [1:11:53<3:25:26,  1.53it/s]

{'loss': 0.1691, 'grad_norm': 1.3873966932296753, 'learning_rate': 1.480100676419695e-05, 'epoch': 0.78}


 26%|██▌       | 6620/25428 [1:12:00<3:23:40,  1.54it/s]

{'loss': 0.1846, 'grad_norm': 1.9027243852615356, 'learning_rate': 1.4793141418908292e-05, 'epoch': 0.78}


 26%|██▌       | 6630/25428 [1:12:06<3:23:13,  1.54it/s]

{'loss': 0.1457, 'grad_norm': 2.2226879596710205, 'learning_rate': 1.4785276073619631e-05, 'epoch': 0.78}


 26%|██▌       | 6640/25428 [1:12:13<3:25:23,  1.52it/s]

{'loss': 0.1697, 'grad_norm': 1.364894986152649, 'learning_rate': 1.4777410728330974e-05, 'epoch': 0.78}


 26%|██▌       | 6650/25428 [1:12:19<3:19:55,  1.57it/s]

{'loss': 0.1236, 'grad_norm': 0.7422735095024109, 'learning_rate': 1.4769545383042317e-05, 'epoch': 0.78}


 26%|██▌       | 6660/25428 [1:12:26<3:23:05,  1.54it/s]

{'loss': 0.1461, 'grad_norm': 1.170586109161377, 'learning_rate': 1.4761680037753658e-05, 'epoch': 0.79}


 26%|██▌       | 6670/25428 [1:12:32<3:21:13,  1.55it/s]

{'loss': 0.1646, 'grad_norm': 1.3016315698623657, 'learning_rate': 1.4753814692465e-05, 'epoch': 0.79}


 26%|██▋       | 6680/25428 [1:12:38<3:17:36,  1.58it/s]

{'loss': 0.168, 'grad_norm': 1.4872932434082031, 'learning_rate': 1.4745949347176343e-05, 'epoch': 0.79}


 26%|██▋       | 6690/25428 [1:12:45<3:19:20,  1.57it/s]

{'loss': 0.1443, 'grad_norm': 1.4218955039978027, 'learning_rate': 1.4738084001887684e-05, 'epoch': 0.79}


 26%|██▋       | 6700/25428 [1:12:51<3:22:18,  1.54it/s]

{'loss': 0.1708, 'grad_norm': 2.7361083030700684, 'learning_rate': 1.4730218656599027e-05, 'epoch': 0.79}


 26%|██▋       | 6710/25428 [1:12:58<3:19:11,  1.57it/s]

{'loss': 0.1802, 'grad_norm': 2.4607958793640137, 'learning_rate': 1.4722353311310366e-05, 'epoch': 0.79}


 26%|██▋       | 6720/25428 [1:13:04<3:20:18,  1.56it/s]

{'loss': 0.1475, 'grad_norm': 1.2426363229751587, 'learning_rate': 1.4714487966021709e-05, 'epoch': 0.79}


 26%|██▋       | 6730/25428 [1:13:11<3:21:18,  1.55it/s]

{'loss': 0.1568, 'grad_norm': 1.1306054592132568, 'learning_rate': 1.4706622620733051e-05, 'epoch': 0.79}


 27%|██▋       | 6740/25428 [1:13:17<3:19:44,  1.56it/s]

{'loss': 0.1845, 'grad_norm': 1.5013554096221924, 'learning_rate': 1.4698757275444392e-05, 'epoch': 0.8}


 27%|██▋       | 6750/25428 [1:13:24<3:23:25,  1.53it/s]

{'loss': 0.1639, 'grad_norm': 0.9327700734138489, 'learning_rate': 1.4690891930155735e-05, 'epoch': 0.8}


 27%|██▋       | 6760/25428 [1:13:30<3:20:45,  1.55it/s]

{'loss': 0.1678, 'grad_norm': 1.393936038017273, 'learning_rate': 1.4683026584867078e-05, 'epoch': 0.8}


 27%|██▋       | 6770/25428 [1:13:37<3:18:12,  1.57it/s]

{'loss': 0.1693, 'grad_norm': 1.764999508857727, 'learning_rate': 1.4675161239578419e-05, 'epoch': 0.8}


 27%|██▋       | 6780/25428 [1:13:43<3:21:56,  1.54it/s]

{'loss': 0.1717, 'grad_norm': 1.610047459602356, 'learning_rate': 1.4667295894289761e-05, 'epoch': 0.8}


 27%|██▋       | 6790/25428 [1:13:49<3:19:43,  1.56it/s]

{'loss': 0.1592, 'grad_norm': 2.2277488708496094, 'learning_rate': 1.46594305490011e-05, 'epoch': 0.8}


 27%|██▋       | 6800/25428 [1:13:56<3:20:47,  1.55it/s]

{'loss': 0.1673, 'grad_norm': 1.0995489358901978, 'learning_rate': 1.4651565203712443e-05, 'epoch': 0.8}


 27%|██▋       | 6810/25428 [1:14:02<3:18:51,  1.56it/s]

{'loss': 0.1939, 'grad_norm': 2.1490182876586914, 'learning_rate': 1.4643699858423786e-05, 'epoch': 0.8}


 27%|██▋       | 6820/25428 [1:14:09<3:20:07,  1.55it/s]

{'loss': 0.1876, 'grad_norm': 1.414530634880066, 'learning_rate': 1.4635834513135127e-05, 'epoch': 0.8}


 27%|██▋       | 6830/25428 [1:14:15<3:18:35,  1.56it/s]

{'loss': 0.191, 'grad_norm': 1.195465326309204, 'learning_rate': 1.462796916784647e-05, 'epoch': 0.81}


 27%|██▋       | 6840/25428 [1:14:22<3:17:31,  1.57it/s]

{'loss': 0.2073, 'grad_norm': 1.1932809352874756, 'learning_rate': 1.4620103822557812e-05, 'epoch': 0.81}


 27%|██▋       | 6850/25428 [1:14:28<3:18:28,  1.56it/s]

{'loss': 0.1901, 'grad_norm': 1.0134449005126953, 'learning_rate': 1.4612238477269153e-05, 'epoch': 0.81}


 27%|██▋       | 6860/25428 [1:14:34<3:18:54,  1.56it/s]

{'loss': 0.1721, 'grad_norm': 1.1490757465362549, 'learning_rate': 1.4604373131980496e-05, 'epoch': 0.81}


 27%|██▋       | 6870/25428 [1:14:41<3:17:59,  1.56it/s]

{'loss': 0.1666, 'grad_norm': 1.790547251701355, 'learning_rate': 1.4596507786691835e-05, 'epoch': 0.81}


 27%|██▋       | 6880/25428 [1:14:47<3:20:30,  1.54it/s]

{'loss': 0.1804, 'grad_norm': 1.8397767543792725, 'learning_rate': 1.4588642441403178e-05, 'epoch': 0.81}


 27%|██▋       | 6890/25428 [1:14:54<3:21:32,  1.53it/s]

{'loss': 0.1722, 'grad_norm': 0.976845383644104, 'learning_rate': 1.458077709611452e-05, 'epoch': 0.81}


 27%|██▋       | 6900/25428 [1:15:00<3:19:23,  1.55it/s]

{'loss': 0.1874, 'grad_norm': 1.174927830696106, 'learning_rate': 1.4572911750825862e-05, 'epoch': 0.81}


 27%|██▋       | 6910/25428 [1:15:07<3:17:19,  1.56it/s]

{'loss': 0.2564, 'grad_norm': 1.2948628664016724, 'learning_rate': 1.4565046405537204e-05, 'epoch': 0.82}


 27%|██▋       | 6920/25428 [1:15:13<3:28:01,  1.48it/s]

{'loss': 0.1895, 'grad_norm': 1.5013788938522339, 'learning_rate': 1.4557181060248547e-05, 'epoch': 0.82}


 27%|██▋       | 6930/25428 [1:15:20<3:21:19,  1.53it/s]

{'loss': 0.1453, 'grad_norm': 1.3401318788528442, 'learning_rate': 1.4549315714959888e-05, 'epoch': 0.82}


 27%|██▋       | 6940/25428 [1:15:26<3:16:22,  1.57it/s]

{'loss': 0.1743, 'grad_norm': 1.2999919652938843, 'learning_rate': 1.454145036967123e-05, 'epoch': 0.82}


 27%|██▋       | 6950/25428 [1:15:33<3:21:08,  1.53it/s]

{'loss': 0.1612, 'grad_norm': 1.6094671487808228, 'learning_rate': 1.4533585024382573e-05, 'epoch': 0.82}


 27%|██▋       | 6960/25428 [1:15:39<3:17:02,  1.56it/s]

{'loss': 0.1558, 'grad_norm': 1.3145356178283691, 'learning_rate': 1.4525719679093913e-05, 'epoch': 0.82}


 27%|██▋       | 6970/25428 [1:15:45<3:19:51,  1.54it/s]

{'loss': 0.1657, 'grad_norm': 1.8242889642715454, 'learning_rate': 1.4517854333805255e-05, 'epoch': 0.82}


 27%|██▋       | 6980/25428 [1:15:52<3:17:42,  1.56it/s]

{'loss': 0.164, 'grad_norm': 1.2528704404830933, 'learning_rate': 1.4509988988516596e-05, 'epoch': 0.82}


 27%|██▋       | 6990/25428 [1:15:58<3:16:32,  1.56it/s]

{'loss': 0.1822, 'grad_norm': 1.2073243856430054, 'learning_rate': 1.4502123643227939e-05, 'epoch': 0.82}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1454, 'grad_norm': 0.9110001921653748, 'learning_rate': 1.4494258297939282e-05, 'epoch': 0.83}


 28%|██▊       | 7010/25428 [1:16:14<3:30:52,  1.46it/s]

{'loss': 0.1893, 'grad_norm': 1.2017678022384644, 'learning_rate': 1.4486392952650623e-05, 'epoch': 0.83}


 28%|██▊       | 7020/25428 [1:16:20<3:21:12,  1.52it/s]

{'loss': 0.1748, 'grad_norm': 1.1696102619171143, 'learning_rate': 1.4478527607361965e-05, 'epoch': 0.83}


 28%|██▊       | 7030/25428 [1:16:27<3:13:11,  1.59it/s]

{'loss': 0.1619, 'grad_norm': 1.1734790802001953, 'learning_rate': 1.4470662262073308e-05, 'epoch': 0.83}


 28%|██▊       | 7040/25428 [1:16:33<3:18:45,  1.54it/s]

{'loss': 0.1662, 'grad_norm': 1.2146143913269043, 'learning_rate': 1.4462796916784647e-05, 'epoch': 0.83}


 28%|██▊       | 7050/25428 [1:16:40<3:15:33,  1.57it/s]

{'loss': 0.1677, 'grad_norm': 1.3983172178268433, 'learning_rate': 1.445493157149599e-05, 'epoch': 0.83}


 28%|██▊       | 7060/25428 [1:16:46<3:20:19,  1.53it/s]

{'loss': 0.1659, 'grad_norm': 1.7568778991699219, 'learning_rate': 1.4447066226207331e-05, 'epoch': 0.83}


 28%|██▊       | 7070/25428 [1:16:53<3:17:49,  1.55it/s]

{'loss': 0.1782, 'grad_norm': 2.2258381843566895, 'learning_rate': 1.4439200880918674e-05, 'epoch': 0.83}


 28%|██▊       | 7080/25428 [1:16:59<3:16:48,  1.55it/s]

{'loss': 0.1779, 'grad_norm': 2.1229324340820312, 'learning_rate': 1.4431335535630016e-05, 'epoch': 0.84}


 28%|██▊       | 7090/25428 [1:17:06<3:18:31,  1.54it/s]

{'loss': 0.1843, 'grad_norm': 1.1177380084991455, 'learning_rate': 1.4423470190341357e-05, 'epoch': 0.84}


 28%|██▊       | 7100/25428 [1:17:12<3:17:57,  1.54it/s]

{'loss': 0.166, 'grad_norm': 2.198394775390625, 'learning_rate': 1.44156048450527e-05, 'epoch': 0.84}


 28%|██▊       | 7110/25428 [1:17:18<3:17:55,  1.54it/s]

{'loss': 0.1657, 'grad_norm': 1.381566047668457, 'learning_rate': 1.4407739499764041e-05, 'epoch': 0.84}


 28%|██▊       | 7120/25428 [1:17:25<3:15:01,  1.56it/s]

{'loss': 0.1701, 'grad_norm': 1.5961295366287231, 'learning_rate': 1.4399874154475382e-05, 'epoch': 0.84}


 28%|██▊       | 7130/25428 [1:17:31<3:19:20,  1.53it/s]

{'loss': 0.168, 'grad_norm': 2.472867012023926, 'learning_rate': 1.4392008809186723e-05, 'epoch': 0.84}


 28%|██▊       | 7140/25428 [1:17:38<3:19:07,  1.53it/s]

{'loss': 0.1452, 'grad_norm': 1.2041503190994263, 'learning_rate': 1.4384143463898066e-05, 'epoch': 0.84}


 28%|██▊       | 7150/25428 [1:17:44<3:15:51,  1.56it/s]

{'loss': 0.1691, 'grad_norm': 2.8172805309295654, 'learning_rate': 1.4376278118609408e-05, 'epoch': 0.84}


 28%|██▊       | 7160/25428 [1:17:51<3:14:51,  1.56it/s]

{'loss': 0.1596, 'grad_norm': 2.383967399597168, 'learning_rate': 1.436841277332075e-05, 'epoch': 0.84}


 28%|██▊       | 7170/25428 [1:17:57<3:18:56,  1.53it/s]

{'loss': 0.1578, 'grad_norm': 1.6032582521438599, 'learning_rate': 1.4360547428032092e-05, 'epoch': 0.85}


 28%|██▊       | 7180/25428 [1:18:04<3:14:07,  1.57it/s]

{'loss': 0.1938, 'grad_norm': 1.4490201473236084, 'learning_rate': 1.4352682082743435e-05, 'epoch': 0.85}


 28%|██▊       | 7190/25428 [1:18:10<3:17:53,  1.54it/s]

{'loss': 0.1602, 'grad_norm': 1.3010563850402832, 'learning_rate': 1.4344816737454776e-05, 'epoch': 0.85}


 28%|██▊       | 7200/25428 [1:18:17<3:13:52,  1.57it/s]

{'loss': 0.1791, 'grad_norm': 0.8273558020591736, 'learning_rate': 1.4336951392166117e-05, 'epoch': 0.85}


 28%|██▊       | 7210/25428 [1:18:23<3:18:41,  1.53it/s]

{'loss': 0.1489, 'grad_norm': 1.5911731719970703, 'learning_rate': 1.4329086046877458e-05, 'epoch': 0.85}


 28%|██▊       | 7220/25428 [1:18:30<3:17:46,  1.53it/s]

{'loss': 0.1397, 'grad_norm': 1.3460371494293213, 'learning_rate': 1.43212207015888e-05, 'epoch': 0.85}


 28%|██▊       | 7230/25428 [1:18:36<3:17:07,  1.54it/s]

{'loss': 0.1814, 'grad_norm': 1.5786895751953125, 'learning_rate': 1.4313355356300143e-05, 'epoch': 0.85}


 28%|██▊       | 7240/25428 [1:18:43<3:18:20,  1.53it/s]

{'loss': 0.1998, 'grad_norm': 2.2925686836242676, 'learning_rate': 1.4305490011011484e-05, 'epoch': 0.85}


 29%|██▊       | 7250/25428 [1:18:49<3:18:46,  1.52it/s]

{'loss': 0.1642, 'grad_norm': 1.099815845489502, 'learning_rate': 1.4297624665722827e-05, 'epoch': 0.86}


 29%|██▊       | 7260/25428 [1:18:56<3:16:07,  1.54it/s]

{'loss': 0.1465, 'grad_norm': 0.8825260400772095, 'learning_rate': 1.428975932043417e-05, 'epoch': 0.86}


 29%|██▊       | 7270/25428 [1:19:02<3:16:26,  1.54it/s]

{'loss': 0.1613, 'grad_norm': 1.3871163129806519, 'learning_rate': 1.428189397514551e-05, 'epoch': 0.86}


 29%|██▊       | 7280/25428 [1:19:09<3:15:07,  1.55it/s]

{'loss': 0.1635, 'grad_norm': 1.364776611328125, 'learning_rate': 1.4274028629856851e-05, 'epoch': 0.86}


 29%|██▊       | 7290/25428 [1:19:15<3:20:00,  1.51it/s]

{'loss': 0.1723, 'grad_norm': 1.011671781539917, 'learning_rate': 1.4266163284568192e-05, 'epoch': 0.86}


 29%|██▊       | 7300/25428 [1:19:22<3:12:39,  1.57it/s]

{'loss': 0.2101, 'grad_norm': 2.5616042613983154, 'learning_rate': 1.4258297939279535e-05, 'epoch': 0.86}


 29%|██▊       | 7310/25428 [1:19:28<3:15:32,  1.54it/s]

{'loss': 0.1721, 'grad_norm': 1.767744541168213, 'learning_rate': 1.4250432593990878e-05, 'epoch': 0.86}


 29%|██▉       | 7320/25428 [1:19:35<3:12:30,  1.57it/s]

{'loss': 0.171, 'grad_norm': 2.074249267578125, 'learning_rate': 1.4242567248702219e-05, 'epoch': 0.86}


 29%|██▉       | 7330/25428 [1:19:41<3:16:56,  1.53it/s]

{'loss': 0.1559, 'grad_norm': 1.091856837272644, 'learning_rate': 1.4234701903413561e-05, 'epoch': 0.86}


 29%|██▉       | 7340/25428 [1:19:48<3:14:50,  1.55it/s]

{'loss': 0.146, 'grad_norm': 0.962980329990387, 'learning_rate': 1.4226836558124904e-05, 'epoch': 0.87}


 29%|██▉       | 7350/25428 [1:19:54<3:15:08,  1.54it/s]

{'loss': 0.1735, 'grad_norm': 1.291790246963501, 'learning_rate': 1.4218971212836245e-05, 'epoch': 0.87}


 29%|██▉       | 7360/25428 [1:20:01<3:17:45,  1.52it/s]

{'loss': 0.2015, 'grad_norm': 1.4575923681259155, 'learning_rate': 1.4211105867547586e-05, 'epoch': 0.87}


 29%|██▉       | 7370/25428 [1:20:07<3:16:02,  1.54it/s]

{'loss': 0.1592, 'grad_norm': 1.164264440536499, 'learning_rate': 1.4203240522258927e-05, 'epoch': 0.87}


 29%|██▉       | 7380/25428 [1:20:13<3:09:54,  1.58it/s]

{'loss': 0.1491, 'grad_norm': 2.44416880607605, 'learning_rate': 1.419537517697027e-05, 'epoch': 0.87}


 29%|██▉       | 7390/25428 [1:20:20<3:16:27,  1.53it/s]

{'loss': 0.162, 'grad_norm': 1.4548087120056152, 'learning_rate': 1.4187509831681612e-05, 'epoch': 0.87}


 29%|██▉       | 7400/25428 [1:20:26<3:17:08,  1.52it/s]

{'loss': 0.1506, 'grad_norm': 1.1864476203918457, 'learning_rate': 1.4179644486392953e-05, 'epoch': 0.87}


 29%|██▉       | 7410/25428 [1:20:33<3:16:53,  1.53it/s]

{'loss': 0.1519, 'grad_norm': 2.38643217086792, 'learning_rate': 1.4171779141104296e-05, 'epoch': 0.87}


 29%|██▉       | 7420/25428 [1:20:40<3:13:21,  1.55it/s]

{'loss': 0.1821, 'grad_norm': 1.3486969470977783, 'learning_rate': 1.4163913795815639e-05, 'epoch': 0.88}


 29%|██▉       | 7430/25428 [1:20:46<3:15:54,  1.53it/s]

{'loss': 0.2108, 'grad_norm': 2.04638409614563, 'learning_rate': 1.415604845052698e-05, 'epoch': 0.88}


 29%|██▉       | 7440/25428 [1:20:52<3:11:09,  1.57it/s]

{'loss': 0.1396, 'grad_norm': 1.0556963682174683, 'learning_rate': 1.414818310523832e-05, 'epoch': 0.88}


 29%|██▉       | 7450/25428 [1:20:59<3:12:33,  1.56it/s]

{'loss': 0.1936, 'grad_norm': 1.9603101015090942, 'learning_rate': 1.4140317759949662e-05, 'epoch': 0.88}


 29%|██▉       | 7460/25428 [1:21:05<3:11:36,  1.56it/s]

{'loss': 0.1662, 'grad_norm': 2.0317695140838623, 'learning_rate': 1.4132452414661004e-05, 'epoch': 0.88}


 29%|██▉       | 7470/25428 [1:21:12<3:12:14,  1.56it/s]

{'loss': 0.171, 'grad_norm': 1.1003862619400024, 'learning_rate': 1.4124587069372347e-05, 'epoch': 0.88}


 29%|██▉       | 7480/25428 [1:21:19<3:16:43,  1.52it/s]

{'loss': 0.1689, 'grad_norm': 1.0428359508514404, 'learning_rate': 1.4116721724083688e-05, 'epoch': 0.88}


 29%|██▉       | 7490/25428 [1:21:25<3:15:11,  1.53it/s]

{'loss': 0.1639, 'grad_norm': 1.7852675914764404, 'learning_rate': 1.410885637879503e-05, 'epoch': 0.88}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1491, 'grad_norm': 1.7380157709121704, 'learning_rate': 1.4100991033506373e-05, 'epoch': 0.88}


 30%|██▉       | 7510/25428 [1:21:41<3:24:56,  1.46it/s]

{'loss': 0.1686, 'grad_norm': 0.8129540085792542, 'learning_rate': 1.4093125688217714e-05, 'epoch': 0.89}


 30%|██▉       | 7520/25428 [1:21:47<3:13:42,  1.54it/s]

{'loss': 0.152, 'grad_norm': 0.8442876935005188, 'learning_rate': 1.4085260342929055e-05, 'epoch': 0.89}


 30%|██▉       | 7530/25428 [1:21:53<3:08:46,  1.58it/s]

{'loss': 0.1698, 'grad_norm': 1.1181213855743408, 'learning_rate': 1.4077394997640396e-05, 'epoch': 0.89}


 30%|██▉       | 7540/25428 [1:22:00<3:13:46,  1.54it/s]

{'loss': 0.1648, 'grad_norm': 1.1733012199401855, 'learning_rate': 1.4069529652351739e-05, 'epoch': 0.89}


 30%|██▉       | 7550/25428 [1:22:06<3:12:22,  1.55it/s]

{'loss': 0.1746, 'grad_norm': 1.7208361625671387, 'learning_rate': 1.4061664307063082e-05, 'epoch': 0.89}


 30%|██▉       | 7560/25428 [1:22:13<3:12:25,  1.55it/s]

{'loss': 0.188, 'grad_norm': 1.1751865148544312, 'learning_rate': 1.4053798961774423e-05, 'epoch': 0.89}


 30%|██▉       | 7570/25428 [1:22:19<3:13:51,  1.54it/s]

{'loss': 0.1574, 'grad_norm': 1.5777349472045898, 'learning_rate': 1.4045933616485765e-05, 'epoch': 0.89}


 30%|██▉       | 7580/25428 [1:22:26<3:08:57,  1.57it/s]

{'loss': 0.1516, 'grad_norm': 0.911089301109314, 'learning_rate': 1.4038068271197108e-05, 'epoch': 0.89}


 30%|██▉       | 7590/25428 [1:22:32<3:07:49,  1.58it/s]

{'loss': 0.1409, 'grad_norm': 0.9479108452796936, 'learning_rate': 1.4030202925908449e-05, 'epoch': 0.9}


 30%|██▉       | 7600/25428 [1:22:38<3:12:49,  1.54it/s]

{'loss': 0.1636, 'grad_norm': 0.973383367061615, 'learning_rate': 1.4022337580619792e-05, 'epoch': 0.9}


 30%|██▉       | 7610/25428 [1:22:45<3:10:16,  1.56it/s]

{'loss': 0.1571, 'grad_norm': 1.5326870679855347, 'learning_rate': 1.4014472235331131e-05, 'epoch': 0.9}


 30%|██▉       | 7620/25428 [1:22:51<3:11:08,  1.55it/s]

{'loss': 0.1799, 'grad_norm': 1.7636432647705078, 'learning_rate': 1.4006606890042474e-05, 'epoch': 0.9}


 30%|███       | 7630/25428 [1:22:58<3:12:09,  1.54it/s]

{'loss': 0.1372, 'grad_norm': 1.9757875204086304, 'learning_rate': 1.3998741544753815e-05, 'epoch': 0.9}


 30%|███       | 7640/25428 [1:23:04<3:06:08,  1.59it/s]

{'loss': 0.1628, 'grad_norm': 2.0830843448638916, 'learning_rate': 1.3990876199465157e-05, 'epoch': 0.9}


 30%|███       | 7650/25428 [1:23:10<3:08:50,  1.57it/s]

{'loss': 0.1413, 'grad_norm': 1.0040273666381836, 'learning_rate': 1.39830108541765e-05, 'epoch': 0.9}


 30%|███       | 7660/25428 [1:23:17<3:12:19,  1.54it/s]

{'loss': 0.1725, 'grad_norm': 1.5172011852264404, 'learning_rate': 1.3975145508887841e-05, 'epoch': 0.9}


 30%|███       | 7670/25428 [1:23:23<3:10:52,  1.55it/s]

{'loss': 0.1465, 'grad_norm': 2.1190781593322754, 'learning_rate': 1.3967280163599184e-05, 'epoch': 0.9}


 30%|███       | 7680/25428 [1:23:30<3:10:48,  1.55it/s]

{'loss': 0.2127, 'grad_norm': 2.5478641986846924, 'learning_rate': 1.3959414818310526e-05, 'epoch': 0.91}


 30%|███       | 7690/25428 [1:23:36<3:08:12,  1.57it/s]

{'loss': 0.1735, 'grad_norm': 1.3802424669265747, 'learning_rate': 1.3951549473021866e-05, 'epoch': 0.91}


 30%|███       | 7700/25428 [1:23:43<3:04:58,  1.60it/s]

{'loss': 0.1523, 'grad_norm': 0.9590073823928833, 'learning_rate': 1.3943684127733208e-05, 'epoch': 0.91}


 30%|███       | 7710/25428 [1:23:49<3:11:53,  1.54it/s]

{'loss': 0.1279, 'grad_norm': 1.1567496061325073, 'learning_rate': 1.393581878244455e-05, 'epoch': 0.91}


 30%|███       | 7720/25428 [1:23:56<3:12:27,  1.53it/s]

{'loss': 0.1813, 'grad_norm': 1.1242222785949707, 'learning_rate': 1.3927953437155892e-05, 'epoch': 0.91}


 30%|███       | 7730/25428 [1:24:02<3:06:18,  1.58it/s]

{'loss': 0.1299, 'grad_norm': 1.5706300735473633, 'learning_rate': 1.3920088091867235e-05, 'epoch': 0.91}


 30%|███       | 7740/25428 [1:24:08<3:06:50,  1.58it/s]

{'loss': 0.1471, 'grad_norm': 1.3374444246292114, 'learning_rate': 1.3912222746578576e-05, 'epoch': 0.91}


 30%|███       | 7750/25428 [1:24:15<3:07:32,  1.57it/s]

{'loss': 0.1656, 'grad_norm': 1.0730420351028442, 'learning_rate': 1.3904357401289918e-05, 'epoch': 0.91}


 31%|███       | 7760/25428 [1:24:21<3:12:13,  1.53it/s]

{'loss': 0.189, 'grad_norm': 1.0751240253448486, 'learning_rate': 1.3896492056001261e-05, 'epoch': 0.92}


 31%|███       | 7770/25428 [1:24:28<3:11:55,  1.53it/s]

{'loss': 0.172, 'grad_norm': 1.5649478435516357, 'learning_rate': 1.38886267107126e-05, 'epoch': 0.92}


 31%|███       | 7780/25428 [1:24:34<3:08:35,  1.56it/s]

{'loss': 0.1558, 'grad_norm': 1.2564557790756226, 'learning_rate': 1.3880761365423943e-05, 'epoch': 0.92}


 31%|███       | 7790/25428 [1:24:41<3:07:53,  1.56it/s]

{'loss': 0.2091, 'grad_norm': 1.1843671798706055, 'learning_rate': 1.3872896020135284e-05, 'epoch': 0.92}


 31%|███       | 7800/25428 [1:24:47<3:10:52,  1.54it/s]

{'loss': 0.169, 'grad_norm': 2.203284502029419, 'learning_rate': 1.3865030674846627e-05, 'epoch': 0.92}


 31%|███       | 7810/25428 [1:24:53<3:08:17,  1.56it/s]

{'loss': 0.1556, 'grad_norm': 1.649010181427002, 'learning_rate': 1.385716532955797e-05, 'epoch': 0.92}


 31%|███       | 7820/25428 [1:25:00<3:11:22,  1.53it/s]

{'loss': 0.1599, 'grad_norm': 1.1429568529129028, 'learning_rate': 1.384929998426931e-05, 'epoch': 0.92}


 31%|███       | 7830/25428 [1:25:06<3:05:20,  1.58it/s]

{'loss': 0.1526, 'grad_norm': 0.9063065648078918, 'learning_rate': 1.3841434638980653e-05, 'epoch': 0.92}


 31%|███       | 7840/25428 [1:25:13<3:09:21,  1.55it/s]

{'loss': 0.1686, 'grad_norm': 2.108600616455078, 'learning_rate': 1.3833569293691996e-05, 'epoch': 0.92}


 31%|███       | 7850/25428 [1:25:19<3:08:25,  1.55it/s]

{'loss': 0.184, 'grad_norm': 0.9091483354568481, 'learning_rate': 1.3825703948403335e-05, 'epoch': 0.93}


 31%|███       | 7860/25428 [1:25:25<3:06:02,  1.57it/s]

{'loss': 0.1743, 'grad_norm': 2.6023075580596924, 'learning_rate': 1.3817838603114678e-05, 'epoch': 0.93}


 31%|███       | 7870/25428 [1:25:32<3:10:41,  1.53it/s]

{'loss': 0.142, 'grad_norm': 0.950297474861145, 'learning_rate': 1.3809973257826019e-05, 'epoch': 0.93}


 31%|███       | 7880/25428 [1:25:38<3:06:12,  1.57it/s]

{'loss': 0.1578, 'grad_norm': 1.4667627811431885, 'learning_rate': 1.3802107912537361e-05, 'epoch': 0.93}


 31%|███       | 7890/25428 [1:25:45<3:08:53,  1.55it/s]

{'loss': 0.1747, 'grad_norm': 1.5397969484329224, 'learning_rate': 1.3794242567248704e-05, 'epoch': 0.93}


 31%|███       | 7900/25428 [1:25:51<3:07:00,  1.56it/s]

{'loss': 0.1872, 'grad_norm': 1.2845702171325684, 'learning_rate': 1.3786377221960045e-05, 'epoch': 0.93}


 31%|███       | 7910/25428 [1:25:58<3:08:45,  1.55it/s]

{'loss': 0.147, 'grad_norm': 1.0781517028808594, 'learning_rate': 1.3778511876671388e-05, 'epoch': 0.93}


 31%|███       | 7920/25428 [1:26:04<3:08:20,  1.55it/s]

{'loss': 0.2338, 'grad_norm': 3.6645545959472656, 'learning_rate': 1.377064653138273e-05, 'epoch': 0.93}


 31%|███       | 7930/25428 [1:26:10<3:05:31,  1.57it/s]

{'loss': 0.1875, 'grad_norm': 2.4654324054718018, 'learning_rate': 1.376278118609407e-05, 'epoch': 0.94}


 31%|███       | 7940/25428 [1:26:17<3:07:20,  1.56it/s]

{'loss': 0.187, 'grad_norm': 1.4974156618118286, 'learning_rate': 1.3754915840805412e-05, 'epoch': 0.94}


 31%|███▏      | 7950/25428 [1:26:23<3:09:01,  1.54it/s]

{'loss': 0.1948, 'grad_norm': 1.389230728149414, 'learning_rate': 1.3747050495516753e-05, 'epoch': 0.94}


 31%|███▏      | 7960/25428 [1:26:30<3:09:34,  1.54it/s]

{'loss': 0.1285, 'grad_norm': 1.1582376956939697, 'learning_rate': 1.3739185150228096e-05, 'epoch': 0.94}


 31%|███▏      | 7970/25428 [1:26:36<3:07:01,  1.56it/s]

{'loss': 0.1704, 'grad_norm': 1.568416953086853, 'learning_rate': 1.3731319804939439e-05, 'epoch': 0.94}


 31%|███▏      | 7980/25428 [1:26:43<3:05:14,  1.57it/s]

{'loss': 0.1659, 'grad_norm': 1.1757400035858154, 'learning_rate': 1.372345445965078e-05, 'epoch': 0.94}


 31%|███▏      | 7990/25428 [1:26:49<3:06:50,  1.56it/s]

{'loss': 0.1449, 'grad_norm': 0.8381808400154114, 'learning_rate': 1.3715589114362122e-05, 'epoch': 0.94}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1794, 'grad_norm': 3.018547534942627, 'learning_rate': 1.3707723769073465e-05, 'epoch': 0.94}


 32%|███▏      | 8010/25428 [1:27:05<3:17:46,  1.47it/s]

{'loss': 0.1234, 'grad_norm': 1.1840699911117554, 'learning_rate': 1.3699858423784804e-05, 'epoch': 0.95}


 32%|███▏      | 8020/25428 [1:27:11<3:08:24,  1.54it/s]

{'loss': 0.1531, 'grad_norm': 1.3227410316467285, 'learning_rate': 1.3691993078496147e-05, 'epoch': 0.95}


 32%|███▏      | 8030/25428 [1:27:18<3:09:36,  1.53it/s]

{'loss': 0.1711, 'grad_norm': 1.2386881113052368, 'learning_rate': 1.3684127733207488e-05, 'epoch': 0.95}


 32%|███▏      | 8040/25428 [1:27:24<3:05:57,  1.56it/s]

{'loss': 0.1922, 'grad_norm': 1.0130350589752197, 'learning_rate': 1.367626238791883e-05, 'epoch': 0.95}


 32%|███▏      | 8050/25428 [1:27:31<3:06:19,  1.55it/s]

{'loss': 0.1983, 'grad_norm': 1.5845298767089844, 'learning_rate': 1.3668397042630173e-05, 'epoch': 0.95}


 32%|███▏      | 8060/25428 [1:27:37<3:06:53,  1.55it/s]

{'loss': 0.1701, 'grad_norm': 1.2189708948135376, 'learning_rate': 1.3660531697341514e-05, 'epoch': 0.95}


 32%|███▏      | 8070/25428 [1:27:44<3:09:23,  1.53it/s]

{'loss': 0.1638, 'grad_norm': 1.4090725183486938, 'learning_rate': 1.3652666352052857e-05, 'epoch': 0.95}


 32%|███▏      | 8080/25428 [1:27:50<3:09:13,  1.53it/s]

{'loss': 0.1375, 'grad_norm': 1.1285661458969116, 'learning_rate': 1.36448010067642e-05, 'epoch': 0.95}


 32%|███▏      | 8090/25428 [1:27:57<3:09:09,  1.53it/s]

{'loss': 0.1519, 'grad_norm': 0.9875717759132385, 'learning_rate': 1.3636935661475539e-05, 'epoch': 0.95}


 32%|███▏      | 8100/25428 [1:28:03<3:08:25,  1.53it/s]

{'loss': 0.1953, 'grad_norm': 2.123549699783325, 'learning_rate': 1.3629070316186882e-05, 'epoch': 0.96}


 32%|███▏      | 8110/25428 [1:28:10<3:03:55,  1.57it/s]

{'loss': 0.1806, 'grad_norm': 1.1828124523162842, 'learning_rate': 1.3621204970898223e-05, 'epoch': 0.96}


 32%|███▏      | 8120/25428 [1:28:16<3:08:45,  1.53it/s]

{'loss': 0.1693, 'grad_norm': 1.4599440097808838, 'learning_rate': 1.3613339625609565e-05, 'epoch': 0.96}


 32%|███▏      | 8130/25428 [1:28:23<3:05:46,  1.55it/s]

{'loss': 0.12, 'grad_norm': 0.9304269552230835, 'learning_rate': 1.3605474280320908e-05, 'epoch': 0.96}


 32%|███▏      | 8140/25428 [1:28:29<3:08:57,  1.52it/s]

{'loss': 0.157, 'grad_norm': 1.0937494039535522, 'learning_rate': 1.3597608935032249e-05, 'epoch': 0.96}


 32%|███▏      | 8150/25428 [1:28:36<3:05:04,  1.56it/s]

{'loss': 0.1408, 'grad_norm': 1.8845409154891968, 'learning_rate': 1.3589743589743592e-05, 'epoch': 0.96}


 32%|███▏      | 8160/25428 [1:28:42<3:07:23,  1.54it/s]

{'loss': 0.1938, 'grad_norm': 0.8213154673576355, 'learning_rate': 1.3581878244454933e-05, 'epoch': 0.96}


 32%|███▏      | 8170/25428 [1:28:49<3:05:14,  1.55it/s]

{'loss': 0.1508, 'grad_norm': 0.9390763640403748, 'learning_rate': 1.3574012899166274e-05, 'epoch': 0.96}


 32%|███▏      | 8180/25428 [1:28:55<3:07:46,  1.53it/s]

{'loss': 0.1348, 'grad_norm': 1.2112807035446167, 'learning_rate': 1.3566147553877615e-05, 'epoch': 0.97}


 32%|███▏      | 8190/25428 [1:29:02<3:03:01,  1.57it/s]

{'loss': 0.1789, 'grad_norm': 1.0439531803131104, 'learning_rate': 1.3558282208588957e-05, 'epoch': 0.97}


 32%|███▏      | 8200/25428 [1:29:08<3:05:28,  1.55it/s]

{'loss': 0.1818, 'grad_norm': 1.1030163764953613, 'learning_rate': 1.35504168633003e-05, 'epoch': 0.97}


 32%|███▏      | 8210/25428 [1:29:15<3:03:23,  1.56it/s]

{'loss': 0.1686, 'grad_norm': 1.7358042001724243, 'learning_rate': 1.3542551518011641e-05, 'epoch': 0.97}


 32%|███▏      | 8220/25428 [1:29:21<3:06:12,  1.54it/s]

{'loss': 0.1641, 'grad_norm': 1.1698559522628784, 'learning_rate': 1.3534686172722984e-05, 'epoch': 0.97}


 32%|███▏      | 8230/25428 [1:29:28<3:08:49,  1.52it/s]

{'loss': 0.1581, 'grad_norm': 1.9864734411239624, 'learning_rate': 1.3526820827434326e-05, 'epoch': 0.97}


 32%|███▏      | 8240/25428 [1:29:35<3:05:38,  1.54it/s]

{'loss': 0.143, 'grad_norm': 0.9451664686203003, 'learning_rate': 1.3518955482145667e-05, 'epoch': 0.97}


 32%|███▏      | 8250/25428 [1:29:41<3:03:48,  1.56it/s]

{'loss': 0.1482, 'grad_norm': 1.4027951955795288, 'learning_rate': 1.351109013685701e-05, 'epoch': 0.97}


 32%|███▏      | 8260/25428 [1:29:48<3:06:42,  1.53it/s]

{'loss': 0.1831, 'grad_norm': 1.6593759059906006, 'learning_rate': 1.350322479156835e-05, 'epoch': 0.97}


 33%|███▎      | 8270/25428 [1:29:54<3:04:58,  1.55it/s]

{'loss': 0.1766, 'grad_norm': 1.3031257390975952, 'learning_rate': 1.3495359446279692e-05, 'epoch': 0.98}


 33%|███▎      | 8280/25428 [1:30:00<3:02:15,  1.57it/s]

{'loss': 0.1622, 'grad_norm': 1.276862382888794, 'learning_rate': 1.3487494100991035e-05, 'epoch': 0.98}


 33%|███▎      | 8290/25428 [1:30:07<2:58:56,  1.60it/s]

{'loss': 0.1395, 'grad_norm': 1.3322144746780396, 'learning_rate': 1.3479628755702376e-05, 'epoch': 0.98}


 33%|███▎      | 8300/25428 [1:30:13<3:00:10,  1.58it/s]

{'loss': 0.1789, 'grad_norm': 1.3461929559707642, 'learning_rate': 1.3471763410413718e-05, 'epoch': 0.98}


 33%|███▎      | 8310/25428 [1:30:19<3:03:36,  1.55it/s]

{'loss': 0.1663, 'grad_norm': 1.0920966863632202, 'learning_rate': 1.3463898065125061e-05, 'epoch': 0.98}


 33%|███▎      | 8320/25428 [1:30:26<2:58:31,  1.60it/s]

{'loss': 0.1667, 'grad_norm': 1.6632976531982422, 'learning_rate': 1.3456032719836402e-05, 'epoch': 0.98}


 33%|███▎      | 8330/25428 [1:30:32<3:04:59,  1.54it/s]

{'loss': 0.1297, 'grad_norm': 2.2575008869171143, 'learning_rate': 1.3448167374547745e-05, 'epoch': 0.98}


 33%|███▎      | 8340/25428 [1:30:39<3:05:46,  1.53it/s]

{'loss': 0.1477, 'grad_norm': 1.198792576789856, 'learning_rate': 1.3440302029259084e-05, 'epoch': 0.98}


 33%|███▎      | 8350/25428 [1:30:45<2:57:34,  1.60it/s]

{'loss': 0.1546, 'grad_norm': 1.9518784284591675, 'learning_rate': 1.3432436683970427e-05, 'epoch': 0.99}


 33%|███▎      | 8360/25428 [1:30:51<3:01:46,  1.56it/s]

{'loss': 0.1397, 'grad_norm': 1.2134162187576294, 'learning_rate': 1.342457133868177e-05, 'epoch': 0.99}


 33%|███▎      | 8370/25428 [1:30:58<2:59:21,  1.59it/s]

{'loss': 0.1523, 'grad_norm': 3.508713960647583, 'learning_rate': 1.341670599339311e-05, 'epoch': 0.99}


 33%|███▎      | 8380/25428 [1:31:04<3:00:17,  1.58it/s]

{'loss': 0.2234, 'grad_norm': 1.4678490161895752, 'learning_rate': 1.3408840648104453e-05, 'epoch': 0.99}


 33%|███▎      | 8390/25428 [1:31:10<3:04:42,  1.54it/s]

{'loss': 0.1677, 'grad_norm': 1.0384572744369507, 'learning_rate': 1.3400975302815796e-05, 'epoch': 0.99}


 33%|███▎      | 8400/25428 [1:31:17<3:02:14,  1.56it/s]

{'loss': 0.1756, 'grad_norm': 1.5186864137649536, 'learning_rate': 1.3393109957527137e-05, 'epoch': 0.99}


 33%|███▎      | 8410/25428 [1:31:23<3:05:18,  1.53it/s]

{'loss': 0.1381, 'grad_norm': 1.7270158529281616, 'learning_rate': 1.338524461223848e-05, 'epoch': 0.99}


 33%|███▎      | 8420/25428 [1:31:30<3:01:24,  1.56it/s]

{'loss': 0.1569, 'grad_norm': 1.75946044921875, 'learning_rate': 1.3377379266949819e-05, 'epoch': 0.99}


 33%|███▎      | 8430/25428 [1:31:36<2:58:05,  1.59it/s]

{'loss': 0.1668, 'grad_norm': 1.5560096502304077, 'learning_rate': 1.3369513921661161e-05, 'epoch': 0.99}


 33%|███▎      | 8440/25428 [1:31:42<2:57:37,  1.59it/s]

{'loss': 0.1343, 'grad_norm': 1.5827836990356445, 'learning_rate': 1.3361648576372504e-05, 'epoch': 1.0}


 33%|███▎      | 8450/25428 [1:31:49<2:58:52,  1.58it/s]

{'loss': 0.136, 'grad_norm': 1.1749473810195923, 'learning_rate': 1.3353783231083845e-05, 'epoch': 1.0}


 33%|███▎      | 8460/25428 [1:31:55<3:00:36,  1.57it/s]

{'loss': 0.1351, 'grad_norm': 1.5063672065734863, 'learning_rate': 1.3345917885795188e-05, 'epoch': 1.0}


 33%|███▎      | 8470/25428 [1:32:01<3:02:46,  1.55it/s]

{'loss': 0.1635, 'grad_norm': 1.040945053100586, 'learning_rate': 1.333805254050653e-05, 'epoch': 1.0}


                                                        
 33%|███▎      | 8476/25428 [1:51:47<3:00:29,  1.57it/s]

{'eval_loss': 0.09268521517515182, 'eval_runtime': 1182.2907, 'eval_samples_per_second': 57.35, 'eval_steps_per_second': 7.169, 'epoch': 1.0}


 33%|███▎      | 8480/25428 [1:51:50<575:44:18, 122.30s/it] 

{'loss': 0.1328, 'grad_norm': 1.2225185632705688, 'learning_rate': 1.3330187195217871e-05, 'epoch': 1.0}


 33%|███▎      | 8490/25428 [1:51:56<19:09:23,  4.07s/it]  

{'loss': 0.162, 'grad_norm': 1.4982666969299316, 'learning_rate': 1.3322321849929214e-05, 'epoch': 1.0}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1558, 'grad_norm': 1.3331525325775146, 'learning_rate': 1.3314456504640553e-05, 'epoch': 1.0}


 33%|███▎      | 8510/25428 [1:52:12<3:04:52,  1.53it/s]

{'loss': 0.1184, 'grad_norm': 0.9355366826057434, 'learning_rate': 1.3306591159351896e-05, 'epoch': 1.0}


 34%|███▎      | 8520/25428 [1:52:18<2:53:45,  1.62it/s]

{'loss': 0.1783, 'grad_norm': 2.2527871131896973, 'learning_rate': 1.3298725814063239e-05, 'epoch': 1.01}


 34%|███▎      | 8530/25428 [1:52:24<2:53:00,  1.63it/s]

{'loss': 0.1672, 'grad_norm': 1.1427289247512817, 'learning_rate': 1.329086046877458e-05, 'epoch': 1.01}


 34%|███▎      | 8540/25428 [1:52:31<2:54:38,  1.61it/s]

{'loss': 0.1334, 'grad_norm': 1.699608564376831, 'learning_rate': 1.3282995123485922e-05, 'epoch': 1.01}


 34%|███▎      | 8550/25428 [1:52:37<2:53:39,  1.62it/s]

{'loss': 0.0996, 'grad_norm': 0.9586929678916931, 'learning_rate': 1.3275129778197265e-05, 'epoch': 1.01}


 34%|███▎      | 8560/25428 [1:52:43<2:56:28,  1.59it/s]

{'loss': 0.1001, 'grad_norm': 1.0958844423294067, 'learning_rate': 1.3267264432908606e-05, 'epoch': 1.01}


 34%|███▎      | 8570/25428 [1:52:50<2:57:13,  1.59it/s]

{'loss': 0.0937, 'grad_norm': 0.745423436164856, 'learning_rate': 1.3259399087619949e-05, 'epoch': 1.01}


 34%|███▎      | 8580/25428 [1:52:56<2:54:47,  1.61it/s]

{'loss': 0.153, 'grad_norm': 1.9291422367095947, 'learning_rate': 1.3251533742331288e-05, 'epoch': 1.01}


 34%|███▍      | 8590/25428 [1:53:02<2:54:24,  1.61it/s]

{'loss': 0.1289, 'grad_norm': 0.8992966413497925, 'learning_rate': 1.324366839704263e-05, 'epoch': 1.01}


 34%|███▍      | 8600/25428 [1:53:08<2:53:15,  1.62it/s]

{'loss': 0.1162, 'grad_norm': 0.9701732397079468, 'learning_rate': 1.3235803051753973e-05, 'epoch': 1.01}


 34%|███▍      | 8610/25428 [1:53:15<2:58:49,  1.57it/s]

{'loss': 0.1218, 'grad_norm': 0.7692265510559082, 'learning_rate': 1.3227937706465314e-05, 'epoch': 1.02}


 34%|███▍      | 8620/25428 [1:53:21<2:53:38,  1.61it/s]

{'loss': 0.1421, 'grad_norm': 1.050323247909546, 'learning_rate': 1.3220072361176657e-05, 'epoch': 1.02}


 34%|███▍      | 8630/25428 [1:53:27<2:52:19,  1.62it/s]

{'loss': 0.1253, 'grad_norm': 0.9461706876754761, 'learning_rate': 1.3212207015888e-05, 'epoch': 1.02}


 34%|███▍      | 8640/25428 [1:53:33<2:55:00,  1.60it/s]

{'loss': 0.1201, 'grad_norm': 1.3164509534835815, 'learning_rate': 1.320434167059934e-05, 'epoch': 1.02}


 34%|███▍      | 8650/25428 [1:53:40<2:51:53,  1.63it/s]

{'loss': 0.1615, 'grad_norm': 1.625434398651123, 'learning_rate': 1.3196476325310683e-05, 'epoch': 1.02}


 34%|███▍      | 8660/25428 [1:53:46<2:54:16,  1.60it/s]

{'loss': 0.182, 'grad_norm': 2.3644495010375977, 'learning_rate': 1.3188610980022023e-05, 'epoch': 1.02}


 34%|███▍      | 8670/25428 [1:53:52<2:54:28,  1.60it/s]

{'loss': 0.1175, 'grad_norm': 1.1079548597335815, 'learning_rate': 1.3180745634733365e-05, 'epoch': 1.02}


 34%|███▍      | 8680/25428 [1:53:58<2:54:58,  1.60it/s]

{'loss': 0.1322, 'grad_norm': 2.301868438720703, 'learning_rate': 1.3172880289444706e-05, 'epoch': 1.02}


 34%|███▍      | 8690/25428 [1:54:04<2:51:04,  1.63it/s]

{'loss': 0.1337, 'grad_norm': 2.3732290267944336, 'learning_rate': 1.3165014944156049e-05, 'epoch': 1.03}


 34%|███▍      | 8700/25428 [1:54:11<2:53:51,  1.60it/s]

{'loss': 0.102, 'grad_norm': 1.5358282327651978, 'learning_rate': 1.3157149598867392e-05, 'epoch': 1.03}


 34%|███▍      | 8710/25428 [1:54:17<2:54:26,  1.60it/s]

{'loss': 0.1346, 'grad_norm': 1.3500101566314697, 'learning_rate': 1.3149284253578733e-05, 'epoch': 1.03}


 34%|███▍      | 8720/25428 [1:54:23<2:54:22,  1.60it/s]

{'loss': 0.1143, 'grad_norm': 1.0868834257125854, 'learning_rate': 1.3141418908290075e-05, 'epoch': 1.03}


 34%|███▍      | 8730/25428 [1:54:30<2:56:30,  1.58it/s]

{'loss': 0.1483, 'grad_norm': 1.1509233713150024, 'learning_rate': 1.3133553563001418e-05, 'epoch': 1.03}


 34%|███▍      | 8740/25428 [1:54:36<2:54:46,  1.59it/s]

{'loss': 0.116, 'grad_norm': 1.4297245740890503, 'learning_rate': 1.3125688217712757e-05, 'epoch': 1.03}


 34%|███▍      | 8750/25428 [1:54:42<2:54:43,  1.59it/s]

{'loss': 0.1143, 'grad_norm': 1.1238548755645752, 'learning_rate': 1.31178228724241e-05, 'epoch': 1.03}


 34%|███▍      | 8760/25428 [1:54:48<2:52:23,  1.61it/s]

{'loss': 0.1353, 'grad_norm': 2.6910834312438965, 'learning_rate': 1.3109957527135441e-05, 'epoch': 1.03}


 34%|███▍      | 8770/25428 [1:54:55<2:51:12,  1.62it/s]

{'loss': 0.1458, 'grad_norm': 1.5524425506591797, 'learning_rate': 1.3102092181846784e-05, 'epoch': 1.03}


 35%|███▍      | 8780/25428 [1:55:01<2:52:59,  1.60it/s]

{'loss': 0.1441, 'grad_norm': 1.120293140411377, 'learning_rate': 1.3094226836558126e-05, 'epoch': 1.04}


 35%|███▍      | 8790/25428 [1:55:07<2:53:33,  1.60it/s]

{'loss': 0.1336, 'grad_norm': 1.3953579664230347, 'learning_rate': 1.3086361491269467e-05, 'epoch': 1.04}


 35%|███▍      | 8800/25428 [1:55:13<2:53:15,  1.60it/s]

{'loss': 0.1034, 'grad_norm': 0.9718558192253113, 'learning_rate': 1.307849614598081e-05, 'epoch': 1.04}


 35%|███▍      | 8810/25428 [1:55:20<2:50:55,  1.62it/s]

{'loss': 0.1524, 'grad_norm': 2.464390516281128, 'learning_rate': 1.3070630800692153e-05, 'epoch': 1.04}


 35%|███▍      | 8820/25428 [1:55:26<2:54:08,  1.59it/s]

{'loss': 0.123, 'grad_norm': 0.7842265367507935, 'learning_rate': 1.3062765455403494e-05, 'epoch': 1.04}


 35%|███▍      | 8830/25428 [1:55:32<2:51:17,  1.62it/s]

{'loss': 0.122, 'grad_norm': 1.3571752309799194, 'learning_rate': 1.3054900110114835e-05, 'epoch': 1.04}


 35%|███▍      | 8840/25428 [1:55:39<2:57:58,  1.55it/s]

{'loss': 0.117, 'grad_norm': 1.0816689729690552, 'learning_rate': 1.3047034764826176e-05, 'epoch': 1.04}


 35%|███▍      | 8850/25428 [1:55:45<2:52:08,  1.60it/s]

{'loss': 0.1062, 'grad_norm': 0.9587883949279785, 'learning_rate': 1.3039169419537518e-05, 'epoch': 1.04}


 35%|███▍      | 8860/25428 [1:55:51<2:51:41,  1.61it/s]

{'loss': 0.1313, 'grad_norm': 1.4261208772659302, 'learning_rate': 1.3031304074248861e-05, 'epoch': 1.05}


 35%|███▍      | 8870/25428 [1:55:57<2:56:06,  1.57it/s]

{'loss': 0.1503, 'grad_norm': 2.938154697418213, 'learning_rate': 1.3023438728960202e-05, 'epoch': 1.05}


 35%|███▍      | 8880/25428 [1:56:04<2:51:08,  1.61it/s]

{'loss': 0.1328, 'grad_norm': 1.4122339487075806, 'learning_rate': 1.3015573383671545e-05, 'epoch': 1.05}


 35%|███▍      | 8890/25428 [1:56:10<2:53:21,  1.59it/s]

{'loss': 0.1215, 'grad_norm': 1.4772661924362183, 'learning_rate': 1.3007708038382887e-05, 'epoch': 1.05}


 35%|███▌      | 8900/25428 [1:56:16<2:49:40,  1.62it/s]

{'loss': 0.1368, 'grad_norm': 1.5323216915130615, 'learning_rate': 1.2999842693094228e-05, 'epoch': 1.05}


 35%|███▌      | 8910/25428 [1:56:22<2:51:02,  1.61it/s]

{'loss': 0.1232, 'grad_norm': 1.226149320602417, 'learning_rate': 1.299197734780557e-05, 'epoch': 1.05}


 35%|███▌      | 8920/25428 [1:56:29<2:50:19,  1.62it/s]

{'loss': 0.101, 'grad_norm': 1.3338862657546997, 'learning_rate': 1.298411200251691e-05, 'epoch': 1.05}


 35%|███▌      | 8930/25428 [1:56:35<2:52:38,  1.59it/s]

{'loss': 0.1165, 'grad_norm': 1.0502368211746216, 'learning_rate': 1.2976246657228253e-05, 'epoch': 1.05}


 35%|███▌      | 8940/25428 [1:56:41<2:51:16,  1.60it/s]

{'loss': 0.1546, 'grad_norm': 0.8968656063079834, 'learning_rate': 1.2968381311939596e-05, 'epoch': 1.05}


 35%|███▌      | 8950/25428 [1:56:47<2:51:04,  1.61it/s]

{'loss': 0.1332, 'grad_norm': 0.9706090092658997, 'learning_rate': 1.2960515966650937e-05, 'epoch': 1.06}


 35%|███▌      | 8960/25428 [1:56:54<2:49:09,  1.62it/s]

{'loss': 0.1079, 'grad_norm': 4.331425666809082, 'learning_rate': 1.295265062136228e-05, 'epoch': 1.06}


 35%|███▌      | 8970/25428 [1:57:00<2:49:03,  1.62it/s]

{'loss': 0.1872, 'grad_norm': 3.975017786026001, 'learning_rate': 1.2944785276073622e-05, 'epoch': 1.06}


 35%|███▌      | 8980/25428 [1:57:06<2:55:13,  1.56it/s]

{'loss': 0.1389, 'grad_norm': 2.175997495651245, 'learning_rate': 1.2936919930784963e-05, 'epoch': 1.06}


 35%|███▌      | 8990/25428 [1:57:13<2:59:57,  1.52it/s]

{'loss': 0.1467, 'grad_norm': 1.0667232275009155, 'learning_rate': 1.2929054585496304e-05, 'epoch': 1.06}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1798, 'grad_norm': 1.5611605644226074, 'learning_rate': 1.2921189240207645e-05, 'epoch': 1.06}


 35%|███▌      | 9010/25428 [1:57:28<3:01:25,  1.51it/s]

{'loss': 0.1109, 'grad_norm': 1.4233605861663818, 'learning_rate': 1.2913323894918988e-05, 'epoch': 1.06}


 35%|███▌      | 9020/25428 [1:57:35<2:47:43,  1.63it/s]

{'loss': 0.1366, 'grad_norm': 1.1057922840118408, 'learning_rate': 1.290545854963033e-05, 'epoch': 1.06}


 36%|███▌      | 9030/25428 [1:57:41<2:47:42,  1.63it/s]

{'loss': 0.1473, 'grad_norm': 1.2046959400177002, 'learning_rate': 1.2897593204341671e-05, 'epoch': 1.07}


 36%|███▌      | 9040/25428 [1:57:47<2:49:06,  1.62it/s]

{'loss': 0.1326, 'grad_norm': 0.9688941240310669, 'learning_rate': 1.2889727859053014e-05, 'epoch': 1.07}


 36%|███▌      | 9050/25428 [1:57:53<2:49:58,  1.61it/s]

{'loss': 0.1321, 'grad_norm': 1.8463624715805054, 'learning_rate': 1.2881862513764357e-05, 'epoch': 1.07}


 36%|███▌      | 9060/25428 [1:57:59<2:51:39,  1.59it/s]

{'loss': 0.1318, 'grad_norm': 6.035231590270996, 'learning_rate': 1.2873997168475698e-05, 'epoch': 1.07}


 36%|███▌      | 9070/25428 [1:58:06<2:53:20,  1.57it/s]

{'loss': 0.1355, 'grad_norm': 1.2569398880004883, 'learning_rate': 1.2866131823187039e-05, 'epoch': 1.07}


 36%|███▌      | 9080/25428 [1:58:12<2:49:22,  1.61it/s]

{'loss': 0.1269, 'grad_norm': 2.546635389328003, 'learning_rate': 1.285826647789838e-05, 'epoch': 1.07}


 36%|███▌      | 9090/25428 [1:58:18<2:53:21,  1.57it/s]

{'loss': 0.1515, 'grad_norm': 1.3174623250961304, 'learning_rate': 1.2850401132609722e-05, 'epoch': 1.07}


 36%|███▌      | 9100/25428 [1:58:25<2:46:57,  1.63it/s]

{'loss': 0.1473, 'grad_norm': 1.3587446212768555, 'learning_rate': 1.2842535787321065e-05, 'epoch': 1.07}


 36%|███▌      | 9110/25428 [1:58:31<2:49:09,  1.61it/s]

{'loss': 0.1001, 'grad_norm': 1.206903338432312, 'learning_rate': 1.2834670442032406e-05, 'epoch': 1.07}


 36%|███▌      | 9120/25428 [1:58:37<2:48:27,  1.61it/s]

{'loss': 0.1327, 'grad_norm': 0.95391845703125, 'learning_rate': 1.2826805096743749e-05, 'epoch': 1.08}


 36%|███▌      | 9130/25428 [1:58:43<2:49:14,  1.61it/s]

{'loss': 0.1051, 'grad_norm': 2.5934836864471436, 'learning_rate': 1.2818939751455091e-05, 'epoch': 1.08}


 36%|███▌      | 9140/25428 [1:58:50<2:48:43,  1.61it/s]

{'loss': 0.1151, 'grad_norm': 1.1030852794647217, 'learning_rate': 1.2811074406166432e-05, 'epoch': 1.08}


 36%|███▌      | 9150/25428 [1:58:56<2:52:08,  1.58it/s]

{'loss': 0.1217, 'grad_norm': 0.9593591094017029, 'learning_rate': 1.2803209060877773e-05, 'epoch': 1.08}


 36%|███▌      | 9160/25428 [1:59:02<2:47:36,  1.62it/s]

{'loss': 0.1041, 'grad_norm': 0.7828589677810669, 'learning_rate': 1.2795343715589114e-05, 'epoch': 1.08}


 36%|███▌      | 9170/25428 [1:59:08<2:53:09,  1.56it/s]

{'loss': 0.1451, 'grad_norm': 0.6908275485038757, 'learning_rate': 1.2787478370300457e-05, 'epoch': 1.08}


 36%|███▌      | 9180/25428 [1:59:15<2:48:42,  1.61it/s]

{'loss': 0.1651, 'grad_norm': 1.3530833721160889, 'learning_rate': 1.27796130250118e-05, 'epoch': 1.08}


 36%|███▌      | 9190/25428 [1:59:21<2:47:04,  1.62it/s]

{'loss': 0.1319, 'grad_norm': 1.2853972911834717, 'learning_rate': 1.277174767972314e-05, 'epoch': 1.08}


 36%|███▌      | 9200/25428 [1:59:27<2:47:55,  1.61it/s]

{'loss': 0.1418, 'grad_norm': 1.1911216974258423, 'learning_rate': 1.2763882334434483e-05, 'epoch': 1.09}


 36%|███▌      | 9210/25428 [1:59:33<2:47:59,  1.61it/s]

{'loss': 0.1016, 'grad_norm': 1.2058589458465576, 'learning_rate': 1.2756016989145824e-05, 'epoch': 1.09}


 36%|███▋      | 9220/25428 [1:59:40<2:50:21,  1.59it/s]

{'loss': 0.1468, 'grad_norm': 1.1044713258743286, 'learning_rate': 1.2748151643857167e-05, 'epoch': 1.09}


 36%|███▋      | 9230/25428 [1:59:46<2:47:25,  1.61it/s]

{'loss': 0.1061, 'grad_norm': 1.7645143270492554, 'learning_rate': 1.2740286298568506e-05, 'epoch': 1.09}


 36%|███▋      | 9240/25428 [1:59:52<2:47:13,  1.61it/s]

{'loss': 0.1541, 'grad_norm': 1.351192593574524, 'learning_rate': 1.2732420953279849e-05, 'epoch': 1.09}


 36%|███▋      | 9250/25428 [1:59:58<2:48:23,  1.60it/s]

{'loss': 0.1606, 'grad_norm': 2.172424077987671, 'learning_rate': 1.2724555607991192e-05, 'epoch': 1.09}


 36%|███▋      | 9260/25428 [2:00:05<2:49:24,  1.59it/s]

{'loss': 0.1096, 'grad_norm': 1.5443826913833618, 'learning_rate': 1.2716690262702533e-05, 'epoch': 1.09}


 36%|███▋      | 9270/25428 [2:00:11<2:58:12,  1.51it/s]

{'loss': 0.1256, 'grad_norm': 1.0169427394866943, 'learning_rate': 1.2708824917413875e-05, 'epoch': 1.09}


 36%|███▋      | 9280/25428 [2:00:18<2:55:53,  1.53it/s]

{'loss': 0.1517, 'grad_norm': 0.8842915892601013, 'learning_rate': 1.2700959572125218e-05, 'epoch': 1.09}


 37%|███▋      | 9290/25428 [2:00:24<2:53:59,  1.55it/s]

{'loss': 0.1186, 'grad_norm': 1.0410490036010742, 'learning_rate': 1.2693094226836559e-05, 'epoch': 1.1}


 37%|███▋      | 9300/25428 [2:00:31<2:52:08,  1.56it/s]

{'loss': 0.1303, 'grad_norm': 1.1381388902664185, 'learning_rate': 1.2685228881547902e-05, 'epoch': 1.1}


 37%|███▋      | 9310/25428 [2:00:38<3:01:46,  1.48it/s]

{'loss': 0.1154, 'grad_norm': 1.1477019786834717, 'learning_rate': 1.2677363536259241e-05, 'epoch': 1.1}


 37%|███▋      | 9320/25428 [2:00:44<2:58:58,  1.50it/s]

{'loss': 0.121, 'grad_norm': 1.0013819932937622, 'learning_rate': 1.2669498190970584e-05, 'epoch': 1.1}


 37%|███▋      | 9330/25428 [2:00:51<2:52:52,  1.55it/s]

{'loss': 0.1223, 'grad_norm': 1.0325673818588257, 'learning_rate': 1.2661632845681926e-05, 'epoch': 1.1}


 37%|███▋      | 9340/25428 [2:00:57<2:51:29,  1.56it/s]

{'loss': 0.1067, 'grad_norm': 1.39677095413208, 'learning_rate': 1.2653767500393267e-05, 'epoch': 1.1}


 37%|███▋      | 9350/25428 [2:01:04<2:50:32,  1.57it/s]

{'loss': 0.1636, 'grad_norm': 1.4553247690200806, 'learning_rate': 1.264590215510461e-05, 'epoch': 1.1}


 37%|███▋      | 9360/25428 [2:01:10<2:46:45,  1.61it/s]

{'loss': 0.1292, 'grad_norm': 0.6714960932731628, 'learning_rate': 1.2638036809815953e-05, 'epoch': 1.1}


 37%|███▋      | 9370/25428 [2:01:16<2:53:22,  1.54it/s]

{'loss': 0.1067, 'grad_norm': 0.9663749933242798, 'learning_rate': 1.2630171464527294e-05, 'epoch': 1.11}


 37%|███▋      | 9380/25428 [2:01:23<2:46:39,  1.60it/s]

{'loss': 0.1171, 'grad_norm': 1.7712373733520508, 'learning_rate': 1.2622306119238636e-05, 'epoch': 1.11}


 37%|███▋      | 9390/25428 [2:01:29<2:44:21,  1.63it/s]

{'loss': 0.1435, 'grad_norm': 1.3760122060775757, 'learning_rate': 1.2614440773949976e-05, 'epoch': 1.11}


 37%|███▋      | 9400/25428 [2:01:35<2:54:35,  1.53it/s]

{'loss': 0.0893, 'grad_norm': 0.797394335269928, 'learning_rate': 1.2606575428661318e-05, 'epoch': 1.11}


 37%|███▋      | 9410/25428 [2:01:41<2:44:02,  1.63it/s]

{'loss': 0.159, 'grad_norm': 0.9063144326210022, 'learning_rate': 1.2598710083372661e-05, 'epoch': 1.11}


 37%|███▋      | 9420/25428 [2:01:48<2:43:19,  1.63it/s]

{'loss': 0.1336, 'grad_norm': 1.2107094526290894, 'learning_rate': 1.2590844738084002e-05, 'epoch': 1.11}


 37%|███▋      | 9430/25428 [2:01:54<2:46:39,  1.60it/s]

{'loss': 0.1483, 'grad_norm': 0.9689863920211792, 'learning_rate': 1.2582979392795345e-05, 'epoch': 1.11}


 37%|███▋      | 9440/25428 [2:02:00<2:58:26,  1.49it/s]

{'loss': 0.1121, 'grad_norm': 1.9202851057052612, 'learning_rate': 1.2575114047506687e-05, 'epoch': 1.11}


 37%|███▋      | 9450/25428 [2:02:07<3:01:51,  1.46it/s]

{'loss': 0.1418, 'grad_norm': 0.8218294978141785, 'learning_rate': 1.2567248702218028e-05, 'epoch': 1.11}


 37%|███▋      | 9460/25428 [2:02:13<2:51:38,  1.55it/s]

{'loss': 0.1264, 'grad_norm': 1.4206268787384033, 'learning_rate': 1.2559383356929371e-05, 'epoch': 1.12}


 37%|███▋      | 9470/25428 [2:02:20<2:45:47,  1.60it/s]

{'loss': 0.1225, 'grad_norm': 0.8689427971839905, 'learning_rate': 1.2551518011640714e-05, 'epoch': 1.12}


 37%|███▋      | 9480/25428 [2:02:26<2:43:43,  1.62it/s]

{'loss': 0.1326, 'grad_norm': 1.3525183200836182, 'learning_rate': 1.2543652666352053e-05, 'epoch': 1.12}


 37%|███▋      | 9490/25428 [2:02:32<2:45:41,  1.60it/s]

{'loss': 0.1175, 'grad_norm': 0.7791709899902344, 'learning_rate': 1.2535787321063396e-05, 'epoch': 1.12}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1738, 'grad_norm': 1.2231481075286865, 'learning_rate': 1.2527921975774737e-05, 'epoch': 1.12}


 37%|███▋      | 9510/25428 [2:02:48<2:54:03,  1.52it/s]

{'loss': 0.124, 'grad_norm': 1.0469677448272705, 'learning_rate': 1.252005663048608e-05, 'epoch': 1.12}


 37%|███▋      | 9520/25428 [2:02:54<2:48:31,  1.57it/s]

{'loss': 0.1534, 'grad_norm': 0.9284231662750244, 'learning_rate': 1.2512191285197422e-05, 'epoch': 1.12}


 37%|███▋      | 9530/25428 [2:03:00<2:45:38,  1.60it/s]

{'loss': 0.115, 'grad_norm': 1.393628716468811, 'learning_rate': 1.2504325939908763e-05, 'epoch': 1.12}


 38%|███▊      | 9540/25428 [2:03:07<2:45:38,  1.60it/s]

{'loss': 0.1116, 'grad_norm': 1.0947238206863403, 'learning_rate': 1.2496460594620106e-05, 'epoch': 1.13}


 38%|███▊      | 9550/25428 [2:03:13<2:42:28,  1.63it/s]

{'loss': 0.1212, 'grad_norm': 0.8644044995307922, 'learning_rate': 1.2488595249331448e-05, 'epoch': 1.13}


 38%|███▊      | 9560/25428 [2:03:19<2:44:58,  1.60it/s]

{'loss': 0.1039, 'grad_norm': 1.5922337770462036, 'learning_rate': 1.2480729904042788e-05, 'epoch': 1.13}


 38%|███▊      | 9570/25428 [2:03:25<2:47:47,  1.58it/s]

{'loss': 0.1546, 'grad_norm': 1.1620967388153076, 'learning_rate': 1.247286455875413e-05, 'epoch': 1.13}


 38%|███▊      | 9580/25428 [2:03:32<2:44:25,  1.61it/s]

{'loss': 0.1033, 'grad_norm': 1.199805736541748, 'learning_rate': 1.2464999213465471e-05, 'epoch': 1.13}


 38%|███▊      | 9590/25428 [2:03:38<2:44:58,  1.60it/s]

{'loss': 0.1037, 'grad_norm': 1.4952775239944458, 'learning_rate': 1.2457133868176814e-05, 'epoch': 1.13}


 38%|███▊      | 9600/25428 [2:03:44<2:48:14,  1.57it/s]

{'loss': 0.1501, 'grad_norm': 1.7842837572097778, 'learning_rate': 1.2449268522888157e-05, 'epoch': 1.13}


 38%|███▊      | 9610/25428 [2:03:50<2:41:39,  1.63it/s]

{'loss': 0.1481, 'grad_norm': 1.5431036949157715, 'learning_rate': 1.2441403177599498e-05, 'epoch': 1.13}


 38%|███▊      | 9620/25428 [2:03:57<2:44:58,  1.60it/s]

{'loss': 0.194, 'grad_norm': 1.628265380859375, 'learning_rate': 1.243353783231084e-05, 'epoch': 1.13}


 38%|███▊      | 9630/25428 [2:04:03<2:43:07,  1.61it/s]

{'loss': 0.1269, 'grad_norm': 0.8343320488929749, 'learning_rate': 1.2425672487022183e-05, 'epoch': 1.14}


 38%|███▊      | 9640/25428 [2:04:09<2:47:02,  1.58it/s]

{'loss': 0.1345, 'grad_norm': 1.6526429653167725, 'learning_rate': 1.2417807141733522e-05, 'epoch': 1.14}


 38%|███▊      | 9650/25428 [2:04:15<2:45:20,  1.59it/s]

{'loss': 0.1107, 'grad_norm': 2.150728940963745, 'learning_rate': 1.2409941796444865e-05, 'epoch': 1.14}


 38%|███▊      | 9660/25428 [2:04:22<2:53:21,  1.52it/s]

{'loss': 0.0985, 'grad_norm': 1.0576043128967285, 'learning_rate': 1.2402076451156206e-05, 'epoch': 1.14}


 38%|███▊      | 9670/25428 [2:04:28<2:46:31,  1.58it/s]

{'loss': 0.1318, 'grad_norm': 0.9079294204711914, 'learning_rate': 1.2394211105867549e-05, 'epoch': 1.14}


 38%|███▊      | 9680/25428 [2:04:34<2:43:31,  1.61it/s]

{'loss': 0.1209, 'grad_norm': 1.2098267078399658, 'learning_rate': 1.2386345760578891e-05, 'epoch': 1.14}


 38%|███▊      | 9690/25428 [2:04:41<2:43:26,  1.60it/s]

{'loss': 0.1033, 'grad_norm': 1.0382088422775269, 'learning_rate': 1.2378480415290232e-05, 'epoch': 1.14}


 38%|███▊      | 9700/25428 [2:04:47<2:41:20,  1.62it/s]

{'loss': 0.1296, 'grad_norm': 0.7804055213928223, 'learning_rate': 1.2370615070001575e-05, 'epoch': 1.14}


 38%|███▊      | 9710/25428 [2:04:53<2:40:48,  1.63it/s]

{'loss': 0.0999, 'grad_norm': 1.4312783479690552, 'learning_rate': 1.2362749724712916e-05, 'epoch': 1.15}


 38%|███▊      | 9720/25428 [2:04:59<2:42:11,  1.61it/s]

{'loss': 0.1106, 'grad_norm': 1.1289504766464233, 'learning_rate': 1.2354884379424257e-05, 'epoch': 1.15}


 38%|███▊      | 9730/25428 [2:05:06<2:44:03,  1.59it/s]

{'loss': 0.126, 'grad_norm': 1.5609484910964966, 'learning_rate': 1.23470190341356e-05, 'epoch': 1.15}


 38%|███▊      | 9740/25428 [2:05:12<2:42:50,  1.61it/s]

{'loss': 0.139, 'grad_norm': 0.9255644083023071, 'learning_rate': 1.233915368884694e-05, 'epoch': 1.15}


 38%|███▊      | 9750/25428 [2:05:18<2:46:26,  1.57it/s]

{'loss': 0.1116, 'grad_norm': 1.2503457069396973, 'learning_rate': 1.2331288343558283e-05, 'epoch': 1.15}


 38%|███▊      | 9760/25428 [2:05:25<2:42:31,  1.61it/s]

{'loss': 0.1108, 'grad_norm': 2.11478853225708, 'learning_rate': 1.2323422998269624e-05, 'epoch': 1.15}


 38%|███▊      | 9770/25428 [2:05:31<2:41:55,  1.61it/s]

{'loss': 0.1177, 'grad_norm': 1.031153917312622, 'learning_rate': 1.2315557652980967e-05, 'epoch': 1.15}


 38%|███▊      | 9780/25428 [2:05:37<2:39:19,  1.64it/s]

{'loss': 0.1478, 'grad_norm': 1.0806958675384521, 'learning_rate': 1.230769230769231e-05, 'epoch': 1.15}


 39%|███▊      | 9790/25428 [2:05:43<2:41:19,  1.62it/s]

{'loss': 0.1124, 'grad_norm': 1.0413641929626465, 'learning_rate': 1.229982696240365e-05, 'epoch': 1.16}


 39%|███▊      | 9800/25428 [2:05:49<2:39:39,  1.63it/s]

{'loss': 0.1155, 'grad_norm': 1.0138633251190186, 'learning_rate': 1.2291961617114992e-05, 'epoch': 1.16}


 39%|███▊      | 9810/25428 [2:05:55<2:40:33,  1.62it/s]

{'loss': 0.1236, 'grad_norm': 1.1490806341171265, 'learning_rate': 1.2284096271826333e-05, 'epoch': 1.16}


 39%|███▊      | 9820/25428 [2:06:02<2:40:02,  1.63it/s]

{'loss': 0.1336, 'grad_norm': 1.0299301147460938, 'learning_rate': 1.2276230926537675e-05, 'epoch': 1.16}


 39%|███▊      | 9830/25428 [2:06:08<2:39:48,  1.63it/s]

{'loss': 0.1054, 'grad_norm': 1.2344521284103394, 'learning_rate': 1.2268365581249018e-05, 'epoch': 1.16}


 39%|███▊      | 9840/25428 [2:06:14<2:39:56,  1.62it/s]

{'loss': 0.1285, 'grad_norm': 0.9250611066818237, 'learning_rate': 1.2260500235960359e-05, 'epoch': 1.16}


 39%|███▊      | 9850/25428 [2:06:20<2:44:30,  1.58it/s]

{'loss': 0.1066, 'grad_norm': 1.0457566976547241, 'learning_rate': 1.2252634890671702e-05, 'epoch': 1.16}


 39%|███▉      | 9860/25428 [2:06:26<2:39:21,  1.63it/s]

{'loss': 0.1281, 'grad_norm': 1.4597398042678833, 'learning_rate': 1.2244769545383044e-05, 'epoch': 1.16}


 39%|███▉      | 9870/25428 [2:06:33<2:41:32,  1.61it/s]

{'loss': 0.1344, 'grad_norm': 1.8727574348449707, 'learning_rate': 1.2236904200094385e-05, 'epoch': 1.16}


 39%|███▉      | 9880/25428 [2:06:39<2:40:11,  1.62it/s]

{'loss': 0.1443, 'grad_norm': 1.378241777420044, 'learning_rate': 1.2229038854805726e-05, 'epoch': 1.17}


 39%|███▉      | 9890/25428 [2:06:45<2:39:22,  1.62it/s]

{'loss': 0.1958, 'grad_norm': 1.2451027631759644, 'learning_rate': 1.2221173509517067e-05, 'epoch': 1.17}


 39%|███▉      | 9900/25428 [2:06:51<2:43:05,  1.59it/s]

{'loss': 0.1492, 'grad_norm': 1.009947657585144, 'learning_rate': 1.221330816422841e-05, 'epoch': 1.17}


 39%|███▉      | 9910/25428 [2:06:58<2:40:49,  1.61it/s]

{'loss': 0.1679, 'grad_norm': 1.3592671155929565, 'learning_rate': 1.2205442818939753e-05, 'epoch': 1.17}


 39%|███▉      | 9920/25428 [2:07:04<2:38:31,  1.63it/s]

{'loss': 0.0974, 'grad_norm': 1.0202200412750244, 'learning_rate': 1.2197577473651094e-05, 'epoch': 1.17}


 39%|███▉      | 9930/25428 [2:07:10<2:41:51,  1.60it/s]

{'loss': 0.1376, 'grad_norm': 1.295232892036438, 'learning_rate': 1.2189712128362436e-05, 'epoch': 1.17}


 39%|███▉      | 9940/25428 [2:07:16<2:42:31,  1.59it/s]

{'loss': 0.1316, 'grad_norm': 1.289092779159546, 'learning_rate': 1.2181846783073779e-05, 'epoch': 1.17}


 39%|███▉      | 9950/25428 [2:07:23<2:44:07,  1.57it/s]

{'loss': 0.1058, 'grad_norm': 2.384922742843628, 'learning_rate': 1.217398143778512e-05, 'epoch': 1.17}


 39%|███▉      | 9960/25428 [2:07:29<2:38:45,  1.62it/s]

{'loss': 0.1148, 'grad_norm': 1.0061581134796143, 'learning_rate': 1.2166116092496461e-05, 'epoch': 1.18}


 39%|███▉      | 9970/25428 [2:07:35<2:39:00,  1.62it/s]

{'loss': 0.1144, 'grad_norm': 1.0557059049606323, 'learning_rate': 1.2158250747207802e-05, 'epoch': 1.18}


 39%|███▉      | 9980/25428 [2:07:41<2:42:39,  1.58it/s]

{'loss': 0.1259, 'grad_norm': 1.425891399383545, 'learning_rate': 1.2150385401919145e-05, 'epoch': 1.18}


 39%|███▉      | 9990/25428 [2:07:47<2:37:44,  1.63it/s]

{'loss': 0.1031, 'grad_norm': 0.7968956232070923, 'learning_rate': 1.2142520056630487e-05, 'epoch': 1.18}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1035, 'grad_norm': 0.9009087085723877, 'learning_rate': 1.2134654711341828e-05, 'epoch': 1.18}


 39%|███▉      | 10010/25428 [2:08:03<2:54:09,  1.48it/s]

{'loss': 0.1251, 'grad_norm': 1.0021439790725708, 'learning_rate': 1.2126789366053171e-05, 'epoch': 1.18}


 39%|███▉      | 10020/25428 [2:08:10<2:44:47,  1.56it/s]

{'loss': 0.1147, 'grad_norm': 1.0246708393096924, 'learning_rate': 1.2118924020764514e-05, 'epoch': 1.18}


 39%|███▉      | 10030/25428 [2:08:16<2:45:20,  1.55it/s]

{'loss': 0.1245, 'grad_norm': 0.8359276652336121, 'learning_rate': 1.2111058675475855e-05, 'epoch': 1.18}


 39%|███▉      | 10040/25428 [2:08:22<2:42:46,  1.58it/s]

{'loss': 0.126, 'grad_norm': 1.0785537958145142, 'learning_rate': 1.2103193330187196e-05, 'epoch': 1.18}


 40%|███▉      | 10050/25428 [2:08:29<2:41:00,  1.59it/s]

{'loss': 0.1608, 'grad_norm': 1.4927512407302856, 'learning_rate': 1.2095327984898537e-05, 'epoch': 1.19}


 40%|███▉      | 10060/25428 [2:08:35<2:40:17,  1.60it/s]

{'loss': 0.1448, 'grad_norm': 1.9908013343811035, 'learning_rate': 1.208746263960988e-05, 'epoch': 1.19}


 40%|███▉      | 10070/25428 [2:08:41<2:39:28,  1.61it/s]

{'loss': 0.1267, 'grad_norm': 1.537377119064331, 'learning_rate': 1.2079597294321222e-05, 'epoch': 1.19}


 40%|███▉      | 10080/25428 [2:08:48<2:39:00,  1.61it/s]

{'loss': 0.1077, 'grad_norm': 1.3517515659332275, 'learning_rate': 1.2071731949032563e-05, 'epoch': 1.19}


 40%|███▉      | 10090/25428 [2:08:54<2:40:03,  1.60it/s]

{'loss': 0.1218, 'grad_norm': 0.8690186142921448, 'learning_rate': 1.2063866603743906e-05, 'epoch': 1.19}


 40%|███▉      | 10100/25428 [2:09:00<2:40:45,  1.59it/s]

{'loss': 0.14, 'grad_norm': 0.7311669588088989, 'learning_rate': 1.2056001258455248e-05, 'epoch': 1.19}


 40%|███▉      | 10110/25428 [2:09:07<2:42:55,  1.57it/s]

{'loss': 0.1034, 'grad_norm': 2.374354839324951, 'learning_rate': 1.204813591316659e-05, 'epoch': 1.19}


 40%|███▉      | 10120/25428 [2:09:13<2:46:36,  1.53it/s]

{'loss': 0.1475, 'grad_norm': 0.8269674777984619, 'learning_rate': 1.2040270567877932e-05, 'epoch': 1.19}


 40%|███▉      | 10130/25428 [2:09:19<2:39:10,  1.60it/s]

{'loss': 0.1069, 'grad_norm': 0.8135301470756531, 'learning_rate': 1.2032405222589271e-05, 'epoch': 1.2}


 40%|███▉      | 10140/25428 [2:09:26<2:38:11,  1.61it/s]

{'loss': 0.1028, 'grad_norm': 1.4417589902877808, 'learning_rate': 1.2024539877300614e-05, 'epoch': 1.2}


 40%|███▉      | 10150/25428 [2:09:32<2:45:32,  1.54it/s]

{'loss': 0.1176, 'grad_norm': 1.072687029838562, 'learning_rate': 1.2016674532011957e-05, 'epoch': 1.2}


 40%|███▉      | 10160/25428 [2:09:39<2:41:21,  1.58it/s]

{'loss': 0.1173, 'grad_norm': 0.9491287469863892, 'learning_rate': 1.2008809186723298e-05, 'epoch': 1.2}


 40%|███▉      | 10170/25428 [2:09:45<2:41:30,  1.57it/s]

{'loss': 0.1083, 'grad_norm': 0.9258643388748169, 'learning_rate': 1.200094384143464e-05, 'epoch': 1.2}


 40%|████      | 10180/25428 [2:09:51<2:43:34,  1.55it/s]

{'loss': 0.1334, 'grad_norm': 0.8997431397438049, 'learning_rate': 1.1993078496145983e-05, 'epoch': 1.2}


 40%|████      | 10190/25428 [2:09:58<2:39:51,  1.59it/s]

{'loss': 0.1795, 'grad_norm': 1.2058026790618896, 'learning_rate': 1.1985213150857324e-05, 'epoch': 1.2}


 40%|████      | 10200/25428 [2:10:04<2:41:58,  1.57it/s]

{'loss': 0.0945, 'grad_norm': 1.4818273782730103, 'learning_rate': 1.1977347805568667e-05, 'epoch': 1.2}


 40%|████      | 10210/25428 [2:10:11<2:41:21,  1.57it/s]

{'loss': 0.1376, 'grad_norm': 1.1717808246612549, 'learning_rate': 1.1969482460280006e-05, 'epoch': 1.2}


 40%|████      | 10220/25428 [2:10:17<2:41:24,  1.57it/s]

{'loss': 0.1013, 'grad_norm': 0.7651910781860352, 'learning_rate': 1.1961617114991349e-05, 'epoch': 1.21}


 40%|████      | 10230/25428 [2:10:23<2:42:50,  1.56it/s]

{'loss': 0.131, 'grad_norm': 2.0617594718933105, 'learning_rate': 1.1953751769702691e-05, 'epoch': 1.21}


 40%|████      | 10240/25428 [2:10:30<2:36:21,  1.62it/s]

{'loss': 0.1134, 'grad_norm': 2.435762405395508, 'learning_rate': 1.1945886424414032e-05, 'epoch': 1.21}


 40%|████      | 10250/25428 [2:10:36<2:40:37,  1.57it/s]

{'loss': 0.1226, 'grad_norm': 0.7734096050262451, 'learning_rate': 1.1938021079125375e-05, 'epoch': 1.21}


 40%|████      | 10260/25428 [2:10:43<2:59:19,  1.41it/s]

{'loss': 0.1251, 'grad_norm': 1.621755838394165, 'learning_rate': 1.1930155733836716e-05, 'epoch': 1.21}


 40%|████      | 10270/25428 [2:10:50<2:47:49,  1.51it/s]

{'loss': 0.1379, 'grad_norm': 1.1845072507858276, 'learning_rate': 1.1922290388548059e-05, 'epoch': 1.21}


 40%|████      | 10280/25428 [2:10:56<2:43:06,  1.55it/s]

{'loss': 0.1159, 'grad_norm': 1.5793697834014893, 'learning_rate': 1.1914425043259401e-05, 'epoch': 1.21}


 40%|████      | 10290/25428 [2:11:03<2:42:34,  1.55it/s]

{'loss': 0.1073, 'grad_norm': 1.2209445238113403, 'learning_rate': 1.190655969797074e-05, 'epoch': 1.21}


 41%|████      | 10300/25428 [2:11:09<2:44:00,  1.54it/s]

{'loss': 0.1383, 'grad_norm': 0.9771899580955505, 'learning_rate': 1.1898694352682083e-05, 'epoch': 1.22}


 41%|████      | 10310/25428 [2:11:16<2:47:09,  1.51it/s]

{'loss': 0.1411, 'grad_norm': 0.8877353072166443, 'learning_rate': 1.1890829007393424e-05, 'epoch': 1.22}


 41%|████      | 10320/25428 [2:11:22<2:56:14,  1.43it/s]

{'loss': 0.0947, 'grad_norm': 1.3269416093826294, 'learning_rate': 1.1882963662104767e-05, 'epoch': 1.22}


 41%|████      | 10330/25428 [2:11:29<2:43:28,  1.54it/s]

{'loss': 0.1021, 'grad_norm': 1.3811883926391602, 'learning_rate': 1.187509831681611e-05, 'epoch': 1.22}


 41%|████      | 10340/25428 [2:11:35<2:42:00,  1.55it/s]

{'loss': 0.1175, 'grad_norm': 1.0829912424087524, 'learning_rate': 1.186723297152745e-05, 'epoch': 1.22}


 41%|████      | 10350/25428 [2:11:42<2:40:11,  1.57it/s]

{'loss': 0.1218, 'grad_norm': 1.2054965496063232, 'learning_rate': 1.1859367626238793e-05, 'epoch': 1.22}


 41%|████      | 10360/25428 [2:11:48<2:38:54,  1.58it/s]

{'loss': 0.1028, 'grad_norm': 1.7079757452011108, 'learning_rate': 1.1851502280950136e-05, 'epoch': 1.22}


 41%|████      | 10370/25428 [2:11:55<2:40:41,  1.56it/s]

{'loss': 0.1241, 'grad_norm': 1.1209214925765991, 'learning_rate': 1.1843636935661475e-05, 'epoch': 1.22}


 41%|████      | 10380/25428 [2:12:01<2:40:47,  1.56it/s]

{'loss': 0.1279, 'grad_norm': 0.8992490768432617, 'learning_rate': 1.1835771590372818e-05, 'epoch': 1.22}


 41%|████      | 10390/25428 [2:12:09<3:08:03,  1.33it/s]

{'loss': 0.1121, 'grad_norm': 2.003995895385742, 'learning_rate': 1.1827906245084159e-05, 'epoch': 1.23}


 41%|████      | 10400/25428 [2:12:15<2:46:51,  1.50it/s]

{'loss': 0.1119, 'grad_norm': 0.944846510887146, 'learning_rate': 1.1820040899795502e-05, 'epoch': 1.23}


 41%|████      | 10410/25428 [2:12:22<2:39:39,  1.57it/s]

{'loss': 0.1621, 'grad_norm': 1.371875286102295, 'learning_rate': 1.1812175554506844e-05, 'epoch': 1.23}


 41%|████      | 10420/25428 [2:12:28<2:43:55,  1.53it/s]

{'loss': 0.1306, 'grad_norm': 1.046699047088623, 'learning_rate': 1.1804310209218185e-05, 'epoch': 1.23}


 41%|████      | 10430/25428 [2:12:35<2:43:28,  1.53it/s]

{'loss': 0.1592, 'grad_norm': 1.6287658214569092, 'learning_rate': 1.1796444863929528e-05, 'epoch': 1.23}


 41%|████      | 10440/25428 [2:12:42<2:43:37,  1.53it/s]

{'loss': 0.1326, 'grad_norm': 0.6673973202705383, 'learning_rate': 1.178857951864087e-05, 'epoch': 1.23}


 41%|████      | 10450/25428 [2:12:48<2:43:24,  1.53it/s]

{'loss': 0.1145, 'grad_norm': 1.221904993057251, 'learning_rate': 1.178071417335221e-05, 'epoch': 1.23}


 41%|████      | 10460/25428 [2:12:55<2:41:21,  1.55it/s]

{'loss': 0.1139, 'grad_norm': 1.1309294700622559, 'learning_rate': 1.1772848828063553e-05, 'epoch': 1.23}


 41%|████      | 10470/25428 [2:13:01<2:42:12,  1.54it/s]

{'loss': 0.091, 'grad_norm': 0.700473964214325, 'learning_rate': 1.1764983482774894e-05, 'epoch': 1.24}


 41%|████      | 10480/25428 [2:13:08<2:40:39,  1.55it/s]

{'loss': 0.1409, 'grad_norm': 2.1755287647247314, 'learning_rate': 1.1757118137486236e-05, 'epoch': 1.24}


 41%|████▏     | 10490/25428 [2:13:14<2:43:47,  1.52it/s]

{'loss': 0.1056, 'grad_norm': 1.2496974468231201, 'learning_rate': 1.1749252792197579e-05, 'epoch': 1.24}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1115, 'grad_norm': 0.6551205515861511, 'learning_rate': 1.174138744690892e-05, 'epoch': 1.24}


 41%|████▏     | 10510/25428 [2:13:31<2:51:04,  1.45it/s]

{'loss': 0.1401, 'grad_norm': 1.0031994581222534, 'learning_rate': 1.1733522101620263e-05, 'epoch': 1.24}


 41%|████▏     | 10520/25428 [2:13:37<2:42:25,  1.53it/s]

{'loss': 0.1181, 'grad_norm': 1.4680025577545166, 'learning_rate': 1.1725656756331605e-05, 'epoch': 1.24}


 41%|████▏     | 10530/25428 [2:13:44<2:41:41,  1.54it/s]

{'loss': 0.1238, 'grad_norm': 1.2028950452804565, 'learning_rate': 1.1717791411042945e-05, 'epoch': 1.24}


 41%|████▏     | 10540/25428 [2:13:50<2:43:48,  1.51it/s]

{'loss': 0.1574, 'grad_norm': 1.0619913339614868, 'learning_rate': 1.1709926065754287e-05, 'epoch': 1.24}


 41%|████▏     | 10550/25428 [2:13:57<2:39:09,  1.56it/s]

{'loss': 0.1014, 'grad_norm': 2.1862757205963135, 'learning_rate': 1.1702060720465628e-05, 'epoch': 1.24}


 42%|████▏     | 10560/25428 [2:14:03<2:37:08,  1.58it/s]

{'loss': 0.114, 'grad_norm': 1.1282732486724854, 'learning_rate': 1.1694195375176971e-05, 'epoch': 1.25}


 42%|████▏     | 10570/25428 [2:14:10<2:41:57,  1.53it/s]

{'loss': 0.1232, 'grad_norm': 1.1809478998184204, 'learning_rate': 1.1686330029888314e-05, 'epoch': 1.25}


 42%|████▏     | 10580/25428 [2:14:16<2:41:15,  1.53it/s]

{'loss': 0.1356, 'grad_norm': 1.117538332939148, 'learning_rate': 1.1678464684599655e-05, 'epoch': 1.25}


 42%|████▏     | 10590/25428 [2:14:23<2:49:43,  1.46it/s]

{'loss': 0.1208, 'grad_norm': 1.803267240524292, 'learning_rate': 1.1670599339310997e-05, 'epoch': 1.25}


 42%|████▏     | 10600/25428 [2:14:29<2:38:46,  1.56it/s]

{'loss': 0.11, 'grad_norm': 2.1451587677001953, 'learning_rate': 1.166273399402234e-05, 'epoch': 1.25}


 42%|████▏     | 10610/25428 [2:14:36<2:39:24,  1.55it/s]

{'loss': 0.1423, 'grad_norm': 1.710706114768982, 'learning_rate': 1.165486864873368e-05, 'epoch': 1.25}


 42%|████▏     | 10620/25428 [2:14:42<2:37:01,  1.57it/s]

{'loss': 0.0877, 'grad_norm': 0.883695125579834, 'learning_rate': 1.1647003303445022e-05, 'epoch': 1.25}


 42%|████▏     | 10630/25428 [2:14:49<2:47:39,  1.47it/s]

{'loss': 0.1234, 'grad_norm': 0.8700695633888245, 'learning_rate': 1.1639137958156363e-05, 'epoch': 1.25}


 42%|████▏     | 10640/25428 [2:14:55<2:39:09,  1.55it/s]

{'loss': 0.1147, 'grad_norm': 0.9107195734977722, 'learning_rate': 1.1631272612867706e-05, 'epoch': 1.26}


 42%|████▏     | 10650/25428 [2:15:02<2:49:04,  1.46it/s]

{'loss': 0.1754, 'grad_norm': 1.0629241466522217, 'learning_rate': 1.1623407267579048e-05, 'epoch': 1.26}


 42%|████▏     | 10660/25428 [2:15:09<2:49:10,  1.45it/s]

{'loss': 0.152, 'grad_norm': 1.8501532077789307, 'learning_rate': 1.161554192229039e-05, 'epoch': 1.26}


 42%|████▏     | 10670/25428 [2:15:16<2:41:17,  1.53it/s]

{'loss': 0.117, 'grad_norm': 1.8906066417694092, 'learning_rate': 1.1607676577001732e-05, 'epoch': 1.26}


 42%|████▏     | 10680/25428 [2:15:22<2:36:12,  1.57it/s]

{'loss': 0.1252, 'grad_norm': 0.8978711366653442, 'learning_rate': 1.1599811231713075e-05, 'epoch': 1.26}


 42%|████▏     | 10690/25428 [2:15:29<2:37:31,  1.56it/s]

{'loss': 0.1059, 'grad_norm': 1.0984692573547363, 'learning_rate': 1.1591945886424414e-05, 'epoch': 1.26}


 42%|████▏     | 10700/25428 [2:15:35<2:33:31,  1.60it/s]

{'loss': 0.128, 'grad_norm': 1.0874810218811035, 'learning_rate': 1.1584080541135757e-05, 'epoch': 1.26}


 42%|████▏     | 10710/25428 [2:15:41<2:35:14,  1.58it/s]

{'loss': 0.1466, 'grad_norm': 1.8738540410995483, 'learning_rate': 1.1576215195847098e-05, 'epoch': 1.26}


 42%|████▏     | 10720/25428 [2:15:48<2:32:38,  1.61it/s]

{'loss': 0.12, 'grad_norm': 0.9362284541130066, 'learning_rate': 1.156834985055844e-05, 'epoch': 1.26}


 42%|████▏     | 10730/25428 [2:15:54<2:35:08,  1.58it/s]

{'loss': 0.1154, 'grad_norm': 1.8144186735153198, 'learning_rate': 1.1560484505269783e-05, 'epoch': 1.27}


 42%|████▏     | 10740/25428 [2:16:00<2:34:40,  1.58it/s]

{'loss': 0.1382, 'grad_norm': 1.3533942699432373, 'learning_rate': 1.1552619159981124e-05, 'epoch': 1.27}


 42%|████▏     | 10750/25428 [2:16:07<2:38:09,  1.55it/s]

{'loss': 0.1074, 'grad_norm': 0.6579908728599548, 'learning_rate': 1.1544753814692467e-05, 'epoch': 1.27}


 42%|████▏     | 10760/25428 [2:16:13<2:42:09,  1.51it/s]

{'loss': 0.1277, 'grad_norm': 1.0250521898269653, 'learning_rate': 1.1536888469403808e-05, 'epoch': 1.27}


 42%|████▏     | 10770/25428 [2:16:21<2:41:57,  1.51it/s]

{'loss': 0.1233, 'grad_norm': 3.52720308303833, 'learning_rate': 1.152902312411515e-05, 'epoch': 1.27}


 42%|████▏     | 10780/25428 [2:16:27<2:40:27,  1.52it/s]

{'loss': 0.1355, 'grad_norm': 1.0278679132461548, 'learning_rate': 1.1521157778826491e-05, 'epoch': 1.27}


 42%|████▏     | 10790/25428 [2:16:34<2:38:45,  1.54it/s]

{'loss': 0.1488, 'grad_norm': 1.5863227844238281, 'learning_rate': 1.1513292433537832e-05, 'epoch': 1.27}


 42%|████▏     | 10800/25428 [2:16:40<2:42:50,  1.50it/s]

{'loss': 0.1529, 'grad_norm': 1.8681637048721313, 'learning_rate': 1.1505427088249175e-05, 'epoch': 1.27}


 43%|████▎     | 10810/25428 [2:16:47<2:37:14,  1.55it/s]

{'loss': 0.0962, 'grad_norm': 1.1393998861312866, 'learning_rate': 1.1497561742960516e-05, 'epoch': 1.28}


 43%|████▎     | 10820/25428 [2:16:54<2:40:11,  1.52it/s]

{'loss': 0.0982, 'grad_norm': 2.351724863052368, 'learning_rate': 1.1489696397671859e-05, 'epoch': 1.28}


 43%|████▎     | 10830/25428 [2:17:00<2:40:48,  1.51it/s]

{'loss': 0.1166, 'grad_norm': 0.9261336922645569, 'learning_rate': 1.1481831052383201e-05, 'epoch': 1.28}


 43%|████▎     | 10840/25428 [2:17:07<2:39:45,  1.52it/s]

{'loss': 0.1072, 'grad_norm': 1.8892513513565063, 'learning_rate': 1.1473965707094542e-05, 'epoch': 1.28}


 43%|████▎     | 10850/25428 [2:17:13<2:46:45,  1.46it/s]

{'loss': 0.1392, 'grad_norm': 0.9284810423851013, 'learning_rate': 1.1466100361805885e-05, 'epoch': 1.28}


 43%|████▎     | 10860/25428 [2:17:20<2:37:59,  1.54it/s]

{'loss': 0.1409, 'grad_norm': 2.2740983963012695, 'learning_rate': 1.1458235016517224e-05, 'epoch': 1.28}


 43%|████▎     | 10870/25428 [2:17:26<2:35:49,  1.56it/s]

{'loss': 0.1431, 'grad_norm': 1.3765385150909424, 'learning_rate': 1.1450369671228567e-05, 'epoch': 1.28}


 43%|████▎     | 10880/25428 [2:17:33<2:35:14,  1.56it/s]

{'loss': 0.1287, 'grad_norm': 1.383879542350769, 'learning_rate': 1.144250432593991e-05, 'epoch': 1.28}


 43%|████▎     | 10890/25428 [2:17:39<2:37:11,  1.54it/s]

{'loss': 0.1203, 'grad_norm': 1.340828776359558, 'learning_rate': 1.143463898065125e-05, 'epoch': 1.28}


 43%|████▎     | 10900/25428 [2:17:46<2:35:14,  1.56it/s]

{'loss': 0.1128, 'grad_norm': 1.048425316810608, 'learning_rate': 1.1426773635362593e-05, 'epoch': 1.29}


 43%|████▎     | 10910/25428 [2:17:52<2:30:30,  1.61it/s]

{'loss': 0.1082, 'grad_norm': 0.9177427887916565, 'learning_rate': 1.1418908290073936e-05, 'epoch': 1.29}


 43%|████▎     | 10920/25428 [2:17:58<2:37:27,  1.54it/s]

{'loss': 0.1195, 'grad_norm': 0.7769534587860107, 'learning_rate': 1.1411042944785277e-05, 'epoch': 1.29}


 43%|████▎     | 10930/25428 [2:18:05<2:39:49,  1.51it/s]

{'loss': 0.1174, 'grad_norm': 0.9151881337165833, 'learning_rate': 1.140317759949662e-05, 'epoch': 1.29}


 43%|████▎     | 10940/25428 [2:18:11<2:36:04,  1.55it/s]

{'loss': 0.165, 'grad_norm': 1.1682584285736084, 'learning_rate': 1.1395312254207959e-05, 'epoch': 1.29}


 43%|████▎     | 10950/25428 [2:18:18<2:35:42,  1.55it/s]

{'loss': 0.11, 'grad_norm': 1.3750207424163818, 'learning_rate': 1.1387446908919302e-05, 'epoch': 1.29}


 43%|████▎     | 10960/25428 [2:18:24<2:33:32,  1.57it/s]

{'loss': 0.1164, 'grad_norm': 1.142543911933899, 'learning_rate': 1.1379581563630644e-05, 'epoch': 1.29}


 43%|████▎     | 10970/25428 [2:18:31<2:31:31,  1.59it/s]

{'loss': 0.1359, 'grad_norm': 1.208085060119629, 'learning_rate': 1.1371716218341985e-05, 'epoch': 1.29}


 43%|████▎     | 10980/25428 [2:18:37<2:35:06,  1.55it/s]

{'loss': 0.116, 'grad_norm': 0.9613150358200073, 'learning_rate': 1.1363850873053328e-05, 'epoch': 1.3}


 43%|████▎     | 10990/25428 [2:18:43<2:31:13,  1.59it/s]

{'loss': 0.1249, 'grad_norm': 4.068911075592041, 'learning_rate': 1.135598552776467e-05, 'epoch': 1.3}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.099, 'grad_norm': 0.9014462828636169, 'learning_rate': 1.1348120182476012e-05, 'epoch': 1.3}


 43%|████▎     | 11010/25428 [2:19:00<2:45:58,  1.45it/s]

{'loss': 0.1028, 'grad_norm': 1.4344149827957153, 'learning_rate': 1.1340254837187354e-05, 'epoch': 1.3}


 43%|████▎     | 11020/25428 [2:19:06<2:30:06,  1.60it/s]

{'loss': 0.1095, 'grad_norm': 1.2310336828231812, 'learning_rate': 1.1332389491898694e-05, 'epoch': 1.3}


 43%|████▎     | 11030/25428 [2:19:13<2:38:54,  1.51it/s]

{'loss': 0.1313, 'grad_norm': 1.511969804763794, 'learning_rate': 1.1324524146610036e-05, 'epoch': 1.3}


 43%|████▎     | 11040/25428 [2:19:19<2:32:43,  1.57it/s]

{'loss': 0.1153, 'grad_norm': 0.9668511748313904, 'learning_rate': 1.1316658801321379e-05, 'epoch': 1.3}


 43%|████▎     | 11050/25428 [2:19:25<2:33:03,  1.57it/s]

{'loss': 0.1666, 'grad_norm': 1.8836418390274048, 'learning_rate': 1.130879345603272e-05, 'epoch': 1.3}


 43%|████▎     | 11060/25428 [2:19:32<2:33:49,  1.56it/s]

{'loss': 0.1142, 'grad_norm': 1.8740006685256958, 'learning_rate': 1.1300928110744063e-05, 'epoch': 1.3}


 44%|████▎     | 11070/25428 [2:19:38<2:29:54,  1.60it/s]

{'loss': 0.1042, 'grad_norm': 1.7492941617965698, 'learning_rate': 1.1293062765455405e-05, 'epoch': 1.31}


 44%|████▎     | 11080/25428 [2:19:44<2:31:58,  1.57it/s]

{'loss': 0.1451, 'grad_norm': 1.5995757579803467, 'learning_rate': 1.1285197420166746e-05, 'epoch': 1.31}


 44%|████▎     | 11090/25428 [2:19:51<2:31:16,  1.58it/s]

{'loss': 0.1512, 'grad_norm': 1.297032117843628, 'learning_rate': 1.1277332074878089e-05, 'epoch': 1.31}


 44%|████▎     | 11100/25428 [2:19:57<2:30:40,  1.58it/s]

{'loss': 0.1216, 'grad_norm': 1.8589566946029663, 'learning_rate': 1.1269466729589428e-05, 'epoch': 1.31}


 44%|████▎     | 11110/25428 [2:20:04<2:33:37,  1.55it/s]

{'loss': 0.1392, 'grad_norm': 2.408728837966919, 'learning_rate': 1.1261601384300771e-05, 'epoch': 1.31}


 44%|████▎     | 11120/25428 [2:20:10<2:33:38,  1.55it/s]

{'loss': 0.1111, 'grad_norm': 1.0141689777374268, 'learning_rate': 1.1253736039012114e-05, 'epoch': 1.31}


 44%|████▍     | 11130/25428 [2:20:17<2:34:51,  1.54it/s]

{'loss': 0.1206, 'grad_norm': 1.2169525623321533, 'learning_rate': 1.1245870693723455e-05, 'epoch': 1.31}


 44%|████▍     | 11140/25428 [2:20:23<2:29:43,  1.59it/s]

{'loss': 0.1326, 'grad_norm': 1.3004153966903687, 'learning_rate': 1.1238005348434797e-05, 'epoch': 1.31}


 44%|████▍     | 11150/25428 [2:20:30<2:31:19,  1.57it/s]

{'loss': 0.1165, 'grad_norm': 1.1800516843795776, 'learning_rate': 1.123014000314614e-05, 'epoch': 1.32}


 44%|████▍     | 11160/25428 [2:20:36<2:28:32,  1.60it/s]

{'loss': 0.0949, 'grad_norm': 1.3631770610809326, 'learning_rate': 1.1222274657857481e-05, 'epoch': 1.32}


 44%|████▍     | 11170/25428 [2:20:42<2:29:52,  1.59it/s]

{'loss': 0.1165, 'grad_norm': 1.9015347957611084, 'learning_rate': 1.1214409312568824e-05, 'epoch': 1.32}


 44%|████▍     | 11180/25428 [2:20:49<2:29:52,  1.58it/s]

{'loss': 0.1217, 'grad_norm': 1.4672424793243408, 'learning_rate': 1.1206543967280163e-05, 'epoch': 1.32}


 44%|████▍     | 11190/25428 [2:20:55<2:27:19,  1.61it/s]

{'loss': 0.1469, 'grad_norm': 1.548465609550476, 'learning_rate': 1.1198678621991506e-05, 'epoch': 1.32}


 44%|████▍     | 11200/25428 [2:21:01<2:31:03,  1.57it/s]

{'loss': 0.0969, 'grad_norm': 1.0702159404754639, 'learning_rate': 1.1190813276702848e-05, 'epoch': 1.32}


 44%|████▍     | 11210/25428 [2:21:08<2:29:25,  1.59it/s]

{'loss': 0.1312, 'grad_norm': 1.3678628206253052, 'learning_rate': 1.118294793141419e-05, 'epoch': 1.32}


 44%|████▍     | 11220/25428 [2:21:14<2:30:18,  1.58it/s]

{'loss': 0.0952, 'grad_norm': 1.2417285442352295, 'learning_rate': 1.1175082586125532e-05, 'epoch': 1.32}


 44%|████▍     | 11230/25428 [2:21:20<2:29:28,  1.58it/s]

{'loss': 0.1237, 'grad_norm': 1.2073506116867065, 'learning_rate': 1.1167217240836875e-05, 'epoch': 1.32}


 44%|████▍     | 11240/25428 [2:21:27<2:30:47,  1.57it/s]

{'loss': 0.104, 'grad_norm': 1.2746106386184692, 'learning_rate': 1.1159351895548216e-05, 'epoch': 1.33}


 44%|████▍     | 11250/25428 [2:21:33<2:28:36,  1.59it/s]

{'loss': 0.1515, 'grad_norm': 1.2734489440917969, 'learning_rate': 1.1151486550259558e-05, 'epoch': 1.33}


 44%|████▍     | 11260/25428 [2:21:39<2:28:38,  1.59it/s]

{'loss': 0.1218, 'grad_norm': 1.3093982934951782, 'learning_rate': 1.1143621204970898e-05, 'epoch': 1.33}


 44%|████▍     | 11270/25428 [2:21:45<2:29:06,  1.58it/s]

{'loss': 0.0932, 'grad_norm': 1.317478060722351, 'learning_rate': 1.113575585968224e-05, 'epoch': 1.33}


 44%|████▍     | 11280/25428 [2:21:52<2:33:57,  1.53it/s]

{'loss': 0.1018, 'grad_norm': 0.8257752060890198, 'learning_rate': 1.1127890514393583e-05, 'epoch': 1.33}


 44%|████▍     | 11290/25428 [2:21:58<2:26:06,  1.61it/s]

{'loss': 0.1093, 'grad_norm': 1.9786738157272339, 'learning_rate': 1.1120025169104924e-05, 'epoch': 1.33}


 44%|████▍     | 11300/25428 [2:22:04<2:25:23,  1.62it/s]

{'loss': 0.1226, 'grad_norm': 0.9723653197288513, 'learning_rate': 1.1112159823816267e-05, 'epoch': 1.33}


 44%|████▍     | 11310/25428 [2:22:10<2:25:06,  1.62it/s]

{'loss': 0.1072, 'grad_norm': 1.2617262601852417, 'learning_rate': 1.1104294478527608e-05, 'epoch': 1.33}


 45%|████▍     | 11320/25428 [2:22:17<2:26:13,  1.61it/s]

{'loss': 0.0948, 'grad_norm': 0.7117288112640381, 'learning_rate': 1.109642913323895e-05, 'epoch': 1.34}


 45%|████▍     | 11330/25428 [2:22:23<2:27:11,  1.60it/s]

{'loss': 0.1571, 'grad_norm': 1.1041135787963867, 'learning_rate': 1.1088563787950293e-05, 'epoch': 1.34}


 45%|████▍     | 11340/25428 [2:22:29<2:28:34,  1.58it/s]

{'loss': 0.1355, 'grad_norm': 3.564953327178955, 'learning_rate': 1.1080698442661634e-05, 'epoch': 1.34}


 45%|████▍     | 11350/25428 [2:22:36<2:33:14,  1.53it/s]

{'loss': 0.1167, 'grad_norm': 1.000076174736023, 'learning_rate': 1.1072833097372975e-05, 'epoch': 1.34}


 45%|████▍     | 11360/25428 [2:22:42<2:27:34,  1.59it/s]

{'loss': 0.1336, 'grad_norm': 0.7534371018409729, 'learning_rate': 1.1064967752084316e-05, 'epoch': 1.34}


 45%|████▍     | 11370/25428 [2:22:49<2:32:30,  1.54it/s]

{'loss': 0.1438, 'grad_norm': 1.0298731327056885, 'learning_rate': 1.1057102406795659e-05, 'epoch': 1.34}


 45%|████▍     | 11380/25428 [2:22:55<2:26:55,  1.59it/s]

{'loss': 0.1317, 'grad_norm': 0.9580816626548767, 'learning_rate': 1.1049237061507001e-05, 'epoch': 1.34}


 45%|████▍     | 11390/25428 [2:23:01<2:29:29,  1.57it/s]

{'loss': 0.1641, 'grad_norm': 1.277231216430664, 'learning_rate': 1.1041371716218342e-05, 'epoch': 1.34}


 45%|████▍     | 11400/25428 [2:23:08<2:30:46,  1.55it/s]

{'loss': 0.1349, 'grad_norm': 1.583788514137268, 'learning_rate': 1.1033506370929685e-05, 'epoch': 1.34}


 45%|████▍     | 11410/25428 [2:23:14<2:29:36,  1.56it/s]

{'loss': 0.1092, 'grad_norm': 0.9373043775558472, 'learning_rate': 1.1025641025641028e-05, 'epoch': 1.35}


 45%|████▍     | 11420/25428 [2:23:20<2:29:49,  1.56it/s]

{'loss': 0.0877, 'grad_norm': 0.8751555681228638, 'learning_rate': 1.1017775680352369e-05, 'epoch': 1.35}


 45%|████▍     | 11430/25428 [2:23:27<2:27:49,  1.58it/s]

{'loss': 0.1079, 'grad_norm': 1.2901666164398193, 'learning_rate': 1.100991033506371e-05, 'epoch': 1.35}


 45%|████▍     | 11440/25428 [2:23:33<2:27:16,  1.58it/s]

{'loss': 0.1156, 'grad_norm': 0.777782678604126, 'learning_rate': 1.100204498977505e-05, 'epoch': 1.35}


 45%|████▌     | 11450/25428 [2:23:39<2:26:38,  1.59it/s]

{'loss': 0.1471, 'grad_norm': 1.237229585647583, 'learning_rate': 1.0994179644486393e-05, 'epoch': 1.35}


 45%|████▌     | 11460/25428 [2:23:46<2:27:47,  1.58it/s]

{'loss': 0.123, 'grad_norm': 0.8678650259971619, 'learning_rate': 1.0986314299197736e-05, 'epoch': 1.35}


 45%|████▌     | 11470/25428 [2:23:53<2:34:01,  1.51it/s]

{'loss': 0.1098, 'grad_norm': 1.1706887483596802, 'learning_rate': 1.0978448953909077e-05, 'epoch': 1.35}


 45%|████▌     | 11480/25428 [2:23:59<2:33:00,  1.52it/s]

{'loss': 0.1031, 'grad_norm': 1.1982675790786743, 'learning_rate': 1.097058360862042e-05, 'epoch': 1.35}


 45%|████▌     | 11490/25428 [2:24:06<2:27:18,  1.58it/s]

{'loss': 0.1098, 'grad_norm': 1.4479475021362305, 'learning_rate': 1.0962718263331762e-05, 'epoch': 1.36}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1368, 'grad_norm': 0.634019672870636, 'learning_rate': 1.0954852918043103e-05, 'epoch': 1.36}


 45%|████▌     | 11510/25428 [2:24:22<2:38:10,  1.47it/s]

{'loss': 0.0989, 'grad_norm': 1.216180682182312, 'learning_rate': 1.0946987572754444e-05, 'epoch': 1.36}


 45%|████▌     | 11520/25428 [2:24:28<2:29:37,  1.55it/s]

{'loss': 0.1141, 'grad_norm': 1.5639235973358154, 'learning_rate': 1.0939122227465785e-05, 'epoch': 1.36}


 45%|████▌     | 11530/25428 [2:24:35<2:27:04,  1.57it/s]

{'loss': 0.133, 'grad_norm': 1.746437668800354, 'learning_rate': 1.0931256882177128e-05, 'epoch': 1.36}


 45%|████▌     | 11540/25428 [2:24:41<2:30:07,  1.54it/s]

{'loss': 0.1156, 'grad_norm': 0.9932736754417419, 'learning_rate': 1.092339153688847e-05, 'epoch': 1.36}


 45%|████▌     | 11550/25428 [2:24:47<2:29:08,  1.55it/s]

{'loss': 0.1401, 'grad_norm': 2.0891873836517334, 'learning_rate': 1.0915526191599812e-05, 'epoch': 1.36}


 45%|████▌     | 11560/25428 [2:24:54<2:24:31,  1.60it/s]

{'loss': 0.1459, 'grad_norm': 1.1289052963256836, 'learning_rate': 1.0907660846311154e-05, 'epoch': 1.36}


 46%|████▌     | 11570/25428 [2:25:01<2:33:42,  1.50it/s]

{'loss': 0.0994, 'grad_norm': 1.627694010734558, 'learning_rate': 1.0899795501022497e-05, 'epoch': 1.37}


 46%|████▌     | 11580/25428 [2:25:07<2:29:21,  1.55it/s]

{'loss': 0.1791, 'grad_norm': 1.0022284984588623, 'learning_rate': 1.0891930155733838e-05, 'epoch': 1.37}


 46%|████▌     | 11590/25428 [2:25:14<2:32:54,  1.51it/s]

{'loss': 0.1352, 'grad_norm': 1.4126213788986206, 'learning_rate': 1.0884064810445179e-05, 'epoch': 1.37}


 46%|████▌     | 11600/25428 [2:25:20<2:27:30,  1.56it/s]

{'loss': 0.0982, 'grad_norm': 0.878822386264801, 'learning_rate': 1.087619946515652e-05, 'epoch': 1.37}


 46%|████▌     | 11610/25428 [2:25:26<2:28:54,  1.55it/s]

{'loss': 0.0783, 'grad_norm': 0.5470747947692871, 'learning_rate': 1.0868334119867863e-05, 'epoch': 1.37}


 46%|████▌     | 11620/25428 [2:25:33<2:23:02,  1.61it/s]

{'loss': 0.1123, 'grad_norm': 1.0022400617599487, 'learning_rate': 1.0860468774579205e-05, 'epoch': 1.37}


 46%|████▌     | 11630/25428 [2:25:39<2:24:40,  1.59it/s]

{'loss': 0.1001, 'grad_norm': 0.9584015011787415, 'learning_rate': 1.0852603429290546e-05, 'epoch': 1.37}


 46%|████▌     | 11640/25428 [2:25:45<2:25:48,  1.58it/s]

{'loss': 0.1168, 'grad_norm': 1.01285982131958, 'learning_rate': 1.0844738084001889e-05, 'epoch': 1.37}


 46%|████▌     | 11650/25428 [2:25:52<2:24:11,  1.59it/s]

{'loss': 0.0809, 'grad_norm': 1.060097575187683, 'learning_rate': 1.0836872738713232e-05, 'epoch': 1.37}


 46%|████▌     | 11660/25428 [2:25:58<2:24:38,  1.59it/s]

{'loss': 0.1033, 'grad_norm': 2.869572877883911, 'learning_rate': 1.0829007393424573e-05, 'epoch': 1.38}


 46%|████▌     | 11670/25428 [2:26:04<2:25:15,  1.58it/s]

{'loss': 0.1095, 'grad_norm': 1.0974332094192505, 'learning_rate': 1.0821142048135914e-05, 'epoch': 1.38}


 46%|████▌     | 11680/25428 [2:26:11<2:22:32,  1.61it/s]

{'loss': 0.1101, 'grad_norm': 0.6420236825942993, 'learning_rate': 1.0813276702847255e-05, 'epoch': 1.38}


 46%|████▌     | 11690/25428 [2:26:17<2:25:41,  1.57it/s]

{'loss': 0.1149, 'grad_norm': 0.7677866220474243, 'learning_rate': 1.0805411357558597e-05, 'epoch': 1.38}


 46%|████▌     | 11700/25428 [2:26:24<2:30:27,  1.52it/s]

{'loss': 0.1193, 'grad_norm': 1.4773272275924683, 'learning_rate': 1.079754601226994e-05, 'epoch': 1.38}


 46%|████▌     | 11710/25428 [2:26:30<2:29:13,  1.53it/s]

{'loss': 0.1262, 'grad_norm': 1.444582223892212, 'learning_rate': 1.0789680666981281e-05, 'epoch': 1.38}


 46%|████▌     | 11720/25428 [2:26:37<2:23:27,  1.59it/s]

{'loss': 0.1345, 'grad_norm': 0.8490585684776306, 'learning_rate': 1.0781815321692624e-05, 'epoch': 1.38}


 46%|████▌     | 11730/25428 [2:26:43<2:26:34,  1.56it/s]

{'loss': 0.1232, 'grad_norm': 1.4232306480407715, 'learning_rate': 1.0773949976403966e-05, 'epoch': 1.38}


 46%|████▌     | 11740/25428 [2:26:49<2:26:25,  1.56it/s]

{'loss': 0.1416, 'grad_norm': 2.066699504852295, 'learning_rate': 1.0766084631115307e-05, 'epoch': 1.39}


 46%|████▌     | 11750/25428 [2:26:56<2:23:56,  1.58it/s]

{'loss': 0.1058, 'grad_norm': 0.8487260937690735, 'learning_rate': 1.0758219285826648e-05, 'epoch': 1.39}


 46%|████▌     | 11760/25428 [2:27:02<2:24:43,  1.57it/s]

{'loss': 0.1342, 'grad_norm': 0.8711559176445007, 'learning_rate': 1.075035394053799e-05, 'epoch': 1.39}


 46%|████▋     | 11770/25428 [2:27:08<2:25:46,  1.56it/s]

{'loss': 0.103, 'grad_norm': 1.4619206190109253, 'learning_rate': 1.0742488595249332e-05, 'epoch': 1.39}


 46%|████▋     | 11780/25428 [2:27:15<2:21:58,  1.60it/s]

{'loss': 0.115, 'grad_norm': 0.7922748923301697, 'learning_rate': 1.0734623249960675e-05, 'epoch': 1.39}


 46%|████▋     | 11790/25428 [2:27:21<2:23:31,  1.58it/s]

{'loss': 0.1077, 'grad_norm': 0.8953571915626526, 'learning_rate': 1.0726757904672016e-05, 'epoch': 1.39}


 46%|████▋     | 11800/25428 [2:27:27<2:25:06,  1.57it/s]

{'loss': 0.0828, 'grad_norm': 1.0174447298049927, 'learning_rate': 1.0718892559383358e-05, 'epoch': 1.39}


 46%|████▋     | 11810/25428 [2:27:34<2:23:07,  1.59it/s]

{'loss': 0.1429, 'grad_norm': 0.9726575613021851, 'learning_rate': 1.0711027214094701e-05, 'epoch': 1.39}


 46%|████▋     | 11820/25428 [2:27:40<2:22:11,  1.60it/s]

{'loss': 0.1006, 'grad_norm': 7.677497386932373, 'learning_rate': 1.0703161868806042e-05, 'epoch': 1.39}


 47%|████▋     | 11830/25428 [2:27:46<2:20:54,  1.61it/s]

{'loss': 0.1478, 'grad_norm': 0.6495762467384338, 'learning_rate': 1.0695296523517383e-05, 'epoch': 1.4}


 47%|████▋     | 11840/25428 [2:27:53<2:22:59,  1.58it/s]

{'loss': 0.0864, 'grad_norm': 0.7793817520141602, 'learning_rate': 1.0687431178228724e-05, 'epoch': 1.4}


 47%|████▋     | 11850/25428 [2:27:59<2:19:50,  1.62it/s]

{'loss': 0.1446, 'grad_norm': 1.1938731670379639, 'learning_rate': 1.0679565832940067e-05, 'epoch': 1.4}


 47%|████▋     | 11860/25428 [2:28:06<2:24:11,  1.57it/s]

{'loss': 0.138, 'grad_norm': 0.8784014582633972, 'learning_rate': 1.0671700487651408e-05, 'epoch': 1.4}


 47%|████▋     | 11870/25428 [2:28:12<2:32:35,  1.48it/s]

{'loss': 0.1052, 'grad_norm': 1.0778528451919556, 'learning_rate': 1.066383514236275e-05, 'epoch': 1.4}


 47%|████▋     | 11880/25428 [2:28:19<2:27:31,  1.53it/s]

{'loss': 0.1066, 'grad_norm': 1.3941938877105713, 'learning_rate': 1.0655969797074093e-05, 'epoch': 1.4}


 47%|████▋     | 11890/25428 [2:28:26<2:26:34,  1.54it/s]

{'loss': 0.1439, 'grad_norm': 1.0145364999771118, 'learning_rate': 1.0648104451785434e-05, 'epoch': 1.4}


 47%|████▋     | 11900/25428 [2:28:32<2:28:53,  1.51it/s]

{'loss': 0.1302, 'grad_norm': 3.5850331783294678, 'learning_rate': 1.0640239106496777e-05, 'epoch': 1.4}


 47%|████▋     | 11910/25428 [2:28:39<2:23:06,  1.57it/s]

{'loss': 0.136, 'grad_norm': 1.214949369430542, 'learning_rate': 1.0632373761208116e-05, 'epoch': 1.41}


 47%|████▋     | 11920/25428 [2:28:45<2:21:32,  1.59it/s]

{'loss': 0.1085, 'grad_norm': 1.204379677772522, 'learning_rate': 1.0624508415919459e-05, 'epoch': 1.41}


 47%|████▋     | 11930/25428 [2:28:51<2:22:38,  1.58it/s]

{'loss': 0.1545, 'grad_norm': 0.9578708410263062, 'learning_rate': 1.0616643070630801e-05, 'epoch': 1.41}


 47%|████▋     | 11940/25428 [2:28:58<2:22:42,  1.58it/s]

{'loss': 0.0988, 'grad_norm': 1.0606732368469238, 'learning_rate': 1.0608777725342142e-05, 'epoch': 1.41}


 47%|████▋     | 11950/25428 [2:29:04<2:19:27,  1.61it/s]

{'loss': 0.1109, 'grad_norm': 1.0526108741760254, 'learning_rate': 1.0600912380053485e-05, 'epoch': 1.41}


 47%|████▋     | 11960/25428 [2:29:10<2:20:27,  1.60it/s]

{'loss': 0.0976, 'grad_norm': 0.8499302268028259, 'learning_rate': 1.0593047034764828e-05, 'epoch': 1.41}


 47%|████▋     | 11970/25428 [2:29:16<2:21:02,  1.59it/s]

{'loss': 0.0956, 'grad_norm': 2.2855563163757324, 'learning_rate': 1.0585181689476169e-05, 'epoch': 1.41}


 47%|████▋     | 11980/25428 [2:29:23<2:22:45,  1.57it/s]

{'loss': 0.1443, 'grad_norm': 0.8235875964164734, 'learning_rate': 1.0577316344187511e-05, 'epoch': 1.41}


 47%|████▋     | 11990/25428 [2:29:29<2:24:51,  1.55it/s]

{'loss': 0.1101, 'grad_norm': 0.8850177526473999, 'learning_rate': 1.0569450998898854e-05, 'epoch': 1.41}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.118, 'grad_norm': 1.4478747844696045, 'learning_rate': 1.0561585653610193e-05, 'epoch': 1.42}


 47%|████▋     | 12010/25428 [2:29:46<2:37:35,  1.42it/s]

{'loss': 0.096, 'grad_norm': 1.194190502166748, 'learning_rate': 1.0553720308321536e-05, 'epoch': 1.42}


 47%|████▋     | 12020/25428 [2:29:53<2:31:30,  1.47it/s]

{'loss': 0.0919, 'grad_norm': 1.591574788093567, 'learning_rate': 1.0545854963032877e-05, 'epoch': 1.42}


 47%|████▋     | 12030/25428 [2:29:59<2:22:21,  1.57it/s]

{'loss': 0.1231, 'grad_norm': 1.511983871459961, 'learning_rate': 1.053798961774422e-05, 'epoch': 1.42}


 47%|████▋     | 12040/25428 [2:30:06<2:22:34,  1.57it/s]

{'loss': 0.0967, 'grad_norm': 1.378435730934143, 'learning_rate': 1.0530124272455562e-05, 'epoch': 1.42}


 47%|████▋     | 12050/25428 [2:30:12<2:23:08,  1.56it/s]

{'loss': 0.1157, 'grad_norm': 2.1378047466278076, 'learning_rate': 1.0522258927166903e-05, 'epoch': 1.42}


 47%|████▋     | 12060/25428 [2:30:19<2:30:35,  1.48it/s]

{'loss': 0.1183, 'grad_norm': 1.3588711023330688, 'learning_rate': 1.0514393581878246e-05, 'epoch': 1.42}


 47%|████▋     | 12070/25428 [2:30:25<2:29:48,  1.49it/s]

{'loss': 0.1055, 'grad_norm': 0.9201786518096924, 'learning_rate': 1.0506528236589589e-05, 'epoch': 1.42}


 48%|████▊     | 12080/25428 [2:30:32<2:25:04,  1.53it/s]

{'loss': 0.0939, 'grad_norm': 0.7221306562423706, 'learning_rate': 1.0498662891300928e-05, 'epoch': 1.43}


 48%|████▊     | 12090/25428 [2:30:38<2:24:32,  1.54it/s]

{'loss': 0.1608, 'grad_norm': 0.9114471673965454, 'learning_rate': 1.049079754601227e-05, 'epoch': 1.43}


 48%|████▊     | 12100/25428 [2:30:45<2:32:31,  1.46it/s]

{'loss': 0.1276, 'grad_norm': 0.9089028239250183, 'learning_rate': 1.0482932200723612e-05, 'epoch': 1.43}


 48%|████▊     | 12110/25428 [2:30:51<2:22:35,  1.56it/s]

{'loss': 0.0814, 'grad_norm': 0.9230232834815979, 'learning_rate': 1.0475066855434954e-05, 'epoch': 1.43}


 48%|████▊     | 12120/25428 [2:30:58<2:32:11,  1.46it/s]

{'loss': 0.0896, 'grad_norm': 0.8770695328712463, 'learning_rate': 1.0467201510146297e-05, 'epoch': 1.43}


 48%|████▊     | 12130/25428 [2:31:05<2:31:37,  1.46it/s]

{'loss': 0.1175, 'grad_norm': 1.5551252365112305, 'learning_rate': 1.0459336164857638e-05, 'epoch': 1.43}


 48%|████▊     | 12140/25428 [2:31:12<2:35:54,  1.42it/s]

{'loss': 0.1077, 'grad_norm': 1.09197998046875, 'learning_rate': 1.045147081956898e-05, 'epoch': 1.43}


 48%|████▊     | 12150/25428 [2:31:19<2:21:54,  1.56it/s]

{'loss': 0.0937, 'grad_norm': 0.8099583983421326, 'learning_rate': 1.0443605474280323e-05, 'epoch': 1.43}


 48%|████▊     | 12160/25428 [2:31:25<2:22:08,  1.56it/s]

{'loss': 0.1411, 'grad_norm': 0.950385570526123, 'learning_rate': 1.0435740128991663e-05, 'epoch': 1.43}


 48%|████▊     | 12170/25428 [2:31:32<2:20:06,  1.58it/s]

{'loss': 0.1056, 'grad_norm': 1.091294288635254, 'learning_rate': 1.0427874783703005e-05, 'epoch': 1.44}


 48%|████▊     | 12180/25428 [2:31:38<2:23:32,  1.54it/s]

{'loss': 0.1237, 'grad_norm': 1.0756292343139648, 'learning_rate': 1.0420009438414346e-05, 'epoch': 1.44}


 48%|████▊     | 12190/25428 [2:31:45<2:23:10,  1.54it/s]

{'loss': 0.0994, 'grad_norm': 1.1654441356658936, 'learning_rate': 1.0412144093125689e-05, 'epoch': 1.44}


 48%|████▊     | 12200/25428 [2:31:51<2:25:16,  1.52it/s]

{'loss': 0.112, 'grad_norm': 1.067203164100647, 'learning_rate': 1.0404278747837032e-05, 'epoch': 1.44}


 48%|████▊     | 12210/25428 [2:31:58<2:23:35,  1.53it/s]

{'loss': 0.1326, 'grad_norm': 0.7437575459480286, 'learning_rate': 1.0396413402548373e-05, 'epoch': 1.44}


 48%|████▊     | 12220/25428 [2:32:04<2:20:13,  1.57it/s]

{'loss': 0.0946, 'grad_norm': 0.8659260272979736, 'learning_rate': 1.0388548057259715e-05, 'epoch': 1.44}


 48%|████▊     | 12230/25428 [2:32:11<2:19:14,  1.58it/s]

{'loss': 0.0967, 'grad_norm': 0.8972605466842651, 'learning_rate': 1.0380682711971058e-05, 'epoch': 1.44}


 48%|████▊     | 12240/25428 [2:32:17<2:21:15,  1.56it/s]

{'loss': 0.0988, 'grad_norm': 1.4258021116256714, 'learning_rate': 1.0372817366682397e-05, 'epoch': 1.44}


 48%|████▊     | 12250/25428 [2:32:24<2:23:33,  1.53it/s]

{'loss': 0.1019, 'grad_norm': 0.9646903276443481, 'learning_rate': 1.036495202139374e-05, 'epoch': 1.45}


 48%|████▊     | 12260/25428 [2:32:30<2:19:36,  1.57it/s]

{'loss': 0.1373, 'grad_norm': 1.1550254821777344, 'learning_rate': 1.0357086676105081e-05, 'epoch': 1.45}


 48%|████▊     | 12270/25428 [2:32:36<2:23:48,  1.52it/s]

{'loss': 0.1037, 'grad_norm': 1.0952941179275513, 'learning_rate': 1.0349221330816424e-05, 'epoch': 1.45}


 48%|████▊     | 12280/25428 [2:32:43<2:23:54,  1.52it/s]

{'loss': 0.0963, 'grad_norm': 1.4999415874481201, 'learning_rate': 1.0341355985527766e-05, 'epoch': 1.45}


 48%|████▊     | 12290/25428 [2:32:50<2:25:01,  1.51it/s]

{'loss': 0.1439, 'grad_norm': 4.794425010681152, 'learning_rate': 1.0333490640239107e-05, 'epoch': 1.45}


 48%|████▊     | 12300/25428 [2:32:56<2:23:02,  1.53it/s]

{'loss': 0.0884, 'grad_norm': 1.035752296447754, 'learning_rate': 1.032562529495045e-05, 'epoch': 1.45}


 48%|████▊     | 12310/25428 [2:33:03<2:19:05,  1.57it/s]

{'loss': 0.1292, 'grad_norm': 1.2838363647460938, 'learning_rate': 1.0317759949661793e-05, 'epoch': 1.45}


 48%|████▊     | 12320/25428 [2:33:09<2:20:00,  1.56it/s]

{'loss': 0.0997, 'grad_norm': 1.3653347492218018, 'learning_rate': 1.0309894604373132e-05, 'epoch': 1.45}


 48%|████▊     | 12330/25428 [2:33:16<2:34:33,  1.41it/s]

{'loss': 0.1167, 'grad_norm': 0.9128756523132324, 'learning_rate': 1.0302029259084475e-05, 'epoch': 1.45}


 49%|████▊     | 12340/25428 [2:33:23<2:22:37,  1.53it/s]

{'loss': 0.1061, 'grad_norm': 0.8894217610359192, 'learning_rate': 1.0294163913795816e-05, 'epoch': 1.46}


 49%|████▊     | 12350/25428 [2:33:29<2:25:25,  1.50it/s]

{'loss': 0.0856, 'grad_norm': 0.895456075668335, 'learning_rate': 1.0286298568507158e-05, 'epoch': 1.46}


 49%|████▊     | 12360/25428 [2:33:36<2:20:04,  1.55it/s]

{'loss': 0.0918, 'grad_norm': 3.1726293563842773, 'learning_rate': 1.02784332232185e-05, 'epoch': 1.46}


 49%|████▊     | 12370/25428 [2:33:42<2:21:37,  1.54it/s]

{'loss': 0.1349, 'grad_norm': 0.7178751230239868, 'learning_rate': 1.0270567877929842e-05, 'epoch': 1.46}


 49%|████▊     | 12380/25428 [2:33:49<2:21:48,  1.53it/s]

{'loss': 0.1051, 'grad_norm': 1.361742615699768, 'learning_rate': 1.0262702532641185e-05, 'epoch': 1.46}


 49%|████▊     | 12390/25428 [2:33:55<2:19:26,  1.56it/s]

{'loss': 0.1108, 'grad_norm': 0.6969426274299622, 'learning_rate': 1.0254837187352526e-05, 'epoch': 1.46}


 49%|████▉     | 12400/25428 [2:34:02<2:20:09,  1.55it/s]

{'loss': 0.1172, 'grad_norm': 1.2900055646896362, 'learning_rate': 1.0246971842063867e-05, 'epoch': 1.46}


 49%|████▉     | 12410/25428 [2:34:08<2:18:57,  1.56it/s]

{'loss': 0.1386, 'grad_norm': 1.4752204418182373, 'learning_rate': 1.0239106496775208e-05, 'epoch': 1.46}


 49%|████▉     | 12420/25428 [2:34:15<2:17:44,  1.57it/s]

{'loss': 0.119, 'grad_norm': 1.8300983905792236, 'learning_rate': 1.023124115148655e-05, 'epoch': 1.47}


 49%|████▉     | 12430/25428 [2:34:21<2:19:03,  1.56it/s]

{'loss': 0.0984, 'grad_norm': 1.1760104894638062, 'learning_rate': 1.0223375806197893e-05, 'epoch': 1.47}


 49%|████▉     | 12440/25428 [2:34:28<2:23:05,  1.51it/s]

{'loss': 0.1064, 'grad_norm': 1.3573733568191528, 'learning_rate': 1.0215510460909234e-05, 'epoch': 1.47}


 49%|████▉     | 12450/25428 [2:34:34<2:20:43,  1.54it/s]

{'loss': 0.084, 'grad_norm': 1.5520071983337402, 'learning_rate': 1.0207645115620577e-05, 'epoch': 1.47}


 49%|████▉     | 12460/25428 [2:34:41<2:21:17,  1.53it/s]

{'loss': 0.0881, 'grad_norm': 0.7291840314865112, 'learning_rate': 1.019977977033192e-05, 'epoch': 1.47}


 49%|████▉     | 12470/25428 [2:34:47<2:20:16,  1.54it/s]

{'loss': 0.1378, 'grad_norm': 1.1282811164855957, 'learning_rate': 1.019191442504326e-05, 'epoch': 1.47}


 49%|████▉     | 12480/25428 [2:34:54<2:15:09,  1.60it/s]

{'loss': 0.1237, 'grad_norm': 1.5417271852493286, 'learning_rate': 1.0184049079754601e-05, 'epoch': 1.47}


 49%|████▉     | 12490/25428 [2:35:00<2:18:20,  1.56it/s]

{'loss': 0.1133, 'grad_norm': 2.4259727001190186, 'learning_rate': 1.0176183734465942e-05, 'epoch': 1.47}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1072, 'grad_norm': 1.1014282703399658, 'learning_rate': 1.0168318389177285e-05, 'epoch': 1.47}


 49%|████▉     | 12510/25428 [2:35:17<2:26:23,  1.47it/s]

{'loss': 0.1085, 'grad_norm': 1.029604196548462, 'learning_rate': 1.0160453043888628e-05, 'epoch': 1.48}


 49%|████▉     | 12520/25428 [2:35:23<2:19:54,  1.54it/s]

{'loss': 0.1222, 'grad_norm': 1.0453379154205322, 'learning_rate': 1.0152587698599969e-05, 'epoch': 1.48}


 49%|████▉     | 12530/25428 [2:35:30<2:17:59,  1.56it/s]

{'loss': 0.1676, 'grad_norm': 0.8328022956848145, 'learning_rate': 1.0144722353311311e-05, 'epoch': 1.48}


 49%|████▉     | 12540/25428 [2:35:36<2:14:27,  1.60it/s]

{'loss': 0.1021, 'grad_norm': 0.6911569833755493, 'learning_rate': 1.0136857008022654e-05, 'epoch': 1.48}


 49%|████▉     | 12550/25428 [2:35:43<2:20:03,  1.53it/s]

{'loss': 0.0935, 'grad_norm': 0.9495851397514343, 'learning_rate': 1.0128991662733995e-05, 'epoch': 1.48}


 49%|████▉     | 12560/25428 [2:35:49<2:14:15,  1.60it/s]

{'loss': 0.1315, 'grad_norm': 2.1722145080566406, 'learning_rate': 1.0121126317445336e-05, 'epoch': 1.48}


 49%|████▉     | 12570/25428 [2:35:55<2:22:00,  1.51it/s]

{'loss': 0.1151, 'grad_norm': 0.8148528933525085, 'learning_rate': 1.0113260972156677e-05, 'epoch': 1.48}


 49%|████▉     | 12580/25428 [2:36:02<2:17:07,  1.56it/s]

{'loss': 0.0946, 'grad_norm': 1.341484785079956, 'learning_rate': 1.010539562686802e-05, 'epoch': 1.48}


 50%|████▉     | 12590/25428 [2:36:08<2:17:20,  1.56it/s]

{'loss': 0.1396, 'grad_norm': 0.705099880695343, 'learning_rate': 1.0097530281579362e-05, 'epoch': 1.49}


 50%|████▉     | 12600/25428 [2:36:15<2:23:53,  1.49it/s]

{'loss': 0.1375, 'grad_norm': 2.9235727787017822, 'learning_rate': 1.0089664936290703e-05, 'epoch': 1.49}


 50%|████▉     | 12610/25428 [2:36:22<2:18:25,  1.54it/s]

{'loss': 0.1058, 'grad_norm': 0.944648802280426, 'learning_rate': 1.0081799591002046e-05, 'epoch': 1.49}


 50%|████▉     | 12620/25428 [2:36:28<2:19:04,  1.53it/s]

{'loss': 0.1184, 'grad_norm': 1.1487047672271729, 'learning_rate': 1.0073934245713389e-05, 'epoch': 1.49}


 50%|████▉     | 12630/25428 [2:36:35<2:18:24,  1.54it/s]

{'loss': 0.1024, 'grad_norm': 0.95069420337677, 'learning_rate': 1.006606890042473e-05, 'epoch': 1.49}


 50%|████▉     | 12640/25428 [2:36:41<2:18:52,  1.53it/s]

{'loss': 0.1097, 'grad_norm': 0.7783294320106506, 'learning_rate': 1.0058203555136072e-05, 'epoch': 1.49}


 50%|████▉     | 12650/25428 [2:36:48<2:19:20,  1.53it/s]

{'loss': 0.1235, 'grad_norm': 1.2554235458374023, 'learning_rate': 1.0050338209847412e-05, 'epoch': 1.49}


 50%|████▉     | 12660/25428 [2:36:54<2:15:13,  1.57it/s]

{'loss': 0.0999, 'grad_norm': 1.4130651950836182, 'learning_rate': 1.0042472864558754e-05, 'epoch': 1.49}


 50%|████▉     | 12670/25428 [2:37:00<2:15:45,  1.57it/s]

{'loss': 0.1366, 'grad_norm': 2.0265979766845703, 'learning_rate': 1.0034607519270097e-05, 'epoch': 1.49}


 50%|████▉     | 12680/25428 [2:37:07<2:13:34,  1.59it/s]

{'loss': 0.0928, 'grad_norm': 1.5285391807556152, 'learning_rate': 1.0026742173981438e-05, 'epoch': 1.5}


 50%|████▉     | 12690/25428 [2:37:13<2:17:06,  1.55it/s]

{'loss': 0.0882, 'grad_norm': 1.2055319547653198, 'learning_rate': 1.001887682869278e-05, 'epoch': 1.5}


 50%|████▉     | 12700/25428 [2:37:20<2:13:54,  1.58it/s]

{'loss': 0.1101, 'grad_norm': 0.7481322288513184, 'learning_rate': 1.0011011483404123e-05, 'epoch': 1.5}


 50%|████▉     | 12710/25428 [2:37:26<2:13:16,  1.59it/s]

{'loss': 0.1069, 'grad_norm': 1.0682014226913452, 'learning_rate': 1.0003146138115464e-05, 'epoch': 1.5}


 50%|█████     | 12720/25428 [2:37:32<2:11:51,  1.61it/s]

{'loss': 0.1322, 'grad_norm': 1.3574333190917969, 'learning_rate': 9.995280792826805e-06, 'epoch': 1.5}


 50%|█████     | 12730/25428 [2:37:39<2:11:07,  1.61it/s]

{'loss': 0.1133, 'grad_norm': 0.7993746995925903, 'learning_rate': 9.987415447538148e-06, 'epoch': 1.5}


 50%|█████     | 12740/25428 [2:37:45<2:13:11,  1.59it/s]

{'loss': 0.0993, 'grad_norm': 1.0403950214385986, 'learning_rate': 9.97955010224949e-06, 'epoch': 1.5}


 50%|█████     | 12750/25428 [2:37:51<2:11:14,  1.61it/s]

{'loss': 0.1085, 'grad_norm': 0.9327972531318665, 'learning_rate': 9.971684756960832e-06, 'epoch': 1.5}


 50%|█████     | 12760/25428 [2:37:57<2:12:40,  1.59it/s]

{'loss': 0.0887, 'grad_norm': 0.9790077209472656, 'learning_rate': 9.963819411672173e-06, 'epoch': 1.51}


 50%|█████     | 12770/25428 [2:38:04<2:13:55,  1.58it/s]

{'loss': 0.1199, 'grad_norm': 0.8390982151031494, 'learning_rate': 9.955954066383515e-06, 'epoch': 1.51}


 50%|█████     | 12780/25428 [2:38:10<2:15:04,  1.56it/s]

{'loss': 0.1013, 'grad_norm': 0.6927253007888794, 'learning_rate': 9.948088721094858e-06, 'epoch': 1.51}


 50%|█████     | 12790/25428 [2:38:17<2:12:39,  1.59it/s]

{'loss': 0.1387, 'grad_norm': 1.0623217821121216, 'learning_rate': 9.940223375806199e-06, 'epoch': 1.51}


 50%|█████     | 12800/25428 [2:38:23<2:12:56,  1.58it/s]

{'loss': 0.1071, 'grad_norm': 0.7110451459884644, 'learning_rate': 9.93235803051754e-06, 'epoch': 1.51}


 50%|█████     | 12810/25428 [2:38:29<2:11:22,  1.60it/s]

{'loss': 0.1303, 'grad_norm': 2.025373935699463, 'learning_rate': 9.924492685228883e-06, 'epoch': 1.51}


 50%|█████     | 12820/25428 [2:38:35<2:14:05,  1.57it/s]

{'loss': 0.0949, 'grad_norm': 1.1183315515518188, 'learning_rate': 9.916627339940225e-06, 'epoch': 1.51}


 50%|█████     | 12830/25428 [2:38:42<2:14:02,  1.57it/s]

{'loss': 0.1043, 'grad_norm': 1.0367168188095093, 'learning_rate': 9.908761994651566e-06, 'epoch': 1.51}


 50%|█████     | 12840/25428 [2:38:48<2:13:43,  1.57it/s]

{'loss': 0.0814, 'grad_norm': 0.7717629075050354, 'learning_rate': 9.900896649362907e-06, 'epoch': 1.51}


 51%|█████     | 12850/25428 [2:38:54<2:16:38,  1.53it/s]

{'loss': 0.1075, 'grad_norm': 1.8252021074295044, 'learning_rate': 9.89303130407425e-06, 'epoch': 1.52}


 51%|█████     | 12860/25428 [2:39:01<2:13:43,  1.57it/s]

{'loss': 0.1315, 'grad_norm': 0.9438841342926025, 'learning_rate': 9.885165958785593e-06, 'epoch': 1.52}


 51%|█████     | 12870/25428 [2:39:07<2:12:52,  1.58it/s]

{'loss': 0.1064, 'grad_norm': 1.0497859716415405, 'learning_rate': 9.877300613496934e-06, 'epoch': 1.52}


 51%|█████     | 12880/25428 [2:39:14<2:19:38,  1.50it/s]

{'loss': 0.1336, 'grad_norm': 1.0712528228759766, 'learning_rate': 9.869435268208275e-06, 'epoch': 1.52}


 51%|█████     | 12890/25428 [2:39:20<2:11:34,  1.59it/s]

{'loss': 0.1274, 'grad_norm': 4.848968505859375, 'learning_rate': 9.861569922919617e-06, 'epoch': 1.52}


 51%|█████     | 12900/25428 [2:39:27<2:14:23,  1.55it/s]

{'loss': 0.0841, 'grad_norm': 1.1128915548324585, 'learning_rate': 9.853704577630958e-06, 'epoch': 1.52}


 51%|█████     | 12910/25428 [2:39:33<2:12:16,  1.58it/s]

{'loss': 0.1143, 'grad_norm': 0.9111453294754028, 'learning_rate': 9.8458392323423e-06, 'epoch': 1.52}


 51%|█████     | 12920/25428 [2:39:40<2:15:32,  1.54it/s]

{'loss': 0.0972, 'grad_norm': 1.1230382919311523, 'learning_rate': 9.837973887053642e-06, 'epoch': 1.52}


 51%|█████     | 12930/25428 [2:39:46<2:11:01,  1.59it/s]

{'loss': 0.1294, 'grad_norm': 0.9455891847610474, 'learning_rate': 9.830108541764985e-06, 'epoch': 1.53}


 51%|█████     | 12940/25428 [2:39:52<2:14:19,  1.55it/s]

{'loss': 0.1149, 'grad_norm': 1.0058366060256958, 'learning_rate': 9.822243196476326e-06, 'epoch': 1.53}


 51%|█████     | 12950/25428 [2:39:59<2:13:06,  1.56it/s]

{'loss': 0.1022, 'grad_norm': 1.3666462898254395, 'learning_rate': 9.814377851187667e-06, 'epoch': 1.53}


 51%|█████     | 12960/25428 [2:40:05<2:14:31,  1.54it/s]

{'loss': 0.0996, 'grad_norm': 1.0819038152694702, 'learning_rate': 9.80651250589901e-06, 'epoch': 1.53}


 51%|█████     | 12970/25428 [2:40:12<2:12:38,  1.57it/s]

{'loss': 0.119, 'grad_norm': 1.7143774032592773, 'learning_rate': 9.798647160610352e-06, 'epoch': 1.53}


 51%|█████     | 12980/25428 [2:40:18<2:12:04,  1.57it/s]

{'loss': 0.1066, 'grad_norm': 1.0163894891738892, 'learning_rate': 9.790781815321693e-06, 'epoch': 1.53}


 51%|█████     | 12990/25428 [2:40:24<2:14:46,  1.54it/s]

{'loss': 0.0879, 'grad_norm': 0.8293868899345398, 'learning_rate': 9.782916470033034e-06, 'epoch': 1.53}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1112, 'grad_norm': 1.4429453611373901, 'learning_rate': 9.775051124744377e-06, 'epoch': 1.53}


 51%|█████     | 13010/25428 [2:40:41<2:17:50,  1.50it/s]

{'loss': 0.1015, 'grad_norm': 1.1982694864273071, 'learning_rate': 9.76718577945572e-06, 'epoch': 1.53}


 51%|█████     | 13020/25428 [2:40:47<2:16:45,  1.51it/s]

{'loss': 0.1221, 'grad_norm': 1.0129305124282837, 'learning_rate': 9.75932043416706e-06, 'epoch': 1.54}


 51%|█████     | 13030/25428 [2:40:54<2:13:37,  1.55it/s]

{'loss': 0.0937, 'grad_norm': 0.7490450739860535, 'learning_rate': 9.751455088878401e-06, 'epoch': 1.54}


 51%|█████▏    | 13040/25428 [2:41:00<2:12:09,  1.56it/s]

{'loss': 0.1767, 'grad_norm': 0.942162275314331, 'learning_rate': 9.743589743589744e-06, 'epoch': 1.54}


 51%|█████▏    | 13050/25428 [2:41:06<2:11:39,  1.57it/s]

{'loss': 0.1091, 'grad_norm': 9.464899063110352, 'learning_rate': 9.735724398301087e-06, 'epoch': 1.54}


 51%|█████▏    | 13060/25428 [2:41:13<2:14:06,  1.54it/s]

{'loss': 0.0958, 'grad_norm': 0.9680486917495728, 'learning_rate': 9.727859053012428e-06, 'epoch': 1.54}


 51%|█████▏    | 13070/25428 [2:41:19<2:15:49,  1.52it/s]

{'loss': 0.0727, 'grad_norm': 0.816383421421051, 'learning_rate': 9.719993707723769e-06, 'epoch': 1.54}


 51%|█████▏    | 13080/25428 [2:41:26<2:13:43,  1.54it/s]

{'loss': 0.1233, 'grad_norm': 1.7164890766143799, 'learning_rate': 9.712128362435111e-06, 'epoch': 1.54}


 51%|█████▏    | 13090/25428 [2:41:32<2:10:44,  1.57it/s]

{'loss': 0.1323, 'grad_norm': 1.6705169677734375, 'learning_rate': 9.704263017146454e-06, 'epoch': 1.54}


 52%|█████▏    | 13100/25428 [2:41:39<2:11:13,  1.57it/s]

{'loss': 0.1332, 'grad_norm': 1.0941163301467896, 'learning_rate': 9.696397671857795e-06, 'epoch': 1.55}


 52%|█████▏    | 13110/25428 [2:41:45<2:13:57,  1.53it/s]

{'loss': 0.1351, 'grad_norm': 1.0971770286560059, 'learning_rate': 9.688532326569136e-06, 'epoch': 1.55}


 52%|█████▏    | 13120/25428 [2:41:52<2:08:41,  1.59it/s]

{'loss': 0.0903, 'grad_norm': 0.8618335723876953, 'learning_rate': 9.680666981280479e-06, 'epoch': 1.55}


 52%|█████▏    | 13130/25428 [2:41:58<2:09:19,  1.58it/s]

{'loss': 0.1251, 'grad_norm': 0.9201576709747314, 'learning_rate': 9.672801635991821e-06, 'epoch': 1.55}


 52%|█████▏    | 13140/25428 [2:42:04<2:13:42,  1.53it/s]

{'loss': 0.1036, 'grad_norm': 0.9648464918136597, 'learning_rate': 9.664936290703162e-06, 'epoch': 1.55}


 52%|█████▏    | 13150/25428 [2:42:11<2:13:42,  1.53it/s]

{'loss': 0.1432, 'grad_norm': 1.1725561618804932, 'learning_rate': 9.657070945414503e-06, 'epoch': 1.55}


 52%|█████▏    | 13160/25428 [2:42:17<2:11:25,  1.56it/s]

{'loss': 0.0938, 'grad_norm': 0.8350266218185425, 'learning_rate': 9.649205600125846e-06, 'epoch': 1.55}


 52%|█████▏    | 13170/25428 [2:42:24<2:12:35,  1.54it/s]

{'loss': 0.0988, 'grad_norm': 1.4054334163665771, 'learning_rate': 9.641340254837189e-06, 'epoch': 1.55}


 52%|█████▏    | 13180/25428 [2:42:30<2:11:42,  1.55it/s]

{'loss': 0.0994, 'grad_norm': 1.0582528114318848, 'learning_rate': 9.63347490954853e-06, 'epoch': 1.55}


 52%|█████▏    | 13190/25428 [2:42:37<2:09:43,  1.57it/s]

{'loss': 0.1273, 'grad_norm': 1.115204930305481, 'learning_rate': 9.62560956425987e-06, 'epoch': 1.56}


 52%|█████▏    | 13200/25428 [2:42:43<2:06:52,  1.61it/s]

{'loss': 0.104, 'grad_norm': 1.1850180625915527, 'learning_rate': 9.617744218971213e-06, 'epoch': 1.56}


 52%|█████▏    | 13210/25428 [2:42:49<2:07:16,  1.60it/s]

{'loss': 0.0833, 'grad_norm': 0.9174594283103943, 'learning_rate': 9.609878873682556e-06, 'epoch': 1.56}


 52%|█████▏    | 13220/25428 [2:42:55<2:07:53,  1.59it/s]

{'loss': 0.1036, 'grad_norm': 0.8331064581871033, 'learning_rate': 9.602013528393897e-06, 'epoch': 1.56}


 52%|█████▏    | 13230/25428 [2:43:02<2:09:03,  1.58it/s]

{'loss': 0.1176, 'grad_norm': 0.8934900164604187, 'learning_rate': 9.59414818310524e-06, 'epoch': 1.56}


 52%|█████▏    | 13240/25428 [2:43:08<2:09:54,  1.56it/s]

{'loss': 0.1064, 'grad_norm': 1.2663047313690186, 'learning_rate': 9.58628283781658e-06, 'epoch': 1.56}


 52%|█████▏    | 13250/25428 [2:43:15<2:15:39,  1.50it/s]

{'loss': 0.1615, 'grad_norm': 0.8809673190116882, 'learning_rate': 9.578417492527923e-06, 'epoch': 1.56}


 52%|█████▏    | 13260/25428 [2:43:21<2:09:04,  1.57it/s]

{'loss': 0.1086, 'grad_norm': 1.152685284614563, 'learning_rate': 9.570552147239264e-06, 'epoch': 1.56}


 52%|█████▏    | 13270/25428 [2:43:28<2:07:13,  1.59it/s]

{'loss': 0.0956, 'grad_norm': 0.8653648495674133, 'learning_rate': 9.562686801950607e-06, 'epoch': 1.57}


 52%|█████▏    | 13280/25428 [2:43:34<2:06:50,  1.60it/s]

{'loss': 0.0955, 'grad_norm': 1.1863058805465698, 'learning_rate': 9.554821456661948e-06, 'epoch': 1.57}


 52%|█████▏    | 13290/25428 [2:43:40<2:09:15,  1.57it/s]

{'loss': 0.111, 'grad_norm': 0.7325499653816223, 'learning_rate': 9.54695611137329e-06, 'epoch': 1.57}


 52%|█████▏    | 13300/25428 [2:43:47<2:09:39,  1.56it/s]

{'loss': 0.1275, 'grad_norm': 1.274601697921753, 'learning_rate': 9.539090766084632e-06, 'epoch': 1.57}


 52%|█████▏    | 13310/25428 [2:43:53<2:08:02,  1.58it/s]

{'loss': 0.0867, 'grad_norm': 0.8936898708343506, 'learning_rate': 9.531225420795974e-06, 'epoch': 1.57}


 52%|█████▏    | 13320/25428 [2:43:59<2:06:47,  1.59it/s]

{'loss': 0.0863, 'grad_norm': 0.6980330944061279, 'learning_rate': 9.523360075507315e-06, 'epoch': 1.57}


 52%|█████▏    | 13330/25428 [2:44:06<2:04:46,  1.62it/s]

{'loss': 0.1136, 'grad_norm': 1.412767767906189, 'learning_rate': 9.515494730218658e-06, 'epoch': 1.57}


 52%|█████▏    | 13340/25428 [2:44:12<2:06:32,  1.59it/s]

{'loss': 0.0831, 'grad_norm': 0.6414442658424377, 'learning_rate': 9.507629384929999e-06, 'epoch': 1.57}


 53%|█████▎    | 13350/25428 [2:44:18<2:09:23,  1.56it/s]

{'loss': 0.096, 'grad_norm': 0.6819939613342285, 'learning_rate': 9.499764039641342e-06, 'epoch': 1.58}


 53%|█████▎    | 13360/25428 [2:44:25<2:07:18,  1.58it/s]

{'loss': 0.0934, 'grad_norm': 1.4307678937911987, 'learning_rate': 9.491898694352683e-06, 'epoch': 1.58}


 53%|█████▎    | 13370/25428 [2:44:31<2:05:40,  1.60it/s]

{'loss': 0.1174, 'grad_norm': 0.8399147987365723, 'learning_rate': 9.484033349064025e-06, 'epoch': 1.58}


 53%|█████▎    | 13380/25428 [2:44:37<2:07:02,  1.58it/s]

{'loss': 0.0978, 'grad_norm': 1.1207751035690308, 'learning_rate': 9.476168003775366e-06, 'epoch': 1.58}


 53%|█████▎    | 13390/25428 [2:44:44<2:05:19,  1.60it/s]

{'loss': 0.1075, 'grad_norm': 1.500251293182373, 'learning_rate': 9.468302658486709e-06, 'epoch': 1.58}


 53%|█████▎    | 13400/25428 [2:44:50<2:06:11,  1.59it/s]

{'loss': 0.1258, 'grad_norm': 1.1123658418655396, 'learning_rate': 9.46043731319805e-06, 'epoch': 1.58}


 53%|█████▎    | 13410/25428 [2:44:56<2:06:37,  1.58it/s]

{'loss': 0.0943, 'grad_norm': 0.9063106179237366, 'learning_rate': 9.452571967909391e-06, 'epoch': 1.58}


 53%|█████▎    | 13420/25428 [2:45:03<2:05:50,  1.59it/s]

{'loss': 0.1024, 'grad_norm': 1.0274112224578857, 'learning_rate': 9.444706622620734e-06, 'epoch': 1.58}


 53%|█████▎    | 13430/25428 [2:45:09<2:05:04,  1.60it/s]

{'loss': 0.1081, 'grad_norm': 0.6237653493881226, 'learning_rate': 9.436841277332076e-06, 'epoch': 1.58}


 53%|█████▎    | 13440/25428 [2:45:15<2:08:04,  1.56it/s]

{'loss': 0.101, 'grad_norm': 0.9390432238578796, 'learning_rate': 9.428975932043417e-06, 'epoch': 1.59}


 53%|█████▎    | 13450/25428 [2:45:22<2:07:14,  1.57it/s]

{'loss': 0.1048, 'grad_norm': 1.6412270069122314, 'learning_rate': 9.421110586754758e-06, 'epoch': 1.59}


 53%|█████▎    | 13460/25428 [2:45:28<2:06:34,  1.58it/s]

{'loss': 0.0986, 'grad_norm': 1.0820599794387817, 'learning_rate': 9.413245241466101e-06, 'epoch': 1.59}


 53%|█████▎    | 13470/25428 [2:45:34<2:06:52,  1.57it/s]

{'loss': 0.1006, 'grad_norm': 0.8581246137619019, 'learning_rate': 9.405379896177444e-06, 'epoch': 1.59}


 53%|█████▎    | 13480/25428 [2:45:41<2:09:12,  1.54it/s]

{'loss': 0.0909, 'grad_norm': 1.1228910684585571, 'learning_rate': 9.397514550888785e-06, 'epoch': 1.59}


 53%|█████▎    | 13490/25428 [2:45:47<2:07:51,  1.56it/s]

{'loss': 0.1032, 'grad_norm': 0.9933993220329285, 'learning_rate': 9.389649205600126e-06, 'epoch': 1.59}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1192, 'grad_norm': 0.9133040904998779, 'learning_rate': 9.381783860311468e-06, 'epoch': 1.59}


 53%|█████▎    | 13510/25428 [2:46:03<2:11:26,  1.51it/s]

{'loss': 0.1233, 'grad_norm': 1.115723967552185, 'learning_rate': 9.373918515022811e-06, 'epoch': 1.59}


 53%|█████▎    | 13520/25428 [2:46:10<2:04:10,  1.60it/s]

{'loss': 0.1119, 'grad_norm': 1.2417675256729126, 'learning_rate': 9.366053169734152e-06, 'epoch': 1.6}


 53%|█████▎    | 13530/25428 [2:46:16<2:05:55,  1.57it/s]

{'loss': 0.1387, 'grad_norm': 1.738227128982544, 'learning_rate': 9.358187824445493e-06, 'epoch': 1.6}


 53%|█████▎    | 13540/25428 [2:46:22<2:03:15,  1.61it/s]

{'loss': 0.1057, 'grad_norm': 0.6291214823722839, 'learning_rate': 9.350322479156836e-06, 'epoch': 1.6}


 53%|█████▎    | 13550/25428 [2:46:29<2:05:40,  1.58it/s]

{'loss': 0.0965, 'grad_norm': 0.9991096258163452, 'learning_rate': 9.342457133868178e-06, 'epoch': 1.6}


 53%|█████▎    | 13560/25428 [2:46:35<2:03:17,  1.60it/s]

{'loss': 0.1193, 'grad_norm': 1.5010583400726318, 'learning_rate': 9.33459178857952e-06, 'epoch': 1.6}


 53%|█████▎    | 13570/25428 [2:46:41<2:04:01,  1.59it/s]

{'loss': 0.1051, 'grad_norm': 1.1017200946807861, 'learning_rate': 9.32672644329086e-06, 'epoch': 1.6}


 53%|█████▎    | 13580/25428 [2:46:48<2:03:19,  1.60it/s]

{'loss': 0.0985, 'grad_norm': 1.6683108806610107, 'learning_rate': 9.318861098002203e-06, 'epoch': 1.6}


 53%|█████▎    | 13590/25428 [2:46:54<2:03:12,  1.60it/s]

{'loss': 0.1048, 'grad_norm': 0.7989406585693359, 'learning_rate': 9.310995752713546e-06, 'epoch': 1.6}


 53%|█████▎    | 13600/25428 [2:47:00<2:07:04,  1.55it/s]

{'loss': 0.1488, 'grad_norm': 1.2265958786010742, 'learning_rate': 9.303130407424887e-06, 'epoch': 1.6}


 54%|█████▎    | 13610/25428 [2:47:06<2:03:05,  1.60it/s]

{'loss': 0.092, 'grad_norm': 1.008589506149292, 'learning_rate': 9.295265062136228e-06, 'epoch': 1.61}


 54%|█████▎    | 13620/25428 [2:47:13<2:05:59,  1.56it/s]

{'loss': 0.1486, 'grad_norm': 0.7994009852409363, 'learning_rate': 9.28739971684757e-06, 'epoch': 1.61}


 54%|█████▎    | 13630/25428 [2:47:19<2:04:55,  1.57it/s]

{'loss': 0.1045, 'grad_norm': 0.9545822143554688, 'learning_rate': 9.279534371558913e-06, 'epoch': 1.61}


 54%|█████▎    | 13640/25428 [2:47:25<2:06:18,  1.56it/s]

{'loss': 0.1181, 'grad_norm': 1.6926190853118896, 'learning_rate': 9.271669026270254e-06, 'epoch': 1.61}


 54%|█████▎    | 13650/25428 [2:47:32<2:05:36,  1.56it/s]

{'loss': 0.1523, 'grad_norm': 0.7317684292793274, 'learning_rate': 9.263803680981595e-06, 'epoch': 1.61}


 54%|█████▎    | 13660/25428 [2:47:38<2:02:24,  1.60it/s]

{'loss': 0.0913, 'grad_norm': 1.3249772787094116, 'learning_rate': 9.255938335692938e-06, 'epoch': 1.61}


 54%|█████▍    | 13670/25428 [2:47:45<2:02:22,  1.60it/s]

{'loss': 0.1369, 'grad_norm': 2.254653215408325, 'learning_rate': 9.24807299040428e-06, 'epoch': 1.61}


 54%|█████▍    | 13680/25428 [2:47:51<2:00:30,  1.62it/s]

{'loss': 0.1346, 'grad_norm': 1.193033218383789, 'learning_rate': 9.240207645115621e-06, 'epoch': 1.61}


 54%|█████▍    | 13690/25428 [2:47:57<2:02:59,  1.59it/s]

{'loss': 0.1178, 'grad_norm': 1.9241365194320679, 'learning_rate': 9.232342299826962e-06, 'epoch': 1.62}


 54%|█████▍    | 13700/25428 [2:48:03<2:05:03,  1.56it/s]

{'loss': 0.1053, 'grad_norm': 0.831519603729248, 'learning_rate': 9.224476954538305e-06, 'epoch': 1.62}


 54%|█████▍    | 13710/25428 [2:48:10<2:01:59,  1.60it/s]

{'loss': 0.0831, 'grad_norm': 0.8422461152076721, 'learning_rate': 9.216611609249648e-06, 'epoch': 1.62}


 54%|█████▍    | 13720/25428 [2:48:16<2:05:02,  1.56it/s]

{'loss': 0.1107, 'grad_norm': 4.285187244415283, 'learning_rate': 9.208746263960989e-06, 'epoch': 1.62}


 54%|█████▍    | 13730/25428 [2:48:23<2:05:20,  1.56it/s]

{'loss': 0.1264, 'grad_norm': 1.666472315788269, 'learning_rate': 9.20088091867233e-06, 'epoch': 1.62}


 54%|█████▍    | 13740/25428 [2:48:29<2:04:54,  1.56it/s]

{'loss': 0.108, 'grad_norm': 1.1554361581802368, 'learning_rate': 9.193015573383672e-06, 'epoch': 1.62}


 54%|█████▍    | 13750/25428 [2:48:35<2:04:23,  1.56it/s]

{'loss': 0.1059, 'grad_norm': 0.8291816711425781, 'learning_rate': 9.185150228095015e-06, 'epoch': 1.62}


 54%|█████▍    | 13760/25428 [2:48:42<2:03:58,  1.57it/s]

{'loss': 0.1018, 'grad_norm': 1.3755072355270386, 'learning_rate': 9.177284882806356e-06, 'epoch': 1.62}


 54%|█████▍    | 13770/25428 [2:48:48<2:03:17,  1.58it/s]

{'loss': 0.1061, 'grad_norm': 0.652140736579895, 'learning_rate': 9.169419537517697e-06, 'epoch': 1.62}


 54%|█████▍    | 13780/25428 [2:48:54<2:02:16,  1.59it/s]

{'loss': 0.136, 'grad_norm': 0.8705208897590637, 'learning_rate': 9.16155419222904e-06, 'epoch': 1.63}


 54%|█████▍    | 13790/25428 [2:49:01<2:02:18,  1.59it/s]

{'loss': 0.1157, 'grad_norm': 1.518107295036316, 'learning_rate': 9.153688846940382e-06, 'epoch': 1.63}


 54%|█████▍    | 13800/25428 [2:49:07<2:02:09,  1.59it/s]

{'loss': 0.0966, 'grad_norm': 2.622706890106201, 'learning_rate': 9.145823501651723e-06, 'epoch': 1.63}


 54%|█████▍    | 13810/25428 [2:49:13<2:05:38,  1.54it/s]

{'loss': 0.1, 'grad_norm': 0.6707150340080261, 'learning_rate': 9.137958156363064e-06, 'epoch': 1.63}


 54%|█████▍    | 13820/25428 [2:49:20<1:59:16,  1.62it/s]

{'loss': 0.0851, 'grad_norm': 0.6943297982215881, 'learning_rate': 9.130092811074407e-06, 'epoch': 1.63}


 54%|█████▍    | 13830/25428 [2:49:26<1:59:10,  1.62it/s]

{'loss': 0.1408, 'grad_norm': 1.7027764320373535, 'learning_rate': 9.12222746578575e-06, 'epoch': 1.63}


 54%|█████▍    | 13840/25428 [2:49:32<2:00:56,  1.60it/s]

{'loss': 0.1027, 'grad_norm': 1.1693376302719116, 'learning_rate': 9.11436212049709e-06, 'epoch': 1.63}


 54%|█████▍    | 13850/25428 [2:49:39<2:01:35,  1.59it/s]

{'loss': 0.1014, 'grad_norm': 1.6302858591079712, 'learning_rate': 9.106496775208432e-06, 'epoch': 1.63}


 55%|█████▍    | 13860/25428 [2:49:45<2:01:24,  1.59it/s]

{'loss': 0.1146, 'grad_norm': 0.7666190266609192, 'learning_rate': 9.098631429919774e-06, 'epoch': 1.64}


 55%|█████▍    | 13870/25428 [2:49:51<1:58:50,  1.62it/s]

{'loss': 0.1114, 'grad_norm': 1.1893113851547241, 'learning_rate': 9.090766084631117e-06, 'epoch': 1.64}


 55%|█████▍    | 13880/25428 [2:49:57<2:01:39,  1.58it/s]

{'loss': 0.1118, 'grad_norm': 1.7358735799789429, 'learning_rate': 9.082900739342458e-06, 'epoch': 1.64}


 55%|█████▍    | 13890/25428 [2:50:04<2:01:17,  1.59it/s]

{'loss': 0.0795, 'grad_norm': 1.2217254638671875, 'learning_rate': 9.075035394053799e-06, 'epoch': 1.64}


 55%|█████▍    | 13900/25428 [2:50:10<2:00:26,  1.60it/s]

{'loss': 0.0867, 'grad_norm': 0.8519184589385986, 'learning_rate': 9.067170048765142e-06, 'epoch': 1.64}


 55%|█████▍    | 13910/25428 [2:50:16<2:01:38,  1.58it/s]

{'loss': 0.1176, 'grad_norm': 0.8537103533744812, 'learning_rate': 9.059304703476484e-06, 'epoch': 1.64}


 55%|█████▍    | 13920/25428 [2:50:23<1:59:56,  1.60it/s]

{'loss': 0.0969, 'grad_norm': 1.3808413743972778, 'learning_rate': 9.051439358187825e-06, 'epoch': 1.64}


 55%|█████▍    | 13930/25428 [2:50:29<2:01:51,  1.57it/s]

{'loss': 0.0769, 'grad_norm': 0.9369145631790161, 'learning_rate': 9.043574012899166e-06, 'epoch': 1.64}


 55%|█████▍    | 13940/25428 [2:50:35<1:59:42,  1.60it/s]

{'loss': 0.1042, 'grad_norm': 1.2691060304641724, 'learning_rate': 9.035708667610509e-06, 'epoch': 1.64}


 55%|█████▍    | 13950/25428 [2:50:41<2:03:07,  1.55it/s]

{'loss': 0.1076, 'grad_norm': 0.6970950961112976, 'learning_rate': 9.02784332232185e-06, 'epoch': 1.65}


 55%|█████▍    | 13960/25428 [2:50:48<2:05:23,  1.52it/s]

{'loss': 0.1109, 'grad_norm': 1.5454658269882202, 'learning_rate': 9.019977977033193e-06, 'epoch': 1.65}


 55%|█████▍    | 13970/25428 [2:50:54<2:00:25,  1.59it/s]

{'loss': 0.1074, 'grad_norm': 1.3115172386169434, 'learning_rate': 9.012112631744534e-06, 'epoch': 1.65}


 55%|█████▍    | 13980/25428 [2:51:01<1:59:32,  1.60it/s]

{'loss': 0.1255, 'grad_norm': 0.9120321869850159, 'learning_rate': 9.004247286455876e-06, 'epoch': 1.65}


 55%|█████▌    | 13990/25428 [2:51:07<2:00:25,  1.58it/s]

{'loss': 0.0974, 'grad_norm': 1.0398705005645752, 'learning_rate': 8.996381941167217e-06, 'epoch': 1.65}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1824, 'grad_norm': 0.823233962059021, 'learning_rate': 8.98851659587856e-06, 'epoch': 1.65}


 55%|█████▌    | 14010/25428 [2:51:23<2:07:11,  1.50it/s]

{'loss': 0.1004, 'grad_norm': 1.292949914932251, 'learning_rate': 8.980651250589901e-06, 'epoch': 1.65}


 55%|█████▌    | 14020/25428 [2:51:30<2:01:29,  1.56it/s]

{'loss': 0.1082, 'grad_norm': 0.8141164183616638, 'learning_rate': 8.972785905301244e-06, 'epoch': 1.65}


 55%|█████▌    | 14030/25428 [2:51:36<2:00:33,  1.58it/s]

{'loss': 0.0972, 'grad_norm': 0.7193059921264648, 'learning_rate': 8.964920560012585e-06, 'epoch': 1.66}


 55%|█████▌    | 14040/25428 [2:51:42<2:00:03,  1.58it/s]

{'loss': 0.1069, 'grad_norm': 1.6532541513442993, 'learning_rate': 8.957055214723927e-06, 'epoch': 1.66}


 55%|█████▌    | 14050/25428 [2:51:49<2:01:08,  1.57it/s]

{'loss': 0.1118, 'grad_norm': 0.9669573903083801, 'learning_rate': 8.949189869435268e-06, 'epoch': 1.66}


 55%|█████▌    | 14060/25428 [2:51:55<1:59:07,  1.59it/s]

{'loss': 0.086, 'grad_norm': 4.641191005706787, 'learning_rate': 8.941324524146611e-06, 'epoch': 1.66}


 55%|█████▌    | 14070/25428 [2:52:01<1:56:58,  1.62it/s]

{'loss': 0.1637, 'grad_norm': 1.2959522008895874, 'learning_rate': 8.933459178857952e-06, 'epoch': 1.66}


 55%|█████▌    | 14080/25428 [2:52:07<1:57:54,  1.60it/s]

{'loss': 0.0997, 'grad_norm': 2.0172786712646484, 'learning_rate': 8.925593833569295e-06, 'epoch': 1.66}


 55%|█████▌    | 14090/25428 [2:52:14<2:01:30,  1.56it/s]

{'loss': 0.1153, 'grad_norm': 0.9782839417457581, 'learning_rate': 8.917728488280636e-06, 'epoch': 1.66}


 55%|█████▌    | 14100/25428 [2:52:20<1:57:17,  1.61it/s]

{'loss': 0.0934, 'grad_norm': 1.1314040422439575, 'learning_rate': 8.909863142991978e-06, 'epoch': 1.66}


 55%|█████▌    | 14110/25428 [2:52:26<1:57:47,  1.60it/s]

{'loss': 0.1072, 'grad_norm': 1.8037943840026855, 'learning_rate': 8.90199779770332e-06, 'epoch': 1.66}


 56%|█████▌    | 14120/25428 [2:52:33<1:59:41,  1.57it/s]

{'loss': 0.1189, 'grad_norm': 1.0272729396820068, 'learning_rate': 8.894132452414662e-06, 'epoch': 1.67}


 56%|█████▌    | 14130/25428 [2:52:39<2:01:49,  1.55it/s]

{'loss': 0.0872, 'grad_norm': 0.8960787057876587, 'learning_rate': 8.886267107126003e-06, 'epoch': 1.67}


 56%|█████▌    | 14140/25428 [2:52:45<1:59:36,  1.57it/s]

{'loss': 0.1197, 'grad_norm': 1.1607550382614136, 'learning_rate': 8.878401761837346e-06, 'epoch': 1.67}


 56%|█████▌    | 14150/25428 [2:52:52<1:59:58,  1.57it/s]

{'loss': 0.0835, 'grad_norm': 1.2550370693206787, 'learning_rate': 8.870536416548687e-06, 'epoch': 1.67}


 56%|█████▌    | 14160/25428 [2:52:58<1:57:17,  1.60it/s]

{'loss': 0.1256, 'grad_norm': 0.887252926826477, 'learning_rate': 8.86267107126003e-06, 'epoch': 1.67}


 56%|█████▌    | 14170/25428 [2:53:04<2:00:45,  1.55it/s]

{'loss': 0.0925, 'grad_norm': 0.8405895233154297, 'learning_rate': 8.85480572597137e-06, 'epoch': 1.67}


 56%|█████▌    | 14180/25428 [2:53:11<1:59:33,  1.57it/s]

{'loss': 0.1121, 'grad_norm': 5.251931667327881, 'learning_rate': 8.846940380682713e-06, 'epoch': 1.67}


 56%|█████▌    | 14190/25428 [2:53:17<1:59:22,  1.57it/s]

{'loss': 0.1271, 'grad_norm': 0.9193692803382874, 'learning_rate': 8.839075035394054e-06, 'epoch': 1.67}


 56%|█████▌    | 14200/25428 [2:53:24<1:58:14,  1.58it/s]

{'loss': 0.0962, 'grad_norm': 0.8034351468086243, 'learning_rate': 8.831209690105397e-06, 'epoch': 1.68}


 56%|█████▌    | 14210/25428 [2:53:30<1:58:51,  1.57it/s]

{'loss': 0.1057, 'grad_norm': 1.0434918403625488, 'learning_rate': 8.823344344816738e-06, 'epoch': 1.68}


 56%|█████▌    | 14220/25428 [2:53:36<2:00:12,  1.55it/s]

{'loss': 0.1032, 'grad_norm': 0.7659276127815247, 'learning_rate': 8.81547899952808e-06, 'epoch': 1.68}


 56%|█████▌    | 14230/25428 [2:53:43<1:58:58,  1.57it/s]

{'loss': 0.0761, 'grad_norm': 1.0207093954086304, 'learning_rate': 8.807613654239421e-06, 'epoch': 1.68}


 56%|█████▌    | 14240/25428 [2:53:49<1:57:35,  1.59it/s]

{'loss': 0.0807, 'grad_norm': 2.345872640609741, 'learning_rate': 8.799748308950764e-06, 'epoch': 1.68}


 56%|█████▌    | 14250/25428 [2:53:56<1:59:07,  1.56it/s]

{'loss': 0.0945, 'grad_norm': 0.6797863841056824, 'learning_rate': 8.791882963662105e-06, 'epoch': 1.68}


 56%|█████▌    | 14260/25428 [2:54:02<2:01:15,  1.54it/s]

{'loss': 0.1011, 'grad_norm': 2.8605124950408936, 'learning_rate': 8.784017618373448e-06, 'epoch': 1.68}


 56%|█████▌    | 14270/25428 [2:54:09<2:10:25,  1.43it/s]

{'loss': 0.105, 'grad_norm': 0.9279377460479736, 'learning_rate': 8.776152273084789e-06, 'epoch': 1.68}


 56%|█████▌    | 14280/25428 [2:54:16<2:07:34,  1.46it/s]

{'loss': 0.12, 'grad_norm': 1.7060673236846924, 'learning_rate': 8.768286927796131e-06, 'epoch': 1.68}


 56%|█████▌    | 14290/25428 [2:54:23<2:02:52,  1.51it/s]

{'loss': 0.108, 'grad_norm': 0.8327508568763733, 'learning_rate': 8.760421582507472e-06, 'epoch': 1.69}


 56%|█████▌    | 14300/25428 [2:54:30<2:04:23,  1.49it/s]

{'loss': 0.1018, 'grad_norm': 1.1543749570846558, 'learning_rate': 8.752556237218815e-06, 'epoch': 1.69}


 56%|█████▋    | 14310/25428 [2:54:36<1:58:49,  1.56it/s]

{'loss': 0.1059, 'grad_norm': 0.9015442728996277, 'learning_rate': 8.744690891930156e-06, 'epoch': 1.69}


 56%|█████▋    | 14320/25428 [2:54:43<2:01:11,  1.53it/s]

{'loss': 0.0899, 'grad_norm': 1.262965440750122, 'learning_rate': 8.736825546641499e-06, 'epoch': 1.69}


 56%|█████▋    | 14330/25428 [2:54:49<1:59:22,  1.55it/s]

{'loss': 0.0944, 'grad_norm': 1.0409021377563477, 'learning_rate': 8.72896020135284e-06, 'epoch': 1.69}


 56%|█████▋    | 14340/25428 [2:54:56<2:01:05,  1.53it/s]

{'loss': 0.093, 'grad_norm': 1.2538870573043823, 'learning_rate': 8.721094856064182e-06, 'epoch': 1.69}


 56%|█████▋    | 14350/25428 [2:55:02<1:56:49,  1.58it/s]

{'loss': 0.0946, 'grad_norm': 1.0683947801589966, 'learning_rate': 8.713229510775523e-06, 'epoch': 1.69}


 56%|█████▋    | 14360/25428 [2:55:08<1:54:42,  1.61it/s]

{'loss': 0.1093, 'grad_norm': 1.2313556671142578, 'learning_rate': 8.705364165486866e-06, 'epoch': 1.69}


 57%|█████▋    | 14370/25428 [2:55:15<2:00:00,  1.54it/s]

{'loss': 0.0997, 'grad_norm': 0.9639958739280701, 'learning_rate': 8.697498820198207e-06, 'epoch': 1.7}


 57%|█████▋    | 14380/25428 [2:55:22<2:00:06,  1.53it/s]

{'loss': 0.1079, 'grad_norm': 0.9968622922897339, 'learning_rate': 8.68963347490955e-06, 'epoch': 1.7}


 57%|█████▋    | 14390/25428 [2:55:28<1:55:38,  1.59it/s]

{'loss': 0.2085, 'grad_norm': 1.0920137166976929, 'learning_rate': 8.68176812962089e-06, 'epoch': 1.7}


 57%|█████▋    | 14400/25428 [2:55:35<3:06:55,  1.02s/it]

{'loss': 0.1419, 'grad_norm': 1.0356522798538208, 'learning_rate': 8.673902784332233e-06, 'epoch': 1.7}


 57%|█████▋    | 14410/25428 [2:55:45<3:25:57,  1.12s/it]

{'loss': 0.115, 'grad_norm': 1.0783686637878418, 'learning_rate': 8.666037439043574e-06, 'epoch': 1.7}


 57%|█████▋    | 14420/25428 [2:55:52<2:01:40,  1.51it/s]

{'loss': 0.1145, 'grad_norm': 1.3439823389053345, 'learning_rate': 8.658172093754917e-06, 'epoch': 1.7}


 57%|█████▋    | 14430/25428 [2:55:58<1:56:28,  1.57it/s]

{'loss': 0.0863, 'grad_norm': 0.704846978187561, 'learning_rate': 8.650306748466258e-06, 'epoch': 1.7}


 57%|█████▋    | 14440/25428 [2:56:05<1:56:48,  1.57it/s]

{'loss': 0.1164, 'grad_norm': 1.2051535844802856, 'learning_rate': 8.6424414031776e-06, 'epoch': 1.7}


 57%|█████▋    | 14450/25428 [2:56:11<1:57:00,  1.56it/s]

{'loss': 0.1079, 'grad_norm': 0.7569435238838196, 'learning_rate': 8.634576057888942e-06, 'epoch': 1.7}


 57%|█████▋    | 14460/25428 [2:56:17<1:53:26,  1.61it/s]

{'loss': 0.1223, 'grad_norm': 1.3840891122817993, 'learning_rate': 8.626710712600283e-06, 'epoch': 1.71}


 57%|█████▋    | 14470/25428 [2:56:24<1:54:37,  1.59it/s]

{'loss': 0.0922, 'grad_norm': 1.4284701347351074, 'learning_rate': 8.618845367311625e-06, 'epoch': 1.71}


 57%|█████▋    | 14480/25428 [2:56:30<1:57:15,  1.56it/s]

{'loss': 0.1056, 'grad_norm': 1.040181279182434, 'learning_rate': 8.610980022022968e-06, 'epoch': 1.71}


 57%|█████▋    | 14490/25428 [2:56:36<1:57:13,  1.56it/s]

{'loss': 0.0972, 'grad_norm': 1.355578899383545, 'learning_rate': 8.603114676734309e-06, 'epoch': 1.71}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1045, 'grad_norm': 1.261520266532898, 'learning_rate': 8.59524933144565e-06, 'epoch': 1.71}


 57%|█████▋    | 14510/25428 [2:56:53<2:07:15,  1.43it/s]

{'loss': 0.1033, 'grad_norm': 1.417993426322937, 'learning_rate': 8.587383986156993e-06, 'epoch': 1.71}


 57%|█████▋    | 14520/25428 [2:57:00<2:01:53,  1.49it/s]

{'loss': 0.118, 'grad_norm': 0.9166001677513123, 'learning_rate': 8.579518640868335e-06, 'epoch': 1.71}


 57%|█████▋    | 14530/25428 [2:57:06<1:56:04,  1.56it/s]

{'loss': 0.111, 'grad_norm': 0.9538828134536743, 'learning_rate': 8.571653295579676e-06, 'epoch': 1.71}


 57%|█████▋    | 14540/25428 [2:57:12<1:55:34,  1.57it/s]

{'loss': 0.1322, 'grad_norm': 0.8076317310333252, 'learning_rate': 8.563787950291017e-06, 'epoch': 1.72}


 57%|█████▋    | 14550/25428 [2:57:19<1:57:41,  1.54it/s]

{'loss': 0.1136, 'grad_norm': 1.4104336500167847, 'learning_rate': 8.55592260500236e-06, 'epoch': 1.72}


 57%|█████▋    | 14560/25428 [2:57:25<1:56:01,  1.56it/s]

{'loss': 0.105, 'grad_norm': 1.1172759532928467, 'learning_rate': 8.548057259713703e-06, 'epoch': 1.72}


 57%|█████▋    | 14570/25428 [2:57:32<1:57:58,  1.53it/s]

{'loss': 0.0877, 'grad_norm': 0.8769748210906982, 'learning_rate': 8.540191914425044e-06, 'epoch': 1.72}


 57%|█████▋    | 14580/25428 [2:57:38<1:52:59,  1.60it/s]

{'loss': 0.1077, 'grad_norm': 0.8525928258895874, 'learning_rate': 8.532326569136385e-06, 'epoch': 1.72}


 57%|█████▋    | 14590/25428 [2:57:45<1:59:22,  1.51it/s]

{'loss': 0.1174, 'grad_norm': 1.2132633924484253, 'learning_rate': 8.524461223847727e-06, 'epoch': 1.72}


 57%|█████▋    | 14600/25428 [2:57:51<1:58:28,  1.52it/s]

{'loss': 0.0988, 'grad_norm': 1.7269911766052246, 'learning_rate': 8.51659587855907e-06, 'epoch': 1.72}


 57%|█████▋    | 14610/25428 [2:57:58<1:56:47,  1.54it/s]

{'loss': 0.0951, 'grad_norm': 1.3041329383850098, 'learning_rate': 8.508730533270411e-06, 'epoch': 1.72}


 57%|█████▋    | 14620/25428 [2:58:04<1:54:33,  1.57it/s]

{'loss': 0.1042, 'grad_norm': 1.3433678150177002, 'learning_rate': 8.500865187981752e-06, 'epoch': 1.72}


 58%|█████▊    | 14630/25428 [2:58:11<1:52:54,  1.59it/s]

{'loss': 0.0809, 'grad_norm': 1.4550389051437378, 'learning_rate': 8.492999842693095e-06, 'epoch': 1.73}


 58%|█████▊    | 14640/25428 [2:58:17<1:53:09,  1.59it/s]

{'loss': 0.109, 'grad_norm': 1.2329914569854736, 'learning_rate': 8.485134497404437e-06, 'epoch': 1.73}


 58%|█████▊    | 14650/25428 [2:58:23<1:57:12,  1.53it/s]

{'loss': 0.0895, 'grad_norm': 0.6272889375686646, 'learning_rate': 8.477269152115778e-06, 'epoch': 1.73}


 58%|█████▊    | 14660/25428 [2:58:30<1:58:24,  1.52it/s]

{'loss': 0.1363, 'grad_norm': 1.015661358833313, 'learning_rate': 8.46940380682712e-06, 'epoch': 1.73}


 58%|█████▊    | 14670/25428 [2:58:37<1:59:40,  1.50it/s]

{'loss': 0.0876, 'grad_norm': 0.8323445320129395, 'learning_rate': 8.461538461538462e-06, 'epoch': 1.73}


 58%|█████▊    | 14680/25428 [2:58:43<1:56:14,  1.54it/s]

{'loss': 0.087, 'grad_norm': 0.934285044670105, 'learning_rate': 8.453673116249805e-06, 'epoch': 1.73}


 58%|█████▊    | 14690/25428 [2:58:50<1:52:55,  1.58it/s]

{'loss': 0.0744, 'grad_norm': 0.7796635031700134, 'learning_rate': 8.445807770961146e-06, 'epoch': 1.73}


 58%|█████▊    | 14700/25428 [2:58:56<1:50:47,  1.61it/s]

{'loss': 0.1259, 'grad_norm': 1.0656476020812988, 'learning_rate': 8.437942425672487e-06, 'epoch': 1.73}


 58%|█████▊    | 14710/25428 [2:59:02<1:52:37,  1.59it/s]

{'loss': 0.0805, 'grad_norm': 1.8477516174316406, 'learning_rate': 8.43007708038383e-06, 'epoch': 1.74}


 58%|█████▊    | 14720/25428 [2:59:09<1:52:57,  1.58it/s]

{'loss': 0.1244, 'grad_norm': 0.9802550673484802, 'learning_rate': 8.422211735095172e-06, 'epoch': 1.74}


 58%|█████▊    | 14730/25428 [2:59:15<1:51:14,  1.60it/s]

{'loss': 0.0879, 'grad_norm': 1.3471734523773193, 'learning_rate': 8.414346389806513e-06, 'epoch': 1.74}


 58%|█████▊    | 14740/25428 [2:59:21<1:52:44,  1.58it/s]

{'loss': 0.0754, 'grad_norm': 0.9931336045265198, 'learning_rate': 8.406481044517854e-06, 'epoch': 1.74}


 58%|█████▊    | 14750/25428 [2:59:27<1:51:15,  1.60it/s]

{'loss': 0.0969, 'grad_norm': 1.029778003692627, 'learning_rate': 8.398615699229197e-06, 'epoch': 1.74}


 58%|█████▊    | 14760/25428 [2:59:34<1:51:17,  1.60it/s]

{'loss': 0.1099, 'grad_norm': 7.208274841308594, 'learning_rate': 8.39075035394054e-06, 'epoch': 1.74}


 58%|█████▊    | 14770/25428 [2:59:40<1:48:13,  1.64it/s]

{'loss': 0.0796, 'grad_norm': 0.7892853617668152, 'learning_rate': 8.38288500865188e-06, 'epoch': 1.74}


 58%|█████▊    | 14780/25428 [2:59:46<1:55:44,  1.53it/s]

{'loss': 0.1117, 'grad_norm': 1.6270546913146973, 'learning_rate': 8.375019663363221e-06, 'epoch': 1.74}


 58%|█████▊    | 14790/25428 [2:59:52<1:49:33,  1.62it/s]

{'loss': 0.1017, 'grad_norm': 1.0347833633422852, 'learning_rate': 8.367154318074564e-06, 'epoch': 1.74}


 58%|█████▊    | 14800/25428 [2:59:59<1:50:21,  1.61it/s]

{'loss': 0.1541, 'grad_norm': 1.1303291320800781, 'learning_rate': 8.359288972785907e-06, 'epoch': 1.75}


 58%|█████▊    | 14810/25428 [3:00:05<1:48:52,  1.63it/s]

{'loss': 0.1346, 'grad_norm': 1.1691864728927612, 'learning_rate': 8.351423627497248e-06, 'epoch': 1.75}


 58%|█████▊    | 14820/25428 [3:00:11<1:49:18,  1.62it/s]

{'loss': 0.1095, 'grad_norm': 1.1261991262435913, 'learning_rate': 8.343558282208589e-06, 'epoch': 1.75}


 58%|█████▊    | 14830/25428 [3:00:17<1:48:55,  1.62it/s]

{'loss': 0.1067, 'grad_norm': 1.1888272762298584, 'learning_rate': 8.335692936919931e-06, 'epoch': 1.75}


 58%|█████▊    | 14840/25428 [3:00:24<1:50:27,  1.60it/s]

{'loss': 0.124, 'grad_norm': 0.9763433337211609, 'learning_rate': 8.327827591631274e-06, 'epoch': 1.75}


 58%|█████▊    | 14850/25428 [3:00:30<1:47:52,  1.63it/s]

{'loss': 0.0932, 'grad_norm': 1.266034722328186, 'learning_rate': 8.319962246342615e-06, 'epoch': 1.75}


 58%|█████▊    | 14860/25428 [3:00:36<1:52:15,  1.57it/s]

{'loss': 0.1012, 'grad_norm': 0.7891438603401184, 'learning_rate': 8.312096901053956e-06, 'epoch': 1.75}


 58%|█████▊    | 14870/25428 [3:00:42<1:51:51,  1.57it/s]

{'loss': 0.1308, 'grad_norm': 1.0493247509002686, 'learning_rate': 8.304231555765299e-06, 'epoch': 1.75}


 59%|█████▊    | 14880/25428 [3:00:49<1:50:35,  1.59it/s]

{'loss': 0.1376, 'grad_norm': 1.2376697063446045, 'learning_rate': 8.296366210476641e-06, 'epoch': 1.76}


 59%|█████▊    | 14890/25428 [3:00:55<1:48:22,  1.62it/s]

{'loss': 0.1187, 'grad_norm': 1.0265613794326782, 'learning_rate': 8.288500865187982e-06, 'epoch': 1.76}


 59%|█████▊    | 14900/25428 [3:01:01<1:50:56,  1.58it/s]

{'loss': 0.0932, 'grad_norm': 0.9924871325492859, 'learning_rate': 8.280635519899323e-06, 'epoch': 1.76}


 59%|█████▊    | 14910/25428 [3:01:07<1:51:35,  1.57it/s]

{'loss': 0.0946, 'grad_norm': 0.9817088842391968, 'learning_rate': 8.272770174610666e-06, 'epoch': 1.76}


 59%|█████▊    | 14920/25428 [3:01:14<1:51:48,  1.57it/s]

{'loss': 0.0949, 'grad_norm': 0.861640214920044, 'learning_rate': 8.264904829322009e-06, 'epoch': 1.76}


 59%|█████▊    | 14930/25428 [3:01:20<1:50:59,  1.58it/s]

{'loss': 0.117, 'grad_norm': 1.2775852680206299, 'learning_rate': 8.25703948403335e-06, 'epoch': 1.76}


 59%|█████▉    | 14940/25428 [3:01:26<1:48:53,  1.61it/s]

{'loss': 0.0862, 'grad_norm': 1.6242635250091553, 'learning_rate': 8.24917413874469e-06, 'epoch': 1.76}


 59%|█████▉    | 14950/25428 [3:01:32<1:47:51,  1.62it/s]

{'loss': 0.1021, 'grad_norm': 0.9139047861099243, 'learning_rate': 8.241308793456033e-06, 'epoch': 1.76}


 59%|█████▉    | 14960/25428 [3:01:39<1:50:37,  1.58it/s]

{'loss': 0.1002, 'grad_norm': 0.6176179647445679, 'learning_rate': 8.233443448167376e-06, 'epoch': 1.76}


 59%|█████▉    | 14970/25428 [3:01:45<1:46:57,  1.63it/s]

{'loss': 0.0857, 'grad_norm': 1.9223620891571045, 'learning_rate': 8.225578102878717e-06, 'epoch': 1.77}


 59%|█████▉    | 14980/25428 [3:01:51<1:46:57,  1.63it/s]

{'loss': 0.0992, 'grad_norm': 1.2961877584457397, 'learning_rate': 8.217712757590058e-06, 'epoch': 1.77}


 59%|█████▉    | 14990/25428 [3:01:57<1:49:23,  1.59it/s]

{'loss': 0.0943, 'grad_norm': 1.5772011280059814, 'learning_rate': 8.2098474123014e-06, 'epoch': 1.77}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0974, 'grad_norm': 0.906865119934082, 'learning_rate': 8.201982067012742e-06, 'epoch': 1.77}


 59%|█████▉    | 15010/25428 [3:02:13<1:54:00,  1.52it/s]

{'loss': 0.0934, 'grad_norm': 1.5009187459945679, 'learning_rate': 8.194116721724084e-06, 'epoch': 1.77}


 59%|█████▉    | 15020/25428 [3:02:19<1:50:25,  1.57it/s]

{'loss': 0.0812, 'grad_norm': 0.7365873456001282, 'learning_rate': 8.186251376435425e-06, 'epoch': 1.77}


 59%|█████▉    | 15030/25428 [3:02:25<1:49:06,  1.59it/s]

{'loss': 0.0777, 'grad_norm': 1.2487237453460693, 'learning_rate': 8.178386031146768e-06, 'epoch': 1.77}


 59%|█████▉    | 15040/25428 [3:02:32<1:48:26,  1.60it/s]

{'loss': 0.121, 'grad_norm': 0.932641863822937, 'learning_rate': 8.170520685858109e-06, 'epoch': 1.77}


 59%|█████▉    | 15050/25428 [3:02:38<1:47:54,  1.60it/s]

{'loss': 0.0903, 'grad_norm': 0.9554066061973572, 'learning_rate': 8.162655340569452e-06, 'epoch': 1.78}


 59%|█████▉    | 15060/25428 [3:02:44<1:49:21,  1.58it/s]

{'loss': 0.0962, 'grad_norm': 2.4212043285369873, 'learning_rate': 8.154789995280793e-06, 'epoch': 1.78}


 59%|█████▉    | 15070/25428 [3:02:51<1:50:01,  1.57it/s]

{'loss': 0.1023, 'grad_norm': 0.7719524502754211, 'learning_rate': 8.146924649992135e-06, 'epoch': 1.78}


 59%|█████▉    | 15080/25428 [3:02:57<1:50:17,  1.56it/s]

{'loss': 0.0843, 'grad_norm': 1.1012457609176636, 'learning_rate': 8.139059304703476e-06, 'epoch': 1.78}


 59%|█████▉    | 15090/25428 [3:03:03<1:48:44,  1.58it/s]

{'loss': 0.0819, 'grad_norm': 0.965633749961853, 'learning_rate': 8.131193959414819e-06, 'epoch': 1.78}


 59%|█████▉    | 15100/25428 [3:03:10<1:50:22,  1.56it/s]

{'loss': 0.0998, 'grad_norm': 1.4129351377487183, 'learning_rate': 8.12332861412616e-06, 'epoch': 1.78}


 59%|█████▉    | 15110/25428 [3:03:16<1:48:39,  1.58it/s]

{'loss': 0.1179, 'grad_norm': 0.995206892490387, 'learning_rate': 8.115463268837503e-06, 'epoch': 1.78}


 59%|█████▉    | 15120/25428 [3:03:22<1:50:12,  1.56it/s]

{'loss': 0.1277, 'grad_norm': 0.9756342172622681, 'learning_rate': 8.107597923548844e-06, 'epoch': 1.78}


 60%|█████▉    | 15130/25428 [3:03:29<1:50:05,  1.56it/s]

{'loss': 0.1114, 'grad_norm': 0.8375414609909058, 'learning_rate': 8.099732578260186e-06, 'epoch': 1.79}


 60%|█████▉    | 15140/25428 [3:03:35<1:48:40,  1.58it/s]

{'loss': 0.0851, 'grad_norm': 0.8968362212181091, 'learning_rate': 8.091867232971529e-06, 'epoch': 1.79}


 60%|█████▉    | 15150/25428 [3:03:42<1:49:16,  1.57it/s]

{'loss': 0.1028, 'grad_norm': 1.4500399827957153, 'learning_rate': 8.08400188768287e-06, 'epoch': 1.79}


 60%|█████▉    | 15160/25428 [3:03:48<1:47:10,  1.60it/s]

{'loss': 0.0933, 'grad_norm': 1.0738067626953125, 'learning_rate': 8.076136542394211e-06, 'epoch': 1.79}


 60%|█████▉    | 15170/25428 [3:03:54<1:49:07,  1.57it/s]

{'loss': 0.0845, 'grad_norm': 0.680819571018219, 'learning_rate': 8.068271197105554e-06, 'epoch': 1.79}


 60%|█████▉    | 15180/25428 [3:04:00<1:47:45,  1.59it/s]

{'loss': 0.0968, 'grad_norm': 0.770647406578064, 'learning_rate': 8.060405851816896e-06, 'epoch': 1.79}


 60%|█████▉    | 15190/25428 [3:04:07<1:47:48,  1.58it/s]

{'loss': 0.0927, 'grad_norm': 0.5842463970184326, 'learning_rate': 8.052540506528237e-06, 'epoch': 1.79}


 60%|█████▉    | 15200/25428 [3:04:13<1:46:07,  1.61it/s]

{'loss': 0.1189, 'grad_norm': 1.336228370666504, 'learning_rate': 8.044675161239578e-06, 'epoch': 1.79}


 60%|█████▉    | 15210/25428 [3:04:19<1:48:44,  1.57it/s]

{'loss': 0.0966, 'grad_norm': 0.8207131028175354, 'learning_rate': 8.036809815950921e-06, 'epoch': 1.79}


 60%|█████▉    | 15220/25428 [3:04:26<1:48:32,  1.57it/s]

{'loss': 0.1002, 'grad_norm': 1.0754499435424805, 'learning_rate': 8.028944470662264e-06, 'epoch': 1.8}


 60%|█████▉    | 15230/25428 [3:04:32<1:48:41,  1.56it/s]

{'loss': 0.0917, 'grad_norm': 0.6792322397232056, 'learning_rate': 8.021079125373605e-06, 'epoch': 1.8}


 60%|█████▉    | 15240/25428 [3:04:38<1:47:28,  1.58it/s]

{'loss': 0.0865, 'grad_norm': 1.3740898370742798, 'learning_rate': 8.013213780084946e-06, 'epoch': 1.8}


 60%|█████▉    | 15250/25428 [3:04:45<1:48:05,  1.57it/s]

{'loss': 0.1103, 'grad_norm': 1.1481108665466309, 'learning_rate': 8.005348434796288e-06, 'epoch': 1.8}


 60%|██████    | 15260/25428 [3:04:51<1:48:25,  1.56it/s]

{'loss': 0.155, 'grad_norm': 1.4190194606781006, 'learning_rate': 7.997483089507631e-06, 'epoch': 1.8}


 60%|██████    | 15270/25428 [3:04:57<1:48:08,  1.57it/s]

{'loss': 0.0688, 'grad_norm': 0.9780794978141785, 'learning_rate': 7.989617744218972e-06, 'epoch': 1.8}


 60%|██████    | 15280/25428 [3:05:04<1:45:22,  1.60it/s]

{'loss': 0.1675, 'grad_norm': 2.29001522064209, 'learning_rate': 7.981752398930313e-06, 'epoch': 1.8}


 60%|██████    | 15290/25428 [3:05:10<1:45:27,  1.60it/s]

{'loss': 0.1041, 'grad_norm': 0.800326943397522, 'learning_rate': 7.973887053641656e-06, 'epoch': 1.8}


 60%|██████    | 15300/25428 [3:05:16<1:48:42,  1.55it/s]

{'loss': 0.0975, 'grad_norm': 0.710105836391449, 'learning_rate': 7.966021708352998e-06, 'epoch': 1.81}


 60%|██████    | 15310/25428 [3:05:23<1:48:21,  1.56it/s]

{'loss': 0.0949, 'grad_norm': 1.0636240243911743, 'learning_rate': 7.95815636306434e-06, 'epoch': 1.81}


 60%|██████    | 15320/25428 [3:05:29<1:46:45,  1.58it/s]

{'loss': 0.1064, 'grad_norm': 0.9999203085899353, 'learning_rate': 7.95029101777568e-06, 'epoch': 1.81}


 60%|██████    | 15330/25428 [3:05:35<1:47:58,  1.56it/s]

{'loss': 0.1007, 'grad_norm': 0.8429461717605591, 'learning_rate': 7.942425672487023e-06, 'epoch': 1.81}


 60%|██████    | 15340/25428 [3:05:42<1:43:27,  1.63it/s]

{'loss': 0.1084, 'grad_norm': 1.120901346206665, 'learning_rate': 7.934560327198366e-06, 'epoch': 1.81}


 60%|██████    | 15350/25428 [3:05:48<1:46:24,  1.58it/s]

{'loss': 0.1182, 'grad_norm': 0.8405408263206482, 'learning_rate': 7.926694981909707e-06, 'epoch': 1.81}


 60%|██████    | 15360/25428 [3:05:54<1:47:08,  1.57it/s]

{'loss': 0.1358, 'grad_norm': 0.8182889223098755, 'learning_rate': 7.918829636621048e-06, 'epoch': 1.81}


 60%|██████    | 15370/25428 [3:06:01<1:44:57,  1.60it/s]

{'loss': 0.1077, 'grad_norm': 1.4800384044647217, 'learning_rate': 7.91096429133239e-06, 'epoch': 1.81}


 60%|██████    | 15380/25428 [3:06:07<1:44:44,  1.60it/s]

{'loss': 0.1122, 'grad_norm': 1.7151232957839966, 'learning_rate': 7.903098946043733e-06, 'epoch': 1.81}


 61%|██████    | 15390/25428 [3:06:13<1:47:20,  1.56it/s]

{'loss': 0.1122, 'grad_norm': 1.047858476638794, 'learning_rate': 7.895233600755074e-06, 'epoch': 1.82}


 61%|██████    | 15400/25428 [3:06:20<1:45:04,  1.59it/s]

{'loss': 0.0887, 'grad_norm': 0.8583891987800598, 'learning_rate': 7.887368255466415e-06, 'epoch': 1.82}


 61%|██████    | 15410/25428 [3:06:26<1:46:32,  1.57it/s]

{'loss': 0.1007, 'grad_norm': 1.1134544610977173, 'learning_rate': 7.879502910177758e-06, 'epoch': 1.82}


 61%|██████    | 15420/25428 [3:06:32<1:46:14,  1.57it/s]

{'loss': 0.0893, 'grad_norm': 1.0360862016677856, 'learning_rate': 7.8716375648891e-06, 'epoch': 1.82}


 61%|██████    | 15430/25428 [3:06:38<1:45:44,  1.58it/s]

{'loss': 0.0909, 'grad_norm': 0.7332923412322998, 'learning_rate': 7.863772219600441e-06, 'epoch': 1.82}


 61%|██████    | 15440/25428 [3:06:45<1:42:59,  1.62it/s]

{'loss': 0.1002, 'grad_norm': 1.224826455116272, 'learning_rate': 7.855906874311782e-06, 'epoch': 1.82}


 61%|██████    | 15450/25428 [3:06:51<1:43:04,  1.61it/s]

{'loss': 0.165, 'grad_norm': 1.445651650428772, 'learning_rate': 7.848041529023125e-06, 'epoch': 1.82}


 61%|██████    | 15460/25428 [3:06:57<1:43:34,  1.60it/s]

{'loss': 0.1066, 'grad_norm': 0.9143394231796265, 'learning_rate': 7.840176183734468e-06, 'epoch': 1.82}


 61%|██████    | 15470/25428 [3:07:03<1:44:23,  1.59it/s]

{'loss': 0.0908, 'grad_norm': 0.9284034371376038, 'learning_rate': 7.832310838445809e-06, 'epoch': 1.83}


 61%|██████    | 15480/25428 [3:07:10<1:44:20,  1.59it/s]

{'loss': 0.1334, 'grad_norm': 0.8656042814254761, 'learning_rate': 7.82444549315715e-06, 'epoch': 1.83}


 61%|██████    | 15490/25428 [3:07:16<1:44:37,  1.58it/s]

{'loss': 0.0958, 'grad_norm': 0.931605875492096, 'learning_rate': 7.816580147868492e-06, 'epoch': 1.83}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1148, 'grad_norm': 1.2216167449951172, 'learning_rate': 7.808714802579833e-06, 'epoch': 1.83}


 61%|██████    | 15510/25428 [3:07:32<1:49:09,  1.51it/s]

{'loss': 0.1097, 'grad_norm': 0.8206040859222412, 'learning_rate': 7.800849457291174e-06, 'epoch': 1.83}


 61%|██████    | 15520/25428 [3:07:38<1:41:18,  1.63it/s]

{'loss': 0.108, 'grad_norm': 3.3459386825561523, 'learning_rate': 7.792984112002517e-06, 'epoch': 1.83}


 61%|██████    | 15530/25428 [3:07:44<1:43:40,  1.59it/s]

{'loss': 0.1088, 'grad_norm': 0.8239358067512512, 'learning_rate': 7.78511876671386e-06, 'epoch': 1.83}


 61%|██████    | 15540/25428 [3:07:50<1:43:25,  1.59it/s]

{'loss': 0.0859, 'grad_norm': 0.9226999282836914, 'learning_rate': 7.7772534214252e-06, 'epoch': 1.83}


 61%|██████    | 15550/25428 [3:07:57<1:44:25,  1.58it/s]

{'loss': 0.098, 'grad_norm': 1.0102094411849976, 'learning_rate': 7.769388076136542e-06, 'epoch': 1.83}


 61%|██████    | 15560/25428 [3:08:03<1:42:06,  1.61it/s]

{'loss': 0.1497, 'grad_norm': 1.6556240320205688, 'learning_rate': 7.761522730847884e-06, 'epoch': 1.84}


 61%|██████    | 15570/25428 [3:08:09<1:44:10,  1.58it/s]

{'loss': 0.0792, 'grad_norm': 0.7483682036399841, 'learning_rate': 7.753657385559227e-06, 'epoch': 1.84}


 61%|██████▏   | 15580/25428 [3:08:15<1:42:36,  1.60it/s]

{'loss': 0.115, 'grad_norm': 1.97762131690979, 'learning_rate': 7.745792040270568e-06, 'epoch': 1.84}


 61%|██████▏   | 15590/25428 [3:08:22<1:42:32,  1.60it/s]

{'loss': 0.0975, 'grad_norm': 0.7771434187889099, 'learning_rate': 7.737926694981909e-06, 'epoch': 1.84}


 61%|██████▏   | 15600/25428 [3:08:28<1:41:31,  1.61it/s]

{'loss': 0.1017, 'grad_norm': 1.4336607456207275, 'learning_rate': 7.730061349693252e-06, 'epoch': 1.84}


 61%|██████▏   | 15610/25428 [3:08:34<1:41:16,  1.62it/s]

{'loss': 0.0933, 'grad_norm': 0.7469202876091003, 'learning_rate': 7.722196004404594e-06, 'epoch': 1.84}


 61%|██████▏   | 15620/25428 [3:08:40<1:41:10,  1.62it/s]

{'loss': 0.0914, 'grad_norm': 1.678425908088684, 'learning_rate': 7.714330659115935e-06, 'epoch': 1.84}


 61%|██████▏   | 15630/25428 [3:08:47<1:44:15,  1.57it/s]

{'loss': 0.079, 'grad_norm': 0.8474905490875244, 'learning_rate': 7.706465313827276e-06, 'epoch': 1.84}


 62%|██████▏   | 15640/25428 [3:08:53<1:43:14,  1.58it/s]

{'loss': 0.0966, 'grad_norm': 1.194840431213379, 'learning_rate': 7.698599968538619e-06, 'epoch': 1.85}


 62%|██████▏   | 15650/25428 [3:08:59<1:41:04,  1.61it/s]

{'loss': 0.1013, 'grad_norm': 0.9326841831207275, 'learning_rate': 7.690734623249962e-06, 'epoch': 1.85}


 62%|██████▏   | 15660/25428 [3:09:05<1:43:38,  1.57it/s]

{'loss': 0.0737, 'grad_norm': 0.9483981132507324, 'learning_rate': 7.682869277961303e-06, 'epoch': 1.85}


 62%|██████▏   | 15670/25428 [3:09:12<1:40:03,  1.63it/s]

{'loss': 0.1121, 'grad_norm': 0.7886488437652588, 'learning_rate': 7.675003932672644e-06, 'epoch': 1.85}


 62%|██████▏   | 15680/25428 [3:09:18<1:41:23,  1.60it/s]

{'loss': 0.081, 'grad_norm': 0.6369476318359375, 'learning_rate': 7.667138587383986e-06, 'epoch': 1.85}


 62%|██████▏   | 15690/25428 [3:09:24<1:40:53,  1.61it/s]

{'loss': 0.099, 'grad_norm': 0.759888231754303, 'learning_rate': 7.659273242095329e-06, 'epoch': 1.85}


 62%|██████▏   | 15700/25428 [3:09:31<1:42:54,  1.58it/s]

{'loss': 0.1155, 'grad_norm': 0.7988731861114502, 'learning_rate': 7.65140789680667e-06, 'epoch': 1.85}


 62%|██████▏   | 15710/25428 [3:09:37<1:41:40,  1.59it/s]

{'loss': 0.1005, 'grad_norm': 5.766519069671631, 'learning_rate': 7.643542551518011e-06, 'epoch': 1.85}


 62%|██████▏   | 15720/25428 [3:09:43<1:41:14,  1.60it/s]

{'loss': 0.0869, 'grad_norm': 0.9662582874298096, 'learning_rate': 7.635677206229354e-06, 'epoch': 1.85}


 62%|██████▏   | 15730/25428 [3:09:49<1:41:19,  1.60it/s]

{'loss': 0.0791, 'grad_norm': 2.543327808380127, 'learning_rate': 7.627811860940696e-06, 'epoch': 1.86}


 62%|██████▏   | 15740/25428 [3:09:55<1:40:31,  1.61it/s]

{'loss': 0.0912, 'grad_norm': 0.9097185134887695, 'learning_rate': 7.619946515652038e-06, 'epoch': 1.86}


 62%|██████▏   | 15750/25428 [3:10:02<1:41:34,  1.59it/s]

{'loss': 0.0793, 'grad_norm': 0.948161244392395, 'learning_rate': 7.61208117036338e-06, 'epoch': 1.86}


 62%|██████▏   | 15760/25428 [3:10:08<1:42:22,  1.57it/s]

{'loss': 0.0882, 'grad_norm': 0.8546619415283203, 'learning_rate': 7.604215825074721e-06, 'epoch': 1.86}


 62%|██████▏   | 15770/25428 [3:10:14<1:43:59,  1.55it/s]

{'loss': 0.0916, 'grad_norm': 0.8384608030319214, 'learning_rate': 7.596350479786063e-06, 'epoch': 1.86}


 62%|██████▏   | 15780/25428 [3:10:21<1:43:02,  1.56it/s]

{'loss': 0.1155, 'grad_norm': 1.3031692504882812, 'learning_rate': 7.588485134497406e-06, 'epoch': 1.86}


 62%|██████▏   | 15790/25428 [3:10:27<1:39:39,  1.61it/s]

{'loss': 0.1086, 'grad_norm': 0.9304255843162537, 'learning_rate': 7.5806197892087475e-06, 'epoch': 1.86}


 62%|██████▏   | 15800/25428 [3:10:33<1:41:11,  1.59it/s]

{'loss': 0.0734, 'grad_norm': 0.8919360041618347, 'learning_rate': 7.5727544439200884e-06, 'epoch': 1.86}


 62%|██████▏   | 15810/25428 [3:10:40<1:39:00,  1.62it/s]

{'loss': 0.1075, 'grad_norm': 1.4529730081558228, 'learning_rate': 7.56488909863143e-06, 'epoch': 1.87}


 62%|██████▏   | 15820/25428 [3:10:46<1:49:50,  1.46it/s]

{'loss': 0.1282, 'grad_norm': 1.1931029558181763, 'learning_rate': 7.557023753342773e-06, 'epoch': 1.87}


 62%|██████▏   | 15830/25428 [3:10:53<1:42:58,  1.55it/s]

{'loss': 0.0835, 'grad_norm': 0.9895228147506714, 'learning_rate': 7.549158408054115e-06, 'epoch': 1.87}


 62%|██████▏   | 15840/25428 [3:10:59<1:41:22,  1.58it/s]

{'loss': 0.09, 'grad_norm': 0.8365440368652344, 'learning_rate': 7.541293062765456e-06, 'epoch': 1.87}


 62%|██████▏   | 15850/25428 [3:11:05<1:39:10,  1.61it/s]

{'loss': 0.0888, 'grad_norm': 1.121835470199585, 'learning_rate': 7.533427717476798e-06, 'epoch': 1.87}


 62%|██████▏   | 15860/25428 [3:11:12<1:39:25,  1.60it/s]

{'loss': 0.1112, 'grad_norm': 1.6710723638534546, 'learning_rate': 7.52556237218814e-06, 'epoch': 1.87}


 62%|██████▏   | 15870/25428 [3:11:18<1:41:31,  1.57it/s]

{'loss': 0.1025, 'grad_norm': 0.9076747298240662, 'learning_rate': 7.517697026899482e-06, 'epoch': 1.87}


 62%|██████▏   | 15880/25428 [3:11:24<1:39:12,  1.60it/s]

{'loss': 0.1047, 'grad_norm': 1.5080199241638184, 'learning_rate': 7.509831681610823e-06, 'epoch': 1.87}


 62%|██████▏   | 15890/25428 [3:11:30<1:38:23,  1.62it/s]

{'loss': 0.0764, 'grad_norm': 1.0281164646148682, 'learning_rate': 7.501966336322165e-06, 'epoch': 1.87}


 63%|██████▎   | 15900/25428 [3:11:37<1:42:42,  1.55it/s]

{'loss': 0.1299, 'grad_norm': 1.0092743635177612, 'learning_rate': 7.494100991033507e-06, 'epoch': 1.88}


 63%|██████▎   | 15910/25428 [3:11:43<1:40:36,  1.58it/s]

{'loss': 0.0851, 'grad_norm': 0.678501546382904, 'learning_rate': 7.4862356457448495e-06, 'epoch': 1.88}


 63%|██████▎   | 15920/25428 [3:11:50<1:39:01,  1.60it/s]

{'loss': 0.1239, 'grad_norm': 1.0775600671768188, 'learning_rate': 7.4783703004561904e-06, 'epoch': 1.88}


 63%|██████▎   | 15930/25428 [3:11:56<1:39:18,  1.59it/s]

{'loss': 0.1165, 'grad_norm': 1.1542588472366333, 'learning_rate': 7.470504955167532e-06, 'epoch': 1.88}


 63%|██████▎   | 15940/25428 [3:12:02<1:45:12,  1.50it/s]

{'loss': 0.088, 'grad_norm': 1.1056910753250122, 'learning_rate': 7.462639609878874e-06, 'epoch': 1.88}


 63%|██████▎   | 15950/25428 [3:12:09<1:40:03,  1.58it/s]

{'loss': 0.0947, 'grad_norm': 1.6036707162857056, 'learning_rate': 7.454774264590217e-06, 'epoch': 1.88}


 63%|██████▎   | 15960/25428 [3:12:15<1:42:25,  1.54it/s]

{'loss': 0.0815, 'grad_norm': 0.9799880385398865, 'learning_rate': 7.446908919301558e-06, 'epoch': 1.88}


 63%|██████▎   | 15970/25428 [3:12:22<1:39:40,  1.58it/s]

{'loss': 0.0869, 'grad_norm': 1.5899581909179688, 'learning_rate': 7.4390435740129e-06, 'epoch': 1.88}


 63%|██████▎   | 15980/25428 [3:12:28<1:39:00,  1.59it/s]

{'loss': 0.1078, 'grad_norm': 1.116770625114441, 'learning_rate': 7.4311782287242414e-06, 'epoch': 1.89}


 63%|██████▎   | 15990/25428 [3:12:34<1:38:04,  1.60it/s]

{'loss': 0.1121, 'grad_norm': 0.9792742133140564, 'learning_rate': 7.423312883435584e-06, 'epoch': 1.89}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0874, 'grad_norm': 0.8697064518928528, 'learning_rate': 7.415447538146925e-06, 'epoch': 1.89}


 63%|██████▎   | 16010/25428 [3:12:50<1:43:14,  1.52it/s]

{'loss': 0.0876, 'grad_norm': 1.034824252128601, 'learning_rate': 7.407582192858267e-06, 'epoch': 1.89}


 63%|██████▎   | 16020/25428 [3:12:56<1:40:14,  1.56it/s]

{'loss': 0.107, 'grad_norm': 0.857398509979248, 'learning_rate': 7.399716847569609e-06, 'epoch': 1.89}


 63%|██████▎   | 16030/25428 [3:13:02<1:40:22,  1.56it/s]

{'loss': 0.081, 'grad_norm': 0.6568965315818787, 'learning_rate': 7.3918515022809515e-06, 'epoch': 1.89}


 63%|██████▎   | 16040/25428 [3:13:09<1:43:09,  1.52it/s]

{'loss': 0.0804, 'grad_norm': 2.391810178756714, 'learning_rate': 7.3839861569922924e-06, 'epoch': 1.89}


 63%|██████▎   | 16050/25428 [3:13:15<1:41:06,  1.55it/s]

{'loss': 0.0823, 'grad_norm': 0.8234846591949463, 'learning_rate': 7.376120811703634e-06, 'epoch': 1.89}


 63%|██████▎   | 16060/25428 [3:13:22<1:41:51,  1.53it/s]

{'loss': 0.0886, 'grad_norm': 0.9817753434181213, 'learning_rate': 7.368255466414976e-06, 'epoch': 1.89}


 63%|██████▎   | 16070/25428 [3:13:28<1:39:07,  1.57it/s]

{'loss': 0.0956, 'grad_norm': 1.1855298280715942, 'learning_rate': 7.360390121126319e-06, 'epoch': 1.9}


 63%|██████▎   | 16080/25428 [3:13:34<1:38:03,  1.59it/s]

{'loss': 0.1103, 'grad_norm': 0.7449824213981628, 'learning_rate': 7.35252477583766e-06, 'epoch': 1.9}


 63%|██████▎   | 16090/25428 [3:13:41<1:38:26,  1.58it/s]

{'loss': 0.0909, 'grad_norm': 0.8073422312736511, 'learning_rate': 7.344659430549002e-06, 'epoch': 1.9}


 63%|██████▎   | 16100/25428 [3:13:47<1:40:14,  1.55it/s]

{'loss': 0.0969, 'grad_norm': 1.8510299921035767, 'learning_rate': 7.3367940852603434e-06, 'epoch': 1.9}


 63%|██████▎   | 16110/25428 [3:13:54<1:40:00,  1.55it/s]

{'loss': 0.1349, 'grad_norm': 0.45055896043777466, 'learning_rate': 7.328928739971686e-06, 'epoch': 1.9}


 63%|██████▎   | 16120/25428 [3:14:00<1:39:45,  1.56it/s]

{'loss': 0.0975, 'grad_norm': 1.4954224824905396, 'learning_rate': 7.321063394683027e-06, 'epoch': 1.9}


 63%|██████▎   | 16130/25428 [3:14:06<1:37:21,  1.59it/s]

{'loss': 0.0845, 'grad_norm': 0.8822730183601379, 'learning_rate': 7.313198049394369e-06, 'epoch': 1.9}


 63%|██████▎   | 16140/25428 [3:14:13<1:40:09,  1.55it/s]

{'loss': 0.0789, 'grad_norm': 1.0573182106018066, 'learning_rate': 7.305332704105711e-06, 'epoch': 1.9}


 64%|██████▎   | 16150/25428 [3:14:19<1:36:56,  1.60it/s]

{'loss': 0.1039, 'grad_norm': 0.8470867872238159, 'learning_rate': 7.297467358817053e-06, 'epoch': 1.91}


 64%|██████▎   | 16160/25428 [3:14:25<1:36:01,  1.61it/s]

{'loss': 0.1542, 'grad_norm': 0.8879605531692505, 'learning_rate': 7.2896020135283944e-06, 'epoch': 1.91}


 64%|██████▎   | 16170/25428 [3:14:32<1:36:44,  1.60it/s]

{'loss': 0.0778, 'grad_norm': 1.0255377292633057, 'learning_rate': 7.281736668239736e-06, 'epoch': 1.91}


 64%|██████▎   | 16180/25428 [3:14:38<1:41:29,  1.52it/s]

{'loss': 0.1119, 'grad_norm': 0.9407978057861328, 'learning_rate': 7.273871322951078e-06, 'epoch': 1.91}


 64%|██████▎   | 16190/25428 [3:14:44<1:38:05,  1.57it/s]

{'loss': 0.0883, 'grad_norm': 1.4206304550170898, 'learning_rate': 7.26600597766242e-06, 'epoch': 1.91}


 64%|██████▎   | 16200/25428 [3:14:51<1:37:24,  1.58it/s]

{'loss': 0.1187, 'grad_norm': 1.0745160579681396, 'learning_rate': 7.258140632373761e-06, 'epoch': 1.91}


 64%|██████▎   | 16210/25428 [3:14:57<1:43:03,  1.49it/s]

{'loss': 0.0859, 'grad_norm': 0.7825736999511719, 'learning_rate': 7.250275287085104e-06, 'epoch': 1.91}


 64%|██████▍   | 16220/25428 [3:15:04<1:36:24,  1.59it/s]

{'loss': 0.1244, 'grad_norm': 0.9643734693527222, 'learning_rate': 7.2424099417964454e-06, 'epoch': 1.91}


 64%|██████▍   | 16230/25428 [3:15:10<1:35:54,  1.60it/s]

{'loss': 0.0995, 'grad_norm': 1.0799964666366577, 'learning_rate': 7.234544596507787e-06, 'epoch': 1.91}


 64%|██████▍   | 16240/25428 [3:15:17<1:38:06,  1.56it/s]

{'loss': 0.076, 'grad_norm': 0.6235259771347046, 'learning_rate': 7.226679251219128e-06, 'epoch': 1.92}


 64%|██████▍   | 16250/25428 [3:15:23<1:38:41,  1.55it/s]

{'loss': 0.1369, 'grad_norm': 0.7934598326683044, 'learning_rate': 7.218813905930471e-06, 'epoch': 1.92}


 64%|██████▍   | 16260/25428 [3:15:29<1:38:06,  1.56it/s]

{'loss': 0.0887, 'grad_norm': 1.88198721408844, 'learning_rate': 7.210948560641813e-06, 'epoch': 1.92}


 64%|██████▍   | 16270/25428 [3:15:36<1:36:56,  1.57it/s]

{'loss': 0.0893, 'grad_norm': 1.2302675247192383, 'learning_rate': 7.203083215353155e-06, 'epoch': 1.92}


 64%|██████▍   | 16280/25428 [3:15:42<1:38:16,  1.55it/s]

{'loss': 0.0953, 'grad_norm': 1.1482428312301636, 'learning_rate': 7.195217870064496e-06, 'epoch': 1.92}


 64%|██████▍   | 16290/25428 [3:15:48<1:35:32,  1.59it/s]

{'loss': 0.1, 'grad_norm': 1.1730051040649414, 'learning_rate': 7.187352524775838e-06, 'epoch': 1.92}


 64%|██████▍   | 16300/25428 [3:15:55<1:35:49,  1.59it/s]

{'loss': 0.09, 'grad_norm': 0.73516446352005, 'learning_rate': 7.17948717948718e-06, 'epoch': 1.92}


 64%|██████▍   | 16310/25428 [3:16:01<1:36:30,  1.57it/s]

{'loss': 0.0829, 'grad_norm': 2.672412395477295, 'learning_rate': 7.171621834198522e-06, 'epoch': 1.92}


 64%|██████▍   | 16320/25428 [3:16:07<1:34:37,  1.60it/s]

{'loss': 0.1294, 'grad_norm': 0.8522169589996338, 'learning_rate': 7.163756488909863e-06, 'epoch': 1.93}


 64%|██████▍   | 16330/25428 [3:16:14<1:40:38,  1.51it/s]

{'loss': 0.0782, 'grad_norm': 2.00757098197937, 'learning_rate': 7.155891143621206e-06, 'epoch': 1.93}


 64%|██████▍   | 16340/25428 [3:16:20<1:37:20,  1.56it/s]

{'loss': 0.1159, 'grad_norm': 1.8845187425613403, 'learning_rate': 7.1480257983325475e-06, 'epoch': 1.93}


 64%|██████▍   | 16350/25428 [3:16:27<1:38:38,  1.53it/s]

{'loss': 0.0959, 'grad_norm': 0.9105145335197449, 'learning_rate': 7.140160453043889e-06, 'epoch': 1.93}


 64%|██████▍   | 16360/25428 [3:16:33<1:37:15,  1.55it/s]

{'loss': 0.0827, 'grad_norm': 1.240785002708435, 'learning_rate': 7.13229510775523e-06, 'epoch': 1.93}


 64%|██████▍   | 16370/25428 [3:16:40<1:36:39,  1.56it/s]

{'loss': 0.0922, 'grad_norm': 1.2446104288101196, 'learning_rate': 7.124429762466573e-06, 'epoch': 1.93}


 64%|██████▍   | 16380/25428 [3:16:46<1:35:05,  1.59it/s]

{'loss': 0.0958, 'grad_norm': 0.861128032207489, 'learning_rate': 7.116564417177915e-06, 'epoch': 1.93}


 64%|██████▍   | 16390/25428 [3:16:53<1:37:16,  1.55it/s]

{'loss': 0.1063, 'grad_norm': 0.9431726932525635, 'learning_rate': 7.108699071889257e-06, 'epoch': 1.93}


 64%|██████▍   | 16400/25428 [3:16:59<1:33:22,  1.61it/s]

{'loss': 0.0882, 'grad_norm': 0.980889618396759, 'learning_rate': 7.100833726600599e-06, 'epoch': 1.93}


 65%|██████▍   | 16410/25428 [3:17:05<1:34:54,  1.58it/s]

{'loss': 0.1109, 'grad_norm': 1.0877786874771118, 'learning_rate': 7.09296838131194e-06, 'epoch': 1.94}


 65%|██████▍   | 16420/25428 [3:17:12<1:34:37,  1.59it/s]

{'loss': 0.0963, 'grad_norm': 0.6102740168571472, 'learning_rate': 7.085103036023282e-06, 'epoch': 1.94}


 65%|██████▍   | 16430/25428 [3:17:18<1:34:58,  1.58it/s]

{'loss': 0.0759, 'grad_norm': 0.8459378480911255, 'learning_rate': 7.077237690734624e-06, 'epoch': 1.94}


 65%|██████▍   | 16440/25428 [3:17:25<1:39:39,  1.50it/s]

{'loss': 0.1587, 'grad_norm': 0.552832305431366, 'learning_rate': 7.069372345445966e-06, 'epoch': 1.94}


 65%|██████▍   | 16450/25428 [3:17:31<1:34:35,  1.58it/s]

{'loss': 0.1265, 'grad_norm': 1.3422893285751343, 'learning_rate': 7.061507000157307e-06, 'epoch': 1.94}


 65%|██████▍   | 16460/25428 [3:17:37<1:35:42,  1.56it/s]

{'loss': 0.1475, 'grad_norm': 1.6678072214126587, 'learning_rate': 7.0536416548686495e-06, 'epoch': 1.94}


 65%|██████▍   | 16470/25428 [3:17:44<1:34:15,  1.58it/s]

{'loss': 0.0974, 'grad_norm': 0.992855966091156, 'learning_rate': 7.045776309579991e-06, 'epoch': 1.94}


 65%|██████▍   | 16480/25428 [3:17:50<1:37:26,  1.53it/s]

{'loss': 0.0873, 'grad_norm': 0.9433713555335999, 'learning_rate': 7.037910964291333e-06, 'epoch': 1.94}


 65%|██████▍   | 16490/25428 [3:17:56<1:34:30,  1.58it/s]

{'loss': 0.1174, 'grad_norm': 0.5884308218955994, 'learning_rate': 7.030045619002674e-06, 'epoch': 1.95}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0866, 'grad_norm': 1.095394253730774, 'learning_rate': 7.022180273714017e-06, 'epoch': 1.95}


 65%|██████▍   | 16510/25428 [3:18:12<1:40:22,  1.48it/s]

{'loss': 0.1435, 'grad_norm': 0.9410407543182373, 'learning_rate': 7.014314928425359e-06, 'epoch': 1.95}


 65%|██████▍   | 16520/25428 [3:18:18<1:35:16,  1.56it/s]

{'loss': 0.0968, 'grad_norm': 1.0634961128234863, 'learning_rate': 7.0064495831367005e-06, 'epoch': 1.95}


 65%|██████▌   | 16530/25428 [3:18:25<1:33:46,  1.58it/s]

{'loss': 0.1005, 'grad_norm': 1.0471667051315308, 'learning_rate': 6.9985842378480414e-06, 'epoch': 1.95}


 65%|██████▌   | 16540/25428 [3:18:31<1:36:42,  1.53it/s]

{'loss': 0.1059, 'grad_norm': 1.1797795295715332, 'learning_rate': 6.990718892559384e-06, 'epoch': 1.95}


 65%|██████▌   | 16550/25428 [3:18:38<1:33:11,  1.59it/s]

{'loss': 0.0941, 'grad_norm': 1.1431570053100586, 'learning_rate': 6.982853547270726e-06, 'epoch': 1.95}


 65%|██████▌   | 16560/25428 [3:18:44<1:32:42,  1.59it/s]

{'loss': 0.1103, 'grad_norm': 0.982749879360199, 'learning_rate': 6.974988201982068e-06, 'epoch': 1.95}


 65%|██████▌   | 16570/25428 [3:18:50<1:34:54,  1.56it/s]

{'loss': 0.0805, 'grad_norm': 0.7885605692863464, 'learning_rate': 6.967122856693409e-06, 'epoch': 1.95}


 65%|██████▌   | 16580/25428 [3:18:57<1:33:55,  1.57it/s]

{'loss': 0.0959, 'grad_norm': 2.3722784519195557, 'learning_rate': 6.9592575114047515e-06, 'epoch': 1.96}


 65%|██████▌   | 16590/25428 [3:19:03<1:34:28,  1.56it/s]

{'loss': 0.0851, 'grad_norm': 1.5379537343978882, 'learning_rate': 6.951392166116093e-06, 'epoch': 1.96}


 65%|██████▌   | 16600/25428 [3:19:09<1:32:09,  1.60it/s]

{'loss': 0.0838, 'grad_norm': 1.3140532970428467, 'learning_rate': 6.943526820827435e-06, 'epoch': 1.96}


 65%|██████▌   | 16610/25428 [3:19:16<1:34:27,  1.56it/s]

{'loss': 0.1009, 'grad_norm': 1.160933017730713, 'learning_rate': 6.935661475538776e-06, 'epoch': 1.96}


 65%|██████▌   | 16620/25428 [3:19:22<1:33:38,  1.57it/s]

{'loss': 0.103, 'grad_norm': 1.2668371200561523, 'learning_rate': 6.927796130250119e-06, 'epoch': 1.96}


 65%|██████▌   | 16630/25428 [3:19:29<1:32:44,  1.58it/s]

{'loss': 0.0931, 'grad_norm': 1.0734339952468872, 'learning_rate': 6.919930784961461e-06, 'epoch': 1.96}


 65%|██████▌   | 16640/25428 [3:19:35<1:33:41,  1.56it/s]

{'loss': 0.1205, 'grad_norm': 0.9346873164176941, 'learning_rate': 6.9120654396728025e-06, 'epoch': 1.96}


 65%|██████▌   | 16650/25428 [3:19:41<1:32:50,  1.58it/s]

{'loss': 0.0937, 'grad_norm': 0.826987087726593, 'learning_rate': 6.9042000943841434e-06, 'epoch': 1.96}


 66%|██████▌   | 16660/25428 [3:19:48<1:32:30,  1.58it/s]

{'loss': 0.085, 'grad_norm': 1.0650345087051392, 'learning_rate': 6.896334749095486e-06, 'epoch': 1.97}


 66%|██████▌   | 16670/25428 [3:19:54<1:32:27,  1.58it/s]

{'loss': 0.0996, 'grad_norm': 1.224053978919983, 'learning_rate': 6.888469403806828e-06, 'epoch': 1.97}


 66%|██████▌   | 16680/25428 [3:20:00<1:33:46,  1.55it/s]

{'loss': 0.0877, 'grad_norm': 1.1020103693008423, 'learning_rate': 6.88060405851817e-06, 'epoch': 1.97}


 66%|██████▌   | 16690/25428 [3:20:07<1:32:45,  1.57it/s]

{'loss': 0.1103, 'grad_norm': 0.7409606575965881, 'learning_rate': 6.872738713229511e-06, 'epoch': 1.97}


 66%|██████▌   | 16700/25428 [3:20:13<1:31:30,  1.59it/s]

{'loss': 0.101, 'grad_norm': 1.028926968574524, 'learning_rate': 6.864873367940853e-06, 'epoch': 1.97}


 66%|██████▌   | 16710/25428 [3:20:19<1:32:40,  1.57it/s]

{'loss': 0.0961, 'grad_norm': 0.7514804601669312, 'learning_rate': 6.857008022652195e-06, 'epoch': 1.97}


 66%|██████▌   | 16720/25428 [3:20:26<1:33:09,  1.56it/s]

{'loss': 0.1139, 'grad_norm': 0.8063938021659851, 'learning_rate': 6.849142677363537e-06, 'epoch': 1.97}


 66%|██████▌   | 16730/25428 [3:20:32<1:30:48,  1.60it/s]

{'loss': 0.0866, 'grad_norm': 1.0886576175689697, 'learning_rate': 6.841277332074878e-06, 'epoch': 1.97}


 66%|██████▌   | 16740/25428 [3:20:38<1:31:20,  1.59it/s]

{'loss': 0.1242, 'grad_norm': 0.8029321432113647, 'learning_rate': 6.83341198678622e-06, 'epoch': 1.97}


 66%|██████▌   | 16750/25428 [3:20:45<1:32:16,  1.57it/s]

{'loss': 0.1114, 'grad_norm': 1.2256431579589844, 'learning_rate': 6.825546641497563e-06, 'epoch': 1.98}


 66%|██████▌   | 16760/25428 [3:20:51<1:30:34,  1.59it/s]

{'loss': 0.1077, 'grad_norm': 1.4368394613265991, 'learning_rate': 6.8176812962089045e-06, 'epoch': 1.98}


 66%|██████▌   | 16770/25428 [3:20:57<1:31:48,  1.57it/s]

{'loss': 0.0819, 'grad_norm': 0.6408206820487976, 'learning_rate': 6.8098159509202454e-06, 'epoch': 1.98}


 66%|██████▌   | 16780/25428 [3:21:04<1:29:22,  1.61it/s]

{'loss': 0.1058, 'grad_norm': 1.179945707321167, 'learning_rate': 6.801950605631587e-06, 'epoch': 1.98}


 66%|██████▌   | 16790/25428 [3:21:10<1:31:10,  1.58it/s]

{'loss': 0.1586, 'grad_norm': 1.2387562990188599, 'learning_rate': 6.79408526034293e-06, 'epoch': 1.98}


 66%|██████▌   | 16800/25428 [3:21:17<1:31:37,  1.57it/s]

{'loss': 0.0932, 'grad_norm': 0.9535180330276489, 'learning_rate': 6.786219915054272e-06, 'epoch': 1.98}


 66%|██████▌   | 16810/25428 [3:21:23<1:31:32,  1.57it/s]

{'loss': 0.0951, 'grad_norm': 0.7608325481414795, 'learning_rate': 6.778354569765613e-06, 'epoch': 1.98}


 66%|██████▌   | 16820/25428 [3:21:29<1:30:40,  1.58it/s]

{'loss': 0.0918, 'grad_norm': 0.7869686484336853, 'learning_rate': 6.770489224476955e-06, 'epoch': 1.98}


 66%|██████▌   | 16830/25428 [3:21:36<1:31:15,  1.57it/s]

{'loss': 0.1045, 'grad_norm': 1.1539651155471802, 'learning_rate': 6.762623879188297e-06, 'epoch': 1.99}


 66%|██████▌   | 16840/25428 [3:21:42<1:31:24,  1.57it/s]

{'loss': 0.0834, 'grad_norm': 1.2332899570465088, 'learning_rate': 6.754758533899639e-06, 'epoch': 1.99}


 66%|██████▋   | 16850/25428 [3:21:48<1:31:17,  1.57it/s]

{'loss': 0.1044, 'grad_norm': 1.310779333114624, 'learning_rate': 6.74689318861098e-06, 'epoch': 1.99}


 66%|██████▋   | 16860/25428 [3:21:55<1:30:52,  1.57it/s]

{'loss': 0.1041, 'grad_norm': 0.9249715209007263, 'learning_rate': 6.739027843322322e-06, 'epoch': 1.99}


 66%|██████▋   | 16870/25428 [3:22:01<1:32:34,  1.54it/s]

{'loss': 0.0835, 'grad_norm': 0.7983193397521973, 'learning_rate': 6.731162498033665e-06, 'epoch': 1.99}


 66%|██████▋   | 16880/25428 [3:22:07<1:30:31,  1.57it/s]

{'loss': 0.0832, 'grad_norm': 2.570086717605591, 'learning_rate': 6.7232971527450065e-06, 'epoch': 1.99}


 66%|██████▋   | 16890/25428 [3:22:14<1:30:24,  1.57it/s]

{'loss': 0.0972, 'grad_norm': 1.2256661653518677, 'learning_rate': 6.7154318074563474e-06, 'epoch': 1.99}


 66%|██████▋   | 16900/25428 [3:22:20<1:30:11,  1.58it/s]

{'loss': 0.1127, 'grad_norm': 1.3326982259750366, 'learning_rate': 6.707566462167689e-06, 'epoch': 1.99}


 67%|██████▋   | 16910/25428 [3:22:26<1:29:02,  1.59it/s]

{'loss': 0.0856, 'grad_norm': 0.9151497483253479, 'learning_rate': 6.699701116879032e-06, 'epoch': 2.0}


 67%|██████▋   | 16920/25428 [3:22:33<1:30:15,  1.57it/s]

{'loss': 0.093, 'grad_norm': 0.7585273385047913, 'learning_rate': 6.691835771590374e-06, 'epoch': 2.0}


 67%|██████▋   | 16930/25428 [3:22:39<1:29:40,  1.58it/s]

{'loss': 0.1275, 'grad_norm': 0.8776934146881104, 'learning_rate': 6.683970426301715e-06, 'epoch': 2.0}


 67%|██████▋   | 16940/25428 [3:22:45<1:30:46,  1.56it/s]

{'loss': 0.0938, 'grad_norm': 1.1680591106414795, 'learning_rate': 6.676105081013057e-06, 'epoch': 2.0}


 67%|██████▋   | 16950/25428 [3:22:52<1:30:52,  1.55it/s]

{'loss': 0.0907, 'grad_norm': 1.214106559753418, 'learning_rate': 6.6682397357243985e-06, 'epoch': 2.0}


                                                         
 67%|██████▋   | 16952/25428 [3:42:08<1:23:06,  1.70it/s]

{'eval_loss': 0.05777658894658089, 'eval_runtime': 1154.9476, 'eval_samples_per_second': 58.707, 'eval_steps_per_second': 7.339, 'epoch': 2.0}


 67%|██████▋   | 16960/25428 [3:42:13<68:36:43, 29.17s/it]  

{'loss': 0.0766, 'grad_norm': 0.5876413583755493, 'learning_rate': 6.660374390435741e-06, 'epoch': 2.0}


 67%|██████▋   | 16970/25428 [3:42:19<3:23:24,  1.44s/it] 

{'loss': 0.0544, 'grad_norm': 0.6741622686386108, 'learning_rate': 6.652509045147082e-06, 'epoch': 2.0}


 67%|██████▋   | 16980/25428 [3:42:26<1:32:04,  1.53it/s]

{'loss': 0.1286, 'grad_norm': 0.7923599481582642, 'learning_rate': 6.644643699858424e-06, 'epoch': 2.0}


 67%|██████▋   | 16990/25428 [3:42:32<1:30:48,  1.55it/s]

{'loss': 0.0783, 'grad_norm': 0.8687600493431091, 'learning_rate': 6.636778354569766e-06, 'epoch': 2.0}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.076, 'grad_norm': 0.918347954750061, 'learning_rate': 6.6289130092811085e-06, 'epoch': 2.01}


 67%|██████▋   | 17010/25428 [3:42:47<1:33:09,  1.51it/s]

{'loss': 0.1068, 'grad_norm': 1.0661511421203613, 'learning_rate': 6.62104766399245e-06, 'epoch': 2.01}


 67%|██████▋   | 17020/25428 [3:42:54<1:28:51,  1.58it/s]

{'loss': 0.0935, 'grad_norm': 1.528673768043518, 'learning_rate': 6.613182318703791e-06, 'epoch': 2.01}


 67%|██████▋   | 17030/25428 [3:43:00<1:28:55,  1.57it/s]

{'loss': 0.1207, 'grad_norm': 0.9852325320243835, 'learning_rate': 6.605316973415133e-06, 'epoch': 2.01}


 67%|██████▋   | 17040/25428 [3:43:06<1:27:54,  1.59it/s]

{'loss': 0.107, 'grad_norm': 0.8697620630264282, 'learning_rate': 6.597451628126476e-06, 'epoch': 2.01}


 67%|██████▋   | 17050/25428 [3:43:12<1:27:20,  1.60it/s]

{'loss': 0.1118, 'grad_norm': 0.7251565456390381, 'learning_rate': 6.589586282837818e-06, 'epoch': 2.01}


 67%|██████▋   | 17060/25428 [3:43:19<1:28:58,  1.57it/s]

{'loss': 0.1, 'grad_norm': 0.8346564173698425, 'learning_rate': 6.581720937549159e-06, 'epoch': 2.01}


 67%|██████▋   | 17070/25428 [3:43:25<1:28:55,  1.57it/s]

{'loss': 0.0714, 'grad_norm': 1.5758349895477295, 'learning_rate': 6.5738555922605005e-06, 'epoch': 2.01}


 67%|██████▋   | 17080/25428 [3:43:31<1:26:01,  1.62it/s]

{'loss': 0.0495, 'grad_norm': 0.6138141751289368, 'learning_rate': 6.565990246971843e-06, 'epoch': 2.02}


 67%|██████▋   | 17090/25428 [3:43:38<1:26:59,  1.60it/s]

{'loss': 0.085, 'grad_norm': 0.6910768151283264, 'learning_rate': 6.558124901683185e-06, 'epoch': 2.02}


 67%|██████▋   | 17100/25428 [3:43:44<1:28:04,  1.58it/s]

{'loss': 0.092, 'grad_norm': 1.060827612876892, 'learning_rate': 6.550259556394526e-06, 'epoch': 2.02}


 67%|██████▋   | 17110/25428 [3:43:50<1:26:56,  1.59it/s]

{'loss': 0.0954, 'grad_norm': 0.8212350010871887, 'learning_rate': 6.542394211105868e-06, 'epoch': 2.02}


 67%|██████▋   | 17120/25428 [3:43:57<1:25:51,  1.61it/s]

{'loss': 0.0693, 'grad_norm': 0.759449303150177, 'learning_rate': 6.5345288658172105e-06, 'epoch': 2.02}


 67%|██████▋   | 17130/25428 [3:44:03<1:27:24,  1.58it/s]

{'loss': 0.1036, 'grad_norm': 0.7231572866439819, 'learning_rate': 6.526663520528552e-06, 'epoch': 2.02}


 67%|██████▋   | 17140/25428 [3:44:09<1:26:53,  1.59it/s]

{'loss': 0.0929, 'grad_norm': 2.903818130493164, 'learning_rate': 6.518798175239893e-06, 'epoch': 2.02}


 67%|██████▋   | 17150/25428 [3:44:16<1:30:17,  1.53it/s]

{'loss': 0.1094, 'grad_norm': 1.028997540473938, 'learning_rate': 6.510932829951235e-06, 'epoch': 2.02}


 67%|██████▋   | 17160/25428 [3:44:22<1:25:35,  1.61it/s]

{'loss': 0.1018, 'grad_norm': 1.0515495538711548, 'learning_rate': 6.503067484662578e-06, 'epoch': 2.02}


 68%|██████▊   | 17170/25428 [3:44:29<1:26:28,  1.59it/s]

{'loss': 0.0864, 'grad_norm': 1.3946201801300049, 'learning_rate': 6.49520213937392e-06, 'epoch': 2.03}


 68%|██████▊   | 17180/25428 [3:44:35<1:26:25,  1.59it/s]

{'loss': 0.0718, 'grad_norm': 0.6872408986091614, 'learning_rate': 6.487336794085261e-06, 'epoch': 2.03}


 68%|██████▊   | 17190/25428 [3:44:41<1:26:08,  1.59it/s]

{'loss': 0.0951, 'grad_norm': 1.7634881734848022, 'learning_rate': 6.4794714487966025e-06, 'epoch': 2.03}


 68%|██████▊   | 17200/25428 [3:44:47<1:24:42,  1.62it/s]

{'loss': 0.1083, 'grad_norm': 1.8790918588638306, 'learning_rate': 6.471606103507945e-06, 'epoch': 2.03}


 68%|██████▊   | 17210/25428 [3:44:54<1:24:59,  1.61it/s]

{'loss': 0.1024, 'grad_norm': 1.169704556465149, 'learning_rate': 6.463740758219287e-06, 'epoch': 2.03}


 68%|██████▊   | 17220/25428 [3:45:00<1:25:30,  1.60it/s]

{'loss': 0.0572, 'grad_norm': 0.6436488032341003, 'learning_rate': 6.455875412930628e-06, 'epoch': 2.03}


 68%|██████▊   | 17230/25428 [3:45:06<1:26:38,  1.58it/s]

{'loss': 0.0672, 'grad_norm': 0.8415026068687439, 'learning_rate': 6.44801006764197e-06, 'epoch': 2.03}


 68%|██████▊   | 17240/25428 [3:45:13<1:30:24,  1.51it/s]

{'loss': 0.1197, 'grad_norm': 0.8376455903053284, 'learning_rate': 6.440144722353312e-06, 'epoch': 2.03}


 68%|██████▊   | 17250/25428 [3:45:19<1:25:04,  1.60it/s]

{'loss': 0.0915, 'grad_norm': 1.02699613571167, 'learning_rate': 6.432279377064654e-06, 'epoch': 2.04}


 68%|██████▊   | 17260/25428 [3:45:25<1:25:12,  1.60it/s]

{'loss': 0.1019, 'grad_norm': 1.2587640285491943, 'learning_rate': 6.424414031775995e-06, 'epoch': 2.04}


 68%|██████▊   | 17270/25428 [3:45:32<1:25:15,  1.59it/s]

{'loss': 0.0805, 'grad_norm': 0.7498045563697815, 'learning_rate': 6.416548686487337e-06, 'epoch': 2.04}


 68%|██████▊   | 17280/25428 [3:45:38<1:23:35,  1.62it/s]

{'loss': 0.0917, 'grad_norm': 1.2830731868743896, 'learning_rate': 6.408683341198679e-06, 'epoch': 2.04}


 68%|██████▊   | 17290/25428 [3:45:44<1:24:39,  1.60it/s]

{'loss': 0.0769, 'grad_norm': 0.38629546761512756, 'learning_rate': 6.400817995910022e-06, 'epoch': 2.04}


 68%|██████▊   | 17300/25428 [3:45:50<1:25:31,  1.58it/s]

{'loss': 0.1101, 'grad_norm': 0.8258333802223206, 'learning_rate': 6.392952650621363e-06, 'epoch': 2.04}


 68%|██████▊   | 17310/25428 [3:45:57<1:23:51,  1.61it/s]

{'loss': 0.0715, 'grad_norm': 0.9543084502220154, 'learning_rate': 6.3850873053327045e-06, 'epoch': 2.04}


 68%|██████▊   | 17320/25428 [3:46:03<1:24:42,  1.60it/s]

{'loss': 0.0802, 'grad_norm': 1.636512041091919, 'learning_rate': 6.377221960044046e-06, 'epoch': 2.04}


 68%|██████▊   | 17330/25428 [3:46:09<1:24:13,  1.60it/s]

{'loss': 0.0764, 'grad_norm': 1.1696009635925293, 'learning_rate': 6.369356614755389e-06, 'epoch': 2.04}


 68%|██████▊   | 17340/25428 [3:46:16<1:28:35,  1.52it/s]

{'loss': 0.0574, 'grad_norm': 0.46067461371421814, 'learning_rate': 6.36149126946673e-06, 'epoch': 2.05}


 68%|██████▊   | 17350/25428 [3:46:22<1:25:06,  1.58it/s]

{'loss': 0.0795, 'grad_norm': 0.7027092576026917, 'learning_rate': 6.353625924178072e-06, 'epoch': 2.05}


 68%|██████▊   | 17360/25428 [3:46:28<1:24:01,  1.60it/s]

{'loss': 0.0727, 'grad_norm': 0.6165974140167236, 'learning_rate': 6.345760578889414e-06, 'epoch': 2.05}


 68%|██████▊   | 17370/25428 [3:46:35<1:25:33,  1.57it/s]

{'loss': 0.0751, 'grad_norm': 1.2311774492263794, 'learning_rate': 6.337895233600756e-06, 'epoch': 2.05}


 68%|██████▊   | 17380/25428 [3:46:41<1:25:35,  1.57it/s]

{'loss': 0.0892, 'grad_norm': 0.7216469645500183, 'learning_rate': 6.330029888312097e-06, 'epoch': 2.05}


 68%|██████▊   | 17390/25428 [3:46:47<1:25:03,  1.57it/s]

{'loss': 0.0988, 'grad_norm': 2.2076900005340576, 'learning_rate': 6.322164543023439e-06, 'epoch': 2.05}


 68%|██████▊   | 17400/25428 [3:46:54<1:24:25,  1.58it/s]

{'loss': 0.0733, 'grad_norm': 1.0129728317260742, 'learning_rate': 6.314299197734781e-06, 'epoch': 2.05}


 68%|██████▊   | 17410/25428 [3:47:00<1:23:31,  1.60it/s]

{'loss': 0.0791, 'grad_norm': 1.5318942070007324, 'learning_rate': 6.306433852446124e-06, 'epoch': 2.05}


 69%|██████▊   | 17420/25428 [3:47:06<1:24:14,  1.58it/s]

{'loss': 0.0859, 'grad_norm': 0.8589389324188232, 'learning_rate': 6.298568507157465e-06, 'epoch': 2.06}


 69%|██████▊   | 17430/25428 [3:47:13<1:28:30,  1.51it/s]

{'loss': 0.0972, 'grad_norm': 0.6289300322532654, 'learning_rate': 6.2907031618688065e-06, 'epoch': 2.06}


 69%|██████▊   | 17440/25428 [3:47:19<1:24:17,  1.58it/s]

{'loss': 0.0649, 'grad_norm': 1.3283380270004272, 'learning_rate': 6.282837816580148e-06, 'epoch': 2.06}


 69%|██████▊   | 17450/25428 [3:47:25<1:24:06,  1.58it/s]

{'loss': 0.0797, 'grad_norm': 1.1969940662384033, 'learning_rate': 6.274972471291491e-06, 'epoch': 2.06}


 69%|██████▊   | 17460/25428 [3:47:32<1:24:22,  1.57it/s]

{'loss': 0.0683, 'grad_norm': 1.2370015382766724, 'learning_rate': 6.267107126002832e-06, 'epoch': 2.06}


 69%|██████▊   | 17470/25428 [3:47:38<1:23:17,  1.59it/s]

{'loss': 0.0785, 'grad_norm': 0.8220586180686951, 'learning_rate': 6.259241780714174e-06, 'epoch': 2.06}


 69%|██████▊   | 17480/25428 [3:47:44<1:22:03,  1.61it/s]

{'loss': 0.0664, 'grad_norm': 1.359670639038086, 'learning_rate': 6.251376435425516e-06, 'epoch': 2.06}


 69%|██████▉   | 17490/25428 [3:47:50<1:22:14,  1.61it/s]

{'loss': 0.099, 'grad_norm': 1.0145379304885864, 'learning_rate': 6.2435110901368575e-06, 'epoch': 2.06}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0758, 'grad_norm': 1.4335633516311646, 'learning_rate': 6.2356457448481984e-06, 'epoch': 2.06}


 69%|██████▉   | 17510/25428 [3:48:06<1:26:52,  1.52it/s]

{'loss': 0.0701, 'grad_norm': 0.9416729807853699, 'learning_rate': 6.227780399559541e-06, 'epoch': 2.07}


 69%|██████▉   | 17520/25428 [3:48:12<1:22:09,  1.60it/s]

{'loss': 0.098, 'grad_norm': 1.5826020240783691, 'learning_rate': 6.219915054270883e-06, 'epoch': 2.07}


 69%|██████▉   | 17530/25428 [3:48:18<1:23:41,  1.57it/s]

{'loss': 0.0713, 'grad_norm': 0.862741231918335, 'learning_rate': 6.212049708982225e-06, 'epoch': 2.07}


 69%|██████▉   | 17540/25428 [3:48:25<1:22:07,  1.60it/s]

{'loss': 0.1128, 'grad_norm': 1.223219633102417, 'learning_rate': 6.204184363693566e-06, 'epoch': 2.07}


 69%|██████▉   | 17550/25428 [3:48:31<1:22:27,  1.59it/s]

{'loss': 0.0756, 'grad_norm': 0.5556929111480713, 'learning_rate': 6.1963190184049085e-06, 'epoch': 2.07}


 69%|██████▉   | 17560/25428 [3:48:37<1:20:52,  1.62it/s]

{'loss': 0.1125, 'grad_norm': 1.448160171508789, 'learning_rate': 6.18845367311625e-06, 'epoch': 2.07}


 69%|██████▉   | 17570/25428 [3:48:44<1:21:51,  1.60it/s]

{'loss': 0.0968, 'grad_norm': 0.6871422529220581, 'learning_rate': 6.180588327827592e-06, 'epoch': 2.07}


 69%|██████▉   | 17580/25428 [3:48:50<1:21:46,  1.60it/s]

{'loss': 0.1079, 'grad_norm': 0.8888037204742432, 'learning_rate': 6.172722982538933e-06, 'epoch': 2.07}


 69%|██████▉   | 17590/25428 [3:48:56<1:22:42,  1.58it/s]

{'loss': 0.1386, 'grad_norm': 0.6982713341712952, 'learning_rate': 6.164857637250276e-06, 'epoch': 2.08}


 69%|██████▉   | 17600/25428 [3:49:02<1:20:29,  1.62it/s]

{'loss': 0.0907, 'grad_norm': 0.7588913440704346, 'learning_rate': 6.156992291961618e-06, 'epoch': 2.08}


 69%|██████▉   | 17610/25428 [3:49:09<1:22:00,  1.59it/s]

{'loss': 0.0866, 'grad_norm': 1.0172284841537476, 'learning_rate': 6.1491269466729595e-06, 'epoch': 2.08}


 69%|██████▉   | 17620/25428 [3:49:15<1:22:16,  1.58it/s]

{'loss': 0.103, 'grad_norm': 1.0222117900848389, 'learning_rate': 6.1412616013843004e-06, 'epoch': 2.08}


 69%|██████▉   | 17630/25428 [3:49:21<1:20:14,  1.62it/s]

{'loss': 0.1046, 'grad_norm': 0.7963467836380005, 'learning_rate': 6.133396256095643e-06, 'epoch': 2.08}


 69%|██████▉   | 17640/25428 [3:49:28<1:23:45,  1.55it/s]

{'loss': 0.0957, 'grad_norm': 0.7309243083000183, 'learning_rate': 6.125530910806985e-06, 'epoch': 2.08}


 69%|██████▉   | 17650/25428 [3:49:34<1:21:36,  1.59it/s]

{'loss': 0.0805, 'grad_norm': 0.7817779779434204, 'learning_rate': 6.117665565518327e-06, 'epoch': 2.08}


 69%|██████▉   | 17660/25428 [3:49:40<1:20:01,  1.62it/s]

{'loss': 0.0714, 'grad_norm': 0.8436850905418396, 'learning_rate': 6.1098002202296695e-06, 'epoch': 2.08}


 69%|██████▉   | 17670/25428 [3:49:47<1:20:29,  1.61it/s]

{'loss': 0.087, 'grad_norm': 0.6911827325820923, 'learning_rate': 6.1019348749410105e-06, 'epoch': 2.08}


 70%|██████▉   | 17680/25428 [3:49:53<1:21:03,  1.59it/s]

{'loss': 0.0804, 'grad_norm': 1.2274389266967773, 'learning_rate': 6.094069529652352e-06, 'epoch': 2.09}


 70%|██████▉   | 17690/25428 [3:49:59<1:20:11,  1.61it/s]

{'loss': 0.0896, 'grad_norm': 1.1390397548675537, 'learning_rate': 6.086204184363694e-06, 'epoch': 2.09}


 70%|██████▉   | 17700/25428 [3:50:06<1:22:43,  1.56it/s]

{'loss': 0.0944, 'grad_norm': 1.1055474281311035, 'learning_rate': 6.078338839075037e-06, 'epoch': 2.09}


 70%|██████▉   | 17710/25428 [3:50:12<1:19:36,  1.62it/s]

{'loss': 0.0679, 'grad_norm': 0.8521119952201843, 'learning_rate': 6.070473493786378e-06, 'epoch': 2.09}


 70%|██████▉   | 17720/25428 [3:50:18<1:20:08,  1.60it/s]

{'loss': 0.0905, 'grad_norm': 0.7515867948532104, 'learning_rate': 6.06260814849772e-06, 'epoch': 2.09}


 70%|██████▉   | 17730/25428 [3:50:25<1:20:43,  1.59it/s]

{'loss': 0.0899, 'grad_norm': 0.8909955620765686, 'learning_rate': 6.0547428032090615e-06, 'epoch': 2.09}


 70%|██████▉   | 17740/25428 [3:50:31<1:19:54,  1.60it/s]

{'loss': 0.1162, 'grad_norm': 0.7172498106956482, 'learning_rate': 6.046877457920403e-06, 'epoch': 2.09}


 70%|██████▉   | 17750/25428 [3:50:37<1:20:23,  1.59it/s]

{'loss': 0.0874, 'grad_norm': 0.9314507246017456, 'learning_rate': 6.039012112631744e-06, 'epoch': 2.09}


 70%|██████▉   | 17760/25428 [3:50:43<1:18:32,  1.63it/s]

{'loss': 0.0692, 'grad_norm': 0.8815787434577942, 'learning_rate': 6.031146767343087e-06, 'epoch': 2.1}


 70%|██████▉   | 17770/25428 [3:50:50<1:18:51,  1.62it/s]

{'loss': 0.0792, 'grad_norm': 0.6545189023017883, 'learning_rate': 6.023281422054429e-06, 'epoch': 2.1}


 70%|██████▉   | 17780/25428 [3:50:56<1:19:27,  1.60it/s]

{'loss': 0.0621, 'grad_norm': 57.57542419433594, 'learning_rate': 6.015416076765771e-06, 'epoch': 2.1}


 70%|██████▉   | 17790/25428 [3:51:02<1:19:07,  1.61it/s]

{'loss': 0.0799, 'grad_norm': 0.8548059463500977, 'learning_rate': 6.007550731477112e-06, 'epoch': 2.1}


 70%|███████   | 17800/25428 [3:51:08<1:19:38,  1.60it/s]

{'loss': 0.079, 'grad_norm': 2.005784273147583, 'learning_rate': 5.999685386188454e-06, 'epoch': 2.1}


 70%|███████   | 17810/25428 [3:51:15<1:20:14,  1.58it/s]

{'loss': 0.0933, 'grad_norm': 0.771808922290802, 'learning_rate': 5.991820040899796e-06, 'epoch': 2.1}


 70%|███████   | 17820/25428 [3:51:21<1:21:05,  1.56it/s]

{'loss': 0.0804, 'grad_norm': 0.8983741998672485, 'learning_rate': 5.983954695611138e-06, 'epoch': 2.1}


 70%|███████   | 17830/25428 [3:51:27<1:20:12,  1.58it/s]

{'loss': 0.0887, 'grad_norm': 1.0162872076034546, 'learning_rate': 5.976089350322479e-06, 'epoch': 2.1}


 70%|███████   | 17840/25428 [3:51:34<1:19:16,  1.60it/s]

{'loss': 0.0857, 'grad_norm': 0.8505938649177551, 'learning_rate': 5.968224005033822e-06, 'epoch': 2.1}


 70%|███████   | 17850/25428 [3:51:40<1:18:42,  1.60it/s]

{'loss': 0.1042, 'grad_norm': 1.554335594177246, 'learning_rate': 5.9603586597451635e-06, 'epoch': 2.11}


 70%|███████   | 17860/25428 [3:51:46<1:19:27,  1.59it/s]

{'loss': 0.0904, 'grad_norm': 1.408634901046753, 'learning_rate': 5.952493314456505e-06, 'epoch': 2.11}


 70%|███████   | 17870/25428 [3:51:53<1:18:47,  1.60it/s]

{'loss': 0.0816, 'grad_norm': 0.7105199694633484, 'learning_rate': 5.944627969167846e-06, 'epoch': 2.11}


 70%|███████   | 17880/25428 [3:51:59<1:19:09,  1.59it/s]

{'loss': 0.0781, 'grad_norm': 0.8398268818855286, 'learning_rate': 5.936762623879189e-06, 'epoch': 2.11}


 70%|███████   | 17890/25428 [3:52:05<1:19:17,  1.58it/s]

{'loss': 0.0873, 'grad_norm': 0.9834842085838318, 'learning_rate': 5.928897278590531e-06, 'epoch': 2.11}


 70%|███████   | 17900/25428 [3:52:12<1:20:13,  1.56it/s]

{'loss': 0.0783, 'grad_norm': 0.7635593414306641, 'learning_rate': 5.921031933301873e-06, 'epoch': 2.11}


 70%|███████   | 17910/25428 [3:52:18<1:19:15,  1.58it/s]

{'loss': 0.0788, 'grad_norm': 1.057623267173767, 'learning_rate': 5.913166588013214e-06, 'epoch': 2.11}


 70%|███████   | 17920/25428 [3:52:24<1:17:59,  1.60it/s]

{'loss': 0.0847, 'grad_norm': 1.1505082845687866, 'learning_rate': 5.905301242724556e-06, 'epoch': 2.11}


 71%|███████   | 17930/25428 [3:52:30<1:19:17,  1.58it/s]

{'loss': 0.0759, 'grad_norm': 0.6672056317329407, 'learning_rate': 5.897435897435898e-06, 'epoch': 2.12}


 71%|███████   | 17940/25428 [3:52:37<1:18:04,  1.60it/s]

{'loss': 0.0858, 'grad_norm': 1.3868563175201416, 'learning_rate': 5.88957055214724e-06, 'epoch': 2.12}


 71%|███████   | 17950/25428 [3:52:43<1:18:19,  1.59it/s]

{'loss': 0.0853, 'grad_norm': 0.9734143018722534, 'learning_rate': 5.881705206858581e-06, 'epoch': 2.12}


 71%|███████   | 17960/25428 [3:52:49<1:18:48,  1.58it/s]

{'loss': 0.0814, 'grad_norm': 1.5120126008987427, 'learning_rate': 5.873839861569924e-06, 'epoch': 2.12}


 71%|███████   | 17970/25428 [3:52:56<1:19:42,  1.56it/s]

{'loss': 0.0707, 'grad_norm': 0.5976622104644775, 'learning_rate': 5.8659745162812655e-06, 'epoch': 2.12}


 71%|███████   | 17980/25428 [3:53:02<1:18:25,  1.58it/s]

{'loss': 0.0655, 'grad_norm': 0.8322227597236633, 'learning_rate': 5.858109170992607e-06, 'epoch': 2.12}


 71%|███████   | 17990/25428 [3:53:08<1:19:03,  1.57it/s]

{'loss': 0.0917, 'grad_norm': 0.7856056094169617, 'learning_rate': 5.850243825703948e-06, 'epoch': 2.12}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0711, 'grad_norm': 0.8904126882553101, 'learning_rate': 5.84237848041529e-06, 'epoch': 2.12}


 71%|███████   | 18010/25428 [3:53:24<1:23:15,  1.48it/s]

{'loss': 0.0724, 'grad_norm': 0.7498867511749268, 'learning_rate': 5.834513135126633e-06, 'epoch': 2.12}


 71%|███████   | 18020/25428 [3:53:30<1:17:47,  1.59it/s]

{'loss': 0.0686, 'grad_norm': 0.8552466034889221, 'learning_rate': 5.826647789837975e-06, 'epoch': 2.13}


 71%|███████   | 18030/25428 [3:53:36<1:17:00,  1.60it/s]

{'loss': 0.1005, 'grad_norm': 0.9694015383720398, 'learning_rate': 5.818782444549316e-06, 'epoch': 2.13}


 71%|███████   | 18040/25428 [3:53:43<1:18:59,  1.56it/s]

{'loss': 0.0818, 'grad_norm': 1.3982504606246948, 'learning_rate': 5.8109170992606575e-06, 'epoch': 2.13}


 71%|███████   | 18050/25428 [3:53:49<1:17:02,  1.60it/s]

{'loss': 0.0786, 'grad_norm': 1.21915864944458, 'learning_rate': 5.803051753972e-06, 'epoch': 2.13}


 71%|███████   | 18060/25428 [3:53:55<1:17:30,  1.58it/s]

{'loss': 0.0631, 'grad_norm': 0.8217261433601379, 'learning_rate': 5.795186408683342e-06, 'epoch': 2.13}


 71%|███████   | 18070/25428 [3:54:02<1:15:31,  1.62it/s]

{'loss': 0.078, 'grad_norm': 0.8367446064949036, 'learning_rate': 5.787321063394683e-06, 'epoch': 2.13}


 71%|███████   | 18080/25428 [3:54:08<1:16:14,  1.61it/s]

{'loss': 0.0605, 'grad_norm': 0.8018895983695984, 'learning_rate': 5.779455718106025e-06, 'epoch': 2.13}


 71%|███████   | 18090/25428 [3:54:14<1:16:35,  1.60it/s]

{'loss': 0.0939, 'grad_norm': 1.5168883800506592, 'learning_rate': 5.7715903728173675e-06, 'epoch': 2.13}


 71%|███████   | 18100/25428 [3:54:20<1:16:55,  1.59it/s]

{'loss': 0.0701, 'grad_norm': 0.9164271354675293, 'learning_rate': 5.763725027528709e-06, 'epoch': 2.14}


 71%|███████   | 18110/25428 [3:54:27<1:17:54,  1.57it/s]

{'loss': 0.0661, 'grad_norm': 0.963853120803833, 'learning_rate': 5.75585968224005e-06, 'epoch': 2.14}


 71%|███████▏  | 18120/25428 [3:54:33<1:17:33,  1.57it/s]

{'loss': 0.0814, 'grad_norm': 0.9499496817588806, 'learning_rate': 5.747994336951392e-06, 'epoch': 2.14}


 71%|███████▏  | 18130/25428 [3:54:39<1:15:40,  1.61it/s]

{'loss': 0.087, 'grad_norm': 1.0445947647094727, 'learning_rate': 5.740128991662735e-06, 'epoch': 2.14}


 71%|███████▏  | 18140/25428 [3:54:45<1:15:38,  1.61it/s]

{'loss': 0.0926, 'grad_norm': 0.8667728900909424, 'learning_rate': 5.732263646374077e-06, 'epoch': 2.14}


 71%|███████▏  | 18150/25428 [3:54:52<1:17:28,  1.57it/s]

{'loss': 0.1139, 'grad_norm': 1.282746434211731, 'learning_rate': 5.724398301085418e-06, 'epoch': 2.14}


 71%|███████▏  | 18160/25428 [3:54:58<1:14:32,  1.62it/s]

{'loss': 0.0883, 'grad_norm': 0.8186916708946228, 'learning_rate': 5.7165329557967595e-06, 'epoch': 2.14}


 71%|███████▏  | 18170/25428 [3:55:04<1:16:03,  1.59it/s]

{'loss': 0.0751, 'grad_norm': 1.0258281230926514, 'learning_rate': 5.708667610508102e-06, 'epoch': 2.14}


 71%|███████▏  | 18180/25428 [3:55:10<1:15:37,  1.60it/s]

{'loss': 0.0969, 'grad_norm': 0.8727108240127563, 'learning_rate': 5.700802265219444e-06, 'epoch': 2.14}


 72%|███████▏  | 18190/25428 [3:55:17<1:14:27,  1.62it/s]

{'loss': 0.0663, 'grad_norm': 0.6497644186019897, 'learning_rate': 5.692936919930785e-06, 'epoch': 2.15}


 72%|███████▏  | 18200/25428 [3:55:23<1:14:07,  1.63it/s]

{'loss': 0.0824, 'grad_norm': 0.8202279806137085, 'learning_rate': 5.685071574642127e-06, 'epoch': 2.15}


 72%|███████▏  | 18210/25428 [3:55:29<1:15:13,  1.60it/s]

{'loss': 0.0874, 'grad_norm': 0.6681221723556519, 'learning_rate': 5.6772062293534695e-06, 'epoch': 2.15}


 72%|███████▏  | 18220/25428 [3:55:35<1:16:03,  1.58it/s]

{'loss': 0.0883, 'grad_norm': 0.7809181213378906, 'learning_rate': 5.669340884064811e-06, 'epoch': 2.15}


 72%|███████▏  | 18230/25428 [3:55:42<1:13:50,  1.62it/s]

{'loss': 0.0834, 'grad_norm': 1.0650707483291626, 'learning_rate': 5.661475538776152e-06, 'epoch': 2.15}


 72%|███████▏  | 18240/25428 [3:55:48<1:14:50,  1.60it/s]

{'loss': 0.0662, 'grad_norm': 0.7916483283042908, 'learning_rate': 5.653610193487494e-06, 'epoch': 2.15}


 72%|███████▏  | 18250/25428 [3:55:54<1:13:38,  1.62it/s]

{'loss': 0.1206, 'grad_norm': 1.2029211521148682, 'learning_rate': 5.645744848198837e-06, 'epoch': 2.15}


 72%|███████▏  | 18260/25428 [3:56:00<1:13:22,  1.63it/s]

{'loss': 0.0776, 'grad_norm': 0.8152806162834167, 'learning_rate': 5.637879502910179e-06, 'epoch': 2.15}


 72%|███████▏  | 18270/25428 [3:56:07<1:15:02,  1.59it/s]

{'loss': 0.0649, 'grad_norm': 0.7872734069824219, 'learning_rate': 5.6300141576215205e-06, 'epoch': 2.16}


 72%|███████▏  | 18280/25428 [3:56:13<1:14:44,  1.59it/s]

{'loss': 0.1241, 'grad_norm': 1.356635570526123, 'learning_rate': 5.6221488123328615e-06, 'epoch': 2.16}


 72%|███████▏  | 18290/25428 [3:56:19<1:15:49,  1.57it/s]

{'loss': 0.078, 'grad_norm': 1.257993221282959, 'learning_rate': 5.614283467044203e-06, 'epoch': 2.16}


 72%|███████▏  | 18300/25428 [3:56:25<1:14:51,  1.59it/s]

{'loss': 0.0627, 'grad_norm': 0.641483724117279, 'learning_rate': 5.606418121755546e-06, 'epoch': 2.16}


 72%|███████▏  | 18310/25428 [3:56:32<1:14:04,  1.60it/s]

{'loss': 0.0719, 'grad_norm': 0.6022255420684814, 'learning_rate': 5.598552776466888e-06, 'epoch': 2.16}


 72%|███████▏  | 18320/25428 [3:56:38<1:14:35,  1.59it/s]

{'loss': 0.0903, 'grad_norm': 1.0225908756256104, 'learning_rate': 5.590687431178229e-06, 'epoch': 2.16}


 72%|███████▏  | 18330/25428 [3:56:44<1:14:02,  1.60it/s]

{'loss': 0.0988, 'grad_norm': 1.0384776592254639, 'learning_rate': 5.582822085889571e-06, 'epoch': 2.16}


 72%|███████▏  | 18340/25428 [3:56:51<1:14:48,  1.58it/s]

{'loss': 0.1014, 'grad_norm': 1.396164894104004, 'learning_rate': 5.574956740600913e-06, 'epoch': 2.16}


 72%|███████▏  | 18350/25428 [3:56:57<1:14:34,  1.58it/s]

{'loss': 0.0961, 'grad_norm': 0.8167441487312317, 'learning_rate': 5.567091395312255e-06, 'epoch': 2.16}


 72%|███████▏  | 18360/25428 [3:57:03<1:14:21,  1.58it/s]

{'loss': 0.1261, 'grad_norm': 2.4784011840820312, 'learning_rate': 5.559226050023596e-06, 'epoch': 2.17}


 72%|███████▏  | 18370/25428 [3:57:09<1:13:55,  1.59it/s]

{'loss': 0.0594, 'grad_norm': 0.7048024535179138, 'learning_rate': 5.551360704734938e-06, 'epoch': 2.17}


 72%|███████▏  | 18380/25428 [3:57:16<1:13:32,  1.60it/s]

{'loss': 0.0868, 'grad_norm': 0.7755796313285828, 'learning_rate': 5.543495359446281e-06, 'epoch': 2.17}


 72%|███████▏  | 18390/25428 [3:57:22<1:13:56,  1.59it/s]

{'loss': 0.1038, 'grad_norm': 0.967765748500824, 'learning_rate': 5.5356300141576225e-06, 'epoch': 2.17}


 72%|███████▏  | 18400/25428 [3:57:28<1:13:11,  1.60it/s]

{'loss': 0.0936, 'grad_norm': 1.0450265407562256, 'learning_rate': 5.5277646688689635e-06, 'epoch': 2.17}


 72%|███████▏  | 18410/25428 [3:57:35<1:12:11,  1.62it/s]

{'loss': 0.1063, 'grad_norm': 0.7656646370887756, 'learning_rate': 5.519899323580305e-06, 'epoch': 2.17}


 72%|███████▏  | 18420/25428 [3:57:41<1:14:01,  1.58it/s]

{'loss': 0.1127, 'grad_norm': 1.4672801494598389, 'learning_rate': 5.512033978291648e-06, 'epoch': 2.17}


 72%|███████▏  | 18430/25428 [3:57:47<1:12:56,  1.60it/s]

{'loss': 0.0741, 'grad_norm': 0.6686292290687561, 'learning_rate': 5.50416863300299e-06, 'epoch': 2.17}


 73%|███████▎  | 18440/25428 [3:57:53<1:13:48,  1.58it/s]

{'loss': 0.0975, 'grad_norm': 1.2491782903671265, 'learning_rate': 5.496303287714331e-06, 'epoch': 2.18}


 73%|███████▎  | 18450/25428 [3:58:00<1:12:52,  1.60it/s]

{'loss': 0.0941, 'grad_norm': 1.2881397008895874, 'learning_rate': 5.488437942425673e-06, 'epoch': 2.18}


 73%|███████▎  | 18460/25428 [3:58:06<1:13:51,  1.57it/s]

{'loss': 0.0798, 'grad_norm': 0.6627309322357178, 'learning_rate': 5.480572597137015e-06, 'epoch': 2.18}


 73%|███████▎  | 18470/25428 [3:58:12<1:13:04,  1.59it/s]

{'loss': 0.0761, 'grad_norm': 0.8214340806007385, 'learning_rate': 5.472707251848357e-06, 'epoch': 2.18}


 73%|███████▎  | 18480/25428 [3:58:19<1:12:33,  1.60it/s]

{'loss': 0.0922, 'grad_norm': 1.026770830154419, 'learning_rate': 5.464841906559698e-06, 'epoch': 2.18}


 73%|███████▎  | 18490/25428 [3:58:25<1:11:48,  1.61it/s]

{'loss': 0.0881, 'grad_norm': 0.9504058957099915, 'learning_rate': 5.45697656127104e-06, 'epoch': 2.18}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1069, 'grad_norm': 0.4841409921646118, 'learning_rate': 5.449111215982383e-06, 'epoch': 2.18}


 73%|███████▎  | 18510/25428 [3:58:40<1:17:26,  1.49it/s]

{'loss': 0.106, 'grad_norm': 1.3513036966323853, 'learning_rate': 5.4412458706937245e-06, 'epoch': 2.18}


 73%|███████▎  | 18520/25428 [3:58:46<1:12:46,  1.58it/s]

{'loss': 0.1086, 'grad_norm': 0.8297964334487915, 'learning_rate': 5.4333805254050655e-06, 'epoch': 2.18}


 73%|███████▎  | 18530/25428 [3:58:53<1:14:12,  1.55it/s]

{'loss': 0.0869, 'grad_norm': 0.683726966381073, 'learning_rate': 5.425515180116407e-06, 'epoch': 2.19}


 73%|███████▎  | 18540/25428 [3:58:59<1:11:31,  1.61it/s]

{'loss': 0.0889, 'grad_norm': 1.4762601852416992, 'learning_rate': 5.417649834827749e-06, 'epoch': 2.19}


 73%|███████▎  | 18550/25428 [3:59:05<1:12:41,  1.58it/s]

{'loss': 0.0606, 'grad_norm': 1.438442587852478, 'learning_rate': 5.409784489539092e-06, 'epoch': 2.19}


 73%|███████▎  | 18560/25428 [3:59:12<1:13:38,  1.55it/s]

{'loss': 0.0893, 'grad_norm': 1.3738011121749878, 'learning_rate': 5.401919144250433e-06, 'epoch': 2.19}


 73%|███████▎  | 18570/25428 [3:59:18<1:12:29,  1.58it/s]

{'loss': 0.0998, 'grad_norm': 1.7906739711761475, 'learning_rate': 5.394053798961775e-06, 'epoch': 2.19}


 73%|███████▎  | 18580/25428 [3:59:24<1:12:08,  1.58it/s]

{'loss': 0.0785, 'grad_norm': 1.664939522743225, 'learning_rate': 5.3861884536731165e-06, 'epoch': 2.19}


 73%|███████▎  | 18590/25428 [3:59:31<1:12:14,  1.58it/s]

{'loss': 0.1159, 'grad_norm': 1.474714994430542, 'learning_rate': 5.378323108384459e-06, 'epoch': 2.19}


 73%|███████▎  | 18600/25428 [3:59:37<1:12:08,  1.58it/s]

{'loss': 0.0669, 'grad_norm': 0.8739088177680969, 'learning_rate': 5.3704577630958e-06, 'epoch': 2.19}


 73%|███████▎  | 18610/25428 [3:59:43<1:12:45,  1.56it/s]

{'loss': 0.0776, 'grad_norm': 0.6928265690803528, 'learning_rate': 5.362592417807142e-06, 'epoch': 2.2}


 73%|███████▎  | 18620/25428 [3:59:50<1:10:02,  1.62it/s]

{'loss': 0.0808, 'grad_norm': 1.0242911577224731, 'learning_rate': 5.354727072518484e-06, 'epoch': 2.2}


 73%|███████▎  | 18630/25428 [3:59:56<1:10:33,  1.61it/s]

{'loss': 0.0788, 'grad_norm': 1.119653344154358, 'learning_rate': 5.3468617272298265e-06, 'epoch': 2.2}


 73%|███████▎  | 18640/25428 [4:00:02<1:10:27,  1.61it/s]

{'loss': 0.0872, 'grad_norm': 0.7194523215293884, 'learning_rate': 5.3389963819411675e-06, 'epoch': 2.2}


 73%|███████▎  | 18650/25428 [4:00:09<1:11:26,  1.58it/s]

{'loss': 0.0982, 'grad_norm': 0.6560426950454712, 'learning_rate': 5.331131036652509e-06, 'epoch': 2.2}


 73%|███████▎  | 18660/25428 [4:00:15<1:12:59,  1.55it/s]

{'loss': 0.089, 'grad_norm': 1.806130290031433, 'learning_rate': 5.323265691363851e-06, 'epoch': 2.2}


 73%|███████▎  | 18670/25428 [4:00:21<1:09:59,  1.61it/s]

{'loss': 0.0618, 'grad_norm': 0.8920786380767822, 'learning_rate': 5.315400346075194e-06, 'epoch': 2.2}


 73%|███████▎  | 18680/25428 [4:00:27<1:10:17,  1.60it/s]

{'loss': 0.0788, 'grad_norm': 0.7434963583946228, 'learning_rate': 5.307535000786535e-06, 'epoch': 2.2}


 74%|███████▎  | 18690/25428 [4:00:34<1:10:41,  1.59it/s]

{'loss': 0.0716, 'grad_norm': 0.6622933149337769, 'learning_rate': 5.299669655497877e-06, 'epoch': 2.21}


 74%|███████▎  | 18700/25428 [4:00:40<1:10:20,  1.59it/s]

{'loss': 0.0798, 'grad_norm': 0.7673725485801697, 'learning_rate': 5.2918043102092185e-06, 'epoch': 2.21}


 74%|███████▎  | 18710/25428 [4:00:46<1:10:25,  1.59it/s]

{'loss': 0.095, 'grad_norm': 1.447232961654663, 'learning_rate': 5.283938964920561e-06, 'epoch': 2.21}


 74%|███████▎  | 18720/25428 [4:00:53<1:09:25,  1.61it/s]

{'loss': 0.0793, 'grad_norm': 0.7871283888816833, 'learning_rate': 5.276073619631902e-06, 'epoch': 2.21}


 74%|███████▎  | 18730/25428 [4:00:59<1:08:42,  1.62it/s]

{'loss': 0.0821, 'grad_norm': 1.2963955402374268, 'learning_rate': 5.268208274343244e-06, 'epoch': 2.21}


 74%|███████▎  | 18740/25428 [4:01:05<1:09:47,  1.60it/s]

{'loss': 0.0798, 'grad_norm': 1.0217310190200806, 'learning_rate': 5.260342929054586e-06, 'epoch': 2.21}


 74%|███████▎  | 18750/25428 [4:01:11<1:08:33,  1.62it/s]

{'loss': 0.0915, 'grad_norm': 1.5073587894439697, 'learning_rate': 5.2524775837659285e-06, 'epoch': 2.21}


 74%|███████▍  | 18760/25428 [4:01:18<1:10:13,  1.58it/s]

{'loss': 0.0829, 'grad_norm': 0.9482422471046448, 'learning_rate': 5.2446122384772695e-06, 'epoch': 2.21}


 74%|███████▍  | 18770/25428 [4:01:24<1:10:42,  1.57it/s]

{'loss': 0.1138, 'grad_norm': 0.7038213610649109, 'learning_rate': 5.236746893188611e-06, 'epoch': 2.21}


 74%|███████▍  | 18780/25428 [4:01:30<1:08:16,  1.62it/s]

{'loss': 0.1337, 'grad_norm': 0.864588737487793, 'learning_rate': 5.228881547899953e-06, 'epoch': 2.22}


 74%|███████▍  | 18790/25428 [4:01:36<1:08:24,  1.62it/s]

{'loss': 0.1069, 'grad_norm': 1.0977567434310913, 'learning_rate': 5.221016202611295e-06, 'epoch': 2.22}


 74%|███████▍  | 18800/25428 [4:01:43<1:08:48,  1.61it/s]

{'loss': 0.0869, 'grad_norm': 0.6707112193107605, 'learning_rate': 5.213150857322636e-06, 'epoch': 2.22}


 74%|███████▍  | 18810/25428 [4:01:49<1:10:07,  1.57it/s]

{'loss': 0.0877, 'grad_norm': 1.393753170967102, 'learning_rate': 5.205285512033979e-06, 'epoch': 2.22}


 74%|███████▍  | 18820/25428 [4:01:55<1:08:07,  1.62it/s]

{'loss': 0.0678, 'grad_norm': 0.5894870758056641, 'learning_rate': 5.1974201667453205e-06, 'epoch': 2.22}


 74%|███████▍  | 18830/25428 [4:02:01<1:09:47,  1.58it/s]

{'loss': 0.0911, 'grad_norm': 0.6585909128189087, 'learning_rate': 5.189554821456662e-06, 'epoch': 2.22}


 74%|███████▍  | 18840/25428 [4:02:08<1:07:40,  1.62it/s]

{'loss': 0.1195, 'grad_norm': 0.8249225616455078, 'learning_rate': 5.181689476168003e-06, 'epoch': 2.22}


 74%|███████▍  | 18850/25428 [4:02:14<1:11:21,  1.54it/s]

{'loss': 0.0833, 'grad_norm': 0.8258016705513, 'learning_rate': 5.173824130879346e-06, 'epoch': 2.22}


 74%|███████▍  | 18860/25428 [4:02:20<1:07:34,  1.62it/s]

{'loss': 0.0632, 'grad_norm': 0.7070220112800598, 'learning_rate': 5.165958785590688e-06, 'epoch': 2.23}


 74%|███████▍  | 18870/25428 [4:02:26<1:09:10,  1.58it/s]

{'loss': 0.1318, 'grad_norm': 1.173840880393982, 'learning_rate': 5.15809344030203e-06, 'epoch': 2.23}


 74%|███████▍  | 18880/25428 [4:02:33<1:09:08,  1.58it/s]

{'loss': 0.1, 'grad_norm': 0.9084092974662781, 'learning_rate': 5.150228095013371e-06, 'epoch': 2.23}


 74%|███████▍  | 18890/25428 [4:02:39<1:08:36,  1.59it/s]

{'loss': 0.0754, 'grad_norm': 0.8886658549308777, 'learning_rate': 5.142362749724713e-06, 'epoch': 2.23}


 74%|███████▍  | 18900/25428 [4:02:45<1:06:53,  1.63it/s]

{'loss': 0.0881, 'grad_norm': 0.8315153121948242, 'learning_rate': 5.134497404436055e-06, 'epoch': 2.23}


 74%|███████▍  | 18910/25428 [4:02:51<1:07:22,  1.61it/s]

{'loss': 0.0801, 'grad_norm': 0.7416512966156006, 'learning_rate': 5.126632059147397e-06, 'epoch': 2.23}


 74%|███████▍  | 18920/25428 [4:02:57<1:06:59,  1.62it/s]

{'loss': 0.0824, 'grad_norm': 0.5711231231689453, 'learning_rate': 5.11876671385874e-06, 'epoch': 2.23}


 74%|███████▍  | 18930/25428 [4:03:04<1:06:45,  1.62it/s]

{'loss': 0.0657, 'grad_norm': 1.2602581977844238, 'learning_rate': 5.110901368570081e-06, 'epoch': 2.23}


 74%|███████▍  | 18940/25428 [4:03:10<1:07:18,  1.61it/s]

{'loss': 0.0693, 'grad_norm': 0.8761876225471497, 'learning_rate': 5.1030360232814225e-06, 'epoch': 2.23}


 75%|███████▍  | 18950/25428 [4:03:16<1:08:35,  1.57it/s]

{'loss': 0.0825, 'grad_norm': 0.9843398332595825, 'learning_rate': 5.095170677992764e-06, 'epoch': 2.24}


 75%|███████▍  | 18960/25428 [4:03:22<1:06:51,  1.61it/s]

{'loss': 0.1001, 'grad_norm': 0.7911108136177063, 'learning_rate': 5.087305332704107e-06, 'epoch': 2.24}


 75%|███████▍  | 18970/25428 [4:03:29<1:07:42,  1.59it/s]

{'loss': 0.0863, 'grad_norm': 0.7305978536605835, 'learning_rate': 5.079439987415448e-06, 'epoch': 2.24}


 75%|███████▍  | 18980/25428 [4:03:35<1:07:54,  1.58it/s]

{'loss': 0.0776, 'grad_norm': 1.7046959400177002, 'learning_rate': 5.07157464212679e-06, 'epoch': 2.24}


 75%|███████▍  | 18990/25428 [4:03:41<1:08:30,  1.57it/s]

{'loss': 0.0792, 'grad_norm': 0.9212898015975952, 'learning_rate': 5.063709296838132e-06, 'epoch': 2.24}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0758, 'grad_norm': 1.0058749914169312, 'learning_rate': 5.055843951549474e-06, 'epoch': 2.24}


 75%|███████▍  | 19010/25428 [4:03:56<1:11:12,  1.50it/s]

{'loss': 0.1136, 'grad_norm': 3.00411319732666, 'learning_rate': 5.047978606260815e-06, 'epoch': 2.24}


 75%|███████▍  | 19020/25428 [4:04:03<1:06:51,  1.60it/s]

{'loss': 0.085, 'grad_norm': 0.7131851315498352, 'learning_rate': 5.040113260972157e-06, 'epoch': 2.24}


 75%|███████▍  | 19030/25428 [4:04:09<1:06:47,  1.60it/s]

{'loss': 0.0808, 'grad_norm': 0.9892131090164185, 'learning_rate': 5.032247915683499e-06, 'epoch': 2.25}


 75%|███████▍  | 19040/25428 [4:04:15<1:07:12,  1.58it/s]

{'loss': 0.0946, 'grad_norm': 2.039078712463379, 'learning_rate': 5.024382570394841e-06, 'epoch': 2.25}


 75%|███████▍  | 19050/25428 [4:04:21<1:05:39,  1.62it/s]

{'loss': 0.0845, 'grad_norm': 1.0941789150238037, 'learning_rate': 5.016517225106183e-06, 'epoch': 2.25}


 75%|███████▍  | 19060/25428 [4:04:28<1:06:53,  1.59it/s]

{'loss': 0.0868, 'grad_norm': 0.878424882888794, 'learning_rate': 5.0086518798175245e-06, 'epoch': 2.25}


 75%|███████▍  | 19070/25428 [4:04:34<1:05:39,  1.61it/s]

{'loss': 0.064, 'grad_norm': 0.7953494787216187, 'learning_rate': 5.000786534528866e-06, 'epoch': 2.25}


 75%|███████▌  | 19080/25428 [4:04:40<1:04:56,  1.63it/s]

{'loss': 0.0908, 'grad_norm': 1.0581912994384766, 'learning_rate': 4.992921189240208e-06, 'epoch': 2.25}


 75%|███████▌  | 19090/25428 [4:04:46<1:06:22,  1.59it/s]

{'loss': 0.0965, 'grad_norm': 1.296802282333374, 'learning_rate': 4.98505584395155e-06, 'epoch': 2.25}


 75%|███████▌  | 19100/25428 [4:04:53<1:06:50,  1.58it/s]

{'loss': 0.0777, 'grad_norm': 0.7489702701568604, 'learning_rate': 4.977190498662892e-06, 'epoch': 2.25}


 75%|███████▌  | 19110/25428 [4:04:59<1:07:21,  1.56it/s]

{'loss': 0.0965, 'grad_norm': 0.917934775352478, 'learning_rate': 4.969325153374234e-06, 'epoch': 2.25}


 75%|███████▌  | 19120/25428 [4:05:05<1:04:52,  1.62it/s]

{'loss': 0.0729, 'grad_norm': 0.5605801343917847, 'learning_rate': 4.9614598080855755e-06, 'epoch': 2.26}


 75%|███████▌  | 19130/25428 [4:05:11<1:04:35,  1.62it/s]

{'loss': 0.0916, 'grad_norm': 1.1314923763275146, 'learning_rate': 4.953594462796917e-06, 'epoch': 2.26}


 75%|███████▌  | 19140/25428 [4:05:18<1:06:07,  1.58it/s]

{'loss': 0.0672, 'grad_norm': 0.7478209137916565, 'learning_rate': 4.945729117508259e-06, 'epoch': 2.26}


 75%|███████▌  | 19150/25428 [4:05:24<1:06:51,  1.56it/s]

{'loss': 0.082, 'grad_norm': 1.4054107666015625, 'learning_rate': 4.937863772219601e-06, 'epoch': 2.26}


 75%|███████▌  | 19160/25428 [4:05:30<1:04:23,  1.62it/s]

{'loss': 0.1114, 'grad_norm': 0.862193763256073, 'learning_rate': 4.929998426930943e-06, 'epoch': 2.26}


 75%|███████▌  | 19170/25428 [4:05:36<1:05:11,  1.60it/s]

{'loss': 0.068, 'grad_norm': 1.0625970363616943, 'learning_rate': 4.922133081642285e-06, 'epoch': 2.26}


 75%|███████▌  | 19180/25428 [4:05:43<1:05:06,  1.60it/s]

{'loss': 0.1087, 'grad_norm': 0.643297016620636, 'learning_rate': 4.9142677363536265e-06, 'epoch': 2.26}


 75%|███████▌  | 19190/25428 [4:05:49<1:06:09,  1.57it/s]

{'loss': 0.0625, 'grad_norm': 0.8240870833396912, 'learning_rate': 4.906402391064968e-06, 'epoch': 2.26}


 76%|███████▌  | 19200/25428 [4:05:55<1:05:14,  1.59it/s]

{'loss': 0.0734, 'grad_norm': 0.5103397965431213, 'learning_rate': 4.89853704577631e-06, 'epoch': 2.27}


 76%|███████▌  | 19210/25428 [4:06:01<1:04:17,  1.61it/s]

{'loss': 0.1186, 'grad_norm': 0.6110082268714905, 'learning_rate': 4.890671700487652e-06, 'epoch': 2.27}


 76%|███████▌  | 19220/25428 [4:06:08<1:04:25,  1.61it/s]

{'loss': 0.1056, 'grad_norm': 0.667209267616272, 'learning_rate': 4.882806355198994e-06, 'epoch': 2.27}


 76%|███████▌  | 19230/25428 [4:06:14<1:05:59,  1.57it/s]

{'loss': 0.0758, 'grad_norm': 0.9345197081565857, 'learning_rate': 4.874941009910336e-06, 'epoch': 2.27}


 76%|███████▌  | 19240/25428 [4:06:20<1:03:49,  1.62it/s]

{'loss': 0.0706, 'grad_norm': 0.83181232213974, 'learning_rate': 4.8670756646216775e-06, 'epoch': 2.27}


 76%|███████▌  | 19250/25428 [4:06:26<1:03:38,  1.62it/s]

{'loss': 0.077, 'grad_norm': 0.9786180853843689, 'learning_rate': 4.859210319333019e-06, 'epoch': 2.27}


 76%|███████▌  | 19260/25428 [4:06:33<1:03:21,  1.62it/s]

{'loss': 0.1262, 'grad_norm': 0.8696165084838867, 'learning_rate': 4.851344974044361e-06, 'epoch': 2.27}


 76%|███████▌  | 19270/25428 [4:06:39<1:04:21,  1.59it/s]

{'loss': 0.0767, 'grad_norm': 0.6890693306922913, 'learning_rate': 4.843479628755703e-06, 'epoch': 2.27}


 76%|███████▌  | 19280/25428 [4:06:45<1:03:48,  1.61it/s]

{'loss': 0.0937, 'grad_norm': 1.2308915853500366, 'learning_rate': 4.835614283467045e-06, 'epoch': 2.27}


 76%|███████▌  | 19290/25428 [4:06:51<1:03:38,  1.61it/s]

{'loss': 0.0976, 'grad_norm': 1.2411004304885864, 'learning_rate': 4.827748938178387e-06, 'epoch': 2.28}


 76%|███████▌  | 19300/25428 [4:06:57<1:02:55,  1.62it/s]

{'loss': 0.0803, 'grad_norm': 0.46735960245132446, 'learning_rate': 4.8198835928897285e-06, 'epoch': 2.28}


 76%|███████▌  | 19310/25428 [4:07:04<1:03:00,  1.62it/s]

{'loss': 0.0836, 'grad_norm': 0.9107925891876221, 'learning_rate': 4.81201824760107e-06, 'epoch': 2.28}


 76%|███████▌  | 19320/25428 [4:07:10<1:02:41,  1.62it/s]

{'loss': 0.0874, 'grad_norm': 0.8233042359352112, 'learning_rate': 4.804152902312411e-06, 'epoch': 2.28}


 76%|███████▌  | 19330/25428 [4:07:16<1:04:19,  1.58it/s]

{'loss': 0.078, 'grad_norm': 0.9857375621795654, 'learning_rate': 4.796287557023754e-06, 'epoch': 2.28}


 76%|███████▌  | 19340/25428 [4:07:22<1:02:38,  1.62it/s]

{'loss': 0.0778, 'grad_norm': 1.203518033027649, 'learning_rate': 4.788422211735095e-06, 'epoch': 2.28}


 76%|███████▌  | 19350/25428 [4:07:29<1:02:51,  1.61it/s]

{'loss': 0.097, 'grad_norm': 1.0948717594146729, 'learning_rate': 4.780556866446438e-06, 'epoch': 2.28}


 76%|███████▌  | 19360/25428 [4:07:35<1:03:37,  1.59it/s]

{'loss': 0.0881, 'grad_norm': 1.096924066543579, 'learning_rate': 4.772691521157779e-06, 'epoch': 2.28}


 76%|███████▌  | 19370/25428 [4:07:41<1:03:58,  1.58it/s]

{'loss': 0.0696, 'grad_norm': 0.8110026717185974, 'learning_rate': 4.764826175869121e-06, 'epoch': 2.29}


 76%|███████▌  | 19380/25428 [4:07:47<1:02:10,  1.62it/s]

{'loss': 0.1256, 'grad_norm': 2.026493787765503, 'learning_rate': 4.756960830580462e-06, 'epoch': 2.29}


 76%|███████▋  | 19390/25428 [4:07:54<1:01:41,  1.63it/s]

{'loss': 0.1115, 'grad_norm': 1.1169946193695068, 'learning_rate': 4.749095485291805e-06, 'epoch': 2.29}


 76%|███████▋  | 19400/25428 [4:08:00<1:03:05,  1.59it/s]

{'loss': 0.0805, 'grad_norm': 0.5366857051849365, 'learning_rate': 4.741230140003146e-06, 'epoch': 2.29}


 76%|███████▋  | 19410/25428 [4:08:06<1:03:06,  1.59it/s]

{'loss': 0.0776, 'grad_norm': 0.9731173515319824, 'learning_rate': 4.733364794714489e-06, 'epoch': 2.29}


 76%|███████▋  | 19420/25428 [4:08:12<1:02:09,  1.61it/s]

{'loss': 0.0724, 'grad_norm': 1.8904731273651123, 'learning_rate': 4.72549944942583e-06, 'epoch': 2.29}


 76%|███████▋  | 19430/25428 [4:08:19<1:03:12,  1.58it/s]

{'loss': 0.0859, 'grad_norm': 1.6628035306930542, 'learning_rate': 4.717634104137172e-06, 'epoch': 2.29}


 76%|███████▋  | 19440/25428 [4:08:25<1:01:46,  1.62it/s]

{'loss': 0.0976, 'grad_norm': 0.6053319573402405, 'learning_rate': 4.709768758848513e-06, 'epoch': 2.29}


 76%|███████▋  | 19450/25428 [4:08:31<1:01:46,  1.61it/s]

{'loss': 0.0647, 'grad_norm': 0.6914232969284058, 'learning_rate': 4.701903413559856e-06, 'epoch': 2.29}


 77%|███████▋  | 19460/25428 [4:08:37<1:01:56,  1.61it/s]

{'loss': 0.0671, 'grad_norm': 0.7800241112709045, 'learning_rate': 4.694038068271197e-06, 'epoch': 2.3}


 77%|███████▋  | 19470/25428 [4:08:44<1:02:15,  1.59it/s]

{'loss': 0.1136, 'grad_norm': 1.9035084247589111, 'learning_rate': 4.68617272298254e-06, 'epoch': 2.3}


 77%|███████▋  | 19480/25428 [4:08:50<1:02:05,  1.60it/s]

{'loss': 0.095, 'grad_norm': 1.5830552577972412, 'learning_rate': 4.678307377693881e-06, 'epoch': 2.3}


 77%|███████▋  | 19490/25428 [4:08:56<1:01:37,  1.61it/s]

{'loss': 0.0934, 'grad_norm': 0.9272352457046509, 'learning_rate': 4.670442032405223e-06, 'epoch': 2.3}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0987, 'grad_norm': 0.830962061882019, 'learning_rate': 4.662576687116564e-06, 'epoch': 2.3}


 77%|███████▋  | 19510/25428 [4:09:11<1:06:30,  1.48it/s]

{'loss': 0.0913, 'grad_norm': 0.9497881531715393, 'learning_rate': 4.654711341827907e-06, 'epoch': 2.3}


 77%|███████▋  | 19520/25428 [4:09:18<1:01:47,  1.59it/s]

{'loss': 0.0608, 'grad_norm': 0.8055842518806458, 'learning_rate': 4.646845996539249e-06, 'epoch': 2.3}


 77%|███████▋  | 19530/25428 [4:09:24<1:01:31,  1.60it/s]

{'loss': 0.0759, 'grad_norm': 5.300271034240723, 'learning_rate': 4.638980651250591e-06, 'epoch': 2.3}


 77%|███████▋  | 19540/25428 [4:09:30<1:01:40,  1.59it/s]

{'loss': 0.0666, 'grad_norm': 0.763189435005188, 'learning_rate': 4.6311153059619325e-06, 'epoch': 2.31}


 77%|███████▋  | 19550/25428 [4:09:37<1:01:49,  1.58it/s]

{'loss': 0.0693, 'grad_norm': 0.6379078030586243, 'learning_rate': 4.623249960673274e-06, 'epoch': 2.31}


 77%|███████▋  | 19560/25428 [4:09:43<1:00:53,  1.61it/s]

{'loss': 0.0931, 'grad_norm': 0.9651844501495361, 'learning_rate': 4.615384615384616e-06, 'epoch': 2.31}


 77%|███████▋  | 19570/25428 [4:09:49<1:01:44,  1.58it/s]

{'loss': 0.0767, 'grad_norm': 0.8568775057792664, 'learning_rate': 4.607519270095958e-06, 'epoch': 2.31}


 77%|███████▋  | 19580/25428 [4:09:56<1:00:41,  1.61it/s]

{'loss': 0.0819, 'grad_norm': 0.944139838218689, 'learning_rate': 4.5996539248073e-06, 'epoch': 2.31}


 77%|███████▋  | 19590/25428 [4:10:02<1:01:21,  1.59it/s]

{'loss': 0.086, 'grad_norm': 0.8802400827407837, 'learning_rate': 4.591788579518641e-06, 'epoch': 2.31}


 77%|███████▋  | 19600/25428 [4:10:08<1:00:14,  1.61it/s]

{'loss': 0.0709, 'grad_norm': 1.7972854375839233, 'learning_rate': 4.5839232342299835e-06, 'epoch': 2.31}


 77%|███████▋  | 19610/25428 [4:10:14<1:02:04,  1.56it/s]

{'loss': 0.0997, 'grad_norm': 1.3002253770828247, 'learning_rate': 4.5760578889413245e-06, 'epoch': 2.31}


 77%|███████▋  | 19620/25428 [4:10:21<59:56,  1.61it/s]  

{'loss': 0.0861, 'grad_norm': 0.8670957088470459, 'learning_rate': 4.568192543652667e-06, 'epoch': 2.31}


 77%|███████▋  | 19630/25428 [4:10:27<1:01:22,  1.57it/s]

{'loss': 0.1024, 'grad_norm': 0.6717561483383179, 'learning_rate': 4.560327198364008e-06, 'epoch': 2.32}


 77%|███████▋  | 19640/25428 [4:10:33<59:45,  1.61it/s]  

{'loss': 0.1093, 'grad_norm': 1.5220719575881958, 'learning_rate': 4.552461853075351e-06, 'epoch': 2.32}


 77%|███████▋  | 19650/25428 [4:10:40<1:00:06,  1.60it/s]

{'loss': 0.0618, 'grad_norm': 0.591463029384613, 'learning_rate': 4.544596507786692e-06, 'epoch': 2.32}


 77%|███████▋  | 19660/25428 [4:10:46<1:00:21,  1.59it/s]

{'loss': 0.0908, 'grad_norm': 0.6354440450668335, 'learning_rate': 4.5367311624980345e-06, 'epoch': 2.32}


 77%|███████▋  | 19670/25428 [4:10:52<1:00:41,  1.58it/s]

{'loss': 0.0607, 'grad_norm': 1.2130619287490845, 'learning_rate': 4.5288658172093755e-06, 'epoch': 2.32}


 77%|███████▋  | 19680/25428 [4:10:58<59:51,  1.60it/s]  

{'loss': 0.0742, 'grad_norm': 2.1577887535095215, 'learning_rate': 4.521000471920718e-06, 'epoch': 2.32}


 77%|███████▋  | 19690/25428 [4:11:05<58:46,  1.63it/s]  

{'loss': 0.1102, 'grad_norm': 1.4143450260162354, 'learning_rate': 4.513135126632059e-06, 'epoch': 2.32}


 77%|███████▋  | 19700/25428 [4:11:11<59:18,  1.61it/s]  

{'loss': 0.0866, 'grad_norm': 0.7920998334884644, 'learning_rate': 4.505269781343402e-06, 'epoch': 2.32}


 78%|███████▊  | 19710/25428 [4:11:17<59:25,  1.60it/s]  

{'loss': 0.0994, 'grad_norm': 0.7771753072738647, 'learning_rate': 4.497404436054743e-06, 'epoch': 2.33}


 78%|███████▊  | 19720/25428 [4:11:23<59:56,  1.59it/s]  

{'loss': 0.0671, 'grad_norm': 0.95833820104599, 'learning_rate': 4.4895390907660855e-06, 'epoch': 2.33}


 78%|███████▊  | 19730/25428 [4:11:30<59:27,  1.60it/s]  

{'loss': 0.0738, 'grad_norm': 0.8379660844802856, 'learning_rate': 4.4816737454774265e-06, 'epoch': 2.33}


 78%|███████▊  | 19740/25428 [4:11:36<58:48,  1.61it/s]

{'loss': 0.0644, 'grad_norm': 0.6642686724662781, 'learning_rate': 4.473808400188769e-06, 'epoch': 2.33}


 78%|███████▊  | 19750/25428 [4:11:42<58:24,  1.62it/s]

{'loss': 0.0792, 'grad_norm': 1.4438263177871704, 'learning_rate': 4.46594305490011e-06, 'epoch': 2.33}


 78%|███████▊  | 19760/25428 [4:11:48<59:38,  1.58it/s]

{'loss': 0.123, 'grad_norm': 0.9199283123016357, 'learning_rate': 4.458077709611453e-06, 'epoch': 2.33}


 78%|███████▊  | 19770/25428 [4:11:55<59:08,  1.59it/s]  

{'loss': 0.0913, 'grad_norm': 1.0419565439224243, 'learning_rate': 4.450212364322794e-06, 'epoch': 2.33}


 78%|███████▊  | 19780/25428 [4:12:01<58:20,  1.61it/s]  

{'loss': 0.0964, 'grad_norm': 1.519059181213379, 'learning_rate': 4.4423470190341365e-06, 'epoch': 2.33}


 78%|███████▊  | 19790/25428 [4:12:07<58:59,  1.59it/s]  

{'loss': 0.0744, 'grad_norm': 1.5853990316390991, 'learning_rate': 4.4344816737454775e-06, 'epoch': 2.33}


 78%|███████▊  | 19800/25428 [4:12:14<1:01:20,  1.53it/s]

{'loss': 0.0906, 'grad_norm': 0.8291375041007996, 'learning_rate': 4.42661632845682e-06, 'epoch': 2.34}


 78%|███████▊  | 19810/25428 [4:12:20<59:42,  1.57it/s]  

{'loss': 0.0833, 'grad_norm': 1.171797275543213, 'learning_rate': 4.418750983168161e-06, 'epoch': 2.34}


 78%|███████▊  | 19820/25428 [4:12:26<59:32,  1.57it/s]

{'loss': 0.0993, 'grad_norm': 0.9109007716178894, 'learning_rate': 4.410885637879504e-06, 'epoch': 2.34}


 78%|███████▊  | 19830/25428 [4:12:33<57:40,  1.62it/s]  

{'loss': 0.0773, 'grad_norm': 0.7121779322624207, 'learning_rate': 4.403020292590845e-06, 'epoch': 2.34}


 78%|███████▊  | 19840/25428 [4:12:39<57:47,  1.61it/s]

{'loss': 0.1035, 'grad_norm': 1.6557579040527344, 'learning_rate': 4.395154947302187e-06, 'epoch': 2.34}


 78%|███████▊  | 19850/25428 [4:12:45<58:50,  1.58it/s]

{'loss': 0.0797, 'grad_norm': 0.7806841731071472, 'learning_rate': 4.3872896020135285e-06, 'epoch': 2.34}


 78%|███████▊  | 19860/25428 [4:12:52<57:40,  1.61it/s]  

{'loss': 0.0773, 'grad_norm': 0.8938045501708984, 'learning_rate': 4.37942425672487e-06, 'epoch': 2.34}


 78%|███████▊  | 19870/25428 [4:12:58<59:09,  1.57it/s]

{'loss': 0.0655, 'grad_norm': 1.0686917304992676, 'learning_rate': 4.371558911436212e-06, 'epoch': 2.34}


 78%|███████▊  | 19880/25428 [4:13:04<58:24,  1.58it/s]

{'loss': 0.0722, 'grad_norm': 0.6962729096412659, 'learning_rate': 4.363693566147554e-06, 'epoch': 2.35}


 78%|███████▊  | 19890/25428 [4:13:10<57:06,  1.62it/s]

{'loss': 0.147, 'grad_norm': 1.6947239637374878, 'learning_rate': 4.355828220858896e-06, 'epoch': 2.35}


 78%|███████▊  | 19900/25428 [4:13:17<59:05,  1.56it/s]

{'loss': 0.0969, 'grad_norm': 1.2938417196273804, 'learning_rate': 4.347962875570238e-06, 'epoch': 2.35}


 78%|███████▊  | 19910/25428 [4:13:23<57:48,  1.59it/s]

{'loss': 0.0662, 'grad_norm': 1.182410478591919, 'learning_rate': 4.3400975302815795e-06, 'epoch': 2.35}


 78%|███████▊  | 19920/25428 [4:13:29<57:17,  1.60it/s]

{'loss': 0.0873, 'grad_norm': 0.7994793653488159, 'learning_rate': 4.332232184992921e-06, 'epoch': 2.35}


 78%|███████▊  | 19930/25428 [4:13:36<57:03,  1.61it/s]

{'loss': 0.0853, 'grad_norm': 0.9905994534492493, 'learning_rate': 4.324366839704263e-06, 'epoch': 2.35}


 78%|███████▊  | 19940/25428 [4:13:42<57:49,  1.58it/s]

{'loss': 0.0816, 'grad_norm': 1.6383932828903198, 'learning_rate': 4.316501494415605e-06, 'epoch': 2.35}


 78%|███████▊  | 19950/25428 [4:13:48<57:15,  1.59it/s]

{'loss': 0.074, 'grad_norm': 0.7311519384384155, 'learning_rate': 4.308636149126947e-06, 'epoch': 2.35}


 78%|███████▊  | 19960/25428 [4:13:55<57:46,  1.58it/s]

{'loss': 0.0992, 'grad_norm': 1.1355715990066528, 'learning_rate': 4.300770803838289e-06, 'epoch': 2.35}


 79%|███████▊  | 19970/25428 [4:14:01<56:33,  1.61it/s]

{'loss': 0.0961, 'grad_norm': 1.2088242769241333, 'learning_rate': 4.2929054585496305e-06, 'epoch': 2.36}


 79%|███████▊  | 19980/25428 [4:14:07<57:13,  1.59it/s]

{'loss': 0.0726, 'grad_norm': 0.8379736542701721, 'learning_rate': 4.285040113260972e-06, 'epoch': 2.36}


 79%|███████▊  | 19990/25428 [4:14:13<57:11,  1.58it/s]

{'loss': 0.0728, 'grad_norm': 0.5974881052970886, 'learning_rate': 4.277174767972314e-06, 'epoch': 2.36}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0763, 'grad_norm': 0.9136428833007812, 'learning_rate': 4.269309422683656e-06, 'epoch': 2.36}


 79%|███████▊  | 20010/25428 [4:14:29<1:00:59,  1.48it/s]

{'loss': 0.0671, 'grad_norm': 0.7304434776306152, 'learning_rate': 4.261444077394998e-06, 'epoch': 2.36}


 79%|███████▊  | 20020/25428 [4:14:35<56:44,  1.59it/s]  

{'loss': 0.0549, 'grad_norm': 1.0526896715164185, 'learning_rate': 4.25357873210634e-06, 'epoch': 2.36}


 79%|███████▉  | 20030/25428 [4:14:41<56:01,  1.61it/s]

{'loss': 0.0632, 'grad_norm': 0.6570256948471069, 'learning_rate': 4.2457133868176815e-06, 'epoch': 2.36}


 79%|███████▉  | 20040/25428 [4:14:48<57:50,  1.55it/s]

{'loss': 0.0734, 'grad_norm': 0.5896267294883728, 'learning_rate': 4.237848041529023e-06, 'epoch': 2.36}


 79%|███████▉  | 20050/25428 [4:14:54<55:28,  1.62it/s]

{'loss': 0.0705, 'grad_norm': 0.6725494861602783, 'learning_rate': 4.229982696240365e-06, 'epoch': 2.37}


 79%|███████▉  | 20060/25428 [4:15:00<56:00,  1.60it/s]

{'loss': 0.0623, 'grad_norm': 0.641176164150238, 'learning_rate': 4.222117350951707e-06, 'epoch': 2.37}


 79%|███████▉  | 20070/25428 [4:15:07<57:14,  1.56it/s]

{'loss': 0.0901, 'grad_norm': 0.9017619490623474, 'learning_rate': 4.214252005663049e-06, 'epoch': 2.37}


 79%|███████▉  | 20080/25428 [4:15:13<55:56,  1.59it/s]

{'loss': 0.0716, 'grad_norm': 0.9120036959648132, 'learning_rate': 4.206386660374391e-06, 'epoch': 2.37}


 79%|███████▉  | 20090/25428 [4:15:19<56:12,  1.58it/s]

{'loss': 0.0964, 'grad_norm': 0.8669748306274414, 'learning_rate': 4.1985213150857325e-06, 'epoch': 2.37}


 79%|███████▉  | 20100/25428 [4:15:26<55:04,  1.61it/s]

{'loss': 0.0827, 'grad_norm': 1.6310653686523438, 'learning_rate': 4.190655969797074e-06, 'epoch': 2.37}


 79%|███████▉  | 20110/25428 [4:15:32<56:31,  1.57it/s]

{'loss': 0.0653, 'grad_norm': 1.0431513786315918, 'learning_rate': 4.182790624508416e-06, 'epoch': 2.37}


 79%|███████▉  | 20120/25428 [4:15:38<55:10,  1.60it/s]

{'loss': 0.0641, 'grad_norm': 1.0574668645858765, 'learning_rate': 4.174925279219758e-06, 'epoch': 2.37}


 79%|███████▉  | 20130/25428 [4:15:45<54:32,  1.62it/s]

{'loss': 0.0884, 'grad_norm': 1.3667738437652588, 'learning_rate': 4.1670599339311e-06, 'epoch': 2.37}


 79%|███████▉  | 20140/25428 [4:15:51<54:55,  1.60it/s]

{'loss': 0.0846, 'grad_norm': 0.9646145701408386, 'learning_rate': 4.159194588642442e-06, 'epoch': 2.38}


 79%|███████▉  | 20150/25428 [4:15:57<56:14,  1.56it/s]

{'loss': 0.0851, 'grad_norm': 0.8188712000846863, 'learning_rate': 4.1513292433537835e-06, 'epoch': 2.38}


 79%|███████▉  | 20160/25428 [4:16:04<54:41,  1.61it/s]

{'loss': 0.1041, 'grad_norm': 1.717143177986145, 'learning_rate': 4.143463898065125e-06, 'epoch': 2.38}


 79%|███████▉  | 20170/25428 [4:16:10<55:03,  1.59it/s]

{'loss': 0.0786, 'grad_norm': 3.7036118507385254, 'learning_rate': 4.135598552776467e-06, 'epoch': 2.38}


 79%|███████▉  | 20180/25428 [4:16:17<55:43,  1.57it/s]

{'loss': 0.0978, 'grad_norm': 1.8862546682357788, 'learning_rate': 4.127733207487809e-06, 'epoch': 2.38}


 79%|███████▉  | 20190/25428 [4:16:23<54:01,  1.62it/s]

{'loss': 0.0694, 'grad_norm': 3.6094770431518555, 'learning_rate': 4.119867862199151e-06, 'epoch': 2.38}


 79%|███████▉  | 20200/25428 [4:16:29<55:42,  1.56it/s]

{'loss': 0.0721, 'grad_norm': 0.8950343132019043, 'learning_rate': 4.112002516910493e-06, 'epoch': 2.38}


 79%|███████▉  | 20210/25428 [4:16:35<53:45,  1.62it/s]

{'loss': 0.0682, 'grad_norm': 1.0126296281814575, 'learning_rate': 4.1041371716218345e-06, 'epoch': 2.38}


 80%|███████▉  | 20220/25428 [4:16:42<54:31,  1.59it/s]

{'loss': 0.1211, 'grad_norm': 1.1235307455062866, 'learning_rate': 4.096271826333176e-06, 'epoch': 2.39}


 80%|███████▉  | 20230/25428 [4:16:48<54:08,  1.60it/s]

{'loss': 0.0761, 'grad_norm': 0.868516206741333, 'learning_rate': 4.088406481044518e-06, 'epoch': 2.39}


 80%|███████▉  | 20240/25428 [4:16:54<54:21,  1.59it/s]

{'loss': 0.0768, 'grad_norm': 0.8287244439125061, 'learning_rate': 4.08054113575586e-06, 'epoch': 2.39}


 80%|███████▉  | 20250/25428 [4:17:01<54:04,  1.60it/s]

{'loss': 0.1196, 'grad_norm': 3.963376522064209, 'learning_rate': 4.072675790467202e-06, 'epoch': 2.39}


 80%|███████▉  | 20260/25428 [4:17:07<53:58,  1.60it/s]

{'loss': 0.1043, 'grad_norm': 1.17373526096344, 'learning_rate': 4.064810445178544e-06, 'epoch': 2.39}


 80%|███████▉  | 20270/25428 [4:17:13<55:25,  1.55it/s]

{'loss': 0.066, 'grad_norm': 0.5727673768997192, 'learning_rate': 4.0569450998898855e-06, 'epoch': 2.39}


 80%|███████▉  | 20280/25428 [4:17:19<53:51,  1.59it/s]

{'loss': 0.0757, 'grad_norm': 0.67910236120224, 'learning_rate': 4.049079754601227e-06, 'epoch': 2.39}


 80%|███████▉  | 20290/25428 [4:17:26<52:57,  1.62it/s]

{'loss': 0.0721, 'grad_norm': 1.1204473972320557, 'learning_rate': 4.041214409312569e-06, 'epoch': 2.39}


 80%|███████▉  | 20300/25428 [4:17:32<53:02,  1.61it/s]

{'loss': 0.0952, 'grad_norm': 1.1075371503829956, 'learning_rate': 4.033349064023911e-06, 'epoch': 2.39}


 80%|███████▉  | 20310/25428 [4:17:38<53:34,  1.59it/s]

{'loss': 0.0965, 'grad_norm': 1.0140727758407593, 'learning_rate': 4.025483718735253e-06, 'epoch': 2.4}


 80%|███████▉  | 20320/25428 [4:17:44<53:26,  1.59it/s]

{'loss': 0.0967, 'grad_norm': 1.7168501615524292, 'learning_rate': 4.017618373446595e-06, 'epoch': 2.4}


 80%|███████▉  | 20330/25428 [4:17:51<53:56,  1.57it/s]

{'loss': 0.0754, 'grad_norm': 1.1186391115188599, 'learning_rate': 4.0097530281579365e-06, 'epoch': 2.4}


 80%|███████▉  | 20340/25428 [4:17:57<53:33,  1.58it/s]

{'loss': 0.1007, 'grad_norm': 1.0588306188583374, 'learning_rate': 4.001887682869278e-06, 'epoch': 2.4}


 80%|████████  | 20350/25428 [4:18:03<53:38,  1.58it/s]

{'loss': 0.094, 'grad_norm': 1.0088766813278198, 'learning_rate': 3.99402233758062e-06, 'epoch': 2.4}


 80%|████████  | 20360/25428 [4:18:10<53:32,  1.58it/s]

{'loss': 0.0858, 'grad_norm': 1.0282913446426392, 'learning_rate': 3.986156992291962e-06, 'epoch': 2.4}


 80%|████████  | 20370/25428 [4:18:16<53:32,  1.57it/s]

{'loss': 0.0822, 'grad_norm': 0.7808197140693665, 'learning_rate': 3.978291647003304e-06, 'epoch': 2.4}


 80%|████████  | 20380/25428 [4:18:22<53:20,  1.58it/s]

{'loss': 0.0827, 'grad_norm': 0.9603452682495117, 'learning_rate': 3.970426301714646e-06, 'epoch': 2.4}


 80%|████████  | 20390/25428 [4:18:29<53:27,  1.57it/s]

{'loss': 0.0975, 'grad_norm': 0.6746074557304382, 'learning_rate': 3.9625609564259875e-06, 'epoch': 2.41}


 80%|████████  | 20400/25428 [4:18:35<52:25,  1.60it/s]

{'loss': 0.1099, 'grad_norm': 2.046276092529297, 'learning_rate': 3.954695611137329e-06, 'epoch': 2.41}


 80%|████████  | 20410/25428 [4:18:41<52:34,  1.59it/s]

{'loss': 0.0835, 'grad_norm': 1.301882028579712, 'learning_rate': 3.946830265848671e-06, 'epoch': 2.41}


 80%|████████  | 20420/25428 [4:18:48<52:13,  1.60it/s]

{'loss': 0.0658, 'grad_norm': 0.7829489707946777, 'learning_rate': 3.938964920560013e-06, 'epoch': 2.41}


 80%|████████  | 20430/25428 [4:18:54<52:18,  1.59it/s]

{'loss': 0.0792, 'grad_norm': 1.2504019737243652, 'learning_rate': 3.931099575271355e-06, 'epoch': 2.41}


 80%|████████  | 20440/25428 [4:19:00<52:11,  1.59it/s]

{'loss': 0.0566, 'grad_norm': 1.0384788513183594, 'learning_rate': 3.923234229982697e-06, 'epoch': 2.41}


 80%|████████  | 20450/25428 [4:19:06<51:39,  1.61it/s]

{'loss': 0.1163, 'grad_norm': 0.8073058724403381, 'learning_rate': 3.9153688846940385e-06, 'epoch': 2.41}


 80%|████████  | 20460/25428 [4:19:13<52:27,  1.58it/s]

{'loss': 0.0782, 'grad_norm': 0.8712475299835205, 'learning_rate': 3.90750353940538e-06, 'epoch': 2.41}


 81%|████████  | 20470/25428 [4:19:19<52:20,  1.58it/s]

{'loss': 0.0709, 'grad_norm': 3.062974452972412, 'learning_rate': 3.899638194116722e-06, 'epoch': 2.42}


 81%|████████  | 20480/25428 [4:19:26<52:26,  1.57it/s]

{'loss': 0.0995, 'grad_norm': 0.8377696871757507, 'learning_rate': 3.891772848828064e-06, 'epoch': 2.42}


 81%|████████  | 20490/25428 [4:19:32<52:05,  1.58it/s]

{'loss': 0.0686, 'grad_norm': 1.3249322175979614, 'learning_rate': 3.883907503539406e-06, 'epoch': 2.42}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0871, 'grad_norm': 0.8731828331947327, 'learning_rate': 3.876042158250748e-06, 'epoch': 2.42}


 81%|████████  | 20510/25428 [4:19:47<54:52,  1.49it/s]  

{'loss': 0.0834, 'grad_norm': 1.0761075019836426, 'learning_rate': 3.8681768129620895e-06, 'epoch': 2.42}


 81%|████████  | 20520/25428 [4:19:54<51:51,  1.58it/s]

{'loss': 0.0812, 'grad_norm': 1.1639305353164673, 'learning_rate': 3.860311467673431e-06, 'epoch': 2.42}


 81%|████████  | 20530/25428 [4:20:00<50:13,  1.63it/s]

{'loss': 0.0913, 'grad_norm': 1.015432357788086, 'learning_rate': 3.852446122384773e-06, 'epoch': 2.42}


 81%|████████  | 20540/25428 [4:20:06<52:11,  1.56it/s]

{'loss': 0.0724, 'grad_norm': 1.3257108926773071, 'learning_rate': 3.844580777096115e-06, 'epoch': 2.42}


 81%|████████  | 20550/25428 [4:20:13<54:17,  1.50it/s]

{'loss': 0.0732, 'grad_norm': 0.7009966969490051, 'learning_rate': 3.836715431807457e-06, 'epoch': 2.42}


 81%|████████  | 20560/25428 [4:20:19<51:01,  1.59it/s]

{'loss': 0.1262, 'grad_norm': 0.880286455154419, 'learning_rate': 3.828850086518799e-06, 'epoch': 2.43}


 81%|████████  | 20570/25428 [4:20:25<50:30,  1.60it/s]

{'loss': 0.0704, 'grad_norm': 0.9619410037994385, 'learning_rate': 3.8209847412301405e-06, 'epoch': 2.43}


 81%|████████  | 20580/25428 [4:20:32<51:15,  1.58it/s]

{'loss': 0.0668, 'grad_norm': 0.6439642310142517, 'learning_rate': 3.813119395941482e-06, 'epoch': 2.43}


 81%|████████  | 20590/25428 [4:20:38<50:29,  1.60it/s]

{'loss': 0.0743, 'grad_norm': 0.6667136549949646, 'learning_rate': 3.805254050652824e-06, 'epoch': 2.43}


 81%|████████  | 20600/25428 [4:20:44<53:49,  1.49it/s]

{'loss': 0.0969, 'grad_norm': 1.0099269151687622, 'learning_rate': 3.7973887053641656e-06, 'epoch': 2.43}


 81%|████████  | 20610/25428 [4:20:51<49:24,  1.63it/s]

{'loss': 0.0629, 'grad_norm': 0.9249946475028992, 'learning_rate': 3.789523360075508e-06, 'epoch': 2.43}


 81%|████████  | 20620/25428 [4:20:57<49:12,  1.63it/s]

{'loss': 0.1114, 'grad_norm': 1.6256955862045288, 'learning_rate': 3.7816580147868492e-06, 'epoch': 2.43}


 81%|████████  | 20630/25428 [4:21:03<49:50,  1.60it/s]

{'loss': 0.0753, 'grad_norm': 1.4168782234191895, 'learning_rate': 3.7737926694981915e-06, 'epoch': 2.43}


 81%|████████  | 20640/25428 [4:21:09<50:34,  1.58it/s]

{'loss': 0.0804, 'grad_norm': 0.8988732099533081, 'learning_rate': 3.765927324209533e-06, 'epoch': 2.44}


 81%|████████  | 20650/25428 [4:21:16<50:39,  1.57it/s]

{'loss': 0.0658, 'grad_norm': 0.9748014211654663, 'learning_rate': 3.758061978920875e-06, 'epoch': 2.44}


 81%|████████  | 20660/25428 [4:21:22<49:30,  1.60it/s]

{'loss': 0.0655, 'grad_norm': 0.6761098504066467, 'learning_rate': 3.7501966336322166e-06, 'epoch': 2.44}


 81%|████████▏ | 20670/25428 [4:21:29<50:25,  1.57it/s]

{'loss': 0.1257, 'grad_norm': 0.6108102798461914, 'learning_rate': 3.742331288343559e-06, 'epoch': 2.44}


 81%|████████▏ | 20680/25428 [4:21:35<49:38,  1.59it/s]

{'loss': 0.0962, 'grad_norm': 1.6333951950073242, 'learning_rate': 3.7344659430549002e-06, 'epoch': 2.44}


 81%|████████▏ | 20690/25428 [4:21:41<49:39,  1.59it/s]

{'loss': 0.0731, 'grad_norm': 1.1302869319915771, 'learning_rate': 3.726600597766242e-06, 'epoch': 2.44}


 81%|████████▏ | 20700/25428 [4:21:47<49:16,  1.60it/s]

{'loss': 0.0831, 'grad_norm': 0.7553156018257141, 'learning_rate': 3.718735252477584e-06, 'epoch': 2.44}


 81%|████████▏ | 20710/25428 [4:21:54<50:03,  1.57it/s]

{'loss': 0.0739, 'grad_norm': 0.8490884304046631, 'learning_rate': 3.7108699071889257e-06, 'epoch': 2.44}


 81%|████████▏ | 20720/25428 [4:22:00<49:43,  1.58it/s]

{'loss': 0.0851, 'grad_norm': 0.7949142456054688, 'learning_rate': 3.7030045619002676e-06, 'epoch': 2.44}


 82%|████████▏ | 20730/25428 [4:22:07<50:12,  1.56it/s]

{'loss': 0.0666, 'grad_norm': 0.7462761998176575, 'learning_rate': 3.6951392166116094e-06, 'epoch': 2.45}


 82%|████████▏ | 20740/25428 [4:22:13<52:59,  1.47it/s]

{'loss': 0.1145, 'grad_norm': 2.7320613861083984, 'learning_rate': 3.6872738713229512e-06, 'epoch': 2.45}


 82%|████████▏ | 20750/25428 [4:22:19<49:37,  1.57it/s]

{'loss': 0.0666, 'grad_norm': 0.7316690683364868, 'learning_rate': 3.679408526034293e-06, 'epoch': 2.45}


 82%|████████▏ | 20760/25428 [4:22:26<49:22,  1.58it/s]

{'loss': 0.1147, 'grad_norm': 0.7515506744384766, 'learning_rate': 3.671543180745635e-06, 'epoch': 2.45}


 82%|████████▏ | 20770/25428 [4:22:32<49:52,  1.56it/s]

{'loss': 0.093, 'grad_norm': 0.8860138058662415, 'learning_rate': 3.6636778354569767e-06, 'epoch': 2.45}


 82%|████████▏ | 20780/25428 [4:22:39<48:45,  1.59it/s]

{'loss': 0.084, 'grad_norm': 0.8223132491111755, 'learning_rate': 3.655812490168319e-06, 'epoch': 2.45}


 82%|████████▏ | 20790/25428 [4:22:45<49:31,  1.56it/s]

{'loss': 0.0664, 'grad_norm': 1.2454808950424194, 'learning_rate': 3.6479471448796604e-06, 'epoch': 2.45}


 82%|████████▏ | 20800/25428 [4:22:51<48:36,  1.59it/s]

{'loss': 0.1103, 'grad_norm': 0.8144438862800598, 'learning_rate': 3.6400817995910027e-06, 'epoch': 2.45}


 82%|████████▏ | 20810/25428 [4:22:58<49:34,  1.55it/s]

{'loss': 0.0865, 'grad_norm': 1.3189679384231567, 'learning_rate': 3.632216454302344e-06, 'epoch': 2.46}


 82%|████████▏ | 20820/25428 [4:23:04<48:59,  1.57it/s]

{'loss': 0.0573, 'grad_norm': 2.614654541015625, 'learning_rate': 3.6243511090136863e-06, 'epoch': 2.46}


 82%|████████▏ | 20830/25428 [4:23:10<48:33,  1.58it/s]

{'loss': 0.1216, 'grad_norm': 0.9409712553024292, 'learning_rate': 3.6164857637250277e-06, 'epoch': 2.46}


 82%|████████▏ | 20840/25428 [4:23:17<49:18,  1.55it/s]

{'loss': 0.1139, 'grad_norm': 1.5828442573547363, 'learning_rate': 3.60862041843637e-06, 'epoch': 2.46}


 82%|████████▏ | 20850/25428 [4:23:23<47:42,  1.60it/s]

{'loss': 0.0636, 'grad_norm': 0.5772495269775391, 'learning_rate': 3.6007550731477114e-06, 'epoch': 2.46}


 82%|████████▏ | 20860/25428 [4:23:30<47:58,  1.59it/s]

{'loss': 0.0773, 'grad_norm': 0.4976504147052765, 'learning_rate': 3.5928897278590537e-06, 'epoch': 2.46}


 82%|████████▏ | 20870/25428 [4:23:36<47:58,  1.58it/s]

{'loss': 0.0812, 'grad_norm': 1.2687809467315674, 'learning_rate': 3.585024382570395e-06, 'epoch': 2.46}


 82%|████████▏ | 20880/25428 [4:23:42<47:44,  1.59it/s]

{'loss': 0.0746, 'grad_norm': 1.0052571296691895, 'learning_rate': 3.5771590372817373e-06, 'epoch': 2.46}


 82%|████████▏ | 20890/25428 [4:23:49<48:44,  1.55it/s]

{'loss': 0.0876, 'grad_norm': 0.9597559571266174, 'learning_rate': 3.5692936919930787e-06, 'epoch': 2.46}


 82%|████████▏ | 20900/25428 [4:23:55<47:56,  1.57it/s]

{'loss': 0.0746, 'grad_norm': 1.135616660118103, 'learning_rate': 3.561428346704421e-06, 'epoch': 2.47}


 82%|████████▏ | 20910/25428 [4:24:02<48:31,  1.55it/s]

{'loss': 0.0742, 'grad_norm': 0.7608107328414917, 'learning_rate': 3.5535630014157624e-06, 'epoch': 2.47}


 82%|████████▏ | 20920/25428 [4:24:08<47:18,  1.59it/s]

{'loss': 0.0796, 'grad_norm': 1.445238709449768, 'learning_rate': 3.5456976561271047e-06, 'epoch': 2.47}


 82%|████████▏ | 20930/25428 [4:24:14<47:03,  1.59it/s]

{'loss': 0.1304, 'grad_norm': 0.8437132239341736, 'learning_rate': 3.537832310838446e-06, 'epoch': 2.47}


 82%|████████▏ | 20940/25428 [4:24:21<47:17,  1.58it/s]

{'loss': 0.1076, 'grad_norm': 1.2427343130111694, 'learning_rate': 3.5299669655497883e-06, 'epoch': 2.47}


 82%|████████▏ | 20950/25428 [4:24:27<46:11,  1.62it/s]

{'loss': 0.0725, 'grad_norm': 0.9963560104370117, 'learning_rate': 3.5221016202611297e-06, 'epoch': 2.47}


 82%|████████▏ | 20960/25428 [4:24:33<47:37,  1.56it/s]

{'loss': 0.1013, 'grad_norm': 0.9009600281715393, 'learning_rate': 3.5142362749724716e-06, 'epoch': 2.47}


 82%|████████▏ | 20970/25428 [4:24:40<47:51,  1.55it/s]

{'loss': 0.0686, 'grad_norm': 0.6659144163131714, 'learning_rate': 3.5063709296838134e-06, 'epoch': 2.47}


 83%|████████▎ | 20980/25428 [4:24:46<47:04,  1.57it/s]

{'loss': 0.0776, 'grad_norm': 1.545269250869751, 'learning_rate': 3.4985055843951552e-06, 'epoch': 2.48}


 83%|████████▎ | 20990/25428 [4:24:52<46:05,  1.60it/s]

{'loss': 0.0767, 'grad_norm': 0.9749877452850342, 'learning_rate': 3.490640239106497e-06, 'epoch': 2.48}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.07, 'grad_norm': 0.8865429162979126, 'learning_rate': 3.482774893817839e-06, 'epoch': 2.48}


 83%|████████▎ | 21010/25428 [4:25:08<48:55,  1.50it/s]  

{'loss': 0.0836, 'grad_norm': 0.7690128087997437, 'learning_rate': 3.4749095485291807e-06, 'epoch': 2.48}


 83%|████████▎ | 21020/25428 [4:25:14<46:24,  1.58it/s]

{'loss': 0.0821, 'grad_norm': 0.7278503775596619, 'learning_rate': 3.4670442032405226e-06, 'epoch': 2.48}


 83%|████████▎ | 21030/25428 [4:25:20<45:50,  1.60it/s]

{'loss': 0.095, 'grad_norm': 2.054795742034912, 'learning_rate': 3.4591788579518644e-06, 'epoch': 2.48}


 83%|████████▎ | 21040/25428 [4:25:27<45:04,  1.62it/s]

{'loss': 0.0523, 'grad_norm': 0.6525087952613831, 'learning_rate': 3.4513135126632062e-06, 'epoch': 2.48}


 83%|████████▎ | 21050/25428 [4:25:33<45:56,  1.59it/s]

{'loss': 0.0931, 'grad_norm': 1.0966601371765137, 'learning_rate': 3.443448167374548e-06, 'epoch': 2.48}


 83%|████████▎ | 21060/25428 [4:25:39<45:26,  1.60it/s]

{'loss': 0.08, 'grad_norm': 0.8480066657066345, 'learning_rate': 3.43558282208589e-06, 'epoch': 2.48}


 83%|████████▎ | 21070/25428 [4:25:45<45:20,  1.60it/s]

{'loss': 0.0784, 'grad_norm': 0.9574617743492126, 'learning_rate': 3.4277174767972317e-06, 'epoch': 2.49}


 83%|████████▎ | 21080/25428 [4:25:52<45:01,  1.61it/s]

{'loss': 0.0979, 'grad_norm': 0.9039132595062256, 'learning_rate': 3.4198521315085736e-06, 'epoch': 2.49}


 83%|████████▎ | 21090/25428 [4:25:58<45:59,  1.57it/s]

{'loss': 0.1127, 'grad_norm': 0.6762683391571045, 'learning_rate': 3.411986786219915e-06, 'epoch': 2.49}


 83%|████████▎ | 21100/25428 [4:26:04<45:20,  1.59it/s]

{'loss': 0.0997, 'grad_norm': 1.0992454290390015, 'learning_rate': 3.4041214409312572e-06, 'epoch': 2.49}


 83%|████████▎ | 21110/25428 [4:26:11<45:55,  1.57it/s]

{'loss': 0.0625, 'grad_norm': 0.6991212368011475, 'learning_rate': 3.3962560956425987e-06, 'epoch': 2.49}


 83%|████████▎ | 21120/25428 [4:26:17<44:55,  1.60it/s]

{'loss': 0.0851, 'grad_norm': 1.1698092222213745, 'learning_rate': 3.388390750353941e-06, 'epoch': 2.49}


 83%|████████▎ | 21130/25428 [4:26:23<45:34,  1.57it/s]

{'loss': 0.0781, 'grad_norm': 0.6670881509780884, 'learning_rate': 3.3805254050652823e-06, 'epoch': 2.49}


 83%|████████▎ | 21140/25428 [4:26:30<44:46,  1.60it/s]

{'loss': 0.0855, 'grad_norm': 0.6993826031684875, 'learning_rate': 3.3726600597766246e-06, 'epoch': 2.49}


 83%|████████▎ | 21150/25428 [4:26:36<46:32,  1.53it/s]

{'loss': 0.0587, 'grad_norm': 0.7762892842292786, 'learning_rate': 3.364794714487966e-06, 'epoch': 2.5}


 83%|████████▎ | 21160/25428 [4:26:43<45:14,  1.57it/s]

{'loss': 0.0728, 'grad_norm': 0.688737690448761, 'learning_rate': 3.3569293691993082e-06, 'epoch': 2.5}


 83%|████████▎ | 21170/25428 [4:26:49<45:00,  1.58it/s]

{'loss': 0.0803, 'grad_norm': 0.6546450853347778, 'learning_rate': 3.3490640239106497e-06, 'epoch': 2.5}


 83%|████████▎ | 21180/25428 [4:26:55<45:28,  1.56it/s]

{'loss': 0.0819, 'grad_norm': 1.0321100950241089, 'learning_rate': 3.341198678621992e-06, 'epoch': 2.5}


 83%|████████▎ | 21190/25428 [4:27:02<45:42,  1.55it/s]

{'loss': 0.1069, 'grad_norm': 1.0956711769104004, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.5}


 83%|████████▎ | 21200/25428 [4:27:08<43:43,  1.61it/s]

{'loss': 0.1074, 'grad_norm': 1.0071136951446533, 'learning_rate': 3.3254679880446756e-06, 'epoch': 2.5}


 83%|████████▎ | 21210/25428 [4:27:15<45:35,  1.54it/s]

{'loss': 0.0707, 'grad_norm': 0.6236832141876221, 'learning_rate': 3.317602642756017e-06, 'epoch': 2.5}


 83%|████████▎ | 21220/25428 [4:27:21<44:53,  1.56it/s]

{'loss': 0.1067, 'grad_norm': 0.8194040060043335, 'learning_rate': 3.3097372974673592e-06, 'epoch': 2.5}


 83%|████████▎ | 21230/25428 [4:27:27<44:30,  1.57it/s]

{'loss': 0.1063, 'grad_norm': 1.0861847400665283, 'learning_rate': 3.3018719521787007e-06, 'epoch': 2.5}


 84%|████████▎ | 21240/25428 [4:27:34<43:54,  1.59it/s]

{'loss': 0.0981, 'grad_norm': 0.8791866898536682, 'learning_rate': 3.294006606890043e-06, 'epoch': 2.51}


 84%|████████▎ | 21250/25428 [4:27:40<43:45,  1.59it/s]

{'loss': 0.0895, 'grad_norm': 0.7041336894035339, 'learning_rate': 3.2861412616013843e-06, 'epoch': 2.51}


 84%|████████▎ | 21260/25428 [4:27:46<44:12,  1.57it/s]

{'loss': 0.0789, 'grad_norm': 0.7496433258056641, 'learning_rate': 3.2782759163127266e-06, 'epoch': 2.51}


 84%|████████▎ | 21270/25428 [4:27:53<44:21,  1.56it/s]

{'loss': 0.1025, 'grad_norm': 0.9229451417922974, 'learning_rate': 3.270410571024068e-06, 'epoch': 2.51}


 84%|████████▎ | 21280/25428 [4:27:59<44:01,  1.57it/s]

{'loss': 0.1113, 'grad_norm': 0.8368417024612427, 'learning_rate': 3.2625452257354102e-06, 'epoch': 2.51}


 84%|████████▎ | 21290/25428 [4:28:05<42:41,  1.62it/s]

{'loss': 0.0955, 'grad_norm': 1.2512739896774292, 'learning_rate': 3.2546798804467517e-06, 'epoch': 2.51}


 84%|████████▍ | 21300/25428 [4:28:11<43:18,  1.59it/s]

{'loss': 0.0941, 'grad_norm': 1.1568529605865479, 'learning_rate': 3.246814535158094e-06, 'epoch': 2.51}


 84%|████████▍ | 21310/25428 [4:28:18<43:23,  1.58it/s]

{'loss': 0.0792, 'grad_norm': 0.9797244668006897, 'learning_rate': 3.2389491898694353e-06, 'epoch': 2.51}


 84%|████████▍ | 21320/25428 [4:28:24<44:00,  1.56it/s]

{'loss': 0.0745, 'grad_norm': 0.7894847393035889, 'learning_rate': 3.2310838445807776e-06, 'epoch': 2.52}


 84%|████████▍ | 21330/25428 [4:28:30<43:31,  1.57it/s]

{'loss': 0.0839, 'grad_norm': 0.9919741749763489, 'learning_rate': 3.223218499292119e-06, 'epoch': 2.52}


 84%|████████▍ | 21340/25428 [4:28:37<42:44,  1.59it/s]

{'loss': 0.0776, 'grad_norm': 1.6010788679122925, 'learning_rate': 3.2153531540034613e-06, 'epoch': 2.52}


 84%|████████▍ | 21350/25428 [4:28:43<43:15,  1.57it/s]

{'loss': 0.0664, 'grad_norm': 0.847557008266449, 'learning_rate': 3.2074878087148027e-06, 'epoch': 2.52}


 84%|████████▍ | 21360/25428 [4:28:49<42:57,  1.58it/s]

{'loss': 0.0936, 'grad_norm': 0.9939554333686829, 'learning_rate': 3.1996224634261445e-06, 'epoch': 2.52}


 84%|████████▍ | 21370/25428 [4:28:56<42:23,  1.60it/s]

{'loss': 0.112, 'grad_norm': 0.979882538318634, 'learning_rate': 3.1917571181374863e-06, 'epoch': 2.52}


 84%|████████▍ | 21380/25428 [4:29:02<41:40,  1.62it/s]

{'loss': 0.0649, 'grad_norm': 0.8122560977935791, 'learning_rate': 3.183891772848828e-06, 'epoch': 2.52}


 84%|████████▍ | 21390/25428 [4:29:08<41:34,  1.62it/s]

{'loss': 0.0962, 'grad_norm': 0.7708604335784912, 'learning_rate': 3.17602642756017e-06, 'epoch': 2.52}


 84%|████████▍ | 21400/25428 [4:29:15<44:16,  1.52it/s]

{'loss': 0.0658, 'grad_norm': 1.4933654069900513, 'learning_rate': 3.168161082271512e-06, 'epoch': 2.52}


 84%|████████▍ | 21410/25428 [4:29:21<41:50,  1.60it/s]

{'loss': 0.0719, 'grad_norm': 0.9053003191947937, 'learning_rate': 3.160295736982854e-06, 'epoch': 2.53}


 84%|████████▍ | 21420/25428 [4:29:27<41:52,  1.60it/s]

{'loss': 0.0742, 'grad_norm': 0.5578868389129639, 'learning_rate': 3.1524303916941955e-06, 'epoch': 2.53}


 84%|████████▍ | 21430/25428 [4:29:34<41:51,  1.59it/s]

{'loss': 0.0651, 'grad_norm': 0.6497705578804016, 'learning_rate': 3.1445650464055378e-06, 'epoch': 2.53}


 84%|████████▍ | 21440/25428 [4:29:40<42:13,  1.57it/s]

{'loss': 0.0827, 'grad_norm': 1.196306586265564, 'learning_rate': 3.136699701116879e-06, 'epoch': 2.53}


 84%|████████▍ | 21450/25428 [4:29:46<42:14,  1.57it/s]

{'loss': 0.0789, 'grad_norm': 1.101196527481079, 'learning_rate': 3.1288343558282214e-06, 'epoch': 2.53}


 84%|████████▍ | 21460/25428 [4:29:53<41:34,  1.59it/s]

{'loss': 0.1058, 'grad_norm': 0.7024217247962952, 'learning_rate': 3.120969010539563e-06, 'epoch': 2.53}


 84%|████████▍ | 21470/25428 [4:29:59<41:17,  1.60it/s]

{'loss': 0.0762, 'grad_norm': 1.1316219568252563, 'learning_rate': 3.113103665250905e-06, 'epoch': 2.53}


 84%|████████▍ | 21480/25428 [4:30:05<42:02,  1.57it/s]

{'loss': 0.0559, 'grad_norm': 0.5888811945915222, 'learning_rate': 3.1052383199622465e-06, 'epoch': 2.53}


 85%|████████▍ | 21490/25428 [4:30:11<41:03,  1.60it/s]

{'loss': 0.1068, 'grad_norm': 1.1387343406677246, 'learning_rate': 3.0973729746735888e-06, 'epoch': 2.54}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1002, 'grad_norm': 1.2214488983154297, 'learning_rate': 3.08950762938493e-06, 'epoch': 2.54}


 85%|████████▍ | 21510/25428 [4:30:27<43:32,  1.50it/s]  

{'loss': 0.0565, 'grad_norm': 1.4936091899871826, 'learning_rate': 3.0816422840962724e-06, 'epoch': 2.54}


 85%|████████▍ | 21520/25428 [4:30:33<41:22,  1.57it/s]

{'loss': 0.0807, 'grad_norm': 0.5599781274795532, 'learning_rate': 3.073776938807614e-06, 'epoch': 2.54}


 85%|████████▍ | 21530/25428 [4:30:40<40:58,  1.59it/s]

{'loss': 0.1004, 'grad_norm': 0.6042673587799072, 'learning_rate': 3.065911593518956e-06, 'epoch': 2.54}


 85%|████████▍ | 21540/25428 [4:30:46<41:55,  1.55it/s]

{'loss': 0.0655, 'grad_norm': 0.7777625918388367, 'learning_rate': 3.0580462482302975e-06, 'epoch': 2.54}


 85%|████████▍ | 21550/25428 [4:30:52<41:03,  1.57it/s]

{'loss': 0.0933, 'grad_norm': 0.7318809032440186, 'learning_rate': 3.0501809029416398e-06, 'epoch': 2.54}


 85%|████████▍ | 21560/25428 [4:30:59<41:07,  1.57it/s]

{'loss': 0.063, 'grad_norm': 0.7876070141792297, 'learning_rate': 3.042315557652981e-06, 'epoch': 2.54}


 85%|████████▍ | 21570/25428 [4:31:05<41:22,  1.55it/s]

{'loss': 0.0666, 'grad_norm': 1.0749894380569458, 'learning_rate': 3.0344502123643234e-06, 'epoch': 2.54}


 85%|████████▍ | 21580/25428 [4:31:12<40:25,  1.59it/s]

{'loss': 0.068, 'grad_norm': 0.7384105324745178, 'learning_rate': 3.026584867075665e-06, 'epoch': 2.55}


 85%|████████▍ | 21590/25428 [4:31:18<40:23,  1.58it/s]

{'loss': 0.0805, 'grad_norm': 1.051598072052002, 'learning_rate': 3.018719521787007e-06, 'epoch': 2.55}


 85%|████████▍ | 21600/25428 [4:31:24<40:40,  1.57it/s]

{'loss': 0.0846, 'grad_norm': 0.5695366263389587, 'learning_rate': 3.0108541764983485e-06, 'epoch': 2.55}


 85%|████████▍ | 21610/25428 [4:31:31<39:58,  1.59it/s]

{'loss': 0.0826, 'grad_norm': 0.9367357492446899, 'learning_rate': 3.0029888312096903e-06, 'epoch': 2.55}


 85%|████████▌ | 21620/25428 [4:31:37<40:06,  1.58it/s]

{'loss': 0.103, 'grad_norm': 0.7684893012046814, 'learning_rate': 2.995123485921032e-06, 'epoch': 2.55}


 85%|████████▌ | 21630/25428 [4:31:43<40:25,  1.57it/s]

{'loss': 0.0798, 'grad_norm': 0.8784983158111572, 'learning_rate': 2.987258140632374e-06, 'epoch': 2.55}


 85%|████████▌ | 21640/25428 [4:31:50<39:58,  1.58it/s]

{'loss': 0.0868, 'grad_norm': 1.191360592842102, 'learning_rate': 2.979392795343716e-06, 'epoch': 2.55}


 85%|████████▌ | 21650/25428 [4:31:56<39:11,  1.61it/s]

{'loss': 0.0696, 'grad_norm': 0.8401247262954712, 'learning_rate': 2.9715274500550577e-06, 'epoch': 2.55}


 85%|████████▌ | 21660/25428 [4:32:02<40:12,  1.56it/s]

{'loss': 0.0718, 'grad_norm': 1.1522217988967896, 'learning_rate': 2.9636621047663995e-06, 'epoch': 2.56}


 85%|████████▌ | 21670/25428 [4:32:09<39:51,  1.57it/s]

{'loss': 0.0779, 'grad_norm': 0.973676860332489, 'learning_rate': 2.9557967594777413e-06, 'epoch': 2.56}


 85%|████████▌ | 21680/25428 [4:32:15<40:14,  1.55it/s]

{'loss': 0.079, 'grad_norm': 0.7768774628639221, 'learning_rate': 2.947931414189083e-06, 'epoch': 2.56}


 85%|████████▌ | 21690/25428 [4:32:22<38:46,  1.61it/s]

{'loss': 0.0918, 'grad_norm': 0.9017207622528076, 'learning_rate': 2.940066068900425e-06, 'epoch': 2.56}


 85%|████████▌ | 21700/25428 [4:32:28<39:50,  1.56it/s]

{'loss': 0.1026, 'grad_norm': 0.9101253747940063, 'learning_rate': 2.932200723611767e-06, 'epoch': 2.56}


 85%|████████▌ | 21710/25428 [4:32:34<39:34,  1.57it/s]

{'loss': 0.0609, 'grad_norm': 1.8471919298171997, 'learning_rate': 2.9243353783231087e-06, 'epoch': 2.56}


 85%|████████▌ | 21720/25428 [4:32:41<39:11,  1.58it/s]

{'loss': 0.0905, 'grad_norm': 0.5827623605728149, 'learning_rate': 2.9164700330344505e-06, 'epoch': 2.56}


 85%|████████▌ | 21730/25428 [4:32:47<38:51,  1.59it/s]

{'loss': 0.0749, 'grad_norm': 1.0399531126022339, 'learning_rate': 2.9086046877457923e-06, 'epoch': 2.56}


 85%|████████▌ | 21740/25428 [4:32:53<38:25,  1.60it/s]

{'loss': 0.0589, 'grad_norm': 0.5207470059394836, 'learning_rate': 2.9007393424571337e-06, 'epoch': 2.56}


 86%|████████▌ | 21750/25428 [4:33:00<38:31,  1.59it/s]

{'loss': 0.0832, 'grad_norm': 1.2057602405548096, 'learning_rate': 2.892873997168476e-06, 'epoch': 2.57}


 86%|████████▌ | 21760/25428 [4:33:06<39:11,  1.56it/s]

{'loss': 0.0698, 'grad_norm': 1.0331085920333862, 'learning_rate': 2.8850086518798174e-06, 'epoch': 2.57}


 86%|████████▌ | 21770/25428 [4:33:12<38:41,  1.58it/s]

{'loss': 0.0997, 'grad_norm': 0.8861207962036133, 'learning_rate': 2.8771433065911597e-06, 'epoch': 2.57}


 86%|████████▌ | 21780/25428 [4:33:19<38:56,  1.56it/s]

{'loss': 0.0983, 'grad_norm': 0.828783392906189, 'learning_rate': 2.869277961302501e-06, 'epoch': 2.57}


 86%|████████▌ | 21790/25428 [4:33:25<38:15,  1.58it/s]

{'loss': 0.0523, 'grad_norm': 0.6279952526092529, 'learning_rate': 2.8614126160138433e-06, 'epoch': 2.57}


 86%|████████▌ | 21800/25428 [4:33:32<37:46,  1.60it/s]

{'loss': 0.0734, 'grad_norm': 0.9538367390632629, 'learning_rate': 2.8535472707251847e-06, 'epoch': 2.57}


 86%|████████▌ | 21810/25428 [4:33:38<38:43,  1.56it/s]

{'loss': 0.0753, 'grad_norm': 0.7433029413223267, 'learning_rate': 2.845681925436527e-06, 'epoch': 2.57}


 86%|████████▌ | 21820/25428 [4:33:44<37:42,  1.59it/s]

{'loss': 0.0907, 'grad_norm': 0.81417316198349, 'learning_rate': 2.8378165801478684e-06, 'epoch': 2.57}


 86%|████████▌ | 21830/25428 [4:33:51<38:01,  1.58it/s]

{'loss': 0.0769, 'grad_norm': 1.2097692489624023, 'learning_rate': 2.8299512348592107e-06, 'epoch': 2.58}


 86%|████████▌ | 21840/25428 [4:33:57<37:39,  1.59it/s]

{'loss': 0.0957, 'grad_norm': 0.6358219981193542, 'learning_rate': 2.822085889570552e-06, 'epoch': 2.58}


 86%|████████▌ | 21850/25428 [4:34:03<38:45,  1.54it/s]

{'loss': 0.1076, 'grad_norm': 0.900678813457489, 'learning_rate': 2.8142205442818943e-06, 'epoch': 2.58}


 86%|████████▌ | 21860/25428 [4:34:10<37:04,  1.60it/s]

{'loss': 0.0662, 'grad_norm': 0.9940550327301025, 'learning_rate': 2.8063551989932357e-06, 'epoch': 2.58}


 86%|████████▌ | 21870/25428 [4:34:16<37:52,  1.57it/s]

{'loss': 0.0662, 'grad_norm': 0.7944991588592529, 'learning_rate': 2.798489853704578e-06, 'epoch': 2.58}


 86%|████████▌ | 21880/25428 [4:34:22<37:30,  1.58it/s]

{'loss': 0.0684, 'grad_norm': 0.7977409958839417, 'learning_rate': 2.7906245084159194e-06, 'epoch': 2.58}


 86%|████████▌ | 21890/25428 [4:34:29<37:08,  1.59it/s]

{'loss': 0.0835, 'grad_norm': 3.4261605739593506, 'learning_rate': 2.7827591631272617e-06, 'epoch': 2.58}


 86%|████████▌ | 21900/25428 [4:34:35<37:14,  1.58it/s]

{'loss': 0.0612, 'grad_norm': 0.7260464429855347, 'learning_rate': 2.774893817838603e-06, 'epoch': 2.58}


 86%|████████▌ | 21910/25428 [4:34:41<37:07,  1.58it/s]

{'loss': 0.0786, 'grad_norm': 0.7838212847709656, 'learning_rate': 2.7670284725499453e-06, 'epoch': 2.58}


 86%|████████▌ | 21920/25428 [4:34:48<36:36,  1.60it/s]

{'loss': 0.105, 'grad_norm': 1.9205420017242432, 'learning_rate': 2.7591631272612867e-06, 'epoch': 2.59}


 86%|████████▌ | 21930/25428 [4:34:54<37:23,  1.56it/s]

{'loss': 0.0794, 'grad_norm': 1.1089128255844116, 'learning_rate': 2.751297781972629e-06, 'epoch': 2.59}


 86%|████████▋ | 21940/25428 [4:35:00<36:41,  1.58it/s]

{'loss': 0.0642, 'grad_norm': 0.7890344262123108, 'learning_rate': 2.7434324366839704e-06, 'epoch': 2.59}


 86%|████████▋ | 21950/25428 [4:35:07<36:55,  1.57it/s]

{'loss': 0.0832, 'grad_norm': 0.6880554556846619, 'learning_rate': 2.7355670913953127e-06, 'epoch': 2.59}


 86%|████████▋ | 21960/25428 [4:35:13<36:25,  1.59it/s]

{'loss': 0.1435, 'grad_norm': 0.904633104801178, 'learning_rate': 2.727701746106654e-06, 'epoch': 2.59}


 86%|████████▋ | 21970/25428 [4:35:19<36:06,  1.60it/s]

{'loss': 0.1097, 'grad_norm': 0.8593153357505798, 'learning_rate': 2.7198364008179963e-06, 'epoch': 2.59}


 86%|████████▋ | 21980/25428 [4:35:26<36:29,  1.57it/s]

{'loss': 0.0713, 'grad_norm': 0.8000705242156982, 'learning_rate': 2.7119710555293377e-06, 'epoch': 2.59}


 86%|████████▋ | 21990/25428 [4:35:32<36:54,  1.55it/s]

{'loss': 0.0857, 'grad_norm': 1.1733181476593018, 'learning_rate': 2.70410571024068e-06, 'epoch': 2.59}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1064, 'grad_norm': 1.8035340309143066, 'learning_rate': 2.6962403649520214e-06, 'epoch': 2.6}


 87%|████████▋ | 22010/25428 [4:35:48<38:10,  1.49it/s]  

{'loss': 0.0965, 'grad_norm': 0.760516345500946, 'learning_rate': 2.6883750196633632e-06, 'epoch': 2.6}


 87%|████████▋ | 22020/25428 [4:35:54<36:21,  1.56it/s]

{'loss': 0.0628, 'grad_norm': 0.7921002507209778, 'learning_rate': 2.680509674374705e-06, 'epoch': 2.6}


 87%|████████▋ | 22030/25428 [4:36:00<35:59,  1.57it/s]

{'loss': 0.0973, 'grad_norm': 1.3927499055862427, 'learning_rate': 2.672644329086047e-06, 'epoch': 2.6}


 87%|████████▋ | 22040/25428 [4:36:07<35:58,  1.57it/s]

{'loss': 0.0769, 'grad_norm': 0.891463041305542, 'learning_rate': 2.664778983797389e-06, 'epoch': 2.6}


 87%|████████▋ | 22050/25428 [4:36:13<35:27,  1.59it/s]

{'loss': 0.0741, 'grad_norm': 0.727695882320404, 'learning_rate': 2.6569136385087306e-06, 'epoch': 2.6}


 87%|████████▋ | 22060/25428 [4:36:19<35:35,  1.58it/s]

{'loss': 0.0874, 'grad_norm': 0.6671629548072815, 'learning_rate': 2.649048293220073e-06, 'epoch': 2.6}


 87%|████████▋ | 22070/25428 [4:36:26<35:15,  1.59it/s]

{'loss': 0.0653, 'grad_norm': 0.7607951164245605, 'learning_rate': 2.6411829479314143e-06, 'epoch': 2.6}


 87%|████████▋ | 22080/25428 [4:36:32<35:10,  1.59it/s]

{'loss': 0.0796, 'grad_norm': 1.2505329847335815, 'learning_rate': 2.6333176026427565e-06, 'epoch': 2.61}


 87%|████████▋ | 22090/25428 [4:36:38<35:33,  1.56it/s]

{'loss': 0.0939, 'grad_norm': 0.8289413452148438, 'learning_rate': 2.625452257354098e-06, 'epoch': 2.61}


 87%|████████▋ | 22100/25428 [4:36:45<35:17,  1.57it/s]

{'loss': 0.0906, 'grad_norm': 0.8990587592124939, 'learning_rate': 2.61758691206544e-06, 'epoch': 2.61}


 87%|████████▋ | 22110/25428 [4:36:51<34:24,  1.61it/s]

{'loss': 0.0664, 'grad_norm': 1.0039055347442627, 'learning_rate': 2.6097215667767816e-06, 'epoch': 2.61}


 87%|████████▋ | 22120/25428 [4:36:57<34:38,  1.59it/s]

{'loss': 0.1138, 'grad_norm': 1.139807939529419, 'learning_rate': 2.601856221488124e-06, 'epoch': 2.61}


 87%|████████▋ | 22130/25428 [4:37:04<35:08,  1.56it/s]

{'loss': 0.0831, 'grad_norm': 0.7795413732528687, 'learning_rate': 2.5939908761994653e-06, 'epoch': 2.61}


 87%|████████▋ | 22140/25428 [4:37:10<34:19,  1.60it/s]

{'loss': 0.0689, 'grad_norm': 0.7878115177154541, 'learning_rate': 2.5861255309108075e-06, 'epoch': 2.61}


 87%|████████▋ | 22150/25428 [4:37:16<34:34,  1.58it/s]

{'loss': 0.0945, 'grad_norm': 0.8454416394233704, 'learning_rate': 2.578260185622149e-06, 'epoch': 2.61}


 87%|████████▋ | 22160/25428 [4:37:23<34:11,  1.59it/s]

{'loss': 0.0691, 'grad_norm': 0.8691405057907104, 'learning_rate': 2.570394840333491e-06, 'epoch': 2.61}


 87%|████████▋ | 22170/25428 [4:37:29<34:57,  1.55it/s]

{'loss': 0.0847, 'grad_norm': 1.6844147443771362, 'learning_rate': 2.5625294950448326e-06, 'epoch': 2.62}


 87%|████████▋ | 22180/25428 [4:37:36<34:30,  1.57it/s]

{'loss': 0.0862, 'grad_norm': 1.4306440353393555, 'learning_rate': 2.554664149756175e-06, 'epoch': 2.62}


 87%|████████▋ | 22190/25428 [4:37:42<34:37,  1.56it/s]

{'loss': 0.08, 'grad_norm': 0.8830851912498474, 'learning_rate': 2.5467988044675163e-06, 'epoch': 2.62}


 87%|████████▋ | 22200/25428 [4:37:48<33:52,  1.59it/s]

{'loss': 0.0742, 'grad_norm': 0.5885899662971497, 'learning_rate': 2.5389334591788585e-06, 'epoch': 2.62}


 87%|████████▋ | 22210/25428 [4:37:55<33:44,  1.59it/s]

{'loss': 0.0644, 'grad_norm': 1.506438970565796, 'learning_rate': 2.5310681138902e-06, 'epoch': 2.62}


 87%|████████▋ | 22220/25428 [4:38:01<33:31,  1.59it/s]

{'loss': 0.0893, 'grad_norm': 0.7136847972869873, 'learning_rate': 2.523202768601542e-06, 'epoch': 2.62}


 87%|████████▋ | 22230/25428 [4:38:07<33:22,  1.60it/s]

{'loss': 0.0711, 'grad_norm': 0.7800229787826538, 'learning_rate': 2.5153374233128836e-06, 'epoch': 2.62}


 87%|████████▋ | 22240/25428 [4:38:14<34:15,  1.55it/s]

{'loss': 0.1185, 'grad_norm': 1.2998499870300293, 'learning_rate': 2.507472078024226e-06, 'epoch': 2.62}


 88%|████████▊ | 22250/25428 [4:38:20<34:11,  1.55it/s]

{'loss': 0.0888, 'grad_norm': 2.1940670013427734, 'learning_rate': 2.4996067327355673e-06, 'epoch': 2.63}


 88%|████████▊ | 22260/25428 [4:38:26<33:43,  1.57it/s]

{'loss': 0.0786, 'grad_norm': 0.6986036896705627, 'learning_rate': 2.491741387446909e-06, 'epoch': 2.63}


 88%|████████▊ | 22270/25428 [4:38:33<33:08,  1.59it/s]

{'loss': 0.0566, 'grad_norm': 1.3748019933700562, 'learning_rate': 2.483876042158251e-06, 'epoch': 2.63}


 88%|████████▊ | 22280/25428 [4:38:39<34:13,  1.53it/s]

{'loss': 0.0833, 'grad_norm': 0.9159625172615051, 'learning_rate': 2.4760106968695928e-06, 'epoch': 2.63}


 88%|████████▊ | 22290/25428 [4:38:45<33:04,  1.58it/s]

{'loss': 0.0782, 'grad_norm': 1.0866085290908813, 'learning_rate': 2.4681453515809346e-06, 'epoch': 2.63}


 88%|████████▊ | 22300/25428 [4:38:52<32:56,  1.58it/s]

{'loss': 0.0755, 'grad_norm': 1.2751883268356323, 'learning_rate': 2.4602800062922764e-06, 'epoch': 2.63}


 88%|████████▊ | 22310/25428 [4:38:58<32:46,  1.59it/s]

{'loss': 0.0645, 'grad_norm': 1.1795971393585205, 'learning_rate': 2.4524146610036183e-06, 'epoch': 2.63}


 88%|████████▊ | 22320/25428 [4:39:04<33:06,  1.56it/s]

{'loss': 0.0718, 'grad_norm': 0.8396190404891968, 'learning_rate': 2.44454931571496e-06, 'epoch': 2.63}


 88%|████████▊ | 22330/25428 [4:39:11<33:12,  1.56it/s]

{'loss': 0.0929, 'grad_norm': 1.6833027601242065, 'learning_rate': 2.436683970426302e-06, 'epoch': 2.63}


 88%|████████▊ | 22340/25428 [4:39:17<33:01,  1.56it/s]

{'loss': 0.0766, 'grad_norm': 0.5773573517799377, 'learning_rate': 2.4288186251376438e-06, 'epoch': 2.64}


 88%|████████▊ | 22350/25428 [4:39:24<31:52,  1.61it/s]

{'loss': 0.0961, 'grad_norm': 1.3877233266830444, 'learning_rate': 2.4209532798489856e-06, 'epoch': 2.64}


 88%|████████▊ | 22360/25428 [4:39:30<32:08,  1.59it/s]

{'loss': 0.0597, 'grad_norm': 0.7290491461753845, 'learning_rate': 2.4130879345603274e-06, 'epoch': 2.64}


 88%|████████▊ | 22370/25428 [4:39:36<32:39,  1.56it/s]

{'loss': 0.092, 'grad_norm': 0.7281069159507751, 'learning_rate': 2.4052225892716693e-06, 'epoch': 2.64}


 88%|████████▊ | 22380/25428 [4:39:42<31:35,  1.61it/s]

{'loss': 0.0931, 'grad_norm': 0.6785162687301636, 'learning_rate': 2.397357243983011e-06, 'epoch': 2.64}


 88%|████████▊ | 22390/25428 [4:39:49<31:28,  1.61it/s]

{'loss': 0.0702, 'grad_norm': 0.7577196359634399, 'learning_rate': 2.389491898694353e-06, 'epoch': 2.64}


 88%|████████▊ | 22400/25428 [4:39:55<32:31,  1.55it/s]

{'loss': 0.0872, 'grad_norm': 0.6153673529624939, 'learning_rate': 2.3816265534056948e-06, 'epoch': 2.64}


 88%|████████▊ | 22410/25428 [4:40:01<32:39,  1.54it/s]

{'loss': 0.0795, 'grad_norm': 0.6679569482803345, 'learning_rate': 2.3737612081170366e-06, 'epoch': 2.64}


 88%|████████▊ | 22420/25428 [4:40:08<32:16,  1.55it/s]

{'loss': 0.1062, 'grad_norm': 0.9523385167121887, 'learning_rate': 2.3658958628283784e-06, 'epoch': 2.65}


 88%|████████▊ | 22430/25428 [4:40:14<32:20,  1.55it/s]

{'loss': 0.1048, 'grad_norm': 1.2965492010116577, 'learning_rate': 2.3580305175397203e-06, 'epoch': 2.65}


 88%|████████▊ | 22440/25428 [4:40:21<31:29,  1.58it/s]

{'loss': 0.0848, 'grad_norm': 0.692241370677948, 'learning_rate': 2.350165172251062e-06, 'epoch': 2.65}


 88%|████████▊ | 22450/25428 [4:40:27<31:06,  1.60it/s]

{'loss': 0.068, 'grad_norm': 1.7577675580978394, 'learning_rate': 2.342299826962404e-06, 'epoch': 2.65}


 88%|████████▊ | 22460/25428 [4:40:33<30:42,  1.61it/s]

{'loss': 0.0793, 'grad_norm': 0.9740359783172607, 'learning_rate': 2.3344344816737458e-06, 'epoch': 2.65}


 88%|████████▊ | 22470/25428 [4:40:40<31:13,  1.58it/s]

{'loss': 0.0985, 'grad_norm': 1.0084034204483032, 'learning_rate': 2.3265691363850876e-06, 'epoch': 2.65}


 88%|████████▊ | 22480/25428 [4:40:46<31:02,  1.58it/s]

{'loss': 0.089, 'grad_norm': 1.6428961753845215, 'learning_rate': 2.3187037910964294e-06, 'epoch': 2.65}


 88%|████████▊ | 22490/25428 [4:40:52<31:05,  1.57it/s]

{'loss': 0.0684, 'grad_norm': 0.9386587738990784, 'learning_rate': 2.3108384458077713e-06, 'epoch': 2.65}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0627, 'grad_norm': 1.3477511405944824, 'learning_rate': 2.302973100519113e-06, 'epoch': 2.65}


 89%|████████▊ | 22510/25428 [4:41:08<32:13,  1.51it/s]  

{'loss': 0.073, 'grad_norm': 1.1562520265579224, 'learning_rate': 2.295107755230455e-06, 'epoch': 2.66}


 89%|████████▊ | 22520/25428 [4:41:15<32:13,  1.50it/s]

{'loss': 0.0545, 'grad_norm': 0.6043144464492798, 'learning_rate': 2.2872424099417968e-06, 'epoch': 2.66}


 89%|████████▊ | 22530/25428 [4:41:21<30:25,  1.59it/s]

{'loss': 0.0903, 'grad_norm': 1.5816923379898071, 'learning_rate': 2.2793770646531386e-06, 'epoch': 2.66}


 89%|████████▊ | 22540/25428 [4:41:27<30:05,  1.60it/s]

{'loss': 0.1064, 'grad_norm': 1.8459036350250244, 'learning_rate': 2.2715117193644804e-06, 'epoch': 2.66}


 89%|████████▊ | 22550/25428 [4:41:34<30:47,  1.56it/s]

{'loss': 0.094, 'grad_norm': 0.7002564668655396, 'learning_rate': 2.2636463740758223e-06, 'epoch': 2.66}


 89%|████████▊ | 22560/25428 [4:41:40<30:15,  1.58it/s]

{'loss': 0.0693, 'grad_norm': 0.74420565366745, 'learning_rate': 2.255781028787164e-06, 'epoch': 2.66}


 89%|████████▉ | 22570/25428 [4:41:46<29:59,  1.59it/s]

{'loss': 0.063, 'grad_norm': 0.8216092586517334, 'learning_rate': 2.247915683498506e-06, 'epoch': 2.66}


 89%|████████▉ | 22580/25428 [4:41:53<30:20,  1.56it/s]

{'loss': 0.0753, 'grad_norm': 0.8653694987297058, 'learning_rate': 2.2400503382098478e-06, 'epoch': 2.66}


 89%|████████▉ | 22590/25428 [4:41:59<29:43,  1.59it/s]

{'loss': 0.0649, 'grad_norm': 0.8576675057411194, 'learning_rate': 2.2321849929211896e-06, 'epoch': 2.67}


 89%|████████▉ | 22600/25428 [4:42:05<29:21,  1.61it/s]

{'loss': 0.0777, 'grad_norm': 4.588893890380859, 'learning_rate': 2.2243196476325314e-06, 'epoch': 2.67}


 89%|████████▉ | 22610/25428 [4:42:11<30:03,  1.56it/s]

{'loss': 0.081, 'grad_norm': 0.857761800289154, 'learning_rate': 2.2164543023438733e-06, 'epoch': 2.67}


 89%|████████▉ | 22620/25428 [4:42:18<29:26,  1.59it/s]

{'loss': 0.0935, 'grad_norm': 0.9006972908973694, 'learning_rate': 2.208588957055215e-06, 'epoch': 2.67}


 89%|████████▉ | 22630/25428 [4:42:24<29:47,  1.56it/s]

{'loss': 0.0918, 'grad_norm': 0.539900004863739, 'learning_rate': 2.200723611766557e-06, 'epoch': 2.67}


 89%|████████▉ | 22640/25428 [4:42:31<29:39,  1.57it/s]

{'loss': 0.0771, 'grad_norm': 0.752906084060669, 'learning_rate': 2.1928582664778988e-06, 'epoch': 2.67}


 89%|████████▉ | 22650/25428 [4:42:37<29:09,  1.59it/s]

{'loss': 0.063, 'grad_norm': 0.8819246888160706, 'learning_rate': 2.1849929211892406e-06, 'epoch': 2.67}


 89%|████████▉ | 22660/25428 [4:42:43<29:35,  1.56it/s]

{'loss': 0.0805, 'grad_norm': 0.9078982472419739, 'learning_rate': 2.177127575900582e-06, 'epoch': 2.67}


 89%|████████▉ | 22670/25428 [4:42:50<28:37,  1.61it/s]

{'loss': 0.0691, 'grad_norm': 0.7393731474876404, 'learning_rate': 2.169262230611924e-06, 'epoch': 2.67}


 89%|████████▉ | 22680/25428 [4:42:56<28:59,  1.58it/s]

{'loss': 0.0939, 'grad_norm': 0.9955736398696899, 'learning_rate': 2.1613968853232657e-06, 'epoch': 2.68}


 89%|████████▉ | 22690/25428 [4:43:02<28:49,  1.58it/s]

{'loss': 0.079, 'grad_norm': 0.8255254626274109, 'learning_rate': 2.1535315400346075e-06, 'epoch': 2.68}


 89%|████████▉ | 22700/25428 [4:43:09<28:39,  1.59it/s]

{'loss': 0.0829, 'grad_norm': 0.8243411183357239, 'learning_rate': 2.1456661947459493e-06, 'epoch': 2.68}


 89%|████████▉ | 22710/25428 [4:43:15<28:44,  1.58it/s]

{'loss': 0.0889, 'grad_norm': 0.7577177286148071, 'learning_rate': 2.137800849457291e-06, 'epoch': 2.68}


 89%|████████▉ | 22720/25428 [4:43:21<28:39,  1.57it/s]

{'loss': 0.0805, 'grad_norm': 0.7851265072822571, 'learning_rate': 2.129935504168633e-06, 'epoch': 2.68}


 89%|████████▉ | 22730/25428 [4:43:28<28:35,  1.57it/s]

{'loss': 0.0907, 'grad_norm': 1.0899040699005127, 'learning_rate': 2.122070158879975e-06, 'epoch': 2.68}


 89%|████████▉ | 22740/25428 [4:43:34<28:11,  1.59it/s]

{'loss': 0.1086, 'grad_norm': 1.2262605428695679, 'learning_rate': 2.1142048135913167e-06, 'epoch': 2.68}


 89%|████████▉ | 22750/25428 [4:43:40<28:29,  1.57it/s]

{'loss': 0.0623, 'grad_norm': 0.7460969686508179, 'learning_rate': 2.1063394683026585e-06, 'epoch': 2.68}


 90%|████████▉ | 22760/25428 [4:43:47<28:23,  1.57it/s]

{'loss': 0.1369, 'grad_norm': 2.3580946922302246, 'learning_rate': 2.0984741230140003e-06, 'epoch': 2.69}


 90%|████████▉ | 22770/25428 [4:43:53<28:06,  1.58it/s]

{'loss': 0.072, 'grad_norm': 0.8092399835586548, 'learning_rate': 2.090608777725342e-06, 'epoch': 2.69}


 90%|████████▉ | 22780/25428 [4:43:59<28:24,  1.55it/s]

{'loss': 0.0831, 'grad_norm': 0.7118833661079407, 'learning_rate': 2.082743432436684e-06, 'epoch': 2.69}


 90%|████████▉ | 22790/25428 [4:44:06<28:15,  1.56it/s]

{'loss': 0.095, 'grad_norm': 0.8314240574836731, 'learning_rate': 2.074878087148026e-06, 'epoch': 2.69}


 90%|████████▉ | 22800/25428 [4:44:12<27:44,  1.58it/s]

{'loss': 0.0749, 'grad_norm': 0.9791886210441589, 'learning_rate': 2.0670127418593677e-06, 'epoch': 2.69}


 90%|████████▉ | 22810/25428 [4:44:18<27:20,  1.60it/s]

{'loss': 0.1146, 'grad_norm': 0.8070168495178223, 'learning_rate': 2.0591473965707095e-06, 'epoch': 2.69}


 90%|████████▉ | 22820/25428 [4:44:25<27:43,  1.57it/s]

{'loss': 0.0736, 'grad_norm': 0.7263376712799072, 'learning_rate': 2.0512820512820513e-06, 'epoch': 2.69}


 90%|████████▉ | 22830/25428 [4:44:31<28:03,  1.54it/s]

{'loss': 0.1391, 'grad_norm': 1.0436716079711914, 'learning_rate': 2.043416705993393e-06, 'epoch': 2.69}


 90%|████████▉ | 22840/25428 [4:44:38<27:18,  1.58it/s]

{'loss': 0.085, 'grad_norm': 1.133891224861145, 'learning_rate': 2.035551360704735e-06, 'epoch': 2.69}


 90%|████████▉ | 22850/25428 [4:44:44<27:26,  1.57it/s]

{'loss': 0.0685, 'grad_norm': 0.7922617793083191, 'learning_rate': 2.027686015416077e-06, 'epoch': 2.7}


 90%|████████▉ | 22860/25428 [4:44:50<27:13,  1.57it/s]

{'loss': 0.0614, 'grad_norm': 0.6608406901359558, 'learning_rate': 2.0198206701274187e-06, 'epoch': 2.7}


 90%|████████▉ | 22870/25428 [4:44:57<27:01,  1.58it/s]

{'loss': 0.1016, 'grad_norm': 0.6508355736732483, 'learning_rate': 2.0119553248387605e-06, 'epoch': 2.7}


 90%|████████▉ | 22880/25428 [4:45:03<26:21,  1.61it/s]

{'loss': 0.0833, 'grad_norm': 1.004756212234497, 'learning_rate': 2.0040899795501023e-06, 'epoch': 2.7}


 90%|█████████ | 22890/25428 [4:45:09<26:12,  1.61it/s]

{'loss': 0.0944, 'grad_norm': 0.8066123723983765, 'learning_rate': 1.996224634261444e-06, 'epoch': 2.7}


 90%|█████████ | 22900/25428 [4:45:15<27:06,  1.55it/s]

{'loss': 0.0971, 'grad_norm': 1.2966786623001099, 'learning_rate': 1.988359288972786e-06, 'epoch': 2.7}


 90%|█████████ | 22910/25428 [4:45:22<26:33,  1.58it/s]

{'loss': 0.0722, 'grad_norm': 0.6894010305404663, 'learning_rate': 1.980493943684128e-06, 'epoch': 2.7}


 90%|█████████ | 22920/25428 [4:45:28<26:16,  1.59it/s]

{'loss': 0.0695, 'grad_norm': 0.9993313550949097, 'learning_rate': 1.9726285983954697e-06, 'epoch': 2.7}


 90%|█████████ | 22930/25428 [4:45:34<26:32,  1.57it/s]

{'loss': 0.0686, 'grad_norm': 0.611723005771637, 'learning_rate': 1.9647632531068115e-06, 'epoch': 2.71}


 90%|█████████ | 22940/25428 [4:45:41<26:24,  1.57it/s]

{'loss': 0.0775, 'grad_norm': 0.6811347007751465, 'learning_rate': 1.9568979078181533e-06, 'epoch': 2.71}


 90%|█████████ | 22950/25428 [4:45:47<26:23,  1.57it/s]

{'loss': 0.0814, 'grad_norm': 0.8979427218437195, 'learning_rate': 1.949032562529495e-06, 'epoch': 2.71}


 90%|█████████ | 22960/25428 [4:45:53<26:17,  1.56it/s]

{'loss': 0.0922, 'grad_norm': 1.0033643245697021, 'learning_rate': 1.941167217240837e-06, 'epoch': 2.71}


 90%|█████████ | 22970/25428 [4:46:00<25:56,  1.58it/s]

{'loss': 0.086, 'grad_norm': 0.9241247177124023, 'learning_rate': 1.933301871952179e-06, 'epoch': 2.71}


 90%|█████████ | 22980/25428 [4:46:06<26:03,  1.57it/s]

{'loss': 0.0751, 'grad_norm': 0.8702436089515686, 'learning_rate': 1.9254365266635207e-06, 'epoch': 2.71}


 90%|█████████ | 22990/25428 [4:46:12<25:15,  1.61it/s]

{'loss': 0.0654, 'grad_norm': 0.9508900046348572, 'learning_rate': 1.9175711813748625e-06, 'epoch': 2.71}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.1169, 'grad_norm': 0.7687578797340393, 'learning_rate': 1.9097058360862043e-06, 'epoch': 2.71}


 90%|█████████ | 23010/25428 [4:46:28<26:40,  1.51it/s]

{'loss': 0.0643, 'grad_norm': 0.8116378784179688, 'learning_rate': 1.9018404907975464e-06, 'epoch': 2.71}


 91%|█████████ | 23020/25428 [4:46:34<25:06,  1.60it/s]

{'loss': 0.0839, 'grad_norm': 2.300266742706299, 'learning_rate': 1.893975145508888e-06, 'epoch': 2.72}


 91%|█████████ | 23030/25428 [4:46:40<24:43,  1.62it/s]

{'loss': 0.1043, 'grad_norm': 0.8199872374534607, 'learning_rate': 1.8861098002202298e-06, 'epoch': 2.72}


 91%|█████████ | 23040/25428 [4:46:47<25:16,  1.57it/s]

{'loss': 0.0613, 'grad_norm': 0.7509812712669373, 'learning_rate': 1.8782444549315717e-06, 'epoch': 2.72}


 91%|█████████ | 23050/25428 [4:46:53<24:59,  1.59it/s]

{'loss': 0.0691, 'grad_norm': 0.8415840268135071, 'learning_rate': 1.8703791096429135e-06, 'epoch': 2.72}


 91%|█████████ | 23060/25428 [4:46:59<24:47,  1.59it/s]

{'loss': 0.0729, 'grad_norm': 0.7320478558540344, 'learning_rate': 1.8625137643542553e-06, 'epoch': 2.72}


 91%|█████████ | 23070/25428 [4:47:06<25:19,  1.55it/s]

{'loss': 0.0851, 'grad_norm': 0.6787845492362976, 'learning_rate': 1.8546484190655972e-06, 'epoch': 2.72}


 91%|█████████ | 23080/25428 [4:47:12<24:32,  1.59it/s]

{'loss': 0.0767, 'grad_norm': 0.7175055742263794, 'learning_rate': 1.846783073776939e-06, 'epoch': 2.72}


 91%|█████████ | 23090/25428 [4:47:18<24:30,  1.59it/s]

{'loss': 0.0708, 'grad_norm': 0.7540864944458008, 'learning_rate': 1.8389177284882808e-06, 'epoch': 2.72}


 91%|█████████ | 23100/25428 [4:47:25<24:23,  1.59it/s]

{'loss': 0.0646, 'grad_norm': 0.8189655542373657, 'learning_rate': 1.8310523831996227e-06, 'epoch': 2.73}


 91%|█████████ | 23110/25428 [4:47:31<24:58,  1.55it/s]

{'loss': 0.0645, 'grad_norm': 0.8220463991165161, 'learning_rate': 1.8231870379109645e-06, 'epoch': 2.73}


 91%|█████████ | 23120/25428 [4:47:37<24:31,  1.57it/s]

{'loss': 0.0965, 'grad_norm': 1.694669485092163, 'learning_rate': 1.8153216926223063e-06, 'epoch': 2.73}


 91%|█████████ | 23130/25428 [4:47:44<24:14,  1.58it/s]

{'loss': 0.0586, 'grad_norm': 0.5437029600143433, 'learning_rate': 1.8074563473336482e-06, 'epoch': 2.73}


 91%|█████████ | 23140/25428 [4:47:50<23:29,  1.62it/s]

{'loss': 0.0692, 'grad_norm': 0.8756787776947021, 'learning_rate': 1.79959100204499e-06, 'epoch': 2.73}


 91%|█████████ | 23150/25428 [4:47:56<23:46,  1.60it/s]

{'loss': 0.1005, 'grad_norm': 2.442466974258423, 'learning_rate': 1.7917256567563318e-06, 'epoch': 2.73}


 91%|█████████ | 23160/25428 [4:48:03<23:55,  1.58it/s]

{'loss': 0.0668, 'grad_norm': 1.038714051246643, 'learning_rate': 1.7838603114676737e-06, 'epoch': 2.73}


 91%|█████████ | 23170/25428 [4:48:09<23:54,  1.57it/s]

{'loss': 0.0819, 'grad_norm': 0.8794323801994324, 'learning_rate': 1.7759949661790155e-06, 'epoch': 2.73}


 91%|█████████ | 23180/25428 [4:48:15<23:53,  1.57it/s]

{'loss': 0.0593, 'grad_norm': 0.5849329233169556, 'learning_rate': 1.7681296208903573e-06, 'epoch': 2.73}


 91%|█████████ | 23190/25428 [4:48:22<23:25,  1.59it/s]

{'loss': 0.0585, 'grad_norm': 0.6886354684829712, 'learning_rate': 1.7602642756016992e-06, 'epoch': 2.74}


 91%|█████████ | 23200/25428 [4:48:28<23:16,  1.60it/s]

{'loss': 0.0692, 'grad_norm': 0.8580537438392639, 'learning_rate': 1.752398930313041e-06, 'epoch': 2.74}


 91%|█████████▏| 23210/25428 [4:48:34<23:09,  1.60it/s]

{'loss': 0.1291, 'grad_norm': 0.6935058236122131, 'learning_rate': 1.7445335850243828e-06, 'epoch': 2.74}


 91%|█████████▏| 23220/25428 [4:48:41<23:22,  1.57it/s]

{'loss': 0.0947, 'grad_norm': 1.1366169452667236, 'learning_rate': 1.7366682397357245e-06, 'epoch': 2.74}


 91%|█████████▏| 23230/25428 [4:48:47<23:10,  1.58it/s]

{'loss': 0.0623, 'grad_norm': 0.9895640015602112, 'learning_rate': 1.7288028944470663e-06, 'epoch': 2.74}


 91%|█████████▏| 23240/25428 [4:48:53<22:31,  1.62it/s]

{'loss': 0.0745, 'grad_norm': 1.4459551572799683, 'learning_rate': 1.7209375491584081e-06, 'epoch': 2.74}


 91%|█████████▏| 23250/25428 [4:48:59<22:33,  1.61it/s]

{'loss': 0.0796, 'grad_norm': 0.7141916751861572, 'learning_rate': 1.71307220386975e-06, 'epoch': 2.74}


 91%|█████████▏| 23260/25428 [4:49:06<22:27,  1.61it/s]

{'loss': 0.0915, 'grad_norm': 0.6781381964683533, 'learning_rate': 1.7052068585810918e-06, 'epoch': 2.74}


 92%|█████████▏| 23270/25428 [4:49:12<22:32,  1.60it/s]

{'loss': 0.0753, 'grad_norm': 0.8679581880569458, 'learning_rate': 1.6973415132924336e-06, 'epoch': 2.75}


 92%|█████████▏| 23280/25428 [4:49:18<23:03,  1.55it/s]

{'loss': 0.0781, 'grad_norm': 1.1184076070785522, 'learning_rate': 1.6894761680037755e-06, 'epoch': 2.75}


 92%|█████████▏| 23290/25428 [4:49:25<22:12,  1.60it/s]

{'loss': 0.0802, 'grad_norm': 1.0501734018325806, 'learning_rate': 1.6816108227151173e-06, 'epoch': 2.75}


 92%|█████████▏| 23300/25428 [4:49:31<22:08,  1.60it/s]

{'loss': 0.0559, 'grad_norm': 1.545819878578186, 'learning_rate': 1.6737454774264591e-06, 'epoch': 2.75}


 92%|█████████▏| 23310/25428 [4:49:37<22:05,  1.60it/s]

{'loss': 0.078, 'grad_norm': 1.644157886505127, 'learning_rate': 1.665880132137801e-06, 'epoch': 2.75}


 92%|█████████▏| 23320/25428 [4:49:44<22:09,  1.59it/s]

{'loss': 0.0588, 'grad_norm': 1.146276593208313, 'learning_rate': 1.6580147868491428e-06, 'epoch': 2.75}


 92%|█████████▏| 23330/25428 [4:49:50<22:13,  1.57it/s]

{'loss': 0.0887, 'grad_norm': 0.6250101327896118, 'learning_rate': 1.6501494415604846e-06, 'epoch': 2.75}


 92%|█████████▏| 23340/25428 [4:49:56<22:24,  1.55it/s]

{'loss': 0.082, 'grad_norm': 0.5802408456802368, 'learning_rate': 1.6422840962718265e-06, 'epoch': 2.75}


 92%|█████████▏| 23350/25428 [4:50:03<21:39,  1.60it/s]

{'loss': 0.0668, 'grad_norm': 0.6204439997673035, 'learning_rate': 1.6344187509831683e-06, 'epoch': 2.75}


 92%|█████████▏| 23360/25428 [4:50:09<21:31,  1.60it/s]

{'loss': 0.0541, 'grad_norm': 0.7175517678260803, 'learning_rate': 1.6265534056945101e-06, 'epoch': 2.76}


 92%|█████████▏| 23370/25428 [4:50:15<21:52,  1.57it/s]

{'loss': 0.0858, 'grad_norm': 1.1711314916610718, 'learning_rate': 1.618688060405852e-06, 'epoch': 2.76}


 92%|█████████▏| 23380/25428 [4:50:22<21:22,  1.60it/s]

{'loss': 0.1032, 'grad_norm': 1.2163106203079224, 'learning_rate': 1.6108227151171938e-06, 'epoch': 2.76}


 92%|█████████▏| 23390/25428 [4:50:28<21:00,  1.62it/s]

{'loss': 0.0925, 'grad_norm': 2.061039924621582, 'learning_rate': 1.6029573698285356e-06, 'epoch': 2.76}


 92%|█████████▏| 23400/25428 [4:50:34<21:12,  1.59it/s]

{'loss': 0.0792, 'grad_norm': 0.7772673964500427, 'learning_rate': 1.5950920245398775e-06, 'epoch': 2.76}


 92%|█████████▏| 23410/25428 [4:50:40<21:03,  1.60it/s]

{'loss': 0.0718, 'grad_norm': 0.6487688422203064, 'learning_rate': 1.5872266792512193e-06, 'epoch': 2.76}


 92%|█████████▏| 23420/25428 [4:50:47<21:19,  1.57it/s]

{'loss': 0.0748, 'grad_norm': 0.8992019295692444, 'learning_rate': 1.579361333962561e-06, 'epoch': 2.76}


 92%|█████████▏| 23430/25428 [4:50:53<20:45,  1.60it/s]

{'loss': 0.0643, 'grad_norm': 1.010709524154663, 'learning_rate': 1.5714959886739028e-06, 'epoch': 2.76}


 92%|█████████▏| 23440/25428 [4:50:59<20:47,  1.59it/s]

{'loss': 0.0777, 'grad_norm': 0.5363717675209045, 'learning_rate': 1.5636306433852446e-06, 'epoch': 2.77}


 92%|█████████▏| 23450/25428 [4:51:06<21:42,  1.52it/s]

{'loss': 0.0645, 'grad_norm': 1.1917047500610352, 'learning_rate': 1.5557652980965864e-06, 'epoch': 2.77}


 92%|█████████▏| 23460/25428 [4:51:12<20:42,  1.58it/s]

{'loss': 0.1015, 'grad_norm': 0.875900149345398, 'learning_rate': 1.5478999528079283e-06, 'epoch': 2.77}


 92%|█████████▏| 23470/25428 [4:51:18<20:29,  1.59it/s]

{'loss': 0.0736, 'grad_norm': 0.6467592716217041, 'learning_rate': 1.54003460751927e-06, 'epoch': 2.77}


 92%|█████████▏| 23480/25428 [4:51:25<20:29,  1.58it/s]

{'loss': 0.0594, 'grad_norm': 1.6106829643249512, 'learning_rate': 1.532169262230612e-06, 'epoch': 2.77}


 92%|█████████▏| 23490/25428 [4:51:31<20:31,  1.57it/s]

{'loss': 0.0744, 'grad_norm': 0.7486612200737, 'learning_rate': 1.5243039169419538e-06, 'epoch': 2.77}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0777, 'grad_norm': 0.9245768785476685, 'learning_rate': 1.5164385716532956e-06, 'epoch': 2.77}


 92%|█████████▏| 23510/25428 [4:51:47<21:29,  1.49it/s]

{'loss': 0.086, 'grad_norm': 0.980857789516449, 'learning_rate': 1.5085732263646374e-06, 'epoch': 2.77}


 92%|█████████▏| 23520/25428 [4:51:53<20:16,  1.57it/s]

{'loss': 0.0844, 'grad_norm': 0.6910010576248169, 'learning_rate': 1.5007078810759793e-06, 'epoch': 2.77}


 93%|█████████▎| 23530/25428 [4:51:59<19:41,  1.61it/s]

{'loss': 0.0799, 'grad_norm': 1.096592903137207, 'learning_rate': 1.492842535787321e-06, 'epoch': 2.78}


 93%|█████████▎| 23540/25428 [4:52:05<19:54,  1.58it/s]

{'loss': 0.0778, 'grad_norm': 1.042919635772705, 'learning_rate': 1.484977190498663e-06, 'epoch': 2.78}


 93%|█████████▎| 23550/25428 [4:52:12<19:38,  1.59it/s]

{'loss': 0.101, 'grad_norm': 1.1346687078475952, 'learning_rate': 1.4771118452100048e-06, 'epoch': 2.78}


 93%|█████████▎| 23560/25428 [4:52:18<19:35,  1.59it/s]

{'loss': 0.0691, 'grad_norm': 1.3801615238189697, 'learning_rate': 1.4692464999213466e-06, 'epoch': 2.78}


 93%|█████████▎| 23570/25428 [4:52:24<19:41,  1.57it/s]

{'loss': 0.0639, 'grad_norm': 0.8419768214225769, 'learning_rate': 1.4613811546326884e-06, 'epoch': 2.78}


 93%|█████████▎| 23580/25428 [4:52:31<19:40,  1.56it/s]

{'loss': 0.0887, 'grad_norm': 1.0430549383163452, 'learning_rate': 1.4535158093440303e-06, 'epoch': 2.78}


 93%|█████████▎| 23590/25428 [4:52:37<19:12,  1.59it/s]

{'loss': 0.0662, 'grad_norm': 0.9806357622146606, 'learning_rate': 1.445650464055372e-06, 'epoch': 2.78}


 93%|█████████▎| 23600/25428 [4:52:43<18:59,  1.60it/s]

{'loss': 0.0629, 'grad_norm': 0.881986677646637, 'learning_rate': 1.437785118766714e-06, 'epoch': 2.78}


 93%|█████████▎| 23610/25428 [4:52:50<19:15,  1.57it/s]

{'loss': 0.0706, 'grad_norm': 0.9868962168693542, 'learning_rate': 1.429919773478056e-06, 'epoch': 2.79}


 93%|█████████▎| 23620/25428 [4:52:56<18:53,  1.59it/s]

{'loss': 0.0823, 'grad_norm': 0.8809521794319153, 'learning_rate': 1.4220544281893978e-06, 'epoch': 2.79}


 93%|█████████▎| 23630/25428 [4:53:02<18:31,  1.62it/s]

{'loss': 0.1156, 'grad_norm': 1.079897403717041, 'learning_rate': 1.4141890829007396e-06, 'epoch': 2.79}


 93%|█████████▎| 23640/25428 [4:53:08<18:36,  1.60it/s]

{'loss': 0.1102, 'grad_norm': 0.662877082824707, 'learning_rate': 1.4063237376120815e-06, 'epoch': 2.79}


 93%|█████████▎| 23650/25428 [4:53:15<18:58,  1.56it/s]

{'loss': 0.0788, 'grad_norm': 0.7359985113143921, 'learning_rate': 1.3984583923234233e-06, 'epoch': 2.79}


 93%|█████████▎| 23660/25428 [4:53:21<18:23,  1.60it/s]

{'loss': 0.0796, 'grad_norm': 0.6735591888427734, 'learning_rate': 1.3905930470347651e-06, 'epoch': 2.79}


 93%|█████████▎| 23670/25428 [4:53:27<18:10,  1.61it/s]

{'loss': 0.0814, 'grad_norm': 0.6038164496421814, 'learning_rate': 1.382727701746107e-06, 'epoch': 2.79}


 93%|█████████▎| 23680/25428 [4:53:33<18:08,  1.61it/s]

{'loss': 0.083, 'grad_norm': 0.6303996443748474, 'learning_rate': 1.3748623564574486e-06, 'epoch': 2.79}


 93%|█████████▎| 23690/25428 [4:53:40<17:55,  1.62it/s]

{'loss': 0.0658, 'grad_norm': 0.9223996996879578, 'learning_rate': 1.3669970111687904e-06, 'epoch': 2.79}


 93%|█████████▎| 23700/25428 [4:53:46<17:58,  1.60it/s]

{'loss': 0.0773, 'grad_norm': 1.0395777225494385, 'learning_rate': 1.3591316658801323e-06, 'epoch': 2.8}


 93%|█████████▎| 23710/25428 [4:53:52<18:08,  1.58it/s]

{'loss': 0.0773, 'grad_norm': 0.928770124912262, 'learning_rate': 1.351266320591474e-06, 'epoch': 2.8}


 93%|█████████▎| 23720/25428 [4:53:58<17:29,  1.63it/s]

{'loss': 0.1101, 'grad_norm': 0.7879769802093506, 'learning_rate': 1.343400975302816e-06, 'epoch': 2.8}


 93%|█████████▎| 23730/25428 [4:54:05<17:42,  1.60it/s]

{'loss': 0.0744, 'grad_norm': 0.9000400304794312, 'learning_rate': 1.3355356300141578e-06, 'epoch': 2.8}


 93%|█████████▎| 23740/25428 [4:54:11<17:42,  1.59it/s]

{'loss': 0.0879, 'grad_norm': 0.5894361734390259, 'learning_rate': 1.3276702847254996e-06, 'epoch': 2.8}


 93%|█████████▎| 23750/25428 [4:54:17<17:41,  1.58it/s]

{'loss': 0.0953, 'grad_norm': 1.102495789527893, 'learning_rate': 1.3198049394368414e-06, 'epoch': 2.8}


 93%|█████████▎| 23760/25428 [4:54:23<17:24,  1.60it/s]

{'loss': 0.0696, 'grad_norm': 0.9225733280181885, 'learning_rate': 1.3119395941481833e-06, 'epoch': 2.8}


 93%|█████████▎| 23770/25428 [4:54:30<17:16,  1.60it/s]

{'loss': 0.0596, 'grad_norm': 0.6332449316978455, 'learning_rate': 1.304074248859525e-06, 'epoch': 2.8}


 94%|█████████▎| 23780/25428 [4:54:36<17:26,  1.57it/s]

{'loss': 0.0794, 'grad_norm': 0.9219376444816589, 'learning_rate': 1.296208903570867e-06, 'epoch': 2.81}


 94%|█████████▎| 23790/25428 [4:54:42<17:17,  1.58it/s]

{'loss': 0.066, 'grad_norm': 2.580294132232666, 'learning_rate': 1.2883435582822088e-06, 'epoch': 2.81}


 94%|█████████▎| 23800/25428 [4:54:49<16:44,  1.62it/s]

{'loss': 0.0829, 'grad_norm': 0.6545908451080322, 'learning_rate': 1.2804782129935506e-06, 'epoch': 2.81}


 94%|█████████▎| 23810/25428 [4:54:55<16:58,  1.59it/s]

{'loss': 0.0646, 'grad_norm': 1.816837191581726, 'learning_rate': 1.2726128677048924e-06, 'epoch': 2.81}


 94%|█████████▎| 23820/25428 [4:55:01<16:53,  1.59it/s]

{'loss': 0.0745, 'grad_norm': 0.5438745617866516, 'learning_rate': 1.2647475224162343e-06, 'epoch': 2.81}


 94%|█████████▎| 23830/25428 [4:55:08<16:49,  1.58it/s]

{'loss': 0.088, 'grad_norm': 1.8960636854171753, 'learning_rate': 1.256882177127576e-06, 'epoch': 2.81}


 94%|█████████▍| 23840/25428 [4:55:14<17:00,  1.56it/s]

{'loss': 0.1, 'grad_norm': 0.8513152003288269, 'learning_rate': 1.249016831838918e-06, 'epoch': 2.81}


 94%|█████████▍| 23850/25428 [4:55:20<16:42,  1.57it/s]

{'loss': 0.0745, 'grad_norm': 0.8356228470802307, 'learning_rate': 1.2411514865502598e-06, 'epoch': 2.81}


 94%|█████████▍| 23860/25428 [4:55:27<16:19,  1.60it/s]

{'loss': 0.0665, 'grad_norm': 0.9895920157432556, 'learning_rate': 1.2332861412616016e-06, 'epoch': 2.82}


 94%|█████████▍| 23870/25428 [4:55:33<16:26,  1.58it/s]

{'loss': 0.0726, 'grad_norm': 0.7702412009239197, 'learning_rate': 1.2254207959729434e-06, 'epoch': 2.82}


 94%|█████████▍| 23880/25428 [4:55:39<16:07,  1.60it/s]

{'loss': 0.0602, 'grad_norm': 0.8855778574943542, 'learning_rate': 1.217555450684285e-06, 'epoch': 2.82}


 94%|█████████▍| 23890/25428 [4:55:45<16:05,  1.59it/s]

{'loss': 0.1102, 'grad_norm': 0.8553612232208252, 'learning_rate': 1.2096901053956269e-06, 'epoch': 2.82}


 94%|█████████▍| 23900/25428 [4:55:52<16:09,  1.58it/s]

{'loss': 0.0737, 'grad_norm': 0.5972902178764343, 'learning_rate': 1.2018247601069687e-06, 'epoch': 2.82}


 94%|█████████▍| 23910/25428 [4:55:58<15:48,  1.60it/s]

{'loss': 0.1077, 'grad_norm': 1.0837202072143555, 'learning_rate': 1.1939594148183106e-06, 'epoch': 2.82}


 94%|█████████▍| 23920/25428 [4:56:04<15:44,  1.60it/s]

{'loss': 0.0679, 'grad_norm': 1.0479401350021362, 'learning_rate': 1.1860940695296524e-06, 'epoch': 2.82}


 94%|█████████▍| 23930/25428 [4:56:11<15:30,  1.61it/s]

{'loss': 0.071, 'grad_norm': 0.8147936463356018, 'learning_rate': 1.1782287242409942e-06, 'epoch': 2.82}


 94%|█████████▍| 23940/25428 [4:56:17<15:26,  1.61it/s]

{'loss': 0.0702, 'grad_norm': 0.56201171875, 'learning_rate': 1.170363378952336e-06, 'epoch': 2.82}


 94%|█████████▍| 23950/25428 [4:56:23<15:28,  1.59it/s]

{'loss': 0.1072, 'grad_norm': 0.7197719216346741, 'learning_rate': 1.162498033663678e-06, 'epoch': 2.83}


 94%|█████████▍| 23960/25428 [4:56:29<15:25,  1.59it/s]

{'loss': 0.0627, 'grad_norm': 0.9424118399620056, 'learning_rate': 1.1546326883750197e-06, 'epoch': 2.83}


 94%|█████████▍| 23970/25428 [4:56:36<15:17,  1.59it/s]

{'loss': 0.0856, 'grad_norm': 0.8058695793151855, 'learning_rate': 1.1467673430863616e-06, 'epoch': 2.83}


 94%|█████████▍| 23980/25428 [4:56:42<15:01,  1.61it/s]

{'loss': 0.0845, 'grad_norm': 0.9665006995201111, 'learning_rate': 1.1389019977977034e-06, 'epoch': 2.83}


 94%|█████████▍| 23990/25428 [4:56:48<15:20,  1.56it/s]

{'loss': 0.0927, 'grad_norm': 0.5423025488853455, 'learning_rate': 1.1310366525090452e-06, 'epoch': 2.83}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.103, 'grad_norm': 2.145231008529663, 'learning_rate': 1.123171307220387e-06, 'epoch': 2.83}


 94%|█████████▍| 24010/25428 [4:57:04<15:30,  1.52it/s]

{'loss': 0.074, 'grad_norm': 0.5604661703109741, 'learning_rate': 1.115305961931729e-06, 'epoch': 2.83}


 94%|█████████▍| 24020/25428 [4:57:10<14:59,  1.56it/s]

{'loss': 0.0752, 'grad_norm': 1.1565580368041992, 'learning_rate': 1.1074406166430707e-06, 'epoch': 2.83}


 95%|█████████▍| 24030/25428 [4:57:17<14:55,  1.56it/s]

{'loss': 0.0936, 'grad_norm': 0.6260870695114136, 'learning_rate': 1.0995752713544126e-06, 'epoch': 2.84}


 95%|█████████▍| 24040/25428 [4:57:23<14:34,  1.59it/s]

{'loss': 0.0714, 'grad_norm': 0.6098964810371399, 'learning_rate': 1.0917099260657544e-06, 'epoch': 2.84}


 95%|█████████▍| 24050/25428 [4:57:29<14:28,  1.59it/s]

{'loss': 0.0962, 'grad_norm': 0.8063956499099731, 'learning_rate': 1.0838445807770962e-06, 'epoch': 2.84}


 95%|█████████▍| 24060/25428 [4:57:35<14:15,  1.60it/s]

{'loss': 0.0631, 'grad_norm': 0.9385173320770264, 'learning_rate': 1.075979235488438e-06, 'epoch': 2.84}


 95%|█████████▍| 24070/25428 [4:57:42<14:20,  1.58it/s]

{'loss': 0.0677, 'grad_norm': 0.7314965128898621, 'learning_rate': 1.0681138901997797e-06, 'epoch': 2.84}


 95%|█████████▍| 24080/25428 [4:57:48<14:14,  1.58it/s]

{'loss': 0.0839, 'grad_norm': 1.3833433389663696, 'learning_rate': 1.0602485449111217e-06, 'epoch': 2.84}


 95%|█████████▍| 24090/25428 [4:57:54<14:08,  1.58it/s]

{'loss': 0.0673, 'grad_norm': 0.7024118304252625, 'learning_rate': 1.0523831996224636e-06, 'epoch': 2.84}


 95%|█████████▍| 24100/25428 [4:58:01<13:51,  1.60it/s]

{'loss': 0.0654, 'grad_norm': 1.3872963190078735, 'learning_rate': 1.0445178543338054e-06, 'epoch': 2.84}


 95%|█████████▍| 24110/25428 [4:58:07<13:45,  1.60it/s]

{'loss': 0.0893, 'grad_norm': 1.034064531326294, 'learning_rate': 1.0366525090451472e-06, 'epoch': 2.84}


 95%|█████████▍| 24120/25428 [4:58:13<14:03,  1.55it/s]

{'loss': 0.0734, 'grad_norm': 0.5506867170333862, 'learning_rate': 1.028787163756489e-06, 'epoch': 2.85}


 95%|█████████▍| 24130/25428 [4:58:20<13:38,  1.59it/s]

{'loss': 0.0726, 'grad_norm': 0.955490231513977, 'learning_rate': 1.020921818467831e-06, 'epoch': 2.85}


 95%|█████████▍| 24140/25428 [4:58:26<13:22,  1.60it/s]

{'loss': 0.1296, 'grad_norm': 0.9135907888412476, 'learning_rate': 1.0130564731791727e-06, 'epoch': 2.85}


 95%|█████████▍| 24150/25428 [4:58:32<13:25,  1.59it/s]

{'loss': 0.0853, 'grad_norm': 0.823899507522583, 'learning_rate': 1.0051911278905146e-06, 'epoch': 2.85}


 95%|█████████▌| 24160/25428 [4:58:38<13:27,  1.57it/s]

{'loss': 0.0882, 'grad_norm': 0.8102250695228577, 'learning_rate': 9.973257826018564e-07, 'epoch': 2.85}


 95%|█████████▌| 24170/25428 [4:58:45<13:06,  1.60it/s]

{'loss': 0.0838, 'grad_norm': 1.5586539506912231, 'learning_rate': 9.894604373131982e-07, 'epoch': 2.85}


 95%|█████████▌| 24180/25428 [4:58:51<13:06,  1.59it/s]

{'loss': 0.072, 'grad_norm': 0.6657664179801941, 'learning_rate': 9.8159509202454e-07, 'epoch': 2.85}


 95%|█████████▌| 24190/25428 [4:58:57<13:13,  1.56it/s]

{'loss': 0.0827, 'grad_norm': 0.882610559463501, 'learning_rate': 9.73729746735882e-07, 'epoch': 2.85}


 95%|█████████▌| 24200/25428 [4:59:04<13:13,  1.55it/s]

{'loss': 0.0863, 'grad_norm': 1.148146629333496, 'learning_rate': 9.658644014472235e-07, 'epoch': 2.86}


 95%|█████████▌| 24210/25428 [4:59:10<12:48,  1.58it/s]

{'loss': 0.0832, 'grad_norm': 0.9075663089752197, 'learning_rate': 9.579990561585654e-07, 'epoch': 2.86}


 95%|█████████▌| 24220/25428 [4:59:16<12:50,  1.57it/s]

{'loss': 0.0995, 'grad_norm': 0.7266477346420288, 'learning_rate': 9.501337108699073e-07, 'epoch': 2.86}


 95%|█████████▌| 24230/25428 [4:59:23<12:34,  1.59it/s]

{'loss': 0.0778, 'grad_norm': 1.032906413078308, 'learning_rate': 9.422683655812491e-07, 'epoch': 2.86}


 95%|█████████▌| 24240/25428 [4:59:29<12:18,  1.61it/s]

{'loss': 0.0672, 'grad_norm': 1.0338002443313599, 'learning_rate': 9.34403020292591e-07, 'epoch': 2.86}


 95%|█████████▌| 24250/25428 [4:59:35<12:03,  1.63it/s]

{'loss': 0.0777, 'grad_norm': 1.3320387601852417, 'learning_rate': 9.265376750039327e-07, 'epoch': 2.86}


 95%|█████████▌| 24260/25428 [4:59:41<11:59,  1.62it/s]

{'loss': 0.0729, 'grad_norm': 0.8598119616508484, 'learning_rate': 9.186723297152745e-07, 'epoch': 2.86}


 95%|█████████▌| 24270/25428 [4:59:48<11:58,  1.61it/s]

{'loss': 0.0582, 'grad_norm': 2.8896324634552, 'learning_rate': 9.108069844266164e-07, 'epoch': 2.86}


 95%|█████████▌| 24280/25428 [4:59:54<11:52,  1.61it/s]

{'loss': 0.0625, 'grad_norm': 6.42627477645874, 'learning_rate': 9.029416391379582e-07, 'epoch': 2.86}


 96%|█████████▌| 24290/25428 [5:00:00<11:51,  1.60it/s]

{'loss': 0.0565, 'grad_norm': 0.8749899864196777, 'learning_rate': 8.950762938493e-07, 'epoch': 2.87}


 96%|█████████▌| 24300/25428 [5:00:06<11:44,  1.60it/s]

{'loss': 0.0705, 'grad_norm': 0.83486008644104, 'learning_rate': 8.872109485606419e-07, 'epoch': 2.87}


 96%|█████████▌| 24310/25428 [5:00:13<11:56,  1.56it/s]

{'loss': 0.0843, 'grad_norm': 1.06315279006958, 'learning_rate': 8.793456032719837e-07, 'epoch': 2.87}


 96%|█████████▌| 24320/25428 [5:00:19<11:33,  1.60it/s]

{'loss': 0.0614, 'grad_norm': 0.719326376914978, 'learning_rate': 8.714802579833255e-07, 'epoch': 2.87}


 96%|█████████▌| 24330/25428 [5:00:25<11:30,  1.59it/s]

{'loss': 0.1035, 'grad_norm': 0.7824913263320923, 'learning_rate': 8.636149126946674e-07, 'epoch': 2.87}


 96%|█████████▌| 24340/25428 [5:00:31<11:17,  1.61it/s]

{'loss': 0.0565, 'grad_norm': 1.0424168109893799, 'learning_rate': 8.557495674060092e-07, 'epoch': 2.87}


 96%|█████████▌| 24350/25428 [5:00:38<11:06,  1.62it/s]

{'loss': 0.1037, 'grad_norm': 0.9606763124465942, 'learning_rate': 8.478842221173509e-07, 'epoch': 2.87}


 96%|█████████▌| 24360/25428 [5:00:44<11:09,  1.60it/s]

{'loss': 0.0997, 'grad_norm': 1.1534757614135742, 'learning_rate': 8.400188768286928e-07, 'epoch': 2.87}


 96%|█████████▌| 24370/25428 [5:00:50<10:59,  1.60it/s]

{'loss': 0.0773, 'grad_norm': 0.8522371649742126, 'learning_rate': 8.321535315400346e-07, 'epoch': 2.88}


 96%|█████████▌| 24380/25428 [5:00:57<11:05,  1.57it/s]

{'loss': 0.0683, 'grad_norm': 0.8699948787689209, 'learning_rate': 8.242881862513764e-07, 'epoch': 2.88}


 96%|█████████▌| 24390/25428 [5:01:03<10:55,  1.58it/s]

{'loss': 0.0626, 'grad_norm': 0.7600132822990417, 'learning_rate': 8.164228409627184e-07, 'epoch': 2.88}


 96%|█████████▌| 24400/25428 [5:01:09<10:47,  1.59it/s]

{'loss': 0.0644, 'grad_norm': 0.819754421710968, 'learning_rate': 8.085574956740602e-07, 'epoch': 2.88}


 96%|█████████▌| 24410/25428 [5:01:15<10:44,  1.58it/s]

{'loss': 0.0643, 'grad_norm': 0.5703526735305786, 'learning_rate': 8.00692150385402e-07, 'epoch': 2.88}


 96%|█████████▌| 24420/25428 [5:01:22<10:31,  1.60it/s]

{'loss': 0.0687, 'grad_norm': 0.7194814085960388, 'learning_rate': 7.928268050967439e-07, 'epoch': 2.88}


 96%|█████████▌| 24430/25428 [5:01:28<10:42,  1.55it/s]

{'loss': 0.0852, 'grad_norm': 0.7912011742591858, 'learning_rate': 7.849614598080857e-07, 'epoch': 2.88}


 96%|█████████▌| 24440/25428 [5:01:34<10:21,  1.59it/s]

{'loss': 0.0693, 'grad_norm': 0.8743535280227661, 'learning_rate': 7.770961145194275e-07, 'epoch': 2.88}


 96%|█████████▌| 24450/25428 [5:01:41<10:10,  1.60it/s]

{'loss': 0.098, 'grad_norm': 1.2804465293884277, 'learning_rate': 7.692307692307694e-07, 'epoch': 2.88}


 96%|█████████▌| 24460/25428 [5:01:47<10:00,  1.61it/s]

{'loss': 0.0846, 'grad_norm': 1.1633548736572266, 'learning_rate': 7.613654239421112e-07, 'epoch': 2.89}


 96%|█████████▌| 24470/25428 [5:01:53<09:58,  1.60it/s]

{'loss': 0.0889, 'grad_norm': 1.5857272148132324, 'learning_rate': 7.53500078653453e-07, 'epoch': 2.89}


 96%|█████████▋| 24480/25428 [5:01:59<09:55,  1.59it/s]

{'loss': 0.0984, 'grad_norm': 0.7677264213562012, 'learning_rate': 7.456347333647948e-07, 'epoch': 2.89}


 96%|█████████▋| 24490/25428 [5:02:06<09:36,  1.63it/s]

{'loss': 0.0773, 'grad_norm': 0.9914957880973816, 'learning_rate': 7.377693880761366e-07, 'epoch': 2.89}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0921, 'grad_norm': 0.8060165643692017, 'learning_rate': 7.299040427874784e-07, 'epoch': 2.89}


 96%|█████████▋| 24510/25428 [5:02:21<10:08,  1.51it/s]

{'loss': 0.0813, 'grad_norm': 1.610453486442566, 'learning_rate': 7.220386974988203e-07, 'epoch': 2.89}


 96%|█████████▋| 24520/25428 [5:02:27<09:35,  1.58it/s]

{'loss': 0.0614, 'grad_norm': 0.8305816650390625, 'learning_rate': 7.141733522101621e-07, 'epoch': 2.89}


 96%|█████████▋| 24530/25428 [5:02:34<09:32,  1.57it/s]

{'loss': 0.0804, 'grad_norm': 0.8458247780799866, 'learning_rate': 7.063080069215039e-07, 'epoch': 2.89}


 97%|█████████▋| 24540/25428 [5:02:40<09:14,  1.60it/s]

{'loss': 0.0724, 'grad_norm': 0.7932008504867554, 'learning_rate': 6.984426616328458e-07, 'epoch': 2.9}


 97%|█████████▋| 24550/25428 [5:02:46<09:12,  1.59it/s]

{'loss': 0.0747, 'grad_norm': 0.7754974961280823, 'learning_rate': 6.905773163441876e-07, 'epoch': 2.9}


 97%|█████████▋| 24560/25428 [5:02:52<08:56,  1.62it/s]

{'loss': 0.1064, 'grad_norm': 0.8578158617019653, 'learning_rate': 6.827119710555294e-07, 'epoch': 2.9}


 97%|█████████▋| 24570/25428 [5:02:59<08:55,  1.60it/s]

{'loss': 0.0779, 'grad_norm': 0.6732182502746582, 'learning_rate': 6.748466257668713e-07, 'epoch': 2.9}


 97%|█████████▋| 24580/25428 [5:03:05<08:49,  1.60it/s]

{'loss': 0.0809, 'grad_norm': 0.7869293689727783, 'learning_rate': 6.66981280478213e-07, 'epoch': 2.9}


 97%|█████████▋| 24590/25428 [5:03:11<08:41,  1.61it/s]

{'loss': 0.0655, 'grad_norm': 0.8630747199058533, 'learning_rate': 6.591159351895548e-07, 'epoch': 2.9}


 97%|█████████▋| 24600/25428 [5:03:18<08:55,  1.55it/s]

{'loss': 0.0781, 'grad_norm': 0.7365797162055969, 'learning_rate': 6.512505899008966e-07, 'epoch': 2.9}


 97%|█████████▋| 24610/25428 [5:03:24<08:36,  1.58it/s]

{'loss': 0.0692, 'grad_norm': 0.8946202397346497, 'learning_rate': 6.433852446122385e-07, 'epoch': 2.9}


 97%|█████████▋| 24620/25428 [5:03:30<08:22,  1.61it/s]

{'loss': 0.0887, 'grad_norm': 0.9414567947387695, 'learning_rate': 6.355198993235803e-07, 'epoch': 2.9}


 97%|█████████▋| 24630/25428 [5:03:37<08:25,  1.58it/s]

{'loss': 0.0534, 'grad_norm': 0.8098441362380981, 'learning_rate': 6.276545540349222e-07, 'epoch': 2.91}


 97%|█████████▋| 24640/25428 [5:03:43<08:15,  1.59it/s]

{'loss': 0.0651, 'grad_norm': 1.0686016082763672, 'learning_rate': 6.197892087462641e-07, 'epoch': 2.91}


 97%|█████████▋| 24650/25428 [5:03:49<08:06,  1.60it/s]

{'loss': 0.0765, 'grad_norm': 0.6882450580596924, 'learning_rate': 6.119238634576058e-07, 'epoch': 2.91}


 97%|█████████▋| 24660/25428 [5:03:56<08:13,  1.56it/s]

{'loss': 0.0766, 'grad_norm': 1.0910040140151978, 'learning_rate': 6.040585181689477e-07, 'epoch': 2.91}


 97%|█████████▋| 24670/25428 [5:04:02<07:52,  1.60it/s]

{'loss': 0.0727, 'grad_norm': 0.6669167876243591, 'learning_rate': 5.961931728802895e-07, 'epoch': 2.91}


 97%|█████████▋| 24680/25428 [5:04:08<07:52,  1.58it/s]

{'loss': 0.0659, 'grad_norm': 0.9976215362548828, 'learning_rate': 5.883278275916313e-07, 'epoch': 2.91}


 97%|█████████▋| 24690/25428 [5:04:14<07:48,  1.57it/s]

{'loss': 0.0743, 'grad_norm': 0.727220356464386, 'learning_rate': 5.804624823029732e-07, 'epoch': 2.91}


 97%|█████████▋| 24700/25428 [5:04:21<07:36,  1.60it/s]

{'loss': 0.0708, 'grad_norm': 0.660807728767395, 'learning_rate': 5.72597137014315e-07, 'epoch': 2.91}


 97%|█████████▋| 24710/25428 [5:04:27<07:24,  1.61it/s]

{'loss': 0.1071, 'grad_norm': 0.8070771098136902, 'learning_rate': 5.647317917256568e-07, 'epoch': 2.92}


 97%|█████████▋| 24720/25428 [5:04:33<07:27,  1.58it/s]

{'loss': 0.0748, 'grad_norm': 0.6187365651130676, 'learning_rate': 5.568664464369987e-07, 'epoch': 2.92}


 97%|█████████▋| 24730/25428 [5:04:39<07:17,  1.60it/s]

{'loss': 0.0621, 'grad_norm': 0.8195086717605591, 'learning_rate': 5.490011011483405e-07, 'epoch': 2.92}


 97%|█████████▋| 24740/25428 [5:04:46<07:08,  1.60it/s]

{'loss': 0.0942, 'grad_norm': 0.7801932096481323, 'learning_rate': 5.411357558596823e-07, 'epoch': 2.92}


 97%|█████████▋| 24750/25428 [5:04:52<07:08,  1.58it/s]

{'loss': 0.0676, 'grad_norm': 0.8578587174415588, 'learning_rate': 5.33270410571024e-07, 'epoch': 2.92}


 97%|█████████▋| 24760/25428 [5:04:58<07:07,  1.56it/s]

{'loss': 0.1044, 'grad_norm': 2.5165841579437256, 'learning_rate': 5.254050652823659e-07, 'epoch': 2.92}


 97%|█████████▋| 24770/25428 [5:05:05<06:59,  1.57it/s]

{'loss': 0.0666, 'grad_norm': 1.1317552328109741, 'learning_rate': 5.175397199937077e-07, 'epoch': 2.92}


 97%|█████████▋| 24780/25428 [5:05:11<06:45,  1.60it/s]

{'loss': 0.0802, 'grad_norm': 0.8118991851806641, 'learning_rate': 5.096743747050495e-07, 'epoch': 2.92}


 97%|█████████▋| 24790/25428 [5:05:17<06:40,  1.59it/s]

{'loss': 0.0653, 'grad_norm': 0.8744416236877441, 'learning_rate': 5.018090294163915e-07, 'epoch': 2.92}


 98%|█████████▊| 24800/25428 [5:05:23<06:29,  1.61it/s]

{'loss': 0.0892, 'grad_norm': 1.0723741054534912, 'learning_rate': 4.939436841277333e-07, 'epoch': 2.93}


 98%|█████████▊| 24810/25428 [5:05:30<06:29,  1.59it/s]

{'loss': 0.1174, 'grad_norm': 0.8139235973358154, 'learning_rate': 4.86078338839075e-07, 'epoch': 2.93}


 98%|█████████▊| 24820/25428 [5:05:36<06:18,  1.61it/s]

{'loss': 0.0631, 'grad_norm': 0.6279920339584351, 'learning_rate': 4.782129935504169e-07, 'epoch': 2.93}


 98%|█████████▊| 24830/25428 [5:05:42<06:15,  1.59it/s]

{'loss': 0.0817, 'grad_norm': 1.1660466194152832, 'learning_rate': 4.7034764826175877e-07, 'epoch': 2.93}


 98%|█████████▊| 24840/25428 [5:05:48<06:05,  1.61it/s]

{'loss': 0.0619, 'grad_norm': 0.931013822555542, 'learning_rate': 4.6248230297310055e-07, 'epoch': 2.93}


 98%|█████████▊| 24850/25428 [5:05:55<06:00,  1.61it/s]

{'loss': 0.0649, 'grad_norm': 1.1565284729003906, 'learning_rate': 4.546169576844424e-07, 'epoch': 2.93}


 98%|█████████▊| 24860/25428 [5:06:01<05:58,  1.59it/s]

{'loss': 0.0619, 'grad_norm': 1.0909032821655273, 'learning_rate': 4.467516123957842e-07, 'epoch': 2.93}


 98%|█████████▊| 24870/25428 [5:06:07<05:48,  1.60it/s]

{'loss': 0.0832, 'grad_norm': 1.0096827745437622, 'learning_rate': 4.3888626710712605e-07, 'epoch': 2.93}


 98%|█████████▊| 24880/25428 [5:06:14<05:43,  1.59it/s]

{'loss': 0.087, 'grad_norm': 0.8189225792884827, 'learning_rate': 4.310209218184679e-07, 'epoch': 2.94}


 98%|█████████▊| 24890/25428 [5:06:20<05:36,  1.60it/s]

{'loss': 0.0962, 'grad_norm': 1.1415560245513916, 'learning_rate': 4.2315557652980966e-07, 'epoch': 2.94}


 98%|█████████▊| 24900/25428 [5:06:26<05:27,  1.61it/s]

{'loss': 0.0649, 'grad_norm': 0.631587564945221, 'learning_rate': 4.152902312411515e-07, 'epoch': 2.94}


 98%|█████████▊| 24910/25428 [5:06:32<05:26,  1.59it/s]

{'loss': 0.0765, 'grad_norm': 1.1311261653900146, 'learning_rate': 4.0742488595249333e-07, 'epoch': 2.94}


 98%|█████████▊| 24920/25428 [5:06:39<05:22,  1.58it/s]

{'loss': 0.0545, 'grad_norm': 0.5858814120292664, 'learning_rate': 3.9955954066383516e-07, 'epoch': 2.94}


 98%|█████████▊| 24930/25428 [5:06:45<05:08,  1.61it/s]

{'loss': 0.074, 'grad_norm': 0.9516298174858093, 'learning_rate': 3.91694195375177e-07, 'epoch': 2.94}


 98%|█████████▊| 24940/25428 [5:06:51<05:04,  1.60it/s]

{'loss': 0.0713, 'grad_norm': 1.0382221937179565, 'learning_rate': 3.838288500865188e-07, 'epoch': 2.94}


 98%|█████████▊| 24950/25428 [5:06:57<04:55,  1.62it/s]

{'loss': 0.1232, 'grad_norm': 1.8034499883651733, 'learning_rate': 3.7596350479786066e-07, 'epoch': 2.94}


 98%|█████████▊| 24960/25428 [5:07:04<04:54,  1.59it/s]

{'loss': 0.0823, 'grad_norm': 1.1260881423950195, 'learning_rate': 3.680981595092025e-07, 'epoch': 2.94}


 98%|█████████▊| 24970/25428 [5:07:10<04:44,  1.61it/s]

{'loss': 0.1009, 'grad_norm': 0.8822631239891052, 'learning_rate': 3.6023281422054433e-07, 'epoch': 2.95}


 98%|█████████▊| 24980/25428 [5:07:16<04:41,  1.59it/s]

{'loss': 0.0808, 'grad_norm': 2.0067081451416016, 'learning_rate': 3.5236746893188617e-07, 'epoch': 2.95}


 98%|█████████▊| 24990/25428 [5:07:22<04:34,  1.59it/s]

{'loss': 0.0821, 'grad_norm': 0.6420237421989441, 'learning_rate': 3.44502123643228e-07, 'epoch': 2.95}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'loss': 0.0638, 'grad_norm': 1.3747484683990479, 'learning_rate': 3.366367783545698e-07, 'epoch': 2.95}


 98%|█████████▊| 25010/25428 [5:07:38<04:38,  1.50it/s]

{'loss': 0.0979, 'grad_norm': 0.8473395109176636, 'learning_rate': 3.287714330659116e-07, 'epoch': 2.95}


 98%|█████████▊| 25020/25428 [5:07:44<04:13,  1.61it/s]

{'loss': 0.0928, 'grad_norm': 0.85488361120224, 'learning_rate': 3.2090608777725345e-07, 'epoch': 2.95}


 98%|█████████▊| 25030/25428 [5:07:50<04:05,  1.62it/s]

{'loss': 0.0896, 'grad_norm': 0.8254507780075073, 'learning_rate': 3.130407424885953e-07, 'epoch': 2.95}


 98%|█████████▊| 25040/25428 [5:07:56<04:01,  1.61it/s]

{'loss': 0.0589, 'grad_norm': 0.7665356397628784, 'learning_rate': 3.051753971999371e-07, 'epoch': 2.95}


 99%|█████████▊| 25050/25428 [5:08:02<03:57,  1.59it/s]

{'loss': 0.0951, 'grad_norm': 1.1871274709701538, 'learning_rate': 2.973100519112789e-07, 'epoch': 2.96}


 99%|█████████▊| 25060/25428 [5:08:09<03:54,  1.57it/s]

{'loss': 0.0723, 'grad_norm': 0.7302382588386536, 'learning_rate': 2.894447066226208e-07, 'epoch': 2.96}


 99%|█████████▊| 25070/25428 [5:08:15<03:50,  1.55it/s]

{'loss': 0.0728, 'grad_norm': 1.0439231395721436, 'learning_rate': 2.815793613339626e-07, 'epoch': 2.96}


 99%|█████████▊| 25080/25428 [5:08:22<03:38,  1.59it/s]

{'loss': 0.1039, 'grad_norm': 1.256287932395935, 'learning_rate': 2.737140160453044e-07, 'epoch': 2.96}


 99%|█████████▊| 25090/25428 [5:08:28<03:31,  1.60it/s]

{'loss': 0.077, 'grad_norm': 0.6862112283706665, 'learning_rate': 2.6584867075664623e-07, 'epoch': 2.96}


 99%|█████████▊| 25100/25428 [5:08:34<03:24,  1.60it/s]

{'loss': 0.0725, 'grad_norm': 1.1643043756484985, 'learning_rate': 2.5798332546798806e-07, 'epoch': 2.96}


 99%|█████████▊| 25110/25428 [5:08:40<03:16,  1.62it/s]

{'loss': 0.0892, 'grad_norm': 1.031731128692627, 'learning_rate': 2.501179801793299e-07, 'epoch': 2.96}


 99%|█████████▉| 25120/25428 [5:08:46<03:11,  1.61it/s]

{'loss': 0.0888, 'grad_norm': 1.2539738416671753, 'learning_rate': 2.4225263489067173e-07, 'epoch': 2.96}


 99%|█████████▉| 25130/25428 [5:08:53<03:08,  1.58it/s]

{'loss': 0.1234, 'grad_norm': 4.431975841522217, 'learning_rate': 2.3438728960201354e-07, 'epoch': 2.96}


 99%|█████████▉| 25140/25428 [5:08:59<03:00,  1.59it/s]

{'loss': 0.0714, 'grad_norm': 0.5669638514518738, 'learning_rate': 2.265219443133554e-07, 'epoch': 2.97}


 99%|█████████▉| 25150/25428 [5:09:05<02:52,  1.61it/s]

{'loss': 0.1131, 'grad_norm': 1.3629933595657349, 'learning_rate': 2.186565990246972e-07, 'epoch': 2.97}


 99%|█████████▉| 25160/25428 [5:09:12<02:47,  1.60it/s]

{'loss': 0.0919, 'grad_norm': 0.8179122805595398, 'learning_rate': 2.1079125373603904e-07, 'epoch': 2.97}


 99%|█████████▉| 25170/25428 [5:09:18<02:41,  1.59it/s]

{'loss': 0.1103, 'grad_norm': 1.1494501829147339, 'learning_rate': 2.0292590844738087e-07, 'epoch': 2.97}


 99%|█████████▉| 25180/25428 [5:09:24<02:35,  1.60it/s]

{'loss': 0.0692, 'grad_norm': 1.1450473070144653, 'learning_rate': 1.9506056315872268e-07, 'epoch': 2.97}


 99%|█████████▉| 25190/25428 [5:09:30<02:26,  1.63it/s]

{'loss': 0.087, 'grad_norm': 0.987125813961029, 'learning_rate': 1.871952178700645e-07, 'epoch': 2.97}


 99%|█████████▉| 25200/25428 [5:09:36<02:21,  1.62it/s]

{'loss': 0.0691, 'grad_norm': 0.5448877215385437, 'learning_rate': 1.7932987258140632e-07, 'epoch': 2.97}


 99%|█████████▉| 25210/25428 [5:09:43<02:15,  1.61it/s]

{'loss': 0.068, 'grad_norm': 0.8012106418609619, 'learning_rate': 1.7146452729274815e-07, 'epoch': 2.97}


 99%|█████████▉| 25220/25428 [5:09:49<02:08,  1.62it/s]

{'loss': 0.063, 'grad_norm': 1.8582720756530762, 'learning_rate': 1.6359918200409e-07, 'epoch': 2.98}


 99%|█████████▉| 25230/25428 [5:09:55<02:05,  1.58it/s]

{'loss': 0.0859, 'grad_norm': 1.4138374328613281, 'learning_rate': 1.5573383671543182e-07, 'epoch': 2.98}


 99%|█████████▉| 25240/25428 [5:10:01<01:56,  1.62it/s]

{'loss': 0.0747, 'grad_norm': 1.0277611017227173, 'learning_rate': 1.4786849142677365e-07, 'epoch': 2.98}


 99%|█████████▉| 25250/25428 [5:10:08<01:51,  1.59it/s]

{'loss': 0.0993, 'grad_norm': 0.7364727258682251, 'learning_rate': 1.4000314613811549e-07, 'epoch': 2.98}


 99%|█████████▉| 25260/25428 [5:10:14<01:45,  1.59it/s]

{'loss': 0.0618, 'grad_norm': 0.9313037991523743, 'learning_rate': 1.321378008494573e-07, 'epoch': 2.98}


 99%|█████████▉| 25270/25428 [5:10:20<01:40,  1.58it/s]

{'loss': 0.0687, 'grad_norm': 0.8267958164215088, 'learning_rate': 1.2427245556079913e-07, 'epoch': 2.98}


 99%|█████████▉| 25280/25428 [5:10:26<01:31,  1.62it/s]

{'loss': 0.0836, 'grad_norm': 0.7394489645957947, 'learning_rate': 1.1640711027214096e-07, 'epoch': 2.98}


 99%|█████████▉| 25290/25428 [5:10:33<01:28,  1.57it/s]

{'loss': 0.0604, 'grad_norm': 0.6265057325363159, 'learning_rate': 1.0854176498348278e-07, 'epoch': 2.98}


 99%|█████████▉| 25300/25428 [5:10:39<01:21,  1.58it/s]

{'loss': 0.0641, 'grad_norm': 1.4172152280807495, 'learning_rate': 1.0067641969482461e-07, 'epoch': 2.98}


100%|█████████▉| 25310/25428 [5:10:45<01:13,  1.61it/s]

{'loss': 0.0577, 'grad_norm': 1.041049838066101, 'learning_rate': 9.281107440616643e-08, 'epoch': 2.99}


100%|█████████▉| 25320/25428 [5:10:52<01:07,  1.60it/s]

{'loss': 0.0817, 'grad_norm': 1.1052302122116089, 'learning_rate': 8.494572911750827e-08, 'epoch': 2.99}


100%|█████████▉| 25330/25428 [5:10:58<01:01,  1.61it/s]

{'loss': 0.0638, 'grad_norm': 0.6508084535598755, 'learning_rate': 7.70803838288501e-08, 'epoch': 2.99}


100%|█████████▉| 25340/25428 [5:11:04<00:55,  1.60it/s]

{'loss': 0.0527, 'grad_norm': 0.811946451663971, 'learning_rate': 6.921503854019192e-08, 'epoch': 2.99}


100%|█████████▉| 25350/25428 [5:11:10<00:49,  1.56it/s]

{'loss': 0.0938, 'grad_norm': 0.4449474811553955, 'learning_rate': 6.134969325153375e-08, 'epoch': 2.99}


100%|█████████▉| 25360/25428 [5:11:17<00:42,  1.60it/s]

{'loss': 0.079, 'grad_norm': 0.6197950839996338, 'learning_rate': 5.3484347962875574e-08, 'epoch': 2.99}


100%|█████████▉| 25370/25428 [5:11:23<00:36,  1.61it/s]

{'loss': 0.0698, 'grad_norm': 1.1615828275680542, 'learning_rate': 4.56190026742174e-08, 'epoch': 2.99}


100%|█████████▉| 25380/25428 [5:11:29<00:30,  1.59it/s]

{'loss': 0.0667, 'grad_norm': 0.7912222146987915, 'learning_rate': 3.775365738555923e-08, 'epoch': 2.99}


100%|█████████▉| 25390/25428 [5:11:36<00:23,  1.59it/s]

{'loss': 0.0766, 'grad_norm': 1.0146721601486206, 'learning_rate': 2.9888312096901055e-08, 'epoch': 3.0}


100%|█████████▉| 25400/25428 [5:11:42<00:17,  1.59it/s]

{'loss': 0.0639, 'grad_norm': 1.2424649000167847, 'learning_rate': 2.2022966808242885e-08, 'epoch': 3.0}


100%|█████████▉| 25410/25428 [5:11:48<00:11,  1.62it/s]

{'loss': 0.079, 'grad_norm': 0.9199841022491455, 'learning_rate': 1.4157621519584712e-08, 'epoch': 3.0}


100%|█████████▉| 25420/25428 [5:11:54<00:04,  1.62it/s]

{'loss': 0.0873, 'grad_norm': 0.7286950945854187, 'learning_rate': 6.292276230926538e-09, 'epoch': 3.0}


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
                                                       
100%|██████████| 25428/25428 [5:31:17<00:00,  1.28it/s]
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


{'eval_loss': 0.049213238060474396, 'eval_runtime': 1155.489, 'eval_samples_per_second': 58.68, 'eval_steps_per_second': 7.335, 'epoch': 3.0}
{'train_runtime': 19877.8826, 'train_samples_per_second': 10.233, 'train_steps_per_second': 1.279, 'train_loss': 0.19169289360884434, 'epoch': 3.0}


## Inference

In [4]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration

# load the trained model and tokenizer
model = BartForConditionalGeneration.from_pretrained('./caesar_cipher_decoder')
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# set to evaluation mode
model.eval()

# from df
ciphertext = "DEO LHWUEJC DWO PWGAJ DEI WYNKOO PDA QJEPAZ GEJCZKI LHWUEJC WP PDA HKJZKJ LWHHWZEQI WJZ PDA NKUWH WHXANP DWHH."
# HIS PLAYING HAS TAKEN HIM ACROSS THE UNITED KINGDOM PLAYING AT THE LONDON PALLADIUM AND THE ROYAL ALBERT HALL
input_ids = bart_tokenizer.encode(ciphertext, return_tensors='pt')

with torch.no_grad():
    generated_ids = model.generate(input_ids)

decoded_output = bart_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print(f"Ciphertext: {ciphertext}")
print(f"Decoded plaintext: {decoded_output}")




Ciphertext: DEO LHWUEJC DWO PWGAJ DEI WYNKOO PDA QJEPAZ GEJCZKI LHWUEJC WP PDA HKJZKJ LWHHWZEQI WJZ PDA NKUWH WHXANP DWHH.
Decoded plaintext: His playing has taken him across the United Kingdom playing at the London Palladium and the
