IMAGE LOADING

In [1]:
from transformers import BlipProcessor, BlipForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
from PIL import Image
import pandas as pd
import re
import io

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
splits = {'train': 'data/combined_train.parquet', 
          'validation': 'data/validation-00000-of-00001-f5d78556e52cceb7.parquet'}
dataset = pd.read_parquet(splits["train"])

PREPROCESSİNG

In [3]:
reviews_sample = dataset.sample(6000, random_state=42)

reviews_sample = reviews_sample.drop(columns=["imgid", "captions"])
reviews_sample['image'] = reviews_sample['image'].apply(lambda x: x['bytes'])
reviews_sample = reviews_sample.rename(columns={"image": "image_bytes", "detailed_caption": "captions"})


colors = [
    "Kırmızı",
    "Yeşil",
    "Mavi",
    "Sarı",
    "Turuncu",
    "Pembe",
    "Mor",
    "Kahverengi",
    "Siyah",
    "Beyaz",
    "Gri",
    "Lacivert",
    "Altın",
    "Gümüş",
    "Zeytin",
    "Fuşya",
    "Bordo",
    "Mercan",
    "Ceviz",
    "Zümrüt",
    "Teal",
    "Bej",
    "kırmızı",
    "yeşil",
    "mavi",
    "sarı",
    "turuncu",
    "pembe",
    "mor",
    "kahverengi",
    "siyah",
    "beyaz",
    "gri",
    "lacivert",
    "altın",
    "gümüş",
    "zeytin",
    "fuşya",
    "bordo",
    "mercan",
    "ceviz",
    "zümrüt",
    "teal",
    "bej"
];

def preprocess_captions(caption):
    sentences = []
    caption = re.sub("\s+", " ", caption).strip()
    for color in colors:
      new_sentences = re.findall(rf'[^.!?]*\b{color}\b[^.!?]*[.!?]', caption, re.IGNORECASE)
      for new_sentence in new_sentences:
         if new_sentence not in sentences:
            sentences.insert(0, new_sentence)
    new_caption = ""
    for sentence in sentences:
        new_caption += sentence
    return new_caption

reviews_sample["captions"] = reviews_sample["captions"].apply(preprocess_captions)

TRAIN SPLIT

In [4]:
hf_dataset = Dataset.from_pandas(reviews_sample)

train_test_split = hf_dataset.train_test_split(train_size=0.9, seed=42)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

In [5]:
model_name = "Salesforce/blip-image-captioning-base"
model = BlipForConditionalGeneration.from_pretrained(model_name)
processor = BlipProcessor.from_pretrained(model_name)

def preprocess_data(example):
    image_bytes = example["image_bytes"]
    caption = example["captions"]

    images = [Image.open(io.BytesIO(img_bytes)) for img_bytes in image_bytes]

    inputs = processor(
        images=images,
        text=caption,
        padding=True,
        max_length=39,
        truncation=True,
        return_tensors="pt",
    )
    inputs['labels'] = inputs['input_ids'].clone()

    return inputs

train_dataset = train_dataset.map(preprocess_data, batched=True)
val_dataset = val_dataset.map(preprocess_data, batched=True)

Map: 100%|██████████| 5400/5400 [02:06<00:00, 42.52 examples/s]
Map: 100%|██████████| 600/600 [00:07<00:00, 76.04 examples/s]


In [6]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./trainer_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.01,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100,
    metric_for_best_model="eval_loss",
    fp16=False,
)

# Trainer oluşturma
trainer = Seq2SeqTrainer(
    model=model,  # Model
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor,
)

trainer.train()

trainer.save_model("./trained_model")


  trainer = Seq2SeqTrainer(
  0%|          | 100/27000 [01:19<5:49:25,  1.28it/s]

{'loss': 6.9861, 'grad_norm': 16.782052993774414, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}


  1%|          | 200/27000 [02:37<5:48:06,  1.28it/s]

{'loss': 5.0176, 'grad_norm': 20.200523376464844, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.04}


  1%|          | 300/27000 [03:55<5:53:57,  1.26it/s]

{'loss': 4.271, 'grad_norm': 20.896299362182617, 'learning_rate': 1.2e-05, 'epoch': 0.06}


  1%|▏         | 400/27000 [05:19<6:15:13,  1.18it/s]

{'loss': 4.0033, 'grad_norm': 17.34522247314453, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.07}


  2%|▏         | 500/27000 [06:43<6:16:08,  1.17it/s]

{'loss': 3.3946, 'grad_norm': 20.07935333251953, 'learning_rate': 2e-05, 'epoch': 0.09}


  2%|▏         | 600/27000 [08:08<6:12:28,  1.18it/s]

{'loss': 3.3283, 'grad_norm': 17.988447189331055, 'learning_rate': 1.999929729520755e-05, 'epoch': 0.11}


  3%|▎         | 700/27000 [09:33<6:12:53,  1.18it/s]

{'loss': 3.1216, 'grad_norm': 16.967605590820312, 'learning_rate': 1.9997189279589003e-05, 'epoch': 0.13}


  3%|▎         | 800/27000 [10:58<6:21:41,  1.14it/s]

{'loss': 2.893, 'grad_norm': 15.97664737701416, 'learning_rate': 1.9993676249406895e-05, 'epoch': 0.15}


  3%|▎         | 900/27000 [12:25<6:13:16,  1.17it/s]

{'loss': 2.7965, 'grad_norm': 15.171396255493164, 'learning_rate': 1.9988758698385854e-05, 'epoch': 0.17}


  4%|▎         | 1000/27000 [14:05<8:27:53,  1.17s/it]

{'loss': 2.6734, 'grad_norm': 17.542736053466797, 'learning_rate': 1.9982437317643218e-05, 'epoch': 0.19}


  4%|▍         | 1100/27000 [15:38<6:14:15,  1.15it/s]

{'loss': 2.4119, 'grad_norm': 16.802717208862305, 'learning_rate': 1.9974712995591887e-05, 'epoch': 0.2}


  4%|▍         | 1200/27000 [17:07<6:30:09,  1.10it/s]

{'loss': 2.4565, 'grad_norm': 13.795280456542969, 'learning_rate': 1.9965586817815494e-05, 'epoch': 0.22}


  5%|▍         | 1300/27000 [18:37<6:25:37,  1.11it/s]

{'loss': 2.3599, 'grad_norm': 16.586885452270508, 'learning_rate': 1.995506006691581e-05, 'epoch': 0.24}


  5%|▌         | 1400/27000 [20:08<6:31:15,  1.09it/s]

{'loss': 2.437, 'grad_norm': 16.13336181640625, 'learning_rate': 1.9943134222332493e-05, 'epoch': 0.26}


  6%|▌         | 1500/27000 [21:40<6:26:15,  1.10it/s]

{'loss': 2.4218, 'grad_norm': 18.002979278564453, 'learning_rate': 1.992981096013517e-05, 'epoch': 0.28}


  6%|▌         | 1600/27000 [23:10<6:13:09,  1.13it/s]

{'loss': 2.2581, 'grad_norm': 15.422009468078613, 'learning_rate': 1.9915092152787888e-05, 'epoch': 0.3}


  6%|▋         | 1700/27000 [24:37<5:59:02,  1.17it/s]

{'loss': 2.2963, 'grad_norm': 12.818949699401855, 'learning_rate': 1.9898979868885933e-05, 'epoch': 0.31}


  7%|▋         | 1800/27000 [26:04<6:21:01,  1.10it/s]

{'loss': 2.3093, 'grad_norm': 13.686702728271484, 'learning_rate': 1.988147637286513e-05, 'epoch': 0.33}


  7%|▋         | 1900/27000 [27:33<6:11:47,  1.13it/s]

{'loss': 2.1822, 'grad_norm': 12.756145477294922, 'learning_rate': 1.9862584124683587e-05, 'epoch': 0.35}


  7%|▋         | 2000/27000 [29:04<6:15:28,  1.11it/s]

{'loss': 2.2764, 'grad_norm': 10.803709983825684, 'learning_rate': 1.984230577947597e-05, 'epoch': 0.37}


  8%|▊         | 2100/27000 [30:34<6:20:29,  1.09it/s]

{'loss': 2.1603, 'grad_norm': 13.762480735778809, 'learning_rate': 1.9820644187180354e-05, 'epoch': 0.39}


  8%|▊         | 2200/27000 [32:07<6:30:11,  1.06it/s]

{'loss': 2.112, 'grad_norm': 11.130783081054688, 'learning_rate': 1.9797602392137678e-05, 'epoch': 0.41}


  9%|▊         | 2300/27000 [33:40<6:15:15,  1.10it/s]

{'loss': 2.23, 'grad_norm': 14.091726303100586, 'learning_rate': 1.9773183632663907e-05, 'epoch': 0.43}


  9%|▉         | 2400/27000 [35:12<6:19:03,  1.08it/s]

{'loss': 2.1091, 'grad_norm': 10.521045684814453, 'learning_rate': 1.97473913405949e-05, 'epoch': 0.44}


  9%|▉         | 2500/27000 [36:44<6:14:53,  1.09it/s]

{'loss': 2.1189, 'grad_norm': 12.54970932006836, 'learning_rate': 1.972022914080411e-05, 'epoch': 0.46}


 10%|▉         | 2600/27000 [38:16<6:19:03,  1.07it/s]

{'loss': 2.2107, 'grad_norm': 15.953191757202148, 'learning_rate': 1.9691700850693126e-05, 'epoch': 0.48}


 10%|█         | 2700/27000 [39:49<6:14:15,  1.08it/s]

{'loss': 1.8714, 'grad_norm': 14.040995597839355, 'learning_rate': 1.9661810479655184e-05, 'epoch': 0.5}


 10%|█         | 2800/27000 [41:22<6:14:44,  1.08it/s]

{'loss': 2.0673, 'grad_norm': 12.803861618041992, 'learning_rate': 1.9630562228511682e-05, 'epoch': 0.52}


 11%|█         | 2900/27000 [42:54<6:11:15,  1.08it/s]

{'loss': 2.0444, 'grad_norm': 13.082601547241211, 'learning_rate': 1.9597960488921785e-05, 'epoch': 0.54}


 11%|█         | 3000/27000 [44:27<6:09:27,  1.08it/s]

{'loss': 2.1496, 'grad_norm': 12.56184196472168, 'learning_rate': 1.9564009842765225e-05, 'epoch': 0.56}


 11%|█▏        | 3100/27000 [45:59<6:03:25,  1.10it/s]

{'loss': 1.9879, 'grad_norm': 14.546916961669922, 'learning_rate': 1.9528715061498355e-05, 'epoch': 0.57}


 12%|█▏        | 3200/27000 [47:32<6:09:40,  1.07it/s]

{'loss': 1.8278, 'grad_norm': 14.489623069763184, 'learning_rate': 1.949208110548356e-05, 'epoch': 0.59}


 12%|█▏        | 3300/27000 [49:05<6:05:38,  1.08it/s]

{'loss': 1.9715, 'grad_norm': 11.676129341125488, 'learning_rate': 1.9454113123292133e-05, 'epoch': 0.61}


 13%|█▎        | 3400/27000 [50:37<6:02:15,  1.09it/s]

{'loss': 1.9887, 'grad_norm': 16.266300201416016, 'learning_rate': 1.9414816450980686e-05, 'epoch': 0.63}


 13%|█▎        | 3500/27000 [52:10<6:02:08,  1.08it/s]

{'loss': 1.7796, 'grad_norm': 11.107911109924316, 'learning_rate': 1.9374196611341212e-05, 'epoch': 0.65}


 13%|█▎        | 3600/27000 [53:43<5:58:43,  1.09it/s]

{'loss': 1.9398, 'grad_norm': 8.736380577087402, 'learning_rate': 1.93322593131249e-05, 'epoch': 0.67}


 14%|█▎        | 3700/27000 [55:15<5:55:00,  1.09it/s]

{'loss': 2.0039, 'grad_norm': 9.466944694519043, 'learning_rate': 1.9289010450239843e-05, 'epoch': 0.69}


 14%|█▍        | 3800/27000 [56:47<5:59:05,  1.08it/s]

{'loss': 1.8498, 'grad_norm': 16.27962303161621, 'learning_rate': 1.924445610092269e-05, 'epoch': 0.7}


 14%|█▍        | 3900/27000 [58:19<5:50:13,  1.10it/s]

{'loss': 1.8454, 'grad_norm': 13.181221008300781, 'learning_rate': 1.9198602526884388e-05, 'epoch': 0.72}


 15%|█▍        | 4000/27000 [59:52<5:52:24,  1.09it/s]

{'loss': 1.9309, 'grad_norm': 14.208638191223145, 'learning_rate': 1.9151456172430186e-05, 'epoch': 0.74}


 15%|█▌        | 4100/27000 [1:01:25<5:51:36,  1.09it/s]

{'loss': 1.9119, 'grad_norm': 7.085445880889893, 'learning_rate': 1.910302366355393e-05, 'epoch': 0.76}


 16%|█▌        | 4200/27000 [1:02:57<5:49:08,  1.09it/s]

{'loss': 1.8512, 'grad_norm': 10.356016159057617, 'learning_rate': 1.9053311807006845e-05, 'epoch': 0.78}


 16%|█▌        | 4300/27000 [1:04:30<5:51:10,  1.08it/s]

{'loss': 1.7861, 'grad_norm': 2.9744603633880615, 'learning_rate': 1.900232758934089e-05, 'epoch': 0.8}


 16%|█▋        | 4400/27000 [1:06:03<5:46:20,  1.09it/s]

{'loss': 1.736, 'grad_norm': 12.414628982543945, 'learning_rate': 1.8950078175926886e-05, 'epoch': 0.81}


 17%|█▋        | 4500/27000 [1:07:35<5:46:56,  1.08it/s]

{'loss': 1.7988, 'grad_norm': 11.809439659118652, 'learning_rate': 1.8896570909947477e-05, 'epoch': 0.83}


 17%|█▋        | 4600/27000 [1:09:07<5:40:44,  1.10it/s]

{'loss': 1.8097, 'grad_norm': 12.036709785461426, 'learning_rate': 1.8841813311365102e-05, 'epoch': 0.85}


 17%|█▋        | 4700/27000 [1:10:39<5:40:11,  1.09it/s]

{'loss': 1.8387, 'grad_norm': 9.292655944824219, 'learning_rate': 1.8785813075865164e-05, 'epoch': 0.87}


 18%|█▊        | 4800/27000 [1:12:12<5:45:01,  1.07it/s]

{'loss': 1.7408, 'grad_norm': 8.341708183288574, 'learning_rate': 1.8728578073774427e-05, 'epoch': 0.89}


 18%|█▊        | 4900/27000 [1:13:47<5:52:56,  1.04it/s]

{'loss': 1.8168, 'grad_norm': 13.776483535766602, 'learning_rate': 1.8670116348954945e-05, 'epoch': 0.91}


 19%|█▊        | 5000/27000 [1:15:20<5:37:31,  1.09it/s]

{'loss': 1.821, 'grad_norm': 11.124797821044922, 'learning_rate': 1.8610436117673557e-05, 'epoch': 0.93}


 19%|█▉        | 5100/27000 [1:16:51<5:26:32,  1.12it/s]

{'loss': 1.9008, 'grad_norm': 10.898279190063477, 'learning_rate': 1.8549545767447174e-05, 'epoch': 0.94}


 19%|█▉        | 5200/27000 [1:18:21<5:26:16,  1.11it/s]

{'loss': 1.7576, 'grad_norm': 13.271834373474121, 'learning_rate': 1.848745385586398e-05, 'epoch': 0.96}


 20%|█▉        | 5300/27000 [1:19:52<5:31:46,  1.09it/s]

{'loss': 1.7394, 'grad_norm': 12.070611000061035, 'learning_rate': 1.842416910938074e-05, 'epoch': 0.98}


 20%|██        | 5400/27000 [1:21:24<5:25:37,  1.11it/s]

{'loss': 1.7647, 'grad_norm': 10.156956672668457, 'learning_rate': 1.8359700422096385e-05, 'epoch': 1.0}


                                                        
 20%|██        | 5400/27000 [1:24:03<5:25:37,  1.11it/s]

{'eval_loss': 1.7258288860321045, 'eval_runtime': 158.2596, 'eval_samples_per_second': 3.791, 'eval_steps_per_second': 3.791, 'epoch': 1.0}


 20%|██        | 5500/27000 [1:25:40<5:24:26,  1.10it/s]  

{'loss': 1.5214, 'grad_norm': 13.675884246826172, 'learning_rate': 1.829405685450202e-05, 'epoch': 1.02}


 21%|██        | 5600/27000 [1:27:10<5:19:03,  1.12it/s]

{'loss': 1.6274, 'grad_norm': 23.716835021972656, 'learning_rate': 1.822724763220755e-05, 'epoch': 1.04}


 21%|██        | 5700/27000 [1:28:42<5:30:11,  1.08it/s]

{'loss': 1.5163, 'grad_norm': 11.855768203735352, 'learning_rate': 1.815928214464511e-05, 'epoch': 1.06}


 21%|██▏       | 5800/27000 [1:30:14<5:30:48,  1.07it/s]

{'loss': 1.4792, 'grad_norm': 17.647554397583008, 'learning_rate': 1.8090169943749477e-05, 'epoch': 1.07}


 22%|██▏       | 5900/27000 [1:31:47<5:26:00,  1.08it/s]

{'loss': 1.5372, 'grad_norm': 13.778925895690918, 'learning_rate': 1.8019920742615596e-05, 'epoch': 1.09}


 22%|██▏       | 6000/27000 [1:33:20<5:24:19,  1.08it/s]

{'loss': 1.5253, 'grad_norm': 13.170023918151855, 'learning_rate': 1.7948544414133534e-05, 'epoch': 1.11}


 23%|██▎       | 6100/27000 [1:34:52<5:30:49,  1.05it/s]

{'loss': 1.5926, 'grad_norm': 10.504948616027832, 'learning_rate': 1.7876050989600908e-05, 'epoch': 1.13}


 23%|██▎       | 6200/27000 [1:36:24<5:19:04,  1.09it/s]

{'loss': 1.454, 'grad_norm': 11.22573471069336, 'learning_rate': 1.7802450657313086e-05, 'epoch': 1.15}


 23%|██▎       | 6300/27000 [1:37:56<5:15:48,  1.09it/s]

{'loss': 1.5848, 'grad_norm': 17.586688995361328, 'learning_rate': 1.7727753761131312e-05, 'epoch': 1.17}


 24%|██▎       | 6400/27000 [1:39:28<5:20:17,  1.07it/s]

{'loss': 1.5364, 'grad_norm': 11.210163116455078, 'learning_rate': 1.7651970799028976e-05, 'epoch': 1.19}


 24%|██▍       | 6500/27000 [1:41:00<5:14:46,  1.09it/s]

{'loss': 1.5025, 'grad_norm': 13.444021224975586, 'learning_rate': 1.7575112421616203e-05, 'epoch': 1.2}


 24%|██▍       | 6600/27000 [1:42:32<5:11:39,  1.09it/s]

{'loss': 1.3714, 'grad_norm': 11.151616096496582, 'learning_rate': 1.7497189430643025e-05, 'epoch': 1.22}


 25%|██▍       | 6700/27000 [1:44:05<5:11:44,  1.09it/s]

{'loss': 1.5644, 'grad_norm': 11.721892356872559, 'learning_rate': 1.741821277748128e-05, 'epoch': 1.24}


 25%|██▌       | 6800/27000 [1:45:37<5:11:35,  1.08it/s]

{'loss': 1.5586, 'grad_norm': 13.421332359313965, 'learning_rate': 1.7338193561585507e-05, 'epoch': 1.26}


 26%|██▌       | 6900/27000 [1:47:10<5:06:50,  1.09it/s]

{'loss': 1.5613, 'grad_norm': 15.999070167541504, 'learning_rate': 1.7257143028933004e-05, 'epoch': 1.28}


 26%|██▌       | 7000/27000 [1:48:42<5:11:59,  1.07it/s]

{'loss': 1.5658, 'grad_norm': 14.291301727294922, 'learning_rate': 1.717507257044331e-05, 'epoch': 1.3}


 26%|██▋       | 7100/27000 [1:50:14<5:06:20,  1.08it/s]

{'loss': 1.4534, 'grad_norm': 9.711705207824707, 'learning_rate': 1.7091993720377336e-05, 'epoch': 1.31}


 27%|██▋       | 7200/27000 [1:51:47<5:04:18,  1.08it/s]

{'loss': 1.4792, 'grad_norm': 14.006060600280762, 'learning_rate': 1.7007918154716286e-05, 'epoch': 1.33}


 27%|██▋       | 7300/27000 [1:53:19<5:05:15,  1.08it/s]

{'loss': 1.5437, 'grad_norm': 12.11197280883789, 'learning_rate': 1.692285768952076e-05, 'epoch': 1.35}


 27%|██▋       | 7400/27000 [1:54:52<5:05:58,  1.07it/s]

{'loss': 1.5565, 'grad_norm': 11.576311111450195, 'learning_rate': 1.6836824279270053e-05, 'epoch': 1.37}


 28%|██▊       | 7500/27000 [1:56:25<4:58:34,  1.09it/s]

{'loss': 1.5422, 'grad_norm': 10.670971870422363, 'learning_rate': 1.6749830015182106e-05, 'epoch': 1.39}


 28%|██▊       | 7600/27000 [1:57:57<4:59:36,  1.08it/s]

{'loss': 1.4932, 'grad_norm': 11.295905113220215, 'learning_rate': 1.6661887123514183e-05, 'epoch': 1.41}


 29%|██▊       | 7700/27000 [1:59:30<4:59:58,  1.07it/s]

{'loss': 1.4815, 'grad_norm': 11.56929874420166, 'learning_rate': 1.657300796384457e-05, 'epoch': 1.43}


 29%|██▉       | 7800/27000 [2:01:02<4:54:21,  1.09it/s]

{'loss': 1.5996, 'grad_norm': 16.02731704711914, 'learning_rate': 1.648320502733555e-05, 'epoch': 1.44}


 29%|██▉       | 7900/27000 [2:02:37<4:55:36,  1.08it/s]

{'loss': 1.4221, 'grad_norm': 11.801347732543945, 'learning_rate': 1.63924909349779e-05, 'epoch': 1.46}


 30%|██▉       | 8000/27000 [2:04:13<4:58:18,  1.06it/s]

{'loss': 1.3807, 'grad_norm': 11.73095417022705, 'learning_rate': 1.6300878435817115e-05, 'epoch': 1.48}


 30%|███       | 8100/27000 [2:05:45<4:52:33,  1.08it/s]

{'loss': 1.4264, 'grad_norm': 10.628098487854004, 'learning_rate': 1.6208380405161623e-05, 'epoch': 1.5}


 30%|███       | 8200/27000 [2:07:18<4:53:18,  1.07it/s]

{'loss': 1.3985, 'grad_norm': 7.639126300811768, 'learning_rate': 1.6115009842773322e-05, 'epoch': 1.52}


 31%|███       | 8300/27000 [2:08:51<4:49:47,  1.08it/s]

{'loss': 1.4407, 'grad_norm': 12.083284378051758, 'learning_rate': 1.6020779871040538e-05, 'epoch': 1.54}


 31%|███       | 8400/27000 [2:10:24<4:48:11,  1.08it/s]

{'loss': 1.4575, 'grad_norm': 12.076325416564941, 'learning_rate': 1.5925703733133823e-05, 'epoch': 1.56}


 31%|███▏      | 8500/27000 [2:11:57<4:45:47,  1.08it/s]

{'loss': 1.412, 'grad_norm': 8.677366256713867, 'learning_rate': 1.5829794791144723e-05, 'epoch': 1.57}


 32%|███▏      | 8600/27000 [2:13:29<4:47:33,  1.07it/s]

{'loss': 1.3904, 'grad_norm': 13.265637397766113, 'learning_rate': 1.5733066524207875e-05, 'epoch': 1.59}


 32%|███▏      | 8700/27000 [2:15:02<4:44:49,  1.07it/s]

{'loss': 1.5504, 'grad_norm': 14.042135238647461, 'learning_rate': 1.5635532526606625e-05, 'epoch': 1.61}


 33%|███▎      | 8800/27000 [2:16:37<4:46:36,  1.06it/s]

{'loss': 1.418, 'grad_norm': 9.696134567260742, 'learning_rate': 1.5537206505862486e-05, 'epoch': 1.63}


 33%|███▎      | 8900/27000 [2:18:11<4:38:41,  1.08it/s]

{'loss': 1.3714, 'grad_norm': 18.861316680908203, 'learning_rate': 1.5438102280808653e-05, 'epoch': 1.65}


 33%|███▎      | 9000/27000 [2:19:43<4:38:22,  1.08it/s]

{'loss': 1.4678, 'grad_norm': 12.759781837463379, 'learning_rate': 1.533823377964791e-05, 'epoch': 1.67}


 34%|███▎      | 9100/27000 [2:21:17<4:36:16,  1.08it/s]

{'loss': 1.384, 'grad_norm': 12.190314292907715, 'learning_rate': 1.5237615037995129e-05, 'epoch': 1.69}


 34%|███▍      | 9200/27000 [2:22:50<4:35:13,  1.08it/s]

{'loss': 1.4408, 'grad_norm': 15.446051597595215, 'learning_rate': 1.5136260196904704e-05, 'epoch': 1.7}


 34%|███▍      | 9300/27000 [2:24:22<4:32:29,  1.08it/s]

{'loss': 1.5171, 'grad_norm': 11.255937576293945, 'learning_rate': 1.5034183500883153e-05, 'epoch': 1.72}


 35%|███▍      | 9400/27000 [2:25:55<4:32:02,  1.08it/s]

{'loss': 1.3245, 'grad_norm': 13.7977933883667, 'learning_rate': 1.4931399295887172e-05, 'epoch': 1.74}


 35%|███▌      | 9500/27000 [2:27:28<4:34:22,  1.06it/s]

{'loss': 1.4335, 'grad_norm': 14.849212646484375, 'learning_rate': 1.482792202730745e-05, 'epoch': 1.76}


 36%|███▌      | 9600/27000 [2:29:00<4:20:48,  1.11it/s]

{'loss': 1.5204, 'grad_norm': 12.78270435333252, 'learning_rate': 1.4723766237938495e-05, 'epoch': 1.78}


 36%|███▌      | 9700/27000 [2:30:30<4:22:56,  1.10it/s]

{'loss': 1.368, 'grad_norm': 9.777685165405273, 'learning_rate': 1.4618946565934775e-05, 'epoch': 1.8}


 36%|███▋      | 9800/27000 [2:32:01<4:26:10,  1.08it/s]

{'loss': 1.47, 'grad_norm': 9.491820335388184, 'learning_rate': 1.4513477742753465e-05, 'epoch': 1.81}


 37%|███▋      | 9900/27000 [2:33:34<4:26:47,  1.07it/s]

{'loss': 1.4301, 'grad_norm': 11.575868606567383, 'learning_rate': 1.4407374591084064e-05, 'epoch': 1.83}


 37%|███▋      | 10000/27000 [2:35:08<4:24:59,  1.07it/s]

{'loss': 1.4917, 'grad_norm': 13.025124549865723, 'learning_rate': 1.4300652022765207e-05, 'epoch': 1.85}


 37%|███▋      | 10100/27000 [2:36:44<4:21:54,  1.08it/s]

{'loss': 1.3348, 'grad_norm': 9.626993179321289, 'learning_rate': 1.419332503668894e-05, 'epoch': 1.87}


 38%|███▊      | 10200/27000 [2:38:17<4:19:26,  1.08it/s]

{'loss': 1.4087, 'grad_norm': 7.828608989715576, 'learning_rate': 1.408540871669275e-05, 'epoch': 1.89}


 38%|███▊      | 10300/27000 [2:39:50<4:18:03,  1.08it/s]

{'loss': 1.3288, 'grad_norm': 6.1449384689331055, 'learning_rate': 1.3976918229439698e-05, 'epoch': 1.91}


 39%|███▊      | 10400/27000 [2:41:23<4:15:43,  1.08it/s]

{'loss': 1.4105, 'grad_norm': 10.539163589477539, 'learning_rate': 1.3867868822286838e-05, 'epoch': 1.93}


 39%|███▉      | 10500/27000 [2:42:57<4:15:53,  1.07it/s]

{'loss': 1.4022, 'grad_norm': 13.583150863647461, 'learning_rate': 1.3758275821142382e-05, 'epoch': 1.94}


 39%|███▉      | 10600/27000 [2:44:31<4:13:33,  1.08it/s]

{'loss': 1.3428, 'grad_norm': 12.62500286102295, 'learning_rate': 1.3648154628311754e-05, 'epoch': 1.96}


 40%|███▉      | 10700/27000 [2:46:05<4:19:05,  1.05it/s]

{'loss': 1.3105, 'grad_norm': 11.468812942504883, 'learning_rate': 1.3537520720332943e-05, 'epoch': 1.98}


 40%|████      | 10800/27000 [2:47:38<4:01:12,  1.12it/s]

{'loss': 1.3779, 'grad_norm': 9.32304573059082, 'learning_rate': 1.3426389645801415e-05, 'epoch': 2.0}


                                                         
 40%|████      | 10800/27000 [2:50:19<4:01:12,  1.12it/s]

{'eval_loss': 1.4804002046585083, 'eval_runtime': 161.3992, 'eval_samples_per_second': 3.717, 'eval_steps_per_second': 3.717, 'epoch': 2.0}


 40%|████      | 10900/27000 [2:51:58<4:04:42,  1.10it/s]  

{'loss': 1.1041, 'grad_norm': 17.667850494384766, 'learning_rate': 1.3314777023184907e-05, 'epoch': 2.02}


 41%|████      | 11000/27000 [2:53:30<4:03:30,  1.10it/s]

{'loss': 1.1135, 'grad_norm': 8.640571594238281, 'learning_rate': 1.3202698538628376e-05, 'epoch': 2.04}


 41%|████      | 11100/27000 [2:55:03<4:02:28,  1.09it/s]

{'loss': 1.1467, 'grad_norm': 9.440882682800293, 'learning_rate': 1.3090169943749475e-05, 'epoch': 2.06}


 41%|████▏     | 11200/27000 [2:56:35<4:02:44,  1.08it/s]

{'loss': 1.1104, 'grad_norm': 15.208870887756348, 'learning_rate': 1.2977207053424781e-05, 'epoch': 2.07}


 42%|████▏     | 11300/27000 [2:58:07<4:01:35,  1.08it/s]

{'loss': 1.0756, 'grad_norm': 10.240755081176758, 'learning_rate': 1.2863825743567174e-05, 'epoch': 2.09}


 42%|████▏     | 11400/27000 [2:59:40<4:00:08,  1.08it/s]

{'loss': 1.1058, 'grad_norm': 13.320487976074219, 'learning_rate': 1.2750041948894621e-05, 'epoch': 2.11}


 43%|████▎     | 11500/27000 [3:01:13<3:59:30,  1.08it/s]

{'loss': 1.1053, 'grad_norm': 10.97850513458252, 'learning_rate': 1.2635871660690677e-05, 'epoch': 2.13}


 43%|████▎     | 11600/27000 [3:02:47<4:01:37,  1.06it/s]

{'loss': 1.091, 'grad_norm': 12.700586318969727, 'learning_rate': 1.2521330924557087e-05, 'epoch': 2.15}


 43%|████▎     | 11700/27000 [3:04:20<3:53:29,  1.09it/s]

{'loss': 1.0201, 'grad_norm': 0.1499675214290619, 'learning_rate': 1.2406435838158686e-05, 'epoch': 2.17}


 44%|████▎     | 11800/27000 [3:05:52<3:55:04,  1.08it/s]

{'loss': 1.1344, 'grad_norm': 15.380077362060547, 'learning_rate': 1.2291202548961042e-05, 'epoch': 2.19}


 44%|████▍     | 11900/27000 [3:07:25<3:52:13,  1.08it/s]

{'loss': 1.0885, 'grad_norm': 12.225910186767578, 'learning_rate': 1.217564725196108e-05, 'epoch': 2.2}


 44%|████▍     | 12000/27000 [3:08:57<3:54:13,  1.07it/s]

{'loss': 1.1019, 'grad_norm': 13.834033966064453, 'learning_rate': 1.2059786187410984e-05, 'epoch': 2.22}


 45%|████▍     | 12100/27000 [3:10:30<3:49:03,  1.08it/s]

{'loss': 1.1078, 'grad_norm': 9.223601341247559, 'learning_rate': 1.1943635638535827e-05, 'epoch': 2.24}


 45%|████▌     | 12200/27000 [3:12:02<3:47:28,  1.08it/s]

{'loss': 1.0576, 'grad_norm': 10.666858673095703, 'learning_rate': 1.1827211929245075e-05, 'epoch': 2.26}


 46%|████▌     | 12300/27000 [3:13:35<3:47:12,  1.08it/s]

{'loss': 1.0717, 'grad_norm': 12.91480541229248, 'learning_rate': 1.1710531421838422e-05, 'epoch': 2.28}


 46%|████▌     | 12400/27000 [3:15:08<3:44:03,  1.09it/s]

{'loss': 1.058, 'grad_norm': 13.229775428771973, 'learning_rate': 1.1593610514706217e-05, 'epoch': 2.3}


 46%|████▋     | 12500/27000 [3:16:42<3:51:56,  1.04it/s]

{'loss': 1.1237, 'grad_norm': 6.564270496368408, 'learning_rate': 1.1476465640024814e-05, 'epoch': 2.31}


 47%|████▋     | 12600/27000 [3:18:17<3:41:55,  1.08it/s]

{'loss': 1.0756, 'grad_norm': 11.59900188446045, 'learning_rate': 1.1359113261447183e-05, 'epoch': 2.33}


 47%|████▋     | 12700/27000 [3:19:49<3:46:57,  1.05it/s]

{'loss': 1.0989, 'grad_norm': 16.099082946777344, 'learning_rate': 1.1241569871789096e-05, 'epoch': 2.35}


 47%|████▋     | 12800/27000 [3:21:22<3:42:10,  1.07it/s]

{'loss': 1.0318, 'grad_norm': 10.273197174072266, 'learning_rate': 1.112385199071119e-05, 'epoch': 2.37}


 48%|████▊     | 12900/27000 [3:22:56<3:37:19,  1.08it/s]

{'loss': 1.0922, 'grad_norm': 16.388418197631836, 'learning_rate': 1.1005976162397309e-05, 'epoch': 2.39}


 48%|████▊     | 13000/27000 [3:24:28<3:40:21,  1.06it/s]

{'loss': 1.0751, 'grad_norm': 10.254449844360352, 'learning_rate': 1.0887958953229349e-05, 'epoch': 2.41}


 49%|████▊     | 13100/27000 [3:26:02<3:34:42,  1.08it/s]

{'loss': 1.0771, 'grad_norm': 16.32122230529785, 'learning_rate': 1.0769816949459002e-05, 'epoch': 2.43}


 49%|████▉     | 13200/27000 [3:27:35<3:39:42,  1.05it/s]

{'loss': 1.0289, 'grad_norm': 16.060937881469727, 'learning_rate': 1.0651566754876715e-05, 'epoch': 2.44}


 49%|████▉     | 13300/27000 [3:29:09<3:38:13,  1.05it/s]

{'loss': 1.1733, 'grad_norm': 14.11966609954834, 'learning_rate': 1.0533224988478176e-05, 'epoch': 2.46}


 50%|████▉     | 13400/27000 [3:30:42<3:29:31,  1.08it/s]

{'loss': 1.0385, 'grad_norm': 12.8890380859375, 'learning_rate': 1.0414808282128668e-05, 'epoch': 2.48}


 50%|█████     | 13500/27000 [3:32:17<3:30:36,  1.07it/s]

{'loss': 1.1382, 'grad_norm': 12.878044128417969, 'learning_rate': 1.0296333278225599e-05, 'epoch': 2.5}


 50%|█████     | 13600/27000 [3:33:50<3:22:23,  1.10it/s]

{'loss': 1.0675, 'grad_norm': 12.676161766052246, 'learning_rate': 1.0177816627359575e-05, 'epoch': 2.52}


 51%|█████     | 13700/27000 [3:35:21<3:20:04,  1.11it/s]

{'loss': 1.0187, 'grad_norm': 15.147758483886719, 'learning_rate': 1.0059274985974305e-05, 'epoch': 2.54}


 51%|█████     | 13800/27000 [3:36:53<3:25:05,  1.07it/s]

{'loss': 1.1057, 'grad_norm': 14.481773376464844, 'learning_rate': 9.940725014025696e-06, 'epoch': 2.56}


 51%|█████▏    | 13900/27000 [3:38:26<3:21:18,  1.08it/s]

{'loss': 1.0662, 'grad_norm': 9.197722434997559, 'learning_rate': 9.822183372640426e-06, 'epoch': 2.57}


 52%|█████▏    | 14000/27000 [3:39:59<3:19:08,  1.09it/s]

{'loss': 1.145, 'grad_norm': 8.399251937866211, 'learning_rate': 9.703666721774403e-06, 'epoch': 2.59}


 52%|█████▏    | 14100/27000 [3:41:33<3:19:49,  1.08it/s]

{'loss': 1.0454, 'grad_norm': 13.158170700073242, 'learning_rate': 9.585191717871336e-06, 'epoch': 2.61}


 53%|█████▎    | 14200/27000 [3:43:06<3:16:43,  1.08it/s]

{'loss': 1.1332, 'grad_norm': 10.681807518005371, 'learning_rate': 9.466775011521825e-06, 'epoch': 2.63}


 53%|█████▎    | 14300/27000 [3:44:39<3:17:02,  1.07it/s]

{'loss': 1.0673, 'grad_norm': 14.842741966247559, 'learning_rate': 9.34843324512329e-06, 'epoch': 2.65}


 53%|█████▎    | 14400/27000 [3:46:12<3:14:37,  1.08it/s]

{'loss': 1.0418, 'grad_norm': 12.527252197265625, 'learning_rate': 9.230183050541001e-06, 'epoch': 2.67}


 54%|█████▎    | 14500/27000 [3:47:46<3:14:09,  1.07it/s]

{'loss': 1.1338, 'grad_norm': 15.279927253723145, 'learning_rate': 9.112041046770653e-06, 'epoch': 2.69}


 54%|█████▍    | 14600/27000 [3:49:19<3:11:45,  1.08it/s]

{'loss': 1.0483, 'grad_norm': 9.078425407409668, 'learning_rate': 8.994023837602694e-06, 'epoch': 2.7}


 54%|█████▍    | 14700/27000 [3:50:52<3:09:14,  1.08it/s]

{'loss': 1.0369, 'grad_norm': 7.3414154052734375, 'learning_rate': 8.876148009288813e-06, 'epoch': 2.72}


 55%|█████▍    | 14800/27000 [3:52:25<3:08:07,  1.08it/s]

{'loss': 1.0479, 'grad_norm': 8.925265312194824, 'learning_rate': 8.758430128210908e-06, 'epoch': 2.74}


 55%|█████▌    | 14900/27000 [3:53:58<3:05:59,  1.08it/s]

{'loss': 1.0125, 'grad_norm': 7.989018440246582, 'learning_rate': 8.64088673855282e-06, 'epoch': 2.76}


 56%|█████▌    | 15000/27000 [3:55:31<3:05:25,  1.08it/s]

{'loss': 0.9836, 'grad_norm': 9.730603218078613, 'learning_rate': 8.52353435997519e-06, 'epoch': 2.78}


 56%|█████▌    | 15100/27000 [3:57:03<3:03:13,  1.08it/s]

{'loss': 1.0412, 'grad_norm': 14.540582656860352, 'learning_rate': 8.406389485293786e-06, 'epoch': 2.8}


 56%|█████▋    | 15200/27000 [3:58:36<3:04:13,  1.07it/s]

{'loss': 1.019, 'grad_norm': 7.822939872741699, 'learning_rate': 8.289468578161581e-06, 'epoch': 2.81}


 57%|█████▋    | 15300/27000 [4:00:09<3:04:18,  1.06it/s]

{'loss': 1.0743, 'grad_norm': 11.892812728881836, 'learning_rate': 8.172788070754927e-06, 'epoch': 2.83}


 57%|█████▋    | 15400/27000 [4:01:43<3:17:59,  1.02s/it]

{'loss': 1.0187, 'grad_norm': 14.848938941955566, 'learning_rate': 8.056364361464176e-06, 'epoch': 2.85}


 57%|█████▋    | 15500/27000 [4:03:19<3:00:47,  1.06it/s]

{'loss': 1.0804, 'grad_norm': 11.9744234085083, 'learning_rate': 7.940213812589018e-06, 'epoch': 2.87}


 58%|█████▊    | 15600/27000 [4:04:52<2:56:14,  1.08it/s]

{'loss': 1.0316, 'grad_norm': 12.387872695922852, 'learning_rate': 7.824352748038924e-06, 'epoch': 2.89}


 58%|█████▊    | 15700/27000 [4:06:24<2:53:26,  1.09it/s]

{'loss': 1.1007, 'grad_norm': 15.612529754638672, 'learning_rate': 7.70879745103896e-06, 'epoch': 2.91}


 59%|█████▊    | 15800/27000 [4:07:57<2:53:59,  1.07it/s]

{'loss': 1.0061, 'grad_norm': 13.438986778259277, 'learning_rate': 7.593564161841318e-06, 'epoch': 2.93}


 59%|█████▉    | 15900/27000 [4:09:30<2:52:35,  1.07it/s]

{'loss': 1.0604, 'grad_norm': 13.66864013671875, 'learning_rate': 7.478669075442917e-06, 'epoch': 2.94}


 59%|█████▉    | 16000/27000 [4:11:03<2:49:31,  1.08it/s]

{'loss': 0.9901, 'grad_norm': 13.061705589294434, 'learning_rate': 7.364128339309326e-06, 'epoch': 2.96}


 60%|█████▉    | 16100/27000 [4:12:37<2:49:56,  1.07it/s]

{'loss': 1.0657, 'grad_norm': 6.378993988037109, 'learning_rate': 7.249958051105383e-06, 'epoch': 2.98}


 60%|██████    | 16200/27000 [4:14:10<2:44:47,  1.09it/s]

{'loss': 0.9877, 'grad_norm': 0.552795946598053, 'learning_rate': 7.136174256432828e-06, 'epoch': 3.0}


                                                         
 60%|██████    | 16200/27000 [4:16:51<2:44:47,  1.09it/s]

{'eval_loss': 1.4153633117675781, 'eval_runtime': 160.5299, 'eval_samples_per_second': 3.738, 'eval_steps_per_second': 3.738, 'epoch': 3.0}


 60%|██████    | 16300/27000 [4:18:31<2:48:14,  1.06it/s]  

{'loss': 0.7206, 'grad_norm': 10.520231246948242, 'learning_rate': 7.022792946575222e-06, 'epoch': 3.02}


 61%|██████    | 16400/27000 [4:20:04<2:41:43,  1.09it/s]

{'loss': 0.713, 'grad_norm': 14.261785507202148, 'learning_rate': 6.909830056250527e-06, 'epoch': 3.04}


 61%|██████    | 16500/27000 [4:21:37<2:43:11,  1.07it/s]

{'loss': 0.6981, 'grad_norm': 8.697927474975586, 'learning_rate': 6.797301461371626e-06, 'epoch': 3.06}


 61%|██████▏   | 16600/27000 [4:23:10<2:44:03,  1.06it/s]

{'loss': 0.7435, 'grad_norm': 9.56253719329834, 'learning_rate': 6.6852229768150976e-06, 'epoch': 3.07}


 62%|██████▏   | 16700/27000 [4:24:42<2:39:19,  1.08it/s]

{'loss': 0.7342, 'grad_norm': 13.521544456481934, 'learning_rate': 6.573610354198587e-06, 'epoch': 3.09}


 62%|██████▏   | 16800/27000 [4:26:15<2:37:16,  1.08it/s]

{'loss': 0.7038, 'grad_norm': 19.65543556213379, 'learning_rate': 6.4624792796670624e-06, 'epoch': 3.11}


 63%|██████▎   | 16900/27000 [4:27:48<2:40:00,  1.05it/s]

{'loss': 0.7664, 'grad_norm': 10.914294242858887, 'learning_rate': 6.35184537168825e-06, 'epoch': 3.13}


 63%|██████▎   | 17000/27000 [4:29:22<2:40:11,  1.04it/s]

{'loss': 0.6726, 'grad_norm': 12.26204776763916, 'learning_rate': 6.241724178857621e-06, 'epoch': 3.15}


 63%|██████▎   | 17100/27000 [4:30:55<2:33:20,  1.08it/s]

{'loss': 0.7483, 'grad_norm': 12.574210166931152, 'learning_rate': 6.132131177713165e-06, 'epoch': 3.17}


 64%|██████▎   | 17200/27000 [4:32:29<2:34:09,  1.06it/s]

{'loss': 0.7473, 'grad_norm': 9.048454284667969, 'learning_rate': 6.023081770560307e-06, 'epoch': 3.19}


 64%|██████▍   | 17300/27000 [4:34:03<2:36:23,  1.03it/s]

{'loss': 0.7482, 'grad_norm': 18.46243667602539, 'learning_rate': 5.9145912833072535e-06, 'epoch': 3.2}


 64%|██████▍   | 17400/27000 [4:35:37<2:26:56,  1.09it/s]

{'loss': 0.7055, 'grad_norm': 10.330591201782227, 'learning_rate': 5.8066749633110675e-06, 'epoch': 3.22}


 65%|██████▍   | 17500/27000 [4:37:09<2:23:01,  1.11it/s]

{'loss': 0.7241, 'grad_norm': 13.054483413696289, 'learning_rate': 5.699347977234799e-06, 'epoch': 3.24}


 65%|██████▌   | 17600/27000 [4:38:39<2:21:41,  1.11it/s]

{'loss': 0.7296, 'grad_norm': 10.58279037475586, 'learning_rate': 5.592625408915939e-06, 'epoch': 3.26}


 66%|██████▌   | 17700/27000 [4:40:12<2:23:30,  1.08it/s]

{'loss': 0.7447, 'grad_norm': 16.58905601501465, 'learning_rate': 5.486522257246538e-06, 'epoch': 3.28}


 66%|██████▌   | 17800/27000 [4:41:45<2:22:35,  1.08it/s]

{'loss': 0.7149, 'grad_norm': 11.241843223571777, 'learning_rate': 5.381053434065229e-06, 'epoch': 3.3}


 66%|██████▋   | 17900/27000 [4:43:19<2:20:58,  1.08it/s]

{'loss': 0.6744, 'grad_norm': 10.685516357421875, 'learning_rate': 5.276233762061507e-06, 'epoch': 3.31}


 67%|██████▋   | 18000/27000 [4:44:52<2:19:51,  1.07it/s]

{'loss': 0.675, 'grad_norm': 12.830891609191895, 'learning_rate': 5.172077972692553e-06, 'epoch': 3.33}


 67%|██████▋   | 18100/27000 [4:46:26<2:19:39,  1.06it/s]

{'loss': 0.7767, 'grad_norm': 14.657520294189453, 'learning_rate': 5.068600704112832e-06, 'epoch': 3.35}


 67%|██████▋   | 18200/27000 [4:47:59<2:17:53,  1.06it/s]

{'loss': 0.6975, 'grad_norm': 22.461076736450195, 'learning_rate': 4.965816499116849e-06, 'epoch': 3.37}


 68%|██████▊   | 18300/27000 [4:49:32<2:18:10,  1.05it/s]

{'loss': 0.7214, 'grad_norm': 9.181120872497559, 'learning_rate': 4.863739803095299e-06, 'epoch': 3.39}


 68%|██████▊   | 18400/27000 [4:51:06<2:12:56,  1.08it/s]

{'loss': 0.7024, 'grad_norm': 8.777721405029297, 'learning_rate': 4.762384962004877e-06, 'epoch': 3.41}


 69%|██████▊   | 18500/27000 [4:52:39<2:12:34,  1.07it/s]

{'loss': 0.7424, 'grad_norm': 14.40735912322998, 'learning_rate': 4.661766220352098e-06, 'epoch': 3.43}


 69%|██████▉   | 18600/27000 [4:54:12<2:08:50,  1.09it/s]

{'loss': 0.7018, 'grad_norm': 8.719531059265137, 'learning_rate': 4.561897719191349e-06, 'epoch': 3.44}


 69%|██████▉   | 18700/27000 [4:55:45<2:08:39,  1.08it/s]

{'loss': 0.7355, 'grad_norm': 7.02386999130249, 'learning_rate': 4.4627934941375185e-06, 'epoch': 3.46}


 70%|██████▉   | 18800/27000 [4:57:18<2:07:58,  1.07it/s]

{'loss': 0.7049, 'grad_norm': 16.721595764160156, 'learning_rate': 4.36446747339338e-06, 'epoch': 3.48}


 70%|███████   | 18900/27000 [4:58:51<2:07:01,  1.06it/s]

{'loss': 0.7125, 'grad_norm': 8.93045711517334, 'learning_rate': 4.2669334757921284e-06, 'epoch': 3.5}


 70%|███████   | 19000/27000 [5:00:24<2:05:31,  1.06it/s]

{'loss': 0.721, 'grad_norm': 13.754283905029297, 'learning_rate': 4.170205208855281e-06, 'epoch': 3.52}


 71%|███████   | 19100/27000 [5:01:58<2:03:49,  1.06it/s]

{'loss': 0.5839, 'grad_norm': 9.662952423095703, 'learning_rate': 4.0742962668661826e-06, 'epoch': 3.54}


 71%|███████   | 19200/27000 [5:03:32<2:01:09,  1.07it/s]

{'loss': 0.7452, 'grad_norm': 9.815534591674805, 'learning_rate': 3.979220128959463e-06, 'epoch': 3.56}


 71%|███████▏  | 19300/27000 [5:05:06<1:59:00,  1.08it/s]

{'loss': 0.6708, 'grad_norm': 10.452494621276855, 'learning_rate': 3.884990157226683e-06, 'epoch': 3.57}


 72%|███████▏  | 19400/27000 [5:06:39<1:57:16,  1.08it/s]

{'loss': 0.7255, 'grad_norm': 15.825014114379883, 'learning_rate': 3.7916195948383817e-06, 'epoch': 3.59}


 72%|███████▏  | 19500/27000 [5:08:14<1:56:17,  1.07it/s]

{'loss': 0.6614, 'grad_norm': 3.70101261138916, 'learning_rate': 3.6991215641828903e-06, 'epoch': 3.61}


 73%|███████▎  | 19600/27000 [5:09:47<1:55:00,  1.07it/s]

{'loss': 0.7702, 'grad_norm': 10.310979843139648, 'learning_rate': 3.607509065022101e-06, 'epoch': 3.63}


 73%|███████▎  | 19700/27000 [5:11:19<1:53:14,  1.07it/s]

{'loss': 0.687, 'grad_norm': 17.446086883544922, 'learning_rate': 3.5167949726644545e-06, 'epoch': 3.65}


 73%|███████▎  | 19800/27000 [5:12:53<1:51:49,  1.07it/s]

{'loss': 0.7466, 'grad_norm': 21.986412048339844, 'learning_rate': 3.4269920361554342e-06, 'epoch': 3.67}


 74%|███████▎  | 19900/27000 [5:14:26<1:51:01,  1.07it/s]

{'loss': 0.6903, 'grad_norm': 15.098955154418945, 'learning_rate': 3.3381128764858195e-06, 'epoch': 3.69}


 74%|███████▍  | 20000/27000 [5:15:59<1:48:09,  1.08it/s]

{'loss': 0.6498, 'grad_norm': 0.29427093267440796, 'learning_rate': 3.250169984817897e-06, 'epoch': 3.7}


 74%|███████▍  | 20100/27000 [5:17:33<1:46:44,  1.08it/s]

{'loss': 0.6938, 'grad_norm': 15.551942825317383, 'learning_rate': 3.163175720729954e-06, 'epoch': 3.72}


 75%|███████▍  | 20200/27000 [5:19:06<1:45:20,  1.08it/s]

{'loss': 0.6998, 'grad_norm': 11.688100814819336, 'learning_rate': 3.0771423104792454e-06, 'epoch': 3.74}


 75%|███████▌  | 20300/27000 [5:20:40<1:47:00,  1.04it/s]

{'loss': 0.7505, 'grad_norm': 17.372678756713867, 'learning_rate': 2.992081845283715e-06, 'epoch': 3.76}


 76%|███████▌  | 20400/27000 [5:22:15<1:43:53,  1.06it/s]

{'loss': 0.7202, 'grad_norm': 14.740959167480469, 'learning_rate': 2.908006279622667e-06, 'epoch': 3.78}


 76%|███████▌  | 20500/27000 [5:23:48<1:40:10,  1.08it/s]

{'loss': 0.6974, 'grad_norm': 17.041240692138672, 'learning_rate': 2.8249274295566875e-06, 'epoch': 3.8}


 76%|███████▋  | 20600/27000 [5:25:21<1:39:23,  1.07it/s]

{'loss': 0.7439, 'grad_norm': 14.35239028930664, 'learning_rate': 2.742856971066996e-06, 'epoch': 3.81}


 77%|███████▋  | 20700/27000 [5:26:55<1:37:50,  1.07it/s]

{'loss': 0.6684, 'grad_norm': 11.53475570678711, 'learning_rate': 2.6618064384144925e-06, 'epoch': 3.83}


 77%|███████▋  | 20800/27000 [5:28:29<1:36:02,  1.08it/s]

{'loss': 0.6923, 'grad_norm': 15.390894889831543, 'learning_rate': 2.58178722251872e-06, 'epoch': 3.85}


 77%|███████▋  | 20900/27000 [5:30:02<1:34:24,  1.08it/s]

{'loss': 0.7455, 'grad_norm': 16.343917846679688, 'learning_rate': 2.502810569356976e-06, 'epoch': 3.87}


 78%|███████▊  | 21000/27000 [5:31:37<1:36:23,  1.04it/s]

{'loss': 0.6727, 'grad_norm': 12.078563690185547, 'learning_rate': 2.4248875783838e-06, 'epoch': 3.89}


 78%|███████▊  | 21100/27000 [5:33:11<1:32:34,  1.06it/s]

{'loss': 0.6735, 'grad_norm': 18.5535945892334, 'learning_rate': 2.3480292009710282e-06, 'epoch': 3.91}


 79%|███████▊  | 21200/27000 [5:34:44<1:30:55,  1.06it/s]

{'loss': 0.6619, 'grad_norm': 22.244855880737305, 'learning_rate': 2.272246238868687e-06, 'epoch': 3.93}


 79%|███████▉  | 21300/27000 [5:36:17<1:28:12,  1.08it/s]

{'loss': 0.6715, 'grad_norm': 12.401668548583984, 'learning_rate': 2.1975493426869155e-06, 'epoch': 3.94}


 79%|███████▉  | 21400/27000 [5:37:50<1:26:38,  1.08it/s]

{'loss': 0.6881, 'grad_norm': 10.957382202148438, 'learning_rate': 2.1239490103990946e-06, 'epoch': 3.96}


 80%|███████▉  | 21500/27000 [5:39:24<1:24:30,  1.08it/s]

{'loss': 0.6567, 'grad_norm': 19.285480499267578, 'learning_rate': 2.0514555858664663e-06, 'epoch': 3.98}


 80%|████████  | 21600/27000 [5:40:58<1:21:06,  1.11it/s]

{'loss': 0.6649, 'grad_norm': 11.771470069885254, 'learning_rate': 1.980079257384405e-06, 'epoch': 4.0}


                                                         
 80%|████████  | 21600/27000 [5:43:38<1:21:06,  1.11it/s]

{'eval_loss': 1.442002534866333, 'eval_runtime': 159.7416, 'eval_samples_per_second': 3.756, 'eval_steps_per_second': 3.756, 'epoch': 4.0}


 80%|████████  | 21700/27000 [5:45:16<1:23:06,  1.06it/s] 

{'loss': 0.5361, 'grad_norm': 11.9911470413208, 'learning_rate': 1.9098300562505266e-06, 'epoch': 4.02}


 81%|████████  | 21800/27000 [5:46:49<1:24:06,  1.03it/s]

{'loss': 0.5007, 'grad_norm': 11.224106788635254, 'learning_rate': 1.8407178553548876e-06, 'epoch': 4.04}


 81%|████████  | 21900/27000 [5:48:22<1:18:32,  1.08it/s]

{'loss': 0.4805, 'grad_norm': 0.6482678055763245, 'learning_rate': 1.772752367792452e-06, 'epoch': 4.06}


 81%|████████▏ | 22000/27000 [5:49:54<1:17:26,  1.08it/s]

{'loss': 0.4507, 'grad_norm': 10.082929611206055, 'learning_rate': 1.7059431454979825e-06, 'epoch': 4.07}


 82%|████████▏ | 22100/27000 [5:51:27<1:16:37,  1.07it/s]

{'loss': 0.4935, 'grad_norm': 8.435874938964844, 'learning_rate': 1.6402995779036146e-06, 'epoch': 4.09}


 82%|████████▏ | 22200/27000 [5:53:01<1:14:57,  1.07it/s]

{'loss': 0.5019, 'grad_norm': 11.697484970092773, 'learning_rate': 1.575830890619261e-06, 'epoch': 4.11}


 83%|████████▎ | 22300/27000 [5:54:34<1:12:31,  1.08it/s]

{'loss': 0.4898, 'grad_norm': 11.167634963989258, 'learning_rate': 1.5125461441360223e-06, 'epoch': 4.13}


 83%|████████▎ | 22400/27000 [5:56:07<1:11:10,  1.08it/s]

{'loss': 0.4887, 'grad_norm': 13.699238777160645, 'learning_rate': 1.450454232552826e-06, 'epoch': 4.15}


 83%|████████▎ | 22500/27000 [5:57:40<1:10:52,  1.06it/s]

{'loss': 0.4606, 'grad_norm': 13.742584228515625, 'learning_rate': 1.3895638823264447e-06, 'epoch': 4.17}


 84%|████████▎ | 22600/27000 [5:59:13<1:08:14,  1.07it/s]

{'loss': 0.4576, 'grad_norm': 13.886234283447266, 'learning_rate': 1.3298836510450597e-06, 'epoch': 4.19}


 84%|████████▍ | 22700/27000 [6:00:47<1:05:35,  1.09it/s]

{'loss': 0.4925, 'grad_norm': 14.223188400268555, 'learning_rate': 1.2714219262255777e-06, 'epoch': 4.2}


 84%|████████▍ | 22800/27000 [6:02:19<1:05:01,  1.08it/s]

{'loss': 0.4976, 'grad_norm': 11.851966857910156, 'learning_rate': 1.214186924134838e-06, 'epoch': 4.22}


 85%|████████▍ | 22900/27000 [6:03:52<1:03:42,  1.07it/s]

{'loss': 0.4849, 'grad_norm': 5.276533603668213, 'learning_rate': 1.158186688634898e-06, 'epoch': 4.24}


 85%|████████▌ | 23000/27000 [6:05:25<1:01:44,  1.08it/s]

{'loss': 0.4841, 'grad_norm': 11.785329818725586, 'learning_rate': 1.1034290900525279e-06, 'epoch': 4.26}


 86%|████████▌ | 23100/27000 [6:06:57<59:37,  1.09it/s]  

{'loss': 0.4664, 'grad_norm': 15.647848129272461, 'learning_rate': 1.0499218240731168e-06, 'epoch': 4.28}


 86%|████████▌ | 23200/27000 [6:08:31<58:52,  1.08it/s]  

{'loss': 0.4756, 'grad_norm': 5.070110321044922, 'learning_rate': 9.976724106591128e-07, 'epoch': 4.3}


 86%|████████▋ | 23300/27000 [6:10:03<56:57,  1.08it/s]

{'loss': 0.4638, 'grad_norm': 11.047628402709961, 'learning_rate': 9.466881929931582e-07, 'epoch': 4.31}


 87%|████████▋ | 23400/27000 [6:11:36<55:33,  1.08it/s]

{'loss': 0.5043, 'grad_norm': 10.530380249023438, 'learning_rate': 8.969763364460682e-07, 'epoch': 4.33}


 87%|████████▋ | 23500/27000 [6:13:09<53:56,  1.08it/s]

{'loss': 0.4694, 'grad_norm': 11.663345336914062, 'learning_rate': 8.485438275698154e-07, 'epoch': 4.35}


 87%|████████▋ | 23600/27000 [6:14:42<52:42,  1.07it/s]

{'loss': 0.4929, 'grad_norm': 8.547643661499023, 'learning_rate': 8.01397473115616e-07, 'epoch': 4.37}


 88%|████████▊ | 23700/27000 [6:16:15<51:28,  1.07it/s]

{'loss': 0.5296, 'grad_norm': 8.469728469848633, 'learning_rate': 7.555438990773134e-07, 'epoch': 4.39}


 88%|████████▊ | 23800/27000 [6:17:48<49:26,  1.08it/s]

{'loss': 0.4622, 'grad_norm': 12.448553085327148, 'learning_rate': 7.109895497601571e-07, 'epoch': 4.41}


 89%|████████▊ | 23900/27000 [6:19:23<49:57,  1.03it/s]

{'loss': 0.4566, 'grad_norm': 7.959648132324219, 'learning_rate': 6.677406868751024e-07, 'epoch': 4.43}


 89%|████████▉ | 24000/27000 [6:20:56<46:14,  1.08it/s]

{'loss': 0.4692, 'grad_norm': 7.910272121429443, 'learning_rate': 6.258033886587899e-07, 'epoch': 4.44}


 89%|████████▉ | 24100/27000 [6:22:29<45:19,  1.07it/s]

{'loss': 0.4714, 'grad_norm': 12.396090507507324, 'learning_rate': 5.851835490193136e-07, 'epoch': 4.46}


 90%|████████▉ | 24200/27000 [6:24:04<45:02,  1.04it/s]

{'loss': 0.4538, 'grad_norm': 12.21224308013916, 'learning_rate': 5.458868767078673e-07, 'epoch': 4.48}


 90%|█████████ | 24300/27000 [6:25:37<41:23,  1.09it/s]

{'loss': 0.5458, 'grad_norm': 8.024529457092285, 'learning_rate': 5.079188945164426e-07, 'epoch': 4.5}


 90%|█████████ | 24400/27000 [6:27:10<40:27,  1.07it/s]

{'loss': 0.4697, 'grad_norm': 9.451026916503906, 'learning_rate': 4.7128493850164715e-07, 'epoch': 4.52}


 91%|█████████ | 24500/27000 [6:28:45<38:30,  1.08it/s]

{'loss': 0.4642, 'grad_norm': 8.657573699951172, 'learning_rate': 4.359901572347758e-07, 'epoch': 4.54}


 91%|█████████ | 24600/27000 [6:30:18<37:14,  1.07it/s]

{'loss': 0.5104, 'grad_norm': 14.94742202758789, 'learning_rate': 4.02039511078216e-07, 'epoch': 4.56}


 91%|█████████▏| 24700/27000 [6:31:54<36:55,  1.04it/s]

{'loss': 0.4737, 'grad_norm': 6.652502059936523, 'learning_rate': 3.6943777148831907e-07, 'epoch': 4.57}


 92%|█████████▏| 24800/27000 [6:33:29<34:43,  1.06it/s]

{'loss': 0.4724, 'grad_norm': 8.239376068115234, 'learning_rate': 3.381895203448182e-07, 'epoch': 4.59}


 92%|█████████▏| 24900/27000 [6:35:04<35:14,  1.01s/it]

{'loss': 0.4885, 'grad_norm': 0.12700557708740234, 'learning_rate': 3.0829914930687767e-07, 'epoch': 4.61}


 93%|█████████▎| 25000/27000 [6:36:39<30:53,  1.08it/s]

{'loss': 0.4747, 'grad_norm': 0.2677971422672272, 'learning_rate': 2.7977085919589253e-07, 'epoch': 4.63}


 93%|█████████▎| 25100/27000 [6:38:13<30:06,  1.05it/s]

{'loss': 0.4551, 'grad_norm': 10.004918098449707, 'learning_rate': 2.5260865940510027e-07, 'epoch': 4.65}


 93%|█████████▎| 25200/27000 [6:39:47<27:40,  1.08it/s]

{'loss': 0.5307, 'grad_norm': 9.768919944763184, 'learning_rate': 2.2681636733609457e-07, 'epoch': 4.67}


 94%|█████████▎| 25300/27000 [6:41:21<26:00,  1.09it/s]

{'loss': 0.5088, 'grad_norm': 13.23923397064209, 'learning_rate': 2.0239760786232355e-07, 'epoch': 4.69}


 94%|█████████▍| 25400/27000 [6:42:54<24:42,  1.08it/s]

{'loss': 0.5014, 'grad_norm': 9.449386596679688, 'learning_rate': 1.793558128196493e-07, 'epoch': 4.7}


 94%|█████████▍| 25500/27000 [6:44:26<23:18,  1.07it/s]

{'loss': 0.4569, 'grad_norm': 12.94646167755127, 'learning_rate': 1.5769422052403172e-07, 'epoch': 4.72}


 95%|█████████▍| 25600/27000 [6:45:58<21:09,  1.10it/s]

{'loss': 0.4688, 'grad_norm': 11.07835865020752, 'learning_rate': 1.3741587531641566e-07, 'epoch': 4.74}


 95%|█████████▌| 25700/27000 [6:47:29<19:46,  1.10it/s]

{'loss': 0.4628, 'grad_norm': 14.924241065979004, 'learning_rate': 1.185236271348722e-07, 'epoch': 4.76}


 96%|█████████▌| 25800/27000 [6:49:09<17:57,  1.11it/s]

{'loss': 0.4868, 'grad_norm': 9.914632797241211, 'learning_rate': 1.0102013111406905e-07, 'epoch': 4.78}


 96%|█████████▌| 25900/27000 [6:50:35<15:11,  1.21it/s]

{'loss': 0.4455, 'grad_norm': 12.173138618469238, 'learning_rate': 8.490784721211454e-08, 'epoch': 4.8}


 96%|█████████▋| 26000/27000 [6:52:01<14:13,  1.17it/s]

{'loss': 0.4683, 'grad_norm': 8.69320011138916, 'learning_rate': 7.018903986483083e-08, 'epoch': 4.81}


 97%|█████████▋| 26100/27000 [6:53:28<13:06,  1.14it/s]

{'loss': 0.5523, 'grad_norm': 8.234060287475586, 'learning_rate': 5.686577766751078e-08, 'epoch': 4.83}


 97%|█████████▋| 26200/27000 [6:54:54<11:37,  1.15it/s]

{'loss': 0.4328, 'grad_norm': 9.778225898742676, 'learning_rate': 4.4939933084192646e-08, 'epoch': 4.85}


 97%|█████████▋| 26300/27000 [6:56:21<10:08,  1.15it/s]

{'loss': 0.3954, 'grad_norm': 16.112401962280273, 'learning_rate': 3.4413182184507285e-08, 'epoch': 4.87}


 98%|█████████▊| 26400/27000 [6:57:48<08:40,  1.15it/s]

{'loss': 0.4545, 'grad_norm': 9.846810340881348, 'learning_rate': 2.528700440811438e-08, 'epoch': 4.89}


 98%|█████████▊| 26500/27000 [6:59:15<07:16,  1.15it/s]

{'loss': 0.4921, 'grad_norm': 8.194111824035645, 'learning_rate': 1.7562682356786488e-08, 'epoch': 4.91}


 99%|█████████▊| 26600/27000 [7:00:42<05:46,  1.15it/s]

{'loss': 0.5019, 'grad_norm': 5.34130859375, 'learning_rate': 1.1241301614147715e-08, 'epoch': 4.93}


 99%|█████████▉| 26700/27000 [7:02:10<04:21,  1.15it/s]

{'loss': 0.4735, 'grad_norm': 6.579595565795898, 'learning_rate': 6.323750593106859e-09, 'epoch': 4.94}


 99%|█████████▉| 26800/27000 [7:03:37<02:53,  1.15it/s]

{'loss': 0.4782, 'grad_norm': 11.933350563049316, 'learning_rate': 2.810720410998391e-09, 'epoch': 4.96}


100%|█████████▉| 26900/27000 [7:05:04<01:26,  1.16it/s]

{'loss': 0.4842, 'grad_norm': 14.280290603637695, 'learning_rate': 7.027047924512698e-10, 'epoch': 4.98}


100%|██████████| 27000/27000 [7:06:31<00:00,  1.20it/s]

{'loss': 0.4788, 'grad_norm': 13.052371978759766, 'learning_rate': 0.0, 'epoch': 5.0}


                                                       
100%|██████████| 27000/27000 [7:09:08<00:00,  1.20it/s]

{'eval_loss': 1.4753512144088745, 'eval_runtime': 152.747, 'eval_samples_per_second': 3.928, 'eval_steps_per_second': 3.928, 'epoch': 5.0}


100%|██████████| 27000/27000 [7:09:13<00:00,  1.05it/s]


{'train_runtime': 25753.337, 'train_samples_per_second': 1.048, 'train_steps_per_second': 1.048, 'train_loss': 1.2166555875142415, 'epoch': 5.0}
