In [114]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import numpy as np
from sklearn.metrics import classification_report


In [115]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")


In [116]:
model_name = "bert-base-uncased" 
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 1433/2295 [02:02<01:01, 13.94it/s]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [117]:
for param in model.bert.parameters():
    param.requires_grad = False  # Freeze all layers

for param in model.bert.encoder.layer[-3:].parameters():
    param.requires_grad = True   # Unfreeze the last two layers

In [118]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [119]:
def preprocess_function(examples):
    # Tokenize the sentence pairs for BERT
    return tokenizer(
        examples['sentence1'], 
        examples['sentence2'], 
        truncation=True, 
        padding="max_length", 
        max_length=128
    )

In [121]:
dataset = load_dataset("glue", "mrpc")

In [122]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [123]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=False,  # Disable mixed precision
    use_mps_device=True  # Use MPS if available
)



In [1]:
from transformers import EvalPrediction
def compute_metrics(p: EvalPrediction):
    predictions = torch.argmax(torch.tensor(p.predictions), dim=-1)
    labels = torch.tensor(p.label_ids)
    accuracy = (predictions == labels).float().mean().item()
    return {"accuracy": accuracy}

  from .autonotebook import tqdm as notebook_tqdm


In [125]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [126]:
trainer.train()


  0%|          | 147/45900 [08:43<45:13:02,  3.56s/it]
 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 1433/2295 [08:24<05:03,  2.84it/s]
  0%|          | 12/22950 [00:00<28:09, 13.57it/s]

{'loss': 0.7697, 'grad_norm': 9.011730194091797, 'learning_rate': 9.995642701525055e-05, 'epoch': 0.02}


  0%|          | 22/22950 [00:01<27:10, 14.06it/s]

{'loss': 0.6097, 'grad_norm': 2.7702386379241943, 'learning_rate': 9.991285403050109e-05, 'epoch': 0.04}


  0%|          | 32/22950 [00:02<27:13, 14.03it/s]

{'loss': 0.6927, 'grad_norm': 1.7381782531738281, 'learning_rate': 9.986928104575164e-05, 'epoch': 0.07}


  0%|          | 42/22950 [00:03<27:10, 14.05it/s]

{'loss': 0.5941, 'grad_norm': 1.8423632383346558, 'learning_rate': 9.982570806100219e-05, 'epoch': 0.09}


  0%|          | 52/22950 [00:03<27:00, 14.13it/s]

{'loss': 0.5846, 'grad_norm': 2.6960787773132324, 'learning_rate': 9.978213507625273e-05, 'epoch': 0.11}


  0%|          | 62/22950 [00:04<26:48, 14.23it/s]

{'loss': 0.6094, 'grad_norm': 2.6984822750091553, 'learning_rate': 9.973856209150328e-05, 'epoch': 0.13}


  0%|          | 72/22950 [00:05<27:10, 14.03it/s]

{'loss': 0.6116, 'grad_norm': 8.26318073272705, 'learning_rate': 9.969498910675382e-05, 'epoch': 0.15}


  0%|          | 82/22950 [00:05<27:22, 13.92it/s]

{'loss': 0.6288, 'grad_norm': 4.552694797515869, 'learning_rate': 9.965141612200437e-05, 'epoch': 0.17}


  0%|          | 92/22950 [00:06<27:25, 13.89it/s]

{'loss': 0.5327, 'grad_norm': 9.21972942352295, 'learning_rate': 9.96078431372549e-05, 'epoch': 0.2}


  0%|          | 102/22950 [00:07<26:55, 14.14it/s]

{'loss': 0.6113, 'grad_norm': 2.601677179336548, 'learning_rate': 9.956427015250545e-05, 'epoch': 0.22}


  0%|          | 112/22950 [00:08<26:54, 14.14it/s]

{'loss': 0.6155, 'grad_norm': 2.143115758895874, 'learning_rate': 9.952069716775599e-05, 'epoch': 0.24}


  1%|          | 122/22950 [00:08<26:55, 14.13it/s]

{'loss': 0.5813, 'grad_norm': 4.181572437286377, 'learning_rate': 9.947712418300654e-05, 'epoch': 0.26}


  1%|          | 132/22950 [00:09<27:12, 13.98it/s]

{'loss': 0.5542, 'grad_norm': 6.465627193450928, 'learning_rate': 9.943355119825708e-05, 'epoch': 0.28}


  1%|          | 142/22950 [00:10<26:53, 14.14it/s]

{'loss': 0.6112, 'grad_norm': 6.595923900604248, 'learning_rate': 9.938997821350763e-05, 'epoch': 0.31}


  1%|          | 152/22950 [00:10<26:58, 14.08it/s]

{'loss': 0.5308, 'grad_norm': 4.151679515838623, 'learning_rate': 9.934640522875818e-05, 'epoch': 0.33}


  1%|          | 162/22950 [00:11<27:01, 14.05it/s]

{'loss': 0.5526, 'grad_norm': 3.1362569332122803, 'learning_rate': 9.930283224400871e-05, 'epoch': 0.35}


  1%|          | 172/22950 [00:12<27:00, 14.06it/s]

{'loss': 0.5267, 'grad_norm': 3.5995051860809326, 'learning_rate': 9.925925925925926e-05, 'epoch': 0.37}


  1%|          | 182/22950 [00:13<26:47, 14.16it/s]

{'loss': 0.5571, 'grad_norm': 3.2346367835998535, 'learning_rate': 9.921568627450981e-05, 'epoch': 0.39}


  1%|          | 192/22950 [00:13<26:59, 14.05it/s]

{'loss': 0.5797, 'grad_norm': 2.6242940425872803, 'learning_rate': 9.917211328976035e-05, 'epoch': 0.41}


  1%|          | 202/22950 [00:14<26:42, 14.20it/s]

{'loss': 0.5099, 'grad_norm': 3.9577698707580566, 'learning_rate': 9.91285403050109e-05, 'epoch': 0.44}


  1%|          | 212/22950 [00:15<26:56, 14.07it/s]

{'loss': 0.5206, 'grad_norm': 4.164804458618164, 'learning_rate': 9.908496732026145e-05, 'epoch': 0.46}


  1%|          | 222/22950 [00:15<26:41, 14.19it/s]

{'loss': 0.5035, 'grad_norm': 3.535187244415283, 'learning_rate': 9.904139433551199e-05, 'epoch': 0.48}


  1%|          | 232/22950 [00:16<26:42, 14.17it/s]

{'loss': 0.4395, 'grad_norm': 3.6939501762390137, 'learning_rate': 9.899782135076254e-05, 'epoch': 0.5}


  1%|          | 242/22950 [00:17<26:45, 14.14it/s]

{'loss': 0.4749, 'grad_norm': 4.332082271575928, 'learning_rate': 9.895424836601307e-05, 'epoch': 0.52}


  1%|          | 252/22950 [00:17<26:37, 14.20it/s]

{'loss': 0.4403, 'grad_norm': 2.8594253063201904, 'learning_rate': 9.891067538126362e-05, 'epoch': 0.54}


  1%|          | 262/22950 [00:18<26:35, 14.22it/s]

{'loss': 0.4641, 'grad_norm': 7.0564751625061035, 'learning_rate': 9.886710239651416e-05, 'epoch': 0.57}


  1%|          | 272/22950 [00:19<26:52, 14.07it/s]

{'loss': 0.4431, 'grad_norm': 2.6310949325561523, 'learning_rate': 9.882352941176471e-05, 'epoch': 0.59}


  1%|          | 282/22950 [00:20<26:39, 14.17it/s]

{'loss': 0.452, 'grad_norm': 7.98635721206665, 'learning_rate': 9.877995642701525e-05, 'epoch': 0.61}


  1%|‚ñè         | 292/22950 [00:20<26:39, 14.17it/s]

{'loss': 0.5564, 'grad_norm': 3.358661651611328, 'learning_rate': 9.87363834422658e-05, 'epoch': 0.63}


  1%|‚ñè         | 302/22950 [00:21<26:32, 14.22it/s]

{'loss': 0.4024, 'grad_norm': 3.23071551322937, 'learning_rate': 9.869281045751635e-05, 'epoch': 0.65}


  1%|‚ñè         | 312/22950 [00:22<26:39, 14.15it/s]

{'loss': 0.4638, 'grad_norm': 4.191335678100586, 'learning_rate': 9.864923747276688e-05, 'epoch': 0.68}


  1%|‚ñè         | 322/22950 [00:22<26:42, 14.12it/s]

{'loss': 0.4521, 'grad_norm': 5.107395648956299, 'learning_rate': 9.860566448801743e-05, 'epoch': 0.7}


  1%|‚ñè         | 332/22950 [00:23<26:30, 14.22it/s]

{'loss': 0.403, 'grad_norm': 4.972215175628662, 'learning_rate': 9.856209150326798e-05, 'epoch': 0.72}


  1%|‚ñè         | 342/22950 [00:24<26:36, 14.16it/s]

{'loss': 0.6902, 'grad_norm': 6.56525182723999, 'learning_rate': 9.851851851851852e-05, 'epoch': 0.74}


  2%|‚ñè         | 352/22950 [00:25<26:33, 14.18it/s]

{'loss': 0.4902, 'grad_norm': 6.459534645080566, 'learning_rate': 9.847494553376907e-05, 'epoch': 0.76}


  2%|‚ñè         | 362/22950 [00:25<26:32, 14.18it/s]

{'loss': 0.5493, 'grad_norm': 4.491764545440674, 'learning_rate': 9.843137254901961e-05, 'epoch': 0.78}


  2%|‚ñè         | 372/22950 [00:26<26:33, 14.17it/s]

{'loss': 0.4763, 'grad_norm': 2.4666314125061035, 'learning_rate': 9.838779956427016e-05, 'epoch': 0.81}


  2%|‚ñè         | 382/22950 [00:27<26:35, 14.15it/s]

{'loss': 0.3833, 'grad_norm': 3.4491934776306152, 'learning_rate': 9.834422657952071e-05, 'epoch': 0.83}


  2%|‚ñè         | 392/22950 [00:27<26:29, 14.20it/s]

{'loss': 0.3921, 'grad_norm': 5.313027381896973, 'learning_rate': 9.830065359477125e-05, 'epoch': 0.85}


  2%|‚ñè         | 402/22950 [00:28<26:29, 14.18it/s]

{'loss': 0.3933, 'grad_norm': 7.769924163818359, 'learning_rate': 9.82570806100218e-05, 'epoch': 0.87}


  2%|‚ñè         | 412/22950 [00:29<26:34, 14.13it/s]

{'loss': 0.4049, 'grad_norm': 6.850612640380859, 'learning_rate': 9.821350762527233e-05, 'epoch': 0.89}


  2%|‚ñè         | 422/22950 [00:29<26:31, 14.15it/s]

{'loss': 0.6573, 'grad_norm': 15.123741149902344, 'learning_rate': 9.816993464052288e-05, 'epoch': 0.92}


  2%|‚ñè         | 432/22950 [00:30<26:27, 14.19it/s]

{'loss': 0.4347, 'grad_norm': 3.299757957458496, 'learning_rate': 9.812636165577342e-05, 'epoch': 0.94}


  2%|‚ñè         | 442/22950 [00:31<26:17, 14.27it/s]

{'loss': 0.5775, 'grad_norm': 8.06278133392334, 'learning_rate': 9.808278867102397e-05, 'epoch': 0.96}


  2%|‚ñè         | 452/22950 [00:32<26:21, 14.23it/s]

{'loss': 0.628, 'grad_norm': 2.402723789215088, 'learning_rate': 9.80392156862745e-05, 'epoch': 0.98}


                                                   
  2%|‚ñè         | 459/22950 [00:34<26:18, 14.25it/s]

{'eval_loss': 0.4033973217010498, 'eval_accuracy': 0.813725471496582, 'eval_runtime': 2.1433, 'eval_samples_per_second': 190.365, 'eval_steps_per_second': 23.796, 'epoch': 1.0}


  2%|‚ñè         | 462/22950 [00:35<2:07:28,  2.94it/s]

{'loss': 0.4476, 'grad_norm': 3.0554356575012207, 'learning_rate': 9.799564270152506e-05, 'epoch': 1.0}


  2%|‚ñè         | 472/22950 [00:36<43:29,  8.62it/s]  

{'loss': 0.3571, 'grad_norm': 4.225066661834717, 'learning_rate': 9.79520697167756e-05, 'epoch': 1.02}


  2%|‚ñè         | 482/22950 [00:36<29:18, 12.77it/s]

{'loss': 0.377, 'grad_norm': 14.745720863342285, 'learning_rate': 9.790849673202614e-05, 'epoch': 1.05}


  2%|‚ñè         | 492/22950 [00:37<27:05, 13.82it/s]

{'loss': 0.3972, 'grad_norm': 7.7601213455200195, 'learning_rate': 9.786492374727669e-05, 'epoch': 1.07}


  2%|‚ñè         | 502/22950 [00:38<26:41, 14.01it/s]

{'loss': 0.3885, 'grad_norm': 3.1197614669799805, 'learning_rate': 9.782135076252724e-05, 'epoch': 1.09}


  2%|‚ñè         | 512/22950 [00:38<26:30, 14.11it/s]

{'loss': 0.3252, 'grad_norm': 3.2256836891174316, 'learning_rate': 9.777777777777778e-05, 'epoch': 1.11}


  2%|‚ñè         | 522/22950 [00:39<26:32, 14.08it/s]

{'loss': 0.395, 'grad_norm': 3.5215258598327637, 'learning_rate': 9.773420479302833e-05, 'epoch': 1.13}


  2%|‚ñè         | 532/22950 [00:40<26:26, 14.13it/s]

{'loss': 0.4059, 'grad_norm': 9.710822105407715, 'learning_rate': 9.769063180827888e-05, 'epoch': 1.15}


  2%|‚ñè         | 542/22950 [00:41<26:28, 14.11it/s]

{'loss': 0.2975, 'grad_norm': 3.4651880264282227, 'learning_rate': 9.764705882352942e-05, 'epoch': 1.18}


  2%|‚ñè         | 552/22950 [00:41<26:27, 14.11it/s]

{'loss': 0.3966, 'grad_norm': 5.133207321166992, 'learning_rate': 9.760348583877997e-05, 'epoch': 1.2}


  2%|‚ñè         | 562/22950 [00:42<26:35, 14.04it/s]

{'loss': 0.3936, 'grad_norm': 5.293666362762451, 'learning_rate': 9.755991285403052e-05, 'epoch': 1.22}


  2%|‚ñè         | 572/22950 [00:43<26:31, 14.06it/s]

{'loss': 0.387, 'grad_norm': 4.337482929229736, 'learning_rate': 9.751633986928105e-05, 'epoch': 1.24}


  3%|‚ñé         | 582/22950 [00:43<26:32, 14.04it/s]

{'loss': 0.357, 'grad_norm': 3.292640209197998, 'learning_rate': 9.747276688453159e-05, 'epoch': 1.26}


  3%|‚ñé         | 592/22950 [00:44<26:35, 14.01it/s]

{'loss': 0.3769, 'grad_norm': 11.342426300048828, 'learning_rate': 9.742919389978214e-05, 'epoch': 1.29}


  3%|‚ñé         | 602/22950 [00:45<26:26, 14.08it/s]

{'loss': 0.2542, 'grad_norm': 0.8411110639572144, 'learning_rate': 9.738562091503268e-05, 'epoch': 1.31}


  3%|‚ñé         | 612/22950 [00:46<26:27, 14.08it/s]

{'loss': 0.1918, 'grad_norm': 4.406158447265625, 'learning_rate': 9.734204793028323e-05, 'epoch': 1.33}


  3%|‚ñé         | 622/22950 [00:46<26:28, 14.06it/s]

{'loss': 0.3294, 'grad_norm': 5.197004318237305, 'learning_rate': 9.729847494553376e-05, 'epoch': 1.35}


  3%|‚ñé         | 632/22950 [00:47<26:21, 14.12it/s]

{'loss': 0.2961, 'grad_norm': 0.44478365778923035, 'learning_rate': 9.725490196078431e-05, 'epoch': 1.37}


  3%|‚ñé         | 642/22950 [00:48<26:21, 14.11it/s]

{'loss': 0.4949, 'grad_norm': 4.0631561279296875, 'learning_rate': 9.721132897603486e-05, 'epoch': 1.39}


  3%|‚ñé         | 652/22950 [00:48<26:17, 14.13it/s]

{'loss': 0.3074, 'grad_norm': 8.956859588623047, 'learning_rate': 9.71677559912854e-05, 'epoch': 1.42}


  3%|‚ñé         | 662/22950 [00:49<26:29, 14.02it/s]

{'loss': 0.2398, 'grad_norm': 3.46256160736084, 'learning_rate': 9.712418300653595e-05, 'epoch': 1.44}


  3%|‚ñé         | 672/22950 [00:50<26:20, 14.10it/s]

{'loss': 0.3667, 'grad_norm': 8.167367935180664, 'learning_rate': 9.70806100217865e-05, 'epoch': 1.46}


  3%|‚ñé         | 682/22950 [00:50<26:14, 14.14it/s]

{'loss': 0.5409, 'grad_norm': 5.6773481369018555, 'learning_rate': 9.703703703703704e-05, 'epoch': 1.48}


  3%|‚ñé         | 692/22950 [00:51<26:26, 14.03it/s]

{'loss': 0.2892, 'grad_norm': 12.890181541442871, 'learning_rate': 9.699346405228759e-05, 'epoch': 1.5}


  3%|‚ñé         | 702/22950 [00:52<26:18, 14.09it/s]

{'loss': 0.3372, 'grad_norm': 11.03884506225586, 'learning_rate': 9.694989106753814e-05, 'epoch': 1.53}


  3%|‚ñé         | 712/22950 [00:53<26:10, 14.16it/s]

{'loss': 0.3751, 'grad_norm': 6.980913162231445, 'learning_rate': 9.690631808278868e-05, 'epoch': 1.55}


  3%|‚ñé         | 722/22950 [00:53<26:12, 14.13it/s]

{'loss': 0.3522, 'grad_norm': 0.8523093461990356, 'learning_rate': 9.686274509803923e-05, 'epoch': 1.57}


  3%|‚ñé         | 732/22950 [00:54<26:21, 14.05it/s]

{'loss': 0.6914, 'grad_norm': 7.355917930603027, 'learning_rate': 9.681917211328978e-05, 'epoch': 1.59}


  3%|‚ñé         | 742/22950 [00:55<26:05, 14.19it/s]

{'loss': 0.4824, 'grad_norm': 7.4401021003723145, 'learning_rate': 9.677559912854031e-05, 'epoch': 1.61}


  3%|‚ñé         | 752/22950 [00:55<26:08, 14.15it/s]

{'loss': 0.3362, 'grad_norm': 9.598684310913086, 'learning_rate': 9.673202614379085e-05, 'epoch': 1.63}


  3%|‚ñé         | 762/22950 [00:56<26:11, 14.12it/s]

{'loss': 0.3722, 'grad_norm': 11.185385704040527, 'learning_rate': 9.66884531590414e-05, 'epoch': 1.66}


  3%|‚ñé         | 772/22950 [00:57<26:07, 14.15it/s]

{'loss': 0.3355, 'grad_norm': 10.332694053649902, 'learning_rate': 9.664488017429194e-05, 'epoch': 1.68}


  3%|‚ñé         | 782/22950 [00:58<26:16, 14.07it/s]

{'loss': 0.3935, 'grad_norm': 1.469797134399414, 'learning_rate': 9.660130718954249e-05, 'epoch': 1.7}


  3%|‚ñé         | 792/22950 [00:58<26:12, 14.09it/s]

{'loss': 0.4025, 'grad_norm': 8.866989135742188, 'learning_rate': 9.655773420479304e-05, 'epoch': 1.72}


  3%|‚ñé         | 802/22950 [00:59<26:16, 14.05it/s]

{'loss': 0.2643, 'grad_norm': 5.117719650268555, 'learning_rate': 9.651416122004357e-05, 'epoch': 1.74}


  4%|‚ñé         | 812/22950 [01:00<26:13, 14.07it/s]

{'loss': 0.314, 'grad_norm': 0.8386020064353943, 'learning_rate': 9.647058823529412e-05, 'epoch': 1.76}


  4%|‚ñé         | 822/22950 [01:00<26:17, 14.03it/s]

{'loss': 0.3784, 'grad_norm': 4.924105644226074, 'learning_rate': 9.642701525054467e-05, 'epoch': 1.79}


  4%|‚ñé         | 832/22950 [01:01<26:20, 13.99it/s]

{'loss': 0.4421, 'grad_norm': 1.8329886198043823, 'learning_rate': 9.638344226579521e-05, 'epoch': 1.81}


  4%|‚ñé         | 842/22950 [01:02<26:23, 13.96it/s]

{'loss': 0.3391, 'grad_norm': 6.609942436218262, 'learning_rate': 9.633986928104576e-05, 'epoch': 1.83}


  4%|‚ñé         | 852/22950 [01:03<26:21, 13.97it/s]

{'loss': 0.3561, 'grad_norm': 4.048640251159668, 'learning_rate': 9.62962962962963e-05, 'epoch': 1.85}


  4%|‚ñç         | 862/22950 [01:03<26:06, 14.10it/s]

{'loss': 0.3505, 'grad_norm': 2.6093928813934326, 'learning_rate': 9.625272331154685e-05, 'epoch': 1.87}


  4%|‚ñç         | 872/22950 [01:04<26:08, 14.08it/s]

{'loss': 0.2404, 'grad_norm': 3.240705728530884, 'learning_rate': 9.62091503267974e-05, 'epoch': 1.9}


  4%|‚ñç         | 882/22950 [01:05<26:03, 14.11it/s]

{'loss': 0.3312, 'grad_norm': 10.022309303283691, 'learning_rate': 9.616557734204793e-05, 'epoch': 1.92}


  4%|‚ñç         | 892/22950 [01:05<26:00, 14.14it/s]

{'loss': 0.3158, 'grad_norm': 4.2502827644348145, 'learning_rate': 9.612200435729848e-05, 'epoch': 1.94}


  4%|‚ñç         | 902/22950 [01:06<26:04, 14.09it/s]

{'loss': 0.4745, 'grad_norm': 10.781002044677734, 'learning_rate': 9.607843137254903e-05, 'epoch': 1.96}


  4%|‚ñç         | 912/22950 [01:07<26:13, 14.00it/s]

{'loss': 0.4388, 'grad_norm': 15.389747619628906, 'learning_rate': 9.603485838779957e-05, 'epoch': 1.98}


                                                   
  4%|‚ñç         | 918/22950 [01:09<24:42, 14.86it/s]

{'eval_loss': 0.3803345859050751, 'eval_accuracy': 0.8602941036224365, 'eval_runtime': 2.1482, 'eval_samples_per_second': 189.931, 'eval_steps_per_second': 23.741, 'epoch': 2.0}


  4%|‚ñç         | 922/22950 [01:10<2:05:10,  2.93it/s]

{'loss': 0.3139, 'grad_norm': 6.630633354187012, 'learning_rate': 9.599128540305011e-05, 'epoch': 2.0}


  4%|‚ñç         | 932/22950 [01:11<42:41,  8.60it/s]  

{'loss': 0.2951, 'grad_norm': 1.6825692653656006, 'learning_rate': 9.594771241830066e-05, 'epoch': 2.03}


  4%|‚ñç         | 942/22950 [01:12<29:09, 12.58it/s]

{'loss': 0.189, 'grad_norm': 1.3956886529922485, 'learning_rate': 9.59041394335512e-05, 'epoch': 2.05}


  4%|‚ñç         | 952/22950 [01:12<26:32, 13.82it/s]

{'loss': 0.2197, 'grad_norm': 0.5981944799423218, 'learning_rate': 9.586056644880174e-05, 'epoch': 2.07}


  4%|‚ñç         | 962/22950 [01:13<26:02, 14.07it/s]

{'loss': 0.2491, 'grad_norm': 4.241788864135742, 'learning_rate': 9.58169934640523e-05, 'epoch': 2.09}


  4%|‚ñç         | 972/22950 [01:14<25:54, 14.14it/s]

{'loss': 0.2371, 'grad_norm': 0.8236940503120422, 'learning_rate': 9.577342047930283e-05, 'epoch': 2.11}


  4%|‚ñç         | 982/22950 [01:14<25:51, 14.15it/s]

{'loss': 0.3772, 'grad_norm': 17.92857551574707, 'learning_rate': 9.572984749455338e-05, 'epoch': 2.14}


  4%|‚ñç         | 992/22950 [01:15<26:04, 14.04it/s]

{'loss': 0.2687, 'grad_norm': 4.673656463623047, 'learning_rate': 9.568627450980393e-05, 'epoch': 2.16}


  4%|‚ñç         | 1002/22950 [01:16<25:49, 14.16it/s]

{'loss': 0.1492, 'grad_norm': 4.422848224639893, 'learning_rate': 9.564270152505447e-05, 'epoch': 2.18}


  4%|‚ñç         | 1012/22950 [01:16<25:50, 14.15it/s]

{'loss': 0.2746, 'grad_norm': 17.428939819335938, 'learning_rate': 9.559912854030502e-05, 'epoch': 2.2}


  4%|‚ñç         | 1022/22950 [01:17<25:54, 14.11it/s]

{'loss': 0.1732, 'grad_norm': 9.753844261169434, 'learning_rate': 9.555555555555557e-05, 'epoch': 2.22}


  4%|‚ñç         | 1032/22950 [01:18<25:56, 14.08it/s]

{'loss': 0.3547, 'grad_norm': 11.244794845581055, 'learning_rate': 9.55119825708061e-05, 'epoch': 2.24}


  5%|‚ñç         | 1042/22950 [01:19<26:01, 14.03it/s]

{'loss': 0.1788, 'grad_norm': 10.786327362060547, 'learning_rate': 9.546840958605666e-05, 'epoch': 2.27}


  5%|‚ñç         | 1052/22950 [01:19<25:50, 14.13it/s]

{'loss': 0.3243, 'grad_norm': 8.625617027282715, 'learning_rate': 9.54248366013072e-05, 'epoch': 2.29}


  5%|‚ñç         | 1062/22950 [01:20<25:44, 14.17it/s]

{'loss': 0.3912, 'grad_norm': 12.702516555786133, 'learning_rate': 9.538126361655774e-05, 'epoch': 2.31}


  5%|‚ñç         | 1072/22950 [01:21<25:43, 14.17it/s]

{'loss': 0.3359, 'grad_norm': 2.0616278648376465, 'learning_rate': 9.533769063180828e-05, 'epoch': 2.33}


  5%|‚ñç         | 1082/22950 [01:21<25:41, 14.19it/s]

{'loss': 0.3129, 'grad_norm': 8.294937133789062, 'learning_rate': 9.529411764705883e-05, 'epoch': 2.35}


  5%|‚ñç         | 1092/22950 [01:22<25:49, 14.11it/s]

{'loss': 0.2801, 'grad_norm': 22.191526412963867, 'learning_rate': 9.525054466230937e-05, 'epoch': 2.37}


  5%|‚ñç         | 1102/22950 [01:23<25:55, 14.05it/s]

{'loss': 0.2165, 'grad_norm': 1.6277053356170654, 'learning_rate': 9.520697167755992e-05, 'epoch': 2.4}


  5%|‚ñç         | 1112/22950 [01:24<25:46, 14.12it/s]

{'loss': 0.1742, 'grad_norm': 3.6550817489624023, 'learning_rate': 9.516339869281045e-05, 'epoch': 2.42}


  5%|‚ñç         | 1122/22950 [01:24<25:48, 14.09it/s]

{'loss': 0.3791, 'grad_norm': 9.425207138061523, 'learning_rate': 9.5119825708061e-05, 'epoch': 2.44}


  5%|‚ñç         | 1132/22950 [01:25<25:48, 14.09it/s]

{'loss': 0.1329, 'grad_norm': 2.982245445251465, 'learning_rate': 9.507625272331155e-05, 'epoch': 2.46}


  5%|‚ñç         | 1142/22950 [01:26<25:54, 14.02it/s]

{'loss': 0.1704, 'grad_norm': 13.260034561157227, 'learning_rate': 9.503267973856209e-05, 'epoch': 2.48}


  5%|‚ñå         | 1152/22950 [01:26<25:48, 14.08it/s]

{'loss': 0.4565, 'grad_norm': 11.843381881713867, 'learning_rate': 9.498910675381264e-05, 'epoch': 2.51}


  5%|‚ñå         | 1162/22950 [01:27<25:43, 14.12it/s]

{'loss': 0.3791, 'grad_norm': 0.3390936851501465, 'learning_rate': 9.494553376906319e-05, 'epoch': 2.53}


  5%|‚ñå         | 1172/22950 [01:28<25:43, 14.11it/s]

{'loss': 0.1787, 'grad_norm': 9.567461967468262, 'learning_rate': 9.490196078431373e-05, 'epoch': 2.55}


  5%|‚ñå         | 1182/22950 [01:29<25:31, 14.22it/s]

{'loss': 0.2161, 'grad_norm': 10.226404190063477, 'learning_rate': 9.485838779956428e-05, 'epoch': 2.57}


  5%|‚ñå         | 1192/22950 [01:29<25:38, 14.15it/s]

{'loss': 0.1805, 'grad_norm': 1.98470938205719, 'learning_rate': 9.481481481481483e-05, 'epoch': 2.59}


  5%|‚ñå         | 1202/22950 [01:30<25:50, 14.03it/s]

{'loss': 0.2346, 'grad_norm': 15.627663612365723, 'learning_rate': 9.477124183006536e-05, 'epoch': 2.61}


  5%|‚ñå         | 1212/22950 [01:31<25:40, 14.11it/s]

{'loss': 0.2819, 'grad_norm': 0.22536088526248932, 'learning_rate': 9.472766884531591e-05, 'epoch': 2.64}


  5%|‚ñå         | 1222/22950 [01:31<25:47, 14.04it/s]

{'loss': 0.163, 'grad_norm': 1.1900253295898438, 'learning_rate': 9.468409586056646e-05, 'epoch': 2.66}


  5%|‚ñå         | 1232/22950 [01:32<25:52, 13.99it/s]

{'loss': 0.2951, 'grad_norm': 3.3752617835998535, 'learning_rate': 9.4640522875817e-05, 'epoch': 2.68}


  5%|‚ñå         | 1242/22950 [01:33<25:42, 14.08it/s]

{'loss': 0.0978, 'grad_norm': 0.1447405368089676, 'learning_rate': 9.459694989106754e-05, 'epoch': 2.7}


  5%|‚ñå         | 1252/22950 [01:34<25:47, 14.02it/s]

{'loss': 0.3176, 'grad_norm': 31.347076416015625, 'learning_rate': 9.455337690631809e-05, 'epoch': 2.72}


  5%|‚ñå         | 1262/22950 [01:34<25:43, 14.05it/s]

{'loss': 0.2804, 'grad_norm': 2.9308934211730957, 'learning_rate': 9.450980392156862e-05, 'epoch': 2.75}


  6%|‚ñå         | 1272/22950 [01:35<25:50, 13.98it/s]

{'loss': 0.3031, 'grad_norm': 18.61043930053711, 'learning_rate': 9.446623093681917e-05, 'epoch': 2.77}


  6%|‚ñå         | 1282/22950 [01:36<26:02, 13.87it/s]

{'loss': 0.3934, 'grad_norm': 5.417897701263428, 'learning_rate': 9.442265795206972e-05, 'epoch': 2.79}


  6%|‚ñå         | 1292/22950 [01:36<26:00, 13.88it/s]

{'loss': 0.202, 'grad_norm': 8.849499702453613, 'learning_rate': 9.437908496732026e-05, 'epoch': 2.81}


  6%|‚ñå         | 1302/22950 [01:37<26:03, 13.84it/s]

{'loss': 0.2451, 'grad_norm': 0.3564208447933197, 'learning_rate': 9.433551198257081e-05, 'epoch': 2.83}


  6%|‚ñå         | 1312/22950 [01:38<25:58, 13.88it/s]

{'loss': 0.3539, 'grad_norm': 7.487433433532715, 'learning_rate': 9.429193899782136e-05, 'epoch': 2.85}


  6%|‚ñå         | 1322/22950 [01:39<25:53, 13.92it/s]

{'loss': 0.2991, 'grad_norm': 40.068641662597656, 'learning_rate': 9.42483660130719e-05, 'epoch': 2.88}


  6%|‚ñå         | 1332/22950 [01:39<25:52, 13.92it/s]

{'loss': 0.3385, 'grad_norm': 13.591753959655762, 'learning_rate': 9.420479302832245e-05, 'epoch': 2.9}


  6%|‚ñå         | 1342/22950 [01:40<25:50, 13.94it/s]

{'loss': 0.1593, 'grad_norm': 11.553074836730957, 'learning_rate': 9.416122004357298e-05, 'epoch': 2.92}


  6%|‚ñå         | 1352/22950 [01:41<25:56, 13.88it/s]

{'loss': 0.5234, 'grad_norm': 4.297688961029053, 'learning_rate': 9.411764705882353e-05, 'epoch': 2.94}


  6%|‚ñå         | 1362/22950 [01:41<26:04, 13.80it/s]

{'loss': 0.3245, 'grad_norm': 3.2293243408203125, 'learning_rate': 9.407407407407408e-05, 'epoch': 2.96}


  6%|‚ñå         | 1372/22950 [01:42<25:59, 13.84it/s]

{'loss': 0.137, 'grad_norm': 1.2273073196411133, 'learning_rate': 9.403050108932462e-05, 'epoch': 2.98}


                                                    
  6%|‚ñå         | 1377/22950 [01:45<25:48, 13.93it/s]

{'eval_loss': 0.5336281061172485, 'eval_accuracy': 0.8308823704719543, 'eval_runtime': 2.1344, 'eval_samples_per_second': 191.156, 'eval_steps_per_second': 23.895, 'epoch': 3.0}


  6%|‚ñå         | 1382/22950 [01:46<1:35:35,  3.76it/s]

{'loss': 0.2188, 'grad_norm': 0.4966100752353668, 'learning_rate': 9.398692810457517e-05, 'epoch': 3.01}


  6%|‚ñå         | 1392/22950 [01:46<37:34,  9.56it/s]  

{'loss': 0.1252, 'grad_norm': 1.0261492729187012, 'learning_rate': 9.394335511982572e-05, 'epoch': 3.03}


  6%|‚ñå         | 1402/22950 [01:47<27:48, 12.91it/s]

{'loss': 0.1565, 'grad_norm': 24.926742553710938, 'learning_rate': 9.389978213507626e-05, 'epoch': 3.05}


  6%|‚ñå         | 1412/22950 [01:48<26:10, 13.71it/s]

{'loss': 0.3827, 'grad_norm': 0.7065860033035278, 'learning_rate': 9.38562091503268e-05, 'epoch': 3.07}


  6%|‚ñå         | 1422/22950 [01:48<25:51, 13.88it/s]

{'loss': 0.0875, 'grad_norm': 0.2702427804470062, 'learning_rate': 9.381263616557735e-05, 'epoch': 3.09}


  6%|‚ñå         | 1432/22950 [01:49<25:50, 13.88it/s]

{'loss': 0.0045, 'grad_norm': 0.17771652340888977, 'learning_rate': 9.376906318082788e-05, 'epoch': 3.12}


  6%|‚ñã         | 1442/22950 [01:50<25:30, 14.06it/s]

{'loss': 0.2041, 'grad_norm': 4.954220771789551, 'learning_rate': 9.372549019607843e-05, 'epoch': 3.14}


  6%|‚ñã         | 1452/22950 [01:51<25:39, 13.96it/s]

{'loss': 0.128, 'grad_norm': 0.09771336615085602, 'learning_rate': 9.368191721132898e-05, 'epoch': 3.16}


  6%|‚ñã         | 1462/22950 [01:51<25:45, 13.91it/s]

{'loss': 0.2705, 'grad_norm': 0.10588280111551285, 'learning_rate': 9.363834422657952e-05, 'epoch': 3.18}


  6%|‚ñã         | 1472/22950 [01:52<25:38, 13.96it/s]

{'loss': 0.1557, 'grad_norm': 0.18396155536174774, 'learning_rate': 9.359477124183007e-05, 'epoch': 3.2}


  6%|‚ñã         | 1482/22950 [01:53<25:34, 13.99it/s]

{'loss': 0.1498, 'grad_norm': 8.632065773010254, 'learning_rate': 9.355119825708062e-05, 'epoch': 3.22}


  7%|‚ñã         | 1492/22950 [01:53<25:28, 14.04it/s]

{'loss': 0.0099, 'grad_norm': 0.28565940260887146, 'learning_rate': 9.350762527233116e-05, 'epoch': 3.25}


  7%|‚ñã         | 1502/22950 [01:54<25:38, 13.94it/s]

{'loss': 0.1683, 'grad_norm': 0.2998894453048706, 'learning_rate': 9.34640522875817e-05, 'epoch': 3.27}


  7%|‚ñã         | 1512/22950 [01:55<25:35, 13.96it/s]

{'loss': 0.0571, 'grad_norm': 0.19831708073616028, 'learning_rate': 9.342047930283226e-05, 'epoch': 3.29}


  7%|‚ñã         | 1522/22950 [01:56<25:24, 14.05it/s]

{'loss': 0.3104, 'grad_norm': 29.206764221191406, 'learning_rate': 9.337690631808279e-05, 'epoch': 3.31}


  7%|‚ñã         | 1532/22950 [01:56<25:36, 13.94it/s]

{'loss': 0.215, 'grad_norm': 6.769792556762695, 'learning_rate': 9.333333333333334e-05, 'epoch': 3.33}


  7%|‚ñã         | 1542/22950 [01:57<25:37, 13.92it/s]

{'loss': 0.2392, 'grad_norm': 23.107769012451172, 'learning_rate': 9.32897603485839e-05, 'epoch': 3.36}


  7%|‚ñã         | 1552/22950 [01:58<25:32, 13.96it/s]

{'loss': 0.1622, 'grad_norm': 8.476014137268066, 'learning_rate': 9.324618736383443e-05, 'epoch': 3.38}


  7%|‚ñã         | 1562/22950 [01:58<25:32, 13.96it/s]

{'loss': 0.3224, 'grad_norm': 5.182549476623535, 'learning_rate': 9.320261437908498e-05, 'epoch': 3.4}


  7%|‚ñã         | 1572/22950 [01:59<25:26, 14.00it/s]

{'loss': 0.1751, 'grad_norm': 5.785684108734131, 'learning_rate': 9.315904139433552e-05, 'epoch': 3.42}


  7%|‚ñã         | 1582/22950 [02:00<25:13, 14.12it/s]

{'loss': 0.1356, 'grad_norm': 1.1866304874420166, 'learning_rate': 9.311546840958605e-05, 'epoch': 3.44}


  7%|‚ñã         | 1592/22950 [02:01<25:16, 14.08it/s]

{'loss': 0.1463, 'grad_norm': 0.12576501071453094, 'learning_rate': 9.30718954248366e-05, 'epoch': 3.46}


  7%|‚ñã         | 1602/22950 [02:01<25:22, 14.02it/s]

{'loss': 0.1434, 'grad_norm': 0.12180451303720474, 'learning_rate': 9.302832244008714e-05, 'epoch': 3.49}


  7%|‚ñã         | 1612/22950 [02:02<25:18, 14.05it/s]

{'loss': 0.2263, 'grad_norm': 8.715578079223633, 'learning_rate': 9.298474945533769e-05, 'epoch': 3.51}


  7%|‚ñã         | 1622/22950 [02:03<25:23, 14.00it/s]

{'loss': 0.2003, 'grad_norm': 1.0581095218658447, 'learning_rate': 9.294117647058824e-05, 'epoch': 3.53}


  7%|‚ñã         | 1632/22950 [02:03<25:18, 14.04it/s]

{'loss': 0.1719, 'grad_norm': 38.57347106933594, 'learning_rate': 9.289760348583878e-05, 'epoch': 3.55}


  7%|‚ñã         | 1642/22950 [02:04<25:21, 14.00it/s]

{'loss': 0.1444, 'grad_norm': 0.9553351998329163, 'learning_rate': 9.285403050108933e-05, 'epoch': 3.57}


  7%|‚ñã         | 1652/22950 [02:05<25:16, 14.05it/s]

{'loss': 0.2534, 'grad_norm': 5.928398609161377, 'learning_rate': 9.281045751633988e-05, 'epoch': 3.59}


  7%|‚ñã         | 1662/22950 [02:06<25:13, 14.06it/s]

{'loss': 0.1496, 'grad_norm': 18.38498878479004, 'learning_rate': 9.276688453159041e-05, 'epoch': 3.62}


  7%|‚ñã         | 1672/22950 [02:06<25:05, 14.13it/s]

{'loss': 0.167, 'grad_norm': 8.503744125366211, 'learning_rate': 9.272331154684096e-05, 'epoch': 3.64}


  7%|‚ñã         | 1682/22950 [02:07<25:13, 14.06it/s]

{'loss': 0.0603, 'grad_norm': 31.517141342163086, 'learning_rate': 9.267973856209151e-05, 'epoch': 3.66}


  7%|‚ñã         | 1692/22950 [02:08<25:16, 14.02it/s]

{'loss': 0.1693, 'grad_norm': 2.7958805561065674, 'learning_rate': 9.263616557734205e-05, 'epoch': 3.68}


  7%|‚ñã         | 1702/22950 [02:08<25:12, 14.05it/s]

{'loss': 0.2555, 'grad_norm': 0.2059556394815445, 'learning_rate': 9.25925925925926e-05, 'epoch': 3.7}


  7%|‚ñã         | 1712/22950 [02:09<25:01, 14.15it/s]

{'loss': 0.1019, 'grad_norm': 0.06454271823167801, 'learning_rate': 9.254901960784315e-05, 'epoch': 3.73}


  8%|‚ñä         | 1722/22950 [02:10<25:05, 14.10it/s]

{'loss': 0.143, 'grad_norm': 57.74462890625, 'learning_rate': 9.250544662309369e-05, 'epoch': 3.75}


  8%|‚ñä         | 1732/22950 [02:11<25:04, 14.10it/s]

{'loss': 0.1814, 'grad_norm': 30.84895896911621, 'learning_rate': 9.246187363834424e-05, 'epoch': 3.77}


  8%|‚ñä         | 1742/22950 [02:11<25:15, 13.99it/s]

{'loss': 0.1416, 'grad_norm': 0.06343986839056015, 'learning_rate': 9.241830065359478e-05, 'epoch': 3.79}


  8%|‚ñä         | 1752/22950 [02:12<25:15, 13.99it/s]

{'loss': 0.2344, 'grad_norm': 4.535186767578125, 'learning_rate': 9.237472766884531e-05, 'epoch': 3.81}


  8%|‚ñä         | 1762/22950 [02:13<25:00, 14.12it/s]

{'loss': 0.274, 'grad_norm': 0.08349151909351349, 'learning_rate': 9.233115468409586e-05, 'epoch': 3.83}


  8%|‚ñä         | 1772/22950 [02:13<25:01, 14.11it/s]

{'loss': 0.4707, 'grad_norm': 30.054641723632812, 'learning_rate': 9.228758169934641e-05, 'epoch': 3.86}


  8%|‚ñä         | 1782/22950 [02:14<25:03, 14.08it/s]

{'loss': 0.1312, 'grad_norm': 0.19736430048942566, 'learning_rate': 9.224400871459695e-05, 'epoch': 3.88}


  8%|‚ñä         | 1792/22950 [02:15<24:59, 14.11it/s]

{'loss': 0.0838, 'grad_norm': 0.2292512208223343, 'learning_rate': 9.22004357298475e-05, 'epoch': 3.9}


  8%|‚ñä         | 1802/22950 [02:15<25:05, 14.05it/s]

{'loss': 0.2972, 'grad_norm': 41.57825469970703, 'learning_rate': 9.215686274509804e-05, 'epoch': 3.92}


  8%|‚ñä         | 1812/22950 [02:16<25:09, 14.01it/s]

{'loss': 0.1788, 'grad_norm': 0.28453299403190613, 'learning_rate': 9.211328976034859e-05, 'epoch': 3.94}


  8%|‚ñä         | 1822/22950 [02:17<25:05, 14.03it/s]

{'loss': 0.2333, 'grad_norm': 24.249937057495117, 'learning_rate': 9.206971677559914e-05, 'epoch': 3.97}


  8%|‚ñä         | 1832/22950 [02:18<25:13, 13.96it/s]

{'loss': 0.26, 'grad_norm': 0.3629447817802429, 'learning_rate': 9.202614379084967e-05, 'epoch': 3.99}


                                                    
  8%|‚ñä         | 1836/22950 [02:20<24:03, 14.62it/s]

{'eval_loss': 0.7260833382606506, 'eval_accuracy': 0.8357843160629272, 'eval_runtime': 2.1663, 'eval_samples_per_second': 188.342, 'eval_steps_per_second': 23.543, 'epoch': 4.0}


  8%|‚ñä         | 1842/22950 [02:21<1:32:54,  3.79it/s]

{'loss': 0.1118, 'grad_norm': 1.48964262008667, 'learning_rate': 9.198257080610022e-05, 'epoch': 4.01}


  8%|‚ñä         | 1852/22950 [02:22<36:37,  9.60it/s]  

{'loss': 0.1024, 'grad_norm': 0.05296572670340538, 'learning_rate': 9.193899782135077e-05, 'epoch': 4.03}


  8%|‚ñä         | 1862/22950 [02:22<27:18, 12.87it/s]

{'loss': 0.0683, 'grad_norm': 3.773041009902954, 'learning_rate': 9.189542483660131e-05, 'epoch': 4.05}


  8%|‚ñä         | 1872/22950 [02:23<25:42, 13.67it/s]

{'loss': 0.1701, 'grad_norm': 21.58287239074707, 'learning_rate': 9.185185185185186e-05, 'epoch': 4.07}


  8%|‚ñä         | 1882/22950 [02:24<25:31, 13.76it/s]

{'loss': 0.1439, 'grad_norm': 0.12291950732469559, 'learning_rate': 9.180827886710241e-05, 'epoch': 4.1}


  8%|‚ñä         | 1892/22950 [02:25<25:33, 13.73it/s]

{'loss': 0.1664, 'grad_norm': 0.08812243491411209, 'learning_rate': 9.176470588235295e-05, 'epoch': 4.12}


  8%|‚ñä         | 1902/22950 [02:25<25:20, 13.84it/s]

{'loss': 0.1794, 'grad_norm': 0.060871370136737823, 'learning_rate': 9.172113289760348e-05, 'epoch': 4.14}


  8%|‚ñä         | 1912/22950 [02:26<25:15, 13.88it/s]

{'loss': 0.1102, 'grad_norm': 0.490065336227417, 'learning_rate': 9.167755991285403e-05, 'epoch': 4.16}


  8%|‚ñä         | 1922/22950 [02:27<25:12, 13.90it/s]

{'loss': 0.229, 'grad_norm': 0.06756642460823059, 'learning_rate': 9.163398692810457e-05, 'epoch': 4.18}


  8%|‚ñä         | 1932/22950 [02:27<25:06, 13.95it/s]

{'loss': 0.0447, 'grad_norm': 0.0893545001745224, 'learning_rate': 9.159041394335512e-05, 'epoch': 4.2}


  8%|‚ñä         | 1942/22950 [02:28<25:18, 13.84it/s]

{'loss': 0.0718, 'grad_norm': 2.906525135040283, 'learning_rate': 9.154684095860567e-05, 'epoch': 4.23}


  9%|‚ñä         | 1952/22950 [02:29<25:06, 13.94it/s]

{'loss': 0.0787, 'grad_norm': 0.08129039406776428, 'learning_rate': 9.150326797385621e-05, 'epoch': 4.25}


  9%|‚ñä         | 1962/22950 [02:30<24:59, 13.99it/s]

{'loss': 0.1026, 'grad_norm': 0.08651131391525269, 'learning_rate': 9.145969498910676e-05, 'epoch': 4.27}


  9%|‚ñä         | 1972/22950 [02:30<24:53, 14.05it/s]

{'loss': 0.2653, 'grad_norm': 0.06326533108949661, 'learning_rate': 9.141612200435731e-05, 'epoch': 4.29}


  9%|‚ñä         | 1982/22950 [02:31<24:58, 13.99it/s]

{'loss': 0.0946, 'grad_norm': 0.3832416236400604, 'learning_rate': 9.137254901960784e-05, 'epoch': 4.31}


  9%|‚ñä         | 1992/22950 [02:32<24:50, 14.06it/s]

{'loss': 0.1493, 'grad_norm': 26.16234588623047, 'learning_rate': 9.13289760348584e-05, 'epoch': 4.34}


  9%|‚ñä         | 2002/22950 [02:32<24:48, 14.08it/s]

{'loss': 0.102, 'grad_norm': 7.237424373626709, 'learning_rate': 9.128540305010894e-05, 'epoch': 4.36}


  9%|‚ñâ         | 2012/22950 [02:33<24:52, 14.03it/s]

{'loss': 0.1085, 'grad_norm': 0.49124282598495483, 'learning_rate': 9.124183006535948e-05, 'epoch': 4.38}


  9%|‚ñâ         | 2022/22950 [02:34<24:54, 14.00it/s]

{'loss': 0.0474, 'grad_norm': 0.04020654410123825, 'learning_rate': 9.119825708061003e-05, 'epoch': 4.4}


  9%|‚ñâ         | 2032/22950 [02:35<24:44, 14.09it/s]

{'loss': 0.0985, 'grad_norm': 0.23247355222702026, 'learning_rate': 9.115468409586058e-05, 'epoch': 4.42}


  9%|‚ñâ         | 2042/22950 [02:35<24:35, 14.17it/s]

{'loss': 0.1522, 'grad_norm': 0.039759375154972076, 'learning_rate': 9.111111111111112e-05, 'epoch': 4.44}


  9%|‚ñâ         | 2052/22950 [02:36<24:38, 14.13it/s]

{'loss': 0.157, 'grad_norm': 0.05720265582203865, 'learning_rate': 9.106753812636167e-05, 'epoch': 4.47}


  9%|‚ñâ         | 2062/22950 [02:37<24:44, 14.07it/s]

{'loss': 0.1485, 'grad_norm': 11.918928146362305, 'learning_rate': 9.10239651416122e-05, 'epoch': 4.49}


  9%|‚ñâ         | 2072/22950 [02:37<24:51, 14.00it/s]

{'loss': 0.2017, 'grad_norm': 13.356316566467285, 'learning_rate': 9.098039215686274e-05, 'epoch': 4.51}


  9%|‚ñâ         | 2082/22950 [02:38<24:40, 14.09it/s]

{'loss': 0.2323, 'grad_norm': 0.050696663558483124, 'learning_rate': 9.093681917211329e-05, 'epoch': 4.53}


  9%|‚ñâ         | 2092/22950 [02:39<24:47, 14.02it/s]

{'loss': 0.1286, 'grad_norm': 0.10638576000928879, 'learning_rate': 9.089324618736383e-05, 'epoch': 4.55}


  9%|‚ñâ         | 2102/22950 [02:40<24:57, 13.92it/s]

{'loss': 0.0566, 'grad_norm': 1.3837010860443115, 'learning_rate': 9.084967320261438e-05, 'epoch': 4.58}


  9%|‚ñâ         | 2112/22950 [02:40<24:43, 14.05it/s]

{'loss': 0.0679, 'grad_norm': 0.06249496713280678, 'learning_rate': 9.080610021786493e-05, 'epoch': 4.6}


  9%|‚ñâ         | 2122/22950 [02:41<24:48, 13.99it/s]

{'loss': 0.1135, 'grad_norm': 40.675926208496094, 'learning_rate': 9.076252723311547e-05, 'epoch': 4.62}


  9%|‚ñâ         | 2132/22950 [02:42<24:41, 14.05it/s]

{'loss': 0.0679, 'grad_norm': 59.639060974121094, 'learning_rate': 9.071895424836602e-05, 'epoch': 4.64}


  9%|‚ñâ         | 2142/22950 [02:42<24:35, 14.10it/s]

{'loss': 0.0644, 'grad_norm': 0.035783182829618454, 'learning_rate': 9.067538126361657e-05, 'epoch': 4.66}


  9%|‚ñâ         | 2152/22950 [02:43<24:29, 14.15it/s]

{'loss': 0.0018, 'grad_norm': 0.08772481977939606, 'learning_rate': 9.06318082788671e-05, 'epoch': 4.68}


  9%|‚ñâ         | 2162/22950 [02:44<24:37, 14.07it/s]

{'loss': 0.169, 'grad_norm': 15.493842124938965, 'learning_rate': 9.058823529411765e-05, 'epoch': 4.71}


  9%|‚ñâ         | 2172/22950 [02:45<24:39, 14.04it/s]

{'loss': 0.182, 'grad_norm': 1.0803717374801636, 'learning_rate': 9.05446623093682e-05, 'epoch': 4.73}


 10%|‚ñâ         | 2182/22950 [02:45<24:37, 14.06it/s]

{'loss': 0.2373, 'grad_norm': 0.11914557218551636, 'learning_rate': 9.050108932461874e-05, 'epoch': 4.75}


 10%|‚ñâ         | 2192/22950 [02:46<24:30, 14.11it/s]

{'loss': 0.1106, 'grad_norm': 0.05766041576862335, 'learning_rate': 9.045751633986929e-05, 'epoch': 4.77}


 10%|‚ñâ         | 2202/22950 [02:47<24:29, 14.12it/s]

{'loss': 0.1222, 'grad_norm': 0.22477851808071136, 'learning_rate': 9.041394335511984e-05, 'epoch': 4.79}


 10%|‚ñâ         | 2212/22950 [02:47<24:39, 14.01it/s]

{'loss': 0.263, 'grad_norm': 4.637242317199707, 'learning_rate': 9.037037037037038e-05, 'epoch': 4.81}


 10%|‚ñâ         | 2222/22950 [02:48<24:33, 14.07it/s]

{'loss': 0.2153, 'grad_norm': 0.6612178087234497, 'learning_rate': 9.032679738562093e-05, 'epoch': 4.84}


 10%|‚ñâ         | 2232/22950 [02:49<24:46, 13.94it/s]

{'loss': 0.127, 'grad_norm': 14.045889854431152, 'learning_rate': 9.028322440087146e-05, 'epoch': 4.86}


 10%|‚ñâ         | 2242/22950 [02:50<24:50, 13.89it/s]

{'loss': 0.0217, 'grad_norm': 0.09886190295219421, 'learning_rate': 9.0239651416122e-05, 'epoch': 4.88}


 10%|‚ñâ         | 2252/22950 [02:50<24:47, 13.91it/s]

{'loss': 0.1661, 'grad_norm': 28.0428409576416, 'learning_rate': 9.019607843137255e-05, 'epoch': 4.9}


 10%|‚ñâ         | 2262/22950 [02:51<24:46, 13.91it/s]

{'loss': 0.1127, 'grad_norm': 0.07015331089496613, 'learning_rate': 9.01525054466231e-05, 'epoch': 4.92}


 10%|‚ñâ         | 2272/22950 [02:52<24:40, 13.97it/s]

{'loss': 0.0739, 'grad_norm': 0.3998020887374878, 'learning_rate': 9.010893246187364e-05, 'epoch': 4.95}


 10%|‚ñâ         | 2282/22950 [02:52<24:51, 13.85it/s]

{'loss': 0.1497, 'grad_norm': 1.4271951913833618, 'learning_rate': 9.006535947712419e-05, 'epoch': 4.97}


 10%|‚ñâ         | 2292/22950 [02:53<24:51, 13.85it/s]

{'loss': 0.1522, 'grad_norm': 0.0819455161690712, 'learning_rate': 9.002178649237472e-05, 'epoch': 4.99}


                                                    
 10%|‚ñà         | 2295/22950 [02:56<24:52, 13.84it/s]

{'eval_loss': 0.8389745950698853, 'eval_accuracy': 0.843137264251709, 'eval_runtime': 2.1743, 'eval_samples_per_second': 187.644, 'eval_steps_per_second': 23.456, 'epoch': 5.0}


 10%|‚ñà         | 2302/22950 [02:57<1:11:25,  4.82it/s]

{'loss': 0.0621, 'grad_norm': 0.05018370971083641, 'learning_rate': 8.997821350762527e-05, 'epoch': 5.01}


 10%|‚ñà         | 2312/22950 [02:57<32:42, 10.52it/s]  

{'loss': 0.052, 'grad_norm': 0.06732727587223053, 'learning_rate': 8.993464052287582e-05, 'epoch': 5.03}


 10%|‚ñà         | 2322/22950 [02:58<26:25, 13.01it/s]

{'loss': 0.0519, 'grad_norm': 16.10770034790039, 'learning_rate': 8.989106753812636e-05, 'epoch': 5.05}


 10%|‚ñà         | 2332/22950 [02:59<25:12, 13.63it/s]

{'loss': 0.2556, 'grad_norm': 0.08321578055620193, 'learning_rate': 8.984749455337691e-05, 'epoch': 5.08}


 10%|‚ñà         | 2342/22950 [02:59<25:04, 13.70it/s]

{'loss': 0.0092, 'grad_norm': 0.0306978989392519, 'learning_rate': 8.980392156862746e-05, 'epoch': 5.1}


 10%|‚ñà         | 2352/22950 [03:00<24:59, 13.74it/s]

{'loss': 0.0034, 'grad_norm': 1.929258942604065, 'learning_rate': 8.9760348583878e-05, 'epoch': 5.12}


 10%|‚ñà         | 2362/22950 [03:01<24:54, 13.77it/s]

{'loss': 0.0021, 'grad_norm': 0.052900172770023346, 'learning_rate': 8.971677559912855e-05, 'epoch': 5.14}


 10%|‚ñà         | 2372/22950 [03:02<24:54, 13.77it/s]

{'loss': 0.0077, 'grad_norm': 0.028691451996564865, 'learning_rate': 8.96732026143791e-05, 'epoch': 5.16}


 10%|‚ñà         | 2382/22950 [03:02<24:46, 13.84it/s]

{'loss': 0.144, 'grad_norm': 0.018278149887919426, 'learning_rate': 8.962962962962963e-05, 'epoch': 5.19}


 10%|‚ñà         | 2392/22950 [03:03<24:44, 13.84it/s]

{'loss': 0.0815, 'grad_norm': 0.020333657041192055, 'learning_rate': 8.958605664488019e-05, 'epoch': 5.21}


 10%|‚ñà         | 2402/22950 [03:04<24:42, 13.86it/s]

{'loss': 0.1083, 'grad_norm': 0.10911599546670914, 'learning_rate': 8.954248366013072e-05, 'epoch': 5.23}


 11%|‚ñà         | 2412/22950 [03:04<24:47, 13.81it/s]

{'loss': 0.1188, 'grad_norm': 48.73066711425781, 'learning_rate': 8.949891067538126e-05, 'epoch': 5.25}


 11%|‚ñà         | 2422/22950 [03:05<24:37, 13.89it/s]

{'loss': 0.0709, 'grad_norm': 0.0794529989361763, 'learning_rate': 8.945533769063181e-05, 'epoch': 5.27}


 11%|‚ñà         | 2432/22950 [03:06<24:40, 13.86it/s]

{'loss': 0.1273, 'grad_norm': 0.033675309270620346, 'learning_rate': 8.941176470588236e-05, 'epoch': 5.29}


 11%|‚ñà         | 2442/22950 [03:07<24:45, 13.81it/s]

{'loss': 0.0685, 'grad_norm': 0.03029320389032364, 'learning_rate': 8.93681917211329e-05, 'epoch': 5.32}


 11%|‚ñà         | 2452/22950 [03:07<24:38, 13.86it/s]

{'loss': 0.052, 'grad_norm': 80.63548278808594, 'learning_rate': 8.932461873638345e-05, 'epoch': 5.34}


 11%|‚ñà         | 2462/22950 [03:08<24:39, 13.84it/s]

{'loss': 0.1535, 'grad_norm': 92.01500701904297, 'learning_rate': 8.9281045751634e-05, 'epoch': 5.36}


 11%|‚ñà         | 2472/22950 [03:09<24:34, 13.89it/s]

{'loss': 0.0547, 'grad_norm': 0.04541012644767761, 'learning_rate': 8.923747276688453e-05, 'epoch': 5.38}


 11%|‚ñà         | 2482/22950 [03:10<24:38, 13.85it/s]

{'loss': 0.0992, 'grad_norm': 1.007738471031189, 'learning_rate': 8.919389978213508e-05, 'epoch': 5.4}


 11%|‚ñà         | 2492/22950 [03:10<24:42, 13.80it/s]

{'loss': 0.0998, 'grad_norm': 25.334718704223633, 'learning_rate': 8.915032679738563e-05, 'epoch': 5.42}


 11%|‚ñà         | 2502/22950 [03:11<24:32, 13.89it/s]

{'loss': 0.1496, 'grad_norm': 0.07228364795446396, 'learning_rate': 8.910675381263617e-05, 'epoch': 5.45}


 11%|‚ñà         | 2512/22950 [03:12<24:37, 13.83it/s]

{'loss': 0.0646, 'grad_norm': 0.3179069757461548, 'learning_rate': 8.906318082788672e-05, 'epoch': 5.47}


 11%|‚ñà         | 2522/22950 [03:12<24:51, 13.69it/s]

{'loss': 0.1212, 'grad_norm': 0.1104908138513565, 'learning_rate': 8.901960784313726e-05, 'epoch': 5.49}


 11%|‚ñà         | 2532/22950 [03:13<24:50, 13.70it/s]

{'loss': 0.3033, 'grad_norm': 22.82947540283203, 'learning_rate': 8.89760348583878e-05, 'epoch': 5.51}


 11%|‚ñà         | 2542/22950 [03:14<24:50, 13.69it/s]

{'loss': 0.062, 'grad_norm': 0.1954731047153473, 'learning_rate': 8.893246187363836e-05, 'epoch': 5.53}


 11%|‚ñà         | 2552/22950 [03:15<25:00, 13.59it/s]

{'loss': 0.1769, 'grad_norm': 0.06268730759620667, 'learning_rate': 8.888888888888889e-05, 'epoch': 5.56}


 11%|‚ñà         | 2562/22950 [03:15<25:20, 13.41it/s]

{'loss': 0.0947, 'grad_norm': 0.035209301859140396, 'learning_rate': 8.884531590413944e-05, 'epoch': 5.58}


 11%|‚ñà         | 2572/22950 [03:16<25:51, 13.14it/s]

{'loss': 0.0145, 'grad_norm': 0.032220326364040375, 'learning_rate': 8.880174291938998e-05, 'epoch': 5.6}


 11%|‚ñà‚ñè        | 2582/22950 [03:17<25:33, 13.28it/s]

{'loss': 0.2197, 'grad_norm': 0.10573980957269669, 'learning_rate': 8.875816993464052e-05, 'epoch': 5.62}


 11%|‚ñà‚ñè        | 2592/22950 [03:18<25:30, 13.30it/s]

{'loss': 0.0012, 'grad_norm': 0.0214481670409441, 'learning_rate': 8.871459694989107e-05, 'epoch': 5.64}


 11%|‚ñà‚ñè        | 2602/22950 [03:18<25:28, 13.31it/s]

{'loss': 0.1159, 'grad_norm': 0.026128562167286873, 'learning_rate': 8.867102396514162e-05, 'epoch': 5.66}


 11%|‚ñà‚ñè        | 2612/22950 [03:19<25:36, 13.23it/s]

{'loss': 0.1128, 'grad_norm': 47.78032302856445, 'learning_rate': 8.862745098039215e-05, 'epoch': 5.69}


 11%|‚ñà‚ñè        | 2622/22950 [03:20<25:29, 13.29it/s]

{'loss': 0.0449, 'grad_norm': 0.01644926331937313, 'learning_rate': 8.85838779956427e-05, 'epoch': 5.71}


 11%|‚ñà‚ñè        | 2632/22950 [03:21<25:56, 13.05it/s]

{'loss': 0.16, 'grad_norm': 0.027089666575193405, 'learning_rate': 8.854030501089325e-05, 'epoch': 5.73}


 12%|‚ñà‚ñè        | 2642/22950 [03:21<25:41, 13.18it/s]

{'loss': 0.1246, 'grad_norm': 18.606821060180664, 'learning_rate': 8.849673202614379e-05, 'epoch': 5.75}


 12%|‚ñà‚ñè        | 2652/22950 [03:22<25:52, 13.08it/s]

{'loss': 0.0014, 'grad_norm': 0.4179931879043579, 'learning_rate': 8.845315904139434e-05, 'epoch': 5.77}


 12%|‚ñà‚ñè        | 2662/22950 [03:23<26:03, 12.97it/s]

{'loss': 0.0463, 'grad_norm': 0.017881030216813087, 'learning_rate': 8.840958605664489e-05, 'epoch': 5.8}


 12%|‚ñà‚ñè        | 2672/22950 [03:24<26:22, 12.82it/s]

{'loss': 0.2394, 'grad_norm': 1.126392126083374, 'learning_rate': 8.836601307189543e-05, 'epoch': 5.82}


 12%|‚ñà‚ñè        | 2682/22950 [03:24<26:04, 12.96it/s]

{'loss': 0.237, 'grad_norm': 13.94946575164795, 'learning_rate': 8.832244008714598e-05, 'epoch': 5.84}


 12%|‚ñà‚ñè        | 2692/22950 [03:25<26:13, 12.88it/s]

{'loss': 0.3691, 'grad_norm': 0.08279843628406525, 'learning_rate': 8.827886710239653e-05, 'epoch': 5.86}


 12%|‚ñà‚ñè        | 2702/22950 [03:26<26:22, 12.80it/s]

{'loss': 0.0545, 'grad_norm': 0.08149643242359161, 'learning_rate': 8.823529411764706e-05, 'epoch': 5.88}


 12%|‚ñà‚ñè        | 2712/22950 [03:27<26:29, 12.74it/s]

{'loss': 0.0812, 'grad_norm': 0.02400522120296955, 'learning_rate': 8.819172113289761e-05, 'epoch': 5.9}


 12%|‚ñà‚ñè        | 2722/22950 [03:28<26:25, 12.76it/s]

{'loss': 0.0513, 'grad_norm': 0.04071604833006859, 'learning_rate': 8.814814814814815e-05, 'epoch': 5.93}


 12%|‚ñà‚ñè        | 2732/22950 [03:28<26:22, 12.77it/s]

{'loss': 0.0019, 'grad_norm': 0.04405476897954941, 'learning_rate': 8.81045751633987e-05, 'epoch': 5.95}


 12%|‚ñà‚ñè        | 2742/22950 [03:29<26:38, 12.64it/s]

{'loss': 0.0016, 'grad_norm': 0.048746805638074875, 'learning_rate': 8.806100217864924e-05, 'epoch': 5.97}


 12%|‚ñà‚ñè        | 2752/22950 [03:30<26:44, 12.59it/s]

{'loss': 0.1568, 'grad_norm': 4.798043251037598, 'learning_rate': 8.801742919389977e-05, 'epoch': 5.99}


                                                    
 12%|‚ñà‚ñè        | 2754/22950 [03:33<25:15, 13.33it/s]

{'eval_loss': 0.7676053047180176, 'eval_accuracy': 0.8602941036224365, 'eval_runtime': 2.4167, 'eval_samples_per_second': 168.825, 'eval_steps_per_second': 21.103, 'epoch': 6.0}


 12%|‚ñà‚ñè        | 2762/22950 [03:34<1:16:43,  4.39it/s]

{'loss': 0.2613, 'grad_norm': 0.15545932948589325, 'learning_rate': 8.797385620915033e-05, 'epoch': 6.01}


 12%|‚ñà‚ñè        | 2772/22950 [03:34<33:16, 10.11it/s]  

{'loss': 0.0219, 'grad_norm': 0.03675495460629463, 'learning_rate': 8.793028322440088e-05, 'epoch': 6.03}


 12%|‚ñà‚ñè        | 2782/22950 [03:35<29:07, 11.54it/s]

{'loss': 0.0016, 'grad_norm': 0.026436150074005127, 'learning_rate': 8.788671023965141e-05, 'epoch': 6.06}


 12%|‚ñà‚ñè        | 2792/22950 [03:36<27:52, 12.05it/s]

{'loss': 0.0042, 'grad_norm': 13.582642555236816, 'learning_rate': 8.784313725490196e-05, 'epoch': 6.08}


 12%|‚ñà‚ñè        | 2802/22950 [03:37<28:06, 11.95it/s]

{'loss': 0.0224, 'grad_norm': 0.30946195125579834, 'learning_rate': 8.779956427015251e-05, 'epoch': 6.1}


 12%|‚ñà‚ñè        | 2812/22950 [03:38<28:00, 11.98it/s]

{'loss': 0.0893, 'grad_norm': 0.01285457145422697, 'learning_rate': 8.775599128540305e-05, 'epoch': 6.12}


 12%|‚ñà‚ñè        | 2822/22950 [03:39<28:09, 11.92it/s]

{'loss': 0.0375, 'grad_norm': 0.010248844511806965, 'learning_rate': 8.77124183006536e-05, 'epoch': 6.14}


 12%|‚ñà‚ñè        | 2832/22950 [03:39<28:22, 11.82it/s]

{'loss': 0.0006, 'grad_norm': 0.024293450638651848, 'learning_rate': 8.766884531590415e-05, 'epoch': 6.17}


 12%|‚ñà‚ñè        | 2842/22950 [03:40<28:44, 11.66it/s]

{'loss': 0.2602, 'grad_norm': 0.017776982858777046, 'learning_rate': 8.762527233115469e-05, 'epoch': 6.19}


 12%|‚ñà‚ñè        | 2852/22950 [03:41<29:05, 11.51it/s]

{'loss': 0.0791, 'grad_norm': 0.012632000260055065, 'learning_rate': 8.758169934640524e-05, 'epoch': 6.21}


 12%|‚ñà‚ñè        | 2862/22950 [03:42<28:35, 11.71it/s]

{'loss': 0.0308, 'grad_norm': 15.761818885803223, 'learning_rate': 8.753812636165579e-05, 'epoch': 6.23}


 13%|‚ñà‚ñé        | 2872/22950 [03:43<28:44, 11.64it/s]

{'loss': 0.0264, 'grad_norm': 0.06499979645013809, 'learning_rate': 8.749455337690632e-05, 'epoch': 6.25}


 13%|‚ñà‚ñé        | 2882/22950 [03:44<29:08, 11.48it/s]

{'loss': 0.2379, 'grad_norm': 0.6817538738250732, 'learning_rate': 8.745098039215687e-05, 'epoch': 6.27}


 13%|‚ñà‚ñé        | 2892/22950 [03:45<28:52, 11.58it/s]

{'loss': 0.001, 'grad_norm': 0.021673867478966713, 'learning_rate': 8.740740740740741e-05, 'epoch': 6.3}


 13%|‚ñà‚ñé        | 2902/22950 [03:45<29:08, 11.46it/s]

{'loss': 0.0759, 'grad_norm': 0.029813187196850777, 'learning_rate': 8.736383442265795e-05, 'epoch': 6.32}


 13%|‚ñà‚ñé        | 2912/22950 [03:46<29:17, 11.40it/s]

{'loss': 0.1026, 'grad_norm': 0.007585362531244755, 'learning_rate': 8.73202614379085e-05, 'epoch': 6.34}


 13%|‚ñà‚ñé        | 2922/22950 [03:47<29:19, 11.38it/s]

{'loss': 0.1049, 'grad_norm': 0.0698917880654335, 'learning_rate': 8.727668845315905e-05, 'epoch': 6.36}


 13%|‚ñà‚ñé        | 2932/22950 [03:48<29:40, 11.24it/s]

{'loss': 0.0618, 'grad_norm': 0.028374455869197845, 'learning_rate': 8.723311546840958e-05, 'epoch': 6.38}


 13%|‚ñà‚ñé        | 2942/22950 [03:49<29:45, 11.21it/s]

{'loss': 0.1031, 'grad_norm': 0.022409649565815926, 'learning_rate': 8.718954248366013e-05, 'epoch': 6.41}


 13%|‚ñà‚ñé        | 2952/22950 [03:50<30:14, 11.02it/s]

{'loss': 0.0838, 'grad_norm': 0.4337977170944214, 'learning_rate': 8.714596949891068e-05, 'epoch': 6.43}


 13%|‚ñà‚ñé        | 2962/22950 [03:51<30:22, 10.97it/s]

{'loss': 0.025, 'grad_norm': 0.03037302754819393, 'learning_rate': 8.710239651416122e-05, 'epoch': 6.45}


 13%|‚ñà‚ñé        | 2972/22950 [03:52<30:22, 10.96it/s]

{'loss': 0.1111, 'grad_norm': 0.018114902079105377, 'learning_rate': 8.705882352941177e-05, 'epoch': 6.47}


 13%|‚ñà‚ñé        | 2982/22950 [03:53<30:24, 10.94it/s]

{'loss': 0.0584, 'grad_norm': 0.0164602380245924, 'learning_rate': 8.701525054466232e-05, 'epoch': 6.49}


 13%|‚ñà‚ñé        | 2992/22950 [03:54<30:54, 10.76it/s]

{'loss': 0.0306, 'grad_norm': 0.02001779153943062, 'learning_rate': 8.697167755991286e-05, 'epoch': 6.51}


 13%|‚ñà‚ñé        | 3002/22950 [03:55<31:17, 10.63it/s]

{'loss': 0.1455, 'grad_norm': 27.12652015686035, 'learning_rate': 8.692810457516341e-05, 'epoch': 6.54}


 13%|‚ñà‚ñé        | 3012/22950 [03:56<31:32, 10.53it/s]

{'loss': 0.1316, 'grad_norm': 18.581701278686523, 'learning_rate': 8.688453159041394e-05, 'epoch': 6.56}


 13%|‚ñà‚ñé        | 3022/22950 [03:57<31:44, 10.46it/s]

{'loss': 0.165, 'grad_norm': 12.803542137145996, 'learning_rate': 8.68409586056645e-05, 'epoch': 6.58}


 13%|‚ñà‚ñé        | 3032/22950 [03:57<32:01, 10.37it/s]

{'loss': 0.1477, 'grad_norm': 0.0343611016869545, 'learning_rate': 8.679738562091504e-05, 'epoch': 6.6}


 13%|‚ñà‚ñé        | 3042/22950 [03:58<32:26, 10.23it/s]

{'loss': 0.0058, 'grad_norm': 0.03788885474205017, 'learning_rate': 8.675381263616558e-05, 'epoch': 6.62}


 13%|‚ñà‚ñé        | 3052/22950 [03:59<32:23, 10.24it/s]

{'loss': 0.0019, 'grad_norm': 0.04101739078760147, 'learning_rate': 8.671023965141613e-05, 'epoch': 6.64}


 13%|‚ñà‚ñé        | 3062/22950 [04:00<32:55, 10.07it/s]

{'loss': 0.1416, 'grad_norm': 0.2125289887189865, 'learning_rate': 8.666666666666667e-05, 'epoch': 6.67}


 13%|‚ñà‚ñé        | 3071/22950 [04:01<33:04, 10.02it/s]

{'loss': 0.2053, 'grad_norm': 16.88814353942871, 'learning_rate': 8.66230936819172e-05, 'epoch': 6.69}


 13%|‚ñà‚ñé        | 3081/22950 [04:02<33:18,  9.94it/s]

{'loss': 0.1025, 'grad_norm': 0.26008158922195435, 'learning_rate': 8.657952069716775e-05, 'epoch': 6.71}


 13%|‚ñà‚ñé        | 3091/22950 [04:03<33:45,  9.80it/s]

{'loss': 0.1762, 'grad_norm': 0.7208096981048584, 'learning_rate': 8.65359477124183e-05, 'epoch': 6.73}


 14%|‚ñà‚ñé        | 3101/22950 [04:04<34:03,  9.71it/s]

{'loss': 0.0436, 'grad_norm': 0.028910737484693527, 'learning_rate': 8.649237472766884e-05, 'epoch': 6.75}


 14%|‚ñà‚ñé        | 3111/22950 [04:05<34:20,  9.63it/s]

{'loss': 0.1259, 'grad_norm': 0.03781857714056969, 'learning_rate': 8.644880174291939e-05, 'epoch': 6.78}


 14%|‚ñà‚ñé        | 3121/22950 [04:06<34:43,  9.52it/s]

{'loss': 0.088, 'grad_norm': 43.566741943359375, 'learning_rate': 8.640522875816994e-05, 'epoch': 6.8}


 14%|‚ñà‚ñé        | 3131/22950 [04:08<34:44,  9.51it/s]

{'loss': 0.1333, 'grad_norm': 28.376630783081055, 'learning_rate': 8.636165577342048e-05, 'epoch': 6.82}


 14%|‚ñà‚ñé        | 3141/22950 [04:09<35:31,  9.29it/s]

{'loss': 0.0027, 'grad_norm': 0.03656983748078346, 'learning_rate': 8.631808278867103e-05, 'epoch': 6.84}


 14%|‚ñà‚ñé        | 3151/22950 [04:10<35:53,  9.19it/s]

{'loss': 0.0908, 'grad_norm': 0.017614047974348068, 'learning_rate': 8.627450980392158e-05, 'epoch': 6.86}


 14%|‚ñà‚ñç        | 3161/22950 [04:11<36:42,  8.98it/s]

{'loss': 0.1178, 'grad_norm': 0.1809031367301941, 'learning_rate': 8.623093681917212e-05, 'epoch': 6.88}


 14%|‚ñà‚ñç        | 3171/22950 [04:12<36:56,  8.92it/s]

{'loss': 0.0019, 'grad_norm': 0.05842214077711105, 'learning_rate': 8.618736383442267e-05, 'epoch': 6.91}


 14%|‚ñà‚ñç        | 3181/22950 [04:13<37:10,  8.86it/s]

{'loss': 0.0082, 'grad_norm': 2.5983259677886963, 'learning_rate': 8.614379084967322e-05, 'epoch': 6.93}


 14%|‚ñà‚ñç        | 3191/22950 [04:14<37:45,  8.72it/s]

{'loss': 0.0784, 'grad_norm': 0.0671386644244194, 'learning_rate': 8.610021786492375e-05, 'epoch': 6.95}


 14%|‚ñà‚ñç        | 3201/22950 [04:15<38:15,  8.60it/s]

{'loss': 0.0035, 'grad_norm': 0.06600970774888992, 'learning_rate': 8.60566448801743e-05, 'epoch': 6.97}


 14%|‚ñà‚ñç        | 3211/22950 [04:17<39:11,  8.39it/s]

{'loss': 0.036, 'grad_norm': 84.26060485839844, 'learning_rate': 8.601307189542484e-05, 'epoch': 6.99}


                                                    
 14%|‚ñà‚ñç        | 3213/22950 [04:20<39:02,  8.43it/s]

{'eval_loss': 1.0004558563232422, 'eval_accuracy': 0.8504902124404907, 'eval_runtime': 3.6106, 'eval_samples_per_second': 113.0, 'eval_steps_per_second': 14.125, 'epoch': 7.0}


 14%|‚ñà‚ñç        | 3221/22950 [04:22<1:39:18,  3.31it/s]

{'loss': 0.0011, 'grad_norm': 0.019795019179582596, 'learning_rate': 8.596949891067539e-05, 'epoch': 7.02}


 14%|‚ñà‚ñç        | 3231/22950 [04:23<46:19,  7.10it/s]  

{'loss': 0.1229, 'grad_norm': 0.008134126663208008, 'learning_rate': 8.592592592592593e-05, 'epoch': 7.04}


 14%|‚ñà‚ñç        | 3241/22950 [04:24<42:08,  7.80it/s]

{'loss': 0.0022, 'grad_norm': 0.007166509050875902, 'learning_rate': 8.588235294117646e-05, 'epoch': 7.06}


 14%|‚ñà‚ñç        | 3251/22950 [04:26<42:38,  7.70it/s]

{'loss': 0.0898, 'grad_norm': 0.008496575057506561, 'learning_rate': 8.583877995642701e-05, 'epoch': 7.08}


 14%|‚ñà‚ñç        | 3261/22950 [04:27<43:12,  7.59it/s]

{'loss': 0.0896, 'grad_norm': 0.015249619260430336, 'learning_rate': 8.579520697167756e-05, 'epoch': 7.1}


 14%|‚ñà‚ñç        | 3271/22950 [04:28<43:30,  7.54it/s]

{'loss': 0.0007, 'grad_norm': 0.0092210927978158, 'learning_rate': 8.57516339869281e-05, 'epoch': 7.12}


 14%|‚ñà‚ñç        | 3281/22950 [04:30<44:44,  7.33it/s]

{'loss': 0.0003, 'grad_norm': 0.0194009467959404, 'learning_rate': 8.570806100217865e-05, 'epoch': 7.15}


 14%|‚ñà‚ñç        | 3291/22950 [04:31<45:13,  7.25it/s]

{'loss': 0.1777, 'grad_norm': 0.02180052176117897, 'learning_rate': 8.56644880174292e-05, 'epoch': 7.17}


 14%|‚ñà‚ñç        | 3301/22950 [04:32<45:28,  7.20it/s]

{'loss': 0.0025, 'grad_norm': 3.1297264099121094, 'learning_rate': 8.562091503267974e-05, 'epoch': 7.19}


 14%|‚ñà‚ñç        | 3311/22950 [04:34<45:46,  7.15it/s]

{'loss': 0.1686, 'grad_norm': 0.025547293946146965, 'learning_rate': 8.557734204793029e-05, 'epoch': 7.21}


 14%|‚ñà‚ñç        | 3321/22950 [04:35<46:14,  7.07it/s]

{'loss': 0.0012, 'grad_norm': 0.02349967136979103, 'learning_rate': 8.553376906318084e-05, 'epoch': 7.23}


 15%|‚ñà‚ñç        | 3331/22950 [04:37<46:41,  7.00it/s]

{'loss': 0.1272, 'grad_norm': 0.023870816454291344, 'learning_rate': 8.549019607843137e-05, 'epoch': 7.25}


 15%|‚ñà‚ñç        | 3341/22950 [04:38<50:57,  6.41it/s]

{'loss': 0.1601, 'grad_norm': 0.04286196082830429, 'learning_rate': 8.544662309368192e-05, 'epoch': 7.28}


 15%|‚ñà‚ñç        | 3351/22950 [04:40<50:22,  6.48it/s]

{'loss': 0.0847, 'grad_norm': 65.30606079101562, 'learning_rate': 8.540305010893247e-05, 'epoch': 7.3}


 15%|‚ñà‚ñç        | 3361/22950 [04:41<49:41,  6.57it/s]

{'loss': 0.0334, 'grad_norm': 0.46826934814453125, 'learning_rate': 8.535947712418301e-05, 'epoch': 7.32}


 15%|‚ñà‚ñç        | 3371/22950 [04:43<50:29,  6.46it/s]

{'loss': 0.106, 'grad_norm': 0.08961880207061768, 'learning_rate': 8.531590413943356e-05, 'epoch': 7.34}


 15%|‚ñà‚ñç        | 3381/22950 [04:45<51:31,  6.33it/s]

{'loss': 0.1111, 'grad_norm': 0.02841605432331562, 'learning_rate': 8.52723311546841e-05, 'epoch': 7.36}


 15%|‚ñà‚ñç        | 3391/22950 [04:46<52:44,  6.18it/s]

{'loss': 0.2438, 'grad_norm': 0.06934823095798492, 'learning_rate': 8.522875816993465e-05, 'epoch': 7.39}


 15%|‚ñà‚ñç        | 3401/22950 [04:48<55:06,  5.91it/s]

{'loss': 0.1176, 'grad_norm': 0.12941399216651917, 'learning_rate': 8.518518518518518e-05, 'epoch': 7.41}


 15%|‚ñà‚ñç        | 3411/22950 [04:49<52:45,  6.17it/s]

{'loss': 0.0075, 'grad_norm': 0.05033465102314949, 'learning_rate': 8.514161220043573e-05, 'epoch': 7.43}


 15%|‚ñà‚ñç        | 3421/22950 [04:51<54:44,  5.95it/s]

{'loss': 0.018, 'grad_norm': 0.05304570868611336, 'learning_rate': 8.509803921568627e-05, 'epoch': 7.45}


 15%|‚ñà‚ñç        | 3431/22950 [04:53<54:11,  6.00it/s]

{'loss': 0.1123, 'grad_norm': 0.03586588799953461, 'learning_rate': 8.505446623093682e-05, 'epoch': 7.47}


 15%|‚ñà‚ñç        | 3441/22950 [04:54<54:38,  5.95it/s]

{'loss': 0.112, 'grad_norm': 0.05523961782455444, 'learning_rate': 8.501089324618737e-05, 'epoch': 7.49}


 15%|‚ñà‚ñå        | 3451/22950 [04:56<55:11,  5.89it/s]

{'loss': 0.1387, 'grad_norm': 0.06440357118844986, 'learning_rate': 8.496732026143791e-05, 'epoch': 7.52}


 15%|‚ñà‚ñå        | 3461/22950 [04:58<1:05:07,  4.99it/s]

{'loss': 0.1303, 'grad_norm': 0.029527422040700912, 'learning_rate': 8.492374727668846e-05, 'epoch': 7.54}


 15%|‚ñà‚ñå        | 3471/22950 [05:00<56:06,  5.79it/s]  

{'loss': 0.0302, 'grad_norm': 0.049958162009716034, 'learning_rate': 8.4880174291939e-05, 'epoch': 7.56}


 15%|‚ñà‚ñå        | 3481/22950 [05:01<56:09,  5.78it/s]  

{'loss': 0.11, 'grad_norm': 0.053907930850982666, 'learning_rate': 8.483660130718955e-05, 'epoch': 7.58}


 15%|‚ñà‚ñå        | 3491/22950 [05:03<56:21,  5.75it/s]

{'loss': 0.1067, 'grad_norm': 0.06882866472005844, 'learning_rate': 8.47930283224401e-05, 'epoch': 7.6}


 15%|‚ñà‚ñå        | 3500/22950 [05:05<1:13:38,  4.40it/s]

{'loss': 0.0848, 'grad_norm': 0.023251596838235855, 'learning_rate': 8.474945533769063e-05, 'epoch': 7.63}


 15%|‚ñà‚ñå        | 3511/22950 [05:07<52:52,  6.13it/s]  

{'loss': 0.0007, 'grad_norm': 0.05194279924035072, 'learning_rate': 8.470588235294118e-05, 'epoch': 7.65}


 15%|‚ñà‚ñå        | 3521/22950 [05:08<56:10,  5.76it/s]

{'loss': 0.0008, 'grad_norm': 0.029969921335577965, 'learning_rate': 8.466230936819173e-05, 'epoch': 7.67}


 15%|‚ñà‚ñå        | 3531/22950 [05:10<56:13,  5.76it/s]

{'loss': 0.0814, 'grad_norm': 0.015971794724464417, 'learning_rate': 8.461873638344227e-05, 'epoch': 7.69}


 15%|‚ñà‚ñå        | 3540/22950 [05:12<58:06,  5.57it/s]

{'loss': 0.0598, 'grad_norm': 0.04118232801556587, 'learning_rate': 8.457516339869282e-05, 'epoch': 7.71}


 15%|‚ñà‚ñå        | 3551/22950 [05:14<56:40,  5.71it/s]  

{'loss': 0.1425, 'grad_norm': 0.37758511304855347, 'learning_rate': 8.453159041394336e-05, 'epoch': 7.73}


 16%|‚ñà‚ñå        | 3561/22950 [05:15<55:54,  5.78it/s]

{'loss': 0.0975, 'grad_norm': 0.01982320286333561, 'learning_rate': 8.44880174291939e-05, 'epoch': 7.76}


 16%|‚ñà‚ñå        | 3571/22950 [05:17<55:59,  5.77it/s]

{'loss': 0.1293, 'grad_norm': 0.04088256508111954, 'learning_rate': 8.444444444444444e-05, 'epoch': 7.78}


 16%|‚ñà‚ñå        | 3581/22950 [05:19<55:32,  5.81it/s]

{'loss': 0.001, 'grad_norm': 0.015425250865519047, 'learning_rate': 8.4400871459695e-05, 'epoch': 7.8}


 16%|‚ñà‚ñå        | 3591/22950 [05:21<54:43,  5.90it/s]

{'loss': 0.0431, 'grad_norm': 0.02309001237154007, 'learning_rate': 8.435729847494553e-05, 'epoch': 7.82}


 16%|‚ñà‚ñå        | 3601/22950 [05:22<54:05,  5.96it/s]

{'loss': 0.0461, 'grad_norm': 44.371246337890625, 'learning_rate': 8.431372549019608e-05, 'epoch': 7.84}


 16%|‚ñà‚ñå        | 3611/22950 [05:24<51:53,  6.21it/s]  

{'loss': 0.0407, 'grad_norm': 0.01633119024336338, 'learning_rate': 8.427015250544663e-05, 'epoch': 7.86}


 16%|‚ñà‚ñå        | 3621/22950 [05:26<53:25,  6.03it/s]

{'loss': 0.188, 'grad_norm': 0.024231446906924248, 'learning_rate': 8.422657952069717e-05, 'epoch': 7.89}


 16%|‚ñà‚ñå        | 3631/22950 [05:27<52:13,  6.17it/s]

{'loss': 0.0014, 'grad_norm': 0.012397798709571362, 'learning_rate': 8.418300653594772e-05, 'epoch': 7.91}


 16%|‚ñà‚ñå        | 3641/22950 [05:29<51:37,  6.23it/s]

{'loss': 0.1471, 'grad_norm': 0.02584845945239067, 'learning_rate': 8.413943355119827e-05, 'epoch': 7.93}


 16%|‚ñà‚ñå        | 3651/22950 [05:31<51:10,  6.28it/s]

{'loss': 0.0719, 'grad_norm': 8.07699966430664, 'learning_rate': 8.40958605664488e-05, 'epoch': 7.95}


 16%|‚ñà‚ñå        | 3661/22950 [05:32<50:37,  6.35it/s]

{'loss': 0.001, 'grad_norm': 0.01880578137934208, 'learning_rate': 8.405228758169935e-05, 'epoch': 7.97}


 16%|‚ñà‚ñå        | 3671/22950 [05:34<49:41,  6.47it/s]

{'loss': 0.0012, 'grad_norm': 0.0299154631793499, 'learning_rate': 8.40087145969499e-05, 'epoch': 8.0}


                                                    
 16%|‚ñà‚ñå        | 3672/22950 [05:38<49:41,  6.47it/s]

{'eval_loss': 0.9441902041435242, 'eval_accuracy': 0.8406862616539001, 'eval_runtime': 4.5628, 'eval_samples_per_second': 89.42, 'eval_steps_per_second': 11.177, 'epoch': 8.0}


 16%|‚ñà‚ñå        | 3681/22950 [05:40<1:21:38,  3.93it/s]

{'loss': 0.017, 'grad_norm': 0.07770567387342453, 'learning_rate': 8.396514161220044e-05, 'epoch': 8.02}


 16%|‚ñà‚ñå        | 3691/22950 [05:42<46:54,  6.84it/s]  

{'loss': 0.1462, 'grad_norm': 0.04227592796087265, 'learning_rate': 8.392156862745099e-05, 'epoch': 8.04}


 16%|‚ñà‚ñå        | 3701/22950 [05:43<45:32,  7.05it/s]

{'loss': 0.1397, 'grad_norm': 0.029922237619757652, 'learning_rate': 8.387799564270154e-05, 'epoch': 8.06}


 16%|‚ñà‚ñå        | 3711/22950 [05:45<42:19,  7.58it/s]  

{'loss': 0.0026, 'grad_norm': 0.026046963408589363, 'learning_rate': 8.383442265795208e-05, 'epoch': 8.08}


 16%|‚ñà‚ñå        | 3721/22950 [05:46<44:02,  7.28it/s]

{'loss': 0.0022, 'grad_norm': 0.019767343997955322, 'learning_rate': 8.379084967320261e-05, 'epoch': 8.1}


 16%|‚ñà‚ñã        | 3731/22950 [05:47<44:08,  7.26it/s]

{'loss': 0.0583, 'grad_norm': 0.014622305519878864, 'learning_rate': 8.374727668845315e-05, 'epoch': 8.13}


 16%|‚ñà‚ñã        | 3741/22950 [05:49<43:02,  7.44it/s]

{'loss': 0.0007, 'grad_norm': 0.1342594474554062, 'learning_rate': 8.37037037037037e-05, 'epoch': 8.15}


 16%|‚ñà‚ñã        | 3751/22950 [05:50<42:23,  7.55it/s]

{'loss': 0.188, 'grad_norm': 0.11530395597219467, 'learning_rate': 8.366013071895425e-05, 'epoch': 8.17}


 16%|‚ñà‚ñã        | 3761/22950 [05:51<41:38,  7.68it/s]

{'loss': 0.0038, 'grad_norm': 0.024209409952163696, 'learning_rate': 8.361655773420479e-05, 'epoch': 8.19}


 16%|‚ñà‚ñã        | 3771/22950 [05:53<41:12,  7.76it/s]

{'loss': 0.0401, 'grad_norm': 0.012828114442527294, 'learning_rate': 8.357298474945534e-05, 'epoch': 8.21}


 16%|‚ñà‚ñã        | 3781/22950 [05:54<40:47,  7.83it/s]

{'loss': 0.0023, 'grad_norm': 0.009979248046875, 'learning_rate': 8.352941176470589e-05, 'epoch': 8.24}


 17%|‚ñà‚ñã        | 3791/22950 [05:55<40:23,  7.90it/s]

{'loss': 0.0358, 'grad_norm': 0.01336497999727726, 'learning_rate': 8.348583877995643e-05, 'epoch': 8.26}


 17%|‚ñà‚ñã        | 3801/22950 [05:57<39:41,  8.04it/s]

{'loss': 0.0004, 'grad_norm': 0.016230396926403046, 'learning_rate': 8.344226579520698e-05, 'epoch': 8.28}


 17%|‚ñà‚ñã        | 3811/22950 [05:58<39:39,  8.04it/s]

{'loss': 0.0005, 'grad_norm': 0.10382896661758423, 'learning_rate': 8.339869281045753e-05, 'epoch': 8.3}


 17%|‚ñà‚ñã        | 3821/22950 [05:59<39:23,  8.09it/s]

{'loss': 0.0012, 'grad_norm': 0.012880704365670681, 'learning_rate': 8.335511982570806e-05, 'epoch': 8.32}


 17%|‚ñà‚ñã        | 3831/22950 [06:00<38:47,  8.21it/s]

{'loss': 0.0889, 'grad_norm': 0.01041412353515625, 'learning_rate': 8.331154684095861e-05, 'epoch': 8.34}


 17%|‚ñà‚ñã        | 3841/22950 [06:01<38:17,  8.32it/s]

{'loss': 0.0797, 'grad_norm': 0.008502278476953506, 'learning_rate': 8.326797385620916e-05, 'epoch': 8.37}


 17%|‚ñà‚ñã        | 3851/22950 [06:03<37:49,  8.42it/s]

{'loss': 0.0041, 'grad_norm': 34.52000427246094, 'learning_rate': 8.32244008714597e-05, 'epoch': 8.39}


 17%|‚ñà‚ñã        | 3861/22950 [06:04<37:29,  8.49it/s]

{'loss': 0.0106, 'grad_norm': 0.01036261860281229, 'learning_rate': 8.318082788671025e-05, 'epoch': 8.41}


 17%|‚ñà‚ñã        | 3871/22950 [06:05<36:58,  8.60it/s]

{'loss': 0.0006, 'grad_norm': 0.007114456035196781, 'learning_rate': 8.313725490196079e-05, 'epoch': 8.43}


 17%|‚ñà‚ñã        | 3881/22950 [06:06<36:57,  8.60it/s]

{'loss': 0.0009, 'grad_norm': 0.00842360220849514, 'learning_rate': 8.309368191721134e-05, 'epoch': 8.45}


 17%|‚ñà‚ñã        | 3891/22950 [06:07<36:44,  8.65it/s]

{'loss': 0.0003, 'grad_norm': 0.06469463557004929, 'learning_rate': 8.305010893246187e-05, 'epoch': 8.47}


 17%|‚ñà‚ñã        | 3901/22950 [06:09<36:15,  8.76it/s]

{'loss': 0.0685, 'grad_norm': 0.024043908342719078, 'learning_rate': 8.300653594771242e-05, 'epoch': 8.5}


 17%|‚ñà‚ñã        | 3911/22950 [06:10<35:42,  8.89it/s]

{'loss': 0.0674, 'grad_norm': 0.007842331193387508, 'learning_rate': 8.296296296296296e-05, 'epoch': 8.52}


 17%|‚ñà‚ñã        | 3921/22950 [06:11<35:12,  9.01it/s]

{'loss': 0.0003, 'grad_norm': 0.005711296573281288, 'learning_rate': 8.291938997821351e-05, 'epoch': 8.54}


 17%|‚ñà‚ñã        | 3931/22950 [06:12<34:57,  9.07it/s]

{'loss': 0.0825, 'grad_norm': 1.5993281602859497, 'learning_rate': 8.287581699346406e-05, 'epoch': 8.56}


 17%|‚ñà‚ñã        | 3941/22950 [06:13<34:24,  9.21it/s]

{'loss': 0.0373, 'grad_norm': 0.010338113643229008, 'learning_rate': 8.28322440087146e-05, 'epoch': 8.58}


 17%|‚ñà‚ñã        | 3951/22950 [06:14<36:14,  8.74it/s]

{'loss': 0.2851, 'grad_norm': 12.498015403747559, 'learning_rate': 8.278867102396515e-05, 'epoch': 8.61}


 17%|‚ñà‚ñã        | 3961/22950 [06:15<33:58,  9.32it/s]

{'loss': 0.0004, 'grad_norm': 0.01902260258793831, 'learning_rate': 8.274509803921568e-05, 'epoch': 8.63}


 17%|‚ñà‚ñã        | 3971/22950 [06:16<33:46,  9.37it/s]

{'loss': 0.1286, 'grad_norm': 31.745262145996094, 'learning_rate': 8.270152505446623e-05, 'epoch': 8.65}


 17%|‚ñà‚ñã        | 3981/22950 [06:17<33:25,  9.46it/s]

{'loss': 0.0775, 'grad_norm': 0.022161537781357765, 'learning_rate': 8.265795206971678e-05, 'epoch': 8.67}


 17%|‚ñà‚ñã        | 3991/22950 [06:18<33:14,  9.51it/s]

{'loss': 0.0259, 'grad_norm': 0.017459379509091377, 'learning_rate': 8.261437908496732e-05, 'epoch': 8.69}


 17%|‚ñà‚ñã        | 4001/22950 [06:19<33:03,  9.56it/s]

{'loss': 0.0465, 'grad_norm': 0.010801630094647408, 'learning_rate': 8.257080610021787e-05, 'epoch': 8.71}


 17%|‚ñà‚ñã        | 4011/22950 [06:20<32:58,  9.57it/s]

{'loss': 0.0004, 'grad_norm': 0.007749311160296202, 'learning_rate': 8.252723311546842e-05, 'epoch': 8.74}


 18%|‚ñà‚ñä        | 4021/22950 [06:22<32:56,  9.58it/s]

{'loss': 0.0015, 'grad_norm': 0.008310046046972275, 'learning_rate': 8.248366013071896e-05, 'epoch': 8.76}


 18%|‚ñà‚ñä        | 4031/22950 [06:23<32:57,  9.57it/s]

{'loss': 0.0979, 'grad_norm': 0.015593888238072395, 'learning_rate': 8.244008714596951e-05, 'epoch': 8.78}


 18%|‚ñà‚ñä        | 4042/22950 [06:24<32:10,  9.79it/s]

{'loss': 0.0679, 'grad_norm': 20.451684951782227, 'learning_rate': 8.239651416122004e-05, 'epoch': 8.8}


 18%|‚ñà‚ñä        | 4051/22950 [06:25<32:22,  9.73it/s]

{'loss': 0.0323, 'grad_norm': 0.013769898563623428, 'learning_rate': 8.23529411764706e-05, 'epoch': 8.82}


 18%|‚ñà‚ñä        | 4061/22950 [06:26<32:17,  9.75it/s]

{'loss': 0.0417, 'grad_norm': 0.009685920551419258, 'learning_rate': 8.230936819172113e-05, 'epoch': 8.85}


 18%|‚ñà‚ñä        | 4072/22950 [06:27<32:02,  9.82it/s]

{'loss': 0.0416, 'grad_norm': 0.026001373305916786, 'learning_rate': 8.226579520697168e-05, 'epoch': 8.87}


 18%|‚ñà‚ñä        | 4081/22950 [06:28<32:00,  9.82it/s]

{'loss': 0.0004, 'grad_norm': 0.014874422922730446, 'learning_rate': 8.222222222222222e-05, 'epoch': 8.89}


 18%|‚ñà‚ñä        | 4091/22950 [06:29<31:53,  9.86it/s]

{'loss': 0.0003, 'grad_norm': 0.01445221807807684, 'learning_rate': 8.217864923747277e-05, 'epoch': 8.91}


 18%|‚ñà‚ñä        | 4101/22950 [06:30<32:27,  9.68it/s]

{'loss': 0.2308, 'grad_norm': 0.010921893641352654, 'learning_rate': 8.213507625272332e-05, 'epoch': 8.93}


 18%|‚ñà‚ñä        | 4111/22950 [06:31<31:02, 10.11it/s]

{'loss': 0.1125, 'grad_norm': 0.017342459410429, 'learning_rate': 8.209150326797386e-05, 'epoch': 8.95}


 18%|‚ñà‚ñä        | 4121/22950 [06:32<31:25,  9.99it/s]

{'loss': 0.0412, 'grad_norm': 72.35075378417969, 'learning_rate': 8.20479302832244e-05, 'epoch': 8.98}


 18%|‚ñà‚ñä        | 4131/22950 [06:33<29:08, 10.76it/s]

{'loss': 0.1611, 'grad_norm': 0.03566427901387215, 'learning_rate': 8.200435729847496e-05, 'epoch': 9.0}


                                                    
 18%|‚ñà‚ñä        | 4131/22950 [06:36<29:08, 10.76it/s]

{'eval_loss': 1.186488389968872, 'eval_accuracy': 0.8161764740943909, 'eval_runtime': 2.9832, 'eval_samples_per_second': 136.766, 'eval_steps_per_second': 17.096, 'epoch': 9.0}


 18%|‚ñà‚ñä        | 4141/22950 [06:37<1:08:58,  4.55it/s]

{'loss': 0.0003, 'grad_norm': 0.010046028532087803, 'learning_rate': 8.196078431372549e-05, 'epoch': 9.02}


 18%|‚ñà‚ñä        | 4151/22950 [06:38<35:06,  8.92it/s]  

{'loss': 0.1789, 'grad_norm': 0.010845513083040714, 'learning_rate': 8.191721132897604e-05, 'epoch': 9.04}


 18%|‚ñà‚ñä        | 4161/22950 [06:39<31:29,  9.94it/s]

{'loss': 0.0296, 'grad_norm': 0.017700934782624245, 'learning_rate': 8.187363834422659e-05, 'epoch': 9.06}


 18%|‚ñà‚ñä        | 4171/22950 [06:40<31:32,  9.92it/s]

{'loss': 0.0026, 'grad_norm': 0.014263873919844627, 'learning_rate': 8.183006535947713e-05, 'epoch': 9.08}


 18%|‚ñà‚ñä        | 4181/22950 [06:41<30:57, 10.10it/s]

{'loss': 0.0832, 'grad_norm': 22.467544555664062, 'learning_rate': 8.178649237472768e-05, 'epoch': 9.11}


 18%|‚ñà‚ñä        | 4191/22950 [06:42<30:18, 10.31it/s]

{'loss': 0.0017, 'grad_norm': 0.009195951744914055, 'learning_rate': 8.174291938997822e-05, 'epoch': 9.13}


 18%|‚ñà‚ñä        | 4201/22950 [06:43<30:10, 10.36it/s]

{'loss': 0.1486, 'grad_norm': 0.01353084109723568, 'learning_rate': 8.169934640522877e-05, 'epoch': 9.15}


 18%|‚ñà‚ñä        | 4211/22950 [06:44<30:04, 10.39it/s]

{'loss': 0.0499, 'grad_norm': 0.011305129155516624, 'learning_rate': 8.16557734204793e-05, 'epoch': 9.17}


 18%|‚ñà‚ñä        | 4221/22950 [06:45<30:06, 10.37it/s]

{'loss': 0.074, 'grad_norm': 0.01814962923526764, 'learning_rate': 8.161220043572985e-05, 'epoch': 9.19}


 18%|‚ñà‚ñä        | 4231/22950 [06:46<29:54, 10.43it/s]

{'loss': 0.0004, 'grad_norm': 0.07685619592666626, 'learning_rate': 8.156862745098039e-05, 'epoch': 9.22}


 18%|‚ñà‚ñä        | 4241/22950 [06:47<29:49, 10.45it/s]

{'loss': 0.0741, 'grad_norm': 0.01175085548311472, 'learning_rate': 8.152505446623094e-05, 'epoch': 9.24}


 19%|‚ñà‚ñä        | 4251/22950 [06:48<29:44, 10.48it/s]

{'loss': 0.0604, 'grad_norm': 0.012678230181336403, 'learning_rate': 8.148148148148148e-05, 'epoch': 9.26}


 19%|‚ñà‚ñä        | 4261/22950 [06:49<29:53, 10.42it/s]

{'loss': 0.0388, 'grad_norm': 0.23432044684886932, 'learning_rate': 8.143790849673203e-05, 'epoch': 9.28}


 19%|‚ñà‚ñä        | 4271/22950 [06:50<29:40, 10.49it/s]

{'loss': 0.0003, 'grad_norm': 0.011029129847884178, 'learning_rate': 8.139433551198258e-05, 'epoch': 9.3}


 19%|‚ñà‚ñä        | 4281/22950 [06:51<29:34, 10.52it/s]

{'loss': 0.0009, 'grad_norm': 0.011559529229998589, 'learning_rate': 8.135076252723311e-05, 'epoch': 9.32}


 19%|‚ñà‚ñä        | 4291/22950 [06:51<29:16, 10.62it/s]

{'loss': 0.001, 'grad_norm': 0.009251661598682404, 'learning_rate': 8.130718954248366e-05, 'epoch': 9.35}


 19%|‚ñà‚ñä        | 4301/22950 [06:52<29:19, 10.60it/s]

{'loss': 0.1574, 'grad_norm': 0.015845881775021553, 'learning_rate': 8.126361655773421e-05, 'epoch': 9.37}


 19%|‚ñà‚ñâ        | 4311/22950 [06:53<29:09, 10.66it/s]

{'loss': 0.0832, 'grad_norm': 0.016569528728723526, 'learning_rate': 8.122004357298475e-05, 'epoch': 9.39}


 19%|‚ñà‚ñâ        | 4321/22950 [06:54<29:09, 10.65it/s]

{'loss': 0.1963, 'grad_norm': 0.019929109141230583, 'learning_rate': 8.11764705882353e-05, 'epoch': 9.41}


 19%|‚ñà‚ñâ        | 4331/22950 [06:55<29:06, 10.66it/s]

{'loss': 0.08, 'grad_norm': 3.259453773498535, 'learning_rate': 8.113289760348585e-05, 'epoch': 9.43}


 19%|‚ñà‚ñâ        | 4341/22950 [06:56<29:43, 10.43it/s]

{'loss': 0.1251, 'grad_norm': 37.21355056762695, 'learning_rate': 8.108932461873639e-05, 'epoch': 9.46}


 19%|‚ñà‚ñâ        | 4351/22950 [06:57<29:10, 10.62it/s]

{'loss': 0.0548, 'grad_norm': 0.05268871411681175, 'learning_rate': 8.104575163398694e-05, 'epoch': 9.48}


 19%|‚ñà‚ñâ        | 4361/22950 [06:58<29:04, 10.66it/s]

{'loss': 0.0021, 'grad_norm': 0.03525322303175926, 'learning_rate': 8.100217864923749e-05, 'epoch': 9.5}


 19%|‚ñà‚ñâ        | 4371/22950 [06:59<28:51, 10.73it/s]

{'loss': 0.2134, 'grad_norm': 0.03358076140284538, 'learning_rate': 8.095860566448802e-05, 'epoch': 9.52}


 19%|‚ñà‚ñâ        | 4381/22950 [07:00<28:59, 10.68it/s]

{'loss': 0.0016, 'grad_norm': 0.05157404765486717, 'learning_rate': 8.091503267973856e-05, 'epoch': 9.54}


 19%|‚ñà‚ñâ        | 4391/22950 [07:01<28:43, 10.77it/s]

{'loss': 0.1005, 'grad_norm': 0.7389188408851624, 'learning_rate': 8.087145969498911e-05, 'epoch': 9.56}


 19%|‚ñà‚ñâ        | 4401/22950 [07:02<28:43, 10.76it/s]

{'loss': 0.0008, 'grad_norm': 0.025227421894669533, 'learning_rate': 8.082788671023965e-05, 'epoch': 9.59}


 19%|‚ñà‚ñâ        | 4411/22950 [07:03<28:32, 10.82it/s]

{'loss': 0.0821, 'grad_norm': 0.013240291737020016, 'learning_rate': 8.07843137254902e-05, 'epoch': 9.61}


 19%|‚ñà‚ñâ        | 4421/22950 [07:04<28:33, 10.81it/s]

{'loss': 0.0587, 'grad_norm': 0.014374759048223495, 'learning_rate': 8.074074074074075e-05, 'epoch': 9.63}


 19%|‚ñà‚ñâ        | 4431/22950 [07:05<28:24, 10.86it/s]

{'loss': 0.2655, 'grad_norm': 0.032231446355581284, 'learning_rate': 8.069716775599128e-05, 'epoch': 9.65}


 19%|‚ñà‚ñâ        | 4441/22950 [07:06<28:00, 11.01it/s]

{'loss': 0.1447, 'grad_norm': 0.5093405842781067, 'learning_rate': 8.065359477124184e-05, 'epoch': 9.67}


 19%|‚ñà‚ñâ        | 4451/22950 [07:06<28:00, 11.01it/s]

{'loss': 0.12, 'grad_norm': 13.918524742126465, 'learning_rate': 8.061002178649237e-05, 'epoch': 9.69}


 19%|‚ñà‚ñâ        | 4461/22950 [07:07<27:51, 11.06it/s]

{'loss': 0.0463, 'grad_norm': 0.03807394206523895, 'learning_rate': 8.056644880174292e-05, 'epoch': 9.72}


 19%|‚ñà‚ñâ        | 4471/22950 [07:08<27:54, 11.04it/s]

{'loss': 0.0437, 'grad_norm': 3.2070584297180176, 'learning_rate': 8.052287581699347e-05, 'epoch': 9.74}


 20%|‚ñà‚ñâ        | 4481/22950 [07:09<27:58, 11.00it/s]

{'loss': 0.0525, 'grad_norm': 0.04112854599952698, 'learning_rate': 8.047930283224401e-05, 'epoch': 9.76}


 20%|‚ñà‚ñâ        | 4491/22950 [07:10<27:56, 11.01it/s]

{'loss': 0.0015, 'grad_norm': 0.010231448337435722, 'learning_rate': 8.043572984749456e-05, 'epoch': 9.78}


 20%|‚ñà‚ñâ        | 4501/22950 [07:11<28:04, 10.96it/s]

{'loss': 0.1521, 'grad_norm': 0.015601584687829018, 'learning_rate': 8.039215686274511e-05, 'epoch': 9.8}


 20%|‚ñà‚ñâ        | 4511/22950 [07:12<27:32, 11.16it/s]

{'loss': 0.0582, 'grad_norm': 0.02792472578585148, 'learning_rate': 8.034858387799565e-05, 'epoch': 9.83}


 20%|‚ñà‚ñâ        | 4521/22950 [07:13<27:35, 11.13it/s]

{'loss': 0.0613, 'grad_norm': 0.012483273632824421, 'learning_rate': 8.03050108932462e-05, 'epoch': 9.85}


 20%|‚ñà‚ñâ        | 4531/22950 [07:14<27:16, 11.26it/s]

{'loss': 0.0884, 'grad_norm': 0.019377684220671654, 'learning_rate': 8.026143790849675e-05, 'epoch': 9.87}


 20%|‚ñà‚ñâ        | 4541/22950 [07:15<27:16, 11.25it/s]

{'loss': 0.2434, 'grad_norm': 42.9317626953125, 'learning_rate': 8.021786492374728e-05, 'epoch': 9.89}


 20%|‚ñà‚ñâ        | 4551/22950 [07:15<27:10, 11.28it/s]

{'loss': 0.0719, 'grad_norm': 0.013798649422824383, 'learning_rate': 8.017429193899782e-05, 'epoch': 9.91}


 20%|‚ñà‚ñâ        | 4561/22950 [07:16<27:12, 11.26it/s]

{'loss': 0.0007, 'grad_norm': 0.020749341696500778, 'learning_rate': 8.013071895424837e-05, 'epoch': 9.93}


 20%|‚ñà‚ñâ        | 4571/22950 [07:17<27:19, 11.21it/s]

{'loss': 0.0006, 'grad_norm': 0.01872897893190384, 'learning_rate': 8.00871459694989e-05, 'epoch': 9.96}


 20%|‚ñà‚ñâ        | 4581/22950 [07:18<27:56, 10.96it/s]

{'loss': 0.0012, 'grad_norm': 0.009629287756979465, 'learning_rate': 8.004357298474946e-05, 'epoch': 9.98}


 20%|‚ñà‚ñà        | 4590/22950 [07:19<27:00, 11.33it/s]

{'loss': 0.0006, 'grad_norm': 0.14043660461902618, 'learning_rate': 8e-05, 'epoch': 10.0}


                                                    
 20%|‚ñà‚ñà        | 4590/22950 [07:22<27:00, 11.33it/s]

{'eval_loss': 1.1237154006958008, 'eval_accuracy': 0.8357843160629272, 'eval_runtime': 2.6423, 'eval_samples_per_second': 154.411, 'eval_steps_per_second': 19.301, 'epoch': 10.0}


 20%|‚ñà‚ñà        | 4601/22950 [07:23<47:56,  6.38it/s]  

{'loss': 0.0002, 'grad_norm': 0.007102778647094965, 'learning_rate': 7.995642701525054e-05, 'epoch': 10.02}


 20%|‚ñà‚ñà        | 4611/22950 [07:24<31:13,  9.79it/s]

{'loss': 0.0732, 'grad_norm': 0.04578900337219238, 'learning_rate': 7.99128540305011e-05, 'epoch': 10.04}


 20%|‚ñà‚ñà        | 4621/22950 [07:25<27:23, 11.15it/s]

{'loss': 0.0003, 'grad_norm': 0.007135958410799503, 'learning_rate': 7.986928104575164e-05, 'epoch': 10.07}


 20%|‚ñà‚ñà        | 4631/22950 [07:25<26:46, 11.40it/s]

{'loss': 0.0003, 'grad_norm': 0.1254386156797409, 'learning_rate': 7.982570806100218e-05, 'epoch': 10.09}


 20%|‚ñà‚ñà        | 4641/22950 [07:26<27:31, 11.09it/s]

{'loss': 0.0003, 'grad_norm': 0.006089155562222004, 'learning_rate': 7.978213507625273e-05, 'epoch': 10.11}


 20%|‚ñà‚ñà        | 4651/22950 [07:27<26:40, 11.43it/s]

{'loss': 0.0017, 'grad_norm': 0.024798519909381866, 'learning_rate': 7.973856209150328e-05, 'epoch': 10.13}


 20%|‚ñà‚ñà        | 4661/22950 [07:28<26:29, 11.51it/s]

{'loss': 0.0002, 'grad_norm': 0.006191829685121775, 'learning_rate': 7.969498910675382e-05, 'epoch': 10.15}


 20%|‚ñà‚ñà        | 4671/22950 [07:29<26:37, 11.44it/s]

{'loss': 0.0602, 'grad_norm': 0.0066398754715919495, 'learning_rate': 7.965141612200437e-05, 'epoch': 10.17}


 20%|‚ñà‚ñà        | 4681/22950 [07:30<26:22, 11.54it/s]

{'loss': 0.0313, 'grad_norm': 150.25221252441406, 'learning_rate': 7.96078431372549e-05, 'epoch': 10.2}


 20%|‚ñà‚ñà        | 4691/22950 [07:31<26:15, 11.59it/s]

{'loss': 0.0002, 'grad_norm': 0.00716119771823287, 'learning_rate': 7.956427015250545e-05, 'epoch': 10.22}


 20%|‚ñà‚ñà        | 4701/22950 [07:32<26:14, 11.59it/s]

{'loss': 0.0733, 'grad_norm': 0.021927788853645325, 'learning_rate': 7.952069716775599e-05, 'epoch': 10.24}


 21%|‚ñà‚ñà        | 4711/22950 [07:32<26:23, 11.52it/s]

{'loss': 0.068, 'grad_norm': 0.004247908480465412, 'learning_rate': 7.947712418300654e-05, 'epoch': 10.26}


 21%|‚ñà‚ñà        | 4721/22950 [07:33<26:26, 11.49it/s]

{'loss': 0.0486, 'grad_norm': 0.0162690207362175, 'learning_rate': 7.943355119825708e-05, 'epoch': 10.28}


 21%|‚ñà‚ñà        | 4731/22950 [07:34<25:53, 11.73it/s]

{'loss': 0.1032, 'grad_norm': 12.538105010986328, 'learning_rate': 7.938997821350763e-05, 'epoch': 10.31}


 21%|‚ñà‚ñà        | 4741/22950 [07:35<26:13, 11.58it/s]

{'loss': 0.0206, 'grad_norm': 0.09309760481119156, 'learning_rate': 7.934640522875816e-05, 'epoch': 10.33}


 21%|‚ñà‚ñà        | 4751/22950 [07:36<26:03, 11.64it/s]

{'loss': 0.0496, 'grad_norm': 0.0055381436832249165, 'learning_rate': 7.930283224400871e-05, 'epoch': 10.35}


 21%|‚ñà‚ñà        | 4761/22950 [07:37<25:58, 11.67it/s]

{'loss': 0.0724, 'grad_norm': 0.03084554523229599, 'learning_rate': 7.925925925925926e-05, 'epoch': 10.37}


 21%|‚ñà‚ñà        | 4771/22950 [07:38<25:55, 11.69it/s]

{'loss': 0.1035, 'grad_norm': 0.009584704414010048, 'learning_rate': 7.92156862745098e-05, 'epoch': 10.39}


 21%|‚ñà‚ñà        | 4781/22950 [07:38<25:50, 11.71it/s]

{'loss': 0.0197, 'grad_norm': 0.00537918321788311, 'learning_rate': 7.917211328976035e-05, 'epoch': 10.41}


 21%|‚ñà‚ñà        | 4791/22950 [07:39<25:47, 11.73it/s]

{'loss': 0.0008, 'grad_norm': 0.1570729911327362, 'learning_rate': 7.91285403050109e-05, 'epoch': 10.44}


 21%|‚ñà‚ñà        | 4801/22950 [07:40<26:21, 11.47it/s]

{'loss': 0.0564, 'grad_norm': 0.004450436681509018, 'learning_rate': 7.908496732026144e-05, 'epoch': 10.46}


 21%|‚ñà‚ñà        | 4811/22950 [07:41<25:43, 11.75it/s]

{'loss': 0.1086, 'grad_norm': 0.11696509271860123, 'learning_rate': 7.904139433551199e-05, 'epoch': 10.48}


 21%|‚ñà‚ñà        | 4821/22950 [07:42<25:52, 11.68it/s]

{'loss': 0.118, 'grad_norm': 0.008362858556210995, 'learning_rate': 7.899782135076254e-05, 'epoch': 10.5}


 21%|‚ñà‚ñà        | 4831/22950 [07:43<25:58, 11.63it/s]

{'loss': 0.0497, 'grad_norm': 0.04402341693639755, 'learning_rate': 7.895424836601308e-05, 'epoch': 10.52}


 21%|‚ñà‚ñà        | 4841/22950 [07:44<25:46, 11.71it/s]

{'loss': 0.0293, 'grad_norm': 0.02049735002219677, 'learning_rate': 7.891067538126363e-05, 'epoch': 10.54}


 21%|‚ñà‚ñà        | 4851/22950 [07:44<25:55, 11.63it/s]

{'loss': 0.0009, 'grad_norm': 0.008456775918602943, 'learning_rate': 7.886710239651418e-05, 'epoch': 10.57}


 21%|‚ñà‚ñà        | 4861/22950 [07:45<25:32, 11.80it/s]

{'loss': 0.0004, 'grad_norm': 0.0070371790789067745, 'learning_rate': 7.882352941176471e-05, 'epoch': 10.59}


 21%|‚ñà‚ñà        | 4871/22950 [07:46<25:35, 11.78it/s]

{'loss': 0.1055, 'grad_norm': 0.02190876193344593, 'learning_rate': 7.877995642701525e-05, 'epoch': 10.61}


 21%|‚ñà‚ñà‚ñè       | 4881/22950 [07:47<25:59, 11.59it/s]

{'loss': 0.1884, 'grad_norm': 0.07925235480070114, 'learning_rate': 7.87363834422658e-05, 'epoch': 10.63}


 21%|‚ñà‚ñà‚ñè       | 4891/22950 [07:48<25:37, 11.75it/s]

{'loss': 0.0383, 'grad_norm': 0.037253107875585556, 'learning_rate': 7.869281045751634e-05, 'epoch': 10.65}


 21%|‚ñà‚ñà‚ñè       | 4901/22950 [07:49<25:48, 11.66it/s]

{'loss': 0.012, 'grad_norm': 0.019241496920585632, 'learning_rate': 7.864923747276689e-05, 'epoch': 10.68}


 21%|‚ñà‚ñà‚ñè       | 4911/22950 [07:50<25:30, 11.78it/s]

{'loss': 0.0082, 'grad_norm': 0.02195778116583824, 'learning_rate': 7.860566448801742e-05, 'epoch': 10.7}


 21%|‚ñà‚ñà‚ñè       | 4921/22950 [07:50<25:33, 11.75it/s]

{'loss': 0.1707, 'grad_norm': 0.009777860715985298, 'learning_rate': 7.856209150326797e-05, 'epoch': 10.72}


 21%|‚ñà‚ñà‚ñè       | 4931/22950 [07:51<25:31, 11.77it/s]

{'loss': 0.0757, 'grad_norm': 0.07530750334262848, 'learning_rate': 7.851851851851852e-05, 'epoch': 10.74}


 22%|‚ñà‚ñà‚ñè       | 4941/22950 [07:52<25:37, 11.71it/s]

{'loss': 0.0005, 'grad_norm': 0.04187234491109848, 'learning_rate': 7.847494553376906e-05, 'epoch': 10.76}


 22%|‚ñà‚ñà‚ñè       | 4951/22950 [07:53<25:34, 11.73it/s]

{'loss': 0.0004, 'grad_norm': 0.011082609184086323, 'learning_rate': 7.843137254901961e-05, 'epoch': 10.78}


 22%|‚ñà‚ñà‚ñè       | 4961/22950 [07:54<26:08, 11.47it/s]

{'loss': 0.0011, 'grad_norm': 0.006376883015036583, 'learning_rate': 7.838779956427016e-05, 'epoch': 10.81}


 22%|‚ñà‚ñà‚ñè       | 4971/22950 [07:55<25:24, 11.79it/s]

{'loss': 0.0003, 'grad_norm': 0.00989922322332859, 'learning_rate': 7.83442265795207e-05, 'epoch': 10.83}


 22%|‚ñà‚ñà‚ñè       | 4981/22950 [07:56<25:16, 11.85it/s]

{'loss': 0.0011, 'grad_norm': 0.018606772646307945, 'learning_rate': 7.830065359477125e-05, 'epoch': 10.85}


 22%|‚ñà‚ñà‚ñè       | 4991/22950 [07:56<25:17, 11.84it/s]

{'loss': 0.1358, 'grad_norm': 0.008690115064382553, 'learning_rate': 7.82570806100218e-05, 'epoch': 10.87}


 22%|‚ñà‚ñà‚ñè       | 5001/22950 [07:57<25:13, 11.86it/s]

{'loss': 0.0439, 'grad_norm': 0.013299399055540562, 'learning_rate': 7.821350762527233e-05, 'epoch': 10.89}


 22%|‚ñà‚ñà‚ñè       | 5011/22950 [07:58<25:22, 11.78it/s]

{'loss': 0.0838, 'grad_norm': 0.00794298481196165, 'learning_rate': 7.816993464052288e-05, 'epoch': 10.92}


 22%|‚ñà‚ñà‚ñè       | 5021/22950 [07:59<25:36, 11.67it/s]

{'loss': 0.0561, 'grad_norm': 0.2990257143974304, 'learning_rate': 7.812636165577343e-05, 'epoch': 10.94}


 22%|‚ñà‚ñà‚ñè       | 5031/22950 [08:00<25:50, 11.56it/s]

{'loss': 0.0714, 'grad_norm': 0.008052770048379898, 'learning_rate': 7.808278867102397e-05, 'epoch': 10.96}


 22%|‚ñà‚ñà‚ñè       | 5041/22950 [08:01<25:00, 11.93it/s]

{'loss': 0.0109, 'grad_norm': 0.21409422159194946, 'learning_rate': 7.803921568627451e-05, 'epoch': 10.98}


                                                    
 22%|‚ñà‚ñà‚ñè       | 5049/22950 [08:04<23:46, 12.55it/s]

{'eval_loss': 1.102249026298523, 'eval_accuracy': 0.8284313678741455, 'eval_runtime': 2.53, 'eval_samples_per_second': 161.262, 'eval_steps_per_second': 20.158, 'epoch': 11.0}


 22%|‚ñà‚ñà‚ñè       | 5051/22950 [08:04<2:38:39,  1.88it/s]

{'loss': 0.0024, 'grad_norm': 0.006967089604586363, 'learning_rate': 7.799564270152506e-05, 'epoch': 11.0}


 22%|‚ñà‚ñà‚ñè       | 5061/22950 [08:05<44:39,  6.68it/s]  

{'loss': 0.0145, 'grad_norm': 0.011034760624170303, 'learning_rate': 7.79520697167756e-05, 'epoch': 11.02}


 22%|‚ñà‚ñà‚ñè       | 5071/22950 [08:06<29:52,  9.98it/s]

{'loss': 0.0869, 'grad_norm': 0.01602019928395748, 'learning_rate': 7.790849673202614e-05, 'epoch': 11.05}


 22%|‚ñà‚ñà‚ñè       | 5081/22950 [08:07<25:42, 11.58it/s]

{'loss': 0.0003, 'grad_norm': 0.007713334169238806, 'learning_rate': 7.78649237472767e-05, 'epoch': 11.07}


 22%|‚ñà‚ñà‚ñè       | 5091/22950 [08:08<25:52, 11.50it/s]

{'loss': 0.0812, 'grad_norm': 0.0050483145751059055, 'learning_rate': 7.782135076252723e-05, 'epoch': 11.09}


 22%|‚ñà‚ñà‚ñè       | 5101/22950 [08:09<25:12, 11.80it/s]

{'loss': 0.1198, 'grad_norm': 0.49063917994499207, 'learning_rate': 7.777777777777778e-05, 'epoch': 11.11}


 22%|‚ñà‚ñà‚ñè       | 5111/22950 [08:09<25:08, 11.82it/s]

{'loss': 0.1112, 'grad_norm': 20.404964447021484, 'learning_rate': 7.773420479302833e-05, 'epoch': 11.13}


 22%|‚ñà‚ñà‚ñè       | 5121/22950 [08:10<25:03, 11.86it/s]

{'loss': 0.0003, 'grad_norm': 0.06290633976459503, 'learning_rate': 7.769063180827887e-05, 'epoch': 11.15}


 22%|‚ñà‚ñà‚ñè       | 5131/22950 [08:11<25:08, 11.82it/s]

{'loss': 0.0593, 'grad_norm': 0.011661703698337078, 'learning_rate': 7.764705882352942e-05, 'epoch': 11.18}


 22%|‚ñà‚ñà‚ñè       | 5141/22950 [08:12<25:11, 11.78it/s]

{'loss': 0.0012, 'grad_norm': 0.027411630377173424, 'learning_rate': 7.760348583877996e-05, 'epoch': 11.2}


 22%|‚ñà‚ñà‚ñè       | 5151/22950 [08:13<24:55, 11.90it/s]

{'loss': 0.0006, 'grad_norm': 0.03054608218371868, 'learning_rate': 7.75599128540305e-05, 'epoch': 11.22}


 22%|‚ñà‚ñà‚ñè       | 5161/22950 [08:14<24:55, 11.90it/s]

{'loss': 0.0005, 'grad_norm': 0.005368048790842295, 'learning_rate': 7.751633986928106e-05, 'epoch': 11.24}


 23%|‚ñà‚ñà‚ñé       | 5171/22950 [08:14<25:12, 11.75it/s]

{'loss': 0.0958, 'grad_norm': 0.005868096835911274, 'learning_rate': 7.747276688453159e-05, 'epoch': 11.26}


 23%|‚ñà‚ñà‚ñé       | 5181/22950 [08:15<25:00, 11.84it/s]

{'loss': 0.0004, 'grad_norm': 0.005146769806742668, 'learning_rate': 7.742919389978214e-05, 'epoch': 11.29}


 23%|‚ñà‚ñà‚ñé       | 5191/22950 [08:16<24:43, 11.97it/s]

{'loss': 0.0246, 'grad_norm': 0.004917158745229244, 'learning_rate': 7.738562091503269e-05, 'epoch': 11.31}


 23%|‚ñà‚ñà‚ñé       | 5201/22950 [08:17<24:51, 11.90it/s]

{'loss': 0.0711, 'grad_norm': 0.006147541105747223, 'learning_rate': 7.734204793028323e-05, 'epoch': 11.33}


 23%|‚ñà‚ñà‚ñé       | 5211/22950 [08:18<24:47, 11.92it/s]

{'loss': 0.0562, 'grad_norm': 6.269453048706055, 'learning_rate': 7.729847494553377e-05, 'epoch': 11.35}


 23%|‚ñà‚ñà‚ñé       | 5221/22950 [08:19<24:47, 11.92it/s]

{'loss': 0.0006, 'grad_norm': 0.004417852498590946, 'learning_rate': 7.725490196078432e-05, 'epoch': 11.37}


 23%|‚ñà‚ñà‚ñé       | 5231/22950 [08:19<24:41, 11.96it/s]

{'loss': 0.0252, 'grad_norm': 0.34493592381477356, 'learning_rate': 7.721132897603485e-05, 'epoch': 11.39}


 23%|‚ñà‚ñà‚ñé       | 5241/22950 [08:20<25:16, 11.68it/s]

{'loss': 0.088, 'grad_norm': 0.03429926931858063, 'learning_rate': 7.71677559912854e-05, 'epoch': 11.42}


 23%|‚ñà‚ñà‚ñé       | 5251/22950 [08:21<24:31, 12.03it/s]

{'loss': 0.0006, 'grad_norm': 0.016666611656546593, 'learning_rate': 7.712418300653595e-05, 'epoch': 11.44}


 23%|‚ñà‚ñà‚ñé       | 5261/22950 [08:22<24:30, 12.03it/s]

{'loss': 0.0024, 'grad_norm': 0.003975632134824991, 'learning_rate': 7.708061002178649e-05, 'epoch': 11.46}


 23%|‚ñà‚ñà‚ñé       | 5271/22950 [08:23<25:21, 11.62it/s]

{'loss': 0.0465, 'grad_norm': 0.017221711575984955, 'learning_rate': 7.703703703703704e-05, 'epoch': 11.48}


 23%|‚ñà‚ñà‚ñé       | 5281/22950 [08:24<24:40, 11.94it/s]

{'loss': 0.2783, 'grad_norm': 32.23651885986328, 'learning_rate': 7.699346405228759e-05, 'epoch': 11.5}


 23%|‚ñà‚ñà‚ñé       | 5291/22950 [08:25<24:35, 11.97it/s]

{'loss': 0.0018, 'grad_norm': 0.029490912333130836, 'learning_rate': 7.694989106753813e-05, 'epoch': 11.53}


 23%|‚ñà‚ñà‚ñé       | 5301/22950 [08:25<24:35, 11.96it/s]

{'loss': 0.0004, 'grad_norm': 0.030116945505142212, 'learning_rate': 7.690631808278868e-05, 'epoch': 11.55}


 23%|‚ñà‚ñà‚ñé       | 5311/22950 [08:26<24:24, 12.05it/s]

{'loss': 0.0006, 'grad_norm': 0.04636634513735771, 'learning_rate': 7.686274509803923e-05, 'epoch': 11.57}


 23%|‚ñà‚ñà‚ñé       | 5321/22950 [08:27<25:02, 11.73it/s]

{'loss': 0.0641, 'grad_norm': 73.96914672851562, 'learning_rate': 7.681917211328976e-05, 'epoch': 11.59}


 23%|‚ñà‚ñà‚ñé       | 5331/22950 [08:28<24:29, 11.99it/s]

{'loss': 0.106, 'grad_norm': 21.515024185180664, 'learning_rate': 7.677559912854031e-05, 'epoch': 11.61}


 23%|‚ñà‚ñà‚ñé       | 5341/22950 [08:29<24:19, 12.07it/s]

{'loss': 0.0707, 'grad_norm': 1.855865478515625, 'learning_rate': 7.673202614379086e-05, 'epoch': 11.63}


 23%|‚ñà‚ñà‚ñé       | 5351/22950 [08:30<24:33, 11.94it/s]

{'loss': 0.0587, 'grad_norm': 0.01899907924234867, 'learning_rate': 7.66884531590414e-05, 'epoch': 11.66}


 23%|‚ñà‚ñà‚ñé       | 5361/22950 [08:30<24:32, 11.94it/s]

{'loss': 0.0004, 'grad_norm': 0.01202993281185627, 'learning_rate': 7.664488017429195e-05, 'epoch': 11.68}


 23%|‚ñà‚ñà‚ñé       | 5371/22950 [08:31<24:26, 11.99it/s]

{'loss': 0.043, 'grad_norm': 0.020736997947096825, 'learning_rate': 7.660130718954249e-05, 'epoch': 11.7}


 23%|‚ñà‚ñà‚ñé       | 5381/22950 [08:32<24:22, 12.01it/s]

{'loss': 0.0405, 'grad_norm': 52.56166076660156, 'learning_rate': 7.655773420479302e-05, 'epoch': 11.72}


 23%|‚ñà‚ñà‚ñé       | 5391/22950 [08:33<24:13, 12.08it/s]

{'loss': 0.0519, 'grad_norm': 0.007681458257138729, 'learning_rate': 7.651416122004357e-05, 'epoch': 11.74}


 24%|‚ñà‚ñà‚ñé       | 5401/22950 [08:34<24:49, 11.78it/s]

{'loss': 0.1031, 'grad_norm': 3.5724215507507324, 'learning_rate': 7.647058823529411e-05, 'epoch': 11.76}


 24%|‚ñà‚ñà‚ñé       | 5411/22950 [08:35<24:18, 12.02it/s]

{'loss': 0.0073, 'grad_norm': 0.036738842725753784, 'learning_rate': 7.642701525054466e-05, 'epoch': 11.79}


 24%|‚ñà‚ñà‚ñé       | 5421/22950 [08:35<24:21, 11.99it/s]

{'loss': 0.0014, 'grad_norm': 0.3507322669029236, 'learning_rate': 7.638344226579521e-05, 'epoch': 11.81}


 24%|‚ñà‚ñà‚ñé       | 5431/22950 [08:36<24:17, 12.02it/s]

{'loss': 0.0739, 'grad_norm': 0.005153702571988106, 'learning_rate': 7.633986928104575e-05, 'epoch': 11.83}


 24%|‚ñà‚ñà‚ñé       | 5441/22950 [08:37<24:19, 12.00it/s]

{'loss': 0.0381, 'grad_norm': 0.13845737278461456, 'learning_rate': 7.62962962962963e-05, 'epoch': 11.85}


 24%|‚ñà‚ñà‚ñç       | 5451/22950 [08:38<24:15, 12.03it/s]

{'loss': 0.0455, 'grad_norm': 0.006235477048903704, 'learning_rate': 7.625272331154685e-05, 'epoch': 11.87}


 24%|‚ñà‚ñà‚ñç       | 5461/22950 [08:39<24:10, 12.05it/s]

{'loss': 0.0019, 'grad_norm': 0.2197955846786499, 'learning_rate': 7.620915032679738e-05, 'epoch': 11.9}


 24%|‚ñà‚ñà‚ñç       | 5471/22950 [08:40<24:16, 12.00it/s]

{'loss': 0.0003, 'grad_norm': 0.0314033068716526, 'learning_rate': 7.616557734204794e-05, 'epoch': 11.92}


 24%|‚ñà‚ñà‚ñç       | 5481/22950 [08:40<24:26, 11.91it/s]

{'loss': 0.0002, 'grad_norm': 0.009712771512567997, 'learning_rate': 7.612200435729849e-05, 'epoch': 11.94}


 24%|‚ñà‚ñà‚ñç       | 5491/22950 [08:41<24:06, 12.07it/s]

{'loss': 0.0002, 'grad_norm': 0.005270154215395451, 'learning_rate': 7.607843137254902e-05, 'epoch': 11.96}


 24%|‚ñà‚ñà‚ñç       | 5501/22950 [08:42<24:35, 11.82it/s]

{'loss': 0.0576, 'grad_norm': 0.008615467697381973, 'learning_rate': 7.603485838779957e-05, 'epoch': 11.98}


                                                    
 24%|‚ñà‚ñà‚ñç       | 5508/22950 [08:45<24:49, 11.71it/s]

{'eval_loss': 1.1145397424697876, 'eval_accuracy': 0.8480392098426819, 'eval_runtime': 2.5495, 'eval_samples_per_second': 160.034, 'eval_steps_per_second': 20.004, 'epoch': 12.0}


 24%|‚ñà‚ñà‚ñç       | 5511/22950 [08:46<1:58:15,  2.46it/s]

{'loss': 0.0003, 'grad_norm': 0.004148573614656925, 'learning_rate': 7.599128540305012e-05, 'epoch': 12.0}


 24%|‚ñà‚ñà‚ñç       | 5521/22950 [08:47<37:16,  7.79it/s]  

{'loss': 0.0002, 'grad_norm': 0.004254343453794718, 'learning_rate': 7.594771241830066e-05, 'epoch': 12.03}


 24%|‚ñà‚ñà‚ñç       | 5531/22950 [08:48<27:05, 10.72it/s]

{'loss': 0.0448, 'grad_norm': 0.005628762301057577, 'learning_rate': 7.59041394335512e-05, 'epoch': 12.05}


 24%|‚ñà‚ñà‚ñç       | 5541/22950 [08:48<24:53, 11.66it/s]

{'loss': 0.0005, 'grad_norm': 0.009571940638124943, 'learning_rate': 7.586056644880175e-05, 'epoch': 12.07}


 24%|‚ñà‚ñà‚ñç       | 5551/22950 [08:49<24:28, 11.84it/s]

{'loss': 0.0105, 'grad_norm': 0.017629921436309814, 'learning_rate': 7.581699346405228e-05, 'epoch': 12.09}


 24%|‚ñà‚ñà‚ñç       | 5561/22950 [08:50<24:06, 12.02it/s]

{'loss': 0.0273, 'grad_norm': 0.0023692178074270487, 'learning_rate': 7.577342047930283e-05, 'epoch': 12.11}


 24%|‚ñà‚ñà‚ñç       | 5571/22950 [08:51<24:09, 11.99it/s]

{'loss': 0.0002, 'grad_norm': 0.003146957140415907, 'learning_rate': 7.572984749455338e-05, 'epoch': 12.14}


 24%|‚ñà‚ñà‚ñç       | 5581/22950 [08:52<24:11, 11.97it/s]

{'loss': 0.0003, 'grad_norm': 0.0028869968373328447, 'learning_rate': 7.568627450980392e-05, 'epoch': 12.16}


 24%|‚ñà‚ñà‚ñç       | 5591/22950 [08:53<24:08, 11.99it/s]

{'loss': 0.0001, 'grad_norm': 0.0024128949735313654, 'learning_rate': 7.564270152505447e-05, 'epoch': 12.18}


 24%|‚ñà‚ñà‚ñç       | 5601/22950 [08:53<23:58, 12.06it/s]

{'loss': 0.0001, 'grad_norm': 0.004612130578607321, 'learning_rate': 7.559912854030502e-05, 'epoch': 12.2}


 24%|‚ñà‚ñà‚ñç       | 5611/22950 [08:54<24:29, 11.80it/s]

{'loss': 0.1193, 'grad_norm': 0.003804196836426854, 'learning_rate': 7.555555555555556e-05, 'epoch': 12.22}


 24%|‚ñà‚ñà‚ñç       | 5621/22950 [08:55<24:01, 12.02it/s]

{'loss': 0.0744, 'grad_norm': 28.47087287902832, 'learning_rate': 7.55119825708061e-05, 'epoch': 12.24}


 25%|‚ñà‚ñà‚ñç       | 5631/22950 [08:56<24:04, 11.99it/s]

{'loss': 0.0384, 'grad_norm': 0.01691649667918682, 'learning_rate': 7.546840958605664e-05, 'epoch': 12.27}


 25%|‚ñà‚ñà‚ñç       | 5641/22950 [08:57<23:55, 12.06it/s]

{'loss': 0.0426, 'grad_norm': 37.021942138671875, 'learning_rate': 7.54248366013072e-05, 'epoch': 12.29}


 25%|‚ñà‚ñà‚ñç       | 5651/22950 [08:58<23:54, 12.06it/s]

{'loss': 0.0019, 'grad_norm': 0.03335657715797424, 'learning_rate': 7.538126361655774e-05, 'epoch': 12.31}


 25%|‚ñà‚ñà‚ñç       | 5661/22950 [08:58<24:04, 11.97it/s]

{'loss': 0.0004, 'grad_norm': 0.0599299818277359, 'learning_rate': 7.533769063180828e-05, 'epoch': 12.33}


 25%|‚ñà‚ñà‚ñç       | 5671/22950 [08:59<23:40, 12.16it/s]

{'loss': 0.096, 'grad_norm': 0.03477390483021736, 'learning_rate': 7.529411764705883e-05, 'epoch': 12.35}


 25%|‚ñà‚ñà‚ñç       | 5681/22950 [09:00<24:04, 11.95it/s]

{'loss': 0.1044, 'grad_norm': 0.013243477791547775, 'learning_rate': 7.525054466230938e-05, 'epoch': 12.37}


 25%|‚ñà‚ñà‚ñç       | 5691/22950 [09:01<24:23, 11.80it/s]

{'loss': 0.0018, 'grad_norm': 0.022475961595773697, 'learning_rate': 7.520697167755992e-05, 'epoch': 12.4}


 25%|‚ñà‚ñà‚ñç       | 5701/22950 [09:02<23:50, 12.06it/s]

{'loss': 0.0007, 'grad_norm': 0.010087811388075352, 'learning_rate': 7.516339869281045e-05, 'epoch': 12.42}


 25%|‚ñà‚ñà‚ñç       | 5711/22950 [09:03<23:51, 12.04it/s]

{'loss': 0.1123, 'grad_norm': 0.019509920850396156, 'learning_rate': 7.5119825708061e-05, 'epoch': 12.44}


 25%|‚ñà‚ñà‚ñç       | 5721/22950 [09:03<23:43, 12.10it/s]

{'loss': 0.1309, 'grad_norm': 0.011965480633080006, 'learning_rate': 7.507625272331154e-05, 'epoch': 12.46}


 25%|‚ñà‚ñà‚ñç       | 5731/22950 [09:04<23:59, 11.96it/s]

{'loss': 0.0005, 'grad_norm': 0.014409187249839306, 'learning_rate': 7.503267973856209e-05, 'epoch': 12.48}


 25%|‚ñà‚ñà‚ñå       | 5741/22950 [09:05<24:00, 11.95it/s]

{'loss': 0.0884, 'grad_norm': 0.011955841444432735, 'learning_rate': 7.498910675381264e-05, 'epoch': 12.51}


 25%|‚ñà‚ñà‚ñå       | 5751/22950 [09:06<23:55, 11.98it/s]

{'loss': 0.0007, 'grad_norm': 0.0936691090464592, 'learning_rate': 7.494553376906318e-05, 'epoch': 12.53}


 25%|‚ñà‚ñà‚ñå       | 5761/22950 [09:07<24:09, 11.86it/s]

{'loss': 0.0006, 'grad_norm': 0.010328106582164764, 'learning_rate': 7.490196078431373e-05, 'epoch': 12.55}


 25%|‚ñà‚ñà‚ñå       | 5771/22950 [09:08<23:51, 12.00it/s]

{'loss': 0.0141, 'grad_norm': 0.01878761127591133, 'learning_rate': 7.485838779956428e-05, 'epoch': 12.57}


 25%|‚ñà‚ñà‚ñå       | 5781/22950 [09:08<23:50, 12.00it/s]

{'loss': 0.0406, 'grad_norm': 0.009265282191336155, 'learning_rate': 7.481481481481481e-05, 'epoch': 12.59}


 25%|‚ñà‚ñà‚ñå       | 5791/22950 [09:09<23:37, 12.10it/s]

{'loss': 0.0633, 'grad_norm': 0.008687668479979038, 'learning_rate': 7.477124183006536e-05, 'epoch': 12.61}


 25%|‚ñà‚ñà‚ñå       | 5801/22950 [09:10<23:39, 12.08it/s]

{'loss': 0.0003, 'grad_norm': 0.016935380175709724, 'learning_rate': 7.472766884531592e-05, 'epoch': 12.64}


 25%|‚ñà‚ñà‚ñå       | 5811/22950 [09:11<23:47, 12.01it/s]

{'loss': 0.0613, 'grad_norm': 0.010852613486349583, 'learning_rate': 7.468409586056645e-05, 'epoch': 12.66}


 25%|‚ñà‚ñà‚ñå       | 5821/22950 [09:12<24:09, 11.82it/s]

{'loss': 0.1086, 'grad_norm': 35.6172981262207, 'learning_rate': 7.4640522875817e-05, 'epoch': 12.68}


 25%|‚ñà‚ñà‚ñå       | 5831/22950 [09:13<24:05, 11.84it/s]

{'loss': 0.0002, 'grad_norm': 0.006890362128615379, 'learning_rate': 7.459694989106755e-05, 'epoch': 12.7}


 25%|‚ñà‚ñà‚ñå       | 5841/22950 [09:13<23:45, 12.00it/s]

{'loss': 0.047, 'grad_norm': 0.010199214331805706, 'learning_rate': 7.455337690631809e-05, 'epoch': 12.72}


 25%|‚ñà‚ñà‚ñå       | 5851/22950 [09:14<23:37, 12.06it/s]

{'loss': 0.1103, 'grad_norm': 0.00406284537166357, 'learning_rate': 7.450980392156864e-05, 'epoch': 12.75}


 26%|‚ñà‚ñà‚ñå       | 5861/22950 [09:15<23:37, 12.05it/s]

{'loss': 0.0476, 'grad_norm': 0.18582366406917572, 'learning_rate': 7.446623093681918e-05, 'epoch': 12.77}


 26%|‚ñà‚ñà‚ñå       | 5871/22950 [09:16<23:26, 12.14it/s]

{'loss': 0.0267, 'grad_norm': 0.015492614358663559, 'learning_rate': 7.442265795206971e-05, 'epoch': 12.79}


 26%|‚ñà‚ñà‚ñå       | 5881/22950 [09:17<23:45, 11.97it/s]

{'loss': 0.1509, 'grad_norm': 48.13603591918945, 'learning_rate': 7.437908496732026e-05, 'epoch': 12.81}


 26%|‚ñà‚ñà‚ñå       | 5891/22950 [09:17<24:01, 11.84it/s]

{'loss': 0.0341, 'grad_norm': 0.007805976085364819, 'learning_rate': 7.43355119825708e-05, 'epoch': 12.83}


 26%|‚ñà‚ñà‚ñå       | 5901/22950 [09:18<23:18, 12.19it/s]

{'loss': 0.0002, 'grad_norm': 0.007240112405270338, 'learning_rate': 7.429193899782135e-05, 'epoch': 12.85}


 26%|‚ñà‚ñà‚ñå       | 5911/22950 [09:19<23:38, 12.02it/s]

{'loss': 0.3427, 'grad_norm': 14.271882057189941, 'learning_rate': 7.42483660130719e-05, 'epoch': 12.88}


 26%|‚ñà‚ñà‚ñå       | 5921/22950 [09:20<23:23, 12.13it/s]

{'loss': 0.0007, 'grad_norm': 0.028148164972662926, 'learning_rate': 7.420479302832244e-05, 'epoch': 12.9}


 26%|‚ñà‚ñà‚ñå       | 5931/22950 [09:21<23:27, 12.09it/s]

{'loss': 0.034, 'grad_norm': 0.01922539994120598, 'learning_rate': 7.416122004357299e-05, 'epoch': 12.92}


 26%|‚ñà‚ñà‚ñå       | 5941/22950 [09:22<23:27, 12.08it/s]

{'loss': 0.0641, 'grad_norm': 0.01706862635910511, 'learning_rate': 7.411764705882354e-05, 'epoch': 12.94}


 26%|‚ñà‚ñà‚ñå       | 5951/22950 [09:22<23:22, 12.12it/s]

{'loss': 0.0091, 'grad_norm': 2.143747568130493, 'learning_rate': 7.407407407407407e-05, 'epoch': 12.96}


 26%|‚ñà‚ñà‚ñå       | 5961/22950 [09:23<24:24, 11.60it/s]

{'loss': 0.1938, 'grad_norm': 0.03339885175228119, 'learning_rate': 7.403050108932462e-05, 'epoch': 12.98}


                                                    
 26%|‚ñà‚ñà‚ñå       | 5967/22950 [09:26<22:22, 12.65it/s]

{'eval_loss': 0.8999571204185486, 'eval_accuracy': 0.8529411554336548, 'eval_runtime': 2.4924, 'eval_samples_per_second': 163.698, 'eval_steps_per_second': 20.462, 'epoch': 13.0}


 26%|‚ñà‚ñà‚ñå       | 5971/22950 [09:27<1:49:08,  2.59it/s]

{'loss': 0.0545, 'grad_norm': 0.041031379252672195, 'learning_rate': 7.398692810457517e-05, 'epoch': 13.01}


 26%|‚ñà‚ñà‚ñå       | 5981/22950 [09:28<35:41,  7.92it/s]  

{'loss': 0.0017, 'grad_norm': 0.012825588695704937, 'learning_rate': 7.394335511982571e-05, 'epoch': 13.03}


 26%|‚ñà‚ñà‚ñå       | 5991/22950 [09:29<26:31, 10.66it/s]

{'loss': 0.0016, 'grad_norm': 0.01822812296450138, 'learning_rate': 7.389978213507626e-05, 'epoch': 13.05}


 26%|‚ñà‚ñà‚ñå       | 6001/22950 [09:29<23:53, 11.83it/s]

{'loss': 0.0005, 'grad_norm': 0.06545990705490112, 'learning_rate': 7.385620915032681e-05, 'epoch': 13.07}


 26%|‚ñà‚ñà‚ñå       | 6011/22950 [09:30<23:41, 11.92it/s]

{'loss': 0.001, 'grad_norm': 0.19838902354240417, 'learning_rate': 7.381263616557735e-05, 'epoch': 13.09}


 26%|‚ñà‚ñà‚ñå       | 6021/22950 [09:31<23:52, 11.82it/s]

{'loss': 0.0519, 'grad_norm': 127.94129180908203, 'learning_rate': 7.37690631808279e-05, 'epoch': 13.12}


 26%|‚ñà‚ñà‚ñã       | 6031/22950 [09:32<23:37, 11.94it/s]

{'loss': 0.0365, 'grad_norm': 0.007202567998319864, 'learning_rate': 7.372549019607843e-05, 'epoch': 13.14}


 26%|‚ñà‚ñà‚ñã       | 6041/22950 [09:33<23:22, 12.05it/s]

{'loss': 0.0622, 'grad_norm': 0.013955730944871902, 'learning_rate': 7.368191721132897e-05, 'epoch': 13.16}


 26%|‚ñà‚ñà‚ñã       | 6051/22950 [09:34<23:25, 12.02it/s]

{'loss': 0.0789, 'grad_norm': 29.656742095947266, 'learning_rate': 7.363834422657952e-05, 'epoch': 13.18}


 26%|‚ñà‚ñà‚ñã       | 6061/22950 [09:34<23:26, 12.01it/s]

{'loss': 0.1171, 'grad_norm': 0.43314850330352783, 'learning_rate': 7.359477124183007e-05, 'epoch': 13.2}


 26%|‚ñà‚ñà‚ñã       | 6071/22950 [09:35<23:28, 11.99it/s]

{'loss': 0.0263, 'grad_norm': 0.014857066795229912, 'learning_rate': 7.355119825708061e-05, 'epoch': 13.22}


 26%|‚ñà‚ñà‚ñã       | 6081/22950 [09:36<23:17, 12.07it/s]

{'loss': 0.0004, 'grad_norm': 0.17486484348773956, 'learning_rate': 7.350762527233116e-05, 'epoch': 13.25}


 27%|‚ñà‚ñà‚ñã       | 6091/22950 [09:37<23:50, 11.79it/s]

{'loss': 0.0144, 'grad_norm': 0.019290072843432426, 'learning_rate': 7.346405228758171e-05, 'epoch': 13.27}


 27%|‚ñà‚ñà‚ñã       | 6101/22950 [09:38<23:12, 12.10it/s]

{'loss': 0.069, 'grad_norm': 0.007027773652225733, 'learning_rate': 7.342047930283224e-05, 'epoch': 13.29}


 27%|‚ñà‚ñà‚ñã       | 6111/22950 [09:39<23:10, 12.11it/s]

{'loss': 0.107, 'grad_norm': 7.6253204345703125, 'learning_rate': 7.33769063180828e-05, 'epoch': 13.31}


 27%|‚ñà‚ñà‚ñã       | 6121/22950 [09:39<23:25, 11.97it/s]

{'loss': 0.0403, 'grad_norm': 0.02296769805252552, 'learning_rate': 7.333333333333333e-05, 'epoch': 13.33}


 27%|‚ñà‚ñà‚ñã       | 6131/22950 [09:40<23:09, 12.11it/s]

{'loss': 0.0649, 'grad_norm': 0.018509289249777794, 'learning_rate': 7.328976034858388e-05, 'epoch': 13.36}


 27%|‚ñà‚ñà‚ñã       | 6141/22950 [09:41<23:16, 12.04it/s]

{'loss': 0.1011, 'grad_norm': 0.02165757119655609, 'learning_rate': 7.324618736383443e-05, 'epoch': 13.38}


 27%|‚ñà‚ñà‚ñã       | 6151/22950 [09:42<23:12, 12.06it/s]

{'loss': 0.0592, 'grad_norm': 31.430234909057617, 'learning_rate': 7.320261437908497e-05, 'epoch': 13.4}


 27%|‚ñà‚ñà‚ñã       | 6161/22950 [09:43<23:41, 11.81it/s]

{'loss': 0.0009, 'grad_norm': 0.018162287771701813, 'learning_rate': 7.315904139433552e-05, 'epoch': 13.42}


 27%|‚ñà‚ñà‚ñã       | 6171/22950 [09:44<23:17, 12.00it/s]

{'loss': 0.0712, 'grad_norm': 0.01531296782195568, 'learning_rate': 7.311546840958607e-05, 'epoch': 13.44}


 27%|‚ñà‚ñà‚ñã       | 6181/22950 [09:44<23:08, 12.07it/s]

{'loss': 0.0934, 'grad_norm': 0.013587972149252892, 'learning_rate': 7.30718954248366e-05, 'epoch': 13.46}


 27%|‚ñà‚ñà‚ñã       | 6191/22950 [09:45<23:09, 12.06it/s]

{'loss': 0.0663, 'grad_norm': 0.02303602732717991, 'learning_rate': 7.302832244008716e-05, 'epoch': 13.49}


 27%|‚ñà‚ñà‚ñã       | 6201/22950 [09:46<23:12, 12.02it/s]

{'loss': 0.0032, 'grad_norm': 0.04187677800655365, 'learning_rate': 7.298474945533769e-05, 'epoch': 13.51}


 27%|‚ñà‚ñà‚ñã       | 6211/22950 [09:47<23:19, 11.96it/s]

{'loss': 0.0302, 'grad_norm': 0.02579977922141552, 'learning_rate': 7.294117647058823e-05, 'epoch': 13.53}


 27%|‚ñà‚ñà‚ñã       | 6221/22950 [09:48<23:10, 12.03it/s]

{'loss': 0.0008, 'grad_norm': 0.016936903819441795, 'learning_rate': 7.289760348583878e-05, 'epoch': 13.55}


 27%|‚ñà‚ñà‚ñã       | 6231/22950 [09:49<23:38, 11.79it/s]

{'loss': 0.0549, 'grad_norm': 0.011396666057407856, 'learning_rate': 7.285403050108933e-05, 'epoch': 13.57}


 27%|‚ñà‚ñà‚ñã       | 6241/22950 [09:49<23:15, 11.98it/s]

{'loss': 0.017, 'grad_norm': 0.02574576996266842, 'learning_rate': 7.281045751633987e-05, 'epoch': 13.59}


 27%|‚ñà‚ñà‚ñã       | 6251/22950 [09:50<23:11, 12.00it/s]

{'loss': 0.0003, 'grad_norm': 0.015551619231700897, 'learning_rate': 7.276688453159042e-05, 'epoch': 13.62}


 27%|‚ñà‚ñà‚ñã       | 6261/22950 [09:51<23:10, 12.01it/s]

{'loss': 0.2039, 'grad_norm': 0.6718763709068298, 'learning_rate': 7.272331154684097e-05, 'epoch': 13.64}


 27%|‚ñà‚ñà‚ñã       | 6271/22950 [09:52<23:04, 12.04it/s]

{'loss': 0.0014, 'grad_norm': 0.010465123690664768, 'learning_rate': 7.26797385620915e-05, 'epoch': 13.66}


 27%|‚ñà‚ñà‚ñã       | 6281/22950 [09:53<23:11, 11.98it/s]

{'loss': 0.1197, 'grad_norm': 45.562950134277344, 'learning_rate': 7.263616557734205e-05, 'epoch': 13.68}


 27%|‚ñà‚ñà‚ñã       | 6291/22950 [09:54<22:57, 12.09it/s]

{'loss': 0.1354, 'grad_norm': 25.882465362548828, 'learning_rate': 7.25925925925926e-05, 'epoch': 13.7}


 27%|‚ñà‚ñà‚ñã       | 6301/22950 [09:54<23:30, 11.81it/s]

{'loss': 0.0017, 'grad_norm': 0.03282665088772774, 'learning_rate': 7.254901960784314e-05, 'epoch': 13.73}


 27%|‚ñà‚ñà‚ñã       | 6311/22950 [09:55<23:14, 11.93it/s]

{'loss': 0.0639, 'grad_norm': 0.019134003669023514, 'learning_rate': 7.250544662309369e-05, 'epoch': 13.75}


 28%|‚ñà‚ñà‚ñä       | 6321/22950 [09:56<23:12, 11.94it/s]

{'loss': 0.0388, 'grad_norm': 0.01846109889447689, 'learning_rate': 7.246187363834424e-05, 'epoch': 13.77}


 28%|‚ñà‚ñà‚ñä       | 6331/22950 [09:57<23:35, 11.74it/s]

{'loss': 0.0007, 'grad_norm': 0.0462803915143013, 'learning_rate': 7.241830065359478e-05, 'epoch': 13.79}


 28%|‚ñà‚ñà‚ñä       | 6341/22950 [09:58<23:06, 11.98it/s]

{'loss': 0.0049, 'grad_norm': 17.55461883544922, 'learning_rate': 7.237472766884533e-05, 'epoch': 13.81}


 28%|‚ñà‚ñà‚ñä       | 6351/22950 [09:59<23:14, 11.90it/s]

{'loss': 0.0022, 'grad_norm': 0.0293723251670599, 'learning_rate': 7.233115468409586e-05, 'epoch': 13.83}


 28%|‚ñà‚ñà‚ñä       | 6361/22950 [09:59<23:03, 11.99it/s]

{'loss': 0.0007, 'grad_norm': 0.05623548477888107, 'learning_rate': 7.228758169934641e-05, 'epoch': 13.86}


 28%|‚ñà‚ñà‚ñä       | 6371/22950 [10:00<23:25, 11.79it/s]

{'loss': 0.0749, 'grad_norm': 28.792461395263672, 'learning_rate': 7.224400871459695e-05, 'epoch': 13.88}


 28%|‚ñà‚ñà‚ñä       | 6381/22950 [10:01<22:54, 12.05it/s]

{'loss': 0.0938, 'grad_norm': 0.00614724587649107, 'learning_rate': 7.220043572984749e-05, 'epoch': 13.9}


 28%|‚ñà‚ñà‚ñä       | 6391/22950 [10:02<23:05, 11.95it/s]

{'loss': 0.1063, 'grad_norm': 0.006238422356545925, 'learning_rate': 7.215686274509804e-05, 'epoch': 13.92}


 28%|‚ñà‚ñà‚ñä       | 6401/22950 [10:03<23:01, 11.98it/s]

{'loss': 0.0045, 'grad_norm': 24.74534797668457, 'learning_rate': 7.211328976034859e-05, 'epoch': 13.94}


 28%|‚ñà‚ñà‚ñä       | 6411/22950 [10:04<23:03, 11.96it/s]

{'loss': 0.0255, 'grad_norm': 0.02570093609392643, 'learning_rate': 7.206971677559912e-05, 'epoch': 13.97}


 28%|‚ñà‚ñà‚ñä       | 6421/22950 [10:04<23:05, 11.93it/s]

{'loss': 0.0209, 'grad_norm': 0.015196598134934902, 'learning_rate': 7.202614379084967e-05, 'epoch': 13.99}


                                                    
 28%|‚ñà‚ñà‚ñä       | 6426/22950 [10:07<23:08, 11.90it/s]

{'eval_loss': 1.123636245727539, 'eval_accuracy': 0.8210784196853638, 'eval_runtime': 2.506, 'eval_samples_per_second': 162.812, 'eval_steps_per_second': 20.351, 'epoch': 14.0}


 28%|‚ñà‚ñà‚ñä       | 6431/22950 [10:08<1:20:16,  3.43it/s]

{'loss': 0.0004, 'grad_norm': 0.015967778861522675, 'learning_rate': 7.198257080610022e-05, 'epoch': 14.01}


 28%|‚ñà‚ñà‚ñä       | 6441/22950 [10:09<31:31,  8.73it/s]  

{'loss': 0.0447, 'grad_norm': 1.9209378957748413, 'learning_rate': 7.193899782135076e-05, 'epoch': 14.03}


 28%|‚ñà‚ñà‚ñä       | 6451/22950 [10:10<24:52, 11.05it/s]

{'loss': 0.0004, 'grad_norm': 0.03647174313664436, 'learning_rate': 7.189542483660131e-05, 'epoch': 14.05}


 28%|‚ñà‚ñà‚ñä       | 6461/22950 [10:11<23:28, 11.71it/s]

{'loss': 0.0379, 'grad_norm': 0.0076948548667132854, 'learning_rate': 7.185185185185186e-05, 'epoch': 14.07}


 28%|‚ñà‚ñà‚ñä       | 6471/22950 [10:11<22:58, 11.96it/s]

{'loss': 0.0003, 'grad_norm': 0.005460980348289013, 'learning_rate': 7.18082788671024e-05, 'epoch': 14.1}


 28%|‚ñà‚ñà‚ñä       | 6481/22950 [10:12<23:09, 11.85it/s]

{'loss': 0.0002, 'grad_norm': 0.014438509941101074, 'learning_rate': 7.176470588235295e-05, 'epoch': 14.12}


 28%|‚ñà‚ñà‚ñä       | 6491/22950 [10:13<23:08, 11.85it/s]

{'loss': 0.0002, 'grad_norm': 0.00785636156797409, 'learning_rate': 7.17211328976035e-05, 'epoch': 14.14}


 28%|‚ñà‚ñà‚ñä       | 6501/22950 [10:14<23:19, 11.76it/s]

{'loss': 0.0002, 'grad_norm': 0.015319536440074444, 'learning_rate': 7.167755991285404e-05, 'epoch': 14.16}


 28%|‚ñà‚ñà‚ñä       | 6511/22950 [10:15<23:00, 11.91it/s]

{'loss': 0.0003, 'grad_norm': 0.004861438646912575, 'learning_rate': 7.163398692810459e-05, 'epoch': 14.18}


 28%|‚ñà‚ñà‚ñä       | 6521/22950 [10:16<22:57, 11.93it/s]

{'loss': 0.1017, 'grad_norm': 28.57935905456543, 'learning_rate': 7.159041394335512e-05, 'epoch': 14.2}


 28%|‚ñà‚ñà‚ñä       | 6531/22950 [10:16<23:02, 11.88it/s]

{'loss': 0.0612, 'grad_norm': 0.010294213891029358, 'learning_rate': 7.154684095860566e-05, 'epoch': 14.23}


 29%|‚ñà‚ñà‚ñä       | 6541/22950 [10:17<22:58, 11.90it/s]

{'loss': 0.0009, 'grad_norm': 0.05483907461166382, 'learning_rate': 7.150326797385621e-05, 'epoch': 14.25}


 29%|‚ñà‚ñà‚ñä       | 6551/22950 [10:18<22:57, 11.91it/s]

{'loss': 0.0004, 'grad_norm': 0.008452451787889004, 'learning_rate': 7.145969498910676e-05, 'epoch': 14.27}


 29%|‚ñà‚ñà‚ñä       | 6561/22950 [10:19<24:38, 11.08it/s]

{'loss': 0.1106, 'grad_norm': 0.009685520082712173, 'learning_rate': 7.14161220043573e-05, 'epoch': 14.29}


 29%|‚ñà‚ñà‚ñä       | 6571/22950 [10:20<23:17, 11.72it/s]

{'loss': 0.08, 'grad_norm': 0.04847777262330055, 'learning_rate': 7.137254901960785e-05, 'epoch': 14.31}


 29%|‚ñà‚ñà‚ñä       | 6581/22950 [10:21<22:59, 11.87it/s]

{'loss': 0.0285, 'grad_norm': 0.023529620841145515, 'learning_rate': 7.132897603485838e-05, 'epoch': 14.34}


 29%|‚ñà‚ñà‚ñä       | 6591/22950 [10:22<22:56, 11.88it/s]

{'loss': 0.0004, 'grad_norm': 0.016576338559389114, 'learning_rate': 7.128540305010893e-05, 'epoch': 14.36}


 29%|‚ñà‚ñà‚ñâ       | 6601/22950 [10:22<22:53, 11.90it/s]

{'loss': 0.0589, 'grad_norm': 0.01626538299024105, 'learning_rate': 7.124183006535948e-05, 'epoch': 14.38}


 29%|‚ñà‚ñà‚ñâ       | 6611/22950 [10:23<23:07, 11.78it/s]

{'loss': 0.0005, 'grad_norm': 0.015558321960270405, 'learning_rate': 7.119825708061002e-05, 'epoch': 14.4}


 29%|‚ñà‚ñà‚ñâ       | 6621/22950 [10:24<22:57, 11.85it/s]

{'loss': 0.0062, 'grad_norm': 0.0072729079984128475, 'learning_rate': 7.115468409586057e-05, 'epoch': 14.42}


 29%|‚ñà‚ñà‚ñâ       | 6631/22950 [10:25<22:58, 11.84it/s]

{'loss': 0.0147, 'grad_norm': 0.0069349524565041065, 'learning_rate': 7.111111111111112e-05, 'epoch': 14.44}


 29%|‚ñà‚ñà‚ñâ       | 6641/22950 [10:26<22:40, 11.98it/s]

{'loss': 0.0681, 'grad_norm': 0.010453615337610245, 'learning_rate': 7.106753812636166e-05, 'epoch': 14.47}


 29%|‚ñà‚ñà‚ñâ       | 6651/22950 [10:27<23:24, 11.61it/s]

{'loss': 0.09, 'grad_norm': 0.008571074344217777, 'learning_rate': 7.10239651416122e-05, 'epoch': 14.49}


 29%|‚ñà‚ñà‚ñâ       | 6661/22950 [10:27<22:51, 11.88it/s]

{'loss': 0.0013, 'grad_norm': 0.039471764117479324, 'learning_rate': 7.098039215686276e-05, 'epoch': 14.51}


 29%|‚ñà‚ñà‚ñâ       | 6671/22950 [10:28<22:47, 11.90it/s]

{'loss': 0.0002, 'grad_norm': 0.004348713904619217, 'learning_rate': 7.09368191721133e-05, 'epoch': 14.53}


 29%|‚ñà‚ñà‚ñâ       | 6681/22950 [10:29<22:55, 11.83it/s]

{'loss': 0.1726, 'grad_norm': 0.007475684862583876, 'learning_rate': 7.089324618736384e-05, 'epoch': 14.55}


 29%|‚ñà‚ñà‚ñâ       | 6691/22950 [10:30<22:45, 11.91it/s]

{'loss': 0.0005, 'grad_norm': 0.008413439616560936, 'learning_rate': 7.084967320261438e-05, 'epoch': 14.58}


 29%|‚ñà‚ñà‚ñâ       | 6701/22950 [10:31<22:53, 11.83it/s]

{'loss': 0.0539, 'grad_norm': 0.012197108007967472, 'learning_rate': 7.080610021786492e-05, 'epoch': 14.6}


 29%|‚ñà‚ñà‚ñâ       | 6711/22950 [10:32<22:41, 11.92it/s]

{'loss': 0.0003, 'grad_norm': 0.015928680077195168, 'learning_rate': 7.076252723311547e-05, 'epoch': 14.62}


 29%|‚ñà‚ñà‚ñâ       | 6721/22950 [10:33<23:15, 11.63it/s]

{'loss': 0.0697, 'grad_norm': 0.006934128701686859, 'learning_rate': 7.071895424836602e-05, 'epoch': 14.64}


 29%|‚ñà‚ñà‚ñâ       | 6731/22950 [10:33<22:41, 11.92it/s]

{'loss': 0.0013, 'grad_norm': 0.021247737109661102, 'learning_rate': 7.067538126361655e-05, 'epoch': 14.66}


 29%|‚ñà‚ñà‚ñâ       | 6741/22950 [10:34<22:48, 11.84it/s]

{'loss': 0.0003, 'grad_norm': 0.007589902728796005, 'learning_rate': 7.06318082788671e-05, 'epoch': 14.68}


 29%|‚ñà‚ñà‚ñâ       | 6751/22950 [10:35<22:57, 11.76it/s]

{'loss': 0.2813, 'grad_norm': 0.013835923746228218, 'learning_rate': 7.058823529411765e-05, 'epoch': 14.71}


 29%|‚ñà‚ñà‚ñâ       | 6761/22950 [10:36<22:45, 11.85it/s]

{'loss': 0.0011, 'grad_norm': 0.033009178936481476, 'learning_rate': 7.054466230936819e-05, 'epoch': 14.73}


 30%|‚ñà‚ñà‚ñâ       | 6771/22950 [10:37<22:53, 11.78it/s]

{'loss': 0.0008, 'grad_norm': 0.018075309693813324, 'learning_rate': 7.050108932461874e-05, 'epoch': 14.75}


 30%|‚ñà‚ñà‚ñâ       | 6781/22950 [10:38<22:52, 11.78it/s]

{'loss': 0.1825, 'grad_norm': 0.036763474345207214, 'learning_rate': 7.045751633986929e-05, 'epoch': 14.77}


 30%|‚ñà‚ñà‚ñâ       | 6791/22950 [10:38<23:06, 11.65it/s]

{'loss': 0.0653, 'grad_norm': 0.04973914101719856, 'learning_rate': 7.041394335511983e-05, 'epoch': 14.79}


 30%|‚ñà‚ñà‚ñâ       | 6801/22950 [10:39<22:38, 11.89it/s]

{'loss': 0.0023, 'grad_norm': 3.425081491470337, 'learning_rate': 7.037037037037038e-05, 'epoch': 14.81}


 30%|‚ñà‚ñà‚ñâ       | 6811/22950 [10:40<22:44, 11.82it/s]

{'loss': 0.001, 'grad_norm': 0.03708836808800697, 'learning_rate': 7.032679738562093e-05, 'epoch': 14.84}


 30%|‚ñà‚ñà‚ñâ       | 6821/22950 [10:41<22:39, 11.86it/s]

{'loss': 0.0021, 'grad_norm': 0.017917398363351822, 'learning_rate': 7.028322440087147e-05, 'epoch': 14.86}


 30%|‚ñà‚ñà‚ñâ       | 6831/22950 [10:42<22:33, 11.91it/s]

{'loss': 0.1023, 'grad_norm': 0.013267389498651028, 'learning_rate': 7.023965141612202e-05, 'epoch': 14.88}


 30%|‚ñà‚ñà‚ñâ       | 6841/22950 [10:43<22:33, 11.90it/s]

{'loss': 0.0006, 'grad_norm': 0.024167021736502647, 'learning_rate': 7.019607843137255e-05, 'epoch': 14.9}


 30%|‚ñà‚ñà‚ñâ       | 6851/22950 [10:44<23:12, 11.56it/s]

{'loss': 0.0721, 'grad_norm': 0.28499338030815125, 'learning_rate': 7.01525054466231e-05, 'epoch': 14.92}


 30%|‚ñà‚ñà‚ñâ       | 6861/22950 [10:44<22:42, 11.81it/s]

{'loss': 0.0655, 'grad_norm': 0.03836255893111229, 'learning_rate': 7.010893246187364e-05, 'epoch': 14.95}


 30%|‚ñà‚ñà‚ñâ       | 6871/22950 [10:45<22:36, 11.85it/s]

{'loss': 0.0028, 'grad_norm': 0.016038015484809875, 'learning_rate': 7.006535947712418e-05, 'epoch': 14.97}


 30%|‚ñà‚ñà‚ñâ       | 6881/22950 [10:46<22:33, 11.88it/s]

{'loss': 0.0454, 'grad_norm': 9.216401100158691, 'learning_rate': 7.002178649237473e-05, 'epoch': 14.99}


                                                    
 30%|‚ñà‚ñà‚ñà       | 6885/22950 [10:49<21:18, 12.57it/s]

{'eval_loss': 0.9174879193305969, 'eval_accuracy': 0.8529411554336548, 'eval_runtime': 2.5683, 'eval_samples_per_second': 158.858, 'eval_steps_per_second': 19.857, 'epoch': 15.0}


 30%|‚ñà‚ñà‚ñà       | 6891/22950 [10:50<1:20:23,  3.33it/s]

{'loss': 0.0007, 'grad_norm': 0.11240365356206894, 'learning_rate': 6.997821350762528e-05, 'epoch': 15.01}


 30%|‚ñà‚ñà‚ñà       | 6901/22950 [10:51<30:40,  8.72it/s]  

{'loss': 0.0813, 'grad_norm': 0.010116295889019966, 'learning_rate': 6.993464052287581e-05, 'epoch': 15.03}


 30%|‚ñà‚ñà‚ñà       | 6911/22950 [10:52<24:57, 10.71it/s]

{'loss': 0.0007, 'grad_norm': 0.01907469518482685, 'learning_rate': 6.989106753812636e-05, 'epoch': 15.05}


 30%|‚ñà‚ñà‚ñà       | 6921/22950 [10:52<23:07, 11.55it/s]

{'loss': 0.0003, 'grad_norm': 0.00473304046317935, 'learning_rate': 6.984749455337691e-05, 'epoch': 15.08}


 30%|‚ñà‚ñà‚ñà       | 6931/22950 [10:53<22:51, 11.68it/s]

{'loss': 0.1573, 'grad_norm': 8.639824867248535, 'learning_rate': 6.980392156862745e-05, 'epoch': 15.1}


 30%|‚ñà‚ñà‚ñà       | 6941/22950 [10:54<22:49, 11.69it/s]

{'loss': 0.0003, 'grad_norm': 0.015343815088272095, 'learning_rate': 6.9760348583878e-05, 'epoch': 15.12}


 30%|‚ñà‚ñà‚ñà       | 6951/22950 [10:55<22:50, 11.67it/s]

{'loss': 0.0016, 'grad_norm': 0.06625496596097946, 'learning_rate': 6.971677559912855e-05, 'epoch': 15.14}


 30%|‚ñà‚ñà‚ñà       | 6961/22950 [10:56<22:42, 11.73it/s]

{'loss': 0.1572, 'grad_norm': 0.028862273320555687, 'learning_rate': 6.967320261437909e-05, 'epoch': 15.16}


 30%|‚ñà‚ñà‚ñà       | 6971/22950 [10:57<23:00, 11.57it/s]

{'loss': 0.0592, 'grad_norm': 2.2771542072296143, 'learning_rate': 6.962962962962964e-05, 'epoch': 15.19}


 30%|‚ñà‚ñà‚ñà       | 6981/22950 [10:58<22:44, 11.70it/s]

{'loss': 0.0025, 'grad_norm': 0.16088370978832245, 'learning_rate': 6.958605664488019e-05, 'epoch': 15.21}


 30%|‚ñà‚ñà‚ñà       | 6991/22950 [10:58<22:44, 11.70it/s]

{'loss': 0.0585, 'grad_norm': 31.078386306762695, 'learning_rate': 6.954248366013072e-05, 'epoch': 15.23}


 31%|‚ñà‚ñà‚ñà       | 7001/22950 [10:59<22:58, 11.57it/s]

{'loss': 0.0832, 'grad_norm': 0.04411113262176514, 'learning_rate': 6.949891067538127e-05, 'epoch': 15.25}


 31%|‚ñà‚ñà‚ñà       | 7011/22950 [11:00<22:47, 11.66it/s]

{'loss': 0.0012, 'grad_norm': 0.010701834224164486, 'learning_rate': 6.945533769063181e-05, 'epoch': 15.27}


 31%|‚ñà‚ñà‚ñà       | 7021/22950 [11:01<22:42, 11.69it/s]

{'loss': 0.0826, 'grad_norm': 0.1123380959033966, 'learning_rate': 6.941176470588236e-05, 'epoch': 15.29}


 31%|‚ñà‚ñà‚ñà       | 7031/22950 [11:02<22:50, 11.62it/s]

{'loss': 0.1719, 'grad_norm': 6.318185806274414, 'learning_rate': 6.93681917211329e-05, 'epoch': 15.32}


 31%|‚ñà‚ñà‚ñà       | 7041/22950 [11:03<23:11, 11.43it/s]

{'loss': 0.0009, 'grad_norm': 0.13163048028945923, 'learning_rate': 6.932461873638345e-05, 'epoch': 15.34}


 31%|‚ñà‚ñà‚ñà       | 7051/22950 [11:04<22:39, 11.69it/s]

{'loss': 0.0006, 'grad_norm': 0.020208200439810753, 'learning_rate': 6.928104575163398e-05, 'epoch': 15.36}


 31%|‚ñà‚ñà‚ñà       | 7061/22950 [11:04<22:50, 11.59it/s]

{'loss': 0.1388, 'grad_norm': 0.05197887122631073, 'learning_rate': 6.923747276688453e-05, 'epoch': 15.38}


 31%|‚ñà‚ñà‚ñà       | 7071/22950 [11:05<22:44, 11.64it/s]

{'loss': 0.0009, 'grad_norm': 0.043890342116355896, 'learning_rate': 6.919389978213507e-05, 'epoch': 15.4}


 31%|‚ñà‚ñà‚ñà       | 7081/22950 [11:06<22:37, 11.69it/s]

{'loss': 0.0489, 'grad_norm': 0.013410395942628384, 'learning_rate': 6.915032679738562e-05, 'epoch': 15.42}


 31%|‚ñà‚ñà‚ñà       | 7091/22950 [11:07<22:45, 11.62it/s]

{'loss': 0.0782, 'grad_norm': 0.017384828999638557, 'learning_rate': 6.910675381263617e-05, 'epoch': 15.45}


 31%|‚ñà‚ñà‚ñà       | 7101/22950 [11:08<22:40, 11.65it/s]

{'loss': 0.1358, 'grad_norm': 0.01717626489698887, 'learning_rate': 6.906318082788671e-05, 'epoch': 15.47}


 31%|‚ñà‚ñà‚ñà       | 7111/22950 [11:09<23:15, 11.35it/s]

{'loss': 0.0908, 'grad_norm': 12.75301742553711, 'learning_rate': 6.901960784313726e-05, 'epoch': 15.49}


 31%|‚ñà‚ñà‚ñà       | 7121/22950 [11:10<22:29, 11.73it/s]

{'loss': 0.0312, 'grad_norm': 104.02235412597656, 'learning_rate': 6.897603485838781e-05, 'epoch': 15.51}


 31%|‚ñà‚ñà‚ñà       | 7131/22950 [11:10<22:37, 11.65it/s]

{'loss': 0.0005, 'grad_norm': 0.03223780542612076, 'learning_rate': 6.893246187363834e-05, 'epoch': 15.53}


 31%|‚ñà‚ñà‚ñà       | 7141/22950 [11:11<22:47, 11.56it/s]

{'loss': 0.0389, 'grad_norm': 0.035121459513902664, 'learning_rate': 6.88888888888889e-05, 'epoch': 15.56}


 31%|‚ñà‚ñà‚ñà       | 7151/22950 [11:12<22:47, 11.55it/s]

{'loss': 0.0009, 'grad_norm': 0.02035921812057495, 'learning_rate': 6.884531590413945e-05, 'epoch': 15.58}


 31%|‚ñà‚ñà‚ñà       | 7161/22950 [11:13<22:45, 11.56it/s]

{'loss': 0.1422, 'grad_norm': 14.161042213439941, 'learning_rate': 6.880174291938998e-05, 'epoch': 15.6}


 31%|‚ñà‚ñà‚ñà       | 7171/22950 [11:14<23:06, 11.38it/s]

{'loss': 0.0007, 'grad_norm': 0.02306520566344261, 'learning_rate': 6.875816993464053e-05, 'epoch': 15.62}


 31%|‚ñà‚ñà‚ñà‚ñè      | 7181/22950 [11:15<22:52, 11.49it/s]

{'loss': 0.0728, 'grad_norm': 0.014392036013305187, 'learning_rate': 6.871459694989107e-05, 'epoch': 15.64}


 31%|‚ñà‚ñà‚ñà‚ñè      | 7191/22950 [11:16<22:37, 11.61it/s]

{'loss': 0.0007, 'grad_norm': 0.017315233126282692, 'learning_rate': 6.867102396514162e-05, 'epoch': 15.66}


 31%|‚ñà‚ñà‚ñà‚ñè      | 7201/22950 [11:16<22:52, 11.47it/s]

{'loss': 0.0005, 'grad_norm': 0.03965885937213898, 'learning_rate': 6.862745098039216e-05, 'epoch': 15.69}


 31%|‚ñà‚ñà‚ñà‚ñè      | 7211/22950 [11:17<22:40, 11.57it/s]

{'loss': 0.0143, 'grad_norm': 0.09850967675447464, 'learning_rate': 6.85838779956427e-05, 'epoch': 15.71}


 31%|‚ñà‚ñà‚ñà‚ñè      | 7221/22950 [11:18<22:41, 11.55it/s]

{'loss': 0.0003, 'grad_norm': 0.11206625401973724, 'learning_rate': 6.854030501089324e-05, 'epoch': 15.73}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7231/22950 [11:19<24:12, 10.82it/s]

{'loss': 0.1021, 'grad_norm': 0.035716332495212555, 'learning_rate': 6.849673202614379e-05, 'epoch': 15.75}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7241/22950 [11:20<22:59, 11.38it/s]

{'loss': 0.0024, 'grad_norm': 0.17526116967201233, 'learning_rate': 6.845315904139434e-05, 'epoch': 15.77}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7251/22950 [11:21<23:01, 11.36it/s]

{'loss': 0.0632, 'grad_norm': 0.014888443052768707, 'learning_rate': 6.840958605664488e-05, 'epoch': 15.8}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7261/22950 [11:22<22:51, 11.44it/s]

{'loss': 0.157, 'grad_norm': 0.007160520181059837, 'learning_rate': 6.836601307189543e-05, 'epoch': 15.82}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7271/22950 [11:23<22:50, 11.44it/s]

{'loss': 0.006, 'grad_norm': 0.8764719367027283, 'learning_rate': 6.832244008714598e-05, 'epoch': 15.84}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7281/22950 [11:24<23:15, 11.23it/s]

{'loss': 0.113, 'grad_norm': 21.111814498901367, 'learning_rate': 6.827886710239652e-05, 'epoch': 15.86}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7291/22950 [11:25<25:21, 10.29it/s]

{'loss': 0.0089, 'grad_norm': 4.022588729858398, 'learning_rate': 6.823529411764707e-05, 'epoch': 15.88}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7301/22950 [11:25<24:11, 10.78it/s]

{'loss': 0.1869, 'grad_norm': 0.2788960039615631, 'learning_rate': 6.81917211328976e-05, 'epoch': 15.9}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7311/22950 [11:26<22:55, 11.37it/s]

{'loss': 0.1527, 'grad_norm': 0.01568066142499447, 'learning_rate': 6.814814814814815e-05, 'epoch': 15.93}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7321/22950 [11:27<22:55, 11.36it/s]

{'loss': 0.0767, 'grad_norm': 0.044847168028354645, 'learning_rate': 6.81045751633987e-05, 'epoch': 15.95}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7331/22950 [11:28<22:42, 11.46it/s]

{'loss': 0.1431, 'grad_norm': 0.016438497230410576, 'learning_rate': 6.806100217864924e-05, 'epoch': 15.97}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7341/22950 [11:29<22:40, 11.47it/s]

{'loss': 0.0008, 'grad_norm': 0.011625170707702637, 'learning_rate': 6.801742919389979e-05, 'epoch': 15.99}


                                                    
 32%|‚ñà‚ñà‚ñà‚ñè      | 7344/22950 [11:32<22:36, 11.51it/s]

{'eval_loss': 0.8158693313598633, 'eval_accuracy': 0.8529411554336548, 'eval_runtime': 2.6493, 'eval_samples_per_second': 154.002, 'eval_steps_per_second': 19.25, 'epoch': 16.0}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7351/22950 [11:33<1:01:06,  4.25it/s]

{'loss': 0.0767, 'grad_norm': 0.017146475613117218, 'learning_rate': 6.797385620915033e-05, 'epoch': 16.01}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7361/22950 [11:34<30:31,  8.51it/s]  

{'loss': 0.071, 'grad_norm': 0.02359859086573124, 'learning_rate': 6.793028322440086e-05, 'epoch': 16.03}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7371/22950 [11:35<24:13, 10.72it/s]

{'loss': 0.0914, 'grad_norm': 12.326824188232422, 'learning_rate': 6.788671023965141e-05, 'epoch': 16.06}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7381/22950 [11:35<23:12, 11.18it/s]

{'loss': 0.055, 'grad_norm': 0.007545982021838427, 'learning_rate': 6.784313725490196e-05, 'epoch': 16.08}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7391/22950 [11:36<22:59, 11.28it/s]

{'loss': 0.0013, 'grad_norm': 0.021045230329036713, 'learning_rate': 6.77995642701525e-05, 'epoch': 16.1}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7401/22950 [11:37<23:00, 11.26it/s]

{'loss': 0.1516, 'grad_norm': 0.02389468438923359, 'learning_rate': 6.775599128540305e-05, 'epoch': 16.12}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7411/22950 [11:38<23:20, 11.09it/s]

{'loss': 0.0008, 'grad_norm': 0.06677892059087753, 'learning_rate': 6.77124183006536e-05, 'epoch': 16.14}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7421/22950 [11:39<22:53, 11.30it/s]

{'loss': 0.0006, 'grad_norm': 0.020861707627773285, 'learning_rate': 6.766884531590414e-05, 'epoch': 16.17}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7431/22950 [11:40<23:04, 11.21it/s]

{'loss': 0.0053, 'grad_norm': 0.012959087267518044, 'learning_rate': 6.762527233115469e-05, 'epoch': 16.19}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7441/22950 [11:41<23:12, 11.14it/s]

{'loss': 0.0251, 'grad_norm': 0.00412251939997077, 'learning_rate': 6.758169934640524e-05, 'epoch': 16.21}


 32%|‚ñà‚ñà‚ñà‚ñè      | 7451/22950 [11:42<22:48, 11.32it/s]

{'loss': 0.0353, 'grad_norm': 0.00284956069663167, 'learning_rate': 6.753812636165577e-05, 'epoch': 16.23}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7461/22950 [11:43<22:56, 11.25it/s]

{'loss': 0.0003, 'grad_norm': 0.01015681866556406, 'learning_rate': 6.749455337690632e-05, 'epoch': 16.25}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7471/22950 [11:43<22:59, 11.22it/s]

{'loss': 0.0004, 'grad_norm': 0.005490193609148264, 'learning_rate': 6.745098039215687e-05, 'epoch': 16.27}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7481/22950 [11:44<23:26, 11.00it/s]

{'loss': 0.0304, 'grad_norm': 56.32664108276367, 'learning_rate': 6.740740740740741e-05, 'epoch': 16.3}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7491/22950 [11:45<23:11, 11.11it/s]

{'loss': 0.0426, 'grad_norm': 0.0052179209887981415, 'learning_rate': 6.736383442265796e-05, 'epoch': 16.32}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7501/22950 [11:46<23:01, 11.19it/s]

{'loss': 0.0002, 'grad_norm': 0.0038933241739869118, 'learning_rate': 6.73202614379085e-05, 'epoch': 16.34}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7511/22950 [11:47<22:57, 11.21it/s]

{'loss': 0.1619, 'grad_norm': 0.00910342950373888, 'learning_rate': 6.727668845315905e-05, 'epoch': 16.36}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7521/22950 [11:48<22:57, 11.20it/s]

{'loss': 0.0489, 'grad_norm': 0.005681220907717943, 'learning_rate': 6.723311546840959e-05, 'epoch': 16.38}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7531/22950 [11:49<22:57, 11.19it/s]

{'loss': 0.0326, 'grad_norm': 0.5737423300743103, 'learning_rate': 6.718954248366014e-05, 'epoch': 16.41}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7541/22950 [11:50<23:03, 11.14it/s]

{'loss': 0.0011, 'grad_norm': 0.1662278026342392, 'learning_rate': 6.714596949891067e-05, 'epoch': 16.43}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7551/22950 [11:51<23:33, 10.89it/s]

{'loss': 0.0021, 'grad_norm': 0.0041300528682768345, 'learning_rate': 6.710239651416122e-05, 'epoch': 16.45}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7561/22950 [11:52<22:58, 11.16it/s]

{'loss': 0.0002, 'grad_norm': 0.003548016771674156, 'learning_rate': 6.705882352941176e-05, 'epoch': 16.47}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7571/22950 [11:52<22:46, 11.25it/s]

{'loss': 0.0005, 'grad_norm': 0.011002041399478912, 'learning_rate': 6.701525054466231e-05, 'epoch': 16.49}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7581/22950 [11:53<22:50, 11.21it/s]

{'loss': 0.0001, 'grad_norm': 0.0022743777371942997, 'learning_rate': 6.697167755991286e-05, 'epoch': 16.51}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7591/22950 [11:54<22:52, 11.19it/s]

{'loss': 0.0001, 'grad_norm': 0.004747726954519749, 'learning_rate': 6.69281045751634e-05, 'epoch': 16.54}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7601/22950 [11:55<23:00, 11.12it/s]

{'loss': 0.0043, 'grad_norm': 0.11980297416448593, 'learning_rate': 6.688453159041395e-05, 'epoch': 16.56}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7611/22950 [11:56<23:19, 10.96it/s]

{'loss': 0.001, 'grad_norm': 0.009109505452215672, 'learning_rate': 6.68409586056645e-05, 'epoch': 16.58}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7621/22950 [11:57<23:09, 11.03it/s]

{'loss': 0.0002, 'grad_norm': 0.005246657412499189, 'learning_rate': 6.679738562091503e-05, 'epoch': 16.6}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7631/22950 [11:58<23:02, 11.08it/s]

{'loss': 0.1048, 'grad_norm': 17.8727970123291, 'learning_rate': 6.675381263616558e-05, 'epoch': 16.62}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7641/22950 [11:59<23:09, 11.02it/s]

{'loss': 0.104, 'grad_norm': 0.0034692969638854265, 'learning_rate': 6.671023965141613e-05, 'epoch': 16.64}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7651/22950 [12:00<22:57, 11.10it/s]

{'loss': 0.1009, 'grad_norm': 0.010821756906807423, 'learning_rate': 6.666666666666667e-05, 'epoch': 16.67}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7661/22950 [12:01<22:48, 11.17it/s]

{'loss': 0.0236, 'grad_norm': 0.009544349275529385, 'learning_rate': 6.662309368191722e-05, 'epoch': 16.69}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7671/22950 [12:01<23:03, 11.04it/s]

{'loss': 0.1023, 'grad_norm': 0.012765783816576004, 'learning_rate': 6.657952069716776e-05, 'epoch': 16.71}


 33%|‚ñà‚ñà‚ñà‚ñé      | 7681/22950 [12:02<22:57, 11.09it/s]

{'loss': 0.0251, 'grad_norm': 0.0060823881067335606, 'learning_rate': 6.653594771241831e-05, 'epoch': 16.73}


 34%|‚ñà‚ñà‚ñà‚ñé      | 7691/22950 [12:03<22:57, 11.08it/s]

{'loss': 0.0604, 'grad_norm': 0.02256038971245289, 'learning_rate': 6.649237472766884e-05, 'epoch': 16.75}


 34%|‚ñà‚ñà‚ñà‚ñé      | 7701/22950 [12:04<22:57, 11.07it/s]

{'loss': 0.0038, 'grad_norm': 0.010223770514130592, 'learning_rate': 6.64488017429194e-05, 'epoch': 16.78}


 34%|‚ñà‚ñà‚ñà‚ñé      | 7711/22950 [12:05<23:12, 10.94it/s]

{'loss': 0.0047, 'grad_norm': 0.01707705296576023, 'learning_rate': 6.640522875816993e-05, 'epoch': 16.8}


 34%|‚ñà‚ñà‚ñà‚ñé      | 7721/22950 [12:06<23:00, 11.03it/s]

{'loss': 0.0015, 'grad_norm': 0.010129840113222599, 'learning_rate': 6.636165577342048e-05, 'epoch': 16.82}


 34%|‚ñà‚ñà‚ñà‚ñé      | 7731/22950 [12:07<23:26, 10.82it/s]

{'loss': 0.0003, 'grad_norm': 0.0033165859058499336, 'learning_rate': 6.631808278867103e-05, 'epoch': 16.84}


 34%|‚ñà‚ñà‚ñà‚ñé      | 7741/22950 [12:08<22:57, 11.04it/s]

{'loss': 0.0003, 'grad_norm': 0.019581304863095284, 'learning_rate': 6.627450980392157e-05, 'epoch': 16.86}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7751/22950 [12:09<23:02, 10.99it/s]

{'loss': 0.2782, 'grad_norm': 0.034678466618061066, 'learning_rate': 6.623093681917212e-05, 'epoch': 16.88}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7761/22950 [12:10<22:57, 11.03it/s]

{'loss': 0.0004, 'grad_norm': 0.010970653966069221, 'learning_rate': 6.618736383442267e-05, 'epoch': 16.91}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7771/22950 [12:11<22:56, 11.03it/s]

{'loss': 0.0541, 'grad_norm': 0.03547792509198189, 'learning_rate': 6.61437908496732e-05, 'epoch': 16.93}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7781/22950 [12:11<22:50, 11.07it/s]

{'loss': 0.0007, 'grad_norm': 0.010423858650028706, 'learning_rate': 6.610021786492375e-05, 'epoch': 16.95}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7791/22950 [12:12<23:24, 10.80it/s]

{'loss': 0.0006, 'grad_norm': 0.012792062945663929, 'learning_rate': 6.605664488017429e-05, 'epoch': 16.97}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7801/22950 [12:13<22:55, 11.02it/s]

{'loss': 0.0534, 'grad_norm': 0.00775509187951684, 'learning_rate': 6.601307189542484e-05, 'epoch': 16.99}


                                                    
 34%|‚ñà‚ñà‚ñà‚ñç      | 7803/22950 [12:16<21:28, 11.76it/s]

{'eval_loss': 0.9661809206008911, 'eval_accuracy': 0.8504902124404907, 'eval_runtime': 2.7539, 'eval_samples_per_second': 148.154, 'eval_steps_per_second': 18.519, 'epoch': 17.0}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7811/22950 [12:17<1:00:57,  4.14it/s]

{'loss': 0.0002, 'grad_norm': 0.004674654453992844, 'learning_rate': 6.596949891067539e-05, 'epoch': 17.02}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7821/22950 [12:18<30:39,  8.22it/s]  

{'loss': 0.0002, 'grad_norm': 0.0196674894541502, 'learning_rate': 6.592592592592593e-05, 'epoch': 17.04}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7831/22950 [12:19<24:31, 10.27it/s]

{'loss': 0.0153, 'grad_norm': 0.00881696306169033, 'learning_rate': 6.588235294117648e-05, 'epoch': 17.06}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7841/22950 [12:20<23:50, 10.56it/s]

{'loss': 0.0004, 'grad_norm': 0.007795060984790325, 'learning_rate': 6.583877995642702e-05, 'epoch': 17.08}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7851/22950 [12:21<23:14, 10.83it/s]

{'loss': 0.0003, 'grad_norm': 0.012989156879484653, 'learning_rate': 6.579520697167757e-05, 'epoch': 17.1}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7861/22950 [12:22<23:06, 10.89it/s]

{'loss': 0.0002, 'grad_norm': 0.006988695822656155, 'learning_rate': 6.57516339869281e-05, 'epoch': 17.12}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7871/22950 [12:23<23:26, 10.72it/s]

{'loss': 0.1077, 'grad_norm': 0.003833521157503128, 'learning_rate': 6.570806100217865e-05, 'epoch': 17.15}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7881/22950 [12:24<22:58, 10.93it/s]

{'loss': 0.1143, 'grad_norm': 82.77031707763672, 'learning_rate': 6.566448801742919e-05, 'epoch': 17.17}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7891/22950 [12:25<22:59, 10.92it/s]

{'loss': 0.0002, 'grad_norm': 0.008358084596693516, 'learning_rate': 6.562091503267974e-05, 'epoch': 17.19}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7901/22950 [12:26<22:57, 10.93it/s]

{'loss': 0.0004, 'grad_norm': 1.325372576713562, 'learning_rate': 6.557734204793029e-05, 'epoch': 17.21}


 34%|‚ñà‚ñà‚ñà‚ñç      | 7911/22950 [12:27<23:27, 10.69it/s]

{'loss': 0.0079, 'grad_norm': 0.009090704843401909, 'learning_rate': 6.553376906318083e-05, 'epoch': 17.23}


 35%|‚ñà‚ñà‚ñà‚ñç      | 7921/22950 [12:27<23:07, 10.83it/s]

{'loss': 0.0017, 'grad_norm': 0.004629083443433046, 'learning_rate': 6.549019607843138e-05, 'epoch': 17.25}


 35%|‚ñà‚ñà‚ñà‚ñç      | 7931/22950 [12:28<22:54, 10.93it/s]

{'loss': 0.0001, 'grad_norm': 0.004014967475086451, 'learning_rate': 6.544662309368193e-05, 'epoch': 17.28}


 35%|‚ñà‚ñà‚ñà‚ñç      | 7941/22950 [12:29<23:03, 10.85it/s]

{'loss': 0.0101, 'grad_norm': 0.0034341278951615095, 'learning_rate': 6.540305010893246e-05, 'epoch': 17.3}


 35%|‚ñà‚ñà‚ñà‚ñç      | 7951/22950 [12:30<23:05, 10.82it/s]

{'loss': 0.1105, 'grad_norm': 0.003686422249302268, 'learning_rate': 6.535947712418301e-05, 'epoch': 17.32}


 35%|‚ñà‚ñà‚ñà‚ñç      | 7961/22950 [12:31<22:57, 10.89it/s]

{'loss': 0.0003, 'grad_norm': 0.01976584643125534, 'learning_rate': 6.531590413943356e-05, 'epoch': 17.34}


 35%|‚ñà‚ñà‚ñà‚ñç      | 7971/22950 [12:32<23:25, 10.66it/s]

{'loss': 0.0745, 'grad_norm': 0.00861126184463501, 'learning_rate': 6.52723311546841e-05, 'epoch': 17.36}


 35%|‚ñà‚ñà‚ñà‚ñç      | 7981/22950 [12:33<22:55, 10.88it/s]

{'loss': 0.0034, 'grad_norm': 27.52979850769043, 'learning_rate': 6.522875816993465e-05, 'epoch': 17.39}


 35%|‚ñà‚ñà‚ñà‚ñç      | 7991/22950 [12:34<22:53, 10.89it/s]

{'loss': 0.0002, 'grad_norm': 0.009479072876274586, 'learning_rate': 6.51851851851852e-05, 'epoch': 17.41}


 35%|‚ñà‚ñà‚ñà‚ñç      | 8001/22950 [12:35<22:48, 10.92it/s]

{'loss': 0.0003, 'grad_norm': 0.0024314047768712044, 'learning_rate': 6.514161220043574e-05, 'epoch': 17.43}


 35%|‚ñà‚ñà‚ñà‚ñç      | 8011/22950 [12:36<22:52, 10.88it/s]

{'loss': 0.0079, 'grad_norm': 0.0033493018709123135, 'learning_rate': 6.509803921568627e-05, 'epoch': 17.45}


 35%|‚ñà‚ñà‚ñà‚ñç      | 8021/22950 [12:37<22:57, 10.84it/s]

{'loss': 0.0931, 'grad_norm': 0.8312516212463379, 'learning_rate': 6.505446623093682e-05, 'epoch': 17.47}


 35%|‚ñà‚ñà‚ñà‚ñç      | 8031/22950 [12:38<23:12, 10.71it/s]

{'loss': 0.0105, 'grad_norm': 0.019458526745438576, 'learning_rate': 6.501089324618736e-05, 'epoch': 17.49}


 35%|‚ñà‚ñà‚ñà‚ñå      | 8041/22950 [12:39<23:12, 10.71it/s]

{'loss': 0.0001, 'grad_norm': 0.0032714635599404573, 'learning_rate': 6.496732026143791e-05, 'epoch': 17.52}


 35%|‚ñà‚ñà‚ñà‚ñå      | 8051/22950 [12:40<23:07, 10.74it/s]

{'loss': 0.0004, 'grad_norm': 0.003548514097929001, 'learning_rate': 6.492374727668845e-05, 'epoch': 17.54}


 35%|‚ñà‚ñà‚ñà‚ñå      | 8061/22950 [12:40<22:51, 10.86it/s]

{'loss': 0.0001, 'grad_norm': 0.0341549851000309, 'learning_rate': 6.4880174291939e-05, 'epoch': 17.56}


 35%|‚ñà‚ñà‚ñà‚ñå      | 8071/22950 [12:41<22:54, 10.83it/s]

{'loss': 0.0001, 'grad_norm': 0.005655795335769653, 'learning_rate': 6.483660130718955e-05, 'epoch': 17.58}


 35%|‚ñà‚ñà‚ñà‚ñå      | 8081/22950 [12:42<22:53, 10.82it/s]

{'loss': 0.0001, 'grad_norm': 0.005299913231283426, 'learning_rate': 6.479302832244008e-05, 'epoch': 17.6}


 35%|‚ñà‚ñà‚ñà‚ñå      | 8091/22950 [12:43<23:15, 10.64it/s]

{'loss': 0.0733, 'grad_norm': 0.0038251003716140985, 'learning_rate': 6.474945533769063e-05, 'epoch': 17.63}


 35%|‚ñà‚ñà‚ñà‚ñå      | 8101/22950 [12:44<24:09, 10.24it/s]

{'loss': 0.0003, 'grad_norm': 0.004121340345591307, 'learning_rate': 6.470588235294118e-05, 'epoch': 17.65}


 35%|‚ñà‚ñà‚ñà‚ñå      | 8111/22950 [12:45<23:02, 10.74it/s]

{'loss': 0.0868, 'grad_norm': 0.003541837213560939, 'learning_rate': 6.466230936819172e-05, 'epoch': 17.67}


 35%|‚ñà‚ñà‚ñà‚ñå      | 8121/22950 [12:46<22:44, 10.87it/s]

{'loss': 0.0005, 'grad_norm': 0.0037165225949138403, 'learning_rate': 6.461873638344227e-05, 'epoch': 17.69}


 35%|‚ñà‚ñà‚ñà‚ñå      | 8131/22950 [12:47<22:48, 10.83it/s]

{'loss': 0.0009, 'grad_norm': 0.006512267515063286, 'learning_rate': 6.457516339869282e-05, 'epoch': 17.71}


 35%|‚ñà‚ñà‚ñà‚ñå      | 8141/22950 [12:48<22:42, 10.87it/s]

{'loss': 0.0001, 'grad_norm': 0.005164520815014839, 'learning_rate': 6.453159041394336e-05, 'epoch': 17.73}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8151/22950 [12:49<22:41, 10.87it/s]

{'loss': 0.0032, 'grad_norm': 0.3033272922039032, 'learning_rate': 6.448801742919391e-05, 'epoch': 17.76}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8161/22950 [12:50<23:03, 10.69it/s]

{'loss': 0.2098, 'grad_norm': 0.004603009670972824, 'learning_rate': 6.444444444444446e-05, 'epoch': 17.78}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8171/22950 [12:51<22:52, 10.77it/s]

{'loss': 0.0638, 'grad_norm': 0.0116347037255764, 'learning_rate': 6.4400871459695e-05, 'epoch': 17.8}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8181/22950 [12:52<22:43, 10.83it/s]

{'loss': 0.0539, 'grad_norm': 0.09997627139091492, 'learning_rate': 6.435729847494553e-05, 'epoch': 17.82}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8191/22950 [12:53<22:40, 10.84it/s]

{'loss': 0.0003, 'grad_norm': 0.005299780517816544, 'learning_rate': 6.431372549019608e-05, 'epoch': 17.84}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8201/22950 [12:54<22:36, 10.88it/s]

{'loss': 0.0828, 'grad_norm': 0.007544658146798611, 'learning_rate': 6.427015250544662e-05, 'epoch': 17.86}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8211/22950 [12:54<22:44, 10.80it/s]

{'loss': 0.0002, 'grad_norm': 0.013500414788722992, 'learning_rate': 6.422657952069717e-05, 'epoch': 17.89}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8221/22950 [12:55<23:05, 10.63it/s]

{'loss': 0.1134, 'grad_norm': 0.00650946656242013, 'learning_rate': 6.418300653594772e-05, 'epoch': 17.91}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8231/22950 [12:56<22:48, 10.76it/s]

{'loss': 0.0116, 'grad_norm': 0.24888142943382263, 'learning_rate': 6.413943355119826e-05, 'epoch': 17.93}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8241/22950 [12:57<22:37, 10.84it/s]

{'loss': 0.1528, 'grad_norm': 0.3250167965888977, 'learning_rate': 6.40958605664488e-05, 'epoch': 17.95}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8251/22950 [12:58<22:41, 10.80it/s]

{'loss': 0.0007, 'grad_norm': 0.007105708122253418, 'learning_rate': 6.405228758169934e-05, 'epoch': 17.97}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8261/22950 [12:59<22:46, 10.75it/s]

{'loss': 0.0004, 'grad_norm': 0.007410126738250256, 'learning_rate': 6.400871459694989e-05, 'epoch': 18.0}


                                                    
 36%|‚ñà‚ñà‚ñà‚ñå      | 8262/22950 [13:02<22:45, 10.75it/s]

{'eval_loss': 1.0618555545806885, 'eval_accuracy': 0.8382353186607361, 'eval_runtime': 2.7959, 'eval_samples_per_second': 145.927, 'eval_steps_per_second': 18.241, 'epoch': 18.0}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8271/22950 [13:03<48:31,  5.04it/s]  

{'loss': 0.1079, 'grad_norm': 0.004660238046199083, 'learning_rate': 6.396514161220044e-05, 'epoch': 18.02}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8281/22950 [13:04<27:22,  8.93it/s]

{'loss': 0.0002, 'grad_norm': 0.018589016050100327, 'learning_rate': 6.392156862745098e-05, 'epoch': 18.04}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8291/22950 [13:05<23:54, 10.22it/s]

{'loss': 0.0011, 'grad_norm': 0.015332275070250034, 'learning_rate': 6.387799564270153e-05, 'epoch': 18.06}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8301/22950 [13:06<23:07, 10.56it/s]

{'loss': 0.0004, 'grad_norm': 0.003287432249635458, 'learning_rate': 6.383442265795208e-05, 'epoch': 18.08}


 36%|‚ñà‚ñà‚ñà‚ñå      | 8311/22950 [13:07<22:45, 10.72it/s]

{'loss': 0.0003, 'grad_norm': 0.004033638630062342, 'learning_rate': 6.379084967320262e-05, 'epoch': 18.1}


 36%|‚ñà‚ñà‚ñà‚ñã      | 8321/22950 [13:08<23:35, 10.34it/s]

{'loss': 0.0002, 'grad_norm': 0.004180096555501223, 'learning_rate': 6.374727668845317e-05, 'epoch': 18.13}


 36%|‚ñà‚ñà‚ñà‚ñã      | 8331/22950 [13:09<22:50, 10.67it/s]

{'loss': 0.0002, 'grad_norm': 0.0068566324189305305, 'learning_rate': 6.37037037037037e-05, 'epoch': 18.15}


 36%|‚ñà‚ñà‚ñà‚ñã      | 8341/22950 [13:10<22:43, 10.71it/s]

{'loss': 0.0002, 'grad_norm': 0.0031701764091849327, 'learning_rate': 6.366013071895425e-05, 'epoch': 18.17}


 36%|‚ñà‚ñà‚ñà‚ñã      | 8351/22950 [13:11<22:33, 10.79it/s]

{'loss': 0.0688, 'grad_norm': 0.005134982988238335, 'learning_rate': 6.361655773420479e-05, 'epoch': 18.19}


 36%|‚ñà‚ñà‚ñà‚ñã      | 8361/22950 [13:12<22:28, 10.82it/s]

{'loss': 0.0006, 'grad_norm': 0.01123055350035429, 'learning_rate': 6.357298474945534e-05, 'epoch': 18.21}


 36%|‚ñà‚ñà‚ñà‚ñã      | 8371/22950 [13:13<22:49, 10.65it/s]

{'loss': 0.0646, 'grad_norm': 0.004129723645746708, 'learning_rate': 6.352941176470588e-05, 'epoch': 18.24}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8381/22950 [13:13<22:26, 10.82it/s]

{'loss': 0.0001, 'grad_norm': 0.004256679676473141, 'learning_rate': 6.348583877995643e-05, 'epoch': 18.26}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8391/22950 [13:14<22:58, 10.56it/s]

{'loss': 0.0008, 'grad_norm': 0.012804746627807617, 'learning_rate': 6.344226579520698e-05, 'epoch': 18.28}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8401/22950 [13:15<22:36, 10.73it/s]

{'loss': 0.0002, 'grad_norm': 0.004309582989662886, 'learning_rate': 6.339869281045751e-05, 'epoch': 18.3}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8411/22950 [13:16<22:21, 10.84it/s]

{'loss': 0.0002, 'grad_norm': 0.0027326017152518034, 'learning_rate': 6.335511982570806e-05, 'epoch': 18.32}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8421/22950 [13:17<22:24, 10.81it/s]

{'loss': 0.0001, 'grad_norm': 0.0027200214099138975, 'learning_rate': 6.331154684095861e-05, 'epoch': 18.34}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8431/22950 [13:18<22:26, 10.79it/s]

{'loss': 0.0025, 'grad_norm': 0.002737834118306637, 'learning_rate': 6.326797385620915e-05, 'epoch': 18.37}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8441/22950 [13:19<22:33, 10.72it/s]

{'loss': 0.0001, 'grad_norm': 0.002564532682299614, 'learning_rate': 6.32244008714597e-05, 'epoch': 18.39}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8451/22950 [13:20<23:08, 10.44it/s]

{'loss': 0.0001, 'grad_norm': 0.00883710477501154, 'learning_rate': 6.318082788671025e-05, 'epoch': 18.41}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8461/22950 [13:21<22:26, 10.76it/s]

{'loss': 0.0004, 'grad_norm': 2.4765217304229736, 'learning_rate': 6.313725490196079e-05, 'epoch': 18.43}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8471/22950 [13:22<22:20, 10.80it/s]

{'loss': 0.0551, 'grad_norm': 0.001978905638679862, 'learning_rate': 6.309368191721134e-05, 'epoch': 18.45}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8481/22950 [13:23<22:37, 10.66it/s]

{'loss': 0.0369, 'grad_norm': 0.0022292411886155605, 'learning_rate': 6.305010893246189e-05, 'epoch': 18.47}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8491/22950 [13:24<22:27, 10.73it/s]

{'loss': 0.0001, 'grad_norm': 0.003880094736814499, 'learning_rate': 6.300653594771242e-05, 'epoch': 18.5}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8501/22950 [13:25<22:30, 10.70it/s]

{'loss': 0.0001, 'grad_norm': 0.0030494392849504948, 'learning_rate': 6.296296296296296e-05, 'epoch': 18.52}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8511/22950 [13:26<22:48, 10.55it/s]

{'loss': 0.1272, 'grad_norm': 0.0020441384986042976, 'learning_rate': 6.291938997821351e-05, 'epoch': 18.54}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8521/22950 [13:27<22:34, 10.65it/s]

{'loss': 0.0003, 'grad_norm': 0.004044049885123968, 'learning_rate': 6.287581699346405e-05, 'epoch': 18.56}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8531/22950 [13:28<22:37, 10.62it/s]

{'loss': 0.0004, 'grad_norm': 0.0036105553153902292, 'learning_rate': 6.28322440087146e-05, 'epoch': 18.58}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8541/22950 [13:28<22:21, 10.74it/s]

{'loss': 0.0031, 'grad_norm': 0.004227979574352503, 'learning_rate': 6.278867102396514e-05, 'epoch': 18.61}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8551/22950 [13:29<22:19, 10.75it/s]

{'loss': 0.0613, 'grad_norm': 0.0043893177062273026, 'learning_rate': 6.274509803921569e-05, 'epoch': 18.63}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8561/22950 [13:30<22:16, 10.77it/s]

{'loss': 0.0003, 'grad_norm': 0.010500382632017136, 'learning_rate': 6.270152505446624e-05, 'epoch': 18.65}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8571/22950 [13:31<22:37, 10.60it/s]

{'loss': 0.0003, 'grad_norm': 0.0037743952125310898, 'learning_rate': 6.265795206971677e-05, 'epoch': 18.67}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8581/22950 [13:32<22:27, 10.67it/s]

{'loss': 0.0004, 'grad_norm': 0.003470638068392873, 'learning_rate': 6.261437908496732e-05, 'epoch': 18.69}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8591/22950 [13:33<22:15, 10.75it/s]

{'loss': 0.0003, 'grad_norm': 0.006729566492140293, 'learning_rate': 6.257080610021787e-05, 'epoch': 18.71}


 37%|‚ñà‚ñà‚ñà‚ñã      | 8601/22950 [13:34<22:14, 10.75it/s]

{'loss': 0.018, 'grad_norm': 0.004314160440117121, 'learning_rate': 6.252723311546841e-05, 'epoch': 18.74}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8611/22950 [13:35<22:35, 10.58it/s]

{'loss': 0.0001, 'grad_norm': 0.003885940881446004, 'learning_rate': 6.248366013071896e-05, 'epoch': 18.76}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8621/22950 [13:36<22:21, 10.68it/s]

{'loss': 0.1124, 'grad_norm': 0.0023460863158106804, 'learning_rate': 6.244008714596951e-05, 'epoch': 18.78}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8631/22950 [13:37<23:00, 10.37it/s]

{'loss': 0.0001, 'grad_norm': 0.004814604762941599, 'learning_rate': 6.239651416122005e-05, 'epoch': 18.8}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8641/22950 [13:38<22:31, 10.59it/s]

{'loss': 0.1078, 'grad_norm': 0.005900516174733639, 'learning_rate': 6.23529411764706e-05, 'epoch': 18.82}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8651/22950 [13:39<22:02, 10.81it/s]

{'loss': 0.0063, 'grad_norm': 0.01620158925652504, 'learning_rate': 6.230936819172115e-05, 'epoch': 18.85}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8661/22950 [13:40<22:15, 10.70it/s]

{'loss': 0.0005, 'grad_norm': 0.014241328462958336, 'learning_rate': 6.226579520697168e-05, 'epoch': 18.87}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8671/22950 [13:41<22:17, 10.68it/s]

{'loss': 0.0004, 'grad_norm': 0.009435668587684631, 'learning_rate': 6.222222222222222e-05, 'epoch': 18.89}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8681/22950 [13:42<22:13, 10.70it/s]

{'loss': 0.0676, 'grad_norm': 0.013556894846260548, 'learning_rate': 6.217864923747277e-05, 'epoch': 18.91}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8691/22950 [13:43<22:40, 10.48it/s]

{'loss': 0.0345, 'grad_norm': 0.016577359288930893, 'learning_rate': 6.21350762527233e-05, 'epoch': 18.93}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8701/22950 [13:43<22:25, 10.59it/s]

{'loss': 0.0006, 'grad_norm': 0.011687714606523514, 'learning_rate': 6.209150326797386e-05, 'epoch': 18.95}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8711/22950 [13:44<22:20, 10.62it/s]

{'loss': 0.0984, 'grad_norm': 0.00977430772036314, 'learning_rate': 6.204793028322441e-05, 'epoch': 18.98}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8721/22950 [13:45<21:04, 11.25it/s]

{'loss': 0.0752, 'grad_norm': 0.01893610507249832, 'learning_rate': 6.200435729847494e-05, 'epoch': 19.0}


                                                    
 38%|‚ñà‚ñà‚ñà‚ñä      | 8721/22950 [13:48<21:04, 11.25it/s]

{'eval_loss': 1.11159348487854, 'eval_accuracy': 0.8382353186607361, 'eval_runtime': 2.8552, 'eval_samples_per_second': 142.899, 'eval_steps_per_second': 17.862, 'epoch': 19.0}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8731/22950 [13:49<47:34,  4.98it/s]  

{'loss': 0.0003, 'grad_norm': 0.19111768901348114, 'learning_rate': 6.19607843137255e-05, 'epoch': 19.02}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8741/22950 [13:51<27:55,  8.48it/s]

{'loss': 0.0003, 'grad_norm': 0.015792183578014374, 'learning_rate': 6.191721132897603e-05, 'epoch': 19.04}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8751/22950 [13:51<23:26, 10.09it/s]

{'loss': 0.0002, 'grad_norm': 0.009497749619185925, 'learning_rate': 6.187363834422658e-05, 'epoch': 19.06}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8761/22950 [13:52<22:34, 10.48it/s]

{'loss': 0.0002, 'grad_norm': 0.007116356398910284, 'learning_rate': 6.183006535947713e-05, 'epoch': 19.08}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8771/22950 [13:53<22:30, 10.50it/s]

{'loss': 0.0822, 'grad_norm': 0.007872221060097218, 'learning_rate': 6.178649237472767e-05, 'epoch': 19.11}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8781/22950 [13:54<22:31, 10.49it/s]

{'loss': 0.0002, 'grad_norm': 0.018077395856380463, 'learning_rate': 6.174291938997822e-05, 'epoch': 19.13}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8791/22950 [13:55<22:37, 10.43it/s]

{'loss': 0.0058, 'grad_norm': 0.0114892004057765, 'learning_rate': 6.169934640522877e-05, 'epoch': 19.15}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8801/22950 [13:56<23:40,  9.96it/s]

{'loss': 0.0002, 'grad_norm': 0.01574171893298626, 'learning_rate': 6.16557734204793e-05, 'epoch': 19.17}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8811/22950 [13:57<22:38, 10.40it/s]

{'loss': 0.0002, 'grad_norm': 0.009906814433634281, 'learning_rate': 6.161220043572985e-05, 'epoch': 19.19}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8821/22950 [13:58<22:40, 10.38it/s]

{'loss': 0.0002, 'grad_norm': 0.05284888669848442, 'learning_rate': 6.15686274509804e-05, 'epoch': 19.22}


 38%|‚ñà‚ñà‚ñà‚ñä      | 8831/22950 [13:59<22:46, 10.33it/s]

{'loss': 0.077, 'grad_norm': 0.005401888862252235, 'learning_rate': 6.152505446623094e-05, 'epoch': 19.24}


 39%|‚ñà‚ñà‚ñà‚ñä      | 8841/22950 [14:00<23:10, 10.15it/s]

{'loss': 0.0094, 'grad_norm': 0.006046016700565815, 'learning_rate': 6.148148148148148e-05, 'epoch': 19.26}


 39%|‚ñà‚ñà‚ñà‚ñä      | 8851/22950 [14:01<22:45, 10.33it/s]

{'loss': 0.0002, 'grad_norm': 0.007022893987596035, 'learning_rate': 6.143790849673203e-05, 'epoch': 19.28}


 39%|‚ñà‚ñà‚ñà‚ñä      | 8861/22950 [14:02<23:11, 10.13it/s]

{'loss': 0.142, 'grad_norm': 0.004672334063798189, 'learning_rate': 6.139433551198256e-05, 'epoch': 19.3}


 39%|‚ñà‚ñà‚ñà‚ñä      | 8871/22950 [14:03<22:44, 10.31it/s]

{'loss': 0.0914, 'grad_norm': 0.00787421502172947, 'learning_rate': 6.135076252723312e-05, 'epoch': 19.32}


 39%|‚ñà‚ñà‚ñà‚ñä      | 8881/22950 [14:04<22:46, 10.30it/s]

{'loss': 0.0007, 'grad_norm': 0.005781983491033316, 'learning_rate': 6.130718954248367e-05, 'epoch': 19.35}


 39%|‚ñà‚ñà‚ñà‚ñä      | 8891/22950 [14:05<22:50, 10.26it/s]

{'loss': 0.0722, 'grad_norm': 0.004445790313184261, 'learning_rate': 6.12636165577342e-05, 'epoch': 19.37}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 8901/22950 [14:06<22:46, 10.28it/s]

{'loss': 0.0005, 'grad_norm': 0.09974721074104309, 'learning_rate': 6.122004357298475e-05, 'epoch': 19.39}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 8911/22950 [14:07<22:36, 10.35it/s]

{'loss': 0.0002, 'grad_norm': 0.004084032494574785, 'learning_rate': 6.11764705882353e-05, 'epoch': 19.41}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 8921/22950 [14:08<23:18, 10.03it/s]

{'loss': 0.0704, 'grad_norm': 0.0046754819341003895, 'learning_rate': 6.113289760348584e-05, 'epoch': 19.43}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 8931/22950 [14:09<22:46, 10.26it/s]

{'loss': 0.0032, 'grad_norm': 0.7885992527008057, 'learning_rate': 6.108932461873639e-05, 'epoch': 19.46}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 8941/22950 [14:10<22:38, 10.31it/s]

{'loss': 0.0005, 'grad_norm': 0.008450021967291832, 'learning_rate': 6.104575163398694e-05, 'epoch': 19.48}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 8951/22950 [14:11<22:57, 10.16it/s]

{'loss': 0.1395, 'grad_norm': 0.01795121841132641, 'learning_rate': 6.1002178649237476e-05, 'epoch': 19.5}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 8961/22950 [14:12<23:03, 10.11it/s]

{'loss': 0.0002, 'grad_norm': 0.004917520098388195, 'learning_rate': 6.095860566448802e-05, 'epoch': 19.52}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 8971/22950 [14:13<22:45, 10.24it/s]

{'loss': 0.0005, 'grad_norm': 0.01460337731987238, 'learning_rate': 6.091503267973856e-05, 'epoch': 19.54}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 8982/22950 [14:14<23:05, 10.08it/s]

{'loss': 0.0738, 'grad_norm': 0.003994628321379423, 'learning_rate': 6.0871459694989106e-05, 'epoch': 19.56}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 8992/22950 [14:15<22:59, 10.12it/s]

{'loss': 0.1389, 'grad_norm': 0.004925599787384272, 'learning_rate': 6.0827886710239656e-05, 'epoch': 19.59}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 9002/22950 [14:16<23:04, 10.07it/s]

{'loss': 0.0049, 'grad_norm': 0.027880223467946053, 'learning_rate': 6.078431372549019e-05, 'epoch': 19.61}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 9012/22950 [14:17<22:43, 10.22it/s]

{'loss': 0.0003, 'grad_norm': 0.0033120063599199057, 'learning_rate': 6.074074074074074e-05, 'epoch': 19.63}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 9022/22950 [14:18<23:00, 10.09it/s]

{'loss': 0.0002, 'grad_norm': 0.0035371901467442513, 'learning_rate': 6.069716775599129e-05, 'epoch': 19.65}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 9031/22950 [14:19<23:27,  9.89it/s]

{'loss': 0.0002, 'grad_norm': 0.002601861720904708, 'learning_rate': 6.065359477124183e-05, 'epoch': 19.67}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 9041/22950 [14:20<23:00, 10.07it/s]

{'loss': 0.0001, 'grad_norm': 0.0037211754824966192, 'learning_rate': 6.061002178649238e-05, 'epoch': 19.69}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 9051/22950 [14:21<22:54, 10.11it/s]

{'loss': 0.0001, 'grad_norm': 0.016156071797013283, 'learning_rate': 6.0566448801742924e-05, 'epoch': 19.72}


 39%|‚ñà‚ñà‚ñà‚ñâ      | 9061/22950 [14:22<23:01, 10.06it/s]

{'loss': 0.0001, 'grad_norm': 0.009526458568871021, 'learning_rate': 6.052287581699346e-05, 'epoch': 19.74}


 40%|‚ñà‚ñà‚ñà‚ñâ      | 9071/22950 [14:23<24:46,  9.34it/s]

{'loss': 0.0712, 'grad_norm': 0.005900505464524031, 'learning_rate': 6.047930283224401e-05, 'epoch': 19.76}


 40%|‚ñà‚ñà‚ñà‚ñâ      | 9082/22950 [14:24<22:50, 10.12it/s]

{'loss': 0.0422, 'grad_norm': 0.0037993760779500008, 'learning_rate': 6.043572984749456e-05, 'epoch': 19.78}


 40%|‚ñà‚ñà‚ñà‚ñâ      | 9091/22950 [14:25<23:19,  9.90it/s]

{'loss': 0.0609, 'grad_norm': 0.3458747863769531, 'learning_rate': 6.03921568627451e-05, 'epoch': 19.8}


 40%|‚ñà‚ñà‚ñà‚ñâ      | 9101/22950 [14:26<22:53, 10.09it/s]

{'loss': 0.0001, 'grad_norm': 0.002470574341714382, 'learning_rate': 6.034858387799565e-05, 'epoch': 19.83}


 40%|‚ñà‚ñà‚ñà‚ñâ      | 9111/22950 [14:27<23:11,  9.95it/s]

{'loss': 0.0007, 'grad_norm': 0.0026390631683170795, 'learning_rate': 6.03050108932462e-05, 'epoch': 19.85}


 40%|‚ñà‚ñà‚ñà‚ñâ      | 9121/22950 [14:28<22:57, 10.04it/s]

{'loss': 0.0375, 'grad_norm': 57.883323669433594, 'learning_rate': 6.0261437908496734e-05, 'epoch': 19.87}


 40%|‚ñà‚ñà‚ñà‚ñâ      | 9132/22950 [14:29<22:56, 10.04it/s]

{'loss': 0.0888, 'grad_norm': 0.04930958151817322, 'learning_rate': 6.021786492374728e-05, 'epoch': 19.89}


 40%|‚ñà‚ñà‚ñà‚ñâ      | 9141/22950 [14:30<23:04,  9.98it/s]

{'loss': 0.0267, 'grad_norm': 0.0025272832717746496, 'learning_rate': 6.017429193899783e-05, 'epoch': 19.91}


 40%|‚ñà‚ñà‚ñà‚ñâ      | 9151/22950 [14:31<23:19,  9.86it/s]

{'loss': 0.0052, 'grad_norm': 0.0018353721825405955, 'learning_rate': 6.0130718954248365e-05, 'epoch': 19.93}


 40%|‚ñà‚ñà‚ñà‚ñâ      | 9161/22950 [14:32<23:04,  9.96it/s]

{'loss': 0.0978, 'grad_norm': 0.45904335379600525, 'learning_rate': 6.0087145969498915e-05, 'epoch': 19.96}


 40%|‚ñà‚ñà‚ñà‚ñâ      | 9171/22950 [14:33<23:09,  9.92it/s]

{'loss': 0.0647, 'grad_norm': 4.1391377449035645, 'learning_rate': 6.0043572984749465e-05, 'epoch': 19.98}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9180/22950 [14:34<23:16,  9.86it/s]

{'loss': 0.1681, 'grad_norm': 0.004273616708815098, 'learning_rate': 6e-05, 'epoch': 20.0}


                                                    
 40%|‚ñà‚ñà‚ñà‚ñà      | 9180/22950 [14:37<23:16,  9.86it/s]

{'eval_loss': 1.224566102027893, 'eval_accuracy': 0.8333333134651184, 'eval_runtime': 3.0121, 'eval_samples_per_second': 135.455, 'eval_steps_per_second': 16.932, 'epoch': 20.0}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9191/22950 [14:38<43:49,  5.23it/s]  

{'loss': 0.0964, 'grad_norm': 0.010889357887208462, 'learning_rate': 5.995642701525055e-05, 'epoch': 20.02}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9201/22950 [14:39<24:26,  9.37it/s]

{'loss': 0.0009, 'grad_norm': 0.007590922061353922, 'learning_rate': 5.9912854030501095e-05, 'epoch': 20.04}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9211/22950 [14:40<23:34,  9.71it/s]

{'loss': 0.0789, 'grad_norm': 0.004924070090055466, 'learning_rate': 5.986928104575163e-05, 'epoch': 20.07}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9221/22950 [14:41<23:21,  9.80it/s]

{'loss': 0.0008, 'grad_norm': 0.023975539952516556, 'learning_rate': 5.982570806100218e-05, 'epoch': 20.09}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9231/22950 [14:42<23:22,  9.78it/s]

{'loss': 0.0003, 'grad_norm': 0.01611938327550888, 'learning_rate': 5.978213507625272e-05, 'epoch': 20.11}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9241/22950 [14:43<23:26,  9.75it/s]

{'loss': 0.0452, 'grad_norm': 0.025352109223604202, 'learning_rate': 5.973856209150327e-05, 'epoch': 20.13}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9251/22950 [14:44<23:16,  9.81it/s]

{'loss': 0.1273, 'grad_norm': 0.010316476225852966, 'learning_rate': 5.969498910675382e-05, 'epoch': 20.15}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9262/22950 [14:46<23:00,  9.92it/s]

{'loss': 0.1013, 'grad_norm': 0.005293172784149647, 'learning_rate': 5.9651416122004356e-05, 'epoch': 20.17}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9271/22950 [14:47<23:38,  9.64it/s]

{'loss': 0.0019, 'grad_norm': 6.026409149169922, 'learning_rate': 5.9607843137254906e-05, 'epoch': 20.2}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9281/22950 [14:48<23:23,  9.74it/s]

{'loss': 0.0476, 'grad_norm': 0.010182131081819534, 'learning_rate': 5.956427015250545e-05, 'epoch': 20.22}


 40%|‚ñà‚ñà‚ñà‚ñà      | 9291/22950 [14:49<23:32,  9.67it/s]

{'loss': 0.0514, 'grad_norm': 65.0201416015625, 'learning_rate': 5.952069716775599e-05, 'epoch': 20.24}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9301/22950 [14:50<23:27,  9.70it/s]

{'loss': 0.122, 'grad_norm': 4.7841877937316895, 'learning_rate': 5.9477124183006536e-05, 'epoch': 20.26}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9311/22950 [14:51<23:27,  9.69it/s]

{'loss': 0.0003, 'grad_norm': 0.011858094483613968, 'learning_rate': 5.9433551198257086e-05, 'epoch': 20.28}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9321/22950 [14:52<23:17,  9.75it/s]

{'loss': 0.0004, 'grad_norm': 0.015229985117912292, 'learning_rate': 5.938997821350762e-05, 'epoch': 20.31}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9331/22950 [14:53<23:18,  9.74it/s]

{'loss': 0.0012, 'grad_norm': 0.0045668757520616055, 'learning_rate': 5.934640522875817e-05, 'epoch': 20.33}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9341/22950 [14:54<23:27,  9.67it/s]

{'loss': 0.0758, 'grad_norm': 0.00406264653429389, 'learning_rate': 5.930283224400872e-05, 'epoch': 20.35}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9351/22950 [14:55<23:27,  9.66it/s]

{'loss': 0.0272, 'grad_norm': 0.009613944217562675, 'learning_rate': 5.925925925925926e-05, 'epoch': 20.37}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9361/22950 [14:56<23:30,  9.63it/s]

{'loss': 0.0027, 'grad_norm': 12.480066299438477, 'learning_rate': 5.921568627450981e-05, 'epoch': 20.39}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9371/22950 [14:57<23:25,  9.66it/s]

{'loss': 0.0127, 'grad_norm': 2.3781683444976807, 'learning_rate': 5.9172113289760353e-05, 'epoch': 20.41}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9381/22950 [14:58<23:39,  9.56it/s]

{'loss': 0.0004, 'grad_norm': 0.005352831911295652, 'learning_rate': 5.912854030501089e-05, 'epoch': 20.44}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9391/22950 [14:59<23:30,  9.61it/s]

{'loss': 0.0002, 'grad_norm': 0.006094289477914572, 'learning_rate': 5.908496732026144e-05, 'epoch': 20.46}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9401/22950 [15:00<23:30,  9.60it/s]

{'loss': 0.0001, 'grad_norm': 0.0042172763496637344, 'learning_rate': 5.904139433551199e-05, 'epoch': 20.48}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9411/22950 [15:01<23:32,  9.59it/s]

{'loss': 0.0002, 'grad_norm': 0.011282953433692455, 'learning_rate': 5.899782135076253e-05, 'epoch': 20.5}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9421/22950 [15:02<23:29,  9.60it/s]

{'loss': 0.0001, 'grad_norm': 0.02182181365787983, 'learning_rate': 5.895424836601308e-05, 'epoch': 20.52}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9431/22950 [15:03<23:31,  9.58it/s]

{'loss': 0.0001, 'grad_norm': 0.004977106116712093, 'learning_rate': 5.891067538126363e-05, 'epoch': 20.54}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9441/22950 [15:04<24:28,  9.20it/s]

{'loss': 0.0004, 'grad_norm': 0.003760680090636015, 'learning_rate': 5.8867102396514164e-05, 'epoch': 20.57}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9451/22950 [15:05<23:41,  9.49it/s]

{'loss': 0.0686, 'grad_norm': 0.0021313100587576628, 'learning_rate': 5.882352941176471e-05, 'epoch': 20.59}


 41%|‚ñà‚ñà‚ñà‚ñà      | 9461/22950 [15:06<23:31,  9.56it/s]

{'loss': 0.0407, 'grad_norm': 0.005435196217149496, 'learning_rate': 5.877995642701525e-05, 'epoch': 20.61}


 41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9471/22950 [15:07<23:35,  9.52it/s]

{'loss': 0.0003, 'grad_norm': 0.05010393261909485, 'learning_rate': 5.8736383442265794e-05, 'epoch': 20.63}


 41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9481/22950 [15:08<23:38,  9.49it/s]

{'loss': 0.1063, 'grad_norm': 0.007105628959834576, 'learning_rate': 5.8692810457516345e-05, 'epoch': 20.65}


 41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9491/22950 [15:09<23:31,  9.53it/s]

{'loss': 0.0002, 'grad_norm': 0.0469268262386322, 'learning_rate': 5.864923747276688e-05, 'epoch': 20.68}


 41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9501/22950 [15:11<23:37,  9.49it/s]

{'loss': 0.0004, 'grad_norm': 0.0037356873508542776, 'learning_rate': 5.860566448801743e-05, 'epoch': 20.7}


 41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9511/22950 [15:12<23:27,  9.55it/s]

{'loss': 0.0353, 'grad_norm': 0.108320452272892, 'learning_rate': 5.856209150326798e-05, 'epoch': 20.72}


 41%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9521/22950 [15:13<23:29,  9.53it/s]

{'loss': 0.0006, 'grad_norm': 0.7164349555969238, 'learning_rate': 5.851851851851852e-05, 'epoch': 20.74}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9531/22950 [15:14<23:31,  9.51it/s]

{'loss': 0.0001, 'grad_norm': 0.0033878867980092764, 'learning_rate': 5.847494553376907e-05, 'epoch': 20.76}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9541/22950 [15:15<23:39,  9.45it/s]

{'loss': 0.0139, 'grad_norm': 0.002559817861765623, 'learning_rate': 5.843137254901961e-05, 'epoch': 20.78}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9551/22950 [15:16<23:36,  9.46it/s]

{'loss': 0.0004, 'grad_norm': 0.06399893760681152, 'learning_rate': 5.838779956427015e-05, 'epoch': 20.81}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9561/22950 [15:17<23:41,  9.42it/s]

{'loss': 0.0034, 'grad_norm': 0.0019191348692402244, 'learning_rate': 5.83442265795207e-05, 'epoch': 20.83}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9571/22950 [15:18<23:18,  9.56it/s]

{'loss': 0.0002, 'grad_norm': 0.0023993703071027994, 'learning_rate': 5.830065359477125e-05, 'epoch': 20.85}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9581/22950 [15:19<23:33,  9.46it/s]

{'loss': 0.0233, 'grad_norm': 0.034953925758600235, 'learning_rate': 5.8257080610021785e-05, 'epoch': 20.87}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9591/22950 [15:20<23:33,  9.45it/s]

{'loss': 0.0001, 'grad_norm': 0.00684942165389657, 'learning_rate': 5.8213507625272336e-05, 'epoch': 20.89}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9601/22950 [15:21<23:26,  9.49it/s]

{'loss': 0.0001, 'grad_norm': 0.0014599525602534413, 'learning_rate': 5.8169934640522886e-05, 'epoch': 20.92}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9611/22950 [15:22<23:27,  9.48it/s]

{'loss': 0.0054, 'grad_norm': 0.12577180564403534, 'learning_rate': 5.812636165577342e-05, 'epoch': 20.94}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9621/22950 [15:23<23:56,  9.28it/s]

{'loss': 0.0003, 'grad_norm': 0.00514229154214263, 'learning_rate': 5.8082788671023966e-05, 'epoch': 20.96}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9631/22950 [15:24<23:29,  9.45it/s]

{'loss': 0.0001, 'grad_norm': 0.001751369796693325, 'learning_rate': 5.8039215686274516e-05, 'epoch': 20.98}


                                                    
 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9639/22950 [15:28<23:42,  9.36it/s]

{'eval_loss': 1.3467401266098022, 'eval_accuracy': 0.8308823704719543, 'eval_runtime': 3.1992, 'eval_samples_per_second': 127.532, 'eval_steps_per_second': 15.942, 'epoch': 21.0}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9642/22950 [15:29<2:11:32,  1.69it/s]

{'loss': 0.0814, 'grad_norm': 0.004646753426641226, 'learning_rate': 5.799564270152505e-05, 'epoch': 21.0}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9651/22950 [15:30<37:16,  5.95it/s]  

{'loss': 0.0029, 'grad_norm': 0.0020072967745363712, 'learning_rate': 5.79520697167756e-05, 'epoch': 21.02}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9661/22950 [15:31<24:21,  9.09it/s]

{'loss': 0.0001, 'grad_norm': 0.001832220354117453, 'learning_rate': 5.790849673202615e-05, 'epoch': 21.05}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9671/22950 [15:32<23:29,  9.42it/s]

{'loss': 0.0001, 'grad_norm': 0.002968444721773267, 'learning_rate': 5.786492374727669e-05, 'epoch': 21.07}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9681/22950 [15:33<23:38,  9.36it/s]

{'loss': 0.0001, 'grad_norm': 0.0044708251953125, 'learning_rate': 5.782135076252724e-05, 'epoch': 21.09}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9691/22950 [15:34<23:36,  9.36it/s]

{'loss': 0.099, 'grad_norm': 28.133737564086914, 'learning_rate': 5.7777777777777776e-05, 'epoch': 21.11}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9701/22950 [15:35<23:36,  9.36it/s]

{'loss': 0.0001, 'grad_norm': 0.001986401854082942, 'learning_rate': 5.773420479302833e-05, 'epoch': 21.13}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9711/22950 [15:36<23:40,  9.32it/s]

{'loss': 0.0001, 'grad_norm': 0.0019394883420318365, 'learning_rate': 5.769063180827887e-05, 'epoch': 21.15}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9721/22950 [15:37<23:24,  9.42it/s]

{'loss': 0.0001, 'grad_norm': 0.0015706757549196482, 'learning_rate': 5.764705882352941e-05, 'epoch': 21.18}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9731/22950 [15:39<23:35,  9.34it/s]

{'loss': 0.0653, 'grad_norm': 0.0021139297168701887, 'learning_rate': 5.760348583877996e-05, 'epoch': 21.2}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9741/22950 [15:40<23:26,  9.39it/s]

{'loss': 0.0001, 'grad_norm': 0.003180921543389559, 'learning_rate': 5.755991285403051e-05, 'epoch': 21.22}


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 9751/22950 [15:41<23:18,  9.44it/s]

{'loss': 0.0001, 'grad_norm': 0.005662338808178902, 'learning_rate': 5.7516339869281044e-05, 'epoch': 21.24}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9761/22950 [15:42<23:27,  9.37it/s]

{'loss': 0.0001, 'grad_norm': 0.0017596042016521096, 'learning_rate': 5.7472766884531594e-05, 'epoch': 21.26}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9771/22950 [15:43<23:26,  9.37it/s]

{'loss': 0.0001, 'grad_norm': 0.0016404975904151797, 'learning_rate': 5.7429193899782144e-05, 'epoch': 21.29}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9781/22950 [15:44<23:33,  9.32it/s]

{'loss': 0.0001, 'grad_norm': 0.0013185548596084118, 'learning_rate': 5.738562091503268e-05, 'epoch': 21.31}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9791/22950 [15:45<23:16,  9.42it/s]

{'loss': 0.0001, 'grad_norm': 0.010554683394730091, 'learning_rate': 5.7342047930283224e-05, 'epoch': 21.33}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9801/22950 [15:46<23:27,  9.34it/s]

{'loss': 0.0, 'grad_norm': 0.001384226605296135, 'learning_rate': 5.7298474945533774e-05, 'epoch': 21.35}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9811/22950 [15:47<23:22,  9.37it/s]

{'loss': 0.0001, 'grad_norm': 0.0016341505106538534, 'learning_rate': 5.725490196078431e-05, 'epoch': 21.37}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9821/22950 [15:48<23:25,  9.34it/s]

{'loss': 0.0001, 'grad_norm': 0.07588736712932587, 'learning_rate': 5.721132897603486e-05, 'epoch': 21.39}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9831/22950 [15:49<23:32,  9.29it/s]

{'loss': 0.0797, 'grad_norm': 0.038431186228990555, 'learning_rate': 5.716775599128541e-05, 'epoch': 21.42}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9841/22950 [15:50<23:24,  9.33it/s]

{'loss': 0.0001, 'grad_norm': 0.0011491456534713507, 'learning_rate': 5.712418300653595e-05, 'epoch': 21.44}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9851/22950 [15:52<22:55,  9.52it/s]

{'loss': 0.0001, 'grad_norm': 0.005314202047884464, 'learning_rate': 5.70806100217865e-05, 'epoch': 21.46}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9861/22950 [15:53<23:07,  9.43it/s]

{'loss': 0.0011, 'grad_norm': 0.02743334323167801, 'learning_rate': 5.703703703703704e-05, 'epoch': 21.48}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9871/22950 [15:54<23:01,  9.46it/s]

{'loss': 0.0001, 'grad_norm': 0.0017846886767074466, 'learning_rate': 5.6993464052287585e-05, 'epoch': 21.5}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9881/22950 [15:55<23:16,  9.36it/s]

{'loss': 0.0, 'grad_norm': 0.001351610873825848, 'learning_rate': 5.694989106753813e-05, 'epoch': 21.53}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9891/22950 [15:56<23:06,  9.42it/s]

{'loss': 0.0001, 'grad_norm': 0.0014525754377245903, 'learning_rate': 5.690631808278868e-05, 'epoch': 21.55}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9901/22950 [15:57<23:01,  9.45it/s]

{'loss': 0.0006, 'grad_norm': 0.0017751532141119242, 'learning_rate': 5.6862745098039215e-05, 'epoch': 21.57}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9911/22950 [15:58<23:18,  9.32it/s]

{'loss': 0.0133, 'grad_norm': 0.00418079225346446, 'learning_rate': 5.6819172113289765e-05, 'epoch': 21.59}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9921/22950 [15:59<22:58,  9.45it/s]

{'loss': 0.0893, 'grad_norm': 0.0010761175071820617, 'learning_rate': 5.6775599128540316e-05, 'epoch': 21.61}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9931/22950 [16:00<22:54,  9.47it/s]

{'loss': 0.0002, 'grad_norm': 0.0013831222895532846, 'learning_rate': 5.673202614379085e-05, 'epoch': 21.63}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9941/22950 [16:01<22:55,  9.46it/s]

{'loss': 0.1191, 'grad_norm': 0.0051185935735702515, 'learning_rate': 5.66884531590414e-05, 'epoch': 21.66}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9951/22950 [16:02<23:13,  9.33it/s]

{'loss': 0.0428, 'grad_norm': 0.017048204317688942, 'learning_rate': 5.664488017429194e-05, 'epoch': 21.68}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9961/22950 [16:03<22:58,  9.42it/s]

{'loss': 0.0003, 'grad_norm': 0.009901979006826878, 'learning_rate': 5.660130718954248e-05, 'epoch': 21.7}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9971/22950 [16:04<23:00,  9.40it/s]

{'loss': 0.0001, 'grad_norm': 0.0037139442283660173, 'learning_rate': 5.655773420479303e-05, 'epoch': 21.72}


 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9981/22950 [16:05<22:50,  9.46it/s]

{'loss': 0.0474, 'grad_norm': 0.003050154075026512, 'learning_rate': 5.651416122004357e-05, 'epoch': 21.74}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 9991/22950 [16:07<23:01,  9.38it/s]

{'loss': 0.0001, 'grad_norm': 0.00683382386341691, 'learning_rate': 5.647058823529412e-05, 'epoch': 21.76}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 10001/22950 [16:08<22:50,  9.45it/s]

{'loss': 0.0119, 'grad_norm': 0.0026441917289048433, 'learning_rate': 5.642701525054467e-05, 'epoch': 21.79}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 10011/22950 [16:09<23:21,  9.23it/s]

{'loss': 0.0141, 'grad_norm': 0.002067428082227707, 'learning_rate': 5.6383442265795206e-05, 'epoch': 21.81}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 10021/22950 [16:10<22:46,  9.46it/s]

{'loss': 0.0002, 'grad_norm': 0.0019152256427332759, 'learning_rate': 5.6339869281045756e-05, 'epoch': 21.83}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 10031/22950 [16:11<22:43,  9.47it/s]

{'loss': 0.0003, 'grad_norm': 0.12908081710338593, 'learning_rate': 5.62962962962963e-05, 'epoch': 21.85}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10041/22950 [16:12<22:46,  9.45it/s]

{'loss': 0.0001, 'grad_norm': 0.0035718423314392567, 'learning_rate': 5.625272331154684e-05, 'epoch': 21.87}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10051/22950 [16:13<22:41,  9.47it/s]

{'loss': 0.0643, 'grad_norm': 0.28693854808807373, 'learning_rate': 5.620915032679739e-05, 'epoch': 21.9}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10061/22950 [16:14<22:40,  9.47it/s]

{'loss': 0.1279, 'grad_norm': 0.002747936639934778, 'learning_rate': 5.616557734204794e-05, 'epoch': 21.92}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10071/22950 [16:15<22:40,  9.47it/s]

{'loss': 0.0001, 'grad_norm': 0.003403402864933014, 'learning_rate': 5.6122004357298474e-05, 'epoch': 21.94}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10081/22950 [16:16<23:04,  9.29it/s]

{'loss': 0.001, 'grad_norm': 0.001960991881787777, 'learning_rate': 5.6078431372549024e-05, 'epoch': 21.96}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10091/22950 [16:17<22:27,  9.54it/s]

{'loss': 0.0447, 'grad_norm': 0.0016055835876613855, 'learning_rate': 5.6034858387799574e-05, 'epoch': 21.98}


                                                     
 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10098/22950 [16:21<22:33,  9.49it/s]

{'eval_loss': 1.2167500257492065, 'eval_accuracy': 0.843137264251709, 'eval_runtime': 3.1636, 'eval_samples_per_second': 128.969, 'eval_steps_per_second': 16.121, 'epoch': 22.0}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10101/22950 [16:22<2:05:17,  1.71it/s]

{'loss': 0.0102, 'grad_norm': 0.002358243567869067, 'learning_rate': 5.599128540305011e-05, 'epoch': 22.0}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10111/22950 [16:23<32:01,  6.68it/s]  

{'loss': 0.0264, 'grad_norm': 0.07306400686502457, 'learning_rate': 5.5947712418300654e-05, 'epoch': 22.03}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10121/22950 [16:24<23:26,  9.12it/s]

{'loss': 0.0716, 'grad_norm': 0.002883579581975937, 'learning_rate': 5.5904139433551204e-05, 'epoch': 22.05}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10131/22950 [16:25<22:18,  9.58it/s]

{'loss': 0.0001, 'grad_norm': 0.002863601315766573, 'learning_rate': 5.586056644880174e-05, 'epoch': 22.07}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10141/22950 [16:26<22:26,  9.51it/s]

{'loss': 0.0444, 'grad_norm': 0.002161423908546567, 'learning_rate': 5.581699346405229e-05, 'epoch': 22.09}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10151/22950 [16:27<22:23,  9.53it/s]

{'loss': 0.0001, 'grad_norm': 0.001467266702093184, 'learning_rate': 5.577342047930284e-05, 'epoch': 22.11}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10161/22950 [16:28<22:20,  9.54it/s]

{'loss': 0.0881, 'grad_norm': 0.0041576256044209, 'learning_rate': 5.572984749455338e-05, 'epoch': 22.14}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10171/22950 [16:29<22:16,  9.57it/s]

{'loss': 0.0971, 'grad_norm': 0.0019360067090019584, 'learning_rate': 5.568627450980393e-05, 'epoch': 22.16}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10181/22950 [16:30<22:11,  9.59it/s]

{'loss': 0.0001, 'grad_norm': 0.002512657083570957, 'learning_rate': 5.5642701525054465e-05, 'epoch': 22.18}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10191/22950 [16:31<22:17,  9.54it/s]

{'loss': 0.0755, 'grad_norm': 40.0472297668457, 'learning_rate': 5.5599128540305015e-05, 'epoch': 22.2}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10201/22950 [16:32<22:21,  9.51it/s]

{'loss': 0.0002, 'grad_norm': 0.010167000815272331, 'learning_rate': 5.555555555555556e-05, 'epoch': 22.22}


 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10211/22950 [16:33<22:07,  9.60it/s]

{'loss': 0.0569, 'grad_norm': 0.009173113852739334, 'learning_rate': 5.5511982570806095e-05, 'epoch': 22.24}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10221/22950 [16:34<22:24,  9.47it/s]

{'loss': 0.0001, 'grad_norm': 0.011552764102816582, 'learning_rate': 5.5468409586056645e-05, 'epoch': 22.27}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10231/22950 [16:35<22:19,  9.50it/s]

{'loss': 0.0001, 'grad_norm': 0.015512747690081596, 'learning_rate': 5.5424836601307195e-05, 'epoch': 22.29}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10241/22950 [16:36<22:03,  9.60it/s]

{'loss': 0.0684, 'grad_norm': 0.01328781247138977, 'learning_rate': 5.538126361655773e-05, 'epoch': 22.31}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10251/22950 [16:38<22:04,  9.59it/s]

{'loss': 0.0002, 'grad_norm': 0.00450112484395504, 'learning_rate': 5.533769063180828e-05, 'epoch': 22.33}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10262/22950 [16:39<21:34,  9.80it/s]

{'loss': 0.0816, 'grad_norm': 0.0036707816179841757, 'learning_rate': 5.529411764705883e-05, 'epoch': 22.35}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10271/22950 [16:40<22:02,  9.59it/s]

{'loss': 0.0002, 'grad_norm': 0.14896978437900543, 'learning_rate': 5.525054466230937e-05, 'epoch': 22.37}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10281/22950 [16:41<22:20,  9.45it/s]

{'loss': 0.0005, 'grad_norm': 0.004597906954586506, 'learning_rate': 5.520697167755991e-05, 'epoch': 22.4}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10291/22950 [16:42<22:00,  9.59it/s]

{'loss': 0.0593, 'grad_norm': 0.003451079595834017, 'learning_rate': 5.516339869281046e-05, 'epoch': 22.42}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10301/22950 [16:43<21:50,  9.65it/s]

{'loss': 0.0008, 'grad_norm': 0.003282258054241538, 'learning_rate': 5.5119825708061e-05, 'epoch': 22.44}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10311/22950 [16:44<21:57,  9.59it/s]

{'loss': 0.0373, 'grad_norm': 76.74528503417969, 'learning_rate': 5.507625272331155e-05, 'epoch': 22.46}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 10321/22950 [16:45<21:46,  9.67it/s]

{'loss': 0.0001, 'grad_norm': 0.003883835393935442, 'learning_rate': 5.50326797385621e-05, 'epoch': 22.48}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10331/22950 [16:46<22:08,  9.50it/s]

{'loss': 0.0001, 'grad_norm': 0.007062824908643961, 'learning_rate': 5.4989106753812636e-05, 'epoch': 22.51}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10341/22950 [16:47<21:47,  9.65it/s]

{'loss': 0.0001, 'grad_norm': 0.003082133363932371, 'learning_rate': 5.4945533769063186e-05, 'epoch': 22.53}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10351/22950 [16:48<21:36,  9.72it/s]

{'loss': 0.0034, 'grad_norm': 0.04892515018582344, 'learning_rate': 5.490196078431373e-05, 'epoch': 22.55}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10361/22950 [16:49<21:36,  9.71it/s]

{'loss': 0.0177, 'grad_norm': 0.06830502301454544, 'learning_rate': 5.485838779956427e-05, 'epoch': 22.57}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10371/22950 [16:50<21:25,  9.79it/s]

{'loss': 0.1089, 'grad_norm': 0.002564335474744439, 'learning_rate': 5.4814814814814817e-05, 'epoch': 22.59}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10381/22950 [16:51<21:30,  9.74it/s]

{'loss': 0.0558, 'grad_norm': 0.04627685248851776, 'learning_rate': 5.477124183006537e-05, 'epoch': 22.61}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10391/22950 [16:52<21:29,  9.74it/s]

{'loss': 0.0008, 'grad_norm': 0.014222972095012665, 'learning_rate': 5.47276688453159e-05, 'epoch': 22.64}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10401/22950 [16:53<21:25,  9.76it/s]

{'loss': 0.179, 'grad_norm': 0.00484757125377655, 'learning_rate': 5.4684095860566454e-05, 'epoch': 22.66}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10412/22950 [16:54<21:09,  9.88it/s]

{'loss': 0.0003, 'grad_norm': 0.008332360535860062, 'learning_rate': 5.464052287581699e-05, 'epoch': 22.68}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10421/22950 [16:55<21:16,  9.82it/s]

{'loss': 0.0108, 'grad_norm': 6.063446998596191, 'learning_rate': 5.459694989106754e-05, 'epoch': 22.7}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10431/22950 [16:56<21:09,  9.86it/s]

{'loss': 0.0003, 'grad_norm': 0.008630558848381042, 'learning_rate': 5.455337690631809e-05, 'epoch': 22.72}


 45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10441/22950 [16:57<21:06,  9.87it/s]

{'loss': 0.0003, 'grad_norm': 0.006058272439986467, 'learning_rate': 5.450980392156863e-05, 'epoch': 22.75}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10451/22950 [16:58<21:16,  9.79it/s]

{'loss': 0.0286, 'grad_norm': 0.003896029433235526, 'learning_rate': 5.446623093681917e-05, 'epoch': 22.77}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10461/22950 [16:59<21:12,  9.82it/s]

{'loss': 0.0652, 'grad_norm': 0.005474335979670286, 'learning_rate': 5.442265795206972e-05, 'epoch': 22.79}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10471/22950 [17:00<21:15,  9.78it/s]

{'loss': 0.0002, 'grad_norm': 0.006799706257879734, 'learning_rate': 5.437908496732026e-05, 'epoch': 22.81}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10481/22950 [17:01<21:07,  9.83it/s]

{'loss': 0.0687, 'grad_norm': 0.0025871614925563335, 'learning_rate': 5.433551198257081e-05, 'epoch': 22.83}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10491/22950 [17:02<21:08,  9.82it/s]

{'loss': 0.0002, 'grad_norm': 0.004197646398097277, 'learning_rate': 5.429193899782136e-05, 'epoch': 22.85}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10501/22950 [17:03<21:13,  9.77it/s]

{'loss': 0.06, 'grad_norm': 0.003278382821008563, 'learning_rate': 5.4248366013071894e-05, 'epoch': 22.88}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10511/22950 [17:04<21:01,  9.86it/s]

{'loss': 0.0002, 'grad_norm': 0.0038645234890282154, 'learning_rate': 5.4204793028322445e-05, 'epoch': 22.9}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10521/22950 [17:05<21:38,  9.57it/s]

{'loss': 0.0002, 'grad_norm': 0.004982986021786928, 'learning_rate': 5.416122004357299e-05, 'epoch': 22.92}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10531/22950 [17:06<21:17,  9.72it/s]

{'loss': 0.0003, 'grad_norm': 0.00600387854501605, 'learning_rate': 5.411764705882353e-05, 'epoch': 22.94}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10542/22950 [17:08<20:50,  9.93it/s]

{'loss': 0.0512, 'grad_norm': 22.068296432495117, 'learning_rate': 5.4074074074074075e-05, 'epoch': 22.96}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10552/22950 [17:09<20:36, 10.03it/s]

{'loss': 0.0009, 'grad_norm': 0.004542283248156309, 'learning_rate': 5.4030501089324625e-05, 'epoch': 22.98}


                                                     
 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10557/22950 [17:12<20:48,  9.92it/s]

{'eval_loss': 1.127914547920227, 'eval_accuracy': 0.8406862616539001, 'eval_runtime': 3.0304, 'eval_samples_per_second': 134.635, 'eval_steps_per_second': 16.829, 'epoch': 23.0}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10562/22950 [17:13<1:16:32,  2.70it/s]

{'loss': 0.08, 'grad_norm': 0.012942791916429996, 'learning_rate': 5.398692810457516e-05, 'epoch': 23.01}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10571/22950 [17:14<29:13,  7.06it/s]  

{'loss': 0.0004, 'grad_norm': 0.07034959644079208, 'learning_rate': 5.394335511982571e-05, 'epoch': 23.03}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10581/22950 [17:15<21:25,  9.62it/s]

{'loss': 0.0179, 'grad_norm': 0.005042955745011568, 'learning_rate': 5.389978213507626e-05, 'epoch': 23.05}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10591/22950 [17:16<20:43,  9.94it/s]

{'loss': 0.1685, 'grad_norm': 0.009057650342583656, 'learning_rate': 5.38562091503268e-05, 'epoch': 23.07}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10602/22950 [17:17<20:25, 10.07it/s]

{'loss': 0.0002, 'grad_norm': 0.014683475717902184, 'learning_rate': 5.381263616557735e-05, 'epoch': 23.09}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 10611/22950 [17:18<20:48,  9.88it/s]

{'loss': 0.0038, 'grad_norm': 0.008851110003888607, 'learning_rate': 5.376906318082789e-05, 'epoch': 23.12}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10621/22950 [17:19<20:57,  9.81it/s]

{'loss': 0.0591, 'grad_norm': 0.08062153309583664, 'learning_rate': 5.372549019607843e-05, 'epoch': 23.14}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10631/22950 [17:20<20:46,  9.88it/s]

{'loss': 0.0131, 'grad_norm': 0.03692696616053581, 'learning_rate': 5.368191721132898e-05, 'epoch': 23.16}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10641/22950 [17:21<20:42,  9.91it/s]

{'loss': 0.0006, 'grad_norm': 0.022385042160749435, 'learning_rate': 5.3638344226579516e-05, 'epoch': 23.18}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10651/22950 [17:22<20:32,  9.98it/s]

{'loss': 0.0001, 'grad_norm': 0.003098100423812866, 'learning_rate': 5.3594771241830066e-05, 'epoch': 23.2}


 46%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10661/22950 [17:23<21:08,  9.69it/s]

{'loss': 0.2414, 'grad_norm': 0.001855459762737155, 'learning_rate': 5.3551198257080616e-05, 'epoch': 23.22}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10672/22950 [17:24<20:21, 10.05it/s]

{'loss': 0.0522, 'grad_norm': 0.09161487966775894, 'learning_rate': 5.350762527233115e-05, 'epoch': 23.25}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10682/22950 [17:25<20:13, 10.11it/s]

{'loss': 0.0004, 'grad_norm': 0.015380412340164185, 'learning_rate': 5.34640522875817e-05, 'epoch': 23.27}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10691/22950 [17:26<20:28,  9.98it/s]

{'loss': 0.0726, 'grad_norm': 0.003505864180624485, 'learning_rate': 5.3420479302832246e-05, 'epoch': 23.29}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10701/22950 [17:27<20:26,  9.99it/s]

{'loss': 0.0002, 'grad_norm': 0.006165300961583853, 'learning_rate': 5.337690631808279e-05, 'epoch': 23.31}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10711/22950 [17:28<20:18, 10.04it/s]

{'loss': 0.0002, 'grad_norm': 0.018633732572197914, 'learning_rate': 5.333333333333333e-05, 'epoch': 23.33}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10722/22950 [17:29<20:14, 10.07it/s]

{'loss': 0.0003, 'grad_norm': 0.0032591824419796467, 'learning_rate': 5.328976034858388e-05, 'epoch': 23.36}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10731/22950 [17:30<20:48,  9.79it/s]

{'loss': 0.0003, 'grad_norm': 0.008056138642132282, 'learning_rate': 5.324618736383442e-05, 'epoch': 23.38}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10742/22950 [17:31<20:18, 10.02it/s]

{'loss': 0.0001, 'grad_norm': 0.0026054847985506058, 'learning_rate': 5.320261437908497e-05, 'epoch': 23.4}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10751/22950 [17:32<20:24,  9.96it/s]

{'loss': 0.0001, 'grad_norm': 0.0017267960356548429, 'learning_rate': 5.315904139433552e-05, 'epoch': 23.42}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10761/22950 [17:33<20:19,  9.99it/s]

{'loss': 0.0002, 'grad_norm': 0.002104993211105466, 'learning_rate': 5.311546840958606e-05, 'epoch': 23.44}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10771/22950 [17:34<20:15, 10.02it/s]

{'loss': 0.0236, 'grad_norm': 0.0023128658067435026, 'learning_rate': 5.307189542483661e-05, 'epoch': 23.46}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10781/22950 [17:35<20:08, 10.07it/s]

{'loss': 0.0001, 'grad_norm': 0.0028082518838346004, 'learning_rate': 5.302832244008715e-05, 'epoch': 23.49}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10791/22950 [17:36<20:31,  9.88it/s]

{'loss': 0.0001, 'grad_norm': 0.0018987973453477025, 'learning_rate': 5.298474945533769e-05, 'epoch': 23.51}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10801/22950 [17:37<20:18,  9.97it/s]

{'loss': 0.018, 'grad_norm': 0.001149844960309565, 'learning_rate': 5.294117647058824e-05, 'epoch': 23.53}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10812/22950 [17:38<20:06, 10.06it/s]

{'loss': 0.0002, 'grad_norm': 0.0021842580754309893, 'learning_rate': 5.289760348583879e-05, 'epoch': 23.55}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10822/22950 [17:39<20:04, 10.07it/s]

{'loss': 0.0001, 'grad_norm': 0.0015734959160909057, 'learning_rate': 5.2854030501089324e-05, 'epoch': 23.57}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10832/22950 [17:40<20:05, 10.05it/s]

{'loss': 0.029, 'grad_norm': 0.008451012894511223, 'learning_rate': 5.2810457516339874e-05, 'epoch': 23.59}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10842/22950 [17:41<20:03, 10.06it/s]

{'loss': 0.1488, 'grad_norm': 17.023540496826172, 'learning_rate': 5.2766884531590425e-05, 'epoch': 23.62}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10852/22950 [17:42<19:54, 10.13it/s]

{'loss': 0.0002, 'grad_norm': 0.0017752082785591483, 'learning_rate': 5.272331154684096e-05, 'epoch': 23.64}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10862/22950 [17:43<19:57, 10.09it/s]

{'loss': 0.0567, 'grad_norm': 0.004298650659620762, 'learning_rate': 5.2679738562091505e-05, 'epoch': 23.66}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10872/22950 [17:44<19:50, 10.14it/s]

{'loss': 0.0441, 'grad_norm': 0.007768854033201933, 'learning_rate': 5.2636165577342055e-05, 'epoch': 23.68}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10882/22950 [17:45<19:55, 10.09it/s]

{'loss': 0.0002, 'grad_norm': 0.014185531996190548, 'learning_rate': 5.259259259259259e-05, 'epoch': 23.7}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10892/22950 [17:46<19:53, 10.10it/s]

{'loss': 0.0574, 'grad_norm': 0.003094041720032692, 'learning_rate': 5.254901960784314e-05, 'epoch': 23.73}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 10900/22950 [17:47<19:59, 10.05it/s]

{'loss': 0.1559, 'grad_norm': 0.004062307067215443, 'learning_rate': 5.250544662309368e-05, 'epoch': 23.75}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10912/22950 [17:48<19:42, 10.18it/s]

{'loss': 0.0127, 'grad_norm': 0.010752025991678238, 'learning_rate': 5.246187363834423e-05, 'epoch': 23.77}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10922/22950 [17:49<19:44, 10.16it/s]

{'loss': 0.0043, 'grad_norm': 0.009226968511939049, 'learning_rate': 5.241830065359478e-05, 'epoch': 23.79}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10932/22950 [17:50<19:42, 10.17it/s]

{'loss': 0.0643, 'grad_norm': 18.097715377807617, 'learning_rate': 5.2374727668845315e-05, 'epoch': 23.81}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10942/22950 [17:51<19:46, 10.12it/s]

{'loss': 0.0009, 'grad_norm': 0.2242068499326706, 'learning_rate': 5.2331154684095866e-05, 'epoch': 23.83}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10952/22950 [17:52<19:43, 10.14it/s]

{'loss': 0.0003, 'grad_norm': 0.002827203134074807, 'learning_rate': 5.228758169934641e-05, 'epoch': 23.86}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10961/22950 [17:53<20:14,  9.87it/s]

{'loss': 0.0128, 'grad_norm': 0.038698434829711914, 'learning_rate': 5.2244008714596946e-05, 'epoch': 23.88}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10971/22950 [17:54<19:40, 10.15it/s]

{'loss': 0.0384, 'grad_norm': 0.006369854789227247, 'learning_rate': 5.2200435729847496e-05, 'epoch': 23.9}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10981/22950 [17:55<19:32, 10.21it/s]

{'loss': 0.1285, 'grad_norm': 0.020018283277750015, 'learning_rate': 5.2156862745098046e-05, 'epoch': 23.92}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 10992/22950 [17:56<19:35, 10.17it/s]

{'loss': 0.0003, 'grad_norm': 0.03955947607755661, 'learning_rate': 5.211328976034858e-05, 'epoch': 23.94}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11002/22950 [17:57<19:37, 10.15it/s]

{'loss': 0.1252, 'grad_norm': 1.7688007354736328, 'learning_rate': 5.206971677559913e-05, 'epoch': 23.97}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11012/22950 [17:58<19:52, 10.01it/s]

{'loss': 0.044, 'grad_norm': 0.011671322397887707, 'learning_rate': 5.202614379084968e-05, 'epoch': 23.99}


                                                     
 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11016/22950 [18:01<18:16, 10.88it/s]

{'eval_loss': 1.0001695156097412, 'eval_accuracy': 0.8357843160629272, 'eval_runtime': 2.9381, 'eval_samples_per_second': 138.865, 'eval_steps_per_second': 17.358, 'epoch': 24.0}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11022/22950 [18:02<1:07:14,  2.96it/s]

{'loss': 0.0014, 'grad_norm': 0.007452528923749924, 'learning_rate': 5.198257080610022e-05, 'epoch': 24.01}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11031/22950 [18:03<29:18,  6.78it/s]  

{'loss': 0.0003, 'grad_norm': 0.040466632694005966, 'learning_rate': 5.193899782135076e-05, 'epoch': 24.03}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11042/22950 [18:04<20:22,  9.74it/s]

{'loss': 0.0002, 'grad_norm': 0.005983269773423672, 'learning_rate': 5.189542483660131e-05, 'epoch': 24.05}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11051/22950 [18:05<19:42, 10.07it/s]

{'loss': 0.0004, 'grad_norm': 0.013946196995675564, 'learning_rate': 5.185185185185185e-05, 'epoch': 24.07}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11061/22950 [18:06<19:50,  9.99it/s]

{'loss': 0.1296, 'grad_norm': 11.982810020446777, 'learning_rate': 5.18082788671024e-05, 'epoch': 24.1}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11071/22950 [18:07<19:21, 10.22it/s]

{'loss': 0.0007, 'grad_norm': 0.01534358412027359, 'learning_rate': 5.176470588235295e-05, 'epoch': 24.12}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11081/22950 [18:08<19:15, 10.27it/s]

{'loss': 0.0003, 'grad_norm': 0.037059932947158813, 'learning_rate': 5.172113289760349e-05, 'epoch': 24.14}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11091/22950 [18:09<19:16, 10.25it/s]

{'loss': 0.0567, 'grad_norm': 0.004717658273875713, 'learning_rate': 5.167755991285404e-05, 'epoch': 24.16}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11102/22950 [18:10<19:23, 10.18it/s]

{'loss': 0.0002, 'grad_norm': 0.002591463038697839, 'learning_rate': 5.163398692810458e-05, 'epoch': 24.18}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11112/22950 [18:11<19:16, 10.24it/s]

{'loss': 0.0003, 'grad_norm': 0.1583903431892395, 'learning_rate': 5.159041394335512e-05, 'epoch': 24.2}


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11122/22950 [18:12<19:18, 10.21it/s]

{'loss': 0.0826, 'grad_norm': 0.003787299385294318, 'learning_rate': 5.154684095860567e-05, 'epoch': 24.23}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11132/22950 [18:13<19:15, 10.22it/s]

{'loss': 0.0002, 'grad_norm': 0.003797742072492838, 'learning_rate': 5.1503267973856204e-05, 'epoch': 24.25}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11142/22950 [18:14<19:07, 10.29it/s]

{'loss': 0.0001, 'grad_norm': 0.007266282103955746, 'learning_rate': 5.1459694989106754e-05, 'epoch': 24.27}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11152/22950 [18:15<19:12, 10.23it/s]

{'loss': 0.0002, 'grad_norm': 0.003853215603157878, 'learning_rate': 5.1416122004357304e-05, 'epoch': 24.29}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11161/22950 [18:16<19:26, 10.11it/s]

{'loss': 0.0083, 'grad_norm': 0.0037540404591709375, 'learning_rate': 5.137254901960784e-05, 'epoch': 24.31}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11171/22950 [18:17<19:19, 10.16it/s]

{'loss': 0.0001, 'grad_norm': 0.0022555654868483543, 'learning_rate': 5.132897603485839e-05, 'epoch': 24.34}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11181/22950 [18:18<19:19, 10.15it/s]

{'loss': 0.0546, 'grad_norm': 67.66633605957031, 'learning_rate': 5.1285403050108934e-05, 'epoch': 24.36}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11191/22950 [18:19<19:10, 10.22it/s]

{'loss': 0.0001, 'grad_norm': 0.004072581883519888, 'learning_rate': 5.124183006535948e-05, 'epoch': 24.38}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11201/22950 [18:20<19:10, 10.21it/s]

{'loss': 0.0002, 'grad_norm': 0.02930609881877899, 'learning_rate': 5.119825708061002e-05, 'epoch': 24.4}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11212/22950 [18:21<19:11, 10.19it/s]

{'loss': 0.0001, 'grad_norm': 0.0027974029071629047, 'learning_rate': 5.115468409586057e-05, 'epoch': 24.42}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11222/22950 [18:22<19:08, 10.21it/s]

{'loss': 0.0834, 'grad_norm': 23.162572860717773, 'learning_rate': 5.111111111111111e-05, 'epoch': 24.44}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11230/22950 [18:23<19:11, 10.18it/s]

{'loss': 0.1055, 'grad_norm': 0.27514806389808655, 'learning_rate': 5.106753812636166e-05, 'epoch': 24.47}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11241/22950 [18:24<18:47, 10.39it/s]

{'loss': 0.0447, 'grad_norm': 0.004257154185324907, 'learning_rate': 5.102396514161221e-05, 'epoch': 24.49}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11251/22950 [18:25<19:02, 10.24it/s]

{'loss': 0.0054, 'grad_norm': 0.005398723762482405, 'learning_rate': 5.0980392156862745e-05, 'epoch': 24.51}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11261/22950 [18:26<18:55, 10.30it/s]

{'loss': 0.0001, 'grad_norm': 0.0026655709370970726, 'learning_rate': 5.0936819172113295e-05, 'epoch': 24.53}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11272/22950 [18:27<18:53, 10.30it/s]

{'loss': 0.0253, 'grad_norm': 0.023352447897195816, 'learning_rate': 5.089324618736384e-05, 'epoch': 24.55}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11282/22950 [18:28<18:59, 10.24it/s]

{'loss': 0.0002, 'grad_norm': 0.0030096033588051796, 'learning_rate': 5.0849673202614375e-05, 'epoch': 24.58}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11292/22950 [18:29<18:52, 10.29it/s]

{'loss': 0.0002, 'grad_norm': 0.0032660739962011576, 'learning_rate': 5.0806100217864926e-05, 'epoch': 24.6}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11302/22950 [18:30<18:48, 10.32it/s]

{'loss': 0.0001, 'grad_norm': 0.0034452895633876324, 'learning_rate': 5.0762527233115476e-05, 'epoch': 24.62}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11312/22950 [18:31<18:48, 10.32it/s]

{'loss': 0.0277, 'grad_norm': 0.004141549114137888, 'learning_rate': 5.071895424836601e-05, 'epoch': 24.64}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11322/22950 [18:32<19:14, 10.07it/s]

{'loss': 0.0001, 'grad_norm': 0.002303541637957096, 'learning_rate': 5.067538126361656e-05, 'epoch': 24.66}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11332/22950 [18:33<18:50, 10.28it/s]

{'loss': 0.0003, 'grad_norm': 0.004769328981637955, 'learning_rate': 5.063180827886711e-05, 'epoch': 24.68}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11342/22950 [18:34<18:33, 10.42it/s]

{'loss': 0.0008, 'grad_norm': 0.1643984615802765, 'learning_rate': 5.058823529411765e-05, 'epoch': 24.71}


 49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11352/22950 [18:35<18:38, 10.37it/s]

{'loss': 0.0002, 'grad_norm': 0.1141168624162674, 'learning_rate': 5.054466230936819e-05, 'epoch': 24.73}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11362/22950 [18:36<18:43, 10.31it/s]

{'loss': 0.0002, 'grad_norm': 0.0016520542558282614, 'learning_rate': 5.0501089324618736e-05, 'epoch': 24.75}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11372/22950 [18:37<18:52, 10.23it/s]

{'loss': 0.0445, 'grad_norm': 0.023273007944226265, 'learning_rate': 5.045751633986928e-05, 'epoch': 24.77}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11381/22950 [18:38<18:41, 10.32it/s]

{'loss': 0.0588, 'grad_norm': 0.002466929145157337, 'learning_rate': 5.041394335511983e-05, 'epoch': 24.79}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11391/22950 [18:38<18:36, 10.35it/s]

{'loss': 0.0022, 'grad_norm': 7.817131996154785, 'learning_rate': 5.0370370370370366e-05, 'epoch': 24.81}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11401/22950 [18:39<18:30, 10.40it/s]

{'loss': 0.0001, 'grad_norm': 0.00222898181527853, 'learning_rate': 5.032679738562092e-05, 'epoch': 24.84}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11411/22950 [18:40<18:39, 10.31it/s]

{'loss': 0.0699, 'grad_norm': 0.019715027883648872, 'learning_rate': 5.028322440087147e-05, 'epoch': 24.86}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11421/22950 [18:41<18:35, 10.34it/s]

{'loss': 0.0001, 'grad_norm': 0.00736536318436265, 'learning_rate': 5.0239651416122003e-05, 'epoch': 24.88}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11431/22950 [18:42<18:52, 10.17it/s]

{'loss': 0.0965, 'grad_norm': 0.05769248679280281, 'learning_rate': 5.0196078431372554e-05, 'epoch': 24.9}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11441/22950 [18:43<18:28, 10.38it/s]

{'loss': 0.0083, 'grad_norm': 0.0028764717280864716, 'learning_rate': 5.01525054466231e-05, 'epoch': 24.92}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11451/22950 [18:44<18:33, 10.33it/s]

{'loss': 0.0003, 'grad_norm': 0.009759489446878433, 'learning_rate': 5.0108932461873634e-05, 'epoch': 24.95}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11461/22950 [18:45<18:28, 10.36it/s]

{'loss': 0.0008, 'grad_norm': 0.00841898936778307, 'learning_rate': 5.0065359477124184e-05, 'epoch': 24.97}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 11471/22950 [18:46<18:46, 10.19it/s]

{'loss': 0.0002, 'grad_norm': 0.005452144891023636, 'learning_rate': 5.0021786492374734e-05, 'epoch': 24.99}


                                                     
 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11475/22950 [18:50<17:19, 11.04it/s]

{'eval_loss': 1.090532660484314, 'eval_accuracy': 0.8529411554336548, 'eval_runtime': 2.8761, 'eval_samples_per_second': 141.861, 'eval_steps_per_second': 17.733, 'epoch': 25.0}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11481/22950 [18:50<1:02:05,  3.08it/s]

{'loss': 0.0002, 'grad_norm': 0.07842149585485458, 'learning_rate': 4.997821350762528e-05, 'epoch': 25.01}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11492/22950 [18:52<24:32,  7.78it/s]  

{'loss': 0.0003, 'grad_norm': 0.0037162851076573133, 'learning_rate': 4.993464052287582e-05, 'epoch': 25.03}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11502/22950 [18:52<19:13,  9.93it/s]

{'loss': 0.0001, 'grad_norm': 0.07916175574064255, 'learning_rate': 4.9891067538126364e-05, 'epoch': 25.05}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11512/22950 [18:53<18:39, 10.22it/s]

{'loss': 0.028, 'grad_norm': 0.0017690344247967005, 'learning_rate': 4.984749455337691e-05, 'epoch': 25.08}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11522/22950 [18:54<18:14, 10.44it/s]

{'loss': 0.0633, 'grad_norm': 0.002524863462895155, 'learning_rate': 4.980392156862745e-05, 'epoch': 25.1}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11532/22950 [18:55<18:12, 10.45it/s]

{'loss': 0.0003, 'grad_norm': 0.0024175511207431555, 'learning_rate': 4.9760348583877995e-05, 'epoch': 25.12}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11542/22950 [18:56<18:06, 10.50it/s]

{'loss': 0.1347, 'grad_norm': 23.59355926513672, 'learning_rate': 4.971677559912854e-05, 'epoch': 25.14}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11552/22950 [18:57<18:13, 10.42it/s]

{'loss': 0.0001, 'grad_norm': 0.003942158073186874, 'learning_rate': 4.967320261437909e-05, 'epoch': 25.16}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11562/22950 [18:58<18:14, 10.40it/s]

{'loss': 0.0093, 'grad_norm': 0.002181149320676923, 'learning_rate': 4.962962962962963e-05, 'epoch': 25.19}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11572/22950 [18:59<18:04, 10.49it/s]

{'loss': 0.0013, 'grad_norm': 0.003927124664187431, 'learning_rate': 4.9586056644880175e-05, 'epoch': 25.21}


 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11582/22950 [19:00<18:01, 10.51it/s]

{'loss': 0.0001, 'grad_norm': 0.0033655075822025537, 'learning_rate': 4.9542483660130725e-05, 'epoch': 25.23}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11592/22950 [19:01<18:00, 10.51it/s]

{'loss': 0.0002, 'grad_norm': 0.0028291402850300074, 'learning_rate': 4.949891067538127e-05, 'epoch': 25.25}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11602/22950 [19:02<18:03, 10.48it/s]

{'loss': 0.0377, 'grad_norm': 122.89311981201172, 'learning_rate': 4.945533769063181e-05, 'epoch': 25.27}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11612/22950 [19:03<18:16, 10.34it/s]

{'loss': 0.0001, 'grad_norm': 0.00622877711430192, 'learning_rate': 4.9411764705882355e-05, 'epoch': 25.29}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11622/22950 [19:04<17:52, 10.56it/s]

{'loss': 0.094, 'grad_norm': 0.0022189212031662464, 'learning_rate': 4.93681917211329e-05, 'epoch': 25.32}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11632/22950 [19:05<18:01, 10.46it/s]

{'loss': 0.001, 'grad_norm': 0.040255919098854065, 'learning_rate': 4.932461873638344e-05, 'epoch': 25.34}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11642/22950 [19:06<18:24, 10.24it/s]

{'loss': 0.0001, 'grad_norm': 0.001306433230638504, 'learning_rate': 4.928104575163399e-05, 'epoch': 25.36}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11652/22950 [19:07<18:03, 10.42it/s]

{'loss': 0.0472, 'grad_norm': 0.002870869589969516, 'learning_rate': 4.9237472766884536e-05, 'epoch': 25.38}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11662/22950 [19:08<18:15, 10.30it/s]

{'loss': 0.0888, 'grad_norm': 0.0027108071371912956, 'learning_rate': 4.919389978213508e-05, 'epoch': 25.4}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11672/22950 [19:09<17:56, 10.47it/s]

{'loss': 0.0003, 'grad_norm': 0.002782002091407776, 'learning_rate': 4.915032679738562e-05, 'epoch': 25.42}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11682/22950 [19:10<17:51, 10.51it/s]

{'loss': 0.0299, 'grad_norm': 0.003617033362388611, 'learning_rate': 4.9106753812636166e-05, 'epoch': 25.45}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11692/22950 [19:11<17:58, 10.44it/s]

{'loss': 0.0004, 'grad_norm': 0.002327169757336378, 'learning_rate': 4.906318082788671e-05, 'epoch': 25.47}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11702/22950 [19:12<17:48, 10.52it/s]

{'loss': 0.0002, 'grad_norm': 0.0020167615730315447, 'learning_rate': 4.901960784313725e-05, 'epoch': 25.49}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11712/22950 [19:13<17:46, 10.54it/s]

{'loss': 0.0003, 'grad_norm': 0.0031695424113422632, 'learning_rate': 4.89760348583878e-05, 'epoch': 25.51}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11722/22950 [19:14<17:43, 10.56it/s]

{'loss': 0.0233, 'grad_norm': 0.0038529906887561083, 'learning_rate': 4.8932461873638346e-05, 'epoch': 25.53}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11732/22950 [19:15<17:41, 10.57it/s]

{'loss': 0.0707, 'grad_norm': 0.5296946167945862, 'learning_rate': 4.888888888888889e-05, 'epoch': 25.56}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11742/22950 [19:16<17:46, 10.51it/s]

{'loss': 0.0276, 'grad_norm': 0.0018507404020056129, 'learning_rate': 4.884531590413944e-05, 'epoch': 25.58}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 11752/22950 [19:16<17:49, 10.47it/s]

{'loss': 0.0002, 'grad_norm': 0.0019264094298705459, 'learning_rate': 4.8801742919389983e-05, 'epoch': 25.6}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11762/22950 [19:17<17:43, 10.52it/s]

{'loss': 0.0001, 'grad_norm': 0.0020515157375484705, 'learning_rate': 4.875816993464053e-05, 'epoch': 25.62}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11772/22950 [19:18<17:36, 10.58it/s]

{'loss': 0.0003, 'grad_norm': 0.013344546779990196, 'learning_rate': 4.871459694989107e-05, 'epoch': 25.64}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11782/22950 [19:19<17:44, 10.49it/s]

{'loss': 0.0001, 'grad_norm': 0.0015066355699673295, 'learning_rate': 4.8671023965141614e-05, 'epoch': 25.66}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11792/22950 [19:20<17:37, 10.55it/s]

{'loss': 0.0447, 'grad_norm': 90.13072204589844, 'learning_rate': 4.862745098039216e-05, 'epoch': 25.69}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11802/22950 [19:21<17:31, 10.60it/s]

{'loss': 0.0001, 'grad_norm': 0.0015994466375559568, 'learning_rate': 4.85838779956427e-05, 'epoch': 25.71}


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11812/22950 [19:22<17:29, 10.62it/s]

{'loss': 0.0001, 'grad_norm': 0.013163520023226738, 'learning_rate': 4.854030501089325e-05, 'epoch': 25.73}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11822/22950 [19:23<18:19, 10.12it/s]

{'loss': 0.1133, 'grad_norm': 0.0015792699996381998, 'learning_rate': 4.8496732026143794e-05, 'epoch': 25.75}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11832/22950 [19:24<17:31, 10.57it/s]

{'loss': 0.0694, 'grad_norm': 0.004694065544754267, 'learning_rate': 4.845315904139434e-05, 'epoch': 25.77}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11842/22950 [19:25<17:22, 10.65it/s]

{'loss': 0.0147, 'grad_norm': 0.00171568151563406, 'learning_rate': 4.840958605664489e-05, 'epoch': 25.8}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11852/22950 [19:26<17:29, 10.57it/s]

{'loss': 0.0685, 'grad_norm': 0.5651242733001709, 'learning_rate': 4.8366013071895424e-05, 'epoch': 25.82}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11862/22950 [19:27<17:36, 10.49it/s]

{'loss': 0.0003, 'grad_norm': 0.0035163273569196463, 'learning_rate': 4.832244008714597e-05, 'epoch': 25.84}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11872/22950 [19:28<17:22, 10.63it/s]

{'loss': 0.0004, 'grad_norm': 0.12637659907341003, 'learning_rate': 4.827886710239652e-05, 'epoch': 25.86}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11882/22950 [19:29<17:22, 10.62it/s]

{'loss': 0.0002, 'grad_norm': 0.002425288548693061, 'learning_rate': 4.823529411764706e-05, 'epoch': 25.88}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11892/22950 [19:30<17:24, 10.59it/s]

{'loss': 0.0002, 'grad_norm': 0.002391709014773369, 'learning_rate': 4.8191721132897605e-05, 'epoch': 25.9}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11902/22950 [19:31<17:29, 10.53it/s]

{'loss': 0.0133, 'grad_norm': 0.002104258630424738, 'learning_rate': 4.814814814814815e-05, 'epoch': 25.93}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11912/22950 [19:32<17:15, 10.66it/s]

{'loss': 0.0115, 'grad_norm': 0.002087573753669858, 'learning_rate': 4.81045751633987e-05, 'epoch': 25.95}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11922/22950 [19:33<17:20, 10.60it/s]

{'loss': 0.0001, 'grad_norm': 0.004377382807433605, 'learning_rate': 4.806100217864924e-05, 'epoch': 25.97}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11932/22950 [19:34<17:11, 10.68it/s]

{'loss': 0.0005, 'grad_norm': 0.0032309689559042454, 'learning_rate': 4.8017429193899785e-05, 'epoch': 25.99}


                                                     
 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11934/22950 [19:37<16:12, 11.33it/s]

{'eval_loss': 1.3207662105560303, 'eval_accuracy': 0.8161764740943909, 'eval_runtime': 2.8145, 'eval_samples_per_second': 144.965, 'eval_steps_per_second': 18.121, 'epoch': 26.0}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11942/22950 [19:38<45:17,  4.05it/s]  

{'loss': 0.1034, 'grad_norm': 0.014009013772010803, 'learning_rate': 4.797385620915033e-05, 'epoch': 26.01}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11952/22950 [19:39<22:24,  8.18it/s]

{'loss': 0.0459, 'grad_norm': 0.007016138173639774, 'learning_rate': 4.793028322440087e-05, 'epoch': 26.03}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11962/22950 [19:40<17:59, 10.18it/s]

{'loss': 0.0501, 'grad_norm': 0.0020054797641932964, 'learning_rate': 4.7886710239651415e-05, 'epoch': 26.06}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11972/22950 [19:41<17:15, 10.60it/s]

{'loss': 0.0106, 'grad_norm': 0.0013408849481493235, 'learning_rate': 4.7843137254901966e-05, 'epoch': 26.08}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11982/22950 [19:41<17:09, 10.65it/s]

{'loss': 0.0026, 'grad_norm': 0.0019449839601293206, 'learning_rate': 4.779956427015251e-05, 'epoch': 26.1}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 11992/22950 [19:42<17:01, 10.73it/s]

{'loss': 0.0005, 'grad_norm': 0.0039300452917814255, 'learning_rate': 4.775599128540305e-05, 'epoch': 26.12}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 12002/22950 [19:43<16:58, 10.75it/s]

{'loss': 0.0001, 'grad_norm': 0.0014149990165606141, 'learning_rate': 4.77124183006536e-05, 'epoch': 26.14}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 12012/22950 [19:44<16:50, 10.82it/s]

{'loss': 0.0001, 'grad_norm': 0.005744571331888437, 'learning_rate': 4.766884531590414e-05, 'epoch': 26.17}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 12022/22950 [19:45<16:56, 10.76it/s]

{'loss': 0.0307, 'grad_norm': 0.0019996066112071276, 'learning_rate': 4.762527233115468e-05, 'epoch': 26.19}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 12032/22950 [19:46<17:23, 10.46it/s]

{'loss': 0.0001, 'grad_norm': 0.003818060737103224, 'learning_rate': 4.7581699346405226e-05, 'epoch': 26.21}


 52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 12042/22950 [19:47<16:50, 10.79it/s]

{'loss': 0.0002, 'grad_norm': 0.001521506579592824, 'learning_rate': 4.7538126361655776e-05, 'epoch': 26.23}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12052/22950 [19:48<16:49, 10.79it/s]

{'loss': 0.0732, 'grad_norm': 0.0017212631646543741, 'learning_rate': 4.749455337690632e-05, 'epoch': 26.25}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12062/22950 [19:49<16:50, 10.77it/s]

{'loss': 0.0001, 'grad_norm': 0.0028094316367059946, 'learning_rate': 4.745098039215686e-05, 'epoch': 26.27}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12072/22950 [19:50<16:49, 10.78it/s]

{'loss': 0.0015, 'grad_norm': 0.0016369695076718926, 'learning_rate': 4.740740740740741e-05, 'epoch': 26.3}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12082/22950 [19:51<16:56, 10.69it/s]

{'loss': 0.0002, 'grad_norm': 0.002386099426075816, 'learning_rate': 4.736383442265796e-05, 'epoch': 26.32}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12092/22950 [19:52<17:02, 10.62it/s]

{'loss': 0.0003, 'grad_norm': 0.0018895192770287395, 'learning_rate': 4.73202614379085e-05, 'epoch': 26.34}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12102/22950 [19:53<16:48, 10.75it/s]

{'loss': 0.0023, 'grad_norm': 20.14342498779297, 'learning_rate': 4.7276688453159044e-05, 'epoch': 26.36}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12112/22950 [19:54<16:46, 10.77it/s]

{'loss': 0.0001, 'grad_norm': 0.0021942604798823595, 'learning_rate': 4.723311546840959e-05, 'epoch': 26.38}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12122/22950 [19:55<16:52, 10.69it/s]

{'loss': 0.0002, 'grad_norm': 0.0011459642555564642, 'learning_rate': 4.718954248366013e-05, 'epoch': 26.41}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12132/22950 [19:56<16:49, 10.72it/s]

{'loss': 0.0196, 'grad_norm': 0.0017699733143672347, 'learning_rate': 4.714596949891068e-05, 'epoch': 26.43}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12142/22950 [19:56<16:50, 10.69it/s]

{'loss': 0.0076, 'grad_norm': 0.0021254108287394047, 'learning_rate': 4.7102396514161224e-05, 'epoch': 26.45}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12152/22950 [19:57<16:42, 10.77it/s]

{'loss': 0.1027, 'grad_norm': 23.565969467163086, 'learning_rate': 4.705882352941177e-05, 'epoch': 26.47}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12162/22950 [19:58<16:38, 10.81it/s]

{'loss': 0.0238, 'grad_norm': 0.22674492001533508, 'learning_rate': 4.701525054466231e-05, 'epoch': 26.49}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12172/22950 [19:59<16:40, 10.77it/s]

{'loss': 0.0004, 'grad_norm': 0.002270056866109371, 'learning_rate': 4.697167755991286e-05, 'epoch': 26.51}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12182/22950 [20:00<17:16, 10.39it/s]

{'loss': 0.0001, 'grad_norm': 0.008717812597751617, 'learning_rate': 4.69281045751634e-05, 'epoch': 26.54}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12192/22950 [20:01<16:40, 10.75it/s]

{'loss': 0.0001, 'grad_norm': 0.001345728407613933, 'learning_rate': 4.688453159041394e-05, 'epoch': 26.56}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12202/22950 [20:02<16:38, 10.77it/s]

{'loss': 0.0008, 'grad_norm': 0.001508642453700304, 'learning_rate': 4.684095860566449e-05, 'epoch': 26.58}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12212/22950 [20:03<16:35, 10.78it/s]

{'loss': 0.0002, 'grad_norm': 0.01182522065937519, 'learning_rate': 4.6797385620915035e-05, 'epoch': 26.6}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12222/22950 [20:04<16:34, 10.78it/s]

{'loss': 0.0016, 'grad_norm': 0.0017695577116683125, 'learning_rate': 4.675381263616558e-05, 'epoch': 26.62}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12232/22950 [20:05<17:07, 10.43it/s]

{'loss': 0.0001, 'grad_norm': 0.002501227194443345, 'learning_rate': 4.671023965141613e-05, 'epoch': 26.64}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12242/22950 [20:06<16:21, 10.91it/s]

{'loss': 0.0001, 'grad_norm': 0.03379087522625923, 'learning_rate': 4.666666666666667e-05, 'epoch': 26.67}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12252/22950 [20:07<16:25, 10.86it/s]

{'loss': 0.0997, 'grad_norm': 0.0018186646047979593, 'learning_rate': 4.6623093681917215e-05, 'epoch': 26.69}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12262/22950 [20:08<16:19, 10.91it/s]

{'loss': 0.0778, 'grad_norm': 0.0022214981727302074, 'learning_rate': 4.657952069716776e-05, 'epoch': 26.71}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12272/22950 [20:09<16:15, 10.95it/s]

{'loss': 0.0001, 'grad_norm': 0.003742816625162959, 'learning_rate': 4.65359477124183e-05, 'epoch': 26.73}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12282/22950 [20:10<16:38, 10.68it/s]

{'loss': 0.0001, 'grad_norm': 0.004589047282934189, 'learning_rate': 4.6492374727668845e-05, 'epoch': 26.75}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12292/22950 [20:10<16:17, 10.91it/s]

{'loss': 0.0002, 'grad_norm': 0.002664149273186922, 'learning_rate': 4.644880174291939e-05, 'epoch': 26.78}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12302/22950 [20:11<16:13, 10.93it/s]

{'loss': 0.0146, 'grad_norm': 0.0025053706485778093, 'learning_rate': 4.640522875816994e-05, 'epoch': 26.8}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12312/22950 [20:12<16:19, 10.86it/s]

{'loss': 0.0002, 'grad_norm': 0.0020243562757968903, 'learning_rate': 4.636165577342048e-05, 'epoch': 26.82}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12322/22950 [20:13<16:15, 10.90it/s]

{'loss': 0.0038, 'grad_norm': 0.004545052535831928, 'learning_rate': 4.6318082788671026e-05, 'epoch': 26.84}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 12332/22950 [20:14<16:44, 10.57it/s]

{'loss': 0.0821, 'grad_norm': 0.0014026651624590158, 'learning_rate': 4.6274509803921576e-05, 'epoch': 26.86}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12342/22950 [20:15<16:19, 10.83it/s]

{'loss': 0.0001, 'grad_norm': 0.18443426489830017, 'learning_rate': 4.623093681917212e-05, 'epoch': 26.88}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12352/22950 [20:16<16:11, 10.91it/s]

{'loss': 0.0006, 'grad_norm': 0.00131220743060112, 'learning_rate': 4.6187363834422656e-05, 'epoch': 26.91}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12362/22950 [20:17<16:10, 10.91it/s]

{'loss': 0.0001, 'grad_norm': 0.0013644990976899862, 'learning_rate': 4.6143790849673206e-05, 'epoch': 26.93}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12372/22950 [20:18<16:09, 10.91it/s]

{'loss': 0.0001, 'grad_norm': 0.023775020614266396, 'learning_rate': 4.610021786492375e-05, 'epoch': 26.95}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12382/22950 [20:19<16:26, 10.71it/s]

{'loss': 0.0001, 'grad_norm': 0.001871586195193231, 'learning_rate': 4.605664488017429e-05, 'epoch': 26.97}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12392/22950 [20:20<16:07, 10.91it/s]

{'loss': 0.0, 'grad_norm': 0.0017773329745978117, 'learning_rate': 4.6013071895424836e-05, 'epoch': 26.99}


                                                     
 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12393/22950 [20:22<16:07, 10.91it/s]

{'eval_loss': 1.2536544799804688, 'eval_accuracy': 0.843137264251709, 'eval_runtime': 2.7446, 'eval_samples_per_second': 148.654, 'eval_steps_per_second': 18.582, 'epoch': 27.0}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12402/22950 [20:24<34:01,  5.17it/s]  

{'loss': 0.0001, 'grad_norm': 0.002430505584925413, 'learning_rate': 4.5969498910675387e-05, 'epoch': 27.02}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12412/22950 [20:25<19:48,  8.87it/s]

{'loss': 0.0001, 'grad_norm': 0.0016617277869954705, 'learning_rate': 4.592592592592593e-05, 'epoch': 27.04}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12422/22950 [20:26<16:56, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.001646960387006402, 'learning_rate': 4.588235294117647e-05, 'epoch': 27.06}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12432/22950 [20:26<16:05, 10.89it/s]

{'loss': 0.0011, 'grad_norm': 0.7847554683685303, 'learning_rate': 4.583877995642702e-05, 'epoch': 27.08}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12442/22950 [20:27<15:54, 11.00it/s]

{'loss': 0.0852, 'grad_norm': 0.036401763558387756, 'learning_rate': 4.579520697167756e-05, 'epoch': 27.1}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12452/22950 [20:28<15:52, 11.02it/s]

{'loss': 0.1195, 'grad_norm': 0.0038257273845374584, 'learning_rate': 4.5751633986928104e-05, 'epoch': 27.12}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12462/22950 [20:29<15:54, 10.99it/s]

{'loss': 0.0001, 'grad_norm': 0.0032822785433381796, 'learning_rate': 4.5708061002178654e-05, 'epoch': 27.15}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12472/22950 [20:30<16:14, 10.75it/s]

{'loss': 0.0503, 'grad_norm': 0.0029178119730204344, 'learning_rate': 4.56644880174292e-05, 'epoch': 27.17}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12482/22950 [20:31<15:55, 10.95it/s]

{'loss': 0.0002, 'grad_norm': 0.0038736730348318815, 'learning_rate': 4.562091503267974e-05, 'epoch': 27.19}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12492/22950 [20:32<15:46, 11.04it/s]

{'loss': 0.0001, 'grad_norm': 0.0035106961149722338, 'learning_rate': 4.557734204793029e-05, 'epoch': 27.21}


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12502/22950 [20:33<15:52, 10.97it/s]

{'loss': 0.0001, 'grad_norm': 0.007072497624903917, 'learning_rate': 4.5533769063180834e-05, 'epoch': 27.23}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12512/22950 [20:34<15:48, 11.01it/s]

{'loss': 0.0001, 'grad_norm': 0.0018645052332431078, 'learning_rate': 4.549019607843137e-05, 'epoch': 27.25}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12522/22950 [20:35<16:09, 10.76it/s]

{'loss': 0.0073, 'grad_norm': 0.0024323854595422745, 'learning_rate': 4.5446623093681914e-05, 'epoch': 27.28}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12532/22950 [20:36<15:50, 10.96it/s]

{'loss': 0.0412, 'grad_norm': 0.022099563851952553, 'learning_rate': 4.5403050108932464e-05, 'epoch': 27.3}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12542/22950 [20:37<15:46, 10.99it/s]

{'loss': 0.0001, 'grad_norm': 0.010845988988876343, 'learning_rate': 4.535947712418301e-05, 'epoch': 27.32}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12552/22950 [20:37<16:08, 10.73it/s]

{'loss': 0.0001, 'grad_norm': 0.0019894742872565985, 'learning_rate': 4.531590413943355e-05, 'epoch': 27.34}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12562/22950 [20:38<16:00, 10.82it/s]

{'loss': 0.0001, 'grad_norm': 0.0026373316068202257, 'learning_rate': 4.52723311546841e-05, 'epoch': 27.36}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12572/22950 [20:39<16:31, 10.47it/s]

{'loss': 0.0001, 'grad_norm': 0.002985617145895958, 'learning_rate': 4.5228758169934645e-05, 'epoch': 27.39}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12582/22950 [20:40<16:02, 10.77it/s]

{'loss': 0.0001, 'grad_norm': 0.009709702804684639, 'learning_rate': 4.518518518518519e-05, 'epoch': 27.41}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12592/22950 [20:41<15:49, 10.91it/s]

{'loss': 0.0296, 'grad_norm': 0.0017359571065753698, 'learning_rate': 4.514161220043573e-05, 'epoch': 27.43}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12602/22950 [20:42<15:46, 10.93it/s]

{'loss': 0.0061, 'grad_norm': 1.2690446376800537, 'learning_rate': 4.5098039215686275e-05, 'epoch': 27.45}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12612/22950 [20:43<15:41, 10.98it/s]

{'loss': 0.0002, 'grad_norm': 0.0012789631728082895, 'learning_rate': 4.505446623093682e-05, 'epoch': 27.47}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 12622/22950 [20:44<15:56, 10.79it/s]

{'loss': 0.0003, 'grad_norm': 0.7207223773002625, 'learning_rate': 4.501089324618736e-05, 'epoch': 27.49}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12632/22950 [20:45<16:01, 10.73it/s]

{'loss': 0.0013, 'grad_norm': 5.533519744873047, 'learning_rate': 4.496732026143791e-05, 'epoch': 27.52}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12642/22950 [20:46<15:38, 10.98it/s]

{'loss': 0.0002, 'grad_norm': 0.0015388285974040627, 'learning_rate': 4.4923747276688455e-05, 'epoch': 27.54}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12652/22950 [20:47<15:47, 10.87it/s]

{'loss': 0.0001, 'grad_norm': 0.0015082451282069087, 'learning_rate': 4.4880174291939e-05, 'epoch': 27.56}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12662/22950 [20:48<15:38, 10.96it/s]

{'loss': 0.0001, 'grad_norm': 0.001391092548146844, 'learning_rate': 4.483660130718955e-05, 'epoch': 27.58}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12672/22950 [20:49<15:57, 10.73it/s]

{'loss': 0.0002, 'grad_norm': 0.0017812768928706646, 'learning_rate': 4.479302832244009e-05, 'epoch': 27.6}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12682/22950 [20:49<15:36, 10.97it/s]

{'loss': 0.0, 'grad_norm': 0.0014833693858236074, 'learning_rate': 4.474945533769063e-05, 'epoch': 27.63}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12692/22950 [20:50<15:35, 10.97it/s]

{'loss': 0.081, 'grad_norm': 0.0020659780129790306, 'learning_rate': 4.470588235294118e-05, 'epoch': 27.65}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12702/22950 [20:51<15:26, 11.06it/s]

{'loss': 0.0145, 'grad_norm': 0.001499159843660891, 'learning_rate': 4.466230936819172e-05, 'epoch': 27.67}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12712/22950 [20:52<15:51, 10.76it/s]

{'loss': 0.0001, 'grad_norm': 0.0019850456155836582, 'learning_rate': 4.4618736383442266e-05, 'epoch': 27.69}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12722/22950 [20:53<15:31, 10.98it/s]

{'loss': 0.0003, 'grad_norm': 0.0021664691157639027, 'learning_rate': 4.4575163398692816e-05, 'epoch': 27.71}


 55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12732/22950 [20:54<15:27, 11.02it/s]

{'loss': 0.0817, 'grad_norm': 0.0025154254399240017, 'learning_rate': 4.453159041394336e-05, 'epoch': 27.73}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12742/22950 [20:55<15:24, 11.04it/s]

{'loss': 0.0642, 'grad_norm': 0.0022218876983970404, 'learning_rate': 4.44880174291939e-05, 'epoch': 27.76}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12752/22950 [20:56<15:27, 10.99it/s]

{'loss': 0.0158, 'grad_norm': 0.0022077877074480057, 'learning_rate': 4.4444444444444447e-05, 'epoch': 27.78}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12762/22950 [20:57<15:59, 10.62it/s]

{'loss': 0.0002, 'grad_norm': 0.002105979947373271, 'learning_rate': 4.440087145969499e-05, 'epoch': 27.8}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12772/22950 [20:58<16:04, 10.55it/s]

{'loss': 0.0681, 'grad_norm': 0.00275757466442883, 'learning_rate': 4.4357298474945533e-05, 'epoch': 27.82}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12782/22950 [20:59<15:45, 10.76it/s]

{'loss': 0.0005, 'grad_norm': 0.08032558858394623, 'learning_rate': 4.431372549019608e-05, 'epoch': 27.84}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12792/22950 [21:00<15:19, 11.04it/s]

{'loss': 0.0122, 'grad_norm': 0.001963082468137145, 'learning_rate': 4.427015250544663e-05, 'epoch': 27.86}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12802/22950 [21:01<15:29, 10.92it/s]

{'loss': 0.0001, 'grad_norm': 0.0021376002114266157, 'learning_rate': 4.422657952069717e-05, 'epoch': 27.89}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12812/22950 [21:01<15:23, 10.98it/s]

{'loss': 0.0001, 'grad_norm': 0.001837786054238677, 'learning_rate': 4.4183006535947714e-05, 'epoch': 27.91}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12822/22950 [21:02<15:16, 11.05it/s]

{'loss': 0.0013, 'grad_norm': 0.0026648035272955894, 'learning_rate': 4.4139433551198264e-05, 'epoch': 27.93}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12832/22950 [21:03<15:32, 10.85it/s]

{'loss': 0.0001, 'grad_norm': 0.0033339178189635277, 'learning_rate': 4.409586056644881e-05, 'epoch': 27.95}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12842/22950 [21:04<15:20, 10.98it/s]

{'loss': 0.0014, 'grad_norm': 0.004605880472809076, 'learning_rate': 4.405228758169935e-05, 'epoch': 27.97}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12852/22950 [21:05<14:26, 11.66it/s]

{'loss': 0.0685, 'grad_norm': 0.002388670574873686, 'learning_rate': 4.400871459694989e-05, 'epoch': 28.0}


                                                     
 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12852/22950 [21:08<14:26, 11.66it/s]

{'eval_loss': 1.3244091272354126, 'eval_accuracy': 0.843137264251709, 'eval_runtime': 2.7446, 'eval_samples_per_second': 148.654, 'eval_steps_per_second': 18.582, 'epoch': 28.0}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12862/22950 [21:09<32:33,  5.16it/s]  

{'loss': 0.0747, 'grad_norm': 0.005745036527514458, 'learning_rate': 4.396514161220044e-05, 'epoch': 28.02}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12872/22950 [21:10<18:39,  9.00it/s]

{'loss': 0.0095, 'grad_norm': 0.00158458121586591, 'learning_rate': 4.392156862745098e-05, 'epoch': 28.04}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12882/22950 [21:11<15:54, 10.55it/s]

{'loss': 0.0001, 'grad_norm': 0.0014534693909808993, 'learning_rate': 4.3877995642701524e-05, 'epoch': 28.06}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12892/22950 [21:12<15:20, 10.92it/s]

{'loss': 0.0, 'grad_norm': 0.0015425090678036213, 'learning_rate': 4.3834422657952075e-05, 'epoch': 28.08}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 12902/22950 [21:13<15:26, 10.85it/s]

{'loss': 0.0001, 'grad_norm': 0.0021324683912098408, 'learning_rate': 4.379084967320262e-05, 'epoch': 28.1}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12912/22950 [21:14<15:16, 10.96it/s]

{'loss': 0.0001, 'grad_norm': 0.005532430950552225, 'learning_rate': 4.374727668845316e-05, 'epoch': 28.13}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12922/22950 [21:15<15:05, 11.08it/s]

{'loss': 0.0, 'grad_norm': 0.001542725134640932, 'learning_rate': 4.3703703703703705e-05, 'epoch': 28.15}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12932/22950 [21:16<15:27, 10.80it/s]

{'loss': 0.0, 'grad_norm': 0.0015665477840229869, 'learning_rate': 4.366013071895425e-05, 'epoch': 28.17}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12942/22950 [21:16<15:29, 10.77it/s]

{'loss': 0.0001, 'grad_norm': 0.0018092530081048608, 'learning_rate': 4.361655773420479e-05, 'epoch': 28.19}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12952/22950 [21:17<14:59, 11.11it/s]

{'loss': 0.1249, 'grad_norm': 0.0016286548925563693, 'learning_rate': 4.357298474945534e-05, 'epoch': 28.21}


 56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12962/22950 [21:18<15:03, 11.06it/s]

{'loss': 0.1088, 'grad_norm': 0.01606598310172558, 'learning_rate': 4.3529411764705885e-05, 'epoch': 28.24}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12970/22950 [21:19<15:59, 10.40it/s]

{'loss': 0.0072, 'grad_norm': 0.021076476201415062, 'learning_rate': 4.348583877995643e-05, 'epoch': 28.26}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12982/22950 [21:20<14:51, 11.19it/s]

{'loss': 0.0692, 'grad_norm': 23.35841941833496, 'learning_rate': 4.344226579520697e-05, 'epoch': 28.28}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 12992/22950 [21:21<15:15, 10.88it/s]

{'loss': 0.0007, 'grad_norm': 2.220912218093872, 'learning_rate': 4.339869281045752e-05, 'epoch': 28.3}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13002/22950 [21:22<15:07, 10.96it/s]

{'loss': 0.0009, 'grad_norm': 0.007149074226617813, 'learning_rate': 4.3355119825708066e-05, 'epoch': 28.32}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13012/22950 [21:23<15:16, 10.85it/s]

{'loss': 0.0001, 'grad_norm': 0.003805340500548482, 'learning_rate': 4.33115468409586e-05, 'epoch': 28.34}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13022/22950 [21:24<15:10, 10.90it/s]

{'loss': 0.0006, 'grad_norm': 0.0014310465194284916, 'learning_rate': 4.326797385620915e-05, 'epoch': 28.37}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13032/22950 [21:25<15:20, 10.78it/s]

{'loss': 0.0059, 'grad_norm': 0.05244935303926468, 'learning_rate': 4.3224400871459696e-05, 'epoch': 28.39}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13042/22950 [21:26<14:56, 11.05it/s]

{'loss': 0.0001, 'grad_norm': 0.001869887812063098, 'learning_rate': 4.318082788671024e-05, 'epoch': 28.41}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13052/22950 [21:27<14:56, 11.05it/s]

{'loss': 0.0001, 'grad_norm': 0.0016141324304044247, 'learning_rate': 4.313725490196079e-05, 'epoch': 28.43}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13062/22950 [21:27<15:02, 10.96it/s]

{'loss': 0.0001, 'grad_norm': 0.0013096077600494027, 'learning_rate': 4.309368191721133e-05, 'epoch': 28.45}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13072/22950 [21:28<14:55, 11.03it/s]

{'loss': 0.0002, 'grad_norm': 0.0013807243667542934, 'learning_rate': 4.3050108932461876e-05, 'epoch': 28.47}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13082/22950 [21:29<15:05, 10.90it/s]

{'loss': 0.0001, 'grad_norm': 0.0017824271926656365, 'learning_rate': 4.300653594771242e-05, 'epoch': 28.5}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13092/22950 [21:30<14:53, 11.03it/s]

{'loss': 0.0001, 'grad_norm': 0.0015610263217240572, 'learning_rate': 4.296296296296296e-05, 'epoch': 28.52}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13102/22950 [21:31<14:51, 11.05it/s]

{'loss': 0.0001, 'grad_norm': 0.0021663266234099865, 'learning_rate': 4.291938997821351e-05, 'epoch': 28.54}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13112/22950 [21:32<14:56, 10.97it/s]

{'loss': 0.0463, 'grad_norm': 0.5256273150444031, 'learning_rate': 4.287581699346405e-05, 'epoch': 28.56}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13122/22950 [21:33<15:16, 10.72it/s]

{'loss': 0.1971, 'grad_norm': 0.001164403511211276, 'learning_rate': 4.28322440087146e-05, 'epoch': 28.58}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13132/22950 [21:34<14:51, 11.01it/s]

{'loss': 0.0001, 'grad_norm': 0.001422042609192431, 'learning_rate': 4.2788671023965144e-05, 'epoch': 28.61}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13142/22950 [21:35<14:46, 11.07it/s]

{'loss': 0.0001, 'grad_norm': 0.0019449616083875299, 'learning_rate': 4.274509803921569e-05, 'epoch': 28.63}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13152/22950 [21:36<14:45, 11.07it/s]

{'loss': 0.0001, 'grad_norm': 0.0017896501813083887, 'learning_rate': 4.270152505446624e-05, 'epoch': 28.65}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13162/22950 [21:37<14:43, 11.08it/s]

{'loss': 0.0001, 'grad_norm': 0.0029076493810862303, 'learning_rate': 4.265795206971678e-05, 'epoch': 28.67}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13172/22950 [21:38<14:55, 10.92it/s]

{'loss': 0.0002, 'grad_norm': 0.0021984011400491, 'learning_rate': 4.2614379084967324e-05, 'epoch': 28.69}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13182/22950 [21:38<14:46, 11.02it/s]

{'loss': 0.024, 'grad_norm': 0.0016242071287706494, 'learning_rate': 4.257080610021787e-05, 'epoch': 28.71}


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13192/22950 [21:39<14:39, 11.09it/s]

{'loss': 0.0402, 'grad_norm': 0.0017259255982935429, 'learning_rate': 4.252723311546841e-05, 'epoch': 28.74}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13202/22950 [21:40<14:43, 11.03it/s]

{'loss': 0.0005, 'grad_norm': 0.0031897465232759714, 'learning_rate': 4.2483660130718954e-05, 'epoch': 28.76}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13212/22950 [21:41<15:07, 10.73it/s]

{'loss': 0.1054, 'grad_norm': 0.003011297667399049, 'learning_rate': 4.24400871459695e-05, 'epoch': 28.78}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13222/22950 [21:42<14:47, 10.97it/s]

{'loss': 0.0208, 'grad_norm': 1.7619519233703613, 'learning_rate': 4.239651416122005e-05, 'epoch': 28.8}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13232/22950 [21:43<14:41, 11.02it/s]

{'loss': 0.0003, 'grad_norm': 0.009137487970292568, 'learning_rate': 4.235294117647059e-05, 'epoch': 28.82}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13242/22950 [21:44<14:36, 11.08it/s]

{'loss': 0.005, 'grad_norm': 0.005276297219097614, 'learning_rate': 4.2309368191721135e-05, 'epoch': 28.85}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13252/22950 [21:45<14:45, 10.95it/s]

{'loss': 0.0081, 'grad_norm': 1.0646439790725708, 'learning_rate': 4.226579520697168e-05, 'epoch': 28.87}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13262/22950 [21:46<15:06, 10.69it/s]

{'loss': 0.0002, 'grad_norm': 0.0037928111851215363, 'learning_rate': 4.222222222222222e-05, 'epoch': 28.89}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13272/22950 [21:47<14:44, 10.94it/s]

{'loss': 0.0001, 'grad_norm': 0.003958418034017086, 'learning_rate': 4.2178649237472765e-05, 'epoch': 28.91}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13282/22950 [21:48<14:35, 11.05it/s]

{'loss': 0.0396, 'grad_norm': 0.0028053920250386, 'learning_rate': 4.2135076252723315e-05, 'epoch': 28.93}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13292/22950 [21:49<14:53, 10.81it/s]

{'loss': 0.0002, 'grad_norm': 0.019607897847890854, 'learning_rate': 4.209150326797386e-05, 'epoch': 28.95}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13302/22950 [21:49<15:03, 10.68it/s]

{'loss': 0.0001, 'grad_norm': 0.0197701845318079, 'learning_rate': 4.20479302832244e-05, 'epoch': 28.98}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13310/22950 [21:50<14:34, 11.02it/s]

{'loss': 0.0003, 'grad_norm': 0.0029863910749554634, 'learning_rate': 4.200435729847495e-05, 'epoch': 29.0}


                                                     
 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13311/22950 [21:53<14:34, 11.02it/s]

{'eval_loss': 1.1376022100448608, 'eval_accuracy': 0.8504902124404907, 'eval_runtime': 2.7153, 'eval_samples_per_second': 150.261, 'eval_steps_per_second': 18.783, 'epoch': 29.0}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13322/22950 [21:54<26:03,  6.16it/s]  

{'loss': 0.0002, 'grad_norm': 0.21269886195659637, 'learning_rate': 4.1960784313725496e-05, 'epoch': 29.02}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13332/22950 [21:55<16:48,  9.54it/s]

{'loss': 0.0001, 'grad_norm': 0.0034272363409399986, 'learning_rate': 4.191721132897604e-05, 'epoch': 29.04}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13342/22950 [21:56<15:00, 10.67it/s]

{'loss': 0.0001, 'grad_norm': 0.00323216593824327, 'learning_rate': 4.1873638344226576e-05, 'epoch': 29.06}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13352/22950 [21:57<14:29, 11.04it/s]

{'loss': 0.0001, 'grad_norm': 0.007192724384367466, 'learning_rate': 4.1830065359477126e-05, 'epoch': 29.08}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13362/22950 [21:58<14:32, 10.99it/s]

{'loss': 0.0083, 'grad_norm': 1.3166528940200806, 'learning_rate': 4.178649237472767e-05, 'epoch': 29.11}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13372/22950 [21:59<14:42, 10.85it/s]

{'loss': 0.0001, 'grad_norm': 0.012472530826926231, 'learning_rate': 4.174291938997821e-05, 'epoch': 29.13}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13382/22950 [22:00<14:30, 11.00it/s]

{'loss': 0.0002, 'grad_norm': 0.3312753438949585, 'learning_rate': 4.169934640522876e-05, 'epoch': 29.15}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13392/22950 [22:01<14:25, 11.04it/s]

{'loss': 0.0019, 'grad_norm': 0.0025615559425204992, 'learning_rate': 4.1655773420479306e-05, 'epoch': 29.17}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13402/22950 [22:02<14:24, 11.04it/s]

{'loss': 0.0505, 'grad_norm': 0.0023158774711191654, 'learning_rate': 4.161220043572985e-05, 'epoch': 29.19}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13412/22950 [22:03<14:38, 10.86it/s]

{'loss': 0.0025, 'grad_norm': 0.0027065544854849577, 'learning_rate': 4.156862745098039e-05, 'epoch': 29.22}


 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13422/22950 [22:03<14:33, 10.91it/s]

{'loss': 0.0001, 'grad_norm': 0.0018575668800622225, 'learning_rate': 4.1525054466230936e-05, 'epoch': 29.24}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13432/22950 [22:04<14:25, 10.99it/s]

{'loss': 0.0001, 'grad_norm': 0.001237585674971342, 'learning_rate': 4.148148148148148e-05, 'epoch': 29.26}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13442/22950 [22:05<14:20, 11.05it/s]

{'loss': 0.0001, 'grad_norm': 0.0023789445403963327, 'learning_rate': 4.143790849673203e-05, 'epoch': 29.28}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13452/22950 [22:06<14:20, 11.04it/s]

{'loss': 0.0001, 'grad_norm': 0.0030631988774985075, 'learning_rate': 4.1394335511982573e-05, 'epoch': 29.3}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13462/22950 [22:07<14:42, 10.75it/s]

{'loss': 0.0001, 'grad_norm': 0.0013817844446748495, 'learning_rate': 4.135076252723312e-05, 'epoch': 29.32}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13472/22950 [22:08<14:23, 10.97it/s]

{'loss': 0.0, 'grad_norm': 0.0026191927026957273, 'learning_rate': 4.130718954248366e-05, 'epoch': 29.35}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 13482/22950 [22:09<14:20, 11.01it/s]

{'loss': 0.0001, 'grad_norm': 0.001059861620888114, 'learning_rate': 4.126361655773421e-05, 'epoch': 29.37}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13492/22950 [22:10<14:17, 11.02it/s]

{'loss': 0.0012, 'grad_norm': 0.0021548487711697817, 'learning_rate': 4.1220043572984754e-05, 'epoch': 29.39}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13502/22950 [22:11<14:18, 11.00it/s]

{'loss': 0.0062, 'grad_norm': 1.044989824295044, 'learning_rate': 4.11764705882353e-05, 'epoch': 29.41}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13512/22950 [22:12<14:28, 10.86it/s]

{'loss': 0.0031, 'grad_norm': 0.0014553364599123597, 'learning_rate': 4.113289760348584e-05, 'epoch': 29.43}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13522/22950 [22:13<14:22, 10.93it/s]

{'loss': 0.0001, 'grad_norm': 0.0044533866457641125, 'learning_rate': 4.1089324618736384e-05, 'epoch': 29.46}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13532/22950 [22:14<14:13, 11.03it/s]

{'loss': 0.0, 'grad_norm': 0.0014106400776654482, 'learning_rate': 4.104575163398693e-05, 'epoch': 29.48}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13542/22950 [22:14<14:22, 10.90it/s]

{'loss': 0.0, 'grad_norm': 0.0013344967737793922, 'learning_rate': 4.100217864923748e-05, 'epoch': 29.5}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13552/22950 [22:15<14:35, 10.74it/s]

{'loss': 0.0001, 'grad_norm': 0.0022913338616490364, 'learning_rate': 4.095860566448802e-05, 'epoch': 29.52}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13562/22950 [22:16<14:14, 10.99it/s]

{'loss': 0.0621, 'grad_norm': 0.0020531117916107178, 'learning_rate': 4.0915032679738565e-05, 'epoch': 29.54}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13572/22950 [22:17<14:09, 11.04it/s]

{'loss': 0.0001, 'grad_norm': 0.004242563620209694, 'learning_rate': 4.087145969498911e-05, 'epoch': 29.56}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13582/22950 [22:18<14:11, 11.00it/s]

{'loss': 0.012, 'grad_norm': 0.005500677041709423, 'learning_rate': 4.082788671023965e-05, 'epoch': 29.59}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13592/22950 [22:19<14:08, 11.03it/s]

{'loss': 0.0133, 'grad_norm': 1.7411015033721924, 'learning_rate': 4.0784313725490195e-05, 'epoch': 29.61}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13602/22950 [22:20<14:11, 10.97it/s]

{'loss': 0.0006, 'grad_norm': 0.0015567168593406677, 'learning_rate': 4.074074074074074e-05, 'epoch': 29.63}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13612/22950 [22:21<14:08, 11.01it/s]

{'loss': 0.0295, 'grad_norm': 84.95782470703125, 'learning_rate': 4.069716775599129e-05, 'epoch': 29.65}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13622/22950 [22:22<14:02, 11.07it/s]

{'loss': 0.0, 'grad_norm': 0.00140200718306005, 'learning_rate': 4.065359477124183e-05, 'epoch': 29.67}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13632/22950 [22:23<14:28, 10.73it/s]

{'loss': 0.1131, 'grad_norm': 0.00307984440587461, 'learning_rate': 4.0610021786492375e-05, 'epoch': 29.69}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13642/22950 [22:24<14:20, 10.82it/s]

{'loss': 0.0003, 'grad_norm': 0.008900891058146954, 'learning_rate': 4.0566448801742925e-05, 'epoch': 29.72}


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13652/22950 [22:25<14:21, 10.79it/s]

{'loss': 0.0001, 'grad_norm': 0.004262959118932486, 'learning_rate': 4.052287581699347e-05, 'epoch': 29.74}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13662/22950 [22:25<14:01, 11.04it/s]

{'loss': 0.0001, 'grad_norm': 0.0023505277931690216, 'learning_rate': 4.047930283224401e-05, 'epoch': 29.76}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13672/22950 [22:26<14:18, 10.80it/s]

{'loss': 0.0002, 'grad_norm': 0.005081477575004101, 'learning_rate': 4.0435729847494556e-05, 'epoch': 29.78}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13682/22950 [22:27<14:06, 10.94it/s]

{'loss': 0.0499, 'grad_norm': 0.0032479504588991404, 'learning_rate': 4.03921568627451e-05, 'epoch': 29.8}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13692/22950 [22:28<14:00, 11.01it/s]

{'loss': 0.0001, 'grad_norm': 0.001546174637041986, 'learning_rate': 4.034858387799564e-05, 'epoch': 29.83}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13702/22950 [22:29<14:18, 10.77it/s]

{'loss': 0.0001, 'grad_norm': 0.0021107145585119724, 'learning_rate': 4.0305010893246186e-05, 'epoch': 29.85}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13712/22950 [22:30<14:03, 10.95it/s]

{'loss': 0.0525, 'grad_norm': 0.004064921289682388, 'learning_rate': 4.0261437908496736e-05, 'epoch': 29.87}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13722/22950 [22:31<14:03, 10.95it/s]

{'loss': 0.0001, 'grad_norm': 0.0030820798128843307, 'learning_rate': 4.021786492374728e-05, 'epoch': 29.89}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13732/22950 [22:32<14:26, 10.63it/s]

{'loss': 0.0212, 'grad_norm': 0.004742641933262348, 'learning_rate': 4.017429193899782e-05, 'epoch': 29.91}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13742/22950 [22:33<14:05, 10.90it/s]

{'loss': 0.0001, 'grad_norm': 0.0014599337009713054, 'learning_rate': 4.013071895424837e-05, 'epoch': 29.93}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13752/22950 [22:34<13:59, 10.96it/s]

{'loss': 0.0001, 'grad_norm': 0.015681402757763863, 'learning_rate': 4.008714596949891e-05, 'epoch': 29.96}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 13762/22950 [22:35<14:26, 10.61it/s]

{'loss': 0.0001, 'grad_norm': 0.004207403399050236, 'learning_rate': 4.004357298474945e-05, 'epoch': 29.98}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13770/22950 [22:35<13:14, 11.56it/s]

{'loss': 0.1753, 'grad_norm': 0.0026701344177126884, 'learning_rate': 4e-05, 'epoch': 30.0}


                                                     
 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13770/22950 [22:38<13:14, 11.56it/s]

{'eval_loss': 1.1819967031478882, 'eval_accuracy': 0.8406862616539001, 'eval_runtime': 2.7188, 'eval_samples_per_second': 150.065, 'eval_steps_per_second': 18.758, 'epoch': 30.0}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13782/22950 [22:40<25:14,  6.05it/s]  

{'loss': 0.0001, 'grad_norm': 0.003985797055065632, 'learning_rate': 3.995642701525055e-05, 'epoch': 30.02}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13792/22950 [22:41<16:02,  9.51it/s]

{'loss': 0.0001, 'grad_norm': 0.00395814748480916, 'learning_rate': 3.991285403050109e-05, 'epoch': 30.04}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13802/22950 [22:41<14:23, 10.60it/s]

{'loss': 0.0215, 'grad_norm': 0.004346856847405434, 'learning_rate': 3.986928104575164e-05, 'epoch': 30.07}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13812/22950 [22:42<13:53, 10.96it/s]

{'loss': 0.0001, 'grad_norm': 0.0037411495577543974, 'learning_rate': 3.9825708061002184e-05, 'epoch': 30.09}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13822/22950 [22:43<13:52, 10.96it/s]

{'loss': 0.0006, 'grad_norm': 0.022509222850203514, 'learning_rate': 3.978213507625273e-05, 'epoch': 30.11}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13832/22950 [22:44<14:24, 10.55it/s]

{'loss': 0.0001, 'grad_norm': 0.0033613350242376328, 'learning_rate': 3.973856209150327e-05, 'epoch': 30.13}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13842/22950 [22:45<13:44, 11.04it/s]

{'loss': 0.0093, 'grad_norm': 0.001748813083395362, 'learning_rate': 3.9694989106753814e-05, 'epoch': 30.15}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13852/22950 [22:46<14:13, 10.65it/s]

{'loss': 0.0875, 'grad_norm': 0.002370412927120924, 'learning_rate': 3.965141612200436e-05, 'epoch': 30.17}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13862/22950 [22:47<13:52, 10.91it/s]

{'loss': 0.0001, 'grad_norm': 0.0032401036005467176, 'learning_rate': 3.96078431372549e-05, 'epoch': 30.2}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13872/22950 [22:48<13:44, 11.01it/s]

{'loss': 0.0477, 'grad_norm': 0.006766197737306356, 'learning_rate': 3.956427015250545e-05, 'epoch': 30.22}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13882/22950 [22:49<13:46, 10.97it/s]

{'loss': 0.0001, 'grad_norm': 0.0025989855639636517, 'learning_rate': 3.9520697167755994e-05, 'epoch': 30.24}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13892/22950 [22:50<13:54, 10.86it/s]

{'loss': 0.0001, 'grad_norm': 0.003581992583349347, 'learning_rate': 3.947712418300654e-05, 'epoch': 30.26}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13902/22950 [22:51<13:49, 10.91it/s]

{'loss': 0.0001, 'grad_norm': 0.0037030347157269716, 'learning_rate': 3.943355119825709e-05, 'epoch': 30.28}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13912/22950 [22:52<13:50, 10.88it/s]

{'loss': 0.0001, 'grad_norm': 0.003243091981858015, 'learning_rate': 3.9389978213507625e-05, 'epoch': 30.31}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13922/22950 [22:52<13:57, 10.78it/s]

{'loss': 0.0001, 'grad_norm': 0.02174614556133747, 'learning_rate': 3.934640522875817e-05, 'epoch': 30.33}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13932/22950 [22:53<14:03, 10.69it/s]

{'loss': 0.0002, 'grad_norm': 0.003410845063626766, 'learning_rate': 3.930283224400871e-05, 'epoch': 30.35}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13942/22950 [22:54<13:45, 10.92it/s]

{'loss': 0.0243, 'grad_norm': 0.0036934579256922007, 'learning_rate': 3.925925925925926e-05, 'epoch': 30.37}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13952/22950 [22:55<13:53, 10.80it/s]

{'loss': 0.0075, 'grad_norm': 0.0017827918054535985, 'learning_rate': 3.9215686274509805e-05, 'epoch': 30.39}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13962/22950 [22:56<13:38, 10.98it/s]

{'loss': 0.0502, 'grad_norm': 0.006852141115814447, 'learning_rate': 3.917211328976035e-05, 'epoch': 30.41}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13972/22950 [22:57<13:40, 10.94it/s]

{'loss': 0.0001, 'grad_norm': 0.021236242726445198, 'learning_rate': 3.91285403050109e-05, 'epoch': 30.44}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13982/22950 [22:58<13:54, 10.75it/s]

{'loss': 0.0001, 'grad_norm': 0.0022057299502193928, 'learning_rate': 3.908496732026144e-05, 'epoch': 30.46}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 13992/22950 [22:59<13:42, 10.89it/s]

{'loss': 0.0001, 'grad_norm': 0.0022779046557843685, 'learning_rate': 3.9041394335511985e-05, 'epoch': 30.48}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 14002/22950 [23:00<13:40, 10.91it/s]

{'loss': 0.0001, 'grad_norm': 0.003654835047200322, 'learning_rate': 3.899782135076253e-05, 'epoch': 30.5}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 14012/22950 [23:01<13:37, 10.93it/s]

{'loss': 0.0001, 'grad_norm': 0.15941530466079712, 'learning_rate': 3.895424836601307e-05, 'epoch': 30.52}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 14022/22950 [23:02<14:01, 10.61it/s]

{'loss': 0.0001, 'grad_norm': 0.0021527870558202267, 'learning_rate': 3.8910675381263616e-05, 'epoch': 30.54}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 14032/22950 [23:03<13:39, 10.89it/s]

{'loss': 0.0001, 'grad_norm': 0.0022737637627869844, 'learning_rate': 3.8867102396514166e-05, 'epoch': 30.57}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 14042/22950 [23:04<13:47, 10.76it/s]

{'loss': 0.0102, 'grad_norm': 0.0019498931942507625, 'learning_rate': 3.882352941176471e-05, 'epoch': 30.59}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 14052/22950 [23:05<13:37, 10.88it/s]

{'loss': 0.0126, 'grad_norm': 0.0013158932561054826, 'learning_rate': 3.877995642701525e-05, 'epoch': 30.61}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14062/22950 [23:05<13:37, 10.88it/s]

{'loss': 0.0074, 'grad_norm': 0.007192736025899649, 'learning_rate': 3.8736383442265796e-05, 'epoch': 30.63}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14072/22950 [23:06<13:32, 10.92it/s]

{'loss': 0.0002, 'grad_norm': 0.003022346179932356, 'learning_rate': 3.8692810457516346e-05, 'epoch': 30.65}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14082/22950 [23:07<13:31, 10.93it/s]

{'loss': 0.0001, 'grad_norm': 0.0017940590623766184, 'learning_rate': 3.864923747276688e-05, 'epoch': 30.68}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14092/22950 [23:08<13:37, 10.83it/s]

{'loss': 0.0001, 'grad_norm': 0.0020169245544821024, 'learning_rate': 3.8605664488017426e-05, 'epoch': 30.7}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14102/22950 [23:09<13:30, 10.92it/s]

{'loss': 0.0003, 'grad_norm': 1.3947970867156982, 'learning_rate': 3.8562091503267977e-05, 'epoch': 30.72}


 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14112/22950 [23:10<13:39, 10.79it/s]

{'loss': 0.0001, 'grad_norm': 0.017797963693737984, 'learning_rate': 3.851851851851852e-05, 'epoch': 30.74}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14122/22950 [23:11<13:22, 11.00it/s]

{'loss': 0.1009, 'grad_norm': 0.002371755661442876, 'learning_rate': 3.847494553376906e-05, 'epoch': 30.76}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14132/22950 [23:12<13:31, 10.87it/s]

{'loss': 0.0001, 'grad_norm': 0.0012487669009715319, 'learning_rate': 3.8431372549019614e-05, 'epoch': 30.78}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14142/22950 [23:13<13:45, 10.67it/s]

{'loss': 0.0955, 'grad_norm': 0.0014586230972781777, 'learning_rate': 3.838779956427016e-05, 'epoch': 30.81}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14152/22950 [23:14<13:32, 10.83it/s]

{'loss': 0.0001, 'grad_norm': 0.001673263730481267, 'learning_rate': 3.83442265795207e-05, 'epoch': 30.83}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14162/22950 [23:15<13:30, 10.85it/s]

{'loss': 0.0001, 'grad_norm': 0.004814510699361563, 'learning_rate': 3.8300653594771244e-05, 'epoch': 30.85}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14172/22950 [23:16<13:30, 10.84it/s]

{'loss': 0.0003, 'grad_norm': 0.001724681002087891, 'learning_rate': 3.825708061002179e-05, 'epoch': 30.87}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14182/22950 [23:17<13:39, 10.70it/s]

{'loss': 0.0153, 'grad_norm': 0.0028563477098941803, 'learning_rate': 3.821350762527233e-05, 'epoch': 30.89}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14192/22950 [23:17<13:22, 10.92it/s]

{'loss': 0.0457, 'grad_norm': 0.0058586508966982365, 'learning_rate': 3.8169934640522874e-05, 'epoch': 30.92}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14202/22950 [23:18<13:17, 10.98it/s]

{'loss': 0.0034, 'grad_norm': 36.09544372558594, 'learning_rate': 3.8126361655773424e-05, 'epoch': 30.94}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14212/22950 [23:19<13:46, 10.57it/s]

{'loss': 0.0155, 'grad_norm': 0.0021579531021416187, 'learning_rate': 3.808278867102397e-05, 'epoch': 30.96}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14222/22950 [23:20<13:25, 10.84it/s]

{'loss': 0.0839, 'grad_norm': 0.003445529146119952, 'learning_rate': 3.803921568627451e-05, 'epoch': 30.98}


                                                     
 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14229/22950 [23:24<13:32, 10.74it/s]

{'eval_loss': 1.2430531978607178, 'eval_accuracy': 0.8333333134651184, 'eval_runtime': 2.7402, 'eval_samples_per_second': 148.893, 'eval_steps_per_second': 18.612, 'epoch': 31.0}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14232/22950 [23:24<1:00:50,  2.39it/s]

{'loss': 0.0001, 'grad_norm': 0.040732014924287796, 'learning_rate': 3.799564270152506e-05, 'epoch': 31.0}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14240/22950 [23:25<23:58,  6.05it/s]  

{'loss': 0.0001, 'grad_norm': 0.0019186793360859156, 'learning_rate': 3.79520697167756e-05, 'epoch': 31.02}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14252/22950 [23:26<14:31,  9.98it/s]

{'loss': 0.0, 'grad_norm': 0.0014746870147064328, 'learning_rate': 3.790849673202614e-05, 'epoch': 31.05}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14262/22950 [23:27<13:27, 10.76it/s]

{'loss': 0.0001, 'grad_norm': 0.0015440700808539987, 'learning_rate': 3.786492374727669e-05, 'epoch': 31.07}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14272/22950 [23:28<13:18, 10.86it/s]

{'loss': 0.0, 'grad_norm': 0.001922210562042892, 'learning_rate': 3.7821350762527235e-05, 'epoch': 31.09}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14282/22950 [23:29<13:32, 10.67it/s]

{'loss': 0.0001, 'grad_norm': 0.0017567750765010715, 'learning_rate': 3.777777777777778e-05, 'epoch': 31.11}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14292/22950 [23:30<13:20, 10.82it/s]

{'loss': 0.0002, 'grad_norm': 0.0022449938114732504, 'learning_rate': 3.773420479302832e-05, 'epoch': 31.13}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14302/22950 [23:31<13:17, 10.84it/s]

{'loss': 0.0131, 'grad_norm': 0.0025359105784446, 'learning_rate': 3.769063180827887e-05, 'epoch': 31.15}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14312/22950 [23:32<13:41, 10.52it/s]

{'loss': 0.0, 'grad_norm': 0.0013328338973224163, 'learning_rate': 3.7647058823529415e-05, 'epoch': 31.18}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14322/22950 [23:33<13:13, 10.87it/s]

{'loss': 0.0001, 'grad_norm': 0.001796419033780694, 'learning_rate': 3.760348583877996e-05, 'epoch': 31.2}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14332/22950 [23:34<13:11, 10.89it/s]

{'loss': 0.0001, 'grad_norm': 0.001265843864530325, 'learning_rate': 3.75599128540305e-05, 'epoch': 31.22}


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 14342/22950 [23:34<13:13, 10.85it/s]

{'loss': 0.0737, 'grad_norm': 0.0012051723897457123, 'learning_rate': 3.7516339869281045e-05, 'epoch': 31.24}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14352/22950 [23:35<13:32, 10.58it/s]

{'loss': 0.0003, 'grad_norm': 0.21186122298240662, 'learning_rate': 3.747276688453159e-05, 'epoch': 31.26}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14362/22950 [23:36<13:24, 10.68it/s]

{'loss': 0.0492, 'grad_norm': 0.0010432753479108214, 'learning_rate': 3.742919389978214e-05, 'epoch': 31.29}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14372/22950 [23:37<13:24, 10.67it/s]

{'loss': 0.0001, 'grad_norm': 0.0014598470879718661, 'learning_rate': 3.738562091503268e-05, 'epoch': 31.31}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14382/22950 [23:38<13:09, 10.85it/s]

{'loss': 0.0, 'grad_norm': 0.0013001703191548586, 'learning_rate': 3.7342047930283226e-05, 'epoch': 31.33}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14392/22950 [23:39<13:05, 10.90it/s]

{'loss': 0.0001, 'grad_norm': 0.0018797345692291856, 'learning_rate': 3.7298474945533776e-05, 'epoch': 31.35}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14402/22950 [23:40<13:15, 10.75it/s]

{'loss': 0.0008, 'grad_norm': 0.0015524276532232761, 'learning_rate': 3.725490196078432e-05, 'epoch': 31.37}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14412/22950 [23:41<13:08, 10.83it/s]

{'loss': 0.0001, 'grad_norm': 0.0716559961438179, 'learning_rate': 3.7211328976034856e-05, 'epoch': 31.39}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14422/22950 [23:42<13:03, 10.89it/s]

{'loss': 0.0836, 'grad_norm': 0.0007258288678713143, 'learning_rate': 3.71677559912854e-05, 'epoch': 31.42}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14432/22950 [23:43<13:21, 10.63it/s]

{'loss': 0.0736, 'grad_norm': 0.0010997591307386756, 'learning_rate': 3.712418300653595e-05, 'epoch': 31.44}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14442/22950 [23:44<13:17, 10.66it/s]

{'loss': 0.0001, 'grad_norm': 0.0011058627860620618, 'learning_rate': 3.708061002178649e-05, 'epoch': 31.46}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14452/22950 [23:45<13:03, 10.84it/s]

{'loss': 0.0, 'grad_norm': 0.0014888428850099444, 'learning_rate': 3.7037037037037037e-05, 'epoch': 31.48}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14462/22950 [23:46<13:01, 10.86it/s]

{'loss': 0.0, 'grad_norm': 0.002259609056636691, 'learning_rate': 3.699346405228759e-05, 'epoch': 31.5}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14472/22950 [23:47<13:03, 10.82it/s]

{'loss': 0.1148, 'grad_norm': 0.026915976777672768, 'learning_rate': 3.694989106753813e-05, 'epoch': 31.53}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14482/22950 [23:48<13:05, 10.77it/s]

{'loss': 0.0001, 'grad_norm': 0.02261749468743801, 'learning_rate': 3.6906318082788674e-05, 'epoch': 31.55}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14492/22950 [23:48<13:11, 10.69it/s]

{'loss': 0.0096, 'grad_norm': 49.90470886230469, 'learning_rate': 3.686274509803922e-05, 'epoch': 31.57}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14502/22950 [23:49<13:11, 10.67it/s]

{'loss': 0.0019, 'grad_norm': 0.22221817076206207, 'learning_rate': 3.681917211328976e-05, 'epoch': 31.59}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14512/22950 [23:50<12:59, 10.83it/s]

{'loss': 0.0021, 'grad_norm': 0.0015668824780732393, 'learning_rate': 3.6775599128540304e-05, 'epoch': 31.61}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14522/22950 [23:51<12:59, 10.81it/s]

{'loss': 0.0001, 'grad_norm': 0.0017115699592977762, 'learning_rate': 3.6732026143790854e-05, 'epoch': 31.63}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14532/22950 [23:52<13:01, 10.78it/s]

{'loss': 0.0, 'grad_norm': 0.0011271099792793393, 'learning_rate': 3.66884531590414e-05, 'epoch': 31.66}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14542/22950 [23:53<13:07, 10.67it/s]

{'loss': 0.0001, 'grad_norm': 0.020938994362950325, 'learning_rate': 3.664488017429194e-05, 'epoch': 31.68}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14552/22950 [23:54<12:59, 10.77it/s]

{'loss': 0.0202, 'grad_norm': 0.02300046943128109, 'learning_rate': 3.6601307189542484e-05, 'epoch': 31.7}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14562/22950 [23:55<13:19, 10.49it/s]

{'loss': 0.0119, 'grad_norm': 0.0060930876061320305, 'learning_rate': 3.6557734204793034e-05, 'epoch': 31.72}


 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14572/22950 [23:56<12:57, 10.78it/s]

{'loss': 0.0001, 'grad_norm': 0.002100482117384672, 'learning_rate': 3.651416122004358e-05, 'epoch': 31.74}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14582/22950 [23:57<12:57, 10.76it/s]

{'loss': 0.0154, 'grad_norm': 0.0016592669999226928, 'learning_rate': 3.6470588235294114e-05, 'epoch': 31.76}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14592/22950 [23:58<13:20, 10.44it/s]

{'loss': 0.0312, 'grad_norm': 0.007447661366313696, 'learning_rate': 3.6427015250544665e-05, 'epoch': 31.79}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14602/22950 [23:59<13:15, 10.50it/s]

{'loss': 0.0001, 'grad_norm': 0.002450797939673066, 'learning_rate': 3.638344226579521e-05, 'epoch': 31.81}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14612/22950 [24:00<12:55, 10.75it/s]

{'loss': 0.0004, 'grad_norm': 0.001380407833494246, 'learning_rate': 3.633986928104575e-05, 'epoch': 31.83}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 14622/22950 [24:01<12:59, 10.68it/s]

{'loss': 0.007, 'grad_norm': 0.002026657108217478, 'learning_rate': 3.62962962962963e-05, 'epoch': 31.85}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14632/22950 [24:02<13:24, 10.34it/s]

{'loss': 0.0619, 'grad_norm': 0.001540306257084012, 'learning_rate': 3.6252723311546845e-05, 'epoch': 31.87}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14642/22950 [24:03<12:58, 10.67it/s]

{'loss': 0.0, 'grad_norm': 0.001362787326797843, 'learning_rate': 3.620915032679739e-05, 'epoch': 31.9}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14652/22950 [24:04<12:56, 10.69it/s]

{'loss': 0.0001, 'grad_norm': 0.001326798228546977, 'learning_rate': 3.616557734204793e-05, 'epoch': 31.92}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14662/22950 [24:04<12:54, 10.70it/s]

{'loss': 0.0, 'grad_norm': 0.0013441079063341022, 'learning_rate': 3.6122004357298475e-05, 'epoch': 31.94}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14672/22950 [24:05<13:12, 10.45it/s]

{'loss': 0.0001, 'grad_norm': 0.019080813974142075, 'learning_rate': 3.607843137254902e-05, 'epoch': 31.96}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14682/22950 [24:06<12:55, 10.67it/s]

{'loss': 0.0, 'grad_norm': 0.0012443807208910584, 'learning_rate': 3.603485838779956e-05, 'epoch': 31.98}


                                                     
 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14688/22950 [24:10<12:18, 11.18it/s]

{'eval_loss': 1.1400479078292847, 'eval_accuracy': 0.845588207244873, 'eval_runtime': 2.7788, 'eval_samples_per_second': 146.824, 'eval_steps_per_second': 18.353, 'epoch': 32.0}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14692/22950 [24:10<57:58,  2.37it/s]  

{'loss': 0.0, 'grad_norm': 0.006196668837219477, 'learning_rate': 3.599128540305011e-05, 'epoch': 32.0}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14702/22950 [24:11<21:12,  6.48it/s]

{'loss': 0.0001, 'grad_norm': 0.0016590883024036884, 'learning_rate': 3.5947712418300656e-05, 'epoch': 32.03}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14712/22950 [24:12<14:23,  9.54it/s]

{'loss': 0.0, 'grad_norm': 0.001435476471669972, 'learning_rate': 3.59041394335512e-05, 'epoch': 32.05}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14722/22950 [24:13<13:17, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.0010840781033039093, 'learning_rate': 3.586056644880175e-05, 'epoch': 32.07}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14732/22950 [24:14<12:52, 10.64it/s]

{'loss': 0.0, 'grad_norm': 0.0020602610893547535, 'learning_rate': 3.581699346405229e-05, 'epoch': 32.09}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14742/22950 [24:15<12:51, 10.65it/s]

{'loss': 0.0001, 'grad_norm': 0.0018039698479697108, 'learning_rate': 3.577342047930283e-05, 'epoch': 32.11}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14752/22950 [24:16<13:02, 10.47it/s]

{'loss': 0.0001, 'grad_norm': 0.0015113626141101122, 'learning_rate': 3.572984749455338e-05, 'epoch': 32.14}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14762/22950 [24:17<13:01, 10.48it/s]

{'loss': 0.0, 'grad_norm': 0.0013055563904345036, 'learning_rate': 3.568627450980392e-05, 'epoch': 32.16}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14772/22950 [24:18<12:53, 10.57it/s]

{'loss': 0.0, 'grad_norm': 0.0012357401428744197, 'learning_rate': 3.5642701525054466e-05, 'epoch': 32.18}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14782/22950 [24:19<12:53, 10.56it/s]

{'loss': 0.0, 'grad_norm': 0.0012380254920572042, 'learning_rate': 3.559912854030501e-05, 'epoch': 32.2}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14792/22950 [24:20<12:49, 10.60it/s]

{'loss': 0.0001, 'grad_norm': 0.0023729237727820873, 'learning_rate': 3.555555555555556e-05, 'epoch': 32.22}


 64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14802/22950 [24:21<12:58, 10.46it/s]

{'loss': 0.0001, 'grad_norm': 0.0013909338740631938, 'learning_rate': 3.55119825708061e-05, 'epoch': 32.24}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14812/22950 [24:22<13:00, 10.42it/s]

{'loss': 0.0, 'grad_norm': 0.0009526087087579072, 'learning_rate': 3.546840958605665e-05, 'epoch': 32.27}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14822/22950 [24:23<13:17, 10.20it/s]

{'loss': 0.0515, 'grad_norm': 0.0009887408232316375, 'learning_rate': 3.542483660130719e-05, 'epoch': 32.29}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14832/22950 [24:24<12:49, 10.55it/s]

{'loss': 0.0, 'grad_norm': 0.0016930088168010116, 'learning_rate': 3.5381263616557734e-05, 'epoch': 32.31}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14842/22950 [24:25<12:54, 10.46it/s]

{'loss': 0.0, 'grad_norm': 0.002420583972707391, 'learning_rate': 3.533769063180828e-05, 'epoch': 32.33}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14852/22950 [24:26<13:00, 10.37it/s]

{'loss': 0.0001, 'grad_norm': 0.0011302527273073792, 'learning_rate': 3.529411764705883e-05, 'epoch': 32.35}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14862/22950 [24:27<12:52, 10.47it/s]

{'loss': 0.092, 'grad_norm': 0.0012307026190683246, 'learning_rate': 3.525054466230937e-05, 'epoch': 32.37}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14872/22950 [24:28<12:51, 10.47it/s]

{'loss': 0.0001, 'grad_norm': 0.002338270889595151, 'learning_rate': 3.5206971677559914e-05, 'epoch': 32.4}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14882/22950 [24:29<13:07, 10.25it/s]

{'loss': 0.0001, 'grad_norm': 0.004768090322613716, 'learning_rate': 3.5163398692810464e-05, 'epoch': 32.42}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14892/22950 [24:30<13:03, 10.28it/s]

{'loss': 0.0002, 'grad_norm': 0.002464640885591507, 'learning_rate': 3.511982570806101e-05, 'epoch': 32.44}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14902/22950 [24:31<12:52, 10.41it/s]

{'loss': 0.0311, 'grad_norm': 0.002811519196256995, 'learning_rate': 3.507625272331155e-05, 'epoch': 32.46}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 14912/22950 [24:31<12:44, 10.51it/s]

{'loss': 0.0001, 'grad_norm': 0.0016647850861772895, 'learning_rate': 3.503267973856209e-05, 'epoch': 32.48}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14922/22950 [24:32<12:43, 10.51it/s]

{'loss': 0.011, 'grad_norm': 1.632748007774353, 'learning_rate': 3.498910675381264e-05, 'epoch': 32.51}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14932/22950 [24:33<12:51, 10.39it/s]

{'loss': 0.0002, 'grad_norm': 0.0023877108469605446, 'learning_rate': 3.494553376906318e-05, 'epoch': 32.53}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14942/22950 [24:34<12:57, 10.30it/s]

{'loss': 0.0097, 'grad_norm': 0.0016357420245185494, 'learning_rate': 3.4901960784313725e-05, 'epoch': 32.55}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14952/22950 [24:35<12:38, 10.55it/s]

{'loss': 0.0001, 'grad_norm': 0.006200006697326899, 'learning_rate': 3.4858387799564275e-05, 'epoch': 32.57}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14962/22950 [24:36<12:40, 10.51it/s]

{'loss': 0.0002, 'grad_norm': 0.2531207501888275, 'learning_rate': 3.481481481481482e-05, 'epoch': 32.59}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14972/22950 [24:37<12:47, 10.39it/s]

{'loss': 0.0, 'grad_norm': 0.0017710435204207897, 'learning_rate': 3.477124183006536e-05, 'epoch': 32.61}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14982/22950 [24:38<12:42, 10.45it/s]

{'loss': 0.0001, 'grad_norm': 0.003388682845979929, 'learning_rate': 3.4727668845315905e-05, 'epoch': 32.64}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 14992/22950 [24:39<12:50, 10.33it/s]

{'loss': 0.0003, 'grad_norm': 0.0015820799162611365, 'learning_rate': 3.468409586056645e-05, 'epoch': 32.66}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15002/22950 [24:40<12:37, 10.50it/s]

{'loss': 0.087, 'grad_norm': 0.0009367825114168227, 'learning_rate': 3.464052287581699e-05, 'epoch': 32.68}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15012/22950 [24:41<12:37, 10.48it/s]

{'loss': 0.0001, 'grad_norm': 0.001276807626709342, 'learning_rate': 3.4596949891067535e-05, 'epoch': 32.7}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15022/22950 [24:42<12:36, 10.49it/s]

{'loss': 0.0007, 'grad_norm': 0.010312345810234547, 'learning_rate': 3.4553376906318086e-05, 'epoch': 32.72}


 65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15032/22950 [24:43<12:43, 10.37it/s]

{'loss': 0.0001, 'grad_norm': 0.0017976900562644005, 'learning_rate': 3.450980392156863e-05, 'epoch': 32.75}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15042/22950 [24:44<12:51, 10.25it/s]

{'loss': 0.0, 'grad_norm': 0.0011600485304370522, 'learning_rate': 3.446623093681917e-05, 'epoch': 32.77}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15052/22950 [24:45<12:47, 10.29it/s]

{'loss': 0.0, 'grad_norm': 0.0010249641491100192, 'learning_rate': 3.442265795206972e-05, 'epoch': 32.79}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15062/22950 [24:46<12:40, 10.37it/s]

{'loss': 0.0068, 'grad_norm': 0.0008724711951799691, 'learning_rate': 3.4379084967320266e-05, 'epoch': 32.81}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15072/22950 [24:47<12:33, 10.45it/s]

{'loss': 0.0, 'grad_norm': 0.0023289688397198915, 'learning_rate': 3.433551198257081e-05, 'epoch': 32.83}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15082/22950 [24:48<12:35, 10.41it/s]

{'loss': 0.0, 'grad_norm': 0.00087062205420807, 'learning_rate': 3.429193899782135e-05, 'epoch': 32.85}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15092/22950 [24:49<12:47, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.001159333041869104, 'learning_rate': 3.4248366013071896e-05, 'epoch': 32.88}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15102/22950 [24:50<12:36, 10.38it/s]

{'loss': 0.0003, 'grad_norm': 0.0011732265120372176, 'learning_rate': 3.420479302832244e-05, 'epoch': 32.9}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15112/22950 [24:51<12:38, 10.33it/s]

{'loss': 0.0, 'grad_norm': 0.0012371228076517582, 'learning_rate': 3.416122004357299e-05, 'epoch': 32.92}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15122/22950 [24:52<12:40, 10.30it/s]

{'loss': 0.0, 'grad_norm': 0.001057216664776206, 'learning_rate': 3.411764705882353e-05, 'epoch': 32.94}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15132/22950 [24:53<12:47, 10.18it/s]

{'loss': 0.0001, 'grad_norm': 0.0012994694989174604, 'learning_rate': 3.4074074074074077e-05, 'epoch': 32.96}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15142/22950 [24:54<12:46, 10.19it/s]

{'loss': 0.0, 'grad_norm': 0.0007894299342297018, 'learning_rate': 3.403050108932462e-05, 'epoch': 32.98}


                                                     
 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15147/22950 [24:57<12:38, 10.29it/s]

{'eval_loss': 1.3404340744018555, 'eval_accuracy': 0.8382353186607361, 'eval_runtime': 2.8969, 'eval_samples_per_second': 140.842, 'eval_steps_per_second': 17.605, 'epoch': 33.0}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15152/22950 [24:58<42:26,  3.06it/s]  

{'loss': 0.0, 'grad_norm': 0.0007860246696509421, 'learning_rate': 3.3986928104575163e-05, 'epoch': 33.01}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15161/22950 [24:59<19:45,  6.57it/s]

{'loss': 0.0, 'grad_norm': 0.0009776368970051408, 'learning_rate': 3.394335511982571e-05, 'epoch': 33.03}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15171/22950 [25:00<14:07,  9.18it/s]

{'loss': 0.0, 'grad_norm': 0.0008672205731272697, 'learning_rate': 3.389978213507625e-05, 'epoch': 33.05}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15181/22950 [25:01<13:28,  9.61it/s]

{'loss': 0.0, 'grad_norm': 0.0008798661292530596, 'learning_rate': 3.38562091503268e-05, 'epoch': 33.07}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15191/22950 [25:02<13:48,  9.36it/s]

{'loss': 0.0045, 'grad_norm': 0.0010578975779935718, 'learning_rate': 3.3812636165577344e-05, 'epoch': 33.09}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15201/22950 [25:03<12:53, 10.01it/s]

{'loss': 0.0028, 'grad_norm': 0.6452157497406006, 'learning_rate': 3.376906318082789e-05, 'epoch': 33.12}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15211/22950 [25:04<13:00,  9.92it/s]

{'loss': 0.0014, 'grad_norm': 0.0006393976509571075, 'learning_rate': 3.372549019607844e-05, 'epoch': 33.14}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15222/22950 [25:05<12:44, 10.11it/s]

{'loss': 0.0, 'grad_norm': 0.002171238185837865, 'learning_rate': 3.368191721132898e-05, 'epoch': 33.16}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15232/22950 [25:06<12:51, 10.01it/s]

{'loss': 0.0006, 'grad_norm': 0.0008920335094444454, 'learning_rate': 3.3638344226579524e-05, 'epoch': 33.18}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15242/22950 [25:07<12:29, 10.28it/s]

{'loss': 0.0, 'grad_norm': 0.0010389178059995174, 'learning_rate': 3.359477124183007e-05, 'epoch': 33.2}


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15252/22950 [25:08<12:37, 10.17it/s]

{'loss': 0.0216, 'grad_norm': 0.0007369681261479855, 'learning_rate': 3.355119825708061e-05, 'epoch': 33.22}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15262/22950 [25:09<12:37, 10.15it/s]

{'loss': 0.1158, 'grad_norm': 0.0011408546706661582, 'learning_rate': 3.3507625272331155e-05, 'epoch': 33.25}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15272/22950 [25:10<12:34, 10.18it/s]

{'loss': 0.0, 'grad_norm': 0.001047054072842002, 'learning_rate': 3.34640522875817e-05, 'epoch': 33.27}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15282/22950 [25:11<12:36, 10.14it/s]

{'loss': 0.05, 'grad_norm': 0.0016382054891437292, 'learning_rate': 3.342047930283225e-05, 'epoch': 33.29}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15291/22950 [25:12<12:38, 10.09it/s]

{'loss': 0.0001, 'grad_norm': 0.0015944474143907428, 'learning_rate': 3.337690631808279e-05, 'epoch': 33.31}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15301/22950 [25:13<12:31, 10.18it/s]

{'loss': 0.0023, 'grad_norm': 0.0015797670930624008, 'learning_rate': 3.3333333333333335e-05, 'epoch': 33.33}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15311/22950 [25:14<12:34, 10.13it/s]

{'loss': 0.0112, 'grad_norm': 0.006327881012111902, 'learning_rate': 3.328976034858388e-05, 'epoch': 33.36}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15321/22950 [25:15<12:30, 10.17it/s]

{'loss': 0.0718, 'grad_norm': 0.0017230029916390777, 'learning_rate': 3.324618736383442e-05, 'epoch': 33.38}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15332/22950 [25:16<12:29, 10.16it/s]

{'loss': 0.0358, 'grad_norm': 0.0016250306507572532, 'learning_rate': 3.3202614379084965e-05, 'epoch': 33.4}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15342/22950 [25:17<12:33, 10.10it/s]

{'loss': 0.0, 'grad_norm': 0.0011795031605288386, 'learning_rate': 3.3159041394335515e-05, 'epoch': 33.42}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15352/22950 [25:18<12:21, 10.24it/s]

{'loss': 0.0001, 'grad_norm': 0.0015472185332328081, 'learning_rate': 3.311546840958606e-05, 'epoch': 33.44}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15361/22950 [25:19<12:36, 10.03it/s]

{'loss': 0.0096, 'grad_norm': 0.003553966525942087, 'learning_rate': 3.30718954248366e-05, 'epoch': 33.46}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15371/22950 [25:20<12:34, 10.05it/s]

{'loss': 0.0676, 'grad_norm': 0.001109403558075428, 'learning_rate': 3.3028322440087146e-05, 'epoch': 33.49}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15381/22950 [25:21<12:24, 10.17it/s]

{'loss': 0.0001, 'grad_norm': 0.0015987071674317122, 'learning_rate': 3.2984749455337696e-05, 'epoch': 33.51}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15391/22950 [25:22<12:21, 10.19it/s]

{'loss': 0.0124, 'grad_norm': 0.0013473456492647529, 'learning_rate': 3.294117647058824e-05, 'epoch': 33.53}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15401/22950 [25:23<12:54,  9.75it/s]

{'loss': 0.0072, 'grad_norm': 0.0009231261792592704, 'learning_rate': 3.289760348583878e-05, 'epoch': 33.55}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15411/22950 [25:24<12:28, 10.08it/s]

{'loss': 0.0, 'grad_norm': 0.0011692183325067163, 'learning_rate': 3.2854030501089326e-05, 'epoch': 33.57}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15421/22950 [25:25<12:21, 10.15it/s]

{'loss': 0.0001, 'grad_norm': 0.006479375530034304, 'learning_rate': 3.281045751633987e-05, 'epoch': 33.59}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15431/22950 [25:26<12:18, 10.18it/s]

{'loss': 0.0071, 'grad_norm': 0.0010618449887260795, 'learning_rate': 3.276688453159041e-05, 'epoch': 33.62}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15441/22950 [25:27<12:25, 10.08it/s]

{'loss': 0.0001, 'grad_norm': 0.0016963299131020904, 'learning_rate': 3.272331154684096e-05, 'epoch': 33.64}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15452/22950 [25:28<12:18, 10.16it/s]

{'loss': 0.0, 'grad_norm': 0.0012154709547758102, 'learning_rate': 3.2679738562091506e-05, 'epoch': 33.66}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15462/22950 [25:29<12:19, 10.13it/s]

{'loss': 0.0, 'grad_norm': 0.0018581346375867724, 'learning_rate': 3.263616557734205e-05, 'epoch': 33.68}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15472/22950 [25:30<12:21, 10.09it/s]

{'loss': 0.0002, 'grad_norm': 0.0018096083076670766, 'learning_rate': 3.25925925925926e-05, 'epoch': 33.7}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15482/22950 [25:31<12:11, 10.21it/s]

{'loss': 0.0, 'grad_norm': 0.0016158577054738998, 'learning_rate': 3.254901960784314e-05, 'epoch': 33.73}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 15491/22950 [25:32<12:20, 10.07it/s]

{'loss': 0.0, 'grad_norm': 0.0011560001876205206, 'learning_rate': 3.250544662309368e-05, 'epoch': 33.75}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15501/22950 [25:33<12:12, 10.16it/s]

{'loss': 0.0, 'grad_norm': 0.001193704316392541, 'learning_rate': 3.2461873638344223e-05, 'epoch': 33.77}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15512/22950 [25:34<12:13, 10.14it/s]

{'loss': 0.1218, 'grad_norm': 0.005176089238375425, 'learning_rate': 3.2418300653594774e-05, 'epoch': 33.79}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15522/22950 [25:35<12:15, 10.10it/s]

{'loss': 0.0, 'grad_norm': 0.001476973295211792, 'learning_rate': 3.237472766884532e-05, 'epoch': 33.81}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15532/22950 [25:36<12:13, 10.11it/s]

{'loss': 0.0422, 'grad_norm': 0.0011891955509781837, 'learning_rate': 3.233115468409586e-05, 'epoch': 33.83}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15542/22950 [25:37<12:15, 10.07it/s]

{'loss': 0.0, 'grad_norm': 0.0013874928699806333, 'learning_rate': 3.228758169934641e-05, 'epoch': 33.86}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15551/22950 [25:38<12:14, 10.08it/s]

{'loss': 0.0001, 'grad_norm': 0.0009311408502981067, 'learning_rate': 3.2244008714596954e-05, 'epoch': 33.88}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15562/22950 [25:39<12:14, 10.06it/s]

{'loss': 0.0001, 'grad_norm': 0.0011208959622308612, 'learning_rate': 3.22004357298475e-05, 'epoch': 33.9}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15572/22950 [25:40<12:11, 10.09it/s]

{'loss': 0.0039, 'grad_norm': 0.0017147030448541045, 'learning_rate': 3.215686274509804e-05, 'epoch': 33.92}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15581/22950 [25:41<12:09, 10.09it/s]

{'loss': 0.0, 'grad_norm': 0.0012080727610737085, 'learning_rate': 3.2113289760348584e-05, 'epoch': 33.94}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15591/22950 [25:42<12:09, 10.09it/s]

{'loss': 0.0, 'grad_norm': 0.001322261057794094, 'learning_rate': 3.206971677559913e-05, 'epoch': 33.97}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15601/22950 [25:43<12:07, 10.11it/s]

{'loss': 0.0003, 'grad_norm': 0.0025218739174306393, 'learning_rate': 3.202614379084967e-05, 'epoch': 33.99}


                                                     
 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15606/22950 [25:46<12:32,  9.76it/s]

{'eval_loss': 1.2303425073623657, 'eval_accuracy': 0.8504902124404907, 'eval_runtime': 2.9508, 'eval_samples_per_second': 138.268, 'eval_steps_per_second': 17.284, 'epoch': 34.0}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15611/22950 [25:47<40:52,  2.99it/s]  

{'loss': 0.0001, 'grad_norm': 0.0013484848896041512, 'learning_rate': 3.198257080610022e-05, 'epoch': 34.01}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15621/22950 [25:48<16:45,  7.29it/s]

{'loss': 0.0, 'grad_norm': 0.00123589055147022, 'learning_rate': 3.1938997821350765e-05, 'epoch': 34.03}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15631/22950 [25:49<12:45,  9.56it/s]

{'loss': 0.0179, 'grad_norm': 0.0012596326414495707, 'learning_rate': 3.189542483660131e-05, 'epoch': 34.05}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15642/22950 [25:50<13:40,  8.91it/s]

{'loss': 0.0, 'grad_norm': 0.0013430872932076454, 'learning_rate': 3.185185185185185e-05, 'epoch': 34.07}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15652/22950 [25:52<12:20,  9.86it/s]

{'loss': 0.0, 'grad_norm': 0.0010468590771779418, 'learning_rate': 3.1808278867102395e-05, 'epoch': 34.1}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15661/22950 [25:52<12:08, 10.00it/s]

{'loss': 0.0053, 'grad_norm': 0.0014669167576357722, 'learning_rate': 3.176470588235294e-05, 'epoch': 34.12}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15672/22950 [25:54<11:56, 10.15it/s]

{'loss': 0.0, 'grad_norm': 0.0010249369079247117, 'learning_rate': 3.172113289760349e-05, 'epoch': 34.14}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15682/22950 [25:55<12:03, 10.04it/s]

{'loss': 0.0, 'grad_norm': 0.0009152984712272882, 'learning_rate': 3.167755991285403e-05, 'epoch': 34.16}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15691/22950 [25:55<12:15,  9.87it/s]

{'loss': 0.0031, 'grad_norm': 0.0020195054821670055, 'learning_rate': 3.1633986928104575e-05, 'epoch': 34.18}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15701/22950 [25:56<11:59, 10.07it/s]

{'loss': 0.0002, 'grad_norm': 0.0007766704657115042, 'learning_rate': 3.1590413943355126e-05, 'epoch': 34.2}


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15712/22950 [25:58<11:58, 10.08it/s]

{'loss': 0.0001, 'grad_norm': 0.0008901635883376002, 'learning_rate': 3.154684095860567e-05, 'epoch': 34.23}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15722/22950 [25:59<11:59, 10.04it/s]

{'loss': 0.0, 'grad_norm': 0.00125365040730685, 'learning_rate': 3.150326797385621e-05, 'epoch': 34.25}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15731/22950 [25:59<12:02,  9.99it/s]

{'loss': 0.0, 'grad_norm': 0.0007555786287412047, 'learning_rate': 3.1459694989106756e-05, 'epoch': 34.27}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15742/22950 [26:01<11:57, 10.04it/s]

{'loss': 0.0, 'grad_norm': 0.000991997541859746, 'learning_rate': 3.14161220043573e-05, 'epoch': 34.29}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15751/22950 [26:02<12:10,  9.85it/s]

{'loss': 0.102, 'grad_norm': 29.44516372680664, 'learning_rate': 3.137254901960784e-05, 'epoch': 34.31}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15761/22950 [26:03<12:00,  9.98it/s]

{'loss': 0.0, 'grad_norm': 0.0010966688860207796, 'learning_rate': 3.1328976034858386e-05, 'epoch': 34.34}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 15772/22950 [26:04<11:59,  9.98it/s]

{'loss': 0.007, 'grad_norm': 0.0009972522966563702, 'learning_rate': 3.1285403050108936e-05, 'epoch': 34.36}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15781/22950 [26:05<11:55, 10.01it/s]

{'loss': 0.0194, 'grad_norm': 0.0010688361944630742, 'learning_rate': 3.124183006535948e-05, 'epoch': 34.38}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15792/22950 [26:06<11:53, 10.04it/s]

{'loss': 0.0, 'grad_norm': 0.004738796502351761, 'learning_rate': 3.119825708061002e-05, 'epoch': 34.4}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15801/22950 [26:07<11:55,  9.99it/s]

{'loss': 0.0, 'grad_norm': 0.001100013148970902, 'learning_rate': 3.115468409586057e-05, 'epoch': 34.42}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15811/22950 [26:08<11:58,  9.93it/s]

{'loss': 0.0394, 'grad_norm': 0.0013129573781043291, 'learning_rate': 3.111111111111111e-05, 'epoch': 34.44}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15821/22950 [26:09<11:51, 10.02it/s]

{'loss': 0.0001, 'grad_norm': 0.08161450922489166, 'learning_rate': 3.106753812636165e-05, 'epoch': 34.47}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15832/22950 [26:10<11:47, 10.06it/s]

{'loss': 0.0, 'grad_norm': 0.0009240314830094576, 'learning_rate': 3.1023965141612203e-05, 'epoch': 34.49}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15841/22950 [26:11<11:51,  9.99it/s]

{'loss': 0.0588, 'grad_norm': 0.0013544987887144089, 'learning_rate': 3.098039215686275e-05, 'epoch': 34.51}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15851/22950 [26:12<11:51,  9.97it/s]

{'loss': 0.0001, 'grad_norm': 0.003206786001101136, 'learning_rate': 3.093681917211329e-05, 'epoch': 34.53}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15861/22950 [26:13<11:46, 10.04it/s]

{'loss': 0.0001, 'grad_norm': 0.04262150451540947, 'learning_rate': 3.0893246187363834e-05, 'epoch': 34.55}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15872/22950 [26:14<11:38, 10.14it/s]

{'loss': 0.0, 'grad_norm': 0.0009312051115557551, 'learning_rate': 3.0849673202614384e-05, 'epoch': 34.58}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15881/22950 [26:15<11:53,  9.90it/s]

{'loss': 0.0, 'grad_norm': 0.0013088369742035866, 'learning_rate': 3.080610021786493e-05, 'epoch': 34.6}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15891/22950 [26:16<11:48,  9.97it/s]

{'loss': 0.0, 'grad_norm': 0.001125673996284604, 'learning_rate': 3.076252723311547e-05, 'epoch': 34.62}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15902/22950 [26:17<11:36, 10.11it/s]

{'loss': 0.0723, 'grad_norm': 0.0021164186764508486, 'learning_rate': 3.0718954248366014e-05, 'epoch': 34.64}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15911/22950 [26:18<11:50,  9.90it/s]

{'loss': 0.0, 'grad_norm': 0.0007562328246422112, 'learning_rate': 3.067538126361656e-05, 'epoch': 34.66}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15921/22950 [26:19<11:40, 10.04it/s]

{'loss': 0.0, 'grad_norm': 0.0011558674741536379, 'learning_rate': 3.06318082788671e-05, 'epoch': 34.68}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15931/22950 [26:20<11:43,  9.98it/s]

{'loss': 0.0036, 'grad_norm': 0.001600500545464456, 'learning_rate': 3.058823529411765e-05, 'epoch': 34.71}


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15941/22950 [26:21<11:43,  9.97it/s]

{'loss': 0.0, 'grad_norm': 0.0007204401772469282, 'learning_rate': 3.0544662309368195e-05, 'epoch': 34.73}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15952/22950 [26:22<11:35, 10.06it/s]

{'loss': 0.002, 'grad_norm': 0.03461873531341553, 'learning_rate': 3.0501089324618738e-05, 'epoch': 34.75}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15960/22950 [26:23<11:32, 10.10it/s]

{'loss': 0.0, 'grad_norm': 0.001455070567317307, 'learning_rate': 3.045751633986928e-05, 'epoch': 34.77}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15971/22950 [26:24<11:29, 10.12it/s]

{'loss': 0.0947, 'grad_norm': 0.0009719390072859824, 'learning_rate': 3.0413943355119828e-05, 'epoch': 34.79}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15981/22950 [26:25<11:30, 10.09it/s]

{'loss': 0.0027, 'grad_norm': 0.001342371921055019, 'learning_rate': 3.037037037037037e-05, 'epoch': 34.81}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 15991/22950 [26:26<11:38,  9.96it/s]

{'loss': 0.0001, 'grad_norm': 0.054312583059072495, 'learning_rate': 3.0326797385620915e-05, 'epoch': 34.84}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 16001/22950 [26:27<11:31, 10.04it/s]

{'loss': 0.0, 'grad_norm': 0.0011085454607382417, 'learning_rate': 3.0283224400871462e-05, 'epoch': 34.86}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 16011/22950 [26:28<11:30, 10.05it/s]

{'loss': 0.0, 'grad_norm': 0.0036828499287366867, 'learning_rate': 3.0239651416122005e-05, 'epoch': 34.88}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 16021/22950 [26:29<11:28, 10.07it/s]

{'loss': 0.0004, 'grad_norm': 0.0011727048549801111, 'learning_rate': 3.019607843137255e-05, 'epoch': 34.9}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 16031/22950 [26:30<11:42,  9.85it/s]

{'loss': 0.0, 'grad_norm': 0.0011757115134969354, 'learning_rate': 3.01525054466231e-05, 'epoch': 34.92}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 16042/22950 [26:31<11:22, 10.13it/s]

{'loss': 0.0, 'grad_norm': 0.0011121947318315506, 'learning_rate': 3.010893246187364e-05, 'epoch': 34.95}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 16052/22950 [26:32<11:15, 10.21it/s]

{'loss': 0.0, 'grad_norm': 0.0018275243928655982, 'learning_rate': 3.0065359477124182e-05, 'epoch': 34.97}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 16062/22950 [26:33<11:26, 10.03it/s]

{'loss': 0.0, 'grad_norm': 0.0013699752744287252, 'learning_rate': 3.0021786492374732e-05, 'epoch': 34.99}


                                                     
 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16065/22950 [26:36<11:18, 10.14it/s]

{'eval_loss': 1.2703418731689453, 'eval_accuracy': 0.843137264251709, 'eval_runtime': 2.983, 'eval_samples_per_second': 136.776, 'eval_steps_per_second': 17.097, 'epoch': 35.0}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16072/22950 [26:37<29:43,  3.86it/s]  

{'loss': 0.0001, 'grad_norm': 0.001998098101466894, 'learning_rate': 2.9978213507625276e-05, 'epoch': 35.01}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16082/22950 [26:38<13:56,  8.21it/s]

{'loss': 0.0, 'grad_norm': 0.0008314964943565428, 'learning_rate': 2.9934640522875816e-05, 'epoch': 35.03}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16092/22950 [26:39<11:43,  9.75it/s]

{'loss': 0.0, 'grad_norm': 0.0008352905861102045, 'learning_rate': 2.989106753812636e-05, 'epoch': 35.05}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16102/22950 [26:40<11:22, 10.04it/s]

{'loss': 0.0045, 'grad_norm': 0.001087064272724092, 'learning_rate': 2.984749455337691e-05, 'epoch': 35.08}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16112/22950 [26:41<11:14, 10.14it/s]

{'loss': 0.0, 'grad_norm': 0.0010012194979935884, 'learning_rate': 2.9803921568627453e-05, 'epoch': 35.1}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16122/22950 [26:42<11:10, 10.18it/s]

{'loss': 0.0036, 'grad_norm': 0.0008982495055533946, 'learning_rate': 2.9760348583877996e-05, 'epoch': 35.12}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16131/22950 [26:43<11:25,  9.94it/s]

{'loss': 0.0001, 'grad_norm': 0.0007831642869859934, 'learning_rate': 2.9716775599128543e-05, 'epoch': 35.14}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16141/22950 [26:44<11:06, 10.22it/s]

{'loss': 0.0007, 'grad_norm': 0.000678648182656616, 'learning_rate': 2.9673202614379087e-05, 'epoch': 35.16}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16151/22950 [26:45<11:07, 10.18it/s]

{'loss': 0.0, 'grad_norm': 0.0008972947834990919, 'learning_rate': 2.962962962962963e-05, 'epoch': 35.19}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16161/22950 [26:46<11:06, 10.19it/s]

{'loss': 0.0, 'grad_norm': 0.04280335456132889, 'learning_rate': 2.9586056644880177e-05, 'epoch': 35.21}


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16172/22950 [26:47<11:05, 10.18it/s]

{'loss': 0.0, 'grad_norm': 0.000788760487921536, 'learning_rate': 2.954248366013072e-05, 'epoch': 35.23}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16182/22950 [26:48<10:59, 10.26it/s]

{'loss': 0.0, 'grad_norm': 0.007393593899905682, 'learning_rate': 2.9498910675381264e-05, 'epoch': 35.25}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16192/22950 [26:49<10:59, 10.25it/s]

{'loss': 0.0, 'grad_norm': 0.0005204543122090399, 'learning_rate': 2.9455337690631814e-05, 'epoch': 35.27}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16202/22950 [26:50<10:59, 10.23it/s]

{'loss': 0.0281, 'grad_norm': 0.000831850222311914, 'learning_rate': 2.9411764705882354e-05, 'epoch': 35.29}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16211/22950 [26:51<11:25,  9.83it/s]

{'loss': 0.0, 'grad_norm': 0.011786800809204578, 'learning_rate': 2.9368191721132897e-05, 'epoch': 35.32}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16221/22950 [26:52<10:58, 10.22it/s]

{'loss': 0.0, 'grad_norm': 0.0008873065235093236, 'learning_rate': 2.932461873638344e-05, 'epoch': 35.34}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16231/22950 [26:53<11:00, 10.17it/s]

{'loss': 0.0, 'grad_norm': 0.0010491119464859366, 'learning_rate': 2.928104575163399e-05, 'epoch': 35.36}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16241/22950 [26:54<11:01, 10.14it/s]

{'loss': 0.0053, 'grad_norm': 0.0007218846585601568, 'learning_rate': 2.9237472766884534e-05, 'epoch': 35.38}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16252/22950 [26:55<11:04, 10.08it/s]

{'loss': 0.0, 'grad_norm': 0.015114719048142433, 'learning_rate': 2.9193899782135074e-05, 'epoch': 35.4}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16262/22950 [26:56<10:54, 10.22it/s]

{'loss': 0.0001, 'grad_norm': 0.0007146435673348606, 'learning_rate': 2.9150326797385624e-05, 'epoch': 35.42}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16272/22950 [26:57<10:47, 10.32it/s]

{'loss': 0.0001, 'grad_norm': 0.013940664939582348, 'learning_rate': 2.9106753812636168e-05, 'epoch': 35.45}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16281/22950 [26:58<11:51,  9.37it/s]

{'loss': 0.0, 'grad_norm': 0.0007152727921493351, 'learning_rate': 2.906318082788671e-05, 'epoch': 35.47}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16292/22950 [26:59<10:38, 10.43it/s]

{'loss': 0.0639, 'grad_norm': 0.0009163339855149388, 'learning_rate': 2.9019607843137258e-05, 'epoch': 35.49}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16302/22950 [27:00<10:50, 10.22it/s]

{'loss': 0.0001, 'grad_norm': 0.0017289776587858796, 'learning_rate': 2.89760348583878e-05, 'epoch': 35.51}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16312/22950 [27:01<10:45, 10.29it/s]

{'loss': 0.0002, 'grad_norm': 0.0009872830705717206, 'learning_rate': 2.8932461873638345e-05, 'epoch': 35.53}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16322/22950 [27:02<10:40, 10.35it/s]

{'loss': 0.0001, 'grad_norm': 0.004403221886605024, 'learning_rate': 2.8888888888888888e-05, 'epoch': 35.56}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16332/22950 [27:03<10:53, 10.13it/s]

{'loss': 0.0, 'grad_norm': 0.001995930913835764, 'learning_rate': 2.8845315904139435e-05, 'epoch': 35.58}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 16342/22950 [27:04<10:39, 10.33it/s]

{'loss': 0.0002, 'grad_norm': 0.0015641542850062251, 'learning_rate': 2.880174291938998e-05, 'epoch': 35.6}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16352/22950 [27:05<10:40, 10.31it/s]

{'loss': 0.0003, 'grad_norm': 0.0012232206063345075, 'learning_rate': 2.8758169934640522e-05, 'epoch': 35.62}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16362/22950 [27:06<10:40, 10.28it/s]

{'loss': 0.0, 'grad_norm': 0.0007855088333599269, 'learning_rate': 2.8714596949891072e-05, 'epoch': 35.64}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16371/22950 [27:07<10:59,  9.98it/s]

{'loss': 0.0091, 'grad_norm': 0.0008973405929282308, 'learning_rate': 2.8671023965141612e-05, 'epoch': 35.66}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16381/22950 [27:08<10:39, 10.27it/s]

{'loss': 0.0025, 'grad_norm': 0.0007723529124632478, 'learning_rate': 2.8627450980392155e-05, 'epoch': 35.69}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16391/22950 [27:09<10:36, 10.31it/s]

{'loss': 0.0001, 'grad_norm': 0.00864800252020359, 'learning_rate': 2.8583877995642706e-05, 'epoch': 35.71}


 71%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16401/22950 [27:10<10:38, 10.25it/s]

{'loss': 0.0, 'grad_norm': 0.0006023492896929383, 'learning_rate': 2.854030501089325e-05, 'epoch': 35.73}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16411/22950 [27:11<10:51, 10.03it/s]

{'loss': 0.0292, 'grad_norm': 0.013778859749436378, 'learning_rate': 2.8496732026143792e-05, 'epoch': 35.75}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16421/22950 [27:11<10:34, 10.28it/s]

{'loss': 0.0249, 'grad_norm': 0.0009342912817373872, 'learning_rate': 2.845315904139434e-05, 'epoch': 35.77}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16431/22950 [27:12<10:39, 10.19it/s]

{'loss': 0.0, 'grad_norm': 0.0016604569973424077, 'learning_rate': 2.8409586056644883e-05, 'epoch': 35.8}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16441/22950 [27:13<10:33, 10.28it/s]

{'loss': 0.0, 'grad_norm': 0.0011010334128513932, 'learning_rate': 2.8366013071895426e-05, 'epoch': 35.82}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16451/22950 [27:14<10:42, 10.11it/s]

{'loss': 0.0001, 'grad_norm': 0.0024828738532960415, 'learning_rate': 2.832244008714597e-05, 'epoch': 35.84}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16461/22950 [27:15<10:33, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.0014059700770303607, 'learning_rate': 2.8278867102396516e-05, 'epoch': 35.86}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16471/22950 [27:16<10:24, 10.37it/s]

{'loss': 0.0, 'grad_norm': 0.005368727259337902, 'learning_rate': 2.823529411764706e-05, 'epoch': 35.88}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16481/22950 [27:17<10:27, 10.30it/s]

{'loss': 0.0, 'grad_norm': 0.0023825850803405046, 'learning_rate': 2.8191721132897603e-05, 'epoch': 35.9}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16492/22950 [27:18<10:24, 10.34it/s]

{'loss': 0.0001, 'grad_norm': 0.001440701773390174, 'learning_rate': 2.814814814814815e-05, 'epoch': 35.93}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16502/22950 [27:19<10:25, 10.31it/s]

{'loss': 0.0, 'grad_norm': 0.0021736156195402145, 'learning_rate': 2.8104575163398693e-05, 'epoch': 35.95}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16512/22950 [27:20<10:23, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.010600840672850609, 'learning_rate': 2.8061002178649237e-05, 'epoch': 35.97}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16522/22950 [27:21<10:21, 10.34it/s]

{'loss': 0.0001, 'grad_norm': 0.0009350426262244582, 'learning_rate': 2.8017429193899787e-05, 'epoch': 35.99}


                                                     
 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16524/22950 [27:24<09:43, 11.01it/s]

{'eval_loss': 1.2315094470977783, 'eval_accuracy': 0.8529411554336548, 'eval_runtime': 2.9252, 'eval_samples_per_second': 139.48, 'eval_steps_per_second': 17.435, 'epoch': 36.0}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16532/22950 [27:25<27:17,  3.92it/s]  

{'loss': 0.0154, 'grad_norm': 0.0009056641138158739, 'learning_rate': 2.7973856209150327e-05, 'epoch': 36.01}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16541/22950 [27:26<13:42,  7.80it/s]

{'loss': 0.0141, 'grad_norm': 0.0007063700468279421, 'learning_rate': 2.793028322440087e-05, 'epoch': 36.03}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16551/22950 [27:27<10:50,  9.84it/s]

{'loss': 0.0, 'grad_norm': 0.0007154056220315397, 'learning_rate': 2.788671023965142e-05, 'epoch': 36.06}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16561/22950 [27:28<10:30, 10.13it/s]

{'loss': 0.0, 'grad_norm': 0.0008721210178919137, 'learning_rate': 2.7843137254901964e-05, 'epoch': 36.08}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16571/22950 [27:29<10:19, 10.30it/s]

{'loss': 0.0001, 'grad_norm': 0.00580122135579586, 'learning_rate': 2.7799564270152507e-05, 'epoch': 36.1}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16581/22950 [27:30<10:16, 10.34it/s]

{'loss': 0.0061, 'grad_norm': 0.0007562689715996385, 'learning_rate': 2.7755991285403047e-05, 'epoch': 36.12}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16591/22950 [27:31<10:15, 10.33it/s]

{'loss': 0.0, 'grad_norm': 0.00048125136527232826, 'learning_rate': 2.7712418300653598e-05, 'epoch': 36.14}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16601/22950 [27:32<10:18, 10.26it/s]

{'loss': 0.0, 'grad_norm': 0.0008642650791443884, 'learning_rate': 2.766884531590414e-05, 'epoch': 36.17}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16611/22950 [27:33<10:17, 10.27it/s]

{'loss': 0.0002, 'grad_norm': 0.0006803845171816647, 'learning_rate': 2.7625272331154684e-05, 'epoch': 36.19}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16621/22950 [27:34<10:11, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.0010140491649508476, 'learning_rate': 2.758169934640523e-05, 'epoch': 36.21}


 72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 16631/22950 [27:35<10:15, 10.27it/s]

{'loss': 0.0, 'grad_norm': 0.0007580986130051315, 'learning_rate': 2.7538126361655775e-05, 'epoch': 36.23}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16641/22950 [27:36<10:20, 10.16it/s]

{'loss': 0.0, 'grad_norm': 0.0006775221554562449, 'learning_rate': 2.7494553376906318e-05, 'epoch': 36.25}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16651/22950 [27:37<10:18, 10.19it/s]

{'loss': 0.0, 'grad_norm': 0.0006807182217016816, 'learning_rate': 2.7450980392156865e-05, 'epoch': 36.27}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16661/22950 [27:38<10:08, 10.34it/s]

{'loss': 0.0, 'grad_norm': 0.0008827062556520104, 'learning_rate': 2.7407407407407408e-05, 'epoch': 36.3}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16671/22950 [27:39<10:11, 10.27it/s]

{'loss': 0.0001, 'grad_norm': 0.04296274483203888, 'learning_rate': 2.736383442265795e-05, 'epoch': 36.32}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16682/22950 [27:40<10:18, 10.14it/s]

{'loss': 0.0, 'grad_norm': 0.09302645176649094, 'learning_rate': 2.7320261437908495e-05, 'epoch': 36.34}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16692/22950 [27:41<10:08, 10.29it/s]

{'loss': 0.0131, 'grad_norm': 0.00067087885690853, 'learning_rate': 2.7276688453159045e-05, 'epoch': 36.36}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16702/22950 [27:42<10:05, 10.31it/s]

{'loss': 0.1196, 'grad_norm': 0.0008354955934919417, 'learning_rate': 2.7233115468409585e-05, 'epoch': 36.38}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16712/22950 [27:43<10:01, 10.38it/s]

{'loss': 0.0, 'grad_norm': 0.0014350379351526499, 'learning_rate': 2.718954248366013e-05, 'epoch': 36.41}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16721/22950 [27:44<10:24,  9.97it/s]

{'loss': 0.0, 'grad_norm': 0.002285620430484414, 'learning_rate': 2.714596949891068e-05, 'epoch': 36.43}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16731/22950 [27:45<10:00, 10.36it/s]

{'loss': 0.0001, 'grad_norm': 0.000676272960845381, 'learning_rate': 2.7102396514161222e-05, 'epoch': 36.45}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16741/22950 [27:46<10:04, 10.27it/s]

{'loss': 0.0007, 'grad_norm': 0.0020174344535917044, 'learning_rate': 2.7058823529411766e-05, 'epoch': 36.47}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16751/22950 [27:47<09:59, 10.33it/s]

{'loss': 0.0001, 'grad_norm': 0.00090406509116292, 'learning_rate': 2.7015250544662313e-05, 'epoch': 36.49}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16761/22950 [27:48<10:04, 10.23it/s]

{'loss': 0.0, 'grad_norm': 0.0017988577019423246, 'learning_rate': 2.6971677559912856e-05, 'epoch': 36.51}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16771/22950 [27:49<09:56, 10.37it/s]

{'loss': 0.0071, 'grad_norm': 0.0027434274088591337, 'learning_rate': 2.69281045751634e-05, 'epoch': 36.54}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16781/22950 [27:50<09:51, 10.43it/s]

{'loss': 0.0, 'grad_norm': 0.000936806492973119, 'learning_rate': 2.6884531590413946e-05, 'epoch': 36.56}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16791/22950 [27:51<09:52, 10.39it/s]

{'loss': 0.0, 'grad_norm': 0.0006146972882561386, 'learning_rate': 2.684095860566449e-05, 'epoch': 36.58}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16801/22950 [27:52<10:01, 10.22it/s]

{'loss': 0.0, 'grad_norm': 0.0008931109332479537, 'learning_rate': 2.6797385620915033e-05, 'epoch': 36.6}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16811/22950 [27:53<09:51, 10.37it/s]

{'loss': 0.0, 'grad_norm': 0.0008311474230140448, 'learning_rate': 2.6753812636165576e-05, 'epoch': 36.62}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16821/22950 [27:54<09:45, 10.46it/s]

{'loss': 0.0, 'grad_norm': 0.0012511699460446835, 'learning_rate': 2.6710239651416123e-05, 'epoch': 36.64}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16831/22950 [27:55<09:46, 10.43it/s]

{'loss': 0.0, 'grad_norm': 0.0008904084097594023, 'learning_rate': 2.6666666666666667e-05, 'epoch': 36.67}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16841/22950 [27:56<09:56, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.001024940051138401, 'learning_rate': 2.662309368191721e-05, 'epoch': 36.69}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16851/22950 [27:57<09:44, 10.43it/s]

{'loss': 0.0104, 'grad_norm': 0.0011631036177277565, 'learning_rate': 2.657952069716776e-05, 'epoch': 36.71}


 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16861/22950 [27:57<09:42, 10.46it/s]

{'loss': 0.0, 'grad_norm': 0.0009171654819510877, 'learning_rate': 2.6535947712418304e-05, 'epoch': 36.73}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16871/22950 [27:58<09:44, 10.40it/s]

{'loss': 0.0, 'grad_norm': 0.0006556957960128784, 'learning_rate': 2.6492374727668844e-05, 'epoch': 36.75}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16881/22950 [27:59<09:53, 10.22it/s]

{'loss': 0.0001, 'grad_norm': 0.0007828088128007948, 'learning_rate': 2.6448801742919394e-05, 'epoch': 36.78}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16891/22950 [28:00<09:41, 10.43it/s]

{'loss': 0.0001, 'grad_norm': 0.0021179714240133762, 'learning_rate': 2.6405228758169937e-05, 'epoch': 36.8}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16901/22950 [28:01<09:44, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.0010856076842173934, 'learning_rate': 2.636165577342048e-05, 'epoch': 36.82}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16911/22950 [28:02<09:40, 10.40it/s]

{'loss': 0.0, 'grad_norm': 0.0008552517392672598, 'learning_rate': 2.6318082788671027e-05, 'epoch': 36.84}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 16921/22950 [28:03<09:54, 10.15it/s]

{'loss': 0.0, 'grad_norm': 0.0008275478612631559, 'learning_rate': 2.627450980392157e-05, 'epoch': 36.86}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16931/22950 [28:04<09:38, 10.41it/s]

{'loss': 0.0132, 'grad_norm': 0.0007691122009418905, 'learning_rate': 2.6230936819172114e-05, 'epoch': 36.88}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16941/22950 [28:05<09:36, 10.43it/s]

{'loss': 0.0, 'grad_norm': 0.0009249175782315433, 'learning_rate': 2.6187363834422658e-05, 'epoch': 36.91}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16951/22950 [28:06<09:33, 10.46it/s]

{'loss': 0.0068, 'grad_norm': 1.057516098022461, 'learning_rate': 2.6143790849673204e-05, 'epoch': 36.93}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16961/22950 [28:07<09:45, 10.22it/s]

{'loss': 0.0, 'grad_norm': 0.0008565128082409501, 'learning_rate': 2.6100217864923748e-05, 'epoch': 36.95}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16971/22950 [28:08<09:36, 10.37it/s]

{'loss': 0.0, 'grad_norm': 0.0012738305376842618, 'learning_rate': 2.605664488017429e-05, 'epoch': 36.97}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16981/22950 [28:09<09:33, 10.40it/s]

{'loss': 0.0, 'grad_norm': 0.0009586975211277604, 'learning_rate': 2.601307189542484e-05, 'epoch': 36.99}


                                                     
 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16983/22950 [28:12<08:53, 11.18it/s]

{'eval_loss': 1.235145926475525, 'eval_accuracy': 0.8480392098426819, 'eval_runtime': 2.8577, 'eval_samples_per_second': 142.77, 'eval_steps_per_second': 17.846, 'epoch': 37.0}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 16991/22950 [28:13<24:49,  4.00it/s]

{'loss': 0.3378, 'grad_norm': 0.0008487902814522386, 'learning_rate': 2.596949891067538e-05, 'epoch': 37.02}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17001/22950 [28:14<12:29,  7.93it/s]

{'loss': 0.0001, 'grad_norm': 0.046567127108573914, 'learning_rate': 2.5925925925925925e-05, 'epoch': 37.04}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17011/22950 [28:15<09:58,  9.92it/s]

{'loss': 0.0001, 'grad_norm': 0.0012532961554825306, 'learning_rate': 2.5882352941176475e-05, 'epoch': 37.06}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17021/22950 [28:16<09:40, 10.22it/s]

{'loss': 0.0001, 'grad_norm': 0.01159965991973877, 'learning_rate': 2.583877995642702e-05, 'epoch': 37.08}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17031/22950 [28:17<09:27, 10.44it/s]

{'loss': 0.0001, 'grad_norm': 0.004137720447033644, 'learning_rate': 2.579520697167756e-05, 'epoch': 37.1}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17041/22950 [28:18<09:24, 10.47it/s]

{'loss': 0.0001, 'grad_norm': 0.001625654986128211, 'learning_rate': 2.5751633986928102e-05, 'epoch': 37.12}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17051/22950 [28:19<09:24, 10.45it/s]

{'loss': 0.0002, 'grad_norm': 0.000528485223185271, 'learning_rate': 2.5708061002178652e-05, 'epoch': 37.15}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17061/22950 [28:20<09:26, 10.40it/s]

{'loss': 0.0001, 'grad_norm': 0.0014529863838106394, 'learning_rate': 2.5664488017429196e-05, 'epoch': 37.17}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17071/22950 [28:21<09:19, 10.50it/s]

{'loss': 0.0, 'grad_norm': 0.0019709994085133076, 'learning_rate': 2.562091503267974e-05, 'epoch': 37.19}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17081/22950 [28:22<09:21, 10.45it/s]

{'loss': 0.0002, 'grad_norm': 0.0010327133350074291, 'learning_rate': 2.5577342047930286e-05, 'epoch': 37.21}


 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17091/22950 [28:23<09:58,  9.79it/s]

{'loss': 0.0, 'grad_norm': 0.0008558318368159235, 'learning_rate': 2.553376906318083e-05, 'epoch': 37.23}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17102/22950 [28:24<09:13, 10.56it/s]

{'loss': 0.0, 'grad_norm': 0.0008003121474757791, 'learning_rate': 2.5490196078431373e-05, 'epoch': 37.25}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17112/22950 [28:25<09:14, 10.53it/s]

{'loss': 0.0, 'grad_norm': 0.0015807523159310222, 'learning_rate': 2.544662309368192e-05, 'epoch': 37.28}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17122/22950 [28:26<09:17, 10.46it/s]

{'loss': 0.0, 'grad_norm': 0.0013124716933816671, 'learning_rate': 2.5403050108932463e-05, 'epoch': 37.3}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17132/22950 [28:27<09:24, 10.30it/s]

{'loss': 0.0001, 'grad_norm': 0.037131279706954956, 'learning_rate': 2.5359477124183006e-05, 'epoch': 37.32}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17142/22950 [28:28<09:13, 10.49it/s]

{'loss': 0.0, 'grad_norm': 0.0009382787975482643, 'learning_rate': 2.5315904139433556e-05, 'epoch': 37.34}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17152/22950 [28:29<09:09, 10.54it/s]

{'loss': 0.0126, 'grad_norm': 0.0010665450245141983, 'learning_rate': 2.5272331154684096e-05, 'epoch': 37.36}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17162/22950 [28:30<09:09, 10.54it/s]

{'loss': 0.011, 'grad_norm': 0.0006549008539877832, 'learning_rate': 2.522875816993464e-05, 'epoch': 37.39}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17172/22950 [28:31<09:22, 10.28it/s]

{'loss': 0.0001, 'grad_norm': 0.01021867897361517, 'learning_rate': 2.5185185185185183e-05, 'epoch': 37.41}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17182/22950 [28:32<09:09, 10.50it/s]

{'loss': 0.0, 'grad_norm': 0.0009086812497116625, 'learning_rate': 2.5141612200435733e-05, 'epoch': 37.43}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17192/22950 [28:33<09:08, 10.50it/s]

{'loss': 0.0, 'grad_norm': 0.0013396124122664332, 'learning_rate': 2.5098039215686277e-05, 'epoch': 37.45}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17202/22950 [28:34<09:04, 10.55it/s]

{'loss': 0.0, 'grad_norm': 0.0007026420207694173, 'learning_rate': 2.5054466230936817e-05, 'epoch': 37.47}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17212/22950 [28:34<09:14, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.001213432988151908, 'learning_rate': 2.5010893246187367e-05, 'epoch': 37.49}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17222/22950 [28:35<09:02, 10.57it/s]

{'loss': 0.0, 'grad_norm': 0.0008506343001499772, 'learning_rate': 2.496732026143791e-05, 'epoch': 37.52}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17232/22950 [28:36<08:58, 10.62it/s]

{'loss': 0.0001, 'grad_norm': 0.0018410038901492953, 'learning_rate': 2.4923747276688454e-05, 'epoch': 37.54}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17242/22950 [28:37<08:58, 10.60it/s]

{'loss': 0.0, 'grad_norm': 0.0008968309848569334, 'learning_rate': 2.4880174291938997e-05, 'epoch': 37.56}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17252/22950 [28:38<09:09, 10.38it/s]

{'loss': 0.0, 'grad_norm': 0.0008656787103973329, 'learning_rate': 2.4836601307189544e-05, 'epoch': 37.58}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17262/22950 [28:39<09:00, 10.52it/s]

{'loss': 0.0, 'grad_norm': 0.0008791440050117671, 'learning_rate': 2.4793028322440087e-05, 'epoch': 37.6}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17272/22950 [28:40<08:54, 10.62it/s]

{'loss': 0.0, 'grad_norm': 0.0007761546294204891, 'learning_rate': 2.4749455337690634e-05, 'epoch': 37.63}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17282/22950 [28:41<08:56, 10.57it/s]

{'loss': 0.0, 'grad_norm': 0.0010572075843811035, 'learning_rate': 2.4705882352941178e-05, 'epoch': 37.65}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17292/22950 [28:42<09:04, 10.40it/s]

{'loss': 0.0648, 'grad_norm': 0.2511689066886902, 'learning_rate': 2.466230936819172e-05, 'epoch': 37.67}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17302/22950 [28:43<08:54, 10.56it/s]

{'loss': 0.0892, 'grad_norm': 0.0007854088908061385, 'learning_rate': 2.4618736383442268e-05, 'epoch': 37.69}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17312/22950 [28:44<08:53, 10.56it/s]

{'loss': 0.0001, 'grad_norm': 0.0008901196415536106, 'learning_rate': 2.457516339869281e-05, 'epoch': 37.71}


 75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17322/22950 [28:45<09:03, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.0005947684403508902, 'learning_rate': 2.4531590413943355e-05, 'epoch': 37.73}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17332/22950 [28:46<08:53, 10.53it/s]

{'loss': 0.0001, 'grad_norm': 0.0032608937472105026, 'learning_rate': 2.44880174291939e-05, 'epoch': 37.76}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17342/22950 [28:47<08:47, 10.62it/s]

{'loss': 0.0001, 'grad_norm': 0.0010080145439133048, 'learning_rate': 2.4444444444444445e-05, 'epoch': 37.78}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17352/22950 [28:48<08:43, 10.68it/s]

{'loss': 0.0, 'grad_norm': 0.0017817916814237833, 'learning_rate': 2.4400871459694992e-05, 'epoch': 37.8}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17362/22950 [28:49<08:54, 10.45it/s]

{'loss': 0.0, 'grad_norm': 0.0007874344591982663, 'learning_rate': 2.4357298474945535e-05, 'epoch': 37.82}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17372/22950 [28:50<08:46, 10.59it/s]

{'loss': 0.0, 'grad_norm': 0.025166437029838562, 'learning_rate': 2.431372549019608e-05, 'epoch': 37.84}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17382/22950 [28:51<08:43, 10.63it/s]

{'loss': 0.0246, 'grad_norm': 0.002589210867881775, 'learning_rate': 2.4270152505446625e-05, 'epoch': 37.86}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17392/22950 [28:52<08:41, 10.65it/s]

{'loss': 0.0128, 'grad_norm': 0.001120622968301177, 'learning_rate': 2.422657952069717e-05, 'epoch': 37.89}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17402/22950 [28:53<08:50, 10.47it/s]

{'loss': 0.0, 'grad_norm': 0.0011920332908630371, 'learning_rate': 2.4183006535947712e-05, 'epoch': 37.91}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17412/22950 [28:54<08:40, 10.63it/s]

{'loss': 0.0, 'grad_norm': 0.0012882011942565441, 'learning_rate': 2.413943355119826e-05, 'epoch': 37.93}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17422/22950 [28:54<08:40, 10.62it/s]

{'loss': 0.0, 'grad_norm': 0.0007718755514360964, 'learning_rate': 2.4095860566448802e-05, 'epoch': 37.95}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17432/22950 [28:55<08:53, 10.34it/s]

{'loss': 0.0001, 'grad_norm': 0.001420731539838016, 'learning_rate': 2.405228758169935e-05, 'epoch': 37.97}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17442/22950 [28:56<08:10, 11.24it/s]

{'loss': 0.0073, 'grad_norm': 1.1482138633728027, 'learning_rate': 2.4008714596949893e-05, 'epoch': 38.0}


                                                     
 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17442/22950 [28:59<08:10, 11.24it/s]

{'eval_loss': 1.2555452585220337, 'eval_accuracy': 0.845588207244873, 'eval_runtime': 2.792, 'eval_samples_per_second': 146.13, 'eval_steps_per_second': 18.266, 'epoch': 38.0}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17452/22950 [29:00<18:18,  5.00it/s]

{'loss': 0.0, 'grad_norm': 0.000946840038523078, 'learning_rate': 2.3965141612200436e-05, 'epoch': 38.02}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17462/22950 [29:01<10:37,  8.61it/s]

{'loss': 0.0, 'grad_norm': 0.0009344374411739409, 'learning_rate': 2.3921568627450983e-05, 'epoch': 38.04}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17472/22950 [29:02<08:58, 10.17it/s]

{'loss': 0.0029, 'grad_norm': 0.0010032126447185874, 'learning_rate': 2.3877995642701526e-05, 'epoch': 38.06}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17482/22950 [29:03<08:36, 10.59it/s]

{'loss': 0.0006, 'grad_norm': 0.0008666164940223098, 'learning_rate': 2.383442265795207e-05, 'epoch': 38.08}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 17492/22950 [29:04<08:42, 10.45it/s]

{'loss': 0.0, 'grad_norm': 0.0006471136002801359, 'learning_rate': 2.3790849673202613e-05, 'epoch': 38.1}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17502/22950 [29:05<08:38, 10.51it/s]

{'loss': 0.0, 'grad_norm': 0.0009477256680838764, 'learning_rate': 2.374727668845316e-05, 'epoch': 38.13}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17512/22950 [29:06<08:34, 10.57it/s]

{'loss': 0.0108, 'grad_norm': 0.0011004036059603095, 'learning_rate': 2.3703703703703707e-05, 'epoch': 38.15}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17522/22950 [29:07<08:40, 10.43it/s]

{'loss': 0.0002, 'grad_norm': 0.000781893846578896, 'learning_rate': 2.366013071895425e-05, 'epoch': 38.17}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17532/22950 [29:08<08:39, 10.42it/s]

{'loss': 0.0173, 'grad_norm': 0.0012562725460156798, 'learning_rate': 2.3616557734204793e-05, 'epoch': 38.19}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17542/22950 [29:09<08:28, 10.64it/s]

{'loss': 0.0, 'grad_norm': 0.023093771189451218, 'learning_rate': 2.357298474945534e-05, 'epoch': 38.21}


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17552/22950 [29:10<08:28, 10.62it/s]

{'loss': 0.0, 'grad_norm': 0.0008729741675779223, 'learning_rate': 2.3529411764705884e-05, 'epoch': 38.24}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17562/22950 [29:11<08:36, 10.44it/s]

{'loss': 0.0106, 'grad_norm': 0.0006951030809432268, 'learning_rate': 2.348583877995643e-05, 'epoch': 38.26}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17572/22950 [29:12<08:34, 10.45it/s]

{'loss': 0.1234, 'grad_norm': 0.0014959978871047497, 'learning_rate': 2.344226579520697e-05, 'epoch': 38.28}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17582/22950 [29:13<08:19, 10.74it/s]

{'loss': 0.0, 'grad_norm': 0.0006632184376940131, 'learning_rate': 2.3398692810457517e-05, 'epoch': 38.3}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17592/22950 [29:14<08:23, 10.64it/s]

{'loss': 0.0, 'grad_norm': 0.0006272802129387856, 'learning_rate': 2.3355119825708064e-05, 'epoch': 38.32}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17602/22950 [29:15<08:28, 10.52it/s]

{'loss': 0.0001, 'grad_norm': 0.0005536641692742705, 'learning_rate': 2.3311546840958608e-05, 'epoch': 38.34}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17612/22950 [29:16<08:18, 10.71it/s]

{'loss': 0.0, 'grad_norm': 0.000977757852524519, 'learning_rate': 2.326797385620915e-05, 'epoch': 38.37}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17622/22950 [29:17<08:23, 10.58it/s]

{'loss': 0.0, 'grad_norm': 0.0007492397562600672, 'learning_rate': 2.3224400871459694e-05, 'epoch': 38.39}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17632/22950 [29:18<08:23, 10.57it/s]

{'loss': 0.0027, 'grad_norm': 0.0006759217358194292, 'learning_rate': 2.318082788671024e-05, 'epoch': 38.41}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17642/22950 [29:18<08:16, 10.70it/s]

{'loss': 0.0, 'grad_norm': 0.0007438255124725401, 'learning_rate': 2.3137254901960788e-05, 'epoch': 38.43}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17652/22950 [29:19<08:18, 10.63it/s]

{'loss': 0.0122, 'grad_norm': 0.0006143174832686782, 'learning_rate': 2.3093681917211328e-05, 'epoch': 38.45}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17662/22950 [29:20<08:17, 10.63it/s]

{'loss': 0.0113, 'grad_norm': 0.0009399725822731853, 'learning_rate': 2.3050108932461875e-05, 'epoch': 38.47}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17672/22950 [29:21<08:25, 10.45it/s]

{'loss': 0.0, 'grad_norm': 0.001101172063499689, 'learning_rate': 2.3006535947712418e-05, 'epoch': 38.5}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17682/22950 [29:22<08:13, 10.67it/s]

{'loss': 0.0, 'grad_norm': 0.0017746958183124661, 'learning_rate': 2.2962962962962965e-05, 'epoch': 38.52}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17692/22950 [29:23<08:25, 10.41it/s]

{'loss': 0.058, 'grad_norm': 0.0007679177797399461, 'learning_rate': 2.291938997821351e-05, 'epoch': 38.54}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17702/22950 [29:24<08:11, 10.68it/s]

{'loss': 0.0, 'grad_norm': 0.0023639483842998743, 'learning_rate': 2.2875816993464052e-05, 'epoch': 38.56}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17712/22950 [29:25<08:20, 10.46it/s]

{'loss': 0.0, 'grad_norm': 0.0006166939274407923, 'learning_rate': 2.28322440087146e-05, 'epoch': 38.58}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17722/22950 [29:26<08:09, 10.67it/s]

{'loss': 0.0001, 'grad_norm': 0.0019806723576039076, 'learning_rate': 2.2788671023965145e-05, 'epoch': 38.61}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17732/22950 [29:27<08:11, 10.62it/s]

{'loss': 0.0, 'grad_norm': 0.0009395808447152376, 'learning_rate': 2.2745098039215685e-05, 'epoch': 38.63}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17742/22950 [29:28<08:06, 10.71it/s]

{'loss': 0.0, 'grad_norm': 0.005841171834617853, 'learning_rate': 2.2701525054466232e-05, 'epoch': 38.65}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17752/22950 [29:29<08:11, 10.57it/s]

{'loss': 0.0, 'grad_norm': 0.0005705986404791474, 'learning_rate': 2.2657952069716776e-05, 'epoch': 38.67}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17760/22950 [29:30<08:15, 10.47it/s]

{'loss': 0.083, 'grad_norm': 0.0004901356878690422, 'learning_rate': 2.2614379084967322e-05, 'epoch': 38.69}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17772/22950 [29:31<07:54, 10.92it/s]

{'loss': 0.0, 'grad_norm': 0.006352188531309366, 'learning_rate': 2.2570806100217866e-05, 'epoch': 38.71}


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 17782/22950 [29:32<08:12, 10.50it/s]

{'loss': 0.0, 'grad_norm': 0.0016169494483619928, 'learning_rate': 2.252723311546841e-05, 'epoch': 38.74}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17792/22950 [29:33<08:08, 10.57it/s]

{'loss': 0.1705, 'grad_norm': 42.840232849121094, 'learning_rate': 2.2483660130718956e-05, 'epoch': 38.76}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17802/22950 [29:34<08:00, 10.70it/s]

{'loss': 0.0381, 'grad_norm': 69.15802001953125, 'learning_rate': 2.24400871459695e-05, 'epoch': 38.78}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17812/22950 [29:35<08:05, 10.58it/s]

{'loss': 0.0, 'grad_norm': 0.0007161575485952199, 'learning_rate': 2.2396514161220046e-05, 'epoch': 38.8}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17822/22950 [29:36<07:59, 10.69it/s]

{'loss': 0.0, 'grad_norm': 0.0012174486182630062, 'learning_rate': 2.235294117647059e-05, 'epoch': 38.82}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17832/22950 [29:36<08:14, 10.35it/s]

{'loss': 0.0161, 'grad_norm': 0.010447345674037933, 'learning_rate': 2.2309368191721133e-05, 'epoch': 38.85}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17842/22950 [29:37<08:01, 10.60it/s]

{'loss': 0.0, 'grad_norm': 0.0010888940887525678, 'learning_rate': 2.226579520697168e-05, 'epoch': 38.87}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17852/22950 [29:38<07:58, 10.65it/s]

{'loss': 0.0, 'grad_norm': 0.0010050985729321837, 'learning_rate': 2.2222222222222223e-05, 'epoch': 38.89}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17862/22950 [29:39<08:00, 10.58it/s]

{'loss': 0.0, 'grad_norm': 0.0008507859893143177, 'learning_rate': 2.2178649237472767e-05, 'epoch': 38.91}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17872/22950 [29:40<07:58, 10.61it/s]

{'loss': 0.0195, 'grad_norm': 0.0009203541558235884, 'learning_rate': 2.2135076252723313e-05, 'epoch': 38.93}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17882/22950 [29:41<08:07, 10.40it/s]

{'loss': 0.0, 'grad_norm': 0.003757112892344594, 'learning_rate': 2.2091503267973857e-05, 'epoch': 38.95}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17892/22950 [29:42<07:52, 10.71it/s]

{'loss': 0.0, 'grad_norm': 0.000825346214696765, 'learning_rate': 2.2047930283224404e-05, 'epoch': 38.98}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17900/22950 [29:43<07:54, 10.65it/s]

{'loss': 0.0, 'grad_norm': 0.0011765523813664913, 'learning_rate': 2.2004357298474944e-05, 'epoch': 39.0}


                                                     
 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17901/22950 [29:46<07:54, 10.65it/s]

{'eval_loss': 1.1177678108215332, 'eval_accuracy': 0.843137264251709, 'eval_runtime': 2.7958, 'eval_samples_per_second': 145.935, 'eval_steps_per_second': 18.242, 'epoch': 39.0}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17912/22950 [29:47<14:08,  5.94it/s]

{'loss': 0.0, 'grad_norm': 0.000809161807410419, 'learning_rate': 2.196078431372549e-05, 'epoch': 39.02}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17922/22950 [29:48<08:56,  9.38it/s]

{'loss': 0.0, 'grad_norm': 0.0009866785258054733, 'learning_rate': 2.1917211328976037e-05, 'epoch': 39.04}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17932/22950 [29:49<08:12, 10.20it/s]

{'loss': 0.0, 'grad_norm': 0.0009874808602035046, 'learning_rate': 2.187363834422658e-05, 'epoch': 39.06}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17942/22950 [29:50<07:50, 10.64it/s]

{'loss': 0.0393, 'grad_norm': 0.0015320206293836236, 'learning_rate': 2.1830065359477124e-05, 'epoch': 39.08}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17952/22950 [29:51<07:47, 10.69it/s]

{'loss': 0.0001, 'grad_norm': 0.0011547692120075226, 'learning_rate': 2.178649237472767e-05, 'epoch': 39.11}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17962/22950 [29:52<07:45, 10.71it/s]

{'loss': 0.0, 'grad_norm': 0.00078269635559991, 'learning_rate': 2.1742919389978214e-05, 'epoch': 39.13}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17972/22950 [29:53<07:51, 10.56it/s]

{'loss': 0.0059, 'grad_norm': 0.0012041578302159905, 'learning_rate': 2.169934640522876e-05, 'epoch': 39.15}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17982/22950 [29:54<07:45, 10.66it/s]

{'loss': 0.0, 'grad_norm': 0.0011879614321514964, 'learning_rate': 2.16557734204793e-05, 'epoch': 39.17}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 17992/22950 [29:55<07:39, 10.78it/s]

{'loss': 0.0, 'grad_norm': 0.0010922456858679652, 'learning_rate': 2.1612200435729848e-05, 'epoch': 39.19}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 18002/22950 [29:56<07:52, 10.48it/s]

{'loss': 0.0, 'grad_norm': 0.001014629378914833, 'learning_rate': 2.1568627450980395e-05, 'epoch': 39.22}


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 18012/22950 [29:57<07:40, 10.71it/s]

{'loss': 0.0009, 'grad_norm': 0.4032571613788605, 'learning_rate': 2.1525054466230938e-05, 'epoch': 39.24}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 18022/22950 [29:58<07:37, 10.77it/s]

{'loss': 0.0, 'grad_norm': 0.0007097298512235284, 'learning_rate': 2.148148148148148e-05, 'epoch': 39.26}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 18032/22950 [29:58<07:38, 10.74it/s]

{'loss': 0.0212, 'grad_norm': 0.0009467571508139372, 'learning_rate': 2.1437908496732025e-05, 'epoch': 39.28}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 18042/22950 [29:59<07:45, 10.54it/s]

{'loss': 0.0057, 'grad_norm': 0.0020742174237966537, 'learning_rate': 2.1394335511982572e-05, 'epoch': 39.3}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 18052/22950 [30:00<07:38, 10.68it/s]

{'loss': 0.0, 'grad_norm': 0.000914947537239641, 'learning_rate': 2.135076252723312e-05, 'epoch': 39.32}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 18062/22950 [30:01<07:35, 10.74it/s]

{'loss': 0.0917, 'grad_norm': 0.0006388774490915239, 'learning_rate': 2.1307189542483662e-05, 'epoch': 39.35}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 18072/22950 [30:02<07:44, 10.50it/s]

{'loss': 0.0001, 'grad_norm': 0.0009126749355345964, 'learning_rate': 2.1263616557734205e-05, 'epoch': 39.37}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18082/22950 [30:03<07:35, 10.70it/s]

{'loss': 0.0001, 'grad_norm': 0.0008281097980216146, 'learning_rate': 2.122004357298475e-05, 'epoch': 39.39}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18092/22950 [30:04<07:38, 10.60it/s]

{'loss': 0.0, 'grad_norm': 0.0022460934706032276, 'learning_rate': 2.1176470588235296e-05, 'epoch': 39.41}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18102/22950 [30:05<07:37, 10.60it/s]

{'loss': 0.0, 'grad_norm': 0.0015829030890017748, 'learning_rate': 2.113289760348584e-05, 'epoch': 39.43}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18112/22950 [30:06<07:41, 10.48it/s]

{'loss': 0.0, 'grad_norm': 0.0018537079449743032, 'learning_rate': 2.1089324618736382e-05, 'epoch': 39.46}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18122/22950 [30:07<07:34, 10.63it/s]

{'loss': 0.0163, 'grad_norm': 0.0015028176130726933, 'learning_rate': 2.104575163398693e-05, 'epoch': 39.48}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18132/22950 [30:08<07:32, 10.64it/s]

{'loss': 0.0, 'grad_norm': 0.0006883052410557866, 'learning_rate': 2.1002178649237476e-05, 'epoch': 39.5}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18142/22950 [30:09<07:28, 10.72it/s]

{'loss': 0.0126, 'grad_norm': 0.0009320368408225477, 'learning_rate': 2.095860566448802e-05, 'epoch': 39.52}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18152/22950 [30:10<07:32, 10.61it/s]

{'loss': 0.0, 'grad_norm': 0.0007004369399510324, 'learning_rate': 2.0915032679738563e-05, 'epoch': 39.54}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18162/22950 [30:11<07:32, 10.57it/s]

{'loss': 0.0, 'grad_norm': 0.001743942964822054, 'learning_rate': 2.0871459694989106e-05, 'epoch': 39.56}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18172/22950 [30:12<07:28, 10.65it/s]

{'loss': 0.0, 'grad_norm': 0.0007535192416980863, 'learning_rate': 2.0827886710239653e-05, 'epoch': 39.59}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18182/22950 [30:13<07:26, 10.68it/s]

{'loss': 0.0, 'grad_norm': 0.0007356573478318751, 'learning_rate': 2.0784313725490197e-05, 'epoch': 39.61}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18192/22950 [30:14<07:27, 10.64it/s]

{'loss': 0.0, 'grad_norm': 0.0005104937590658665, 'learning_rate': 2.074074074074074e-05, 'epoch': 39.63}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18202/22950 [30:15<07:26, 10.63it/s]

{'loss': 0.0, 'grad_norm': 0.0012336900690570474, 'learning_rate': 2.0697167755991287e-05, 'epoch': 39.65}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18212/22950 [30:15<07:26, 10.62it/s]

{'loss': 0.0, 'grad_norm': 0.0006948764203116298, 'learning_rate': 2.065359477124183e-05, 'epoch': 39.67}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18222/22950 [30:16<07:27, 10.57it/s]

{'loss': 0.0928, 'grad_norm': 0.0012648754054680467, 'learning_rate': 2.0610021786492377e-05, 'epoch': 39.69}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18232/22950 [30:17<07:20, 10.71it/s]

{'loss': 0.0001, 'grad_norm': 0.0008023262489587069, 'learning_rate': 2.056644880174292e-05, 'epoch': 39.72}


 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18242/22950 [30:18<07:15, 10.81it/s]

{'loss': 0.0, 'grad_norm': 0.0015504172770306468, 'learning_rate': 2.0522875816993464e-05, 'epoch': 39.74}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18252/22950 [30:19<07:26, 10.53it/s]

{'loss': 0.0, 'grad_norm': 0.0006674960604868829, 'learning_rate': 2.047930283224401e-05, 'epoch': 39.76}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18262/22950 [30:20<07:18, 10.68it/s]

{'loss': 0.0, 'grad_norm': 0.0009110288228839636, 'learning_rate': 2.0435729847494554e-05, 'epoch': 39.78}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18272/22950 [30:21<07:13, 10.78it/s]

{'loss': 0.0, 'grad_norm': 0.0006259245565161109, 'learning_rate': 2.0392156862745097e-05, 'epoch': 39.8}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18282/22950 [30:22<07:16, 10.69it/s]

{'loss': 0.0, 'grad_norm': 0.00092401506844908, 'learning_rate': 2.0348583877995644e-05, 'epoch': 39.83}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18292/22950 [30:23<07:25, 10.46it/s]

{'loss': 0.0001, 'grad_norm': 0.0006245457916520536, 'learning_rate': 2.0305010893246188e-05, 'epoch': 39.85}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18302/22950 [30:24<07:12, 10.75it/s]

{'loss': 0.0001, 'grad_norm': 0.0009563939529471099, 'learning_rate': 2.0261437908496734e-05, 'epoch': 39.87}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18312/22950 [30:25<07:12, 10.72it/s]

{'loss': 0.0001, 'grad_norm': 0.0008749456028454006, 'learning_rate': 2.0217864923747278e-05, 'epoch': 39.89}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18322/22950 [30:26<07:21, 10.48it/s]

{'loss': 0.0935, 'grad_norm': 0.0011033185292035341, 'learning_rate': 2.017429193899782e-05, 'epoch': 39.91}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18332/22950 [30:27<07:08, 10.77it/s]

{'loss': 0.0, 'grad_norm': 0.0007258036639541388, 'learning_rate': 2.0130718954248368e-05, 'epoch': 39.93}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18342/22950 [30:28<07:09, 10.73it/s]

{'loss': 0.0, 'grad_norm': 0.0008566753822378814, 'learning_rate': 2.008714596949891e-05, 'epoch': 39.96}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 18352/22950 [30:29<07:14, 10.58it/s]

{'loss': 0.0, 'grad_norm': 0.0006201096693985164, 'learning_rate': 2.0043572984749455e-05, 'epoch': 39.98}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18360/22950 [30:29<06:47, 11.28it/s]

{'loss': 0.0, 'grad_norm': 0.0006887295166961849, 'learning_rate': 2e-05, 'epoch': 40.0}


                                                     
 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18360/22950 [30:32<06:47, 11.28it/s]

{'eval_loss': 1.1539058685302734, 'eval_accuracy': 0.8602941036224365, 'eval_runtime': 2.7658, 'eval_samples_per_second': 147.514, 'eval_steps_per_second': 18.439, 'epoch': 40.0}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18370/22950 [30:33<15:09,  5.04it/s]

{'loss': 0.0, 'grad_norm': 0.0008343351073563099, 'learning_rate': 1.9956427015250545e-05, 'epoch': 40.02}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18382/22950 [30:35<08:13,  9.26it/s]

{'loss': 0.0, 'grad_norm': 0.0005583135061897337, 'learning_rate': 1.9912854030501092e-05, 'epoch': 40.04}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18392/22950 [30:35<07:19, 10.37it/s]

{'loss': 0.007, 'grad_norm': 0.000932331255171448, 'learning_rate': 1.9869281045751635e-05, 'epoch': 40.07}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18402/22950 [30:36<07:05, 10.69it/s]

{'loss': 0.0, 'grad_norm': 0.0006163085927255452, 'learning_rate': 1.982570806100218e-05, 'epoch': 40.09}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18412/22950 [30:37<07:03, 10.72it/s]

{'loss': 0.0, 'grad_norm': 0.0007529738359153271, 'learning_rate': 1.9782135076252725e-05, 'epoch': 40.11}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18422/22950 [30:38<07:10, 10.51it/s]

{'loss': 0.0172, 'grad_norm': 0.0010045067174360156, 'learning_rate': 1.973856209150327e-05, 'epoch': 40.13}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18432/22950 [30:39<07:00, 10.74it/s]

{'loss': 0.0, 'grad_norm': 0.0009100304450839758, 'learning_rate': 1.9694989106753812e-05, 'epoch': 40.15}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18442/22950 [30:40<06:58, 10.77it/s]

{'loss': 0.0, 'grad_norm': 0.0009708215948194265, 'learning_rate': 1.9651416122004356e-05, 'epoch': 40.17}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18452/22950 [30:41<06:58, 10.76it/s]

{'loss': 0.0, 'grad_norm': 0.001998270396143198, 'learning_rate': 1.9607843137254903e-05, 'epoch': 40.2}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18462/22950 [30:42<07:03, 10.59it/s]

{'loss': 0.0, 'grad_norm': 0.000968448759522289, 'learning_rate': 1.956427015250545e-05, 'epoch': 40.22}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18472/22950 [30:43<06:59, 10.67it/s]

{'loss': 0.0, 'grad_norm': 0.0005902366829104722, 'learning_rate': 1.9520697167755993e-05, 'epoch': 40.24}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18482/22950 [30:44<06:53, 10.81it/s]

{'loss': 0.0, 'grad_norm': 0.0008055903599597514, 'learning_rate': 1.9477124183006536e-05, 'epoch': 40.26}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18492/22950 [30:45<07:01, 10.57it/s]

{'loss': 0.0, 'grad_norm': 0.0007259799749590456, 'learning_rate': 1.9433551198257083e-05, 'epoch': 40.28}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18502/22950 [30:46<06:58, 10.64it/s]

{'loss': 0.0, 'grad_norm': 0.0008034503553062677, 'learning_rate': 1.9389978213507626e-05, 'epoch': 40.31}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18512/22950 [30:47<06:51, 10.79it/s]

{'loss': 0.0, 'grad_norm': 0.000671804475132376, 'learning_rate': 1.9346405228758173e-05, 'epoch': 40.33}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18522/22950 [30:48<06:51, 10.77it/s]

{'loss': 0.0, 'grad_norm': 0.005983164068311453, 'learning_rate': 1.9302832244008713e-05, 'epoch': 40.35}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18532/22950 [30:49<06:56, 10.61it/s]

{'loss': 0.0, 'grad_norm': 0.0007769145304337144, 'learning_rate': 1.925925925925926e-05, 'epoch': 40.37}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18542/22950 [30:50<06:56, 10.59it/s]

{'loss': 0.0, 'grad_norm': 0.0006356200319714844, 'learning_rate': 1.9215686274509807e-05, 'epoch': 40.39}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18552/22950 [30:51<06:52, 10.67it/s]

{'loss': 0.0, 'grad_norm': 0.0007370587554760277, 'learning_rate': 1.917211328976035e-05, 'epoch': 40.41}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18562/22950 [30:51<06:57, 10.51it/s]

{'loss': 0.0001, 'grad_norm': 0.0007660542032681406, 'learning_rate': 1.9128540305010894e-05, 'epoch': 40.44}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18572/22950 [30:52<06:48, 10.72it/s]

{'loss': 0.0, 'grad_norm': 0.0005098542314954102, 'learning_rate': 1.9084967320261437e-05, 'epoch': 40.46}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18582/22950 [30:53<06:58, 10.45it/s]

{'loss': 0.0, 'grad_norm': 0.0010724489111453295, 'learning_rate': 1.9041394335511984e-05, 'epoch': 40.48}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18592/22950 [30:54<06:47, 10.70it/s]

{'loss': 0.0, 'grad_norm': 0.041410986334085464, 'learning_rate': 1.899782135076253e-05, 'epoch': 40.5}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18602/22950 [30:55<06:45, 10.73it/s]

{'loss': 0.0001, 'grad_norm': 0.0009649033891037107, 'learning_rate': 1.895424836601307e-05, 'epoch': 40.52}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18612/22950 [30:56<06:50, 10.57it/s]

{'loss': 0.0, 'grad_norm': 0.0006970335962250829, 'learning_rate': 1.8910675381263617e-05, 'epoch': 40.54}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18622/22950 [30:57<06:47, 10.62it/s]

{'loss': 0.0, 'grad_norm': 0.0011834269389510155, 'learning_rate': 1.886710239651416e-05, 'epoch': 40.57}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18632/22950 [30:58<06:50, 10.52it/s]

{'loss': 0.0, 'grad_norm': 0.0007123210816644132, 'learning_rate': 1.8823529411764708e-05, 'epoch': 40.59}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 18642/22950 [30:59<06:41, 10.74it/s]

{'loss': 0.0, 'grad_norm': 0.0007306038751266897, 'learning_rate': 1.877995642701525e-05, 'epoch': 40.61}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18652/22950 [31:00<06:46, 10.58it/s]

{'loss': 0.0, 'grad_norm': 0.000717447604984045, 'learning_rate': 1.8736383442265794e-05, 'epoch': 40.63}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18662/22950 [31:01<06:38, 10.75it/s]

{'loss': 0.0787, 'grad_norm': 0.0006679489160887897, 'learning_rate': 1.869281045751634e-05, 'epoch': 40.65}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18672/22950 [31:02<06:38, 10.75it/s]

{'loss': 0.0074, 'grad_norm': 0.0007816923316568136, 'learning_rate': 1.8649237472766888e-05, 'epoch': 40.68}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18682/22950 [31:03<06:41, 10.63it/s]

{'loss': 0.0053, 'grad_norm': 0.0008857396896928549, 'learning_rate': 1.8605664488017428e-05, 'epoch': 40.7}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18692/22950 [31:04<06:37, 10.71it/s]

{'loss': 0.0, 'grad_norm': 0.004860444460064173, 'learning_rate': 1.8562091503267975e-05, 'epoch': 40.72}


 81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18702/22950 [31:05<06:43, 10.54it/s]

{'loss': 0.0162, 'grad_norm': 0.000640595389995724, 'learning_rate': 1.8518518518518518e-05, 'epoch': 40.74}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18712/22950 [31:06<06:36, 10.68it/s]

{'loss': 0.0, 'grad_norm': 0.0005739065818488598, 'learning_rate': 1.8474945533769065e-05, 'epoch': 40.76}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18722/22950 [31:07<06:36, 10.66it/s]

{'loss': 0.0, 'grad_norm': 0.000923382118344307, 'learning_rate': 1.843137254901961e-05, 'epoch': 40.78}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18732/22950 [31:07<06:32, 10.73it/s]

{'loss': 0.0308, 'grad_norm': 0.0007845874642953277, 'learning_rate': 1.8387799564270152e-05, 'epoch': 40.81}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18742/22950 [31:08<06:36, 10.62it/s]

{'loss': 0.0, 'grad_norm': 0.0008203493198379874, 'learning_rate': 1.83442265795207e-05, 'epoch': 40.83}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18752/22950 [31:09<06:36, 10.58it/s]

{'loss': 0.0, 'grad_norm': 0.0010511834407225251, 'learning_rate': 1.8300653594771242e-05, 'epoch': 40.85}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18762/22950 [31:10<06:31, 10.69it/s]

{'loss': 0.0, 'grad_norm': 0.0011560532730072737, 'learning_rate': 1.825708061002179e-05, 'epoch': 40.87}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18772/22950 [31:11<06:36, 10.54it/s]

{'loss': 0.0128, 'grad_norm': 0.0009918217547237873, 'learning_rate': 1.8213507625272332e-05, 'epoch': 40.89}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18782/22950 [31:12<06:43, 10.33it/s]

{'loss': 0.0, 'grad_norm': 0.002399474149569869, 'learning_rate': 1.8169934640522876e-05, 'epoch': 40.92}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18792/22950 [31:13<06:44, 10.29it/s]

{'loss': 0.0062, 'grad_norm': 0.0011219332227483392, 'learning_rate': 1.8126361655773423e-05, 'epoch': 40.94}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18802/22950 [31:14<06:23, 10.82it/s]

{'loss': 0.0, 'grad_norm': 0.01172739639878273, 'learning_rate': 1.8082788671023966e-05, 'epoch': 40.96}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18812/22950 [31:15<06:25, 10.73it/s]

{'loss': 0.0, 'grad_norm': 0.0024487555492669344, 'learning_rate': 1.803921568627451e-05, 'epoch': 40.98}


                                                     
 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18819/22950 [31:19<06:40, 10.31it/s]

{'eval_loss': 1.0669426918029785, 'eval_accuracy': 0.8676470518112183, 'eval_runtime': 2.7858, 'eval_samples_per_second': 146.46, 'eval_steps_per_second': 18.307, 'epoch': 41.0}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18822/22950 [31:19<29:15,  2.35it/s]

{'loss': 0.0, 'grad_norm': 0.0006146269734017551, 'learning_rate': 1.7995642701525056e-05, 'epoch': 41.0}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18830/22950 [31:20<11:43,  5.85it/s]

{'loss': 0.0, 'grad_norm': 0.0005920406547375023, 'learning_rate': 1.79520697167756e-05, 'epoch': 41.02}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18842/22950 [31:21<06:56,  9.86it/s]

{'loss': 0.0, 'grad_norm': 0.0006881145527586341, 'learning_rate': 1.7908496732026146e-05, 'epoch': 41.05}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18852/22950 [31:22<06:37, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.0006341055268421769, 'learning_rate': 1.786492374727669e-05, 'epoch': 41.07}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18861/22950 [31:23<07:29,  9.11it/s]

{'loss': 0.0012, 'grad_norm': 0.000677210686262697, 'learning_rate': 1.7821350762527233e-05, 'epoch': 41.09}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18872/22950 [31:24<06:19, 10.73it/s]

{'loss': 0.0, 'grad_norm': 0.06349455565214157, 'learning_rate': 1.777777777777778e-05, 'epoch': 41.11}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18882/22950 [31:25<06:25, 10.56it/s]

{'loss': 0.0, 'grad_norm': 0.001248347689397633, 'learning_rate': 1.7734204793028323e-05, 'epoch': 41.13}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18892/22950 [31:26<06:28, 10.45it/s]

{'loss': 0.0, 'grad_norm': 0.0006747713778167963, 'learning_rate': 1.7690631808278867e-05, 'epoch': 41.15}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18902/22950 [31:27<06:17, 10.71it/s]

{'loss': 0.0001, 'grad_norm': 0.0004810827085748315, 'learning_rate': 1.7647058823529414e-05, 'epoch': 41.18}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18912/22950 [31:28<06:25, 10.47it/s]

{'loss': 0.0, 'grad_norm': 0.008671397343277931, 'learning_rate': 1.7603485838779957e-05, 'epoch': 41.2}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18922/22950 [31:29<06:22, 10.53it/s]

{'loss': 0.0, 'grad_norm': 0.000640951911918819, 'learning_rate': 1.7559912854030504e-05, 'epoch': 41.22}


 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 18932/22950 [31:30<06:16, 10.67it/s]

{'loss': 0.0, 'grad_norm': 0.0006678978679701686, 'learning_rate': 1.7516339869281044e-05, 'epoch': 41.24}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18942/22950 [31:31<06:17, 10.61it/s]

{'loss': 0.0, 'grad_norm': 0.0014990305062383413, 'learning_rate': 1.747276688453159e-05, 'epoch': 41.26}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18952/22950 [31:32<06:19, 10.53it/s]

{'loss': 0.0001, 'grad_norm': 0.0007795593119226396, 'learning_rate': 1.7429193899782137e-05, 'epoch': 41.29}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18962/22950 [31:33<06:13, 10.67it/s]

{'loss': 0.0, 'grad_norm': 0.000670230423565954, 'learning_rate': 1.738562091503268e-05, 'epoch': 41.31}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18972/22950 [31:34<06:16, 10.55it/s]

{'loss': 0.0, 'grad_norm': 0.0012908641947433352, 'learning_rate': 1.7342047930283224e-05, 'epoch': 41.33}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18982/22950 [31:34<06:16, 10.53it/s]

{'loss': 0.0011, 'grad_norm': 0.00042823125841096044, 'learning_rate': 1.7298474945533768e-05, 'epoch': 41.35}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 18992/22950 [31:35<06:17, 10.49it/s]

{'loss': 0.0066, 'grad_norm': 0.0005993585800752044, 'learning_rate': 1.7254901960784314e-05, 'epoch': 41.37}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19002/22950 [31:36<06:10, 10.66it/s]

{'loss': 0.0, 'grad_norm': 0.0006044609472155571, 'learning_rate': 1.721132897603486e-05, 'epoch': 41.39}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19012/22950 [31:37<06:17, 10.44it/s]

{'loss': 0.0001, 'grad_norm': 0.24983768165111542, 'learning_rate': 1.7167755991285405e-05, 'epoch': 41.42}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19022/22950 [31:38<06:14, 10.48it/s]

{'loss': 0.0, 'grad_norm': 0.0005386079428717494, 'learning_rate': 1.7124183006535948e-05, 'epoch': 41.44}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19032/22950 [31:39<06:08, 10.64it/s]

{'loss': 0.0, 'grad_norm': 0.000576776685193181, 'learning_rate': 1.7080610021786495e-05, 'epoch': 41.46}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19042/22950 [31:40<06:11, 10.51it/s]

{'loss': 0.0, 'grad_norm': 0.00047640001866966486, 'learning_rate': 1.7037037037037038e-05, 'epoch': 41.48}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19052/22950 [31:41<06:06, 10.64it/s]

{'loss': 0.0, 'grad_norm': 0.0008734225993975997, 'learning_rate': 1.6993464052287582e-05, 'epoch': 41.5}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19062/22950 [31:42<06:15, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.000444696081103757, 'learning_rate': 1.6949891067538125e-05, 'epoch': 41.53}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19072/22950 [31:43<06:03, 10.68it/s]

{'loss': 0.0, 'grad_norm': 0.0005968961049802601, 'learning_rate': 1.6906318082788672e-05, 'epoch': 41.55}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19082/22950 [31:44<06:12, 10.39it/s]

{'loss': 0.0094, 'grad_norm': 0.00048743592924438417, 'learning_rate': 1.686274509803922e-05, 'epoch': 41.57}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19092/22950 [31:45<06:01, 10.67it/s]

{'loss': 0.0, 'grad_norm': 0.0004965668194927275, 'learning_rate': 1.6819172113289762e-05, 'epoch': 41.59}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19102/22950 [31:46<06:17, 10.20it/s]

{'loss': 0.0082, 'grad_norm': 0.0004750746302306652, 'learning_rate': 1.6775599128540306e-05, 'epoch': 41.61}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19112/22950 [31:47<06:07, 10.45it/s]

{'loss': 0.0294, 'grad_norm': 0.0006008101627230644, 'learning_rate': 1.673202614379085e-05, 'epoch': 41.63}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19122/22950 [31:48<06:06, 10.44it/s]

{'loss': 0.0002, 'grad_norm': 0.0004893452860414982, 'learning_rate': 1.6688453159041396e-05, 'epoch': 41.66}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19132/22950 [31:49<06:00, 10.60it/s]

{'loss': 0.0, 'grad_norm': 0.001892807544209063, 'learning_rate': 1.664488017429194e-05, 'epoch': 41.68}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19142/22950 [31:50<06:08, 10.34it/s]

{'loss': 0.0, 'grad_norm': 0.0006272119353525341, 'learning_rate': 1.6601307189542483e-05, 'epoch': 41.7}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19152/22950 [31:51<05:59, 10.56it/s]

{'loss': 0.0002, 'grad_norm': 0.0007733124657534063, 'learning_rate': 1.655773420479303e-05, 'epoch': 41.72}


 83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19162/22950 [31:52<06:12, 10.16it/s]

{'loss': 0.0, 'grad_norm': 0.0006284397095441818, 'learning_rate': 1.6514161220043573e-05, 'epoch': 41.74}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19172/22950 [31:53<05:55, 10.61it/s]

{'loss': 0.0, 'grad_norm': 0.0005728111718781292, 'learning_rate': 1.647058823529412e-05, 'epoch': 41.76}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19182/22950 [31:54<06:03, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.0004872477729804814, 'learning_rate': 1.6427015250544663e-05, 'epoch': 41.79}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19192/22950 [31:55<05:55, 10.58it/s]

{'loss': 0.0, 'grad_norm': 0.0006408547051250935, 'learning_rate': 1.6383442265795206e-05, 'epoch': 41.81}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19202/22950 [31:55<06:05, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.005300967954099178, 'learning_rate': 1.6339869281045753e-05, 'epoch': 41.83}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19212/22950 [31:56<05:53, 10.56it/s]

{'loss': 0.0, 'grad_norm': 0.0004031432617921382, 'learning_rate': 1.62962962962963e-05, 'epoch': 41.85}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19222/22950 [31:57<05:59, 10.37it/s]

{'loss': 0.0, 'grad_norm': 0.000851418124511838, 'learning_rate': 1.625272331154684e-05, 'epoch': 41.87}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19232/22950 [31:58<05:52, 10.55it/s]

{'loss': 0.0, 'grad_norm': 0.0004190584586467594, 'learning_rate': 1.6209150326797387e-05, 'epoch': 41.9}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19242/22950 [31:59<06:01, 10.26it/s]

{'loss': 0.0, 'grad_norm': 0.0005246414802968502, 'learning_rate': 1.616557734204793e-05, 'epoch': 41.92}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19252/22950 [32:00<05:48, 10.60it/s]

{'loss': 0.0, 'grad_norm': 0.00047067218110896647, 'learning_rate': 1.6122004357298477e-05, 'epoch': 41.94}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19262/22950 [32:01<05:58, 10.28it/s]

{'loss': 0.0, 'grad_norm': 0.0005266941152513027, 'learning_rate': 1.607843137254902e-05, 'epoch': 41.96}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19272/22950 [32:02<05:48, 10.56it/s]

{'loss': 0.0, 'grad_norm': 0.0009293538751080632, 'learning_rate': 1.6034858387799564e-05, 'epoch': 41.98}


                                                     
 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19278/22950 [32:06<05:44, 10.67it/s]

{'eval_loss': 1.223159909248352, 'eval_accuracy': 0.8602941036224365, 'eval_runtime': 2.8177, 'eval_samples_per_second': 144.797, 'eval_steps_per_second': 18.1, 'epoch': 42.0}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19282/22950 [32:06<26:08,  2.34it/s]

{'loss': 0.0108, 'grad_norm': 24.94646644592285, 'learning_rate': 1.599128540305011e-05, 'epoch': 42.0}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19290/22950 [32:07<10:47,  5.66it/s]

{'loss': 0.0, 'grad_norm': 0.0006756809307262301, 'learning_rate': 1.5947712418300654e-05, 'epoch': 42.03}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19302/22950 [32:08<06:22,  9.54it/s]

{'loss': 0.0, 'grad_norm': 0.000573960889596492, 'learning_rate': 1.5904139433551197e-05, 'epoch': 42.05}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19312/22950 [32:09<05:55, 10.25it/s]

{'loss': 0.0, 'grad_norm': 0.00043737952364608645, 'learning_rate': 1.5860566448801744e-05, 'epoch': 42.07}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19321/22950 [32:10<06:07,  9.88it/s]

{'loss': 0.0073, 'grad_norm': 0.0005237205768935382, 'learning_rate': 1.5816993464052288e-05, 'epoch': 42.09}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19331/22950 [32:11<05:46, 10.46it/s]

{'loss': 0.0, 'grad_norm': 0.0007925492827780545, 'learning_rate': 1.5773420479302835e-05, 'epoch': 42.11}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19341/22950 [32:12<05:50, 10.30it/s]

{'loss': 0.0, 'grad_norm': 0.0006152847781777382, 'learning_rate': 1.5729847494553378e-05, 'epoch': 42.14}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19351/22950 [32:13<05:47, 10.34it/s]

{'loss': 0.0, 'grad_norm': 0.0005958595429547131, 'learning_rate': 1.568627450980392e-05, 'epoch': 42.16}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19361/22950 [32:14<05:41, 10.52it/s]

{'loss': 0.0176, 'grad_norm': 0.0004689339839387685, 'learning_rate': 1.5642701525054468e-05, 'epoch': 42.18}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19371/22950 [32:15<05:56, 10.05it/s]

{'loss': 0.0, 'grad_norm': 0.0004171235777903348, 'learning_rate': 1.559912854030501e-05, 'epoch': 42.2}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19381/22950 [32:16<05:43, 10.39it/s]

{'loss': 0.0, 'grad_norm': 0.00040116027230396867, 'learning_rate': 1.5555555555555555e-05, 'epoch': 42.22}


 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19391/22950 [32:17<05:41, 10.41it/s]

{'loss': 0.0, 'grad_norm': 0.0003553781716618687, 'learning_rate': 1.5511982570806102e-05, 'epoch': 42.24}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19401/22950 [32:18<05:47, 10.22it/s]

{'loss': 0.0001, 'grad_norm': 0.0006233383319340646, 'learning_rate': 1.5468409586056645e-05, 'epoch': 42.27}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19411/22950 [32:19<05:45, 10.23it/s]

{'loss': 0.0, 'grad_norm': 0.0004058786144014448, 'learning_rate': 1.5424836601307192e-05, 'epoch': 42.29}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19421/22950 [32:20<05:47, 10.16it/s]

{'loss': 0.0, 'grad_norm': 0.00045827304711565375, 'learning_rate': 1.5381263616557735e-05, 'epoch': 42.31}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19431/22950 [32:21<05:37, 10.43it/s]

{'loss': 0.0054, 'grad_norm': 0.0005235071294009686, 'learning_rate': 1.533769063180828e-05, 'epoch': 42.33}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19441/22950 [32:22<05:43, 10.23it/s]

{'loss': 0.0, 'grad_norm': 0.00029617708059959114, 'learning_rate': 1.5294117647058826e-05, 'epoch': 42.35}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19451/22950 [32:23<05:43, 10.19it/s]

{'loss': 0.0, 'grad_norm': 0.001492087496444583, 'learning_rate': 1.5250544662309369e-05, 'epoch': 42.37}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19461/22950 [32:24<05:44, 10.13it/s]

{'loss': 0.0, 'grad_norm': 0.0003614498127717525, 'learning_rate': 1.5206971677559914e-05, 'epoch': 42.4}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19471/22950 [32:25<05:31, 10.48it/s]

{'loss': 0.0, 'grad_norm': 0.0004263799055479467, 'learning_rate': 1.5163398692810458e-05, 'epoch': 42.42}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19481/22950 [32:26<05:38, 10.25it/s]

{'loss': 0.0, 'grad_norm': 0.00038369526737369597, 'learning_rate': 1.5119825708061003e-05, 'epoch': 42.44}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19491/22950 [32:27<05:36, 10.27it/s]

{'loss': 0.0, 'grad_norm': 0.0005724510410800576, 'learning_rate': 1.507625272331155e-05, 'epoch': 42.46}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 19501/22950 [32:28<05:36, 10.25it/s]

{'loss': 0.0, 'grad_norm': 0.00035828055115416646, 'learning_rate': 1.5032679738562091e-05, 'epoch': 42.48}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19511/22950 [32:29<05:30, 10.40it/s]

{'loss': 0.0832, 'grad_norm': 0.0004012180434074253, 'learning_rate': 1.4989106753812638e-05, 'epoch': 42.51}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19521/22950 [32:30<05:35, 10.21it/s]

{'loss': 0.0, 'grad_norm': 0.002443223260343075, 'learning_rate': 1.494553376906318e-05, 'epoch': 42.53}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19531/22950 [32:30<05:34, 10.22it/s]

{'loss': 0.0902, 'grad_norm': 0.0006575188017450273, 'learning_rate': 1.4901960784313726e-05, 'epoch': 42.55}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19541/22950 [32:31<05:34, 10.19it/s]

{'loss': 0.0151, 'grad_norm': 1.9027577638626099, 'learning_rate': 1.4858387799564272e-05, 'epoch': 42.57}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19551/22950 [32:32<05:26, 10.41it/s]

{'loss': 0.0, 'grad_norm': 0.0004081403312738985, 'learning_rate': 1.4814814814814815e-05, 'epoch': 42.59}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19561/22950 [32:33<05:31, 10.23it/s]

{'loss': 0.0114, 'grad_norm': 0.0005319410120137036, 'learning_rate': 1.477124183006536e-05, 'epoch': 42.61}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19571/22950 [32:34<05:31, 10.20it/s]

{'loss': 0.0, 'grad_norm': 0.0005105398595333099, 'learning_rate': 1.4727668845315907e-05, 'epoch': 42.64}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19581/22950 [32:35<05:35, 10.05it/s]

{'loss': 0.0, 'grad_norm': 0.0005179682048037648, 'learning_rate': 1.4684095860566449e-05, 'epoch': 42.66}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19591/22950 [32:36<05:24, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.00037112602149136364, 'learning_rate': 1.4640522875816995e-05, 'epoch': 42.68}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19601/22950 [32:37<05:29, 10.16it/s]

{'loss': 0.0, 'grad_norm': 0.000494743580929935, 'learning_rate': 1.4596949891067537e-05, 'epoch': 42.7}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19611/22950 [32:38<05:26, 10.21it/s]

{'loss': 0.0001, 'grad_norm': 0.0004480736970435828, 'learning_rate': 1.4553376906318084e-05, 'epoch': 42.72}


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19621/22950 [32:39<05:30, 10.07it/s]

{'loss': 0.0, 'grad_norm': 0.00044760716264136136, 'learning_rate': 1.4509803921568629e-05, 'epoch': 42.75}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19631/22950 [32:40<05:20, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.000578453007619828, 'learning_rate': 1.4466230936819172e-05, 'epoch': 42.77}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19641/22950 [32:41<05:24, 10.19it/s]

{'loss': 0.0, 'grad_norm': 0.00044748600339517, 'learning_rate': 1.4422657952069718e-05, 'epoch': 42.79}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19651/22950 [32:42<05:23, 10.21it/s]

{'loss': 0.0, 'grad_norm': 0.00040030028321780264, 'learning_rate': 1.4379084967320261e-05, 'epoch': 42.81}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19661/22950 [32:43<05:22, 10.20it/s]

{'loss': 0.0, 'grad_norm': 0.00044622900895774364, 'learning_rate': 1.4335511982570806e-05, 'epoch': 42.83}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19671/22950 [32:44<05:16, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.0006602921057492495, 'learning_rate': 1.4291938997821353e-05, 'epoch': 42.85}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19681/22950 [32:45<05:19, 10.22it/s]

{'loss': 0.0, 'grad_norm': 0.0012417284306138754, 'learning_rate': 1.4248366013071896e-05, 'epoch': 42.88}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19691/22950 [32:46<05:18, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.00046090877731330693, 'learning_rate': 1.4204793028322441e-05, 'epoch': 42.9}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19701/22950 [32:47<05:17, 10.22it/s]

{'loss': 0.0001, 'grad_norm': 0.0005280681070871651, 'learning_rate': 1.4161220043572985e-05, 'epoch': 42.92}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19711/22950 [32:48<05:11, 10.39it/s]

{'loss': 0.0, 'grad_norm': 0.0005427616997621953, 'learning_rate': 1.411764705882353e-05, 'epoch': 42.94}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19721/22950 [32:49<05:15, 10.25it/s]

{'loss': 0.0, 'grad_norm': 0.0006291762110777199, 'learning_rate': 1.4074074074074075e-05, 'epoch': 42.96}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19731/22950 [32:50<05:13, 10.27it/s]

{'loss': 0.0, 'grad_norm': 0.002884137909859419, 'learning_rate': 1.4030501089324618e-05, 'epoch': 42.98}


                                                     
 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19737/22950 [32:53<05:01, 10.66it/s]

{'eval_loss': 1.240157961845398, 'eval_accuracy': 0.843137264251709, 'eval_runtime': 2.8482, 'eval_samples_per_second': 143.25, 'eval_steps_per_second': 17.906, 'epoch': 43.0}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19741/22950 [32:54<23:03,  2.32it/s]

{'loss': 0.0, 'grad_norm': 0.0011388759594410658, 'learning_rate': 1.3986928104575163e-05, 'epoch': 43.01}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19751/22950 [32:55<08:15,  6.46it/s]

{'loss': 0.0, 'grad_norm': 0.000981262419372797, 'learning_rate': 1.394335511982571e-05, 'epoch': 43.03}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19762/22950 [32:56<05:29,  9.67it/s]

{'loss': 0.0, 'grad_norm': 0.0004979997756890953, 'learning_rate': 1.3899782135076254e-05, 'epoch': 43.05}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19772/22950 [32:57<05:10, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.00048699634498916566, 'learning_rate': 1.3856209150326799e-05, 'epoch': 43.07}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19782/22950 [32:58<05:11, 10.17it/s]

{'loss': 0.0004, 'grad_norm': 0.0006461054435931146, 'learning_rate': 1.3812636165577342e-05, 'epoch': 43.09}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 19792/22950 [32:59<05:06, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.0003792960196733475, 'learning_rate': 1.3769063180827887e-05, 'epoch': 43.12}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19802/22950 [33:00<05:09, 10.18it/s]

{'loss': 0.0, 'grad_norm': 0.0004507152480073273, 'learning_rate': 1.3725490196078432e-05, 'epoch': 43.14}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19812/22950 [33:01<05:03, 10.34it/s]

{'loss': 0.0, 'grad_norm': 0.0005122597212903202, 'learning_rate': 1.3681917211328976e-05, 'epoch': 43.16}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19822/22950 [33:02<05:09, 10.10it/s]

{'loss': 0.0, 'grad_norm': 0.00849138293415308, 'learning_rate': 1.3638344226579523e-05, 'epoch': 43.18}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19832/22950 [33:03<05:00, 10.39it/s]

{'loss': 0.0004, 'grad_norm': 0.0005395296029746532, 'learning_rate': 1.3594771241830064e-05, 'epoch': 43.2}


 86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19842/22950 [33:04<05:05, 10.16it/s]

{'loss': 0.0, 'grad_norm': 0.000700565695296973, 'learning_rate': 1.3551198257080611e-05, 'epoch': 43.22}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19852/22950 [33:05<04:57, 10.40it/s]

{'loss': 0.006, 'grad_norm': 0.0005482739070430398, 'learning_rate': 1.3507625272331156e-05, 'epoch': 43.25}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19862/22950 [33:06<05:00, 10.28it/s]

{'loss': 0.0, 'grad_norm': 0.0004905435489490628, 'learning_rate': 1.34640522875817e-05, 'epoch': 43.27}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19872/22950 [33:07<04:58, 10.32it/s]

{'loss': 0.0028, 'grad_norm': 0.0006328352610580623, 'learning_rate': 1.3420479302832245e-05, 'epoch': 43.29}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19881/22950 [33:08<05:05, 10.04it/s]

{'loss': 0.0, 'grad_norm': 0.00037942617200315, 'learning_rate': 1.3376906318082788e-05, 'epoch': 43.31}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19891/22950 [33:09<04:53, 10.41it/s]

{'loss': 0.0, 'grad_norm': 0.00040294660720974207, 'learning_rate': 1.3333333333333333e-05, 'epoch': 43.33}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19901/22950 [33:10<04:57, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.0005683386698365211, 'learning_rate': 1.328976034858388e-05, 'epoch': 43.36}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19911/22950 [33:11<04:54, 10.31it/s]

{'loss': 0.0, 'grad_norm': 0.0005070348270237446, 'learning_rate': 1.3246187363834422e-05, 'epoch': 43.38}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19921/22950 [33:12<05:01, 10.04it/s]

{'loss': 0.0231, 'grad_norm': 0.002996532479301095, 'learning_rate': 1.3202614379084969e-05, 'epoch': 43.4}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19931/22950 [33:13<04:52, 10.31it/s]

{'loss': 0.0, 'grad_norm': 0.00034591867006383836, 'learning_rate': 1.3159041394335514e-05, 'epoch': 43.42}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19941/22950 [33:14<04:58, 10.09it/s]

{'loss': 0.0, 'grad_norm': 0.0005243397317826748, 'learning_rate': 1.3115468409586057e-05, 'epoch': 43.44}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19951/22950 [33:15<04:48, 10.39it/s]

{'loss': 0.0, 'grad_norm': 0.0006448866915889084, 'learning_rate': 1.3071895424836602e-05, 'epoch': 43.46}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19961/22950 [33:16<04:58, 10.00it/s]

{'loss': 0.0, 'grad_norm': 0.0003328912134747952, 'learning_rate': 1.3028322440087146e-05, 'epoch': 43.49}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19971/22950 [33:17<04:48, 10.33it/s]

{'loss': 0.0, 'grad_norm': 0.00045735613093711436, 'learning_rate': 1.298474945533769e-05, 'epoch': 43.51}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19981/22950 [33:18<04:51, 10.18it/s]

{'loss': 0.0, 'grad_norm': 0.0004966868436895311, 'learning_rate': 1.2941176470588238e-05, 'epoch': 43.53}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 19991/22950 [33:19<04:46, 10.31it/s]

{'loss': 0.1334, 'grad_norm': 0.0006739003001712263, 'learning_rate': 1.289760348583878e-05, 'epoch': 43.55}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 20001/22950 [33:20<04:53, 10.03it/s]

{'loss': 0.0, 'grad_norm': 0.00055628054542467, 'learning_rate': 1.2854030501089326e-05, 'epoch': 43.57}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 20011/22950 [33:21<04:46, 10.27it/s]

{'loss': 0.0, 'grad_norm': 0.0005187960923649371, 'learning_rate': 1.281045751633987e-05, 'epoch': 43.59}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 20021/22950 [33:22<04:47, 10.20it/s]

{'loss': 0.0, 'grad_norm': 0.0006546349613927305, 'learning_rate': 1.2766884531590415e-05, 'epoch': 43.62}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 20031/22950 [33:23<04:41, 10.38it/s]

{'loss': 0.0, 'grad_norm': 0.002361573278903961, 'learning_rate': 1.272331154684096e-05, 'epoch': 43.64}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 20041/22950 [33:24<04:50, 10.00it/s]

{'loss': 0.0, 'grad_norm': 0.0004186294972896576, 'learning_rate': 1.2679738562091503e-05, 'epoch': 43.66}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 20051/22950 [33:25<04:44, 10.19it/s]

{'loss': 0.0, 'grad_norm': 0.00041968197911046445, 'learning_rate': 1.2636165577342048e-05, 'epoch': 43.68}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 20062/22950 [33:26<04:46, 10.07it/s]

{'loss': 0.0, 'grad_norm': 0.0006010081269778311, 'learning_rate': 1.2592592592592592e-05, 'epoch': 43.7}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 20072/22950 [33:27<04:40, 10.26it/s]

{'loss': 0.0, 'grad_norm': 0.000489748374093324, 'learning_rate': 1.2549019607843138e-05, 'epoch': 43.73}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20082/22950 [33:28<04:45, 10.03it/s]

{'loss': 0.0, 'grad_norm': 0.0009709526202641428, 'learning_rate': 1.2505446623093684e-05, 'epoch': 43.75}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20092/22950 [33:29<04:38, 10.28it/s]

{'loss': 0.0, 'grad_norm': 0.0004708104534074664, 'learning_rate': 1.2461873638344227e-05, 'epoch': 43.77}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20102/22950 [33:30<04:41, 10.11it/s]

{'loss': 0.0154, 'grad_norm': 0.00038287087227217853, 'learning_rate': 1.2418300653594772e-05, 'epoch': 43.79}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20112/22950 [33:31<04:35, 10.29it/s]

{'loss': 0.0, 'grad_norm': 0.0005413329927250743, 'learning_rate': 1.2374727668845317e-05, 'epoch': 43.81}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20121/22950 [33:31<04:43, 10.00it/s]

{'loss': 0.0, 'grad_norm': 0.0005356951151043177, 'learning_rate': 1.233115468409586e-05, 'epoch': 43.83}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20131/22950 [33:32<04:34, 10.26it/s]

{'loss': 0.0, 'grad_norm': 0.0006330275209620595, 'learning_rate': 1.2287581699346406e-05, 'epoch': 43.86}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20142/22950 [33:34<04:31, 10.34it/s]

{'loss': 0.0, 'grad_norm': 0.001806745189242065, 'learning_rate': 1.224400871459695e-05, 'epoch': 43.88}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20152/22950 [33:34<04:33, 10.22it/s]

{'loss': 0.0, 'grad_norm': 0.0006538084708154202, 'learning_rate': 1.2200435729847496e-05, 'epoch': 43.9}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20161/22950 [33:35<04:40,  9.96it/s]

{'loss': 0.0, 'grad_norm': 0.0007296138210222125, 'learning_rate': 1.215686274509804e-05, 'epoch': 43.92}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20171/22950 [33:36<04:30, 10.26it/s]

{'loss': 0.014, 'grad_norm': 0.0005887206643819809, 'learning_rate': 1.2113289760348584e-05, 'epoch': 43.94}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20182/22950 [33:37<04:29, 10.27it/s]

{'loss': 0.0, 'grad_norm': 0.0007365230121649802, 'learning_rate': 1.206971677559913e-05, 'epoch': 43.97}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20192/22950 [33:38<04:28, 10.26it/s]

{'loss': 0.0, 'grad_norm': 0.000762788811698556, 'learning_rate': 1.2026143790849675e-05, 'epoch': 43.99}


                                                     
 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20196/22950 [33:42<04:16, 10.73it/s]

{'eval_loss': 1.310192346572876, 'eval_accuracy': 0.8504902124404907, 'eval_runtime': 2.9093, 'eval_samples_per_second': 140.241, 'eval_steps_per_second': 17.53, 'epoch': 44.0}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20202/22950 [33:43<15:02,  3.04it/s]

{'loss': 0.1264, 'grad_norm': 0.0004722077283076942, 'learning_rate': 1.1982570806100218e-05, 'epoch': 44.01}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20211/22950 [33:44<06:40,  6.83it/s]

{'loss': 0.0, 'grad_norm': 0.0008126167813315988, 'learning_rate': 1.1938997821350763e-05, 'epoch': 44.03}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20221/22950 [33:45<04:45,  9.55it/s]

{'loss': 0.0, 'grad_norm': 0.0009046847699210048, 'learning_rate': 1.1895424836601307e-05, 'epoch': 44.05}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20232/22950 [33:46<04:25, 10.22it/s]

{'loss': 0.0, 'grad_norm': 0.0009795642690733075, 'learning_rate': 1.1851851851851853e-05, 'epoch': 44.07}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20241/22950 [33:47<04:31,  9.98it/s]

{'loss': 0.0, 'grad_norm': 0.0005608587525784969, 'learning_rate': 1.1808278867102397e-05, 'epoch': 44.1}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20251/22950 [33:47<04:24, 10.19it/s]

{'loss': 0.0005, 'grad_norm': 0.0004749283252749592, 'learning_rate': 1.1764705882352942e-05, 'epoch': 44.12}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20262/22950 [33:49<04:23, 10.21it/s]

{'loss': 0.0764, 'grad_norm': 0.0006729172309860587, 'learning_rate': 1.1721132897603485e-05, 'epoch': 44.14}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20272/22950 [33:50<04:21, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.0004948641872033477, 'learning_rate': 1.1677559912854032e-05, 'epoch': 44.16}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20282/22950 [33:51<04:25, 10.05it/s]

{'loss': 0.0, 'grad_norm': 0.000778555404394865, 'learning_rate': 1.1633986928104575e-05, 'epoch': 44.18}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20292/22950 [33:52<04:19, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.0006783615681342781, 'learning_rate': 1.159041394335512e-05, 'epoch': 44.2}


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20302/22950 [33:53<04:24, 10.00it/s]

{'loss': 0.0, 'grad_norm': 0.0006994139985181391, 'learning_rate': 1.1546840958605664e-05, 'epoch': 44.23}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20311/22950 [33:53<04:20, 10.13it/s]

{'loss': 0.0, 'grad_norm': 0.000550712866242975, 'learning_rate': 1.1503267973856209e-05, 'epoch': 44.25}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20322/22950 [33:55<04:20, 10.10it/s]

{'loss': 0.0, 'grad_norm': 0.0005710712866857648, 'learning_rate': 1.1459694989106754e-05, 'epoch': 44.27}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20332/22950 [33:56<04:17, 10.17it/s]

{'loss': 0.0, 'grad_norm': 0.0006383032305166125, 'learning_rate': 1.14161220043573e-05, 'epoch': 44.29}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20342/22950 [33:57<04:21,  9.99it/s]

{'loss': 0.0119, 'grad_norm': 0.0007815971039235592, 'learning_rate': 1.1372549019607843e-05, 'epoch': 44.31}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20352/22950 [33:58<04:15, 10.19it/s]

{'loss': 0.0, 'grad_norm': 0.0004714054230134934, 'learning_rate': 1.1328976034858388e-05, 'epoch': 44.34}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 20361/22950 [33:58<04:21,  9.92it/s]

{'loss': 0.0015, 'grad_norm': 7.806797027587891, 'learning_rate': 1.1285403050108933e-05, 'epoch': 44.36}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20371/22950 [33:59<04:13, 10.15it/s]

{'loss': 0.0, 'grad_norm': 0.000752381922211498, 'learning_rate': 1.1241830065359478e-05, 'epoch': 44.38}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20381/22950 [34:00<04:16, 10.00it/s]

{'loss': 0.0, 'grad_norm': 0.0006685303524136543, 'learning_rate': 1.1198257080610023e-05, 'epoch': 44.4}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20392/22950 [34:01<04:11, 10.18it/s]

{'loss': 0.0066, 'grad_norm': 0.000662844511680305, 'learning_rate': 1.1154684095860567e-05, 'epoch': 44.42}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20401/22950 [34:02<04:17,  9.89it/s]

{'loss': 0.0, 'grad_norm': 0.0005344355013221502, 'learning_rate': 1.1111111111111112e-05, 'epoch': 44.44}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20412/22950 [34:03<04:06, 10.31it/s]

{'loss': 0.0, 'grad_norm': 0.0004870756820309907, 'learning_rate': 1.1067538126361657e-05, 'epoch': 44.47}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20422/22950 [34:04<04:06, 10.25it/s]

{'loss': 0.0898, 'grad_norm': 0.0006680224323645234, 'learning_rate': 1.1023965141612202e-05, 'epoch': 44.49}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20432/22950 [34:05<04:09, 10.10it/s]

{'loss': 0.0, 'grad_norm': 0.0010904563823714852, 'learning_rate': 1.0980392156862745e-05, 'epoch': 44.51}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20442/22950 [34:06<04:05, 10.21it/s]

{'loss': 0.0019, 'grad_norm': 0.0004204364959150553, 'learning_rate': 1.093681917211329e-05, 'epoch': 44.53}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20451/22950 [34:07<04:07, 10.12it/s]

{'loss': 0.0114, 'grad_norm': 0.0008737842435948551, 'learning_rate': 1.0893246187363835e-05, 'epoch': 44.55}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20461/22950 [34:08<04:04, 10.16it/s]

{'loss': 0.0, 'grad_norm': 0.0007791395182721317, 'learning_rate': 1.084967320261438e-05, 'epoch': 44.58}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20472/22950 [34:09<04:03, 10.19it/s]

{'loss': 0.0, 'grad_norm': 0.0004830689576920122, 'learning_rate': 1.0806100217864924e-05, 'epoch': 44.6}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20482/22950 [34:10<04:07,  9.97it/s]

{'loss': 0.0, 'grad_norm': 0.0007435747538693249, 'learning_rate': 1.0762527233115469e-05, 'epoch': 44.62}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20492/22950 [34:11<04:01, 10.16it/s]

{'loss': 0.0, 'grad_norm': 0.001729531679302454, 'learning_rate': 1.0718954248366013e-05, 'epoch': 44.64}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20501/22950 [34:12<04:06,  9.92it/s]

{'loss': 0.0, 'grad_norm': 0.0007055269670672715, 'learning_rate': 1.067538126361656e-05, 'epoch': 44.66}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20512/22950 [34:13<04:00, 10.13it/s]

{'loss': 0.0, 'grad_norm': 0.002815200947225094, 'learning_rate': 1.0631808278867103e-05, 'epoch': 44.68}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20522/22950 [34:14<04:03,  9.99it/s]

{'loss': 0.0, 'grad_norm': 0.0004181316471658647, 'learning_rate': 1.0588235294117648e-05, 'epoch': 44.71}


 89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20532/22950 [34:15<03:57, 10.19it/s]

{'loss': 0.0, 'grad_norm': 0.0012700642691925168, 'learning_rate': 1.0544662309368191e-05, 'epoch': 44.73}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20541/22950 [34:16<04:09,  9.65it/s]

{'loss': 0.0, 'grad_norm': 0.0004306419286876917, 'learning_rate': 1.0501089324618738e-05, 'epoch': 44.75}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20552/22950 [34:17<03:52, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.00044096659985370934, 'learning_rate': 1.0457516339869281e-05, 'epoch': 44.77}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20562/22950 [34:18<03:53, 10.25it/s]

{'loss': 0.0, 'grad_norm': 0.0005019413074478507, 'learning_rate': 1.0413943355119827e-05, 'epoch': 44.79}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20571/22950 [34:19<03:57, 10.00it/s]

{'loss': 0.0, 'grad_norm': 0.0005904943100176752, 'learning_rate': 1.037037037037037e-05, 'epoch': 44.81}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20582/22950 [34:20<03:53, 10.14it/s]

{'loss': 0.0007, 'grad_norm': 0.0006853517261333764, 'learning_rate': 1.0326797385620915e-05, 'epoch': 44.84}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20592/22950 [34:21<03:50, 10.22it/s]

{'loss': 0.0, 'grad_norm': 0.000393258233089, 'learning_rate': 1.028322440087146e-05, 'epoch': 44.86}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20602/22950 [34:22<03:49, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.0004019969201181084, 'learning_rate': 1.0239651416122005e-05, 'epoch': 44.88}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20611/22950 [34:23<03:53, 10.03it/s]

{'loss': 0.0, 'grad_norm': 0.0004370403476059437, 'learning_rate': 1.0196078431372549e-05, 'epoch': 44.9}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20622/22950 [34:24<03:48, 10.18it/s]

{'loss': 0.0, 'grad_norm': 0.00037677717045880854, 'learning_rate': 1.0152505446623094e-05, 'epoch': 44.92}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20632/22950 [34:25<03:46, 10.21it/s]

{'loss': 0.0, 'grad_norm': 0.0006546925869770348, 'learning_rate': 1.0108932461873639e-05, 'epoch': 44.95}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20642/22950 [34:26<03:45, 10.22it/s]

{'loss': 0.0052, 'grad_norm': 0.0007156335050240159, 'learning_rate': 1.0065359477124184e-05, 'epoch': 44.97}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 20652/22950 [34:27<03:48, 10.07it/s]

{'loss': 0.0, 'grad_norm': 0.07282539457082748, 'learning_rate': 1.0021786492374727e-05, 'epoch': 44.99}


                                                     
 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20655/22950 [34:31<03:44, 10.21it/s]

{'eval_loss': 1.2721956968307495, 'eval_accuracy': 0.8627451062202454, 'eval_runtime': 2.9614, 'eval_samples_per_second': 137.771, 'eval_steps_per_second': 17.221, 'epoch': 45.0}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20662/22950 [34:32<09:52,  3.86it/s]

{'loss': 0.0, 'grad_norm': 0.0006361986161209643, 'learning_rate': 9.978213507625273e-06, 'epoch': 45.01}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20671/22950 [34:33<04:54,  7.74it/s]

{'loss': 0.0, 'grad_norm': 0.000515204796101898, 'learning_rate': 9.934640522875818e-06, 'epoch': 45.03}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20682/22950 [34:34<03:50,  9.82it/s]

{'loss': 0.0, 'grad_norm': 0.0006531227845698595, 'learning_rate': 9.891067538126363e-06, 'epoch': 45.05}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20692/22950 [34:35<03:41, 10.21it/s]

{'loss': 0.0, 'grad_norm': 0.00043973748688586056, 'learning_rate': 9.847494553376906e-06, 'epoch': 45.08}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20702/22950 [34:36<03:45,  9.97it/s]

{'loss': 0.0, 'grad_norm': 0.0007499518687836826, 'learning_rate': 9.803921568627451e-06, 'epoch': 45.1}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20712/22950 [34:37<03:38, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.00042410020250827074, 'learning_rate': 9.760348583877996e-06, 'epoch': 45.12}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20721/22950 [34:38<03:42, 10.01it/s]

{'loss': 0.0, 'grad_norm': 0.0008281816262751818, 'learning_rate': 9.716775599128541e-06, 'epoch': 45.14}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20731/22950 [34:38<03:37, 10.20it/s]

{'loss': 0.0, 'grad_norm': 0.0003888377395924181, 'learning_rate': 9.673202614379087e-06, 'epoch': 45.16}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20741/22950 [34:39<03:41,  9.96it/s]

{'loss': 0.0, 'grad_norm': 0.0009522454347461462, 'learning_rate': 9.62962962962963e-06, 'epoch': 45.19}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20751/22950 [34:40<03:35, 10.23it/s]

{'loss': 0.0, 'grad_norm': 0.0005246622022241354, 'learning_rate': 9.586056644880175e-06, 'epoch': 45.21}


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20762/22950 [34:42<03:31, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.0005749556585215032, 'learning_rate': 9.542483660130718e-06, 'epoch': 45.23}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20772/22950 [34:43<03:30, 10.34it/s]

{'loss': 0.0, 'grad_norm': 0.00047070684377104044, 'learning_rate': 9.498910675381265e-06, 'epoch': 45.25}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20781/22950 [34:43<03:39,  9.88it/s]

{'loss': 0.0012, 'grad_norm': 0.0005209469818510115, 'learning_rate': 9.455337690631809e-06, 'epoch': 45.27}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20791/22950 [34:44<03:29, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.0033709907438606024, 'learning_rate': 9.411764705882354e-06, 'epoch': 45.29}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20802/22950 [34:45<03:29, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.0006620376952923834, 'learning_rate': 9.368191721132897e-06, 'epoch': 45.32}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20812/22950 [34:46<03:26, 10.33it/s]

{'loss': 0.0004, 'grad_norm': 0.0004616196092683822, 'learning_rate': 9.324618736383444e-06, 'epoch': 45.34}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20821/22950 [34:47<03:32, 10.00it/s]

{'loss': 0.0071, 'grad_norm': 1.090404748916626, 'learning_rate': 9.281045751633987e-06, 'epoch': 45.36}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20831/22950 [34:48<03:25, 10.31it/s]

{'loss': 0.0, 'grad_norm': 0.00044507504208013415, 'learning_rate': 9.237472766884533e-06, 'epoch': 45.38}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20841/22950 [34:49<03:25, 10.28it/s]

{'loss': 0.0, 'grad_norm': 0.000571934855543077, 'learning_rate': 9.193899782135076e-06, 'epoch': 45.4}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20851/22950 [34:50<03:23, 10.30it/s]

{'loss': 0.0, 'grad_norm': 0.0004896351019851863, 'learning_rate': 9.150326797385621e-06, 'epoch': 45.42}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20862/22950 [34:51<03:23, 10.26it/s]

{'loss': 0.0, 'grad_norm': 0.0003977651649620384, 'learning_rate': 9.106753812636166e-06, 'epoch': 45.45}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20872/22950 [34:52<03:20, 10.38it/s]

{'loss': 0.005, 'grad_norm': 0.8397030830383301, 'learning_rate': 9.063180827886711e-06, 'epoch': 45.47}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20882/22950 [34:53<03:22, 10.20it/s]

{'loss': 0.0146, 'grad_norm': 0.00047488807467743754, 'learning_rate': 9.019607843137255e-06, 'epoch': 45.49}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20892/22950 [34:54<03:18, 10.35it/s]

{'loss': 0.0017, 'grad_norm': 0.00040067624649964273, 'learning_rate': 8.9760348583878e-06, 'epoch': 45.51}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20902/22950 [34:55<03:21, 10.15it/s]

{'loss': 0.0, 'grad_norm': 0.0005530568305402994, 'learning_rate': 8.932461873638345e-06, 'epoch': 45.53}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20912/22950 [34:56<03:17, 10.33it/s]

{'loss': 0.0, 'grad_norm': 0.0004780669114552438, 'learning_rate': 8.88888888888889e-06, 'epoch': 45.56}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20922/22950 [34:57<03:19, 10.18it/s]

{'loss': 0.0, 'grad_norm': 0.0005066663725301623, 'learning_rate': 8.845315904139433e-06, 'epoch': 45.58}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20932/22950 [34:58<03:16, 10.28it/s]

{'loss': 0.0, 'grad_norm': 0.011157003231346607, 'learning_rate': 8.801742919389979e-06, 'epoch': 45.6}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 20941/22950 [34:59<03:21,  9.98it/s]

{'loss': 0.0158, 'grad_norm': 0.00038663839222863317, 'learning_rate': 8.758169934640522e-06, 'epoch': 45.62}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20951/22950 [35:00<03:13, 10.33it/s]

{'loss': 0.0119, 'grad_norm': 0.00030421483097597957, 'learning_rate': 8.714596949891069e-06, 'epoch': 45.64}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20961/22950 [35:01<03:16, 10.11it/s]

{'loss': 0.0, 'grad_norm': 0.0005906570004299283, 'learning_rate': 8.671023965141612e-06, 'epoch': 45.66}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20971/22950 [35:02<03:11, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.0003484254702925682, 'learning_rate': 8.627450980392157e-06, 'epoch': 45.69}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20981/22950 [35:03<03:17,  9.97it/s]

{'loss': 0.0, 'grad_norm': 0.0004989166627638042, 'learning_rate': 8.583877995642702e-06, 'epoch': 45.71}


 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 20991/22950 [35:04<03:09, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.000375063595129177, 'learning_rate': 8.540305010893247e-06, 'epoch': 45.73}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21001/22950 [35:05<03:14, 10.05it/s]

{'loss': 0.0, 'grad_norm': 0.0007165621500462294, 'learning_rate': 8.496732026143791e-06, 'epoch': 45.75}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21011/22950 [35:06<03:07, 10.34it/s]

{'loss': 0.0, 'grad_norm': 0.0005448017036542296, 'learning_rate': 8.453159041394336e-06, 'epoch': 45.77}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21021/22950 [35:07<03:12, 10.04it/s]

{'loss': 0.0, 'grad_norm': 0.0006943544140085578, 'learning_rate': 8.409586056644881e-06, 'epoch': 45.8}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21031/22950 [35:08<03:05, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.00039212428964674473, 'learning_rate': 8.366013071895424e-06, 'epoch': 45.82}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21041/22950 [35:09<03:06, 10.26it/s]

{'loss': 0.0, 'grad_norm': 0.0004860138287767768, 'learning_rate': 8.32244008714597e-06, 'epoch': 45.84}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21051/22950 [35:10<03:02, 10.43it/s]

{'loss': 0.0, 'grad_norm': 0.002028387738391757, 'learning_rate': 8.278867102396515e-06, 'epoch': 45.86}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21062/22950 [35:11<03:02, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.0003950722166337073, 'learning_rate': 8.23529411764706e-06, 'epoch': 45.88}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21072/22950 [35:12<03:01, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.0005317333852872252, 'learning_rate': 8.191721132897603e-06, 'epoch': 45.9}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21082/22950 [35:13<03:01, 10.28it/s]

{'loss': 0.0, 'grad_norm': 0.00045017481897957623, 'learning_rate': 8.14814814814815e-06, 'epoch': 45.93}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21092/22950 [35:14<02:57, 10.47it/s]

{'loss': 0.0, 'grad_norm': 0.0007757680141367018, 'learning_rate': 8.104575163398693e-06, 'epoch': 45.95}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21101/22950 [35:15<03:07,  9.86it/s]

{'loss': 0.0, 'grad_norm': 0.0005480714607983828, 'learning_rate': 8.061002178649239e-06, 'epoch': 45.97}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21111/22950 [35:16<02:57, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.000774012878537178, 'learning_rate': 8.017429193899782e-06, 'epoch': 45.99}


                                                     
 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21114/22950 [35:19<02:54, 10.54it/s]

{'eval_loss': 1.2901488542556763, 'eval_accuracy': 0.8504902124404907, 'eval_runtime': 2.8566, 'eval_samples_per_second': 142.829, 'eval_steps_per_second': 17.854, 'epoch': 46.0}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21121/22950 [35:20<07:37,  4.00it/s]

{'loss': 0.0, 'grad_norm': 0.004239191301167011, 'learning_rate': 7.973856209150327e-06, 'epoch': 46.01}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21132/22950 [35:21<03:30,  8.65it/s]

{'loss': 0.0, 'grad_norm': 0.0003875953843817115, 'learning_rate': 7.930283224400872e-06, 'epoch': 46.03}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21142/22950 [35:22<03:03,  9.86it/s]

{'loss': 0.0, 'grad_norm': 0.0003774539509322494, 'learning_rate': 7.886710239651417e-06, 'epoch': 46.06}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21152/22950 [35:23<02:54, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.0003265703853685409, 'learning_rate': 7.84313725490196e-06, 'epoch': 46.08}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21162/22950 [35:24<02:58, 10.03it/s]

{'loss': 0.0, 'grad_norm': 0.0004860515473410487, 'learning_rate': 7.799564270152506e-06, 'epoch': 46.1}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21172/22950 [35:25<02:50, 10.44it/s]

{'loss': 0.0096, 'grad_norm': 0.0005356032634153962, 'learning_rate': 7.755991285403051e-06, 'epoch': 46.12}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21182/22950 [35:26<02:54, 10.10it/s]

{'loss': 0.0, 'grad_norm': 0.0005765004316344857, 'learning_rate': 7.712418300653596e-06, 'epoch': 46.14}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21192/22950 [35:27<02:48, 10.46it/s]

{'loss': 0.0, 'grad_norm': 0.0004184365679975599, 'learning_rate': 7.66884531590414e-06, 'epoch': 46.17}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21202/22950 [35:28<02:52, 10.11it/s]

{'loss': 0.0, 'grad_norm': 0.0018361045513302088, 'learning_rate': 7.6252723311546845e-06, 'epoch': 46.19}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21212/22950 [35:29<02:46, 10.44it/s]

{'loss': 0.0, 'grad_norm': 0.00032818870386108756, 'learning_rate': 7.581699346405229e-06, 'epoch': 46.21}


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21222/22950 [35:30<02:48, 10.26it/s]

{'loss': 0.0, 'grad_norm': 0.00044897981570102274, 'learning_rate': 7.538126361655775e-06, 'epoch': 46.23}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21232/22950 [35:31<02:45, 10.41it/s]

{'loss': 0.0, 'grad_norm': 0.00040420799632556736, 'learning_rate': 7.494553376906319e-06, 'epoch': 46.25}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21242/22950 [35:32<02:47, 10.21it/s]

{'loss': 0.0003, 'grad_norm': 0.00040135433664545417, 'learning_rate': 7.450980392156863e-06, 'epoch': 46.27}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21252/22950 [35:32<02:42, 10.44it/s]

{'loss': 0.0, 'grad_norm': 0.0004976344062015414, 'learning_rate': 7.4074074074074075e-06, 'epoch': 46.3}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21262/22950 [35:33<02:43, 10.30it/s]

{'loss': 0.0, 'grad_norm': 0.00033968707430176437, 'learning_rate': 7.3638344226579534e-06, 'epoch': 46.32}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21272/22950 [35:34<02:40, 10.47it/s]

{'loss': 0.0, 'grad_norm': 0.0003905630437657237, 'learning_rate': 7.320261437908498e-06, 'epoch': 46.34}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21282/22950 [35:35<02:45, 10.05it/s]

{'loss': 0.0, 'grad_norm': 0.00035004023811779916, 'learning_rate': 7.276688453159042e-06, 'epoch': 46.36}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21292/22950 [35:36<02:38, 10.45it/s]

{'loss': 0.0, 'grad_norm': 0.00045587291242554784, 'learning_rate': 7.233115468409586e-06, 'epoch': 46.38}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21302/22950 [35:37<02:39, 10.31it/s]

{'loss': 0.0, 'grad_norm': 0.0003700028464663774, 'learning_rate': 7.1895424836601305e-06, 'epoch': 46.41}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21312/22950 [35:38<02:35, 10.54it/s]

{'loss': 0.0094, 'grad_norm': 0.0004038480401504785, 'learning_rate': 7.145969498910676e-06, 'epoch': 46.43}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21321/22950 [35:39<02:41, 10.08it/s]

{'loss': 0.0001, 'grad_norm': 0.00046680873492732644, 'learning_rate': 7.102396514161221e-06, 'epoch': 46.45}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21331/22950 [35:40<02:34, 10.46it/s]

{'loss': 0.0, 'grad_norm': 0.0009256494231522083, 'learning_rate': 7.058823529411765e-06, 'epoch': 46.47}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21341/22950 [35:41<02:34, 10.40it/s]

{'loss': 0.0, 'grad_norm': 0.0004549597797449678, 'learning_rate': 7.015250544662309e-06, 'epoch': 46.49}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21351/22950 [35:42<02:32, 10.52it/s]

{'loss': 0.0, 'grad_norm': 0.00043018755968660116, 'learning_rate': 6.971677559912855e-06, 'epoch': 46.51}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21361/22950 [35:43<02:35, 10.23it/s]

{'loss': 0.0, 'grad_norm': 0.0004060847859364003, 'learning_rate': 6.928104575163399e-06, 'epoch': 46.54}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21371/22950 [35:44<02:30, 10.49it/s]

{'loss': 0.0, 'grad_norm': 0.0003622522053774446, 'learning_rate': 6.884531590413944e-06, 'epoch': 46.56}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21381/22950 [35:45<02:32, 10.31it/s]

{'loss': 0.0135, 'grad_norm': 0.00044397960300557315, 'learning_rate': 6.840958605664488e-06, 'epoch': 46.58}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21391/22950 [35:46<02:28, 10.47it/s]

{'loss': 0.0, 'grad_norm': 0.0006148067186586559, 'learning_rate': 6.797385620915032e-06, 'epoch': 46.6}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21401/22950 [35:47<02:31, 10.20it/s]

{'loss': 0.0, 'grad_norm': 0.00033943349262699485, 'learning_rate': 6.753812636165578e-06, 'epoch': 46.62}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21411/22950 [35:48<02:25, 10.60it/s]

{'loss': 0.0, 'grad_norm': 0.00043541897321119905, 'learning_rate': 6.710239651416122e-06, 'epoch': 46.64}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21421/22950 [35:49<02:28, 10.29it/s]

{'loss': 0.0, 'grad_norm': 0.0003422748704906553, 'learning_rate': 6.666666666666667e-06, 'epoch': 46.67}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21431/22950 [35:50<02:25, 10.45it/s]

{'loss': 0.0004, 'grad_norm': 0.0007291902438737452, 'learning_rate': 6.623093681917211e-06, 'epoch': 46.69}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21441/22950 [35:51<02:28, 10.14it/s]

{'loss': 0.0, 'grad_norm': 0.000324607128277421, 'learning_rate': 6.579520697167757e-06, 'epoch': 46.71}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21451/22950 [35:52<02:21, 10.56it/s]

{'loss': 0.0, 'grad_norm': 0.0005062472191639245, 'learning_rate': 6.535947712418301e-06, 'epoch': 46.73}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21461/22950 [35:53<02:23, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.0003312489716336131, 'learning_rate': 6.492374727668845e-06, 'epoch': 46.75}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21471/22950 [35:54<02:20, 10.55it/s]

{'loss': 0.0, 'grad_norm': 0.000473723717732355, 'learning_rate': 6.44880174291939e-06, 'epoch': 46.78}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21481/22950 [35:55<02:22, 10.29it/s]

{'loss': 0.0053, 'grad_norm': 0.9544607996940613, 'learning_rate': 6.405228758169935e-06, 'epoch': 46.8}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21491/22950 [35:55<02:19, 10.46it/s]

{'loss': 0.0076, 'grad_norm': 0.0004306183836888522, 'learning_rate': 6.36165577342048e-06, 'epoch': 46.82}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21501/22950 [35:56<02:21, 10.23it/s]

{'loss': 0.0, 'grad_norm': 0.00037511123809963465, 'learning_rate': 6.318082788671024e-06, 'epoch': 46.84}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 21511/22950 [35:57<02:16, 10.55it/s]

{'loss': 0.0, 'grad_norm': 0.0004759737057611346, 'learning_rate': 6.274509803921569e-06, 'epoch': 46.86}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21521/22950 [35:58<02:18, 10.30it/s]

{'loss': 0.0, 'grad_norm': 0.0005040353862568736, 'learning_rate': 6.2309368191721135e-06, 'epoch': 46.88}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21531/22950 [35:59<02:17, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.0005506358575075865, 'learning_rate': 6.1873638344226586e-06, 'epoch': 46.91}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21541/22950 [36:00<02:17, 10.24it/s]

{'loss': 0.0, 'grad_norm': 0.0008155523100867867, 'learning_rate': 6.143790849673203e-06, 'epoch': 46.93}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21551/22950 [36:01<02:13, 10.51it/s]

{'loss': 0.0, 'grad_norm': 0.0002602126041892916, 'learning_rate': 6.100217864923748e-06, 'epoch': 46.95}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21561/22950 [36:02<02:15, 10.28it/s]

{'loss': 0.0, 'grad_norm': 0.0005415822379291058, 'learning_rate': 6.056644880174292e-06, 'epoch': 46.97}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21571/22950 [36:03<02:11, 10.50it/s]

{'loss': 0.0001, 'grad_norm': 0.0003670481382869184, 'learning_rate': 6.013071895424837e-06, 'epoch': 46.99}


                                                     
 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21573/22950 [36:06<02:02, 11.21it/s]

{'eval_loss': 1.2975338697433472, 'eval_accuracy': 0.8553921580314636, 'eval_runtime': 2.8301, 'eval_samples_per_second': 144.164, 'eval_steps_per_second': 18.021, 'epoch': 47.0}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21581/22950 [36:07<05:41,  4.01it/s]

{'loss': 0.0, 'grad_norm': 0.00048469824832864106, 'learning_rate': 5.9694989106753816e-06, 'epoch': 47.02}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21591/22950 [36:08<02:40,  8.48it/s]

{'loss': 0.0, 'grad_norm': 0.000521537265740335, 'learning_rate': 5.925925925925927e-06, 'epoch': 47.04}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21601/22950 [36:09<02:13, 10.11it/s]

{'loss': 0.0, 'grad_norm': 0.0004458077310118824, 'learning_rate': 5.882352941176471e-06, 'epoch': 47.06}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21611/22950 [36:10<02:09, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.0004525862750597298, 'learning_rate': 5.838779956427016e-06, 'epoch': 47.08}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21621/22950 [36:11<02:09, 10.24it/s]

{'loss': 0.0056, 'grad_norm': 0.0004872328427154571, 'learning_rate': 5.79520697167756e-06, 'epoch': 47.1}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21631/22950 [36:12<02:05, 10.54it/s]

{'loss': 0.0, 'grad_norm': 0.000462343537947163, 'learning_rate': 5.7516339869281045e-06, 'epoch': 47.12}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21641/22950 [36:13<02:06, 10.39it/s]

{'loss': 0.0, 'grad_norm': 0.00039536633994430304, 'learning_rate': 5.70806100217865e-06, 'epoch': 47.15}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21651/22950 [36:14<02:04, 10.40it/s]

{'loss': 0.0, 'grad_norm': 0.0002921765553764999, 'learning_rate': 5.664488017429194e-06, 'epoch': 47.17}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21661/22950 [36:15<02:05, 10.31it/s]

{'loss': 0.0, 'grad_norm': 0.00029950335738249123, 'learning_rate': 5.620915032679739e-06, 'epoch': 47.19}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21671/22950 [36:16<02:01, 10.53it/s]

{'loss': 0.0, 'grad_norm': 0.0003456211125012487, 'learning_rate': 5.577342047930283e-06, 'epoch': 47.21}


 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21681/22950 [36:17<02:03, 10.30it/s]

{'loss': 0.0, 'grad_norm': 0.00033592505496926606, 'learning_rate': 5.533769063180828e-06, 'epoch': 47.23}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21691/22950 [36:18<01:59, 10.52it/s]

{'loss': 0.0, 'grad_norm': 0.0003673332103062421, 'learning_rate': 5.490196078431373e-06, 'epoch': 47.25}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21701/22950 [36:19<02:01, 10.26it/s]

{'loss': 0.0, 'grad_norm': 0.0002864862617570907, 'learning_rate': 5.446623093681918e-06, 'epoch': 47.28}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21711/22950 [36:20<01:57, 10.50it/s]

{'loss': 0.0115, 'grad_norm': 0.0003192789445165545, 'learning_rate': 5.403050108932462e-06, 'epoch': 47.3}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21721/22950 [36:21<02:00, 10.21it/s]

{'loss': 0.0, 'grad_norm': 0.0004194334615021944, 'learning_rate': 5.359477124183006e-06, 'epoch': 47.32}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21731/22950 [36:22<01:55, 10.52it/s]

{'loss': 0.0, 'grad_norm': 0.0004559276858344674, 'learning_rate': 5.315904139433551e-06, 'epoch': 47.34}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21741/22950 [36:23<02:02,  9.86it/s]

{'loss': 0.0, 'grad_norm': 0.0003528028610162437, 'learning_rate': 5.272331154684096e-06, 'epoch': 47.36}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21752/22950 [36:24<01:51, 10.74it/s]

{'loss': 0.0, 'grad_norm': 0.0004666815511882305, 'learning_rate': 5.228758169934641e-06, 'epoch': 47.39}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21762/22950 [36:25<01:55, 10.33it/s]

{'loss': 0.0001, 'grad_norm': 0.0157445278018713, 'learning_rate': 5.185185185185185e-06, 'epoch': 47.41}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21772/22950 [36:26<01:51, 10.55it/s]

{'loss': 0.0, 'grad_norm': 0.00035162916174158454, 'learning_rate': 5.14161220043573e-06, 'epoch': 47.43}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21782/22950 [36:27<01:53, 10.31it/s]

{'loss': 0.0, 'grad_norm': 0.0004084893153049052, 'learning_rate': 5.098039215686274e-06, 'epoch': 47.45}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21792/22950 [36:28<01:50, 10.44it/s]

{'loss': 0.0, 'grad_norm': 0.0003207464178558439, 'learning_rate': 5.0544662309368195e-06, 'epoch': 47.47}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 21802/22950 [36:29<01:51, 10.27it/s]

{'loss': 0.0, 'grad_norm': 0.0009555743890814483, 'learning_rate': 5.010893246187364e-06, 'epoch': 47.49}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21812/22950 [36:30<01:48, 10.52it/s]

{'loss': 0.0001, 'grad_norm': 0.001582111930474639, 'learning_rate': 4.967320261437909e-06, 'epoch': 47.52}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21822/22950 [36:31<01:49, 10.26it/s]

{'loss': 0.0, 'grad_norm': 0.000323437707265839, 'learning_rate': 4.923747276688453e-06, 'epoch': 47.54}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21832/22950 [36:32<01:46, 10.54it/s]

{'loss': 0.0001, 'grad_norm': 0.0003715470083989203, 'learning_rate': 4.880174291938998e-06, 'epoch': 47.56}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21842/22950 [36:33<01:46, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.00031391752418130636, 'learning_rate': 4.836601307189543e-06, 'epoch': 47.58}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21852/22950 [36:33<01:43, 10.65it/s]

{'loss': 0.0, 'grad_norm': 0.00039512739749625325, 'learning_rate': 4.7930283224400875e-06, 'epoch': 47.6}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21862/22950 [36:34<01:45, 10.29it/s]

{'loss': 0.0021, 'grad_norm': 0.0003237849159631878, 'learning_rate': 4.749455337690633e-06, 'epoch': 47.63}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21872/22950 [36:35<01:42, 10.52it/s]

{'loss': 0.0, 'grad_norm': 0.0005895866779610515, 'learning_rate': 4.705882352941177e-06, 'epoch': 47.65}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21882/22950 [36:36<01:43, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.00021372611809056252, 'learning_rate': 4.662309368191722e-06, 'epoch': 47.67}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21892/22950 [36:37<01:42, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.00041001805220730603, 'learning_rate': 4.618736383442266e-06, 'epoch': 47.69}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21902/22950 [36:38<01:41, 10.33it/s]

{'loss': 0.0, 'grad_norm': 0.00039165065390989184, 'learning_rate': 4.5751633986928105e-06, 'epoch': 47.71}


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21912/22950 [36:39<01:39, 10.43it/s]

{'loss': 0.0, 'grad_norm': 0.0005747514078393579, 'learning_rate': 4.531590413943356e-06, 'epoch': 47.73}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21922/22950 [36:40<01:41, 10.17it/s]

{'loss': 0.0, 'grad_norm': 0.00039467349415645003, 'learning_rate': 4.4880174291939e-06, 'epoch': 47.76}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21932/22950 [36:41<01:37, 10.42it/s]

{'loss': 0.0, 'grad_norm': 0.0003203933301847428, 'learning_rate': 4.444444444444445e-06, 'epoch': 47.78}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21942/22950 [36:42<01:38, 10.22it/s]

{'loss': 0.0, 'grad_norm': 0.0002880838292185217, 'learning_rate': 4.400871459694989e-06, 'epoch': 47.8}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21952/22950 [36:43<01:34, 10.61it/s]

{'loss': 0.0, 'grad_norm': 0.00034388265339657664, 'learning_rate': 4.357298474945534e-06, 'epoch': 47.82}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21962/22950 [36:44<01:36, 10.25it/s]

{'loss': 0.0, 'grad_norm': 0.0003390924830455333, 'learning_rate': 4.313725490196079e-06, 'epoch': 47.84}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21972/22950 [36:45<01:32, 10.55it/s]

{'loss': 0.0, 'grad_norm': 0.00046079023741185665, 'learning_rate': 4.270152505446624e-06, 'epoch': 47.86}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21982/22950 [36:46<01:33, 10.34it/s]

{'loss': 0.0, 'grad_norm': 0.0003888586943503469, 'learning_rate': 4.226579520697168e-06, 'epoch': 47.89}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 21992/22950 [36:47<01:31, 10.53it/s]

{'loss': 0.0, 'grad_norm': 0.0008484281715936959, 'learning_rate': 4.183006535947712e-06, 'epoch': 47.91}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 22002/22950 [36:48<01:31, 10.37it/s]

{'loss': 0.0, 'grad_norm': 0.00039963232120499015, 'learning_rate': 4.139433551198257e-06, 'epoch': 47.93}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 22012/22950 [36:49<01:28, 10.57it/s]

{'loss': 0.0149, 'grad_norm': 0.00037380080902948976, 'learning_rate': 4.095860566448802e-06, 'epoch': 47.95}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 22022/22950 [36:50<01:29, 10.37it/s]

{'loss': 0.0, 'grad_norm': 0.00037917806184850633, 'learning_rate': 4.052287581699347e-06, 'epoch': 47.97}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 22032/22950 [36:51<01:23, 10.95it/s]

{'loss': 0.0, 'grad_norm': 0.0003915127017535269, 'learning_rate': 4.008714596949891e-06, 'epoch': 48.0}


                                                     
 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 22032/22950 [36:54<01:23, 10.95it/s]

{'eval_loss': 1.3166900873184204, 'eval_accuracy': 0.8529411554336548, 'eval_runtime': 2.7955, 'eval_samples_per_second': 145.95, 'eval_steps_per_second': 18.244, 'epoch': 48.0}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 22042/22950 [36:55<03:02,  4.99it/s]

{'loss': 0.0, 'grad_norm': 0.00047144314157776535, 'learning_rate': 3.965141612200436e-06, 'epoch': 48.02}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 22051/22950 [36:56<01:46,  8.43it/s]

{'loss': 0.0, 'grad_norm': 0.0003218152851331979, 'learning_rate': 3.92156862745098e-06, 'epoch': 48.04}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 22061/22950 [36:57<01:29,  9.94it/s]

{'loss': 0.0041, 'grad_norm': 0.00029920495580881834, 'learning_rate': 3.8779956427015254e-06, 'epoch': 48.06}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 22071/22950 [36:58<01:27, 10.06it/s]

{'loss': 0.0, 'grad_norm': 0.00036061680293641984, 'learning_rate': 3.83442265795207e-06, 'epoch': 48.08}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 22081/22950 [36:59<01:22, 10.53it/s]

{'loss': 0.0, 'grad_norm': 0.00039160059532150626, 'learning_rate': 3.7908496732026144e-06, 'epoch': 48.1}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22091/22950 [37:00<01:23, 10.31it/s]

{'loss': 0.0, 'grad_norm': 0.0004043875669594854, 'learning_rate': 3.7472766884531595e-06, 'epoch': 48.13}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22101/22950 [37:01<01:22, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.0003255071060266346, 'learning_rate': 3.7037037037037037e-06, 'epoch': 48.15}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22111/22950 [37:02<01:20, 10.42it/s]

{'loss': 0.006, 'grad_norm': 0.0003408065822441131, 'learning_rate': 3.660130718954249e-06, 'epoch': 48.17}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22121/22950 [37:02<01:18, 10.58it/s]

{'loss': 0.0, 'grad_norm': 0.0005934697110205889, 'learning_rate': 3.616557734204793e-06, 'epoch': 48.19}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22131/22950 [37:03<01:21, 10.09it/s]

{'loss': 0.0, 'grad_norm': 0.0003787228197325021, 'learning_rate': 3.572984749455338e-06, 'epoch': 48.21}


 96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22141/22950 [37:04<01:16, 10.51it/s]

{'loss': 0.0, 'grad_norm': 0.00030899958801455796, 'learning_rate': 3.5294117647058825e-06, 'epoch': 48.24}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22151/22950 [37:05<01:17, 10.37it/s]

{'loss': 0.0, 'grad_norm': 0.00031312316423282027, 'learning_rate': 3.4858387799564276e-06, 'epoch': 48.26}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22161/22950 [37:06<01:15, 10.44it/s]

{'loss': 0.0, 'grad_norm': 0.0003221811493858695, 'learning_rate': 3.442265795206972e-06, 'epoch': 48.28}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22171/22950 [37:07<01:15, 10.27it/s]

{'loss': 0.0, 'grad_norm': 0.0003201642830390483, 'learning_rate': 3.398692810457516e-06, 'epoch': 48.3}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22181/22950 [37:08<01:13, 10.47it/s]

{'loss': 0.0, 'grad_norm': 0.00038652250077575445, 'learning_rate': 3.355119825708061e-06, 'epoch': 48.32}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22191/22950 [37:09<01:12, 10.41it/s]

{'loss': 0.0, 'grad_norm': 0.0003145512891933322, 'learning_rate': 3.3115468409586055e-06, 'epoch': 48.34}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22201/22950 [37:10<01:11, 10.43it/s]

{'loss': 0.0, 'grad_norm': 0.0002991177316289395, 'learning_rate': 3.2679738562091506e-06, 'epoch': 48.37}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22211/22950 [37:11<01:11, 10.34it/s]

{'loss': 0.0, 'grad_norm': 0.00035982992267236114, 'learning_rate': 3.224400871459695e-06, 'epoch': 48.39}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22221/22950 [37:12<01:09, 10.51it/s]

{'loss': 0.0, 'grad_norm': 0.0004328207578510046, 'learning_rate': 3.18082788671024e-06, 'epoch': 48.41}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22231/22950 [37:13<01:09, 10.31it/s]

{'loss': 0.0, 'grad_norm': 0.0002984381280839443, 'learning_rate': 3.1372549019607846e-06, 'epoch': 48.43}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22241/22950 [37:14<01:07, 10.43it/s]

{'loss': 0.0, 'grad_norm': 0.00029505282873287797, 'learning_rate': 3.0936819172113293e-06, 'epoch': 48.45}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22251/22950 [37:15<01:07, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.00044257473200559616, 'learning_rate': 3.050108932461874e-06, 'epoch': 48.47}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22261/22950 [37:16<01:06, 10.34it/s]

{'loss': 0.0, 'grad_norm': 0.0003779582038987428, 'learning_rate': 3.0065359477124186e-06, 'epoch': 48.5}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22271/22950 [37:17<01:06, 10.15it/s]

{'loss': 0.0, 'grad_norm': 0.0003245534608140588, 'learning_rate': 2.9629629629629633e-06, 'epoch': 48.52}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22281/22950 [37:18<01:03, 10.55it/s]

{'loss': 0.0, 'grad_norm': 0.0025488294195383787, 'learning_rate': 2.919389978213508e-06, 'epoch': 48.54}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22291/22950 [37:19<01:04, 10.25it/s]

{'loss': 0.0, 'grad_norm': 0.00029079162050038576, 'learning_rate': 2.8758169934640523e-06, 'epoch': 48.56}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22301/22950 [37:20<01:02, 10.46it/s]

{'loss': 0.0, 'grad_norm': 0.0003713628393597901, 'learning_rate': 2.832244008714597e-06, 'epoch': 48.58}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22311/22950 [37:21<01:02, 10.23it/s]

{'loss': 0.0, 'grad_norm': 0.0003604304511100054, 'learning_rate': 2.7886710239651416e-06, 'epoch': 48.61}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22321/22950 [37:22<00:59, 10.54it/s]

{'loss': 0.0, 'grad_norm': 0.00043920520693063736, 'learning_rate': 2.7450980392156863e-06, 'epoch': 48.63}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22331/22950 [37:23<00:59, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.0016216514632105827, 'learning_rate': 2.701525054466231e-06, 'epoch': 48.65}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22341/22950 [37:24<00:59, 10.28it/s]

{'loss': 0.0005, 'grad_norm': 0.0003445831243880093, 'learning_rate': 2.6579520697167757e-06, 'epoch': 48.67}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22351/22950 [37:25<00:57, 10.34it/s]

{'loss': 0.0, 'grad_norm': 0.00037011559470556676, 'learning_rate': 2.6143790849673204e-06, 'epoch': 48.69}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22361/22950 [37:26<00:56, 10.48it/s]

{'loss': 0.0, 'grad_norm': 0.00031777305412106216, 'learning_rate': 2.570806100217865e-06, 'epoch': 48.71}


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 22371/22950 [37:26<00:55, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.00029846938559785485, 'learning_rate': 2.5272331154684097e-06, 'epoch': 48.74}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22381/22950 [37:27<00:54, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.0004308809875510633, 'learning_rate': 2.4836601307189544e-06, 'epoch': 48.76}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22391/22950 [37:28<00:53, 10.54it/s]

{'loss': 0.0, 'grad_norm': 0.00031280642724595964, 'learning_rate': 2.440087145969499e-06, 'epoch': 48.78}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22401/22950 [37:29<00:53, 10.24it/s]

{'loss': 0.0152, 'grad_norm': 0.0003723752743098885, 'learning_rate': 2.3965141612200438e-06, 'epoch': 48.8}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22411/22950 [37:30<00:51, 10.42it/s]

{'loss': 0.0, 'grad_norm': 0.00039407488657161593, 'learning_rate': 2.3529411764705885e-06, 'epoch': 48.82}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22421/22950 [37:31<00:51, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.00033161177998408675, 'learning_rate': 2.309368191721133e-06, 'epoch': 48.85}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22431/22950 [37:32<00:49, 10.56it/s]

{'loss': 0.0, 'grad_norm': 0.0004121317761018872, 'learning_rate': 2.265795206971678e-06, 'epoch': 48.87}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22441/22950 [37:33<00:48, 10.45it/s]

{'loss': 0.0, 'grad_norm': 0.0003685773990582675, 'learning_rate': 2.2222222222222225e-06, 'epoch': 48.89}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22451/22950 [37:34<00:47, 10.48it/s]

{'loss': 0.0, 'grad_norm': 0.0003006580227520317, 'learning_rate': 2.178649237472767e-06, 'epoch': 48.91}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22461/22950 [37:35<00:46, 10.51it/s]

{'loss': 0.0, 'grad_norm': 0.00033828147570602596, 'learning_rate': 2.135076252723312e-06, 'epoch': 48.93}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22471/22950 [37:36<00:46, 10.32it/s]

{'loss': 0.0, 'grad_norm': 0.00029300199821591377, 'learning_rate': 2.091503267973856e-06, 'epoch': 48.95}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22481/22950 [37:37<00:45, 10.38it/s]

{'loss': 0.0114, 'grad_norm': 0.00023973430506885052, 'learning_rate': 2.047930283224401e-06, 'epoch': 48.98}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22491/22950 [37:38<00:41, 10.94it/s]

{'loss': 0.0, 'grad_norm': 0.0003294879861641675, 'learning_rate': 2.0043572984749455e-06, 'epoch': 49.0}


                                                     
 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22491/22950 [37:41<00:41, 10.94it/s]

{'eval_loss': 1.3538857698440552, 'eval_accuracy': 0.8578431606292725, 'eval_runtime': 2.7818, 'eval_samples_per_second': 146.668, 'eval_steps_per_second': 18.334, 'epoch': 49.0}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22501/22950 [37:42<01:31,  4.93it/s]

{'loss': 0.012, 'grad_norm': 0.0003792168281506747, 'learning_rate': 1.96078431372549e-06, 'epoch': 49.02}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22511/22950 [37:43<00:48,  9.00it/s]

{'loss': 0.007, 'grad_norm': 0.00038326779031194746, 'learning_rate': 1.917211328976035e-06, 'epoch': 49.04}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22521/22950 [37:44<00:42, 10.09it/s]

{'loss': 0.0, 'grad_norm': 0.0003731226606760174, 'learning_rate': 1.8736383442265797e-06, 'epoch': 49.06}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22531/22950 [37:45<00:39, 10.51it/s]

{'loss': 0.0, 'grad_norm': 0.0003742629196494818, 'learning_rate': 1.8300653594771244e-06, 'epoch': 49.08}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22541/22950 [37:46<00:39, 10.38it/s]

{'loss': 0.0, 'grad_norm': 0.0003622179210651666, 'learning_rate': 1.786492374727669e-06, 'epoch': 49.11}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22551/22950 [37:47<00:38, 10.39it/s]

{'loss': 0.0, 'grad_norm': 0.000402226549340412, 'learning_rate': 1.7429193899782138e-06, 'epoch': 49.13}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22561/22950 [37:48<00:37, 10.37it/s]

{'loss': 0.0, 'grad_norm': 0.00030786110437475145, 'learning_rate': 1.699346405228758e-06, 'epoch': 49.15}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22571/22950 [37:49<00:35, 10.54it/s]

{'loss': 0.0, 'grad_norm': 0.0003054918779525906, 'learning_rate': 1.6557734204793027e-06, 'epoch': 49.17}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22581/22950 [37:50<00:36, 10.18it/s]

{'loss': 0.0, 'grad_norm': 0.00033016124507412314, 'learning_rate': 1.6122004357298474e-06, 'epoch': 49.19}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22591/22950 [37:51<00:34, 10.45it/s]

{'loss': 0.0, 'grad_norm': 0.00034431664971634746, 'learning_rate': 1.5686274509803923e-06, 'epoch': 49.22}


 98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22601/22950 [37:52<00:33, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.0016408710507676005, 'learning_rate': 1.525054466230937e-06, 'epoch': 49.24}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22611/22950 [37:53<00:32, 10.43it/s]

{'loss': 0.0151, 'grad_norm': 0.00036620182800106704, 'learning_rate': 1.4814814814814817e-06, 'epoch': 49.26}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22621/22950 [37:54<00:31, 10.31it/s]

{'loss': 0.0, 'grad_norm': 0.00037287521990947425, 'learning_rate': 1.4379084967320261e-06, 'epoch': 49.28}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22631/22950 [37:55<00:30, 10.55it/s]

{'loss': 0.0, 'grad_norm': 0.00029095617355778813, 'learning_rate': 1.3943355119825708e-06, 'epoch': 49.3}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22641/22950 [37:55<00:29, 10.33it/s]

{'loss': 0.0, 'grad_norm': 0.0004499276401475072, 'learning_rate': 1.3507625272331155e-06, 'epoch': 49.32}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22651/22950 [37:56<00:28, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.0003422719310037792, 'learning_rate': 1.3071895424836602e-06, 'epoch': 49.35}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 22661/22950 [37:57<00:27, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.00039939736598171294, 'learning_rate': 1.2636165577342049e-06, 'epoch': 49.37}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22671/22950 [37:58<00:26, 10.54it/s]

{'loss': 0.0, 'grad_norm': 0.00033051214995794, 'learning_rate': 1.2200435729847495e-06, 'epoch': 49.39}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22681/22950 [37:59<00:25, 10.38it/s]

{'loss': 0.0065, 'grad_norm': 0.00032471230952069163, 'learning_rate': 1.1764705882352942e-06, 'epoch': 49.41}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22691/22950 [38:00<00:24, 10.48it/s]

{'loss': 0.0, 'grad_norm': 0.00035649488563649356, 'learning_rate': 1.132897603485839e-06, 'epoch': 49.43}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22701/22950 [38:01<00:24, 10.23it/s]

{'loss': 0.0, 'grad_norm': 0.00028060926706530154, 'learning_rate': 1.0893246187363836e-06, 'epoch': 49.46}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22711/22950 [38:02<00:22, 10.59it/s]

{'loss': 0.0, 'grad_norm': 0.00031351656070910394, 'learning_rate': 1.045751633986928e-06, 'epoch': 49.48}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22721/22950 [38:03<00:22, 10.28it/s]

{'loss': 0.0139, 'grad_norm': 1.7340549230575562, 'learning_rate': 1.0021786492374727e-06, 'epoch': 49.5}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22731/22950 [38:04<00:20, 10.54it/s]

{'loss': 0.0, 'grad_norm': 0.001632227562367916, 'learning_rate': 9.586056644880174e-07, 'epoch': 49.52}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22741/22950 [38:05<00:20, 10.31it/s]

{'loss': 0.0, 'grad_norm': 0.000294598430627957, 'learning_rate': 9.150326797385622e-07, 'epoch': 49.54}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22751/22950 [38:06<00:19, 10.45it/s]

{'loss': 0.0001, 'grad_norm': 0.0003652842715382576, 'learning_rate': 8.714596949891069e-07, 'epoch': 49.56}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22761/22950 [38:07<00:18, 10.45it/s]

{'loss': 0.0, 'grad_norm': 0.000530935067217797, 'learning_rate': 8.278867102396514e-07, 'epoch': 49.59}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22771/22950 [38:08<00:17, 10.44it/s]

{'loss': 0.0, 'grad_norm': 0.0003123390779364854, 'learning_rate': 7.843137254901962e-07, 'epoch': 49.61}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22781/22950 [38:09<00:16, 10.18it/s]

{'loss': 0.0, 'grad_norm': 0.00033441203413531184, 'learning_rate': 7.407407407407408e-07, 'epoch': 49.63}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22791/22950 [38:10<00:15, 10.57it/s]

{'loss': 0.0, 'grad_norm': 0.0002729314146563411, 'learning_rate': 6.971677559912854e-07, 'epoch': 49.65}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22801/22950 [38:11<00:14, 10.33it/s]

{'loss': 0.0, 'grad_norm': 0.00030539146973751485, 'learning_rate': 6.535947712418301e-07, 'epoch': 49.67}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22811/22950 [38:12<00:13, 10.40it/s]

{'loss': 0.0, 'grad_norm': 0.00034811589284799993, 'learning_rate': 6.100217864923748e-07, 'epoch': 49.69}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22821/22950 [38:13<00:12, 10.19it/s]

{'loss': 0.0, 'grad_norm': 0.00047099796938709915, 'learning_rate': 5.664488017429195e-07, 'epoch': 49.72}


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22831/22950 [38:14<00:11, 10.59it/s]

{'loss': 0.0, 'grad_norm': 0.0004568697477225214, 'learning_rate': 5.22875816993464e-07, 'epoch': 49.74}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22841/22950 [38:15<00:10, 10.17it/s]

{'loss': 0.0, 'grad_norm': 0.0003311081090942025, 'learning_rate': 4.793028322440087e-07, 'epoch': 49.76}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22851/22950 [38:16<00:09, 10.54it/s]

{'loss': 0.0, 'grad_norm': 0.0003933681873604655, 'learning_rate': 4.3572984749455345e-07, 'epoch': 49.78}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22861/22950 [38:17<00:08, 10.36it/s]

{'loss': 0.0, 'grad_norm': 0.0003333955828566104, 'learning_rate': 3.921568627450981e-07, 'epoch': 49.8}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22871/22950 [38:18<00:07, 10.28it/s]

{'loss': 0.0, 'grad_norm': 0.0005715012084692717, 'learning_rate': 3.485838779956427e-07, 'epoch': 49.83}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22881/22950 [38:18<00:06, 10.29it/s]

{'loss': 0.0, 'grad_norm': 0.0004867271345574409, 'learning_rate': 3.050108932461874e-07, 'epoch': 49.85}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22891/22950 [38:19<00:05, 10.53it/s]

{'loss': 0.0, 'grad_norm': 0.0002780702488962561, 'learning_rate': 2.61437908496732e-07, 'epoch': 49.87}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22901/22950 [38:20<00:04, 10.13it/s]

{'loss': 0.0, 'grad_norm': 0.00031006880453787744, 'learning_rate': 2.1786492374727672e-07, 'epoch': 49.89}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22911/22950 [38:21<00:03, 10.59it/s]

{'loss': 0.0, 'grad_norm': 0.0002765099925454706, 'learning_rate': 1.7429193899782135e-07, 'epoch': 49.91}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22921/22950 [38:22<00:02, 10.33it/s]

{'loss': 0.0, 'grad_norm': 0.0006321017863228917, 'learning_rate': 1.30718954248366e-07, 'epoch': 49.93}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22931/22950 [38:23<00:01, 10.29it/s]

{'loss': 0.0, 'grad_norm': 0.0003627659461926669, 'learning_rate': 8.714596949891068e-08, 'epoch': 49.96}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 22941/22950 [38:24<00:00, 10.35it/s]

{'loss': 0.0, 'grad_norm': 0.0003673289611469954, 'learning_rate': 4.357298474945534e-08, 'epoch': 49.98}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 22950/22950 [38:25<00:00, 10.47it/s]

{'loss': 0.0, 'grad_norm': 0.0004455955349840224, 'learning_rate': 0.0, 'epoch': 50.0}


                                                     
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 22950/22950 [38:28<00:00, 10.47it/s]

{'eval_loss': 1.348001480102539, 'eval_accuracy': 0.8529411554336548, 'eval_runtime': 2.7068, 'eval_samples_per_second': 150.73, 'eval_steps_per_second': 18.841, 'epoch': 50.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 22950/22950 [38:30<00:00,  9.93it/s]

{'train_runtime': 2310.0483, 'train_samples_per_second': 79.392, 'train_steps_per_second': 9.935, 'train_loss': 0.05217436972030999, 'epoch': 50.0}





TrainOutput(global_step=22950, training_loss=0.05217436972030999, metrics={'train_runtime': 2310.0483, 'train_samples_per_second': 79.392, 'train_steps_per_second': 9.935, 'total_flos': 1.2063641888256e+16, 'train_loss': 0.05217436972030999, 'epoch': 50.0})

In [127]:
results = trainer.evaluate()
print("Evaluation Results:", results)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:02<00:00, 24.44it/s]

Evaluation Results: {'eval_loss': 0.3803345859050751, 'eval_accuracy': 0.8602941036224365, 'eval_runtime': 2.2411, 'eval_samples_per_second': 182.057, 'eval_steps_per_second': 22.757, 'epoch': 50.0}





In [128]:
model.save_pretrained("./glue_mrpc_bert_model")
tokenizer.save_pretrained("./glue_mrpc_bert_tokenizer")

('./glue_mrpc_bert_tokenizer/tokenizer_config.json',
 './glue_mrpc_bert_tokenizer/special_tokens_map.json',
 './glue_mrpc_bert_tokenizer/vocab.txt',
 './glue_mrpc_bert_tokenizer/added_tokens.json',
 './glue_mrpc_bert_tokenizer/tokenizer.json')

In [129]:
predictions = trainer.predict(tokenized_datasets["validation"])

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:02<00:00, 24.41it/s]


In [130]:
predictions = trainer.predict(tokenized_datasets["validation"])

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:02<00:00, 24.35it/s]


In [131]:
import numpy as np
from sklearn.metrics import classification_report

labels = np.array(tokenized_datasets["validation"]["label"])
logits = predictions.predictions
y_pred = np.argmax(logits, axis=-1)

In [132]:
# Classification Report
class_report = classification_report(labels, y_pred)
print("Classification Report:")
print(class_report)

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.71      0.76       129
           1       0.88      0.93      0.90       279

    accuracy                           0.86       408
   macro avg       0.85      0.82      0.83       408
weighted avg       0.86      0.86      0.86       408



In [133]:
# Load the fine-tuned model and tokenizer
model_name = "./glue_mrpc_bert_model"  # Path to the saved fine-tuned model
tokenizer_name = "./glue_mrpc_bert_tokenizer"  # Path to the saved tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [134]:
sentence1 = "Ram is very happy"
sentence2 = "He is sad"


In [135]:
inputs = tokenizer(sentence1, sentence2, return_tensors="pt", padding="max_length", max_length=128, truncation=True)


In [136]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model.to(device)
inputs = {key: value.to(device) for key, value in inputs.items()}


In [137]:
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)



In [138]:
# Print the prediction
predicted_class = predictions.item()
class_names = ["Not Equivalent", "Equivalent"]
print(f"Predicted class: {predicted_class} ({class_names[predicted_class]})")

Predicted class: 0 (Not Equivalent)
