In [1]:
import torch
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForCausalLM, Trainer, TrainingArguments
torch.cuda.empty_cache()


  from .autonotebook import tqdm as notebook_tqdm





In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
dataset = load_dataset("code_search_net", "python",cache_dir="./Datasets", trust_remote_code=True)

In [4]:
dataset.column_names

{'train': ['repository_name',
  'func_path_in_repository',
  'func_name',
  'whole_func_string',
  'language',
  'func_code_string',
  'func_code_tokens',
  'func_documentation_string',
  'func_documentation_tokens',
  'split_name',
  'func_code_url'],
 'test': ['repository_name',
  'func_path_in_repository',
  'func_name',
  'whole_func_string',
  'language',
  'func_code_string',
  'func_code_tokens',
  'func_documentation_string',
  'func_documentation_tokens',
  'split_name',
  'func_code_url'],
 'validation': ['repository_name',
  'func_path_in_repository',
  'func_name',
  'whole_func_string',
  'language',
  'func_code_string',
  'func_code_tokens',
  'func_documentation_string',
  'func_documentation_tokens',
  'split_name',
  'func_code_url']}

In [5]:
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base", cache_dir="./Models")
model = RobertaForCausalLM.from_pretrained("microsoft/codebert-base", cache_dir="./Models",is_decoder = True).to(device)

Some weights of RobertaForCausalLM were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def tokenize_code(code_snippet, tokenizer):
    return tokenizer(
        code_snippet, max_length=512, truncation=True, padding='max_length', return_tensors="pt"
    )

In [7]:
def preprocess_data(batch, tokenizer):
    batch["input_ids"] = tokenize_code(batch["func_code_string"], tokenizer)["input_ids"]
    batch["labels"] = tokenize_code(batch["func_documentation_string"], tokenizer)["input_ids"]
    return batch

In [8]:
train_dataset = dataset["train"].shuffle(seed=42).select(range(10000)).map(lambda x: preprocess_data(x, tokenizer), batched=True)
eval_dataset = dataset["validation"].shuffle(seed=42).select(range(3000)).map(lambda x: preprocess_data(x, tokenizer), batched=True)

In [9]:
model

RobertaForCausalLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

In [10]:
training_args = TrainingArguments(
    output_dir="./codebert-finetuned-roberta/results",    # output directory
    num_train_epochs=20,                   # number of epochs
    per_device_train_batch_size=4,       # batch size
    per_device_eval_batch_size=4,        # eval batch size
    warmup_steps=1000,                     # warmup steps
    weight_decay=0.01,                    # weight decay
    logging_dir="./logs",                 # logging directory
    logging_steps=1000,
    evaluation_strategy="steps",          # Evaluation after every logging step
    save_total_limit=2,                   # Keep only last two checkpoints
    save_steps=1000,                       # Save model every 500 steps
    report_to="none",
    fp16=True,
    learning_rate=5e-5                 # No reports (e.g., to wandb)
)



In [11]:
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [12]:
trainer.train()

  0%|          | 0/18750 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  1%|          | 100/18750 [00:29<1:29:32,  3.47it/s]

{'loss': 13.6835, 'grad_norm': 41.94532012939453, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.08}


                                                     
  1%|          | 100/18750 [00:47<1:29:32,  3.47it/s]

{'eval_loss': 5.897005558013916, 'eval_runtime': 17.3952, 'eval_samples_per_second': 57.487, 'eval_steps_per_second': 14.372, 'epoch': 0.08}


  1%|          | 200/18750 [01:16<1:28:57,  3.48it/s] 

{'loss': 2.4638, 'grad_norm': 6.432319641113281, 'learning_rate': 9.750000000000002e-06, 'epoch': 0.16}


                                                     
  1%|          | 200/18750 [01:33<1:28:57,  3.48it/s]

{'eval_loss': 1.8412015438079834, 'eval_runtime': 17.0839, 'eval_samples_per_second': 58.535, 'eval_steps_per_second': 14.634, 'epoch': 0.16}


  2%|▏         | 300/18750 [02:02<1:29:21,  3.44it/s] 

{'loss': 1.6872, 'grad_norm': 17.521379470825195, 'learning_rate': 1.475e-05, 'epoch': 0.24}


                                                     
  2%|▏         | 300/18750 [02:19<1:29:21,  3.44it/s]

{'eval_loss': 1.5479098558425903, 'eval_runtime': 17.16, 'eval_samples_per_second': 58.275, 'eval_steps_per_second': 14.569, 'epoch': 0.24}


  2%|▏         | 400/18750 [02:48<1:28:49,  3.44it/s] 

{'loss': 1.2787, 'grad_norm': 15.012622833251953, 'learning_rate': 1.9750000000000002e-05, 'epoch': 0.32}


                                                     
  2%|▏         | 400/18750 [03:05<1:28:49,  3.44it/s]

{'eval_loss': 1.6730849742889404, 'eval_runtime': 17.1638, 'eval_samples_per_second': 58.262, 'eval_steps_per_second': 14.566, 'epoch': 0.32}


  3%|▎         | 500/18750 [03:34<1:27:55,  3.46it/s] 

{'loss': 1.2335, 'grad_norm': 9.981375694274902, 'learning_rate': 2.4750000000000002e-05, 'epoch': 0.4}


                                                     
  3%|▎         | 500/18750 [03:51<1:27:55,  3.46it/s]

{'eval_loss': 1.5102951526641846, 'eval_runtime': 17.1248, 'eval_samples_per_second': 58.395, 'eval_steps_per_second': 14.599, 'epoch': 0.4}


  3%|▎         | 600/18750 [04:20<1:26:48,  3.48it/s] 

{'loss': 1.2031, 'grad_norm': 3.7021894454956055, 'learning_rate': 2.975e-05, 'epoch': 0.48}


                                                     
  3%|▎         | 600/18750 [04:37<1:26:48,  3.48it/s]

{'eval_loss': 1.4089616537094116, 'eval_runtime': 17.1874, 'eval_samples_per_second': 58.182, 'eval_steps_per_second': 14.546, 'epoch': 0.48}


  4%|▎         | 700/18750 [05:06<1:26:58,  3.46it/s] 

{'loss': 1.3196, 'grad_norm': 14.493172645568848, 'learning_rate': 3.475e-05, 'epoch': 0.56}


                                                     
  4%|▎         | 700/18750 [05:23<1:26:58,  3.46it/s]

{'eval_loss': 1.3226734399795532, 'eval_runtime': 17.1257, 'eval_samples_per_second': 58.392, 'eval_steps_per_second': 14.598, 'epoch': 0.56}


  4%|▍         | 800/18750 [05:52<1:27:55,  3.40it/s] 

{'loss': 1.1337, 'grad_norm': 2.0874922275543213, 'learning_rate': 3.9750000000000004e-05, 'epoch': 0.64}


                                                     
  4%|▍         | 800/18750 [06:09<1:27:55,  3.40it/s]

{'eval_loss': 1.3390471935272217, 'eval_runtime': 17.298, 'eval_samples_per_second': 57.81, 'eval_steps_per_second': 14.453, 'epoch': 0.64}


  5%|▍         | 900/18750 [06:38<1:26:41,  3.43it/s] 

{'loss': 1.2636, 'grad_norm': 1.2033361196517944, 'learning_rate': 4.4750000000000004e-05, 'epoch': 0.72}


                                                     
  5%|▍         | 900/18750 [06:55<1:26:41,  3.43it/s]

{'eval_loss': 1.2938686609268188, 'eval_runtime': 17.2042, 'eval_samples_per_second': 58.125, 'eval_steps_per_second': 14.531, 'epoch': 0.72}


  5%|▌         | 1000/18750 [07:24<1:24:54,  3.48it/s]

{'loss': 1.1222, 'grad_norm': 1.536632776260376, 'learning_rate': 4.975e-05, 'epoch': 0.8}


                                                      
  5%|▌         | 1000/18750 [07:41<1:24:54,  3.48it/s]

{'eval_loss': 1.3335614204406738, 'eval_runtime': 17.1098, 'eval_samples_per_second': 58.446, 'eval_steps_per_second': 14.611, 'epoch': 0.8}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  6%|▌         | 1100/18750 [08:12<1:24:34,  3.48it/s] 

{'loss': 1.0648, 'grad_norm': 1.8426233530044556, 'learning_rate': 4.973239436619719e-05, 'epoch': 0.88}


                                                      
  6%|▌         | 1100/18750 [08:29<1:24:34,  3.48it/s]

{'eval_loss': 1.316601276397705, 'eval_runtime': 17.1182, 'eval_samples_per_second': 58.417, 'eval_steps_per_second': 14.604, 'epoch': 0.88}


  6%|▋         | 1200/18750 [08:58<1:24:37,  3.46it/s] 

{'loss': 1.2048, 'grad_norm': 1.642164707183838, 'learning_rate': 4.9450704225352114e-05, 'epoch': 0.96}


                                                      
  6%|▋         | 1200/18750 [09:15<1:24:37,  3.46it/s]

{'eval_loss': 1.2916616201400757, 'eval_runtime': 17.119, 'eval_samples_per_second': 58.415, 'eval_steps_per_second': 14.604, 'epoch': 0.96}


  7%|▋         | 1300/18750 [09:44<1:23:53,  3.47it/s] 

{'loss': 1.0021, 'grad_norm': 0.9650108814239502, 'learning_rate': 4.916901408450704e-05, 'epoch': 1.04}


                                                      
  7%|▋         | 1300/18750 [10:01<1:23:53,  3.47it/s]

{'eval_loss': 1.2996212244033813, 'eval_runtime': 17.2245, 'eval_samples_per_second': 58.057, 'eval_steps_per_second': 14.514, 'epoch': 1.04}


  7%|▋         | 1400/18750 [10:30<1:23:05,  3.48it/s] 

{'loss': 1.0285, 'grad_norm': 1.1928812265396118, 'learning_rate': 4.888732394366197e-05, 'epoch': 1.12}


                                                      
  7%|▋         | 1400/18750 [10:47<1:23:05,  3.48it/s]

{'eval_loss': 1.2624651193618774, 'eval_runtime': 17.1144, 'eval_samples_per_second': 58.43, 'eval_steps_per_second': 14.608, 'epoch': 1.12}


  8%|▊         | 1500/18750 [11:16<1:23:14,  3.45it/s] 

{'loss': 1.1431, 'grad_norm': 3.030064821243286, 'learning_rate': 4.8605633802816904e-05, 'epoch': 1.2}


                                                      
  8%|▊         | 1500/18750 [11:33<1:23:14,  3.45it/s]

{'eval_loss': 1.2837474346160889, 'eval_runtime': 17.1149, 'eval_samples_per_second': 58.429, 'eval_steps_per_second': 14.607, 'epoch': 1.2}


  9%|▊         | 1600/18750 [12:02<1:22:46,  3.45it/s] 

{'loss': 1.106, 'grad_norm': 10.784835815429688, 'learning_rate': 4.832394366197183e-05, 'epoch': 1.28}


                                                      
  9%|▊         | 1600/18750 [12:19<1:22:46,  3.45it/s]

{'eval_loss': 1.2831774950027466, 'eval_runtime': 17.1037, 'eval_samples_per_second': 58.467, 'eval_steps_per_second': 14.617, 'epoch': 1.28}


  9%|▉         | 1700/18750 [12:48<1:21:22,  3.49it/s] 

{'loss': 1.1115, 'grad_norm': 1.2016226053237915, 'learning_rate': 4.8042253521126765e-05, 'epoch': 1.36}


                                                      
  9%|▉         | 1700/18750 [13:05<1:21:22,  3.49it/s]

{'eval_loss': 1.261431097984314, 'eval_runtime': 17.1466, 'eval_samples_per_second': 58.321, 'eval_steps_per_second': 14.58, 'epoch': 1.36}


 10%|▉         | 1800/18750 [13:34<1:21:14,  3.48it/s] 

{'loss': 1.1168, 'grad_norm': 2.403912305831909, 'learning_rate': 4.776056338028169e-05, 'epoch': 1.44}


                                                      
 10%|▉         | 1800/18750 [13:51<1:21:14,  3.48it/s]

{'eval_loss': 1.2836049795150757, 'eval_runtime': 17.4491, 'eval_samples_per_second': 57.31, 'eval_steps_per_second': 14.327, 'epoch': 1.44}


 10%|█         | 1900/18750 [14:20<1:21:03,  3.46it/s] 

{'loss': 1.152, 'grad_norm': 0.9587670564651489, 'learning_rate': 4.747887323943662e-05, 'epoch': 1.52}


                                                      
 10%|█         | 1900/18750 [14:38<1:21:03,  3.46it/s]

{'eval_loss': 1.2800387144088745, 'eval_runtime': 17.2326, 'eval_samples_per_second': 58.029, 'eval_steps_per_second': 14.507, 'epoch': 1.52}


 11%|█         | 2000/18750 [15:07<1:20:56,  3.45it/s] 

{'loss': 1.1338, 'grad_norm': 0.8283342123031616, 'learning_rate': 4.7197183098591555e-05, 'epoch': 1.6}


                                                      
 11%|█         | 2000/18750 [15:24<1:20:56,  3.45it/s]

{'eval_loss': 1.2604877948760986, 'eval_runtime': 17.2294, 'eval_samples_per_second': 58.04, 'eval_steps_per_second': 14.51, 'epoch': 1.6}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 11%|█         | 2100/18750 [15:55<1:20:05,  3.46it/s] 

{'loss': 1.1249, 'grad_norm': 1.366652011871338, 'learning_rate': 4.691549295774648e-05, 'epoch': 1.68}


                                                      
 11%|█         | 2100/18750 [16:13<1:20:05,  3.46it/s]

{'eval_loss': 1.2487019300460815, 'eval_runtime': 17.2182, 'eval_samples_per_second': 58.078, 'eval_steps_per_second': 14.52, 'epoch': 1.68}


 12%|█▏        | 2200/18750 [16:42<1:19:37,  3.46it/s] 

{'loss': 1.107, 'grad_norm': 1.2738003730773926, 'learning_rate': 4.663380281690141e-05, 'epoch': 1.76}


                                                      
 12%|█▏        | 2200/18750 [16:59<1:19:37,  3.46it/s]

{'eval_loss': 1.2498278617858887, 'eval_runtime': 17.2556, 'eval_samples_per_second': 57.952, 'eval_steps_per_second': 14.488, 'epoch': 1.76}


 12%|█▏        | 2300/18750 [17:28<1:19:12,  3.46it/s] 

{'loss': 1.0594, 'grad_norm': 1.8301517963409424, 'learning_rate': 4.6352112676056344e-05, 'epoch': 1.84}


                                                      
 12%|█▏        | 2300/18750 [17:45<1:19:12,  3.46it/s]

{'eval_loss': 1.2433984279632568, 'eval_runtime': 17.2361, 'eval_samples_per_second': 58.018, 'eval_steps_per_second': 14.504, 'epoch': 1.84}


 13%|█▎        | 2400/18750 [18:14<1:18:43,  3.46it/s] 

{'loss': 1.0547, 'grad_norm': 2.0869336128234863, 'learning_rate': 4.607042253521127e-05, 'epoch': 1.92}


                                                      
 13%|█▎        | 2400/18750 [18:31<1:18:43,  3.46it/s]

{'eval_loss': 1.2509866952896118, 'eval_runtime': 17.2526, 'eval_samples_per_second': 57.962, 'eval_steps_per_second': 14.491, 'epoch': 1.92}


 13%|█▎        | 2500/18750 [19:00<1:18:16,  3.46it/s] 

{'loss': 1.2209, 'grad_norm': 2.085958957672119, 'learning_rate': 4.57887323943662e-05, 'epoch': 2.0}


                                                      
 13%|█▎        | 2500/18750 [19:18<1:18:16,  3.46it/s]

{'eval_loss': 1.2473005056381226, 'eval_runtime': 17.2353, 'eval_samples_per_second': 58.02, 'eval_steps_per_second': 14.505, 'epoch': 2.0}


 14%|█▍        | 2600/18750 [19:47<1:17:53,  3.46it/s] 

{'loss': 1.0981, 'grad_norm': 1.2044625282287598, 'learning_rate': 4.550704225352113e-05, 'epoch': 2.08}


                                                      
 14%|█▍        | 2600/18750 [20:04<1:17:53,  3.46it/s]

{'eval_loss': 1.245200276374817, 'eval_runtime': 17.24, 'eval_samples_per_second': 58.005, 'eval_steps_per_second': 14.501, 'epoch': 2.08}


 14%|█▍        | 2700/18750 [20:33<1:17:37,  3.45it/s] 

{'loss': 1.0568, 'grad_norm': 1.0202840566635132, 'learning_rate': 4.5225352112676054e-05, 'epoch': 2.16}


                                                      
 14%|█▍        | 2700/18750 [20:50<1:17:37,  3.45it/s]

{'eval_loss': 1.2342016696929932, 'eval_runtime': 17.2329, 'eval_samples_per_second': 58.028, 'eval_steps_per_second': 14.507, 'epoch': 2.16}


 15%|█▍        | 2800/18750 [21:19<1:17:45,  3.42it/s] 

{'loss': 1.0499, 'grad_norm': 0.6591837406158447, 'learning_rate': 4.494366197183099e-05, 'epoch': 2.24}


                                                      
 15%|█▍        | 2800/18750 [21:37<1:17:45,  3.42it/s]

{'eval_loss': 1.2464150190353394, 'eval_runtime': 17.2504, 'eval_samples_per_second': 57.97, 'eval_steps_per_second': 14.492, 'epoch': 2.24}


 15%|█▌        | 2900/18750 [22:06<1:16:43,  3.44it/s] 

{'loss': 1.0978, 'grad_norm': 2.9149320125579834, 'learning_rate': 4.4661971830985916e-05, 'epoch': 2.32}


                                                      
 15%|█▌        | 2900/18750 [22:23<1:16:43,  3.44it/s]

{'eval_loss': 1.2398537397384644, 'eval_runtime': 17.2465, 'eval_samples_per_second': 57.983, 'eval_steps_per_second': 14.496, 'epoch': 2.32}


 16%|█▌        | 3000/18750 [22:52<1:16:04,  3.45it/s] 

{'loss': 0.9941, 'grad_norm': 1.812187910079956, 'learning_rate': 4.438028169014084e-05, 'epoch': 2.4}


                                                      
 16%|█▌        | 3000/18750 [23:09<1:16:04,  3.45it/s]

{'eval_loss': 1.2548290491104126, 'eval_runtime': 17.2407, 'eval_samples_per_second': 58.002, 'eval_steps_per_second': 14.501, 'epoch': 2.4}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 17%|█▋        | 3100/18750 [23:41<1:15:42,  3.45it/s] 

{'loss': 1.0715, 'grad_norm': 1.66780686378479, 'learning_rate': 4.409859154929578e-05, 'epoch': 2.48}


                                                      
 17%|█▋        | 3100/18750 [23:58<1:15:42,  3.45it/s]

{'eval_loss': 1.2404587268829346, 'eval_runtime': 17.252, 'eval_samples_per_second': 57.964, 'eval_steps_per_second': 14.491, 'epoch': 2.48}


 17%|█▋        | 3200/18750 [24:27<1:15:24,  3.44it/s] 

{'loss': 1.113, 'grad_norm': 1.3819949626922607, 'learning_rate': 4.3816901408450705e-05, 'epoch': 2.56}


                                                      
 17%|█▋        | 3200/18750 [24:44<1:15:24,  3.44it/s]

{'eval_loss': 1.2473297119140625, 'eval_runtime': 17.2463, 'eval_samples_per_second': 57.983, 'eval_steps_per_second': 14.496, 'epoch': 2.56}


 18%|█▊        | 3300/18750 [25:13<1:14:54,  3.44it/s] 

{'loss': 0.9877, 'grad_norm': 1.2230474948883057, 'learning_rate': 4.353521126760563e-05, 'epoch': 2.64}


                                                      
 18%|█▊        | 3300/18750 [25:31<1:14:54,  3.44it/s]

{'eval_loss': 1.2491625547409058, 'eval_runtime': 17.2468, 'eval_samples_per_second': 57.982, 'eval_steps_per_second': 14.495, 'epoch': 2.64}


 18%|█▊        | 3400/18750 [26:00<1:14:31,  3.43it/s] 

{'loss': 1.0225, 'grad_norm': 0.7042796015739441, 'learning_rate': 4.325352112676057e-05, 'epoch': 2.72}


                                                      
 18%|█▊        | 3400/18750 [26:17<1:14:31,  3.43it/s]

{'eval_loss': 1.232690453529358, 'eval_runtime': 17.2629, 'eval_samples_per_second': 57.928, 'eval_steps_per_second': 14.482, 'epoch': 2.72}


 19%|█▊        | 3500/18750 [26:46<1:17:20,  3.29it/s] 

{'loss': 1.1029, 'grad_norm': 1.6595189571380615, 'learning_rate': 4.2971830985915494e-05, 'epoch': 2.8}


                                                      
 19%|█▊        | 3500/18750 [27:03<1:17:20,  3.29it/s]

{'eval_loss': 1.2433953285217285, 'eval_runtime': 17.2453, 'eval_samples_per_second': 57.987, 'eval_steps_per_second': 14.497, 'epoch': 2.8}


 19%|█▉        | 3600/18750 [27:33<1:13:48,  3.42it/s] 

{'loss': 1.0845, 'grad_norm': 2.4555885791778564, 'learning_rate': 4.269014084507043e-05, 'epoch': 2.88}


                                                      
 19%|█▉        | 3600/18750 [27:50<1:13:48,  3.42it/s]

{'eval_loss': 1.2606662511825562, 'eval_runtime': 17.4222, 'eval_samples_per_second': 57.398, 'eval_steps_per_second': 14.349, 'epoch': 2.88}


 20%|█▉        | 3700/18750 [28:20<1:13:48,  3.40it/s] 

{'loss': 1.0632, 'grad_norm': 1.911312222480774, 'learning_rate': 4.2408450704225356e-05, 'epoch': 2.96}


                                                      
 20%|█▉        | 3700/18750 [28:37<1:13:48,  3.40it/s]

{'eval_loss': 1.2389583587646484, 'eval_runtime': 17.4827, 'eval_samples_per_second': 57.199, 'eval_steps_per_second': 14.3, 'epoch': 2.96}


 20%|██        | 3800/18750 [29:06<1:12:30,  3.44it/s] 

{'loss': 1.0827, 'grad_norm': 1.4247175455093384, 'learning_rate': 4.2126760563380284e-05, 'epoch': 3.04}


                                                      
 20%|██        | 3800/18750 [29:23<1:12:30,  3.44it/s]

{'eval_loss': 1.240574836730957, 'eval_runtime': 17.1858, 'eval_samples_per_second': 58.188, 'eval_steps_per_second': 14.547, 'epoch': 3.04}


 21%|██        | 3900/18750 [29:53<1:11:57,  3.44it/s] 

{'loss': 1.0237, 'grad_norm': 3.400317907333374, 'learning_rate': 4.184507042253522e-05, 'epoch': 3.12}


                                                      
 21%|██        | 3900/18750 [30:10<1:11:57,  3.44it/s]

{'eval_loss': 1.2431992292404175, 'eval_runtime': 17.1915, 'eval_samples_per_second': 58.168, 'eval_steps_per_second': 14.542, 'epoch': 3.12}


 21%|██▏       | 4000/18750 [30:39<1:11:35,  3.43it/s] 

{'loss': 1.0365, 'grad_norm': 0.8599569201469421, 'learning_rate': 4.1563380281690145e-05, 'epoch': 3.2}


                                                      
 21%|██▏       | 4000/18750 [30:57<1:11:35,  3.43it/s]

{'eval_loss': 1.250285267829895, 'eval_runtime': 17.6569, 'eval_samples_per_second': 56.635, 'eval_steps_per_second': 14.159, 'epoch': 3.2}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 22%|██▏       | 4100/18750 [31:28<1:11:25,  3.42it/s] 

{'loss': 0.9881, 'grad_norm': 1.2678591012954712, 'learning_rate': 4.128169014084507e-05, 'epoch': 3.28}


                                                      
 22%|██▏       | 4100/18750 [31:46<1:11:25,  3.42it/s]

{'eval_loss': 1.2500258684158325, 'eval_runtime': 17.4174, 'eval_samples_per_second': 57.414, 'eval_steps_per_second': 14.353, 'epoch': 3.28}


 22%|██▏       | 4200/18750 [32:15<1:10:39,  3.43it/s] 

{'loss': 1.1155, 'grad_norm': 1.0220381021499634, 'learning_rate': 4.1e-05, 'epoch': 3.36}


                                                      
 22%|██▏       | 4200/18750 [32:33<1:10:39,  3.43it/s]

{'eval_loss': 1.2391140460968018, 'eval_runtime': 17.4621, 'eval_samples_per_second': 57.267, 'eval_steps_per_second': 14.317, 'epoch': 3.36}


 23%|██▎       | 4300/18750 [33:02<1:10:33,  3.41it/s] 

{'loss': 1.0521, 'grad_norm': 2.263787269592285, 'learning_rate': 4.071830985915493e-05, 'epoch': 3.44}


                                                      
 23%|██▎       | 4300/18750 [33:19<1:10:33,  3.41it/s]

{'eval_loss': 1.2553377151489258, 'eval_runtime': 17.3745, 'eval_samples_per_second': 57.556, 'eval_steps_per_second': 14.389, 'epoch': 3.44}


 23%|██▎       | 4400/18750 [33:49<1:09:47,  3.43it/s] 

{'loss': 0.9803, 'grad_norm': 7.292447566986084, 'learning_rate': 4.043661971830986e-05, 'epoch': 3.52}


                                                      
 23%|██▎       | 4400/18750 [34:06<1:09:47,  3.43it/s]

{'eval_loss': 1.2898772954940796, 'eval_runtime': 17.305, 'eval_samples_per_second': 57.787, 'eval_steps_per_second': 14.447, 'epoch': 3.52}


 24%|██▍       | 4500/18750 [34:35<1:10:10,  3.38it/s] 

{'loss': 1.0677, 'grad_norm': 3.033140182495117, 'learning_rate': 4.015492957746479e-05, 'epoch': 3.6}


                                                      
 24%|██▍       | 4500/18750 [34:53<1:10:10,  3.38it/s]

{'eval_loss': 1.2386469841003418, 'eval_runtime': 17.623, 'eval_samples_per_second': 56.744, 'eval_steps_per_second': 14.186, 'epoch': 3.6}


 25%|██▍       | 4600/18750 [35:22<1:08:48,  3.43it/s] 

{'loss': 1.0803, 'grad_norm': 1.372910499572754, 'learning_rate': 3.987323943661972e-05, 'epoch': 3.68}


                                                      
 25%|██▍       | 4600/18750 [35:40<1:08:48,  3.43it/s]

{'eval_loss': 1.2453503608703613, 'eval_runtime': 17.2909, 'eval_samples_per_second': 57.834, 'eval_steps_per_second': 14.458, 'epoch': 3.68}


 25%|██▌       | 4700/18750 [36:09<1:08:39,  3.41it/s] 

{'loss': 0.981, 'grad_norm': 0.780325710773468, 'learning_rate': 3.959154929577465e-05, 'epoch': 3.76}


                                                      
 25%|██▌       | 4700/18750 [36:27<1:08:39,  3.41it/s]

{'eval_loss': 1.2404531240463257, 'eval_runtime': 17.6261, 'eval_samples_per_second': 56.734, 'eval_steps_per_second': 14.184, 'epoch': 3.76}


 26%|██▌       | 4800/18750 [36:56<1:08:21,  3.40it/s] 

{'loss': 1.042, 'grad_norm': 2.7298433780670166, 'learning_rate': 3.930985915492958e-05, 'epoch': 3.84}


                                                      
 26%|██▌       | 4800/18750 [37:14<1:08:21,  3.40it/s]

{'eval_loss': 1.252794623374939, 'eval_runtime': 17.5417, 'eval_samples_per_second': 57.007, 'eval_steps_per_second': 14.252, 'epoch': 3.84}


 26%|██▌       | 4900/18750 [37:43<1:07:24,  3.42it/s] 

{'loss': 1.0657, 'grad_norm': 1.3956917524337769, 'learning_rate': 3.9028169014084507e-05, 'epoch': 3.92}


                                                      
 26%|██▌       | 4900/18750 [38:00<1:07:24,  3.42it/s]

{'eval_loss': 1.2291923761367798, 'eval_runtime': 17.3567, 'eval_samples_per_second': 57.615, 'eval_steps_per_second': 14.404, 'epoch': 3.92}


 27%|██▋       | 5000/18750 [38:30<1:06:45,  3.43it/s] 

{'loss': 0.9531, 'grad_norm': 2.2015838623046875, 'learning_rate': 3.874647887323944e-05, 'epoch': 4.0}


                                                      
 27%|██▋       | 5000/18750 [38:47<1:06:45,  3.43it/s]

{'eval_loss': 1.2384235858917236, 'eval_runtime': 17.3549, 'eval_samples_per_second': 57.62, 'eval_steps_per_second': 14.405, 'epoch': 4.0}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 27%|██▋       | 5100/18750 [39:19<1:06:35,  3.42it/s] 

{'loss': 1.031, 'grad_norm': 2.565732955932617, 'learning_rate': 3.846478873239437e-05, 'epoch': 4.08}


                                                      
 27%|██▋       | 5100/18750 [39:36<1:06:35,  3.42it/s]

{'eval_loss': 1.254526138305664, 'eval_runtime': 17.517, 'eval_samples_per_second': 57.087, 'eval_steps_per_second': 14.272, 'epoch': 4.08}


 28%|██▊       | 5200/18750 [40:06<1:06:14,  3.41it/s] 

{'loss': 0.9677, 'grad_norm': 1.2191054821014404, 'learning_rate': 3.8183098591549296e-05, 'epoch': 4.16}


                                                      
 28%|██▊       | 5200/18750 [40:23<1:06:14,  3.41it/s]

{'eval_loss': 1.2616047859191895, 'eval_runtime': 17.5494, 'eval_samples_per_second': 56.982, 'eval_steps_per_second': 14.245, 'epoch': 4.16}


 28%|██▊       | 5300/18750 [40:53<1:05:38,  3.41it/s] 

{'loss': 0.9778, 'grad_norm': 1.7475440502166748, 'learning_rate': 3.790140845070423e-05, 'epoch': 4.24}


                                                      
 28%|██▊       | 5300/18750 [41:10<1:05:38,  3.41it/s]

{'eval_loss': 1.2504016160964966, 'eval_runtime': 17.5258, 'eval_samples_per_second': 57.059, 'eval_steps_per_second': 14.265, 'epoch': 4.24}


 29%|██▉       | 5400/18750 [41:40<1:05:27,  3.40it/s] 

{'loss': 0.9701, 'grad_norm': 2.3369152545928955, 'learning_rate': 3.761971830985916e-05, 'epoch': 4.32}


                                                      
 29%|██▉       | 5400/18750 [41:57<1:05:27,  3.40it/s]

{'eval_loss': 1.2451738119125366, 'eval_runtime': 17.5588, 'eval_samples_per_second': 56.952, 'eval_steps_per_second': 14.238, 'epoch': 4.32}


 29%|██▉       | 5500/18750 [42:27<1:05:08,  3.39it/s] 

{'loss': 1.0162, 'grad_norm': 1.9890462160110474, 'learning_rate': 3.733802816901409e-05, 'epoch': 4.4}


                                                      
 29%|██▉       | 5500/18750 [42:44<1:05:08,  3.39it/s]

{'eval_loss': 1.250539779663086, 'eval_runtime': 17.6162, 'eval_samples_per_second': 56.766, 'eval_steps_per_second': 14.191, 'epoch': 4.4}


 30%|██▉       | 5600/18750 [43:14<1:04:26,  3.40it/s] 

{'loss': 1.0718, 'grad_norm': 1.0667668581008911, 'learning_rate': 3.705633802816901e-05, 'epoch': 4.48}


                                                      
 30%|██▉       | 5600/18750 [43:31<1:04:26,  3.40it/s]

{'eval_loss': 1.2367156744003296, 'eval_runtime': 17.6301, 'eval_samples_per_second': 56.721, 'eval_steps_per_second': 14.18, 'epoch': 4.48}


 30%|███       | 5700/18750 [44:01<1:04:30,  3.37it/s] 

{'loss': 1.0062, 'grad_norm': 1.1812914609909058, 'learning_rate': 3.677464788732394e-05, 'epoch': 4.56}


                                                      
 30%|███       | 5700/18750 [44:19<1:04:30,  3.37it/s]

{'eval_loss': 1.2926805019378662, 'eval_runtime': 17.5963, 'eval_samples_per_second': 56.83, 'eval_steps_per_second': 14.208, 'epoch': 4.56}


 31%|███       | 5800/18750 [44:48<1:03:36,  3.39it/s] 

{'loss': 0.8778, 'grad_norm': 2.3920392990112305, 'learning_rate': 3.6492957746478874e-05, 'epoch': 4.64}


                                                      
 31%|███       | 5800/18750 [45:06<1:03:36,  3.39it/s]

{'eval_loss': 1.2511823177337646, 'eval_runtime': 17.6148, 'eval_samples_per_second': 56.77, 'eval_steps_per_second': 14.193, 'epoch': 4.64}


 31%|███▏      | 5900/18750 [45:35<1:02:54,  3.40it/s] 

{'loss': 1.0126, 'grad_norm': 1.1560661792755127, 'learning_rate': 3.62112676056338e-05, 'epoch': 4.72}


                                                      
 31%|███▏      | 5900/18750 [45:53<1:02:54,  3.40it/s]

{'eval_loss': 1.2391754388809204, 'eval_runtime': 17.6804, 'eval_samples_per_second': 56.56, 'eval_steps_per_second': 14.14, 'epoch': 4.72}


 32%|███▏      | 6000/18750 [46:23<1:03:11,  3.36it/s] 

{'loss': 1.0059, 'grad_norm': 1.9963688850402832, 'learning_rate': 3.5929577464788736e-05, 'epoch': 4.8}


                                                      
 32%|███▏      | 6000/18750 [46:40<1:03:11,  3.36it/s]

{'eval_loss': 1.2470320463180542, 'eval_runtime': 17.6181, 'eval_samples_per_second': 56.76, 'eval_steps_per_second': 14.19, 'epoch': 4.8}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 33%|███▎      | 6100/18750 [47:12<1:02:16,  3.39it/s] 

{'loss': 1.007, 'grad_norm': 1.954512357711792, 'learning_rate': 3.5647887323943664e-05, 'epoch': 4.88}


                                                      
 33%|███▎      | 6100/18750 [47:30<1:02:16,  3.39it/s]

{'eval_loss': 1.2831982374191284, 'eval_runtime': 17.6545, 'eval_samples_per_second': 56.643, 'eval_steps_per_second': 14.161, 'epoch': 4.88}


 33%|███▎      | 6200/18750 [47:59<1:02:02,  3.37it/s] 

{'loss': 1.0138, 'grad_norm': 1.588827133178711, 'learning_rate': 3.536619718309859e-05, 'epoch': 4.96}


                                                      
 33%|███▎      | 6200/18750 [48:17<1:02:02,  3.37it/s]

{'eval_loss': 1.2565526962280273, 'eval_runtime': 17.6153, 'eval_samples_per_second': 56.769, 'eval_steps_per_second': 14.192, 'epoch': 4.96}


 34%|███▎      | 6300/18750 [48:46<1:01:14,  3.39it/s] 

{'loss': 0.9753, 'grad_norm': 1.3189483880996704, 'learning_rate': 3.5084507042253525e-05, 'epoch': 5.04}


                                                      
 34%|███▎      | 6300/18750 [49:04<1:01:14,  3.39it/s]

{'eval_loss': 1.2543061971664429, 'eval_runtime': 17.596, 'eval_samples_per_second': 56.831, 'eval_steps_per_second': 14.208, 'epoch': 5.04}


 34%|███▍      | 6400/18750 [49:34<1:01:06,  3.37it/s] 

{'loss': 0.9682, 'grad_norm': 1.8966397047042847, 'learning_rate': 3.480281690140845e-05, 'epoch': 5.12}


                                                      
 34%|███▍      | 6400/18750 [49:51<1:01:06,  3.37it/s]

{'eval_loss': 1.3097312450408936, 'eval_runtime': 17.6711, 'eval_samples_per_second': 56.59, 'eval_steps_per_second': 14.147, 'epoch': 5.12}


 35%|███▍      | 6500/18750 [50:21<1:00:31,  3.37it/s] 

{'loss': 0.9012, 'grad_norm': 2.347118377685547, 'learning_rate': 3.452112676056338e-05, 'epoch': 5.2}


                                                      
 35%|███▍      | 6500/18750 [50:38<1:00:31,  3.37it/s]

{'eval_loss': 1.3452227115631104, 'eval_runtime': 17.6756, 'eval_samples_per_second': 56.575, 'eval_steps_per_second': 14.144, 'epoch': 5.2}


 35%|███▌      | 6600/18750 [51:08<59:46,  3.39it/s]   

{'loss': 0.9708, 'grad_norm': 3.924473285675049, 'learning_rate': 3.4239436619718315e-05, 'epoch': 5.28}


                                                    
 35%|███▌      | 6600/18750 [51:26<59:46,  3.39it/s]

{'eval_loss': 1.2870259284973145, 'eval_runtime': 17.647, 'eval_samples_per_second': 56.667, 'eval_steps_per_second': 14.167, 'epoch': 5.28}


 36%|███▌      | 6700/18750 [51:55<59:29,  3.38it/s]   

{'loss': 0.9621, 'grad_norm': 3.886406183242798, 'learning_rate': 3.395774647887324e-05, 'epoch': 5.36}


                                                    
 36%|███▌      | 6700/18750 [52:13<59:29,  3.38it/s]

{'eval_loss': 1.2691913843154907, 'eval_runtime': 17.6376, 'eval_samples_per_second': 56.697, 'eval_steps_per_second': 14.174, 'epoch': 5.36}


 36%|███▋      | 6800/18750 [52:43<58:46,  3.39it/s]   

{'loss': 0.91, 'grad_norm': 1.3994488716125488, 'learning_rate': 3.367605633802817e-05, 'epoch': 5.44}


                                                    
 36%|███▋      | 6800/18750 [53:00<58:46,  3.39it/s]

{'eval_loss': 1.2542957067489624, 'eval_runtime': 17.728, 'eval_samples_per_second': 56.408, 'eval_steps_per_second': 14.102, 'epoch': 5.44}


 37%|███▋      | 6900/18750 [53:30<58:31,  3.37it/s]   

{'loss': 0.9568, 'grad_norm': 1.1449030637741089, 'learning_rate': 3.3394366197183104e-05, 'epoch': 5.52}


                                                    
 37%|███▋      | 6900/18750 [53:48<58:31,  3.37it/s]

{'eval_loss': 1.2607897520065308, 'eval_runtime': 17.6954, 'eval_samples_per_second': 56.512, 'eval_steps_per_second': 14.128, 'epoch': 5.52}


 37%|███▋      | 7000/18750 [54:17<58:00,  3.38it/s]   

{'loss': 1.0429, 'grad_norm': 3.213344097137451, 'learning_rate': 3.311267605633803e-05, 'epoch': 5.6}


                                                    
 37%|███▋      | 7000/18750 [54:35<58:00,  3.38it/s]

{'eval_loss': 1.2655327320098877, 'eval_runtime': 17.7502, 'eval_samples_per_second': 56.337, 'eval_steps_per_second': 14.084, 'epoch': 5.6}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 38%|███▊      | 7100/18750 [55:06<57:35,  3.37it/s]   

{'loss': 0.9953, 'grad_norm': 5.336260795593262, 'learning_rate': 3.283098591549296e-05, 'epoch': 5.68}


                                                    
 38%|███▊      | 7100/18750 [55:24<57:35,  3.37it/s]

{'eval_loss': 1.265977382659912, 'eval_runtime': 17.7595, 'eval_samples_per_second': 56.308, 'eval_steps_per_second': 14.077, 'epoch': 5.68}


 38%|███▊      | 7200/18750 [55:54<56:57,  3.38it/s]   

{'loss': 0.9217, 'grad_norm': 1.1493781805038452, 'learning_rate': 3.2549295774647887e-05, 'epoch': 5.76}


                                                    
 38%|███▊      | 7200/18750 [56:11<56:57,  3.38it/s]

{'eval_loss': 1.27311110496521, 'eval_runtime': 17.6689, 'eval_samples_per_second': 56.597, 'eval_steps_per_second': 14.149, 'epoch': 5.76}


 39%|███▉      | 7300/18750 [56:41<56:21,  3.39it/s]   

{'loss': 1.0555, 'grad_norm': 2.351104974746704, 'learning_rate': 3.2267605633802814e-05, 'epoch': 5.84}


                                                    
 39%|███▉      | 7300/18750 [56:59<56:21,  3.39it/s]

{'eval_loss': 1.3137004375457764, 'eval_runtime': 17.7522, 'eval_samples_per_second': 56.331, 'eval_steps_per_second': 14.083, 'epoch': 5.84}


 39%|███▉      | 7400/18750 [57:28<55:52,  3.39it/s]   

{'loss': 0.9276, 'grad_norm': 2.707573652267456, 'learning_rate': 3.198591549295775e-05, 'epoch': 5.92}


                                                    
 39%|███▉      | 7400/18750 [57:46<55:52,  3.39it/s]

{'eval_loss': 1.2630479335784912, 'eval_runtime': 17.6721, 'eval_samples_per_second': 56.586, 'eval_steps_per_second': 14.147, 'epoch': 5.92}


 40%|████      | 7500/18750 [58:16<55:25,  3.38it/s]   

{'loss': 0.914, 'grad_norm': 11.254226684570312, 'learning_rate': 3.1704225352112676e-05, 'epoch': 6.0}


                                                    
 40%|████      | 7500/18750 [58:33<55:25,  3.38it/s]

{'eval_loss': 1.3101428747177124, 'eval_runtime': 17.7679, 'eval_samples_per_second': 56.281, 'eval_steps_per_second': 14.07, 'epoch': 6.0}


 41%|████      | 7600/18750 [59:03<54:52,  3.39it/s]   

{'loss': 0.8624, 'grad_norm': 2.3810882568359375, 'learning_rate': 3.14225352112676e-05, 'epoch': 6.08}


                                                    
 41%|████      | 7600/18750 [59:21<54:52,  3.39it/s]

{'eval_loss': 1.2860435247421265, 'eval_runtime': 17.6911, 'eval_samples_per_second': 56.526, 'eval_steps_per_second': 14.131, 'epoch': 6.08}


 41%|████      | 7700/18750 [59:50<54:14,  3.40it/s]   

{'loss': 0.844, 'grad_norm': 3.755779504776001, 'learning_rate': 3.114084507042254e-05, 'epoch': 6.16}


                                                    
 41%|████      | 7700/18750 [1:00:08<54:14,  3.40it/s]

{'eval_loss': 1.2908786535263062, 'eval_runtime': 17.678, 'eval_samples_per_second': 56.568, 'eval_steps_per_second': 14.142, 'epoch': 6.16}


 42%|████▏     | 7800/18750 [1:00:38<54:12,  3.37it/s]   

{'loss': 0.9643, 'grad_norm': 3.0449306964874268, 'learning_rate': 3.0859154929577465e-05, 'epoch': 6.24}


                                                      
 42%|████▏     | 7800/18750 [1:00:55<54:12,  3.37it/s]

{'eval_loss': 1.2822505235671997, 'eval_runtime': 17.691, 'eval_samples_per_second': 56.526, 'eval_steps_per_second': 14.131, 'epoch': 6.24}


 42%|████▏     | 7900/18750 [1:01:25<53:42,  3.37it/s]   

{'loss': 0.9802, 'grad_norm': 1.5453187227249146, 'learning_rate': 3.05774647887324e-05, 'epoch': 6.32}


                                                      
 42%|████▏     | 7900/18750 [1:01:43<53:42,  3.37it/s]

{'eval_loss': 1.2894009351730347, 'eval_runtime': 17.7096, 'eval_samples_per_second': 56.467, 'eval_steps_per_second': 14.117, 'epoch': 6.32}


 43%|████▎     | 8000/18750 [1:02:12<53:14,  3.36it/s]   

{'loss': 0.8891, 'grad_norm': 4.852970600128174, 'learning_rate': 3.0295774647887327e-05, 'epoch': 6.4}


                                                      
 43%|████▎     | 8000/18750 [1:02:30<53:14,  3.36it/s]

{'eval_loss': 1.3458678722381592, 'eval_runtime': 17.8349, 'eval_samples_per_second': 56.07, 'eval_steps_per_second': 14.017, 'epoch': 6.4}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 43%|████▎     | 8100/18750 [1:03:02<52:31,  3.38it/s]   

{'loss': 0.9157, 'grad_norm': 1.3921290636062622, 'learning_rate': 3.0014084507042254e-05, 'epoch': 6.48}


                                                      
 43%|████▎     | 8100/18750 [1:03:19<52:31,  3.38it/s]

{'eval_loss': 1.3073770999908447, 'eval_runtime': 17.7293, 'eval_samples_per_second': 56.404, 'eval_steps_per_second': 14.101, 'epoch': 6.48}


 44%|████▎     | 8200/18750 [1:03:49<52:20,  3.36it/s]   

{'loss': 1.0329, 'grad_norm': 2.852653980255127, 'learning_rate': 2.9732394366197185e-05, 'epoch': 6.56}


                                                      
 44%|████▎     | 8200/18750 [1:04:07<52:20,  3.36it/s]

{'eval_loss': 1.3091810941696167, 'eval_runtime': 17.7282, 'eval_samples_per_second': 56.407, 'eval_steps_per_second': 14.102, 'epoch': 6.56}


 44%|████▍     | 8300/18750 [1:04:37<51:45,  3.36it/s]   

{'loss': 0.9344, 'grad_norm': 1.523045539855957, 'learning_rate': 2.9450704225352116e-05, 'epoch': 6.64}


                                                      
 44%|████▍     | 8300/18750 [1:04:54<51:45,  3.36it/s]

{'eval_loss': 1.3221184015274048, 'eval_runtime': 17.7031, 'eval_samples_per_second': 56.487, 'eval_steps_per_second': 14.122, 'epoch': 6.64}


 45%|████▍     | 8400/18750 [1:05:24<51:09,  3.37it/s]   

{'loss': 0.9049, 'grad_norm': 2.2674803733825684, 'learning_rate': 2.9169014084507047e-05, 'epoch': 6.72}


                                                      
 45%|████▍     | 8400/18750 [1:05:42<51:09,  3.37it/s]

{'eval_loss': 1.3426649570465088, 'eval_runtime': 17.7807, 'eval_samples_per_second': 56.241, 'eval_steps_per_second': 14.06, 'epoch': 6.72}


 45%|████▌     | 8500/18750 [1:06:11<50:59,  3.35it/s]   

{'loss': 1.0182, 'grad_norm': 4.700596809387207, 'learning_rate': 2.8887323943661975e-05, 'epoch': 6.8}


                                                      
 45%|████▌     | 8500/18750 [1:06:29<50:59,  3.35it/s]

{'eval_loss': 1.278124213218689, 'eval_runtime': 17.8019, 'eval_samples_per_second': 56.174, 'eval_steps_per_second': 14.043, 'epoch': 6.8}


 46%|████▌     | 8600/18750 [1:06:59<49:53,  3.39it/s]   

{'loss': 0.88, 'grad_norm': 1.6268075704574585, 'learning_rate': 2.86056338028169e-05, 'epoch': 6.88}


                                                      
 46%|████▌     | 8600/18750 [1:07:17<49:53,  3.39it/s]

{'eval_loss': 1.3052115440368652, 'eval_runtime': 17.7989, 'eval_samples_per_second': 56.183, 'eval_steps_per_second': 14.046, 'epoch': 6.88}


 46%|████▋     | 8700/18750 [1:07:46<49:20,  3.39it/s]   

{'loss': 0.9652, 'grad_norm': 1.7203258275985718, 'learning_rate': 2.832394366197183e-05, 'epoch': 6.96}


                                                      
 46%|████▋     | 8700/18750 [1:08:04<49:20,  3.39it/s]

{'eval_loss': 1.2937151193618774, 'eval_runtime': 17.7139, 'eval_samples_per_second': 56.453, 'eval_steps_per_second': 14.113, 'epoch': 6.96}


 47%|████▋     | 8800/18750 [1:08:34<49:07,  3.38it/s]   

{'loss': 0.8651, 'grad_norm': 2.467071771621704, 'learning_rate': 2.804225352112676e-05, 'epoch': 7.04}


                                                      
 47%|████▋     | 8800/18750 [1:08:52<49:07,  3.38it/s]

{'eval_loss': 1.3131000995635986, 'eval_runtime': 17.8331, 'eval_samples_per_second': 56.076, 'eval_steps_per_second': 14.019, 'epoch': 7.04}


 47%|████▋     | 8900/18750 [1:09:21<48:48,  3.36it/s]   

{'loss': 0.838, 'grad_norm': 2.901857852935791, 'learning_rate': 2.776056338028169e-05, 'epoch': 7.12}


                                                      
 47%|████▋     | 8900/18750 [1:09:39<48:48,  3.36it/s]

{'eval_loss': 1.318298101425171, 'eval_runtime': 17.7655, 'eval_samples_per_second': 56.289, 'eval_steps_per_second': 14.072, 'epoch': 7.12}


 48%|████▊     | 9000/18750 [1:10:09<48:03,  3.38it/s]   

{'loss': 0.8952, 'grad_norm': 2.39595365524292, 'learning_rate': 2.747887323943662e-05, 'epoch': 7.2}


                                                      
 48%|████▊     | 9000/18750 [1:10:27<48:03,  3.38it/s]

{'eval_loss': 1.3264001607894897, 'eval_runtime': 17.8003, 'eval_samples_per_second': 56.179, 'eval_steps_per_second': 14.045, 'epoch': 7.2}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 49%|████▊     | 9100/18750 [1:10:58<47:23,  3.39it/s]   

{'loss': 0.9749, 'grad_norm': 1.768763780593872, 'learning_rate': 2.719718309859155e-05, 'epoch': 7.28}


                                                      
 49%|████▊     | 9100/18750 [1:11:16<47:23,  3.39it/s]

{'eval_loss': 1.3617990016937256, 'eval_runtime': 17.764, 'eval_samples_per_second': 56.294, 'eval_steps_per_second': 14.073, 'epoch': 7.28}


 49%|████▉     | 9200/18750 [1:11:46<47:17,  3.37it/s]   

{'loss': 0.9053, 'grad_norm': 2.726498603820801, 'learning_rate': 2.691549295774648e-05, 'epoch': 7.36}


                                                      
 49%|████▉     | 9200/18750 [1:12:03<47:17,  3.37it/s]

{'eval_loss': 1.3575242757797241, 'eval_runtime': 17.7479, 'eval_samples_per_second': 56.345, 'eval_steps_per_second': 14.086, 'epoch': 7.36}


 50%|████▉     | 9300/18750 [1:12:33<46:52,  3.36it/s]   

{'loss': 0.9427, 'grad_norm': 4.220792770385742, 'learning_rate': 2.6633802816901408e-05, 'epoch': 7.44}


                                                      
 50%|████▉     | 9300/18750 [1:12:51<46:52,  3.36it/s]

{'eval_loss': 1.3721379041671753, 'eval_runtime': 17.7042, 'eval_samples_per_second': 56.484, 'eval_steps_per_second': 14.121, 'epoch': 7.44}


 50%|█████     | 9400/18750 [1:13:20<46:07,  3.38it/s]   

{'loss': 0.9148, 'grad_norm': 2.0397064685821533, 'learning_rate': 2.635211267605634e-05, 'epoch': 7.52}


                                                      
 50%|█████     | 9400/18750 [1:13:38<46:07,  3.38it/s]

{'eval_loss': 1.3814001083374023, 'eval_runtime': 17.7922, 'eval_samples_per_second': 56.205, 'eval_steps_per_second': 14.051, 'epoch': 7.52}


 51%|█████     | 9500/18750 [1:14:08<45:28,  3.39it/s]   

{'loss': 0.9881, 'grad_norm': 1.2978854179382324, 'learning_rate': 2.607042253521127e-05, 'epoch': 7.6}


                                                      
 51%|█████     | 9500/18750 [1:14:25<45:28,  3.39it/s]

{'eval_loss': 1.386031985282898, 'eval_runtime': 17.7952, 'eval_samples_per_second': 56.195, 'eval_steps_per_second': 14.049, 'epoch': 7.6}


 51%|█████     | 9600/18750 [1:14:55<44:57,  3.39it/s]   

{'loss': 0.868, 'grad_norm': 2.5449647903442383, 'learning_rate': 2.57887323943662e-05, 'epoch': 7.68}


                                                      
 51%|█████     | 9600/18750 [1:15:13<44:57,  3.39it/s]

{'eval_loss': 1.3229273557662964, 'eval_runtime': 17.7231, 'eval_samples_per_second': 56.423, 'eval_steps_per_second': 14.106, 'epoch': 7.68}


 52%|█████▏    | 9700/18750 [1:15:43<45:09,  3.34it/s]   

{'loss': 0.8511, 'grad_norm': 3.8181796073913574, 'learning_rate': 2.5507042253521128e-05, 'epoch': 7.76}


                                                      
 52%|█████▏    | 9700/18750 [1:16:00<45:09,  3.34it/s]

{'eval_loss': 1.3223357200622559, 'eval_runtime': 17.7169, 'eval_samples_per_second': 56.443, 'eval_steps_per_second': 14.111, 'epoch': 7.76}


 52%|█████▏    | 9800/18750 [1:16:30<44:05,  3.38it/s]   

{'loss': 0.9077, 'grad_norm': 2.3646528720855713, 'learning_rate': 2.522535211267606e-05, 'epoch': 7.84}


                                                      
 52%|█████▏    | 9800/18750 [1:16:48<44:05,  3.38it/s]

{'eval_loss': 1.4240007400512695, 'eval_runtime': 17.7829, 'eval_samples_per_second': 56.234, 'eval_steps_per_second': 14.058, 'epoch': 7.84}


 53%|█████▎    | 9900/18750 [1:17:17<43:49,  3.37it/s]   

{'loss': 0.9239, 'grad_norm': 3.984516143798828, 'learning_rate': 2.4943661971830987e-05, 'epoch': 7.92}


                                                      
 53%|█████▎    | 9900/18750 [1:17:35<43:49,  3.37it/s]

{'eval_loss': 1.3402910232543945, 'eval_runtime': 17.7691, 'eval_samples_per_second': 56.277, 'eval_steps_per_second': 14.069, 'epoch': 7.92}


 53%|█████▎    | 10000/18750 [1:18:05<42:53,  3.40it/s]  

{'loss': 0.9177, 'grad_norm': 1.6792755126953125, 'learning_rate': 2.4661971830985918e-05, 'epoch': 8.0}


                                                       
 53%|█████▎    | 10000/18750 [1:18:23<42:53,  3.40it/s]

{'eval_loss': 1.3603034019470215, 'eval_runtime': 17.7733, 'eval_samples_per_second': 56.264, 'eval_steps_per_second': 14.066, 'epoch': 8.0}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 54%|█████▍    | 10100/18750 [1:18:54<42:35,  3.39it/s]   

{'loss': 0.8722, 'grad_norm': 1.4576952457427979, 'learning_rate': 2.4380281690140845e-05, 'epoch': 8.08}


                                                       
 54%|█████▍    | 10100/18750 [1:19:12<42:35,  3.39it/s]

{'eval_loss': 1.4249725341796875, 'eval_runtime': 17.7312, 'eval_samples_per_second': 56.398, 'eval_steps_per_second': 14.099, 'epoch': 8.08}


 54%|█████▍    | 10200/18750 [1:19:41<42:20,  3.37it/s]   

{'loss': 0.9193, 'grad_norm': 1.9808785915374756, 'learning_rate': 2.4098591549295776e-05, 'epoch': 8.16}


                                                       
 54%|█████▍    | 10200/18750 [1:19:59<42:20,  3.37it/s]

{'eval_loss': 1.3659894466400146, 'eval_runtime': 17.7449, 'eval_samples_per_second': 56.354, 'eval_steps_per_second': 14.089, 'epoch': 8.16}


 55%|█████▍    | 10300/18750 [1:20:29<41:33,  3.39it/s]   

{'loss': 0.8844, 'grad_norm': 1.285557508468628, 'learning_rate': 2.3816901408450707e-05, 'epoch': 8.24}


                                                       
 55%|█████▍    | 10300/18750 [1:20:47<41:33,  3.39it/s]

{'eval_loss': 1.425472617149353, 'eval_runtime': 17.7146, 'eval_samples_per_second': 56.451, 'eval_steps_per_second': 14.113, 'epoch': 8.24}


 55%|█████▌    | 10400/18750 [1:21:16<41:00,  3.39it/s]   

{'loss': 0.8193, 'grad_norm': 1.4791103601455688, 'learning_rate': 2.3535211267605634e-05, 'epoch': 8.32}


                                                       
 55%|█████▌    | 10400/18750 [1:21:34<41:00,  3.39it/s]

{'eval_loss': 1.329767107963562, 'eval_runtime': 17.7844, 'eval_samples_per_second': 56.229, 'eval_steps_per_second': 14.057, 'epoch': 8.32}


 56%|█████▌    | 10500/18750 [1:22:04<40:38,  3.38it/s]   

{'loss': 0.7835, 'grad_norm': 1.24977445602417, 'learning_rate': 2.3253521126760562e-05, 'epoch': 8.4}


                                                       
 56%|█████▌    | 10500/18750 [1:22:21<40:38,  3.38it/s]

{'eval_loss': 1.397461175918579, 'eval_runtime': 17.7686, 'eval_samples_per_second': 56.279, 'eval_steps_per_second': 14.07, 'epoch': 8.4}


 57%|█████▋    | 10600/18750 [1:22:51<40:20,  3.37it/s]   

{'loss': 0.9203, 'grad_norm': 1.7182888984680176, 'learning_rate': 2.2971830985915493e-05, 'epoch': 8.48}


                                                       
 57%|█████▋    | 10600/18750 [1:23:09<40:20,  3.37it/s]

{'eval_loss': 1.3644989728927612, 'eval_runtime': 17.7247, 'eval_samples_per_second': 56.418, 'eval_steps_per_second': 14.105, 'epoch': 8.48}


 57%|█████▋    | 10700/18750 [1:23:38<39:47,  3.37it/s]   

{'loss': 0.9328, 'grad_norm': 2.9057133197784424, 'learning_rate': 2.2690140845070424e-05, 'epoch': 8.56}


                                                       
 57%|█████▋    | 10700/18750 [1:23:56<39:47,  3.37it/s]

{'eval_loss': 1.3905081748962402, 'eval_runtime': 17.7206, 'eval_samples_per_second': 56.431, 'eval_steps_per_second': 14.108, 'epoch': 8.56}


 58%|█████▊    | 10800/18750 [1:24:26<39:06,  3.39it/s]   

{'loss': 0.8466, 'grad_norm': 1.7808473110198975, 'learning_rate': 2.2408450704225355e-05, 'epoch': 8.64}


                                                       
 58%|█████▊    | 10800/18750 [1:24:44<39:06,  3.39it/s]

{'eval_loss': 1.3712522983551025, 'eval_runtime': 17.7572, 'eval_samples_per_second': 56.315, 'eval_steps_per_second': 14.079, 'epoch': 8.64}


 58%|█████▊    | 10900/18750 [1:25:13<38:36,  3.39it/s]   

{'loss': 0.9027, 'grad_norm': 1.9230836629867554, 'learning_rate': 2.2126760563380282e-05, 'epoch': 8.72}


                                                       
 58%|█████▊    | 10900/18750 [1:25:31<38:36,  3.39it/s]

{'eval_loss': 1.4051603078842163, 'eval_runtime': 17.7944, 'eval_samples_per_second': 56.197, 'eval_steps_per_second': 14.049, 'epoch': 8.72}


 59%|█████▊    | 11000/18750 [1:26:01<38:20,  3.37it/s]   

{'loss': 0.8958, 'grad_norm': 1.0485848188400269, 'learning_rate': 2.1845070422535213e-05, 'epoch': 8.8}


                                                       
 59%|█████▊    | 11000/18750 [1:26:18<38:20,  3.37it/s]

{'eval_loss': 1.3817914724349976, 'eval_runtime': 17.7283, 'eval_samples_per_second': 56.407, 'eval_steps_per_second': 14.102, 'epoch': 8.8}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 59%|█████▉    | 11100/18750 [1:26:50<38:03,  3.35it/s]   

{'loss': 0.8896, 'grad_norm': 1.8644956350326538, 'learning_rate': 2.1563380281690144e-05, 'epoch': 8.88}


                                                       
 59%|█████▉    | 11100/18750 [1:27:08<38:03,  3.35it/s]

{'eval_loss': 1.350766658782959, 'eval_runtime': 17.8023, 'eval_samples_per_second': 56.173, 'eval_steps_per_second': 14.043, 'epoch': 8.88}


 60%|█████▉    | 11200/18750 [1:27:37<37:27,  3.36it/s]   

{'loss': 0.9297, 'grad_norm': 1.3551831245422363, 'learning_rate': 2.128169014084507e-05, 'epoch': 8.96}


                                                       
 60%|█████▉    | 11200/18750 [1:27:55<37:27,  3.36it/s]

{'eval_loss': 1.3587535619735718, 'eval_runtime': 17.7435, 'eval_samples_per_second': 56.359, 'eval_steps_per_second': 14.09, 'epoch': 8.96}


 60%|██████    | 11300/18750 [1:28:25<36:33,  3.40it/s]   

{'loss': 0.9158, 'grad_norm': 1.5649324655532837, 'learning_rate': 2.100281690140845e-05, 'epoch': 9.04}


                                                       
 60%|██████    | 11300/18750 [1:28:43<36:33,  3.40it/s]

{'eval_loss': 1.4259426593780518, 'eval_runtime': 17.7709, 'eval_samples_per_second': 56.272, 'eval_steps_per_second': 14.068, 'epoch': 9.04}


 61%|██████    | 11400/18750 [1:29:12<36:21,  3.37it/s]   

{'loss': 0.8286, 'grad_norm': 1.0989841222763062, 'learning_rate': 2.072112676056338e-05, 'epoch': 9.12}


                                                       
 61%|██████    | 11400/18750 [1:29:30<36:21,  3.37it/s]

{'eval_loss': 1.4413496255874634, 'eval_runtime': 17.7559, 'eval_samples_per_second': 56.319, 'eval_steps_per_second': 14.08, 'epoch': 9.12}


 61%|██████▏   | 11500/18750 [1:30:00<35:56,  3.36it/s]   

{'loss': 0.8156, 'grad_norm': 1.5713773965835571, 'learning_rate': 2.043943661971831e-05, 'epoch': 9.2}


                                                       
 61%|██████▏   | 11500/18750 [1:30:17<35:56,  3.36it/s]

{'eval_loss': 1.4262018203735352, 'eval_runtime': 17.7493, 'eval_samples_per_second': 56.34, 'eval_steps_per_second': 14.085, 'epoch': 9.2}


 62%|██████▏   | 11600/18750 [1:30:47<35:03,  3.40it/s]   

{'loss': 0.8553, 'grad_norm': 2.832491636276245, 'learning_rate': 2.0157746478873242e-05, 'epoch': 9.28}


                                                       
 62%|██████▏   | 11600/18750 [1:31:05<35:03,  3.40it/s]

{'eval_loss': 1.4592199325561523, 'eval_runtime': 17.7353, 'eval_samples_per_second': 56.385, 'eval_steps_per_second': 14.096, 'epoch': 9.28}


 62%|██████▏   | 11700/18750 [1:31:35<34:46,  3.38it/s]   

{'loss': 0.8159, 'grad_norm': 2.21182918548584, 'learning_rate': 1.987605633802817e-05, 'epoch': 9.36}


                                                       
 62%|██████▏   | 11700/18750 [1:31:52<34:46,  3.38it/s]

{'eval_loss': 1.4003314971923828, 'eval_runtime': 17.7815, 'eval_samples_per_second': 56.238, 'eval_steps_per_second': 14.06, 'epoch': 9.36}


 63%|██████▎   | 11800/18750 [1:32:22<34:29,  3.36it/s]   

{'loss': 0.7855, 'grad_norm': 3.4857494831085205, 'learning_rate': 1.95943661971831e-05, 'epoch': 9.44}


                                                       
 63%|██████▎   | 11800/18750 [1:32:40<34:29,  3.36it/s]

{'eval_loss': 1.4398314952850342, 'eval_runtime': 17.7739, 'eval_samples_per_second': 56.262, 'eval_steps_per_second': 14.066, 'epoch': 9.44}


 63%|██████▎   | 11900/18750 [1:33:09<33:45,  3.38it/s]   

{'loss': 0.9216, 'grad_norm': 0.6695079207420349, 'learning_rate': 1.931267605633803e-05, 'epoch': 9.52}


                                                       
 63%|██████▎   | 11900/18750 [1:33:27<33:45,  3.38it/s]

{'eval_loss': 1.3898669481277466, 'eval_runtime': 17.7901, 'eval_samples_per_second': 56.211, 'eval_steps_per_second': 14.053, 'epoch': 9.52}


 64%|██████▍   | 12000/18750 [1:33:57<33:19,  3.38it/s]   

{'loss': 0.8545, 'grad_norm': 2.268627166748047, 'learning_rate': 1.903098591549296e-05, 'epoch': 9.6}


                                                       
 64%|██████▍   | 12000/18750 [1:34:15<33:19,  3.38it/s]

{'eval_loss': 1.4020329713821411, 'eval_runtime': 17.7806, 'eval_samples_per_second': 56.241, 'eval_steps_per_second': 14.06, 'epoch': 9.6}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 65%|██████▍   | 12100/18750 [1:34:46<32:53,  3.37it/s]   

{'loss': 0.8669, 'grad_norm': 1.415795922279358, 'learning_rate': 1.8749295774647887e-05, 'epoch': 9.68}


                                                       
 65%|██████▍   | 12100/18750 [1:35:04<32:53,  3.37it/s]

{'eval_loss': 1.434192419052124, 'eval_runtime': 17.7112, 'eval_samples_per_second': 56.461, 'eval_steps_per_second': 14.115, 'epoch': 9.68}


 65%|██████▌   | 12200/18750 [1:35:34<32:14,  3.39it/s]   

{'loss': 0.8876, 'grad_norm': 1.5792756080627441, 'learning_rate': 1.8467605633802817e-05, 'epoch': 9.76}


                                                       
 65%|██████▌   | 12200/18750 [1:35:51<32:14,  3.39it/s]

{'eval_loss': 1.3885256052017212, 'eval_runtime': 17.7678, 'eval_samples_per_second': 56.282, 'eval_steps_per_second': 14.07, 'epoch': 9.76}


 66%|██████▌   | 12300/18750 [1:36:21<31:50,  3.38it/s]   

{'loss': 0.8712, 'grad_norm': 2.926476001739502, 'learning_rate': 1.8185915492957748e-05, 'epoch': 9.84}


                                                       
 66%|██████▌   | 12300/18750 [1:36:39<31:50,  3.38it/s]

{'eval_loss': 1.4028844833374023, 'eval_runtime': 17.8363, 'eval_samples_per_second': 56.065, 'eval_steps_per_second': 14.016, 'epoch': 9.84}


 66%|██████▌   | 12400/18750 [1:37:09<31:12,  3.39it/s]   

{'loss': 0.9684, 'grad_norm': 3.732887029647827, 'learning_rate': 1.7904225352112676e-05, 'epoch': 9.92}


                                                       
 66%|██████▌   | 12400/18750 [1:37:26<31:12,  3.39it/s]

{'eval_loss': 1.43044114112854, 'eval_runtime': 17.7875, 'eval_samples_per_second': 56.219, 'eval_steps_per_second': 14.055, 'epoch': 9.92}


 67%|██████▋   | 12500/18750 [1:37:56<30:53,  3.37it/s]  

{'loss': 0.8379, 'grad_norm': 3.095672607421875, 'learning_rate': 1.7622535211267607e-05, 'epoch': 10.0}


                                                       
 67%|██████▋   | 12500/18750 [1:38:14<30:53,  3.37it/s]

{'eval_loss': 1.468225121498108, 'eval_runtime': 17.8245, 'eval_samples_per_second': 56.102, 'eval_steps_per_second': 14.026, 'epoch': 10.0}


 67%|██████▋   | 12600/18750 [1:38:43<30:22,  3.37it/s]  

{'loss': 0.8216, 'grad_norm': 2.6743414402008057, 'learning_rate': 1.7340845070422538e-05, 'epoch': 10.08}


                                                       
 67%|██████▋   | 12600/18750 [1:39:01<30:22,  3.37it/s]

{'eval_loss': 1.4859977960586548, 'eval_runtime': 17.6886, 'eval_samples_per_second': 56.534, 'eval_steps_per_second': 14.133, 'epoch': 10.08}


 68%|██████▊   | 12700/18750 [1:39:31<29:51,  3.38it/s]  

{'loss': 0.853, 'grad_norm': 1.0229220390319824, 'learning_rate': 1.7059154929577465e-05, 'epoch': 10.16}


                                                       
 68%|██████▊   | 12700/18750 [1:39:49<29:51,  3.38it/s]

{'eval_loss': 1.461850643157959, 'eval_runtime': 17.7762, 'eval_samples_per_second': 56.255, 'eval_steps_per_second': 14.064, 'epoch': 10.16}


 68%|██████▊   | 12800/18750 [1:40:18<29:16,  3.39it/s]  

{'loss': 0.8715, 'grad_norm': 1.6502068042755127, 'learning_rate': 1.6777464788732396e-05, 'epoch': 10.24}


                                                       
 68%|██████▊   | 12800/18750 [1:40:36<29:16,  3.39it/s]

{'eval_loss': 1.468833088874817, 'eval_runtime': 17.9259, 'eval_samples_per_second': 55.785, 'eval_steps_per_second': 13.946, 'epoch': 10.24}


 69%|██████▉   | 12900/18750 [1:41:06<28:51,  3.38it/s]  

{'loss': 0.8353, 'grad_norm': 1.6979864835739136, 'learning_rate': 1.6495774647887323e-05, 'epoch': 10.32}


                                                       
 69%|██████▉   | 12900/18750 [1:41:23<28:51,  3.38it/s]

{'eval_loss': 1.478946328163147, 'eval_runtime': 17.6836, 'eval_samples_per_second': 56.549, 'eval_steps_per_second': 14.137, 'epoch': 10.32}


 69%|██████▉   | 13000/18750 [1:41:53<28:21,  3.38it/s]  

{'loss': 0.9, 'grad_norm': 2.3783955574035645, 'learning_rate': 1.6214084507042254e-05, 'epoch': 10.4}


                                                       
 69%|██████▉   | 13000/18750 [1:42:11<28:21,  3.38it/s]

{'eval_loss': 1.4746257066726685, 'eval_runtime': 17.669, 'eval_samples_per_second': 56.596, 'eval_steps_per_second': 14.149, 'epoch': 10.4}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 70%|██████▉   | 13100/18750 [1:42:42<27:51,  3.38it/s]  

{'loss': 0.8643, 'grad_norm': 1.9976886510849, 'learning_rate': 1.5932394366197185e-05, 'epoch': 10.48}


                                                       
 70%|██████▉   | 13100/18750 [1:43:00<27:51,  3.38it/s]

{'eval_loss': 1.4800453186035156, 'eval_runtime': 17.6647, 'eval_samples_per_second': 56.61, 'eval_steps_per_second': 14.153, 'epoch': 10.48}


 70%|███████   | 13200/18750 [1:43:30<27:19,  3.39it/s]  

{'loss': 0.8583, 'grad_norm': 1.4422805309295654, 'learning_rate': 1.5650704225352113e-05, 'epoch': 10.56}


                                                       
 70%|███████   | 13200/18750 [1:43:47<27:19,  3.39it/s]

{'eval_loss': 1.455306887626648, 'eval_runtime': 17.6363, 'eval_samples_per_second': 56.701, 'eval_steps_per_second': 14.175, 'epoch': 10.56}


 71%|███████   | 13300/18750 [1:44:17<26:47,  3.39it/s]  

{'loss': 0.858, 'grad_norm': 1.7057011127471924, 'learning_rate': 1.5369014084507044e-05, 'epoch': 10.64}


                                                       
 71%|███████   | 13300/18750 [1:44:34<26:47,  3.39it/s]

{'eval_loss': 1.4444957971572876, 'eval_runtime': 17.6811, 'eval_samples_per_second': 56.557, 'eval_steps_per_second': 14.139, 'epoch': 10.64}


 71%|███████▏  | 13400/18750 [1:45:04<26:18,  3.39it/s]  

{'loss': 0.8288, 'grad_norm': 2.4579336643218994, 'learning_rate': 1.5087323943661971e-05, 'epoch': 10.72}


                                                       
 71%|███████▏  | 13400/18750 [1:45:22<26:18,  3.39it/s]

{'eval_loss': 1.4601469039916992, 'eval_runtime': 17.6428, 'eval_samples_per_second': 56.68, 'eval_steps_per_second': 14.17, 'epoch': 10.72}


 72%|███████▏  | 13500/18750 [1:45:51<25:47,  3.39it/s]  

{'loss': 0.7788, 'grad_norm': 1.376680850982666, 'learning_rate': 1.48056338028169e-05, 'epoch': 10.8}


                                                       
 72%|███████▏  | 13500/18750 [1:46:09<25:47,  3.39it/s]

{'eval_loss': 1.4508514404296875, 'eval_runtime': 17.6492, 'eval_samples_per_second': 56.66, 'eval_steps_per_second': 14.165, 'epoch': 10.8}


 73%|███████▎  | 13600/18750 [1:46:38<25:11,  3.41it/s]  

{'loss': 0.7181, 'grad_norm': 1.3144381046295166, 'learning_rate': 1.4523943661971831e-05, 'epoch': 10.88}


                                                       
 73%|███████▎  | 13600/18750 [1:46:56<25:11,  3.41it/s]

{'eval_loss': 1.5091283321380615, 'eval_runtime': 17.5837, 'eval_samples_per_second': 56.871, 'eval_steps_per_second': 14.218, 'epoch': 10.88}


 73%|███████▎  | 13700/18750 [1:47:26<24:52,  3.38it/s]  

{'loss': 0.9366, 'grad_norm': 2.912564277648926, 'learning_rate': 1.424225352112676e-05, 'epoch': 10.96}


                                                       
 73%|███████▎  | 13700/18750 [1:47:43<24:52,  3.38it/s]

{'eval_loss': 1.4580790996551514, 'eval_runtime': 17.6155, 'eval_samples_per_second': 56.768, 'eval_steps_per_second': 14.192, 'epoch': 10.96}


 74%|███████▎  | 13800/18750 [1:48:13<24:25,  3.38it/s]  

{'loss': 0.837, 'grad_norm': 2.5886309146881104, 'learning_rate': 1.3960563380281691e-05, 'epoch': 11.04}


                                                       
 74%|███████▎  | 13800/18750 [1:48:30<24:25,  3.38it/s]

{'eval_loss': 1.5052012205123901, 'eval_runtime': 17.6202, 'eval_samples_per_second': 56.753, 'eval_steps_per_second': 14.188, 'epoch': 11.04}


 74%|███████▍  | 13900/18750 [1:49:00<23:54,  3.38it/s]  

{'loss': 0.8406, 'grad_norm': 3.7600340843200684, 'learning_rate': 1.367887323943662e-05, 'epoch': 11.12}


                                                       
 74%|███████▍  | 13900/18750 [1:49:18<23:54,  3.38it/s]

{'eval_loss': 1.5279126167297363, 'eval_runtime': 17.5816, 'eval_samples_per_second': 56.878, 'eval_steps_per_second': 14.219, 'epoch': 11.12}


 75%|███████▍  | 14000/18750 [1:49:47<23:19,  3.39it/s]  

{'loss': 0.7335, 'grad_norm': 1.1266881227493286, 'learning_rate': 1.3397183098591551e-05, 'epoch': 11.2}


                                                       
 75%|███████▍  | 14000/18750 [1:50:05<23:19,  3.39it/s]

{'eval_loss': 1.5326777696609497, 'eval_runtime': 17.6122, 'eval_samples_per_second': 56.779, 'eval_steps_per_second': 14.195, 'epoch': 11.2}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 75%|███████▌  | 14100/18750 [1:50:36<22:55,  3.38it/s]  

{'loss': 0.8597, 'grad_norm': 1.7756388187408447, 'learning_rate': 1.311549295774648e-05, 'epoch': 11.28}


                                                       
 75%|███████▌  | 14100/18750 [1:50:54<22:55,  3.38it/s]

{'eval_loss': 1.5357024669647217, 'eval_runtime': 17.5523, 'eval_samples_per_second': 56.973, 'eval_steps_per_second': 14.243, 'epoch': 11.28}


 76%|███████▌  | 14200/18750 [1:51:23<22:23,  3.39it/s]  

{'loss': 0.7479, 'grad_norm': 1.9743074178695679, 'learning_rate': 1.2833802816901408e-05, 'epoch': 11.36}


                                                       
 76%|███████▌  | 14200/18750 [1:51:41<22:23,  3.39it/s]

{'eval_loss': 1.4931561946868896, 'eval_runtime': 17.5796, 'eval_samples_per_second': 56.884, 'eval_steps_per_second': 14.221, 'epoch': 11.36}


 76%|███████▋  | 14300/18750 [1:52:11<21:50,  3.40it/s]  

{'loss': 0.8678, 'grad_norm': 2.4293272495269775, 'learning_rate': 1.2552112676056337e-05, 'epoch': 11.44}


                                                       
 76%|███████▋  | 14300/18750 [1:52:28<21:50,  3.40it/s]

{'eval_loss': 1.4544283151626587, 'eval_runtime': 17.5924, 'eval_samples_per_second': 56.843, 'eval_steps_per_second': 14.211, 'epoch': 11.44}


 77%|███████▋  | 14400/18750 [1:52:58<21:24,  3.39it/s]  

{'loss': 0.8166, 'grad_norm': 1.852067470550537, 'learning_rate': 1.2270422535211268e-05, 'epoch': 11.52}


                                                       
 77%|███████▋  | 14400/18750 [1:53:15<21:24,  3.39it/s]

{'eval_loss': 1.5048028230667114, 'eval_runtime': 17.6098, 'eval_samples_per_second': 56.787, 'eval_steps_per_second': 14.197, 'epoch': 11.52}


 77%|███████▋  | 14500/18750 [1:53:45<20:53,  3.39it/s]  

{'loss': 0.8024, 'grad_norm': 2.02805495262146, 'learning_rate': 1.1988732394366197e-05, 'epoch': 11.6}


                                                       
 77%|███████▋  | 14500/18750 [1:54:02<20:53,  3.39it/s]

{'eval_loss': 1.4930434226989746, 'eval_runtime': 17.5462, 'eval_samples_per_second': 56.992, 'eval_steps_per_second': 14.248, 'epoch': 11.6}


 78%|███████▊  | 14600/18750 [1:54:32<20:25,  3.39it/s]  

{'loss': 0.8425, 'grad_norm': 1.8504719734191895, 'learning_rate': 1.1707042253521128e-05, 'epoch': 11.68}


                                                       
 78%|███████▊  | 14600/18750 [1:54:50<20:25,  3.39it/s]

{'eval_loss': 1.5065891742706299, 'eval_runtime': 17.6447, 'eval_samples_per_second': 56.674, 'eval_steps_per_second': 14.169, 'epoch': 11.68}


 78%|███████▊  | 14700/18750 [1:55:19<19:55,  3.39it/s]  

{'loss': 0.8839, 'grad_norm': 3.4160032272338867, 'learning_rate': 1.1425352112676056e-05, 'epoch': 11.76}


                                                       
 78%|███████▊  | 14700/18750 [1:55:37<19:55,  3.39it/s]

{'eval_loss': 1.5296714305877686, 'eval_runtime': 17.609, 'eval_samples_per_second': 56.789, 'eval_steps_per_second': 14.197, 'epoch': 11.76}


 79%|███████▉  | 14800/18750 [1:56:06<19:19,  3.41it/s]  

{'loss': 0.8315, 'grad_norm': 2.8856067657470703, 'learning_rate': 1.1143661971830987e-05, 'epoch': 11.84}


                                                       
 79%|███████▉  | 14800/18750 [1:56:24<19:19,  3.41it/s]

{'eval_loss': 1.5576735734939575, 'eval_runtime': 17.5957, 'eval_samples_per_second': 56.832, 'eval_steps_per_second': 14.208, 'epoch': 11.84}


 79%|███████▉  | 14900/18750 [1:56:54<18:58,  3.38it/s]  

{'loss': 0.8411, 'grad_norm': 1.6864482164382935, 'learning_rate': 1.0861971830985916e-05, 'epoch': 11.92}


                                                       
 79%|███████▉  | 14900/18750 [1:57:11<18:58,  3.38it/s]

{'eval_loss': 1.5557301044464111, 'eval_runtime': 17.6652, 'eval_samples_per_second': 56.608, 'eval_steps_per_second': 14.152, 'epoch': 11.92}


 80%|████████  | 15000/18750 [1:57:41<18:23,  3.40it/s]  

{'loss': 0.8633, 'grad_norm': 1.3920440673828125, 'learning_rate': 1.0580281690140847e-05, 'epoch': 12.0}


                                                       
 80%|████████  | 15000/18750 [1:57:58<18:23,  3.40it/s]

{'eval_loss': 1.4911667108535767, 'eval_runtime': 17.6023, 'eval_samples_per_second': 56.811, 'eval_steps_per_second': 14.203, 'epoch': 12.0}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 81%|████████  | 15100/18750 [1:58:30<17:55,  3.39it/s]  

{'loss': 0.7901, 'grad_norm': 2.0775060653686523, 'learning_rate': 1.0298591549295774e-05, 'epoch': 12.08}


                                                       
 81%|████████  | 15100/18750 [1:58:47<17:55,  3.39it/s]

{'eval_loss': 1.5176364183425903, 'eval_runtime': 17.6039, 'eval_samples_per_second': 56.806, 'eval_steps_per_second': 14.201, 'epoch': 12.08}


 81%|████████  | 15200/18750 [1:59:17<17:31,  3.38it/s]  

{'loss': 0.7696, 'grad_norm': 0.9041844010353088, 'learning_rate': 1.0016901408450705e-05, 'epoch': 12.16}


                                                       
 81%|████████  | 15200/18750 [1:59:35<17:31,  3.38it/s]

{'eval_loss': 1.610163688659668, 'eval_runtime': 17.6718, 'eval_samples_per_second': 56.587, 'eval_steps_per_second': 14.147, 'epoch': 12.16}


 82%|████████▏ | 15300/18750 [2:00:04<16:56,  3.39it/s]  

{'loss': 0.8609, 'grad_norm': 2.5220086574554443, 'learning_rate': 9.735211267605634e-06, 'epoch': 12.24}


                                                       
 82%|████████▏ | 15300/18750 [2:00:22<16:56,  3.39it/s]

{'eval_loss': 1.5724436044692993, 'eval_runtime': 17.6398, 'eval_samples_per_second': 56.69, 'eval_steps_per_second': 14.172, 'epoch': 12.24}


 82%|████████▏ | 15400/18750 [2:00:51<16:27,  3.39it/s]  

{'loss': 0.7286, 'grad_norm': 1.7742081880569458, 'learning_rate': 9.453521126760565e-06, 'epoch': 12.32}


                                                       
 82%|████████▏ | 15400/18750 [2:01:09<16:27,  3.39it/s]

{'eval_loss': 1.5428725481033325, 'eval_runtime': 17.5529, 'eval_samples_per_second': 56.971, 'eval_steps_per_second': 14.243, 'epoch': 12.32}


 83%|████████▎ | 15500/18750 [2:01:38<16:09,  3.35it/s]  

{'loss': 0.8649, 'grad_norm': 2.1264116764068604, 'learning_rate': 9.171830985915493e-06, 'epoch': 12.4}


                                                       
 83%|████████▎ | 15500/18750 [2:01:56<16:09,  3.35it/s]

{'eval_loss': 1.5448437929153442, 'eval_runtime': 17.6174, 'eval_samples_per_second': 56.762, 'eval_steps_per_second': 14.19, 'epoch': 12.4}


 83%|████████▎ | 15600/18750 [2:02:26<15:26,  3.40it/s]  

{'loss': 0.8488, 'grad_norm': 0.8759695291519165, 'learning_rate': 8.890140845070424e-06, 'epoch': 12.48}


                                                       
 83%|████████▎ | 15600/18750 [2:02:43<15:26,  3.40it/s]

{'eval_loss': 1.5545673370361328, 'eval_runtime': 17.6145, 'eval_samples_per_second': 56.771, 'eval_steps_per_second': 14.193, 'epoch': 12.48}


 84%|████████▎ | 15700/18750 [2:03:13<14:59,  3.39it/s]  

{'loss': 0.7781, 'grad_norm': 1.8708807229995728, 'learning_rate': 8.608450704225353e-06, 'epoch': 12.56}


                                                       
 84%|████████▎ | 15700/18750 [2:03:30<14:59,  3.39it/s]

{'eval_loss': 1.5458581447601318, 'eval_runtime': 17.6234, 'eval_samples_per_second': 56.743, 'eval_steps_per_second': 14.186, 'epoch': 12.56}


 84%|████████▍ | 15800/18750 [2:04:00<14:28,  3.40it/s]  

{'loss': 0.7927, 'grad_norm': 2.3047397136688232, 'learning_rate': 8.329577464788733e-06, 'epoch': 12.64}


                                                       
 84%|████████▍ | 15800/18750 [2:04:17<14:28,  3.40it/s]

{'eval_loss': 1.572354793548584, 'eval_runtime': 17.6389, 'eval_samples_per_second': 56.693, 'eval_steps_per_second': 14.173, 'epoch': 12.64}


 85%|████████▍ | 15900/18750 [2:04:47<13:57,  3.40it/s]  

{'loss': 0.8107, 'grad_norm': 2.0808393955230713, 'learning_rate': 8.047887323943662e-06, 'epoch': 12.72}


                                                       
 85%|████████▍ | 15900/18750 [2:05:05<13:57,  3.40it/s]

{'eval_loss': 1.5434354543685913, 'eval_runtime': 17.6252, 'eval_samples_per_second': 56.737, 'eval_steps_per_second': 14.184, 'epoch': 12.72}


 85%|████████▌ | 16000/18750 [2:05:34<13:35,  3.37it/s]  

{'loss': 0.8475, 'grad_norm': 1.8716864585876465, 'learning_rate': 7.766197183098591e-06, 'epoch': 12.8}


                                                       
 85%|████████▌ | 16000/18750 [2:05:52<13:35,  3.37it/s]

{'eval_loss': 1.5719609260559082, 'eval_runtime': 17.6483, 'eval_samples_per_second': 56.663, 'eval_steps_per_second': 14.166, 'epoch': 12.8}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 86%|████████▌ | 16100/18750 [2:06:23<13:02,  3.38it/s]  

{'loss': 0.8575, 'grad_norm': 1.4905163049697876, 'learning_rate': 7.484507042253522e-06, 'epoch': 12.88}


                                                       
 86%|████████▌ | 16100/18750 [2:06:41<13:02,  3.38it/s]

{'eval_loss': 1.6027911901474, 'eval_runtime': 17.633, 'eval_samples_per_second': 56.712, 'eval_steps_per_second': 14.178, 'epoch': 12.88}


 86%|████████▋ | 16200/18750 [2:07:10<12:34,  3.38it/s]  

{'loss': 0.787, 'grad_norm': 1.687540888786316, 'learning_rate': 7.20281690140845e-06, 'epoch': 12.96}


                                                       
 86%|████████▋ | 16200/18750 [2:07:28<12:34,  3.38it/s]

{'eval_loss': 1.5606396198272705, 'eval_runtime': 17.6281, 'eval_samples_per_second': 56.728, 'eval_steps_per_second': 14.182, 'epoch': 12.96}


 87%|████████▋ | 16300/18750 [2:07:57<12:05,  3.38it/s]  

{'loss': 0.8661, 'grad_norm': 2.212331771850586, 'learning_rate': 6.9211267605633804e-06, 'epoch': 13.04}


                                                       
 87%|████████▋ | 16300/18750 [2:08:15<12:05,  3.38it/s]

{'eval_loss': 1.585998296737671, 'eval_runtime': 17.626, 'eval_samples_per_second': 56.734, 'eval_steps_per_second': 14.184, 'epoch': 13.04}


 87%|████████▋ | 16400/18750 [2:08:45<11:30,  3.40it/s]  

{'loss': 0.7037, 'grad_norm': 2.1096439361572266, 'learning_rate': 6.6394366197183105e-06, 'epoch': 13.12}


                                                       
 87%|████████▋ | 16400/18750 [2:09:02<11:30,  3.40it/s]

{'eval_loss': 1.5964809656143188, 'eval_runtime': 17.6465, 'eval_samples_per_second': 56.668, 'eval_steps_per_second': 14.167, 'epoch': 13.12}


 88%|████████▊ | 16500/18750 [2:09:32<11:07,  3.37it/s]  

{'loss': 0.8371, 'grad_norm': 0.9980542063713074, 'learning_rate': 6.3577464788732405e-06, 'epoch': 13.2}


                                                       
 88%|████████▊ | 16500/18750 [2:09:49<11:07,  3.37it/s]

{'eval_loss': 1.6000549793243408, 'eval_runtime': 17.6895, 'eval_samples_per_second': 56.531, 'eval_steps_per_second': 14.133, 'epoch': 13.2}


 89%|████████▊ | 16600/18750 [2:10:19<10:36,  3.38it/s]  

{'loss': 0.7752, 'grad_norm': 0.8412716388702393, 'learning_rate': 6.07605633802817e-06, 'epoch': 13.28}


                                                       
 89%|████████▊ | 16600/18750 [2:10:37<10:36,  3.38it/s]

{'eval_loss': 1.6199508905410767, 'eval_runtime': 17.6366, 'eval_samples_per_second': 56.7, 'eval_steps_per_second': 14.175, 'epoch': 13.28}


 89%|████████▉ | 16700/18750 [2:11:06<10:03,  3.40it/s]  

{'loss': 0.8493, 'grad_norm': 1.1205248832702637, 'learning_rate': 5.794366197183099e-06, 'epoch': 13.36}


                                                       
 89%|████████▉ | 16700/18750 [2:11:24<10:03,  3.40it/s]

{'eval_loss': 1.6354548931121826, 'eval_runtime': 17.6558, 'eval_samples_per_second': 56.639, 'eval_steps_per_second': 14.16, 'epoch': 13.36}


 90%|████████▉ | 16800/18750 [2:11:53<09:34,  3.39it/s]  

{'loss': 0.7691, 'grad_norm': 2.7253963947296143, 'learning_rate': 5.512676056338029e-06, 'epoch': 13.44}


                                                       
 90%|████████▉ | 16800/18750 [2:12:11<09:34,  3.39it/s]

{'eval_loss': 1.5921560525894165, 'eval_runtime': 17.6297, 'eval_samples_per_second': 56.722, 'eval_steps_per_second': 14.181, 'epoch': 13.44}


 90%|█████████ | 16900/18750 [2:12:40<09:09,  3.37it/s]  

{'loss': 0.7448, 'grad_norm': 1.6133087873458862, 'learning_rate': 5.230985915492958e-06, 'epoch': 13.52}


                                                       
 90%|█████████ | 16900/18750 [2:12:58<09:09,  3.37it/s]

{'eval_loss': 1.610702395439148, 'eval_runtime': 17.6468, 'eval_samples_per_second': 56.668, 'eval_steps_per_second': 14.167, 'epoch': 13.52}


 91%|█████████ | 17000/18750 [2:13:28<08:35,  3.39it/s]  

{'loss': 0.8467, 'grad_norm': 1.5655022859573364, 'learning_rate': 4.949295774647888e-06, 'epoch': 13.6}


                                                       
 91%|█████████ | 17000/18750 [2:13:45<08:35,  3.39it/s]

{'eval_loss': 1.5913407802581787, 'eval_runtime': 17.612, 'eval_samples_per_second': 56.779, 'eval_steps_per_second': 14.195, 'epoch': 13.6}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 91%|█████████ | 17100/18750 [2:14:17<08:09,  3.37it/s]  

{'loss': 0.8198, 'grad_norm': 1.4489446878433228, 'learning_rate': 4.667605633802817e-06, 'epoch': 13.68}


                                                       
 91%|█████████ | 17100/18750 [2:14:34<08:09,  3.37it/s]

{'eval_loss': 1.6005759239196777, 'eval_runtime': 17.6449, 'eval_samples_per_second': 56.674, 'eval_steps_per_second': 14.168, 'epoch': 13.68}


 92%|█████████▏| 17200/18750 [2:15:04<07:41,  3.36it/s]  

{'loss': 0.8376, 'grad_norm': 1.5042181015014648, 'learning_rate': 4.385915492957747e-06, 'epoch': 13.76}


                                                       
 92%|█████████▏| 17200/18750 [2:15:22<07:41,  3.36it/s]

{'eval_loss': 1.5862282514572144, 'eval_runtime': 17.6285, 'eval_samples_per_second': 56.726, 'eval_steps_per_second': 14.182, 'epoch': 13.76}


 92%|█████████▏| 17300/18750 [2:15:51<07:06,  3.40it/s]  

{'loss': 0.8564, 'grad_norm': 1.244521141052246, 'learning_rate': 4.104225352112677e-06, 'epoch': 13.84}


                                                       
 92%|█████████▏| 17300/18750 [2:16:09<07:06,  3.40it/s]

{'eval_loss': 1.586473822593689, 'eval_runtime': 17.5624, 'eval_samples_per_second': 56.94, 'eval_steps_per_second': 14.235, 'epoch': 13.84}


 93%|█████████▎| 17400/18750 [2:16:38<06:37,  3.40it/s]  

{'loss': 0.7888, 'grad_norm': 1.572861909866333, 'learning_rate': 3.822535211267606e-06, 'epoch': 13.92}


                                                       
 93%|█████████▎| 17400/18750 [2:16:56<06:37,  3.40it/s]

{'eval_loss': 1.5974068641662598, 'eval_runtime': 17.6231, 'eval_samples_per_second': 56.744, 'eval_steps_per_second': 14.186, 'epoch': 13.92}


 93%|█████████▎| 17500/18750 [2:17:25<06:08,  3.39it/s]  

{'loss': 0.7517, 'grad_norm': 3.0447518825531006, 'learning_rate': 3.5408450704225355e-06, 'epoch': 14.0}


                                                       
 93%|█████████▎| 17500/18750 [2:17:43<06:08,  3.39it/s]

{'eval_loss': 1.6077128648757935, 'eval_runtime': 17.6693, 'eval_samples_per_second': 56.595, 'eval_steps_per_second': 14.149, 'epoch': 14.0}


 94%|█████████▍| 17600/18750 [2:18:13<05:39,  3.39it/s]  

{'loss': 0.8186, 'grad_norm': 2.185784339904785, 'learning_rate': 3.2591549295774647e-06, 'epoch': 14.08}


                                                       
 94%|█████████▍| 17600/18750 [2:18:30<05:39,  3.39it/s]

{'eval_loss': 1.612874984741211, 'eval_runtime': 17.5804, 'eval_samples_per_second': 56.882, 'eval_steps_per_second': 14.22, 'epoch': 14.08}


 94%|█████████▍| 17700/18750 [2:19:00<05:10,  3.39it/s]  

{'loss': 0.921, 'grad_norm': 2.2952492237091064, 'learning_rate': 2.9774647887323947e-06, 'epoch': 14.16}


                                                       
 94%|█████████▍| 17700/18750 [2:19:17<05:10,  3.39it/s]

{'eval_loss': 1.6246389150619507, 'eval_runtime': 17.692, 'eval_samples_per_second': 56.523, 'eval_steps_per_second': 14.131, 'epoch': 14.16}


 95%|█████████▍| 17800/18750 [2:19:47<04:39,  3.40it/s]  

{'loss': 0.8245, 'grad_norm': 3.2878284454345703, 'learning_rate': 2.6957746478873243e-06, 'epoch': 14.24}


                                                       
 95%|█████████▍| 17800/18750 [2:20:04<04:39,  3.40it/s]

{'eval_loss': 1.6052160263061523, 'eval_runtime': 17.5478, 'eval_samples_per_second': 56.987, 'eval_steps_per_second': 14.247, 'epoch': 14.24}


 95%|█████████▌| 17900/18750 [2:20:34<04:09,  3.40it/s]  

{'loss': 0.8166, 'grad_norm': 1.5348871946334839, 'learning_rate': 2.4169014084507042e-06, 'epoch': 14.32}


                                                       
 95%|█████████▌| 17900/18750 [2:20:52<04:09,  3.40it/s]

{'eval_loss': 1.6210967302322388, 'eval_runtime': 17.6095, 'eval_samples_per_second': 56.788, 'eval_steps_per_second': 14.197, 'epoch': 14.32}


 96%|█████████▌| 18000/18750 [2:21:21<03:40,  3.40it/s]  

{'loss': 0.7606, 'grad_norm': 1.5723938941955566, 'learning_rate': 2.135211267605634e-06, 'epoch': 14.4}


                                                       
 96%|█████████▌| 18000/18750 [2:21:39<03:40,  3.40it/s]

{'eval_loss': 1.618011474609375, 'eval_runtime': 17.6416, 'eval_samples_per_second': 56.684, 'eval_steps_per_second': 14.171, 'epoch': 14.4}


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
 97%|█████████▋| 18100/18750 [2:22:10<03:11,  3.39it/s]  

{'loss': 0.7625, 'grad_norm': 0.9011975526809692, 'learning_rate': 1.8535211267605635e-06, 'epoch': 14.48}


                                                       
 97%|█████████▋| 18100/18750 [2:22:28<03:11,  3.39it/s]

{'eval_loss': 1.6295359134674072, 'eval_runtime': 17.6761, 'eval_samples_per_second': 56.574, 'eval_steps_per_second': 14.143, 'epoch': 14.48}


 97%|█████████▋| 18200/18750 [2:22:57<02:42,  3.38it/s]  

{'loss': 0.7476, 'grad_norm': 2.1844046115875244, 'learning_rate': 1.571830985915493e-06, 'epoch': 14.56}


                                                       
 97%|█████████▋| 18200/18750 [2:23:15<02:42,  3.38it/s]

{'eval_loss': 1.6357841491699219, 'eval_runtime': 17.6089, 'eval_samples_per_second': 56.79, 'eval_steps_per_second': 14.197, 'epoch': 14.56}


 98%|█████████▊| 18300/18750 [2:23:44<02:12,  3.41it/s]

{'loss': 0.7722, 'grad_norm': 2.298203468322754, 'learning_rate': 1.2901408450704225e-06, 'epoch': 14.64}


                                                       
 98%|█████████▊| 18300/18750 [2:24:02<02:12,  3.41it/s]

{'eval_loss': 1.624680757522583, 'eval_runtime': 17.6063, 'eval_samples_per_second': 56.798, 'eval_steps_per_second': 14.199, 'epoch': 14.64}


 98%|█████████▊| 18400/18750 [2:24:32<01:43,  3.39it/s]

{'loss': 0.7622, 'grad_norm': 1.57817804813385, 'learning_rate': 1.0084507042253521e-06, 'epoch': 14.72}


                                                       
 98%|█████████▊| 18400/18750 [2:24:49<01:43,  3.39it/s]

{'eval_loss': 1.6210850477218628, 'eval_runtime': 17.6123, 'eval_samples_per_second': 56.779, 'eval_steps_per_second': 14.195, 'epoch': 14.72}


 99%|█████████▊| 18500/18750 [2:25:19<01:13,  3.39it/s]

{'loss': 0.7524, 'grad_norm': 1.6109455823898315, 'learning_rate': 7.267605633802816e-07, 'epoch': 14.8}


                                                       
 99%|█████████▊| 18500/18750 [2:25:36<01:13,  3.39it/s]

{'eval_loss': 1.636697769165039, 'eval_runtime': 17.6502, 'eval_samples_per_second': 56.657, 'eval_steps_per_second': 14.164, 'epoch': 14.8}


 99%|█████████▉| 18600/18750 [2:26:06<00:44,  3.38it/s]

{'loss': 0.7623, 'grad_norm': 1.5346399545669556, 'learning_rate': 4.4507042253521126e-07, 'epoch': 14.88}


                                                       
 99%|█████████▉| 18600/18750 [2:26:23<00:44,  3.38it/s]

{'eval_loss': 1.6343287229537964, 'eval_runtime': 17.5464, 'eval_samples_per_second': 56.992, 'eval_steps_per_second': 14.248, 'epoch': 14.88}


100%|█████████▉| 18700/18750 [2:26:53<00:14,  3.39it/s]

{'loss': 0.8256, 'grad_norm': 2.462005853652954, 'learning_rate': 1.6338028169014086e-07, 'epoch': 14.96}


                                                       
100%|█████████▉| 18700/18750 [2:27:11<00:14,  3.39it/s]

{'eval_loss': 1.6356921195983887, 'eval_runtime': 17.6319, 'eval_samples_per_second': 56.715, 'eval_steps_per_second': 14.179, 'epoch': 14.96}


100%|██████████| 18750/18750 [2:27:27<00:00,  2.12it/s]

{'train_runtime': 8847.7985, 'train_samples_per_second': 8.477, 'train_steps_per_second': 2.119, 'train_loss': 1.0105573541259765, 'epoch': 15.0}





TrainOutput(global_step=18750, training_loss=1.0105573541259765, metrics={'train_runtime': 8847.7985, 'train_samples_per_second': 8.477, 'train_steps_per_second': 2.119, 'total_flos': 1.97449097472e+16, 'train_loss': 1.0105573541259765, 'epoch': 15.0})

In [13]:
model.save_pretrained("./codebert-finetuned-roberta")
tokenizer.save_pretrained("./codebert-finetuned-roberta")

('./codebert-finetuned-roberta\\tokenizer_config.json',
 './codebert-finetuned-roberta\\special_tokens_map.json',
 './codebert-finetuned-roberta\\vocab.json',
 './codebert-finetuned-roberta\\merges.txt',
 './codebert-finetuned-roberta\\added_tokens.json')

In [14]:
def generate_documentation(code_snippet, model, tokenizer):
    # Tokenize input
    inputs = tokenizer(code_snippet, return_tensors="pt", padding=True, truncation=True).to(device)
    
    # Generate output
    outputs = model.generate(inputs["input_ids"], max_length=150, num_beams=5, early_stopping=True)
    
    # Decode generated text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [15]:
code_example = """
def add_numbers(a, b):
    return a + c
"""

In [16]:
documentation = generate_documentation(code_example, model, tokenizer)
print("Generated Documentation:", documentation)

Generated Documentation: 
def add_numbers(a, b):
    return a + c



In [17]:
model

RobertaForCausalLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor