In [1]:
from balm.data import load_dataset, DataCollator
from balm.models import (
    BalmForMaskedLM,
    BalmMoEForMaskedLM,
    BalmExpertChoiceMoEForMaskedLM,
    BalmHybridMoEForMaskedLM,
)
from balm.tokenizer import Tokenizer
from balm.train import Trainer

# from datasets import load_dataset
# from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, EsmTokenizer

import os

import wandb



In [2]:
# Build the project tokenizer from the local vocabulary file.
tokenizer = Tokenizer(vocab="./vocab.json")
# Disabled alternative using the Hugging Face ESM tokenizer:
# tokenizer = EsmTokenizer.from_pretrained("./vocab.json")



In [3]:
def remove_sep(txt):
    """Swap every ``</s>`` separator in ``txt`` for a ``<cls><cls>`` pair.

    Args:
        txt: The raw text line to rewrite.

    Returns:
        A new string in which each ``</s>`` has been replaced by
        ``<cls><cls>``; text without a separator comes back unchanged.
    """
    separator = "</s>"
    replacement = "<cls><cls>"
    # Splitting on the separator and re-joining with the replacement is
    # equivalent to a left-to-right, non-overlapping replace.
    return replacement.join(txt.split(separator))


# Plain-text input file for each dataset split.
# NOTE(review): "test" and "eval" point at the same file -- confirm that this
# is intentional for this smoke test.
data_files = dict(
    train="./balm/test_data/test.txt",
    test="./balm/test_data/test_1k.txt",
    eval="./balm/test_data/test_1k.txt",
)

# Load all splits as text; `remove_sep` is handed to the loader as its
# preprocessing hook.
dataset = load_dataset(
    "text",
    data_files=data_files,
    preprocess_fn=remove_sep,
)

In [4]:
def _tokenize_text(example):
    """Run the tokenizer over ``example["text"]``.

    Padding and truncation are both switched on and ``max_length`` is 320.
    """
    return tokenizer(
        example["text"],
        padding=True,
        truncation=True,
        max_length=320,
    )


# Tokenize every split, asking `map` to remove the "text" column.
tokenized_dataset = dataset.map(_tokenize_text, remove_columns="text")

Encoding:   0%|          | 0/66792 [00:00<?, ?it/s]

Encoding:   0%|          | 0/1000 [00:00<?, ?it/s]

Encoding:   0%|          | 0/1000 [00:00<?, ?it/s]

In [5]:
# Batch collator handed to the Trainer, built around the project tokenizer.
# NOTE(review): no masking options are passed here, so the collator's own
# defaults apply -- confirm they match the disabled Hugging Face configuration
# below (mlm_probability=0.15).
collator = DataCollator(tokenizer=tokenizer)
# collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer, 
#     mlm=True,
#     mlm_probability=0.15,
# )



In [6]:
# Disabled alternative: BalmForMaskedLM, matched to ESM-2 8M.
# model = BalmForMaskedLM(
#     embed_dim=320,
#     ffn_dim=320*4,
#     num_layers=6,
#     num_heads=20,
#     vocab_size=tokenizer.vocab_size,
# )

# Disabled alternative: BalmMoEForMaskedLM (top-1 router), matched to ESM-2 8M.
# model = BalmMoEForMaskedLM(
#     embed_dim=320,
#     ffn_dim=320*4,
#     num_experts=8,
#     num_shared_experts=0,
#     num_layers=6,
#     num_heads=20,
#     alternate_sparsity=True,
#     router_top_k=1,
#     expert_capacity=128,
#     router_z_loss_coef=0.01,
#     router_aux_loss_coef=0.01,
#     vocab_size=tokenizer.vocab_size,
# )

# Active model: expert-choice MoE, matched to ESM-2 8M in embedding width,
# layer count and head count, with 8 experts and no shared experts.
embed_dim = 320  # the feed-forward width below is derived from this
model = BalmExpertChoiceMoEForMaskedLM(
    embed_dim=embed_dim,
    ffn_dim=4 * embed_dim,
    num_experts=8,
    num_shared_experts=0,
    num_layers=6,
    num_heads=20,
    alternate_sparsity=False,
    expert_capacity=128,
    router_z_loss_coef=0.01,
    vocab_size=tokenizer.vocab_size,
)

In [7]:
# Display the model's parameter count (its `num_parameters` attribute).
model.num_parameters

41935073

In [8]:
# Splits used for training and for the periodic evaluation passes.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["eval"]

# balm Trainer: a single epoch with 32 examples per device, logging every
# 5 steps, evaluating every 10 steps, and reporting to Weights & Biases.
trainer = Trainer(
    model=model,
    data_collator=collator,
    train_dataset=train_split,
    eval_dataset=eval_split,
    epochs=1,
    logging_steps=5,
    eval_steps=10,
    warmup_steps=10,
    per_device_train_batch_size=32,
    # use_cpu=True,
    use_wandb=True,
    wandb_project="test_wandb_logging",
    # wandb_entity="bryanbriney",
    run_name="test_wandb_logging_003",
)

# Disabled alternative: the same run expressed with the Hugging Face
# TrainingArguments / Trainer API.
# training_args = TrainingArguments(
#     output_dir="~/Desktop/training",
#     logging_dir="~/Desktop/training/log",
#     per_device_train_batch_size=32,
#     learning_rate=1e-4,
#     max_steps=2000,
#     gradient_accumulation_steps=1,
#     logging_steps=10,
#     eval_steps=50,
#     warmup_steps=100,
#     use_cpu=True,
#     report_to="none"
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     data_collator=collator,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["eval"],
# )




In [9]:
# WANDB_PROJECT names the wandb *project*, not the run, so it has to match the
# wandb_project value the Trainer was configured with ("test_wandb_logging").
# Exporting trainer.run_name ("test_wandb_logging_003") here would send any
# wandb.init() that falls back to this environment variable to a project
# named after the run.
os.environ["WANDB_PROJECT"] = "test_wandb_logging"

# Authenticate with Weights & Biases before training starts.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mbryanbriney[0m ([33mthebrineylab[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [10]:
# Start training. With the Trainer settings used in this notebook, metrics are
# logged every 5 steps and an evaluation pass runs every 10 steps.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mbryanbriney[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training:   0%|          | 0/2087 [00:00<?, ?step/s]

step 5     | loss: 3.0656 | MLM loss: 3.0134 | router z-loss: 0.0522 | lr: 0.000200
step 10    | loss: 2.8217 | MLM loss: 2.7818 | router z-loss: 0.0399 | lr: 0.000400


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.9026 | accuracy: 0.1677 | perplexity: 16.0625
step 15    | loss: 2.6807 | MLM loss: 2.6556 | router z-loss: 0.0251 | lr: 0.000399
step 20    | loss: 2.5866 | MLM loss: 2.5709 | router z-loss: 0.0157 | lr: 0.000398


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.6845 | accuracy: 0.2361 | perplexity: 13.2902
step 25    | loss: 2.5023 | MLM loss: 2.4908 | router z-loss: 0.0115 | lr: 0.000397
step 30    | loss: 2.4882 | MLM loss: 2.4807 | router z-loss: 0.0075 | lr: 0.000396


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.5251 | accuracy: 0.2691 | perplexity: 11.4581
step 35    | loss: 2.3808 | MLM loss: 2.3757 | router z-loss: 0.0051 | lr: 0.000395
step 40    | loss: 2.2835 | MLM loss: 2.2784 | router z-loss: 0.0051 | lr: 0.000394


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.3321 | accuracy: 0.3114 | perplexity: 9.5447
step 45    | loss: 2.1816 | MLM loss: 2.1765 | router z-loss: 0.0051 | lr: 0.000393
step 50    | loss: 2.1834 | MLM loss: 2.1791 | router z-loss: 0.0043 | lr: 0.000392


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.2307 | accuracy: 0.3303 | perplexity: 8.6332
step 55    | loss: 2.1308 | MLM loss: 2.1271 | router z-loss: 0.0037 | lr: 0.000391
step 60    | loss: 2.1047 | MLM loss: 2.1019 | router z-loss: 0.0028 | lr: 0.000390


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.1921 | accuracy: 0.3454 | perplexity: 8.3362
step 65    | loss: 2.1631 | MLM loss: 2.1607 | router z-loss: 0.0025 | lr: 0.000389
step 70    | loss: 2.0298 | MLM loss: 2.0271 | router z-loss: 0.0027 | lr: 0.000388


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.1380 | accuracy: 0.3432 | perplexity: 7.8777
step 75    | loss: 2.0287 | MLM loss: 2.0263 | router z-loss: 0.0024 | lr: 0.000387
step 80    | loss: 1.9865 | MLM loss: 1.9848 | router z-loss: 0.0017 | lr: 0.000387


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.0994 | accuracy: 0.3594 | perplexity: 7.6302
step 85    | loss: 2.0367 | MLM loss: 2.0347 | router z-loss: 0.0020 | lr: 0.000386
step 90    | loss: 2.0957 | MLM loss: 2.0937 | router z-loss: 0.0020 | lr: 0.000385


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.0972 | accuracy: 0.3627 | perplexity: 7.5885
step 95    | loss: 2.0289 | MLM loss: 2.0272 | router z-loss: 0.0017 | lr: 0.000384
step 100   | loss: 2.0232 | MLM loss: 2.0215 | router z-loss: 0.0017 | lr: 0.000383


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.0661 | accuracy: 0.3746 | perplexity: 7.3874
step 105   | loss: 2.0745 | MLM loss: 2.0731 | router z-loss: 0.0014 | lr: 0.000382
step 110   | loss: 1.9222 | MLM loss: 1.9205 | router z-loss: 0.0017 | lr: 0.000381


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.0395 | accuracy: 0.3770 | perplexity: 7.2006
step 115   | loss: 2.0008 | MLM loss: 1.9996 | router z-loss: 0.0013 | lr: 0.000380
step 120   | loss: 1.9066 | MLM loss: 1.9051 | router z-loss: 0.0015 | lr: 0.000379


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.0139 | accuracy: 0.3987 | perplexity: 7.0054
step 125   | loss: 1.8896 | MLM loss: 1.8883 | router z-loss: 0.0013 | lr: 0.000378
step 130   | loss: 2.0047 | MLM loss: 2.0031 | router z-loss: 0.0016 | lr: 0.000377


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 2.0058 | accuracy: 0.3891 | perplexity: 6.9721
step 135   | loss: 1.9444 | MLM loss: 1.9434 | router z-loss: 0.0010 | lr: 0.000376
step 140   | loss: 1.9980 | MLM loss: 1.9969 | router z-loss: 0.0011 | lr: 0.000375


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.9982 | accuracy: 0.4015 | perplexity: 6.9231
step 145   | loss: 1.8846 | MLM loss: 1.8837 | router z-loss: 0.0009 | lr: 0.000374
step 150   | loss: 1.9132 | MLM loss: 1.9122 | router z-loss: 0.0010 | lr: 0.000373


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.9752 | accuracy: 0.3973 | perplexity: 6.7533
step 155   | loss: 1.8043 | MLM loss: 1.8031 | router z-loss: 0.0012 | lr: 0.000372
step 160   | loss: 1.9821 | MLM loss: 1.9812 | router z-loss: 0.0009 | lr: 0.000371


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.9658 | accuracy: 0.4060 | perplexity: 6.7055
step 165   | loss: 1.9064 | MLM loss: 1.9053 | router z-loss: 0.0011 | lr: 0.000370
step 170   | loss: 1.9007 | MLM loss: 1.8998 | router z-loss: 0.0009 | lr: 0.000369


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.9345 | accuracy: 0.4154 | perplexity: 6.5012
step 175   | loss: 1.8470 | MLM loss: 1.8460 | router z-loss: 0.0011 | lr: 0.000368
step 180   | loss: 1.8733 | MLM loss: 1.8724 | router z-loss: 0.0009 | lr: 0.000367


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.9183 | accuracy: 0.4222 | perplexity: 6.4086
step 185   | loss: 1.8328 | MLM loss: 1.8317 | router z-loss: 0.0011 | lr: 0.000366
step 190   | loss: 1.8327 | MLM loss: 1.8319 | router z-loss: 0.0008 | lr: 0.000365


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.9046 | accuracy: 0.4295 | perplexity: 6.3177
step 195   | loss: 1.8695 | MLM loss: 1.8684 | router z-loss: 0.0011 | lr: 0.000364
step 200   | loss: 1.8887 | MLM loss: 1.8878 | router z-loss: 0.0009 | lr: 0.000363


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.8878 | accuracy: 0.4342 | perplexity: 6.2132
step 205   | loss: 1.8356 | MLM loss: 1.8344 | router z-loss: 0.0012 | lr: 0.000362
step 210   | loss: 1.8127 | MLM loss: 1.8119 | router z-loss: 0.0008 | lr: 0.000361


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.8496 | accuracy: 0.4477 | perplexity: 5.9836
step 215   | loss: 1.7959 | MLM loss: 1.7951 | router z-loss: 0.0008 | lr: 0.000361
step 220   | loss: 1.8046 | MLM loss: 1.8040 | router z-loss: 0.0006 | lr: 0.000360


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.8359 | accuracy: 0.4528 | perplexity: 5.9134
step 225   | loss: 1.8450 | MLM loss: 1.8442 | router z-loss: 0.0008 | lr: 0.000359
step 230   | loss: 1.6778 | MLM loss: 1.6767 | router z-loss: 0.0011 | lr: 0.000358


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.8263 | accuracy: 0.4531 | perplexity: 5.8796
step 235   | loss: 1.7219 | MLM loss: 1.7210 | router z-loss: 0.0009 | lr: 0.000357
step 240   | loss: 1.7962 | MLM loss: 1.7954 | router z-loss: 0.0008 | lr: 0.000356


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.7915 | accuracy: 0.4628 | perplexity: 5.6534
step 245   | loss: 1.8240 | MLM loss: 1.8230 | router z-loss: 0.0010 | lr: 0.000355
step 250   | loss: 1.7325 | MLM loss: 1.7315 | router z-loss: 0.0011 | lr: 0.000354


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.7699 | accuracy: 0.4728 | perplexity: 5.5534
step 255   | loss: 1.7236 | MLM loss: 1.7229 | router z-loss: 0.0008 | lr: 0.000353
step 260   | loss: 1.6561 | MLM loss: 1.6552 | router z-loss: 0.0009 | lr: 0.000352


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.7479 | accuracy: 0.4727 | perplexity: 5.4293
step 265   | loss: 1.5681 | MLM loss: 1.5673 | router z-loss: 0.0008 | lr: 0.000351
step 270   | loss: 1.7180 | MLM loss: 1.7174 | router z-loss: 0.0006 | lr: 0.000350


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.7220 | accuracy: 0.4819 | perplexity: 5.2849
step 275   | loss: 1.7614 | MLM loss: 1.7608 | router z-loss: 0.0005 | lr: 0.000349
step 280   | loss: 1.6040 | MLM loss: 1.6032 | router z-loss: 0.0008 | lr: 0.000348


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.7050 | accuracy: 0.4921 | perplexity: 5.2205
step 285   | loss: 1.6619 | MLM loss: 1.6612 | router z-loss: 0.0006 | lr: 0.000347
step 290   | loss: 1.6292 | MLM loss: 1.6282 | router z-loss: 0.0009 | lr: 0.000346


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.6689 | accuracy: 0.4957 | perplexity: 5.0399
step 295   | loss: 1.6254 | MLM loss: 1.6247 | router z-loss: 0.0007 | lr: 0.000345
step 300   | loss: 1.5964 | MLM loss: 1.5954 | router z-loss: 0.0010 | lr: 0.000344


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.6441 | accuracy: 0.5075 | perplexity: 4.8914
step 305   | loss: 1.6410 | MLM loss: 1.6402 | router z-loss: 0.0008 | lr: 0.000343
step 310   | loss: 1.5670 | MLM loss: 1.5664 | router z-loss: 0.0006 | lr: 0.000342


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.6074 | accuracy: 0.5153 | perplexity: 4.7431
step 315   | loss: 1.5503 | MLM loss: 1.5496 | router z-loss: 0.0007 | lr: 0.000341
step 320   | loss: 1.6089 | MLM loss: 1.6082 | router z-loss: 0.0006 | lr: 0.000340


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.5983 | accuracy: 0.5123 | perplexity: 4.6878
step 325   | loss: 1.6285 | MLM loss: 1.6277 | router z-loss: 0.0008 | lr: 0.000339
step 330   | loss: 1.4674 | MLM loss: 1.4663 | router z-loss: 0.0012 | lr: 0.000338


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.5483 | accuracy: 0.5327 | perplexity: 4.4762
step 335   | loss: 1.6008 | MLM loss: 1.6000 | router z-loss: 0.0008 | lr: 0.000337
step 340   | loss: 1.5340 | MLM loss: 1.5332 | router z-loss: 0.0008 | lr: 0.000336


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.5238 | accuracy: 0.5412 | perplexity: 4.3838
step 345   | loss: 1.3573 | MLM loss: 1.3566 | router z-loss: 0.0006 | lr: 0.000335
step 350   | loss: 1.5121 | MLM loss: 1.5116 | router z-loss: 0.0006 | lr: 0.000335


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.4807 | accuracy: 0.5513 | perplexity: 4.1797
step 355   | loss: 1.4902 | MLM loss: 1.4895 | router z-loss: 0.0007 | lr: 0.000334
step 360   | loss: 1.4241 | MLM loss: 1.4234 | router z-loss: 0.0007 | lr: 0.000333


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.4495 | accuracy: 0.5632 | perplexity: 4.0596
step 365   | loss: 1.3852 | MLM loss: 1.3847 | router z-loss: 0.0005 | lr: 0.000332
step 370   | loss: 1.5135 | MLM loss: 1.5129 | router z-loss: 0.0006 | lr: 0.000331


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.4374 | accuracy: 0.5710 | perplexity: 4.0063
step 375   | loss: 1.4257 | MLM loss: 1.4250 | router z-loss: 0.0007 | lr: 0.000330
step 380   | loss: 1.4667 | MLM loss: 1.4660 | router z-loss: 0.0006 | lr: 0.000329


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.4145 | accuracy: 0.5758 | perplexity: 3.9336
step 385   | loss: 1.3217 | MLM loss: 1.3210 | router z-loss: 0.0007 | lr: 0.000328
step 390   | loss: 1.3827 | MLM loss: 1.3822 | router z-loss: 0.0005 | lr: 0.000327


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.3784 | accuracy: 0.5869 | perplexity: 3.7896
step 395   | loss: 1.3731 | MLM loss: 1.3725 | router z-loss: 0.0006 | lr: 0.000326
step 400   | loss: 1.3318 | MLM loss: 1.3312 | router z-loss: 0.0006 | lr: 0.000325


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.3523 | accuracy: 0.5917 | perplexity: 3.6990
step 405   | loss: 1.3158 | MLM loss: 1.3150 | router z-loss: 0.0007 | lr: 0.000324
step 410   | loss: 1.2721 | MLM loss: 1.2714 | router z-loss: 0.0006 | lr: 0.000323


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.3458 | accuracy: 0.5936 | perplexity: 3.6807
step 415   | loss: 1.2441 | MLM loss: 1.2437 | router z-loss: 0.0005 | lr: 0.000322
step 420   | loss: 1.2856 | MLM loss: 1.2851 | router z-loss: 0.0005 | lr: 0.000321


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.2924 | accuracy: 0.6159 | perplexity: 3.4875
step 425   | loss: 1.2831 | MLM loss: 1.2823 | router z-loss: 0.0007 | lr: 0.000320
step 430   | loss: 1.2572 | MLM loss: 1.2565 | router z-loss: 0.0007 | lr: 0.000319


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.2718 | accuracy: 0.6174 | perplexity: 3.4214
step 435   | loss: 1.2592 | MLM loss: 1.2587 | router z-loss: 0.0005 | lr: 0.000318
step 440   | loss: 1.1919 | MLM loss: 1.1911 | router z-loss: 0.0008 | lr: 0.000317


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.2458 | accuracy: 0.6256 | perplexity: 3.3419
step 445   | loss: 1.1339 | MLM loss: 1.1334 | router z-loss: 0.0006 | lr: 0.000316
step 450   | loss: 1.3069 | MLM loss: 1.3064 | router z-loss: 0.0004 | lr: 0.000315


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.2264 | accuracy: 0.6340 | perplexity: 3.2788
step 455   | loss: 1.3560 | MLM loss: 1.3555 | router z-loss: 0.0006 | lr: 0.000314
step 460   | loss: 1.1547 | MLM loss: 1.1542 | router z-loss: 0.0006 | lr: 0.000313


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.2021 | accuracy: 0.6444 | perplexity: 3.1960
step 465   | loss: 1.1748 | MLM loss: 1.1742 | router z-loss: 0.0006 | lr: 0.000312
step 470   | loss: 1.1318 | MLM loss: 1.1313 | router z-loss: 0.0005 | lr: 0.000311


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.1725 | accuracy: 0.6545 | perplexity: 3.1066
step 475   | loss: 1.2242 | MLM loss: 1.2237 | router z-loss: 0.0005 | lr: 0.000310
step 480   | loss: 1.1868 | MLM loss: 1.1862 | router z-loss: 0.0006 | lr: 0.000309


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.1630 | accuracy: 0.6544 | perplexity: 3.0762
step 485   | loss: 1.1447 | MLM loss: 1.1442 | router z-loss: 0.0005 | lr: 0.000309
step 490   | loss: 1.0445 | MLM loss: 1.0440 | router z-loss: 0.0005 | lr: 0.000308


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.1249 | accuracy: 0.6714 | perplexity: 2.9630
step 495   | loss: 1.0627 | MLM loss: 1.0622 | router z-loss: 0.0005 | lr: 0.000307
step 500   | loss: 1.1359 | MLM loss: 1.1354 | router z-loss: 0.0004 | lr: 0.000306


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.1052 | accuracy: 0.6809 | perplexity: 2.9067
step 505   | loss: 1.0773 | MLM loss: 1.0768 | router z-loss: 0.0005 | lr: 0.000305
step 510   | loss: 1.1269 | MLM loss: 1.1265 | router z-loss: 0.0004 | lr: 0.000304


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.0840 | accuracy: 0.6827 | perplexity: 2.8585
step 515   | loss: 1.0488 | MLM loss: 1.0484 | router z-loss: 0.0005 | lr: 0.000303
step 520   | loss: 1.0230 | MLM loss: 1.0226 | router z-loss: 0.0004 | lr: 0.000302


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.0532 | accuracy: 0.6959 | perplexity: 2.7605
step 525   | loss: 1.0049 | MLM loss: 1.0044 | router z-loss: 0.0005 | lr: 0.000301
step 530   | loss: 1.0399 | MLM loss: 1.0393 | router z-loss: 0.0005 | lr: 0.000300


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.0186 | accuracy: 0.7051 | perplexity: 2.6743
step 535   | loss: 0.9972 | MLM loss: 0.9967 | router z-loss: 0.0005 | lr: 0.000299
step 540   | loss: 0.9391 | MLM loss: 0.9386 | router z-loss: 0.0004 | lr: 0.000298


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 1.0216 | accuracy: 0.6989 | perplexity: 2.6888
step 545   | loss: 0.9596 | MLM loss: 0.9592 | router z-loss: 0.0005 | lr: 0.000297
step 550   | loss: 1.0105 | MLM loss: 1.0100 | router z-loss: 0.0005 | lr: 0.000296


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.9874 | accuracy: 0.7130 | perplexity: 2.5951
step 555   | loss: 0.9791 | MLM loss: 0.9787 | router z-loss: 0.0005 | lr: 0.000295
step 560   | loss: 1.0203 | MLM loss: 1.0199 | router z-loss: 0.0004 | lr: 0.000294


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.9834 | accuracy: 0.7158 | perplexity: 2.5859
step 565   | loss: 0.8353 | MLM loss: 0.8347 | router z-loss: 0.0006 | lr: 0.000293
step 570   | loss: 0.9531 | MLM loss: 0.9526 | router z-loss: 0.0004 | lr: 0.000292


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.9578 | accuracy: 0.7228 | perplexity: 2.5199
step 575   | loss: 0.9987 | MLM loss: 0.9982 | router z-loss: 0.0004 | lr: 0.000291
step 580   | loss: 0.8938 | MLM loss: 0.8934 | router z-loss: 0.0004 | lr: 0.000290


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.9541 | accuracy: 0.7249 | perplexity: 2.5196
step 585   | loss: 0.9473 | MLM loss: 0.9469 | router z-loss: 0.0004 | lr: 0.000289
step 590   | loss: 0.8565 | MLM loss: 0.8561 | router z-loss: 0.0004 | lr: 0.000288


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.9268 | accuracy: 0.7315 | perplexity: 2.4588
step 595   | loss: 0.8010 | MLM loss: 0.8006 | router z-loss: 0.0004 | lr: 0.000287
step 600   | loss: 0.8893 | MLM loss: 0.8888 | router z-loss: 0.0005 | lr: 0.000286


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.9058 | accuracy: 0.7420 | perplexity: 2.3968
step 605   | loss: 0.8644 | MLM loss: 0.8639 | router z-loss: 0.0005 | lr: 0.000285
step 610   | loss: 0.8700 | MLM loss: 0.8696 | router z-loss: 0.0004 | lr: 0.000284


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.9249 | accuracy: 0.7340 | perplexity: 2.4449
step 615   | loss: 0.8868 | MLM loss: 0.8863 | router z-loss: 0.0005 | lr: 0.000283
step 620   | loss: 0.8041 | MLM loss: 0.8036 | router z-loss: 0.0005 | lr: 0.000283


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.8767 | accuracy: 0.7494 | perplexity: 2.3367
step 625   | loss: 0.8847 | MLM loss: 0.8843 | router z-loss: 0.0004 | lr: 0.000282
step 630   | loss: 0.8466 | MLM loss: 0.8461 | router z-loss: 0.0005 | lr: 0.000281


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.8686 | accuracy: 0.7532 | perplexity: 2.3056
step 635   | loss: 0.9091 | MLM loss: 0.9087 | router z-loss: 0.0004 | lr: 0.000280
step 640   | loss: 0.8887 | MLM loss: 0.8883 | router z-loss: 0.0004 | lr: 0.000279


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.8562 | accuracy: 0.7577 | perplexity: 2.2873
step 645   | loss: 0.8002 | MLM loss: 0.7997 | router z-loss: 0.0004 | lr: 0.000278
step 650   | loss: 0.8572 | MLM loss: 0.8568 | router z-loss: 0.0004 | lr: 0.000277


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.8575 | accuracy: 0.7566 | perplexity: 2.2863
step 655   | loss: 0.9392 | MLM loss: 0.9388 | router z-loss: 0.0004 | lr: 0.000276
step 660   | loss: 0.7813 | MLM loss: 0.7808 | router z-loss: 0.0005 | lr: 0.000275


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.8349 | accuracy: 0.7634 | perplexity: 2.2391
step 665   | loss: 0.8523 | MLM loss: 0.8519 | router z-loss: 0.0004 | lr: 0.000274
step 670   | loss: 0.6897 | MLM loss: 0.6893 | router z-loss: 0.0004 | lr: 0.000273


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.8105 | accuracy: 0.7715 | perplexity: 2.1873
step 675   | loss: 0.7906 | MLM loss: 0.7902 | router z-loss: 0.0004 | lr: 0.000272
step 680   | loss: 0.7342 | MLM loss: 0.7338 | router z-loss: 0.0004 | lr: 0.000271


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.8186 | accuracy: 0.7677 | perplexity: 2.2068
step 685   | loss: 0.8409 | MLM loss: 0.8405 | router z-loss: 0.0005 | lr: 0.000270
step 690   | loss: 0.7863 | MLM loss: 0.7858 | router z-loss: 0.0005 | lr: 0.000269


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.7941 | accuracy: 0.7733 | perplexity: 2.1524
step 695   | loss: 0.8268 | MLM loss: 0.8264 | router z-loss: 0.0004 | lr: 0.000268
step 700   | loss: 0.7413 | MLM loss: 0.7409 | router z-loss: 0.0004 | lr: 0.000267


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.7943 | accuracy: 0.7752 | perplexity: 2.1614
step 705   | loss: 0.7515 | MLM loss: 0.7511 | router z-loss: 0.0004 | lr: 0.000266
step 710   | loss: 0.7863 | MLM loss: 0.7860 | router z-loss: 0.0003 | lr: 0.000265


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.7780 | accuracy: 0.7776 | perplexity: 2.1229
step 715   | loss: 0.8512 | MLM loss: 0.8509 | router z-loss: 0.0004 | lr: 0.000264
step 720   | loss: 0.6921 | MLM loss: 0.6917 | router z-loss: 0.0004 | lr: 0.000263


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.7628 | accuracy: 0.7899 | perplexity: 2.0895
step 725   | loss: 0.7433 | MLM loss: 0.7428 | router z-loss: 0.0005 | lr: 0.000262
step 730   | loss: 0.7454 | MLM loss: 0.7450 | router z-loss: 0.0004 | lr: 0.000261


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.7457 | accuracy: 0.7897 | perplexity: 2.0584
step 735   | loss: 0.7859 | MLM loss: 0.7855 | router z-loss: 0.0004 | lr: 0.000260
step 740   | loss: 0.6857 | MLM loss: 0.6853 | router z-loss: 0.0004 | lr: 0.000259


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.7453 | accuracy: 0.7898 | perplexity: 2.0566
step 745   | loss: 0.7945 | MLM loss: 0.7941 | router z-loss: 0.0004 | lr: 0.000258
step 750   | loss: 0.6071 | MLM loss: 0.6067 | router z-loss: 0.0004 | lr: 0.000257


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.7320 | accuracy: 0.7952 | perplexity: 2.0302
step 755   | loss: 0.6892 | MLM loss: 0.6888 | router z-loss: 0.0004 | lr: 0.000257
step 760   | loss: 0.7538 | MLM loss: 0.7534 | router z-loss: 0.0004 | lr: 0.000256


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.7400 | accuracy: 0.7902 | perplexity: 2.0478
step 765   | loss: 0.7888 | MLM loss: 0.7884 | router z-loss: 0.0004 | lr: 0.000255
step 770   | loss: 0.7727 | MLM loss: 0.7724 | router z-loss: 0.0003 | lr: 0.000254


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.7222 | accuracy: 0.7960 | perplexity: 2.0090
step 775   | loss: 0.6883 | MLM loss: 0.6879 | router z-loss: 0.0004 | lr: 0.000253
step 780   | loss: 0.6812 | MLM loss: 0.6808 | router z-loss: 0.0004 | lr: 0.000252


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.7091 | accuracy: 0.8024 | perplexity: 1.9885
step 785   | loss: 0.7114 | MLM loss: 0.7110 | router z-loss: 0.0004 | lr: 0.000251
step 790   | loss: 0.5533 | MLM loss: 0.5529 | router z-loss: 0.0004 | lr: 0.000250


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.7035 | accuracy: 0.8047 | perplexity: 1.9711
step 795   | loss: 0.6729 | MLM loss: 0.6725 | router z-loss: 0.0004 | lr: 0.000249
step 800   | loss: 0.7415 | MLM loss: 0.7411 | router z-loss: 0.0004 | lr: 0.000248


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6874 | accuracy: 0.8100 | perplexity: 1.9409
step 805   | loss: 0.6282 | MLM loss: 0.6278 | router z-loss: 0.0004 | lr: 0.000247
step 810   | loss: 0.7060 | MLM loss: 0.7057 | router z-loss: 0.0003 | lr: 0.000246


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.7001 | accuracy: 0.8087 | perplexity: 1.9654
step 815   | loss: 0.6666 | MLM loss: 0.6663 | router z-loss: 0.0004 | lr: 0.000245
step 820   | loss: 0.6590 | MLM loss: 0.6587 | router z-loss: 0.0003 | lr: 0.000244


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6890 | accuracy: 0.8076 | perplexity: 1.9482
step 825   | loss: 0.7765 | MLM loss: 0.7760 | router z-loss: 0.0005 | lr: 0.000243
step 830   | loss: 0.6955 | MLM loss: 0.6951 | router z-loss: 0.0004 | lr: 0.000242


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6949 | accuracy: 0.8085 | perplexity: 1.9531
step 835   | loss: 0.6093 | MLM loss: 0.6090 | router z-loss: 0.0004 | lr: 0.000241
step 840   | loss: 0.7120 | MLM loss: 0.7116 | router z-loss: 0.0004 | lr: 0.000240


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6709 | accuracy: 0.8156 | perplexity: 1.9121
step 845   | loss: 0.5928 | MLM loss: 0.5924 | router z-loss: 0.0003 | lr: 0.000239
step 850   | loss: 0.6075 | MLM loss: 0.6071 | router z-loss: 0.0004 | lr: 0.000238


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6508 | accuracy: 0.8213 | perplexity: 1.8780
step 855   | loss: 0.6497 | MLM loss: 0.6493 | router z-loss: 0.0004 | lr: 0.000237
step 860   | loss: 0.6150 | MLM loss: 0.6146 | router z-loss: 0.0003 | lr: 0.000236


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6623 | accuracy: 0.8182 | perplexity: 1.8974
step 865   | loss: 0.6205 | MLM loss: 0.6202 | router z-loss: 0.0003 | lr: 0.000235
step 870   | loss: 0.6158 | MLM loss: 0.6154 | router z-loss: 0.0003 | lr: 0.000234


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6771 | accuracy: 0.8113 | perplexity: 1.9208
step 875   | loss: 0.5977 | MLM loss: 0.5973 | router z-loss: 0.0004 | lr: 0.000233
step 880   | loss: 0.6643 | MLM loss: 0.6639 | router z-loss: 0.0003 | lr: 0.000232


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6510 | accuracy: 0.8204 | perplexity: 1.8719
step 885   | loss: 0.6843 | MLM loss: 0.6840 | router z-loss: 0.0003 | lr: 0.000231
step 890   | loss: 0.6510 | MLM loss: 0.6507 | router z-loss: 0.0004 | lr: 0.000231


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6361 | accuracy: 0.8235 | perplexity: 1.8520
step 895   | loss: 0.7353 | MLM loss: 0.7350 | router z-loss: 0.0003 | lr: 0.000230
step 900   | loss: 0.6262 | MLM loss: 0.6258 | router z-loss: 0.0003 | lr: 0.000229


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6228 | accuracy: 0.8286 | perplexity: 1.8225
step 905   | loss: 0.6416 | MLM loss: 0.6413 | router z-loss: 0.0003 | lr: 0.000228
step 910   | loss: 0.5718 | MLM loss: 0.5715 | router z-loss: 0.0003 | lr: 0.000227


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6315 | accuracy: 0.8258 | perplexity: 1.8366
step 915   | loss: 0.7923 | MLM loss: 0.7920 | router z-loss: 0.0003 | lr: 0.000226
step 920   | loss: 0.5770 | MLM loss: 0.5767 | router z-loss: 0.0003 | lr: 0.000225


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6262 | accuracy: 0.8267 | perplexity: 1.8282
step 925   | loss: 0.5366 | MLM loss: 0.5362 | router z-loss: 0.0004 | lr: 0.000224
step 930   | loss: 0.5060 | MLM loss: 0.5057 | router z-loss: 0.0004 | lr: 0.000223


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6314 | accuracy: 0.8262 | perplexity: 1.8391
step 935   | loss: 0.7188 | MLM loss: 0.7183 | router z-loss: 0.0004 | lr: 0.000222
step 940   | loss: 0.6277 | MLM loss: 0.6273 | router z-loss: 0.0003 | lr: 0.000221


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6340 | accuracy: 0.8252 | perplexity: 1.8383
step 945   | loss: 0.6055 | MLM loss: 0.6051 | router z-loss: 0.0004 | lr: 0.000220
step 950   | loss: 0.6261 | MLM loss: 0.6258 | router z-loss: 0.0003 | lr: 0.000219


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6116 | accuracy: 0.8304 | perplexity: 1.8077
step 955   | loss: 0.5086 | MLM loss: 0.5083 | router z-loss: 0.0003 | lr: 0.000218
step 960   | loss: 0.5783 | MLM loss: 0.5780 | router z-loss: 0.0003 | lr: 0.000217


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6030 | accuracy: 0.8337 | perplexity: 1.7926
step 965   | loss: 0.5685 | MLM loss: 0.5682 | router z-loss: 0.0003 | lr: 0.000216
step 970   | loss: 0.5144 | MLM loss: 0.5141 | router z-loss: 0.0003 | lr: 0.000215


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.6047 | accuracy: 0.8346 | perplexity: 1.7923
step 975   | loss: 0.6385 | MLM loss: 0.6382 | router z-loss: 0.0003 | lr: 0.000214
step 980   | loss: 0.5741 | MLM loss: 0.5738 | router z-loss: 0.0003 | lr: 0.000213


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5839 | accuracy: 0.8412 | perplexity: 1.7548
step 985   | loss: 0.5469 | MLM loss: 0.5465 | router z-loss: 0.0005 | lr: 0.000212
step 990   | loss: 0.5519 | MLM loss: 0.5515 | router z-loss: 0.0004 | lr: 0.000211


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5729 | accuracy: 0.8446 | perplexity: 1.7412
step 995   | loss: 0.5309 | MLM loss: 0.5306 | router z-loss: 0.0003 | lr: 0.000210
step 1000  | loss: 0.5430 | MLM loss: 0.5427 | router z-loss: 0.0003 | lr: 0.000209


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5762 | accuracy: 0.8431 | perplexity: 1.7451
step 1005  | loss: 0.5221 | MLM loss: 0.5218 | router z-loss: 0.0003 | lr: 0.000208
step 1010  | loss: 0.6424 | MLM loss: 0.6420 | router z-loss: 0.0003 | lr: 0.000207


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5833 | accuracy: 0.8397 | perplexity: 1.7595
step 1015  | loss: 0.5600 | MLM loss: 0.5597 | router z-loss: 0.0003 | lr: 0.000206
step 1020  | loss: 0.6060 | MLM loss: 0.6057 | router z-loss: 0.0003 | lr: 0.000205


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5547 | accuracy: 0.8489 | perplexity: 1.7135
step 1025  | loss: 0.4884 | MLM loss: 0.4880 | router z-loss: 0.0003 | lr: 0.000205
step 1030  | loss: 0.5835 | MLM loss: 0.5832 | router z-loss: 0.0003 | lr: 0.000204


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5641 | accuracy: 0.8463 | perplexity: 1.7290
step 1035  | loss: 0.4942 | MLM loss: 0.4939 | router z-loss: 0.0003 | lr: 0.000203
step 1040  | loss: 0.5658 | MLM loss: 0.5655 | router z-loss: 0.0003 | lr: 0.000202


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5450 | accuracy: 0.8524 | perplexity: 1.6930
step 1045  | loss: 0.4662 | MLM loss: 0.4659 | router z-loss: 0.0003 | lr: 0.000201
step 1050  | loss: 0.5687 | MLM loss: 0.5684 | router z-loss: 0.0003 | lr: 0.000200


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5685 | accuracy: 0.8451 | perplexity: 1.7288
step 1055  | loss: 0.5727 | MLM loss: 0.5724 | router z-loss: 0.0003 | lr: 0.000199
step 1060  | loss: 0.4954 | MLM loss: 0.4951 | router z-loss: 0.0003 | lr: 0.000198


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5690 | accuracy: 0.8449 | perplexity: 1.7316
step 1065  | loss: 0.6294 | MLM loss: 0.6291 | router z-loss: 0.0003 | lr: 0.000197
step 1070  | loss: 0.6031 | MLM loss: 0.6028 | router z-loss: 0.0003 | lr: 0.000196


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5658 | accuracy: 0.8476 | perplexity: 1.7283
step 1075  | loss: 0.5862 | MLM loss: 0.5859 | router z-loss: 0.0003 | lr: 0.000195
step 1080  | loss: 0.5959 | MLM loss: 0.5956 | router z-loss: 0.0003 | lr: 0.000194


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5622 | accuracy: 0.8468 | perplexity: 1.7223
step 1085  | loss: 0.5900 | MLM loss: 0.5897 | router z-loss: 0.0003 | lr: 0.000193
step 1090  | loss: 0.5254 | MLM loss: 0.5251 | router z-loss: 0.0003 | lr: 0.000192


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5481 | accuracy: 0.8500 | perplexity: 1.6996
step 1095  | loss: 0.5358 | MLM loss: 0.5355 | router z-loss: 0.0003 | lr: 0.000191
step 1100  | loss: 0.5949 | MLM loss: 0.5946 | router z-loss: 0.0003 | lr: 0.000190


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5452 | accuracy: 0.8512 | perplexity: 1.6977
step 1105  | loss: 0.5470 | MLM loss: 0.5466 | router z-loss: 0.0004 | lr: 0.000189
step 1110  | loss: 0.5241 | MLM loss: 0.5239 | router z-loss: 0.0003 | lr: 0.000188


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5434 | accuracy: 0.8526 | perplexity: 1.6935
step 1115  | loss: 0.5748 | MLM loss: 0.5745 | router z-loss: 0.0003 | lr: 0.000187
step 1120  | loss: 0.4419 | MLM loss: 0.4416 | router z-loss: 0.0003 | lr: 0.000186


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5400 | accuracy: 0.8518 | perplexity: 1.6851
step 1125  | loss: 0.5364 | MLM loss: 0.5361 | router z-loss: 0.0003 | lr: 0.000185
step 1130  | loss: 0.4230 | MLM loss: 0.4227 | router z-loss: 0.0003 | lr: 0.000184


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5281 | accuracy: 0.8572 | perplexity: 1.6671
step 1135  | loss: 0.5436 | MLM loss: 0.5433 | router z-loss: 0.0003 | lr: 0.000183
step 1140  | loss: 0.5470 | MLM loss: 0.5467 | router z-loss: 0.0003 | lr: 0.000182


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5391 | accuracy: 0.8550 | perplexity: 1.6819
step 1145  | loss: 0.4402 | MLM loss: 0.4399 | router z-loss: 0.0003 | lr: 0.000181
step 1150  | loss: 0.4479 | MLM loss: 0.4476 | router z-loss: 0.0003 | lr: 0.000180


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5248 | accuracy: 0.8586 | perplexity: 1.6577
step 1155  | loss: 0.5832 | MLM loss: 0.5829 | router z-loss: 0.0003 | lr: 0.000179
step 1160  | loss: 0.6116 | MLM loss: 0.6113 | router z-loss: 0.0003 | lr: 0.000179


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5168 | accuracy: 0.8606 | perplexity: 1.6497
step 1165  | loss: 0.5110 | MLM loss: 0.5108 | router z-loss: 0.0003 | lr: 0.000178
step 1170  | loss: 0.4462 | MLM loss: 0.4459 | router z-loss: 0.0003 | lr: 0.000177


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5125 | accuracy: 0.8652 | perplexity: 1.6436
step 1175  | loss: 0.5253 | MLM loss: 0.5250 | router z-loss: 0.0003 | lr: 0.000176
step 1180  | loss: 0.5005 | MLM loss: 0.5002 | router z-loss: 0.0003 | lr: 0.000175


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5189 | accuracy: 0.8609 | perplexity: 1.6484
step 1185  | loss: 0.4798 | MLM loss: 0.4795 | router z-loss: 0.0003 | lr: 0.000174
step 1190  | loss: 0.3931 | MLM loss: 0.3928 | router z-loss: 0.0003 | lr: 0.000173


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5065 | accuracy: 0.8649 | perplexity: 1.6331
step 1195  | loss: 0.4904 | MLM loss: 0.4901 | router z-loss: 0.0003 | lr: 0.000172
step 1200  | loss: 0.5074 | MLM loss: 0.5071 | router z-loss: 0.0003 | lr: 0.000171


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5059 | accuracy: 0.8636 | perplexity: 1.6307
step 1205  | loss: 0.5677 | MLM loss: 0.5674 | router z-loss: 0.0003 | lr: 0.000170
step 1210  | loss: 0.4676 | MLM loss: 0.4673 | router z-loss: 0.0003 | lr: 0.000169


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5044 | accuracy: 0.8652 | perplexity: 1.6271
step 1215  | loss: 0.4105 | MLM loss: 0.4102 | router z-loss: 0.0003 | lr: 0.000168
step 1220  | loss: 0.4990 | MLM loss: 0.4988 | router z-loss: 0.0003 | lr: 0.000167


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5058 | accuracy: 0.8657 | perplexity: 1.6288
step 1225  | loss: 0.4508 | MLM loss: 0.4506 | router z-loss: 0.0003 | lr: 0.000166
step 1230  | loss: 0.5651 | MLM loss: 0.5648 | router z-loss: 0.0003 | lr: 0.000165


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.5054 | accuracy: 0.8657 | perplexity: 1.6306
step 1235  | loss: 0.5091 | MLM loss: 0.5089 | router z-loss: 0.0003 | lr: 0.000164
step 1240  | loss: 0.4046 | MLM loss: 0.4043 | router z-loss: 0.0003 | lr: 0.000163


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.4889 | accuracy: 0.8694 | perplexity: 1.6065
step 1245  | loss: 0.5545 | MLM loss: 0.5543 | router z-loss: 0.0003 | lr: 0.000162
step 1250  | loss: 0.5475 | MLM loss: 0.5472 | router z-loss: 0.0003 | lr: 0.000161


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.4965 | accuracy: 0.8681 | perplexity: 1.6160
step 1255  | loss: 0.6205 | MLM loss: 0.6202 | router z-loss: 0.0003 | lr: 0.000160
step 1260  | loss: 0.5463 | MLM loss: 0.5460 | router z-loss: 0.0003 | lr: 0.000159


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.4998 | accuracy: 0.8669 | perplexity: 1.6207
step 1265  | loss: 0.5214 | MLM loss: 0.5211 | router z-loss: 0.0003 | lr: 0.000158
step 1270  | loss: 0.4654 | MLM loss: 0.4651 | router z-loss: 0.0003 | lr: 0.000157


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.4835 | accuracy: 0.8706 | perplexity: 1.5957
step 1275  | loss: 0.6222 | MLM loss: 0.6219 | router z-loss: 0.0003 | lr: 0.000156
step 1280  | loss: 0.4177 | MLM loss: 0.4174 | router z-loss: 0.0003 | lr: 0.000155


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.4932 | accuracy: 0.8671 | perplexity: 1.6114
step 1285  | loss: 0.5245 | MLM loss: 0.5241 | router z-loss: 0.0003 | lr: 0.000154
step 1290  | loss: 0.4722 | MLM loss: 0.4720 | router z-loss: 0.0003 | lr: 0.000153


Evaluating:   0%|          | 0/31 [00:00<?, ?step/s]

<< EVAL >> | loss: 0.4862 | accuracy: 0.8704 | perplexity: 1.6015


Thread SenderThread:
Traceback (most recent call last):
  File "/Users/bryanbriney/conda/lib/python3.11/site-packages/wandb/sdk/internal/internal_util.py", line 49, in run
    self._run()
  File "/Users/bryanbriney/conda/lib/python3.11/site-packages/wandb/sdk/internal/internal_util.py", line 100, in _run
    self._process(record)
  File "/Users/bryanbriney/conda/lib/python3.11/site-packages/wandb/sdk/internal/internal.py", line 328, in _process
    self._sm.send(record)
  File "/Users/bryanbriney/conda/lib/python3.11/site-packages/wandb/sdk/internal/sender.py", line 389, in send
    send_handler(record)
  File "/Users/bryanbriney/conda/lib/python3.11/site-packages/wandb/sdk/internal/sender.py", line 411, in send_request
    send_handler(record)
  File "/Users/bryanbriney/conda/lib/python3.11/site-packages/wandb/sdk/internal/sender.py", line 1160, in send_request_summary_record
    self._update_summary_record(record.request.summary_record.summary)
  File "/Users/bryanbriney/conda/lib/py

step 1295  | loss: 0.5281 | MLM loss: 0.5278 | router z-loss: 0.0003 | lr: 0.000153


BrokenPipeError: [Errno 32] Broken pipe

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x2de2df490>> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe