In [1]:
from e2e_sae.scripts.train_tlens_saes.run_train_tlens_saes import Config
from e2e_sae.scripts.train_tlens_saes.run_train_tlens_saes import main as run_training
from e2e_sae.scripts.train_tlens_saes.run_train_tlens_bayesian_saes import main as run_bayesian_training


  from .autonotebook import tqdm as notebook_tqdm


## Define a Config
The sample config below will train a single SAE on layer 6 of gpt2 using an e2e loss.

Note that this config will take 10-11 hours to run on an A100. For a smaller run, see
[e2e_sae/scripts/train_tlens_saes/tinystories_1M_e2e.yaml](../e2e_sae/scripts/train_tlens_saes/tinystories_1M_e2e.yaml)
for a TinyStories-1M config (the Train section below runs that YAML file rather than the `config`
object defined here), or simply choose a smaller model to train on and adjust
`n_ctx` and the dataset accordingly (some other pre-tokenized datasets can be found [here](https://huggingface.co/apollo-research)).


In [3]:
# Sample configuration: a single SAE at "blocks.6.hook_resid_pre" of gpt2-small, optimised with a
# sparsity penalty plus a KL term on the logits (e2e loss).
config = Config(
    seed=0,
    tlens_model_name="gpt2-small",
    tlens_model_path=None,
    wandb_project="gpt2-e2e_play",
    # When no run name is given, a name based on important config values is used
    wandb_run_name=None,
    wandb_run_name_prefix="",
    n_samples=400_000,
    batch_size=8,
    # Number of samples before each optimizer step
    effective_batch_size=16,
    save_every_n_samples=None,
    eval_every_n_samples=40_000,
    eval_n_samples=500,
    log_every_n_grad_steps=20,
    collect_act_frequency_every_n_samples=40_000,
    act_frequency_n_tokens=500_000,
    lr=5e-4,
    lr_schedule="cosine",
    # Minimum learning rate, expressed as a fraction of the initial learning rate
    min_lr_factor=0.1,
    # Linear warmup over this many samples
    warmup_samples=20_000,
    # Gradient norms get clipped to this value before optimizer steps
    max_grad_norm=10.0,
    # In the loss settings, "original acts" means the activations in a model without SAEs
    loss=dict(
        sparsity=dict(
            # p value in the Lp norm
            p_norm=1.0,
            # Sparsity coefficient: multiplies the Lp norm in the loss
            coeff=1.5,
        ),
        # Used for e2e+future recon: MSE between the input to the SAE and original acts
        in_to_orig=None,
        # Not commonly used: MSE between the output of the SAE and original acts
        out_to_orig=None,
        # Coefficient on the MSE between the output and input of the SAE. A value of 0 lets us
        # track this loss during training without optimizing it
        out_to_in=dict(coeff=0.0),
        # Coefficient on the KL divergence between the logits of the SAE model and original model
        logits_kl=dict(coeff=1.0),
    ),
    # Other pre-tokenized datasets are listed at https://huggingface.co/apollo-research
    train_data=dict(
        dataset_name="apollo-research/Skylion007-openwebtext-tokenizer-gpt2",
        is_tokenized=True,
        tokenizer_name="gpt2",
        streaming=True,
        split="train",
        n_ctx=1024,
    ),
    # By default this uses a different seed to the training data, but one can be set with `seed`
    eval_data=dict(
        dataset_name="apollo-research/Skylion007-openwebtext-tokenizer-gpt2",
        is_tokenized=True,
        tokenizer_name="gpt2",
        streaming=True,
        split="train",
        n_ctx=1024,
    ),
    saes=dict(
        # Determines whether to continue training the SAEs in pretrained_sae_paths
        retrain_saes=False,
        # Path or paths to pretrained SAEs
        pretrained_sae_paths=None,
        # Position or positions to place SAEs in the model
        sae_positions=["blocks.6.hook_resid_pre"],
        # Size of the dictionary relative to the activations at the SAE positions
        dict_size_to_input_ratio=60.0,
    ),
)

## Train

In [2]:
# Run the Bayesian SAE training entry point on the TinyStories-1M YAML config. The path is
# relative, so it resolves against the kernel's current working directory.
# NOTE(review): the `config` object built in the "Define a Config" section is not passed here, so
# the gpt2 settings defined there are not what this cell trains with — confirm this is intended.
run_bayesian_training("e2e_sae/scripts/train_tlens_saes/tinystories_1M_e2e.yaml")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mraymondl[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


2025-04-17 03:08:38 - INFO - wandb_project='tinystories-1m-2' wandb_run_name=None wandb_run_name_prefix='' seed=0 tlens_model_name='roneneldan/TinyStories-1M' tlens_model_path=None save_dir=PosixPath('/home/raymond/.temp/e2e_sae/e2e_sae/scripts/train_tlens_saes/out') n_samples=400000 save_every_n_samples=None eval_every_n_samples=40000 eval_n_samples=500 batch_size=20 effective_batch_size=20 lr=0.001 lr_schedule='cosine' min_lr_factor=0.1 warmup_samples=20000 cooldown_samples=0 max_grad_norm=1.0 log_every_n_grad_steps=20 collect_act_frequency_every_n_samples=40000 act_frequency_n_tokens=500000 loss=LossConfigs(sparsity=SparsityLoss(coeff=3.0, p_norm=1.0), in_to_orig=None, out_to_orig=None, out_to_in=OutToInLoss(coeff=0.0), logits_kl=LogitsKLLoss(coeff=1.0)) train_data=DatasetConfig(dataset_name='apollo-research/roneneldan-TinyStories-tokenizer-gpt2', is_tokenized=True, tokenizer_name='gpt2', streaming=True, split='train', n_ctx=512, seed=None, column_name='input_ids') eval_data=Dataset

Loaded pretrained model roneneldan/TinyStories-1M into HookedTransformer
Moving model to device:  cuda


Steps:   0%|          | 0/20000 [00:01<?, ?it/s]

Samples 20 Batch_idx 0 GradUpdates 1 Loss 83.99545


Eval Steps: 100%|██████████| 25/25 [00:03<00:00,  6.40it/s]
Steps:   0%|          | 21/20000 [00:09<44:40,  7.45it/s]  

Samples 400 Batch_idx 19 GradUpdates 20 Loss 82.35793


Steps:   0%|          | 41/20000 [00:12<43:58,  7.56it/s]

Samples 800 Batch_idx 39 GradUpdates 40 Loss 80.80156


Steps:   0%|          | 61/20000 [00:15<44:34,  7.45it/s]  

Samples 1200 Batch_idx 59 GradUpdates 60 Loss 79.73633


Steps:   0%|          | 81/20000 [00:18<43:51,  7.57it/s]

Samples 1600 Batch_idx 79 GradUpdates 80 Loss 79.22464


Steps:   1%|          | 101/20000 [00:20<43:48,  7.57it/s]

Samples 2000 Batch_idx 99 GradUpdates 100 Loss 78.84478


Steps:   1%|          | 121/20000 [00:23<43:48,  7.56it/s]

Samples 2400 Batch_idx 119 GradUpdates 120 Loss 78.52633


Steps:   1%|          | 141/20000 [00:25<43:44,  7.57it/s]

Samples 2800 Batch_idx 139 GradUpdates 140 Loss 78.25086


Steps:   1%|          | 161/20000 [00:28<43:44,  7.56it/s]

Samples 3200 Batch_idx 159 GradUpdates 160 Loss 77.95848


Steps:   1%|          | 181/20000 [00:30<43:42,  7.56it/s]

Samples 3600 Batch_idx 179 GradUpdates 180 Loss 77.59824


Steps:   1%|          | 201/20000 [00:33<43:42,  7.55it/s]

Samples 4000 Batch_idx 199 GradUpdates 200 Loss 77.29641


Steps:   1%|          | 221/20000 [00:35<43:40,  7.55it/s]

Samples 4400 Batch_idx 219 GradUpdates 220 Loss 76.91315


Steps:   1%|          | 241/20000 [00:38<44:36,  7.38it/s]

Samples 4800 Batch_idx 239 GradUpdates 240 Loss 76.52225


Steps:   1%|▏         | 261/20000 [00:40<43:30,  7.56it/s]

Samples 5200 Batch_idx 259 GradUpdates 260 Loss 76.01271


Steps:   1%|▏         | 281/20000 [00:43<43:28,  7.56it/s]

Samples 5600 Batch_idx 279 GradUpdates 280 Loss 75.61269


Steps:   2%|▏         | 301/20000 [00:45<43:28,  7.55it/s]

Samples 6000 Batch_idx 299 GradUpdates 300 Loss 75.22674


Steps:   2%|▏         | 321/20000 [00:48<43:43,  7.50it/s]

Samples 6400 Batch_idx 319 GradUpdates 320 Loss 74.68434


Steps:   2%|▏         | 341/20000 [00:50<43:17,  7.57it/s]

Samples 6800 Batch_idx 339 GradUpdates 340 Loss 74.15488


Steps:   2%|▏         | 361/20000 [00:53<43:24,  7.54it/s]

Samples 7200 Batch_idx 359 GradUpdates 360 Loss 73.57266


Steps:   2%|▏         | 381/20000 [00:56<43:22,  7.54it/s]

Samples 7600 Batch_idx 379 GradUpdates 380 Loss 72.99783


Steps:   2%|▏         | 401/20000 [00:58<43:15,  7.55it/s]

Samples 8000 Batch_idx 399 GradUpdates 400 Loss 72.39586


Steps:   2%|▏         | 421/20000 [01:01<43:14,  7.55it/s]

Samples 8400 Batch_idx 419 GradUpdates 420 Loss 71.85258


Steps:   2%|▏         | 441/20000 [01:03<43:13,  7.54it/s]

Samples 8800 Batch_idx 439 GradUpdates 440 Loss 71.10860


Steps:   2%|▏         | 461/20000 [01:06<43:13,  7.53it/s]

Samples 9200 Batch_idx 459 GradUpdates 460 Loss 70.40266


Steps:   2%|▏         | 481/20000 [01:08<43:05,  7.55it/s]

Samples 9600 Batch_idx 479 GradUpdates 480 Loss 69.81716


Steps:   3%|▎         | 501/20000 [01:11<43:03,  7.55it/s]

Samples 10000 Batch_idx 499 GradUpdates 500 Loss 69.08646


Steps:   3%|▎         | 521/20000 [01:13<43:05,  7.53it/s]

Samples 10400 Batch_idx 519 GradUpdates 520 Loss 68.30911


Steps:   3%|▎         | 541/20000 [01:16<42:53,  7.56it/s]

Samples 10800 Batch_idx 539 GradUpdates 540 Loss 67.63451


Steps:   3%|▎         | 561/20000 [01:18<43:00,  7.53it/s]

Samples 11200 Batch_idx 559 GradUpdates 560 Loss 66.75754


Steps:   3%|▎         | 581/20000 [01:21<42:47,  7.56it/s]

Samples 11600 Batch_idx 579 GradUpdates 580 Loss 65.91428


Steps:   3%|▎         | 601/20000 [01:23<42:47,  7.56it/s]

Samples 12000 Batch_idx 599 GradUpdates 600 Loss 65.19624


Steps:   3%|▎         | 621/20000 [01:26<42:45,  7.55it/s]

Samples 12400 Batch_idx 619 GradUpdates 620 Loss 64.35023


Steps:   3%|▎         | 641/20000 [01:28<42:44,  7.55it/s]

Samples 12800 Batch_idx 639 GradUpdates 640 Loss 63.29093


Steps:   3%|▎         | 661/20000 [01:31<42:41,  7.55it/s]

Samples 13200 Batch_idx 659 GradUpdates 660 Loss 62.55579


Steps:   3%|▎         | 681/20000 [01:34<42:51,  7.51it/s]

Samples 13600 Batch_idx 679 GradUpdates 680 Loss 61.62899


Steps:   4%|▎         | 701/20000 [01:36<42:37,  7.55it/s]

Samples 14000 Batch_idx 699 GradUpdates 700 Loss 60.72082


Steps:   4%|▎         | 721/20000 [01:39<42:34,  7.55it/s]

Samples 14400 Batch_idx 719 GradUpdates 720 Loss 59.83543


Steps:   4%|▎         | 741/20000 [01:41<42:33,  7.54it/s]

Samples 14800 Batch_idx 739 GradUpdates 740 Loss 58.89797


Steps:   4%|▍         | 761/20000 [01:44<42:21,  7.57it/s]

Samples 15200 Batch_idx 759 GradUpdates 760 Loss 57.90351


Steps:   4%|▍         | 781/20000 [01:46<42:25,  7.55it/s]

Samples 15600 Batch_idx 779 GradUpdates 780 Loss 57.03498


Steps:   4%|▍         | 801/20000 [01:49<42:38,  7.50it/s]

Samples 16000 Batch_idx 799 GradUpdates 800 Loss 56.05225


Steps:   4%|▍         | 821/20000 [01:51<42:27,  7.53it/s]

Samples 16400 Batch_idx 819 GradUpdates 820 Loss 55.07508


Steps:   4%|▍         | 841/20000 [01:54<42:07,  7.58it/s]

Samples 16800 Batch_idx 839 GradUpdates 840 Loss 54.02106


Steps:   4%|▍         | 861/20000 [01:56<42:17,  7.54it/s]

Samples 17200 Batch_idx 859 GradUpdates 860 Loss 52.97606


Steps:   4%|▍         | 881/20000 [01:59<42:07,  7.56it/s]

Samples 17600 Batch_idx 879 GradUpdates 880 Loss 51.86522


Steps:   5%|▍         | 901/20000 [02:01<42:04,  7.57it/s]

Samples 18000 Batch_idx 899 GradUpdates 900 Loss 50.99816


Steps:   5%|▍         | 921/20000 [02:04<42:04,  7.56it/s]

Samples 18400 Batch_idx 919 GradUpdates 920 Loss 49.96854


Steps:   5%|▍         | 941/20000 [02:06<42:01,  7.56it/s]

Samples 18800 Batch_idx 939 GradUpdates 940 Loss 48.72889


Steps:   5%|▍         | 961/20000 [02:09<41:57,  7.56it/s]

Samples 19200 Batch_idx 959 GradUpdates 960 Loss 47.86567


Steps:   5%|▍         | 981/20000 [02:12<41:53,  7.57it/s]

Samples 19600 Batch_idx 979 GradUpdates 980 Loss 46.89437


Steps:   5%|▌         | 1001/20000 [02:14<41:53,  7.56it/s]

Samples 20000 Batch_idx 999 GradUpdates 1000 Loss 45.98846


Steps:   5%|▌         | 1021/20000 [02:17<41:49,  7.56it/s]

Samples 20400 Batch_idx 1019 GradUpdates 1020 Loss 45.03571


Steps:   5%|▌         | 1041/20000 [02:19<41:45,  7.57it/s]

Samples 20800 Batch_idx 1039 GradUpdates 1040 Loss 44.06042


Steps:   5%|▌         | 1061/20000 [02:22<42:03,  7.50it/s]

Samples 21200 Batch_idx 1059 GradUpdates 1060 Loss 42.99715


Steps:   5%|▌         | 1081/20000 [02:24<41:46,  7.55it/s]

Samples 21600 Batch_idx 1079 GradUpdates 1080 Loss 42.08613


Steps:   6%|▌         | 1101/20000 [02:27<41:38,  7.56it/s]

Samples 22000 Batch_idx 1099 GradUpdates 1100 Loss 41.30374


Steps:   6%|▌         | 1121/20000 [02:29<41:36,  7.56it/s]

Samples 22400 Batch_idx 1119 GradUpdates 1120 Loss 40.29601


Steps:   6%|▌         | 1141/20000 [02:32<41:36,  7.55it/s]

Samples 22800 Batch_idx 1139 GradUpdates 1140 Loss 39.42993


Steps:   6%|▌         | 1161/20000 [02:34<41:30,  7.56it/s]

Samples 23200 Batch_idx 1159 GradUpdates 1160 Loss 38.68116


Steps:   6%|▌         | 1181/20000 [02:37<41:35,  7.54it/s]

Samples 23600 Batch_idx 1179 GradUpdates 1180 Loss 37.84556


Steps:   6%|▌         | 1201/20000 [02:39<41:35,  7.53it/s]

Samples 24000 Batch_idx 1199 GradUpdates 1200 Loss 37.10319


Steps:   6%|▌         | 1221/20000 [02:42<41:29,  7.54it/s]

Samples 24400 Batch_idx 1219 GradUpdates 1220 Loss 36.29738


Steps:   6%|▌         | 1241/20000 [02:44<41:21,  7.56it/s]

Samples 24800 Batch_idx 1239 GradUpdates 1240 Loss 35.40121


Steps:   6%|▋         | 1261/20000 [02:47<41:20,  7.55it/s]

Samples 25200 Batch_idx 1259 GradUpdates 1260 Loss 34.89672


Steps:   6%|▋         | 1281/20000 [02:50<41:22,  7.54it/s]

Samples 25600 Batch_idx 1279 GradUpdates 1280 Loss 34.15653


Steps:   7%|▋         | 1301/20000 [02:52<41:31,  7.50it/s]

Samples 26000 Batch_idx 1299 GradUpdates 1300 Loss 33.37160


Steps:   7%|▋         | 1321/20000 [02:55<41:04,  7.58it/s]

Samples 26400 Batch_idx 1319 GradUpdates 1320 Loss 32.90891


Steps:   7%|▋         | 1341/20000 [02:57<41:09,  7.56it/s]

Samples 26800 Batch_idx 1339 GradUpdates 1340 Loss 31.94251


Steps:   7%|▋         | 1361/20000 [03:00<41:08,  7.55it/s]

Samples 27200 Batch_idx 1359 GradUpdates 1360 Loss 31.41052


Steps:   7%|▋         | 1381/20000 [03:02<41:02,  7.56it/s]

Samples 27600 Batch_idx 1379 GradUpdates 1380 Loss 30.94007


Steps:   7%|▋         | 1401/20000 [03:05<41:04,  7.55it/s]

Samples 28000 Batch_idx 1399 GradUpdates 1400 Loss 30.10039


Steps:   7%|▋         | 1421/20000 [03:07<41:30,  7.46it/s]

Samples 28400 Batch_idx 1419 GradUpdates 1420 Loss 29.79742


Steps:   7%|▋         | 1441/20000 [03:10<40:53,  7.56it/s]

Samples 28800 Batch_idx 1439 GradUpdates 1440 Loss 29.13492


Steps:   7%|▋         | 1461/20000 [03:12<40:57,  7.54it/s]

Samples 29200 Batch_idx 1459 GradUpdates 1460 Loss 28.50066


Steps:   7%|▋         | 1481/20000 [03:15<40:50,  7.56it/s]

Samples 29600 Batch_idx 1479 GradUpdates 1480 Loss 27.89557


Steps:   8%|▊         | 1501/20000 [03:17<40:56,  7.53it/s]

Samples 30000 Batch_idx 1499 GradUpdates 1500 Loss 27.61146


Steps:   8%|▊         | 1521/20000 [03:20<40:44,  7.56it/s]

Samples 30400 Batch_idx 1519 GradUpdates 1520 Loss 27.04369


Steps:   8%|▊         | 1541/20000 [03:22<40:44,  7.55it/s]

Samples 30800 Batch_idx 1539 GradUpdates 1540 Loss 26.48017


Steps:   8%|▊         | 1561/20000 [03:25<40:59,  7.50it/s]

Samples 31200 Batch_idx 1559 GradUpdates 1560 Loss 25.96595


Steps:   8%|▊         | 1581/20000 [03:28<40:34,  7.57it/s]

Samples 31600 Batch_idx 1579 GradUpdates 1580 Loss 25.72755


Steps:   8%|▊         | 1601/20000 [03:30<40:38,  7.55it/s]

Samples 32000 Batch_idx 1599 GradUpdates 1600 Loss 25.11861


Steps:   8%|▊         | 1621/20000 [03:33<40:32,  7.55it/s]

Samples 32400 Batch_idx 1619 GradUpdates 1620 Loss 24.70911


Steps:   8%|▊         | 1641/20000 [03:35<40:23,  7.57it/s]

Samples 32800 Batch_idx 1639 GradUpdates 1640 Loss 24.19687


Steps:   8%|▊         | 1661/20000 [03:38<40:14,  7.60it/s]

Samples 33200 Batch_idx 1659 GradUpdates 1660 Loss 23.97893


Steps:   8%|▊         | 1681/20000 [03:40<40:22,  7.56it/s]

Samples 33600 Batch_idx 1679 GradUpdates 1680 Loss 23.47489


Steps:   9%|▊         | 1701/20000 [03:43<40:23,  7.55it/s]

Samples 34000 Batch_idx 1699 GradUpdates 1700 Loss 22.82861


Steps:   9%|▊         | 1721/20000 [03:45<40:21,  7.55it/s]

Samples 34400 Batch_idx 1719 GradUpdates 1720 Loss 22.79118


Steps:   9%|▊         | 1741/20000 [03:48<40:07,  7.59it/s]

Samples 34800 Batch_idx 1739 GradUpdates 1740 Loss 22.16249


Steps:   9%|▉         | 1761/20000 [03:50<40:11,  7.56it/s]

Samples 35200 Batch_idx 1759 GradUpdates 1760 Loss 21.85539


Steps:   9%|▉         | 1781/20000 [03:53<40:08,  7.56it/s]

Samples 35600 Batch_idx 1779 GradUpdates 1780 Loss 21.47708


Steps:   9%|▉         | 1801/20000 [03:55<40:28,  7.49it/s]

Samples 36000 Batch_idx 1799 GradUpdates 1800 Loss 21.29999


Steps:   9%|▉         | 1821/20000 [03:58<40:04,  7.56it/s]

Samples 36400 Batch_idx 1819 GradUpdates 1820 Loss 20.93833


Steps:   9%|▉         | 1841/20000 [04:00<40:59,  7.38it/s]

Samples 36800 Batch_idx 1839 GradUpdates 1840 Loss 20.47579


Steps:   9%|▉         | 1861/20000 [04:03<40:05,  7.54it/s]

Samples 37200 Batch_idx 1859 GradUpdates 1860 Loss 20.16072


Steps:   9%|▉         | 1881/20000 [04:06<39:59,  7.55it/s]

Samples 37600 Batch_idx 1879 GradUpdates 1880 Loss 19.99705


Steps:  10%|▉         | 1901/20000 [04:08<39:54,  7.56it/s]

Samples 38000 Batch_idx 1899 GradUpdates 1900 Loss 19.57081


Steps:  10%|▉         | 1921/20000 [04:11<39:54,  7.55it/s]

Samples 38400 Batch_idx 1919 GradUpdates 1920 Loss 19.30560


Steps:  10%|▉         | 1941/20000 [04:13<39:50,  7.56it/s]

Samples 38800 Batch_idx 1939 GradUpdates 1940 Loss 18.89688


Steps:  10%|▉         | 1961/20000 [04:16<40:01,  7.51it/s]

Samples 39200 Batch_idx 1959 GradUpdates 1960 Loss 18.74771


Steps:  10%|▉         | 1981/20000 [04:18<39:40,  7.57it/s]

Samples 39600 Batch_idx 1979 GradUpdates 1980 Loss 18.49155


Steps:  10%|█         | 2000/20000 [04:21<43:04,  6.96it/s]

Samples 40000 Batch_idx 1999 GradUpdates 2000 Loss 18.30432
Samples 40020 Batch_idx 2000 GradUpdates 2001 Loss 18.19844


Eval Steps: 100%|██████████| 25/25 [00:04<00:00,  5.78it/s]
Steps:  10%|█         | 2021/20000 [04:30<40:16,  7.44it/s]   

Samples 40400 Batch_idx 2019 GradUpdates 2020 Loss 17.94943


Steps:  10%|█         | 2041/20000 [04:32<39:42,  7.54it/s]

Samples 40800 Batch_idx 2039 GradUpdates 2040 Loss 17.75477


Steps:  10%|█         | 2061/20000 [04:35<39:43,  7.53it/s]

Samples 41200 Batch_idx 2059 GradUpdates 2060 Loss 17.40765


Steps:  10%|█         | 2081/20000 [04:38<39:38,  7.53it/s]

Samples 41600 Batch_idx 2079 GradUpdates 2080 Loss 17.10039


Steps:  11%|█         | 2101/20000 [04:41<54:22,  5.49it/s]  

Samples 42000 Batch_idx 2099 GradUpdates 2100 Loss 16.96486


Steps:  11%|█         | 2121/20000 [04:43<39:29,  7.55it/s]

Samples 42400 Batch_idx 2119 GradUpdates 2120 Loss 16.66075


Steps:  11%|█         | 2141/20000 [04:46<39:22,  7.56it/s]

Samples 42800 Batch_idx 2139 GradUpdates 2140 Loss 16.50059


Steps:  11%|█         | 2161/20000 [04:48<39:30,  7.52it/s]

Samples 43200 Batch_idx 2159 GradUpdates 2160 Loss 16.35100


Steps:  11%|█         | 2181/20000 [04:51<39:29,  7.52it/s]

Samples 43600 Batch_idx 2179 GradUpdates 2180 Loss 16.18197


Steps:  11%|█         | 2201/20000 [04:53<39:36,  7.49it/s]

Samples 44000 Batch_idx 2199 GradUpdates 2200 Loss 15.85884


Steps:  11%|█         | 2221/20000 [04:56<39:25,  7.52it/s]

Samples 44400 Batch_idx 2219 GradUpdates 2220 Loss 15.70985


Steps:  11%|█         | 2241/20000 [04:59<39:23,  7.51it/s]

Samples 44800 Batch_idx 2239 GradUpdates 2240 Loss 15.52930


Steps:  11%|█▏        | 2261/20000 [05:01<39:13,  7.54it/s]

Samples 45200 Batch_idx 2259 GradUpdates 2260 Loss 15.16891


Steps:  11%|█▏        | 2281/20000 [05:04<39:16,  7.52it/s]

Samples 45600 Batch_idx 2279 GradUpdates 2280 Loss 15.08404


Steps:  12%|█▏        | 2301/20000 [05:06<39:14,  7.52it/s]

Samples 46000 Batch_idx 2299 GradUpdates 2300 Loss 14.77493


Steps:  12%|█▏        | 2321/20000 [05:09<39:11,  7.52it/s]

Samples 46400 Batch_idx 2319 GradUpdates 2320 Loss 14.61859


Steps:  12%|█▏        | 2341/20000 [05:11<39:09,  7.52it/s]

Samples 46800 Batch_idx 2339 GradUpdates 2340 Loss 14.52347


Steps:  12%|█▏        | 2361/20000 [05:14<39:07,  7.51it/s]

Samples 47200 Batch_idx 2359 GradUpdates 2360 Loss 14.34060


Steps:  12%|█▏        | 2381/20000 [05:16<39:02,  7.52it/s]

Samples 47600 Batch_idx 2379 GradUpdates 2380 Loss 14.03607


Steps:  12%|█▏        | 2401/20000 [05:19<38:59,  7.52it/s]

Samples 48000 Batch_idx 2399 GradUpdates 2400 Loss 13.88082


Steps:  12%|█▏        | 2421/20000 [05:21<38:58,  7.52it/s]

Samples 48400 Batch_idx 2419 GradUpdates 2420 Loss 13.98053


Steps:  12%|█▏        | 2441/20000 [05:24<38:56,  7.52it/s]

Samples 48800 Batch_idx 2439 GradUpdates 2440 Loss 13.64338


Steps:  12%|█▏        | 2461/20000 [05:27<39:11,  7.46it/s]

Samples 49200 Batch_idx 2459 GradUpdates 2460 Loss 13.63483


Steps:  12%|█▏        | 2481/20000 [05:29<38:51,  7.51it/s]

Samples 49600 Batch_idx 2479 GradUpdates 2480 Loss 13.31459


Steps:  13%|█▎        | 2501/20000 [05:32<38:44,  7.53it/s]

Samples 50000 Batch_idx 2499 GradUpdates 2500 Loss 13.21672


Steps:  13%|█▎        | 2521/20000 [05:34<38:51,  7.50it/s]

Samples 50400 Batch_idx 2519 GradUpdates 2520 Loss 12.96528


Steps:  13%|█▎        | 2541/20000 [05:37<38:39,  7.53it/s]

Samples 50800 Batch_idx 2539 GradUpdates 2540 Loss 12.88573


Steps:  13%|█▎        | 2561/20000 [05:39<38:38,  7.52it/s]

Samples 51200 Batch_idx 2559 GradUpdates 2560 Loss 12.71950


Steps:  13%|█▎        | 2581/20000 [05:42<38:27,  7.55it/s]

Samples 51600 Batch_idx 2579 GradUpdates 2580 Loss 12.53686


Steps:  13%|█▎        | 2601/20000 [05:44<38:35,  7.51it/s]

Samples 52000 Batch_idx 2599 GradUpdates 2600 Loss 12.55918


Steps:  13%|█▎        | 2621/20000 [05:47<38:31,  7.52it/s]

Samples 52400 Batch_idx 2619 GradUpdates 2620 Loss 12.30874


Steps:  13%|█▎        | 2641/20000 [05:49<38:26,  7.53it/s]

Samples 52800 Batch_idx 2639 GradUpdates 2640 Loss 12.15455


Steps:  13%|█▎        | 2661/20000 [05:52<38:25,  7.52it/s]

Samples 53200 Batch_idx 2659 GradUpdates 2660 Loss 12.08913


Steps:  13%|█▎        | 2681/20000 [05:55<38:24,  7.52it/s]

Samples 53600 Batch_idx 2679 GradUpdates 2680 Loss 12.00879


Steps:  14%|█▎        | 2701/20000 [05:57<38:35,  7.47it/s]

Samples 54000 Batch_idx 2699 GradUpdates 2700 Loss 11.82141


Steps:  14%|█▎        | 2721/20000 [06:00<38:19,  7.51it/s]

Samples 54400 Batch_idx 2719 GradUpdates 2720 Loss 11.72053


Steps:  14%|█▎        | 2741/20000 [06:02<38:15,  7.52it/s]

Samples 54800 Batch_idx 2739 GradUpdates 2740 Loss 11.53460


Steps:  14%|█▍        | 2761/20000 [06:05<38:12,  7.52it/s]

Samples 55200 Batch_idx 2759 GradUpdates 2760 Loss 11.42690


Steps:  14%|█▍        | 2781/20000 [06:07<38:09,  7.52it/s]

Samples 55600 Batch_idx 2779 GradUpdates 2780 Loss 11.37010


Steps:  14%|█▍        | 2801/20000 [06:10<38:09,  7.51it/s]

Samples 56000 Batch_idx 2799 GradUpdates 2800 Loss 11.11811


Steps:  14%|█▍        | 2821/20000 [06:12<38:09,  7.50it/s]

Samples 56400 Batch_idx 2819 GradUpdates 2820 Loss 11.19848


Steps:  14%|█▍        | 2841/20000 [06:15<38:01,  7.52it/s]

Samples 56800 Batch_idx 2839 GradUpdates 2840 Loss 11.00297


Steps:  14%|█▍        | 2861/20000 [06:17<38:02,  7.51it/s]

Samples 57200 Batch_idx 2859 GradUpdates 2860 Loss 10.78154


Steps:  14%|█▍        | 2881/20000 [06:20<37:46,  7.55it/s]

Samples 57600 Batch_idx 2879 GradUpdates 2880 Loss 10.76278


Steps:  15%|█▍        | 2901/20000 [06:23<37:48,  7.54it/s]

Samples 58000 Batch_idx 2899 GradUpdates 2900 Loss 10.65821


Steps:  15%|█▍        | 2921/20000 [06:25<37:53,  7.51it/s]

Samples 58400 Batch_idx 2919 GradUpdates 2920 Loss 10.54650


Steps:  15%|█▍        | 2941/20000 [06:28<37:54,  7.50it/s]

Samples 58800 Batch_idx 2939 GradUpdates 2940 Loss 10.46760


Steps:  15%|█▍        | 2961/20000 [06:30<38:06,  7.45it/s]

Samples 59200 Batch_idx 2959 GradUpdates 2960 Loss 10.42189


Steps:  15%|█▍        | 2981/20000 [06:33<37:44,  7.51it/s]

Samples 59600 Batch_idx 2979 GradUpdates 2980 Loss 10.36296


Steps:  15%|█▌        | 3001/20000 [06:35<37:43,  7.51it/s]

Samples 60000 Batch_idx 2999 GradUpdates 3000 Loss 10.19138


Steps:  15%|█▌        | 3021/20000 [06:38<37:40,  7.51it/s]

Samples 60400 Batch_idx 3019 GradUpdates 3020 Loss 10.29211


Steps:  15%|█▌        | 3041/20000 [06:40<37:38,  7.51it/s]

Samples 60800 Batch_idx 3039 GradUpdates 3040 Loss 9.90092


Steps:  15%|█▌        | 3061/20000 [06:43<37:35,  7.51it/s]

Samples 61200 Batch_idx 3059 GradUpdates 3060 Loss 9.76275


Steps:  15%|█▌        | 3081/20000 [06:46<37:34,  7.50it/s]

Samples 61600 Batch_idx 3079 GradUpdates 3080 Loss 9.96831


Steps:  16%|█▌        | 3101/20000 [06:48<37:30,  7.51it/s]

Samples 62000 Batch_idx 3099 GradUpdates 3100 Loss 9.86269


Steps:  16%|█▌        | 3121/20000 [06:51<37:30,  7.50it/s]

Samples 62400 Batch_idx 3119 GradUpdates 3120 Loss 9.72460


Steps:  16%|█▌        | 3141/20000 [06:53<37:25,  7.51it/s]

Samples 62800 Batch_idx 3139 GradUpdates 3140 Loss 9.67243


Steps:  16%|█▌        | 3161/20000 [06:56<37:19,  7.52it/s]

Samples 63200 Batch_idx 3159 GradUpdates 3160 Loss 9.49035


Steps:  16%|█▌        | 3181/20000 [06:58<37:18,  7.51it/s]

Samples 63600 Batch_idx 3179 GradUpdates 3180 Loss 9.46504


Steps:  16%|█▌        | 3201/20000 [07:01<37:34,  7.45it/s]

Samples 64000 Batch_idx 3199 GradUpdates 3200 Loss 9.31408


Steps:  16%|█▌        | 3221/20000 [07:03<37:20,  7.49it/s]

Samples 64400 Batch_idx 3219 GradUpdates 3220 Loss 9.25141


Steps:  16%|█▌        | 3241/20000 [07:06<37:09,  7.52it/s]

Samples 64800 Batch_idx 3239 GradUpdates 3240 Loss 9.28102


Steps:  16%|█▋        | 3261/20000 [07:08<37:11,  7.50it/s]

Samples 65200 Batch_idx 3259 GradUpdates 3260 Loss 9.03765


Steps:  16%|█▋        | 3281/20000 [07:11<37:07,  7.51it/s]

Samples 65600 Batch_idx 3279 GradUpdates 3280 Loss 9.00228


Steps:  17%|█▋        | 3301/20000 [07:14<37:04,  7.51it/s]

Samples 66000 Batch_idx 3299 GradUpdates 3300 Loss 8.95565


Steps:  17%|█▋        | 3321/20000 [07:16<36:58,  7.52it/s]

Samples 66400 Batch_idx 3319 GradUpdates 3320 Loss 9.01283


Steps:  17%|█▋        | 3341/20000 [07:19<36:57,  7.51it/s]

Samples 66800 Batch_idx 3339 GradUpdates 3340 Loss 8.92027


Steps:  17%|█▋        | 3361/20000 [07:21<36:57,  7.50it/s]

Samples 67200 Batch_idx 3359 GradUpdates 3360 Loss 8.78875


Steps:  17%|█▋        | 3381/20000 [07:24<36:51,  7.52it/s]

Samples 67600 Batch_idx 3379 GradUpdates 3380 Loss 8.75894


Steps:  17%|█▋        | 3401/20000 [07:26<36:51,  7.51it/s]

Samples 68000 Batch_idx 3399 GradUpdates 3400 Loss 8.60186


Steps:  17%|█▋        | 3421/20000 [07:29<36:48,  7.51it/s]

Samples 68400 Batch_idx 3419 GradUpdates 3420 Loss 8.55658


Steps:  17%|█▋        | 3441/20000 [07:31<36:42,  7.52it/s]

Samples 68800 Batch_idx 3439 GradUpdates 3440 Loss 8.36480


Steps:  17%|█▋        | 3461/20000 [07:34<37:01,  7.45it/s]

Samples 69200 Batch_idx 3459 GradUpdates 3460 Loss 8.45691


Steps:  17%|█▋        | 3481/20000 [07:36<36:36,  7.52it/s]

Samples 69600 Batch_idx 3479 GradUpdates 3480 Loss 8.32575


Steps:  18%|█▊        | 3501/20000 [07:39<36:40,  7.50it/s]

Samples 70000 Batch_idx 3499 GradUpdates 3500 Loss 8.37806


Steps:  18%|█▊        | 3521/20000 [07:42<36:33,  7.51it/s]

Samples 70400 Batch_idx 3519 GradUpdates 3520 Loss 8.20766


Steps:  18%|█▊        | 3541/20000 [07:44<36:29,  7.52it/s]

Samples 70800 Batch_idx 3539 GradUpdates 3540 Loss 8.23338


Steps:  18%|█▊        | 3561/20000 [07:47<36:28,  7.51it/s]

Samples 71200 Batch_idx 3559 GradUpdates 3560 Loss 8.30634


Steps:  18%|█▊        | 3581/20000 [07:49<36:26,  7.51it/s]

Samples 71600 Batch_idx 3579 GradUpdates 3580 Loss 8.09250


Steps:  18%|█▊        | 3601/20000 [07:52<36:23,  7.51it/s]

Samples 72000 Batch_idx 3599 GradUpdates 3600 Loss 7.98609


Steps:  18%|█▊        | 3621/20000 [07:54<36:18,  7.52it/s]

Samples 72400 Batch_idx 3619 GradUpdates 3620 Loss 8.00065


Steps:  18%|█▊        | 3641/20000 [07:57<36:20,  7.50it/s]

Samples 72800 Batch_idx 3639 GradUpdates 3640 Loss 7.95962


Steps:  18%|█▊        | 3661/20000 [07:59<36:12,  7.52it/s]

Samples 73200 Batch_idx 3659 GradUpdates 3660 Loss 7.89720


Steps:  18%|█▊        | 3681/20000 [08:02<36:20,  7.48it/s]

Samples 73600 Batch_idx 3679 GradUpdates 3680 Loss 7.77698


Steps:  19%|█▊        | 3701/20000 [08:04<36:29,  7.45it/s]

Samples 74000 Batch_idx 3699 GradUpdates 3700 Loss 7.86082


Steps:  19%|█▊        | 3721/20000 [08:07<36:12,  7.49it/s]

Samples 74400 Batch_idx 3719 GradUpdates 3720 Loss 7.75246


Steps:  19%|█▊        | 3741/20000 [08:10<36:04,  7.51it/s]

Samples 74800 Batch_idx 3739 GradUpdates 3740 Loss 7.70042


Steps:  19%|█▉        | 3761/20000 [08:12<35:59,  7.52it/s]

Samples 75200 Batch_idx 3759 GradUpdates 3760 Loss 7.59272


Steps:  19%|█▉        | 3781/20000 [08:15<36:03,  7.50it/s]

Samples 75600 Batch_idx 3779 GradUpdates 3780 Loss 7.59470


Steps:  19%|█▉        | 3801/20000 [08:17<35:52,  7.53it/s]

Samples 76000 Batch_idx 3799 GradUpdates 3800 Loss 7.50121


Steps:  19%|█▉        | 3821/20000 [08:20<35:54,  7.51it/s]

Samples 76400 Batch_idx 3819 GradUpdates 3820 Loss 7.41901


Steps:  19%|█▉        | 3841/20000 [08:22<35:49,  7.52it/s]

Samples 76800 Batch_idx 3839 GradUpdates 3840 Loss 7.49362


Steps:  19%|█▉        | 3861/20000 [08:25<35:47,  7.51it/s]

Samples 77200 Batch_idx 3859 GradUpdates 3860 Loss 7.45299


Steps:  19%|█▉        | 3881/20000 [08:27<35:42,  7.52it/s]

Samples 77600 Batch_idx 3879 GradUpdates 3880 Loss 7.26695


Steps:  20%|█▉        | 3901/20000 [08:30<35:33,  7.54it/s]

Samples 78000 Batch_idx 3899 GradUpdates 3900 Loss 7.24822


Steps:  20%|█▉        | 3921/20000 [08:33<35:28,  7.55it/s]

Samples 78400 Batch_idx 3919 GradUpdates 3920 Loss 7.28213


Steps:  20%|█▉        | 3941/20000 [08:35<35:37,  7.51it/s]

Samples 78800 Batch_idx 3939 GradUpdates 3940 Loss 7.14675


Steps:  20%|█▉        | 3961/20000 [08:38<35:55,  7.44it/s]

Samples 79200 Batch_idx 3959 GradUpdates 3960 Loss 7.05602


Steps:  20%|█▉        | 3981/20000 [08:40<35:30,  7.52it/s]

Samples 79600 Batch_idx 3979 GradUpdates 3980 Loss 7.05405


Steps:  20%|██        | 4000/20000 [08:43<36:17,  7.35it/s]

Samples 80000 Batch_idx 3999 GradUpdates 4000 Loss 7.18818
Samples 80020 Batch_idx 4000 GradUpdates 4001 Loss 7.08685


Eval Steps: 100%|██████████| 25/25 [00:04<00:00,  5.80it/s]
Steps:  20%|██        | 4021/20000 [08:53<35:57,  7.41it/s]   

Samples 80400 Batch_idx 4019 GradUpdates 4020 Loss 7.03607


Steps:  20%|██        | 4041/20000 [08:56<35:23,  7.51it/s]

Samples 80800 Batch_idx 4039 GradUpdates 4040 Loss 7.07716


Steps:  20%|██        | 4061/20000 [08:58<35:30,  7.48it/s]

Samples 81200 Batch_idx 4059 GradUpdates 4060 Loss 6.88660


Steps:  20%|██        | 4081/20000 [09:01<35:20,  7.51it/s]

Samples 81600 Batch_idx 4079 GradUpdates 4080 Loss 6.86166


Steps:  21%|██        | 4101/20000 [09:03<35:22,  7.49it/s]

Samples 82000 Batch_idx 4099 GradUpdates 4100 Loss 6.97613


Steps:  21%|██        | 4121/20000 [09:06<35:19,  7.49it/s]

Samples 82400 Batch_idx 4119 GradUpdates 4120 Loss 6.81913


Steps:  21%|██        | 4141/20000 [09:08<35:14,  7.50it/s]

Samples 82800 Batch_idx 4139 GradUpdates 4140 Loss 6.82930


Steps:  21%|██        | 4161/20000 [09:11<35:05,  7.52it/s]  

Samples 83200 Batch_idx 4159 GradUpdates 4160 Loss 6.75111


Steps:  21%|██        | 4181/20000 [09:14<35:02,  7.53it/s]

Samples 83600 Batch_idx 4179 GradUpdates 4180 Loss 6.78529


Steps:  21%|██        | 4201/20000 [09:16<35:05,  7.50it/s]

Samples 84000 Batch_idx 4199 GradUpdates 4200 Loss 6.60814


Steps:  21%|██        | 4221/20000 [09:19<34:59,  7.52it/s]

Samples 84400 Batch_idx 4219 GradUpdates 4220 Loss 6.59784


Steps:  21%|██        | 4241/20000 [09:22<34:58,  7.51it/s]

Samples 84800 Batch_idx 4239 GradUpdates 4240 Loss 6.56310


Steps:  21%|██▏       | 4261/20000 [09:24<35:14,  7.45it/s]

Samples 85200 Batch_idx 4259 GradUpdates 4260 Loss 6.50372


Steps:  21%|██▏       | 4281/20000 [09:27<34:51,  7.51it/s]

Samples 85600 Batch_idx 4279 GradUpdates 4280 Loss 6.61293


Steps:  22%|██▏       | 4301/20000 [09:29<34:44,  7.53it/s]

Samples 86000 Batch_idx 4299 GradUpdates 4300 Loss 6.46329


Steps:  22%|██▏       | 4321/20000 [09:32<34:34,  7.56it/s]

Samples 86400 Batch_idx 4319 GradUpdates 4320 Loss 6.51338


Steps:  22%|██▏       | 4341/20000 [09:34<34:35,  7.54it/s]

Samples 86800 Batch_idx 4339 GradUpdates 4340 Loss 6.53788


Steps:  22%|██▏       | 4361/20000 [09:37<34:39,  7.52it/s]

Samples 87200 Batch_idx 4359 GradUpdates 4360 Loss 6.39480


Steps:  22%|██▏       | 4381/20000 [09:39<34:40,  7.51it/s]

Samples 87600 Batch_idx 4379 GradUpdates 4380 Loss 6.38326


Steps:  22%|██▏       | 4401/20000 [09:42<34:34,  7.52it/s]

Samples 88000 Batch_idx 4399 GradUpdates 4400 Loss 6.36768


Steps:  22%|██▏       | 4421/20000 [09:44<34:30,  7.53it/s]

Samples 88400 Batch_idx 4419 GradUpdates 4420 Loss 6.27092


Steps:  22%|██▏       | 4441/20000 [09:47<34:59,  7.41it/s]

Samples 88800 Batch_idx 4439 GradUpdates 4440 Loss 6.29706


Steps:  22%|██▏       | 4461/20000 [09:49<34:24,  7.53it/s]

Samples 89200 Batch_idx 4459 GradUpdates 4460 Loss 6.26507


Steps:  22%|██▏       | 4481/20000 [09:52<34:21,  7.53it/s]

Samples 89600 Batch_idx 4479 GradUpdates 4480 Loss 6.27185


Steps:  23%|██▎       | 4501/20000 [09:55<34:43,  7.44it/s]

Samples 90000 Batch_idx 4499 GradUpdates 4500 Loss 6.25784


Steps:  23%|██▎       | 4521/20000 [09:57<34:15,  7.53it/s]

Samples 90400 Batch_idx 4519 GradUpdates 4520 Loss 6.19105


Steps:  23%|██▎       | 4541/20000 [10:00<34:18,  7.51it/s]

Samples 90800 Batch_idx 4539 GradUpdates 4540 Loss 6.14142


Steps:  23%|██▎       | 4561/20000 [10:02<34:10,  7.53it/s]

Samples 91200 Batch_idx 4559 GradUpdates 4560 Loss 6.15689


Steps:  23%|██▎       | 4581/20000 [10:05<34:09,  7.52it/s]

Samples 91600 Batch_idx 4579 GradUpdates 4580 Loss 6.06573


Steps:  23%|██▎       | 4601/20000 [10:07<34:07,  7.52it/s]

Samples 92000 Batch_idx 4599 GradUpdates 4600 Loss 6.02528


Steps:  23%|██▎       | 4621/20000 [10:10<34:05,  7.52it/s]

Samples 92400 Batch_idx 4619 GradUpdates 4620 Loss 6.02470


Steps:  23%|██▎       | 4641/20000 [10:12<34:00,  7.53it/s]

Samples 92800 Batch_idx 4639 GradUpdates 4640 Loss 6.03569


Steps:  23%|██▎       | 4661/20000 [10:15<33:58,  7.53it/s]

Samples 93200 Batch_idx 4659 GradUpdates 4660 Loss 6.10328


Steps:  23%|██▎       | 4681/20000 [10:17<33:56,  7.52it/s]

Samples 93600 Batch_idx 4679 GradUpdates 4680 Loss 5.94075


Steps:  24%|██▎       | 4701/20000 [10:20<33:55,  7.52it/s]

Samples 94000 Batch_idx 4699 GradUpdates 4700 Loss 6.03322


Steps:  24%|██▎       | 4721/20000 [10:23<33:49,  7.53it/s]

Samples 94400 Batch_idx 4719 GradUpdates 4720 Loss 5.92076


Steps:  24%|██▎       | 4741/20000 [10:25<33:47,  7.53it/s]

Samples 94800 Batch_idx 4739 GradUpdates 4740 Loss 5.88396


Steps:  24%|██▍       | 4761/20000 [10:28<34:01,  7.46it/s]

Samples 95200 Batch_idx 4759 GradUpdates 4760 Loss 5.85825


Steps:  24%|██▍       | 4781/20000 [10:30<33:43,  7.52it/s]

Samples 95600 Batch_idx 4779 GradUpdates 4780 Loss 5.83803


Steps:  24%|██▍       | 4801/20000 [10:33<33:38,  7.53it/s]

Samples 96000 Batch_idx 4799 GradUpdates 4800 Loss 5.78880


Steps:  24%|██▍       | 4821/20000 [10:35<33:45,  7.49it/s]

Samples 96400 Batch_idx 4819 GradUpdates 4820 Loss 5.79718


Steps:  24%|██▍       | 4841/20000 [10:38<33:35,  7.52it/s]

Samples 96800 Batch_idx 4839 GradUpdates 4840 Loss 5.71398


Steps:  24%|██▍       | 4861/20000 [10:40<33:31,  7.53it/s]

Samples 97200 Batch_idx 4859 GradUpdates 4860 Loss 5.83822


Steps:  24%|██▍       | 4881/20000 [10:43<33:31,  7.52it/s]

Samples 97600 Batch_idx 4879 GradUpdates 4880 Loss 5.73216


Steps:  25%|██▍       | 4901/20000 [10:45<33:28,  7.52it/s]

Samples 98000 Batch_idx 4899 GradUpdates 4900 Loss 5.76757


Steps:  25%|██▍       | 4921/20000 [10:48<33:22,  7.53it/s]

Samples 98400 Batch_idx 4919 GradUpdates 4920 Loss 5.67148


Steps:  25%|██▍       | 4941/20000 [10:51<33:25,  7.51it/s]

Samples 98800 Batch_idx 4939 GradUpdates 4940 Loss 5.66540


Steps:  25%|██▍       | 4961/20000 [10:53<33:19,  7.52it/s]

Samples 99200 Batch_idx 4959 GradUpdates 4960 Loss 5.55841


Steps:  25%|██▍       | 4981/20000 [10:56<33:21,  7.50it/s]

Samples 99600 Batch_idx 4979 GradUpdates 4980 Loss 5.55609


Steps:  25%|██▌       | 5001/20000 [10:58<33:30,  7.46it/s]

Samples 100000 Batch_idx 4999 GradUpdates 5000 Loss 5.60797


Steps:  25%|██▌       | 5021/20000 [11:01<33:11,  7.52it/s]

Samples 100400 Batch_idx 5019 GradUpdates 5020 Loss 5.50762


Steps:  25%|██▌       | 5041/20000 [11:03<33:07,  7.53it/s]

Samples 100800 Batch_idx 5039 GradUpdates 5040 Loss 5.46028


Steps:  25%|██▌       | 5061/20000 [11:06<33:05,  7.52it/s]

Samples 101200 Batch_idx 5059 GradUpdates 5060 Loss 5.50135


Steps:  25%|██▌       | 5081/20000 [11:08<33:03,  7.52it/s]

Samples 101600 Batch_idx 5079 GradUpdates 5080 Loss 5.45429


Steps:  26%|██▌       | 5101/20000 [11:11<33:03,  7.51it/s]

Samples 102000 Batch_idx 5099 GradUpdates 5100 Loss 5.44774


Steps:  26%|██▌       | 5121/20000 [11:13<33:01,  7.51it/s]

Samples 102400 Batch_idx 5119 GradUpdates 5120 Loss 5.50236


Steps:  26%|██▌       | 5141/20000 [11:16<32:55,  7.52it/s]

Samples 102800 Batch_idx 5139 GradUpdates 5140 Loss 5.40342


Steps:  26%|██▌       | 5161/20000 [11:18<32:53,  7.52it/s]

Samples 103200 Batch_idx 5159 GradUpdates 5160 Loss 5.42627


Steps:  26%|██▌       | 5181/20000 [11:21<32:49,  7.52it/s]

Samples 103600 Batch_idx 5179 GradUpdates 5180 Loss 5.39169


Steps:  26%|██▌       | 5201/20000 [11:24<32:49,  7.51it/s]

Samples 104000 Batch_idx 5199 GradUpdates 5200 Loss 5.37515


Steps:  26%|██▌       | 5221/20000 [11:26<32:44,  7.52it/s]

Samples 104400 Batch_idx 5219 GradUpdates 5220 Loss 5.35749


Steps:  26%|██▌       | 5241/20000 [11:29<32:42,  7.52it/s]

Samples 104800 Batch_idx 5239 GradUpdates 5240 Loss 5.35348


Steps:  26%|██▋       | 5261/20000 [11:31<32:57,  7.45it/s]

Samples 105200 Batch_idx 5259 GradUpdates 5260 Loss 5.29449


Steps:  26%|██▋       | 5281/20000 [11:34<32:36,  7.52it/s]

Samples 105600 Batch_idx 5279 GradUpdates 5280 Loss 5.34044


Steps:  27%|██▋       | 5301/20000 [11:36<32:37,  7.51it/s]

Samples 106000 Batch_idx 5299 GradUpdates 5300 Loss 5.27917


Steps:  27%|██▋       | 5321/20000 [11:39<32:31,  7.52it/s]

Samples 106400 Batch_idx 5319 GradUpdates 5320 Loss 5.27050


Steps:  27%|██▋       | 5341/20000 [11:41<32:28,  7.52it/s]

Samples 106800 Batch_idx 5339 GradUpdates 5340 Loss 5.29956


Steps:  27%|██▋       | 5361/20000 [11:44<32:27,  7.52it/s]

Samples 107200 Batch_idx 5359 GradUpdates 5360 Loss 5.26450


Steps:  27%|██▋       | 5381/20000 [11:46<32:23,  7.52it/s]

Samples 107600 Batch_idx 5379 GradUpdates 5380 Loss 5.12982


Steps:  27%|██▋       | 5401/20000 [11:49<32:26,  7.50it/s]

Samples 108000 Batch_idx 5399 GradUpdates 5400 Loss 5.23457


Steps:  27%|██▋       | 5421/20000 [11:52<32:18,  7.52it/s]

Samples 108400 Batch_idx 5419 GradUpdates 5420 Loss 5.17463


Steps:  27%|██▋       | 5441/20000 [11:54<32:13,  7.53it/s]

Samples 108800 Batch_idx 5439 GradUpdates 5440 Loss 5.18422


Steps:  27%|██▋       | 5461/20000 [11:57<32:12,  7.52it/s]

Samples 109200 Batch_idx 5459 GradUpdates 5460 Loss 5.10304


Steps:  27%|██▋       | 5481/20000 [11:59<32:07,  7.53it/s]

Samples 109600 Batch_idx 5479 GradUpdates 5480 Loss 5.27733


Steps:  28%|██▊       | 5501/20000 [12:02<32:22,  7.46it/s]

Samples 110000 Batch_idx 5499 GradUpdates 5500 Loss 5.04951


Steps:  28%|██▊       | 5521/20000 [12:04<32:03,  7.53it/s]

Samples 110400 Batch_idx 5519 GradUpdates 5520 Loss 5.05346


Steps:  28%|██▊       | 5541/20000 [12:07<32:15,  7.47it/s]

Samples 110800 Batch_idx 5539 GradUpdates 5540 Loss 5.09920


Steps:  28%|██▊       | 5561/20000 [12:09<32:00,  7.52it/s]

Samples 111200 Batch_idx 5559 GradUpdates 5560 Loss 5.07906


Steps:  28%|██▊       | 5581/20000 [12:12<32:03,  7.50it/s]

Samples 111600 Batch_idx 5579 GradUpdates 5580 Loss 5.02563


Steps:  28%|██▊       | 5601/20000 [12:14<31:55,  7.52it/s]

Samples 112000 Batch_idx 5599 GradUpdates 5600 Loss 5.13077


Steps:  28%|██▊       | 5621/20000 [12:17<31:50,  7.52it/s]

Samples 112400 Batch_idx 5619 GradUpdates 5620 Loss 5.04169


Steps:  28%|██▊       | 5641/20000 [12:20<31:48,  7.52it/s]

Samples 112800 Batch_idx 5639 GradUpdates 5640 Loss 5.08607


Steps:  28%|██▊       | 5661/20000 [12:22<31:44,  7.53it/s]

Samples 113200 Batch_idx 5659 GradUpdates 5660 Loss 5.01973


Steps:  28%|██▊       | 5681/20000 [12:25<31:46,  7.51it/s]

Samples 113600 Batch_idx 5679 GradUpdates 5680 Loss 4.99258


Steps:  29%|██▊       | 5701/20000 [12:27<31:42,  7.51it/s]

Samples 114000 Batch_idx 5699 GradUpdates 5700 Loss 4.98173


Steps:  29%|██▊       | 5721/20000 [12:30<31:42,  7.50it/s]

Samples 114400 Batch_idx 5719 GradUpdates 5720 Loss 4.94771


Steps:  29%|██▊       | 5741/20000 [12:32<31:34,  7.53it/s]

Samples 114800 Batch_idx 5739 GradUpdates 5740 Loss 4.84726


Steps:  29%|██▉       | 5761/20000 [12:35<31:53,  7.44it/s]

Samples 115200 Batch_idx 5759 GradUpdates 5760 Loss 4.95125


Steps:  29%|██▉       | 5781/20000 [12:37<31:28,  7.53it/s]

Samples 115600 Batch_idx 5779 GradUpdates 5780 Loss 5.01373


Steps:  29%|██▉       | 5801/20000 [12:40<31:29,  7.52it/s]

Samples 116000 Batch_idx 5799 GradUpdates 5800 Loss 4.88713


Steps:  29%|██▉       | 5821/20000 [12:42<31:26,  7.52it/s]

Samples 116400 Batch_idx 5819 GradUpdates 5820 Loss 4.85750


Steps:  29%|██▉       | 5841/20000 [12:45<31:22,  7.52it/s]

Samples 116800 Batch_idx 5839 GradUpdates 5840 Loss 4.95529


Steps:  29%|██▉       | 5861/20000 [12:48<31:21,  7.52it/s]

Samples 117200 Batch_idx 5859 GradUpdates 5860 Loss 4.81737


Steps:  29%|██▉       | 5881/20000 [12:50<31:17,  7.52it/s]

Samples 117600 Batch_idx 5879 GradUpdates 5880 Loss 4.80747


Steps:  30%|██▉       | 5901/20000 [12:53<31:17,  7.51it/s]

Samples 118000 Batch_idx 5899 GradUpdates 5900 Loss 4.86110


Steps:  30%|██▉       | 5921/20000 [12:55<31:11,  7.52it/s]

Samples 118400 Batch_idx 5919 GradUpdates 5920 Loss 4.79650


Steps:  30%|██▉       | 5941/20000 [12:58<31:11,  7.51it/s]

Samples 118800 Batch_idx 5939 GradUpdates 5940 Loss 4.79311


Steps:  30%|██▉       | 5961/20000 [13:00<31:04,  7.53it/s]

Samples 119200 Batch_idx 5959 GradUpdates 5960 Loss 4.83952


Steps:  30%|██▉       | 5981/20000 [13:03<31:01,  7.53it/s]

Samples 119600 Batch_idx 5979 GradUpdates 5980 Loss 4.74407


Steps:  30%|███       | 6000/20000 [13:05<31:48,  7.34it/s]

Samples 120000 Batch_idx 5999 GradUpdates 6000 Loss 4.73282
Samples 120020 Batch_idx 6000 GradUpdates 6001 Loss 4.86499


Eval Steps: 100%|██████████| 25/25 [00:04<00:00,  5.81it/s]
Steps:  30%|███       | 6021/20000 [13:15<31:24,  7.42it/s]  

Samples 120400 Batch_idx 6019 GradUpdates 6020 Loss 4.70047


Steps:  30%|███       | 6041/20000 [13:17<31:00,  7.50it/s]

Samples 120800 Batch_idx 6039 GradUpdates 6040 Loss 4.81007


Steps:  30%|███       | 6061/20000 [13:20<30:56,  7.51it/s]

Samples 121200 Batch_idx 6059 GradUpdates 6060 Loss 4.72700


Steps:  30%|███       | 6081/20000 [13:22<30:57,  7.49it/s]

Samples 121600 Batch_idx 6079 GradUpdates 6080 Loss 4.69272


Steps:  31%|███       | 6101/20000 [13:25<30:52,  7.50it/s]

Samples 122000 Batch_idx 6099 GradUpdates 6100 Loss 4.75167


Steps:  31%|███       | 6121/20000 [13:27<30:50,  7.50it/s]

Samples 122400 Batch_idx 6119 GradUpdates 6120 Loss 4.72904


Steps:  31%|███       | 6141/20000 [13:30<30:48,  7.50it/s]

Samples 122800 Batch_idx 6139 GradUpdates 6140 Loss 4.72014


Steps:  31%|███       | 6161/20000 [13:33<30:46,  7.49it/s]

Samples 123200 Batch_idx 6159 GradUpdates 6160 Loss 4.68064


Steps:  31%|███       | 6181/20000 [13:35<30:44,  7.49it/s]

Samples 123600 Batch_idx 6179 GradUpdates 6180 Loss 4.67428


Steps:  31%|███       | 6201/20000 [13:38<32:40,  7.04it/s]  

Samples 124000 Batch_idx 6199 GradUpdates 6200 Loss 4.63095


Steps:  31%|███       | 6221/20000 [13:41<30:34,  7.51it/s]

Samples 124400 Batch_idx 6219 GradUpdates 6220 Loss 4.57615


Steps:  31%|███       | 6241/20000 [13:43<30:33,  7.51it/s]

Samples 124800 Batch_idx 6239 GradUpdates 6240 Loss 4.64493


Steps:  31%|███▏      | 6261/20000 [13:46<30:29,  7.51it/s]

Samples 125200 Batch_idx 6259 GradUpdates 6260 Loss 4.59793


Steps:  31%|███▏      | 6281/20000 [13:48<30:36,  7.47it/s]

Samples 125600 Batch_idx 6279 GradUpdates 6280 Loss 4.58735


Steps:  32%|███▏      | 6301/20000 [13:51<30:25,  7.51it/s]

Samples 126000 Batch_idx 6299 GradUpdates 6300 Loss 4.48609


Steps:  32%|███▏      | 6321/20000 [13:53<30:22,  7.51it/s]

Samples 126400 Batch_idx 6319 GradUpdates 6320 Loss 4.61972


Steps:  32%|███▏      | 6341/20000 [13:56<30:16,  7.52it/s]

Samples 126800 Batch_idx 6339 GradUpdates 6340 Loss 4.62171


Steps:  32%|███▏      | 6361/20000 [13:58<30:16,  7.51it/s]

Samples 127200 Batch_idx 6359 GradUpdates 6360 Loss 4.57910


Steps:  32%|███▏      | 6381/20000 [14:01<30:11,  7.52it/s]

Samples 127600 Batch_idx 6379 GradUpdates 6380 Loss 4.52701


Steps:  32%|███▏      | 6401/20000 [14:04<30:10,  7.51it/s]

Samples 128000 Batch_idx 6399 GradUpdates 6400 Loss 4.50866


Steps:  32%|███▏      | 6421/20000 [14:06<30:06,  7.52it/s]

Samples 128400 Batch_idx 6419 GradUpdates 6420 Loss 4.48512


Steps:  32%|███▏      | 6441/20000 [14:09<30:02,  7.52it/s]

Samples 128800 Batch_idx 6439 GradUpdates 6440 Loss 4.53577


Steps:  32%|███▏      | 6461/20000 [14:11<30:02,  7.51it/s]

Samples 129200 Batch_idx 6459 GradUpdates 6460 Loss 4.56276


Steps:  32%|███▏      | 6481/20000 [14:14<29:57,  7.52it/s]

Samples 129600 Batch_idx 6479 GradUpdates 6480 Loss 4.52693


Steps:  33%|███▎      | 6501/20000 [14:16<29:57,  7.51it/s]

Samples 130000 Batch_idx 6499 GradUpdates 6500 Loss 4.52286


Steps:  33%|███▎      | 6521/20000 [14:19<30:09,  7.45it/s]

Samples 130400 Batch_idx 6519 GradUpdates 6520 Loss 4.45092


Steps:  33%|███▎      | 6541/20000 [14:21<29:51,  7.51it/s]

Samples 130800 Batch_idx 6539 GradUpdates 6540 Loss 4.43617


Steps:  33%|███▎      | 6561/20000 [14:24<29:49,  7.51it/s]

Samples 131200 Batch_idx 6559 GradUpdates 6560 Loss 4.40365


Steps:  33%|███▎      | 6581/20000 [14:26<29:46,  7.51it/s]

Samples 131600 Batch_idx 6579 GradUpdates 6580 Loss 4.45245


Steps:  33%|███▎      | 6601/20000 [14:29<29:47,  7.50it/s]

Samples 132000 Batch_idx 6599 GradUpdates 6600 Loss 4.33772


Steps:  33%|███▎      | 6621/20000 [14:32<29:40,  7.51it/s]

Samples 132400 Batch_idx 6619 GradUpdates 6620 Loss 4.52535


Steps:  33%|███▎      | 6641/20000 [14:34<29:38,  7.51it/s]

Samples 132800 Batch_idx 6639 GradUpdates 6640 Loss 4.38188


Steps:  33%|███▎      | 6661/20000 [14:37<29:55,  7.43it/s]

Samples 133200 Batch_idx 6659 GradUpdates 6660 Loss 4.41559


Steps:  33%|███▎      | 6681/20000 [14:39<29:32,  7.52it/s]

Samples 133600 Batch_idx 6679 GradUpdates 6680 Loss 4.40550


Steps:  34%|███▎      | 6701/20000 [14:42<29:32,  7.50it/s]

Samples 134000 Batch_idx 6699 GradUpdates 6700 Loss 4.36753


Steps:  34%|███▎      | 6721/20000 [14:44<29:35,  7.48it/s]

Samples 134400 Batch_idx 6719 GradUpdates 6720 Loss 4.36788


Steps:  34%|███▎      | 6741/20000 [14:47<29:25,  7.51it/s]

Samples 134800 Batch_idx 6739 GradUpdates 6740 Loss 4.29805


Steps:  34%|███▍      | 6761/20000 [14:49<29:38,  7.44it/s]

Samples 135200 Batch_idx 6759 GradUpdates 6760 Loss 4.34330


Steps:  34%|███▍      | 6781/20000 [14:52<29:15,  7.53it/s]

Samples 135600 Batch_idx 6779 GradUpdates 6780 Loss 4.34509


Steps:  34%|███▍      | 6801/20000 [14:55<29:18,  7.50it/s]

Samples 136000 Batch_idx 6799 GradUpdates 6800 Loss 4.33744


Steps:  34%|███▍      | 6821/20000 [14:57<29:14,  7.51it/s]

Samples 136400 Batch_idx 6819 GradUpdates 6820 Loss 4.29296


Steps:  34%|███▍      | 6841/20000 [15:00<29:10,  7.52it/s]

Samples 136800 Batch_idx 6839 GradUpdates 6840 Loss 4.31254


Steps:  34%|███▍      | 6861/20000 [15:02<29:06,  7.52it/s]

Samples 137200 Batch_idx 6859 GradUpdates 6860 Loss 4.26062


Steps:  34%|███▍      | 6881/20000 [15:05<29:06,  7.51it/s]

Samples 137600 Batch_idx 6879 GradUpdates 6880 Loss 4.26121


Steps:  35%|███▍      | 6901/20000 [15:07<29:02,  7.52it/s]

Samples 138000 Batch_idx 6899 GradUpdates 6900 Loss 4.21182


Steps:  35%|███▍      | 6921/20000 [15:10<29:02,  7.50it/s]

Samples 138400 Batch_idx 6919 GradUpdates 6920 Loss 4.25148


Steps:  35%|███▍      | 6941/20000 [15:12<28:56,  7.52it/s]

Samples 138800 Batch_idx 6939 GradUpdates 6940 Loss 4.23670


Steps:  35%|███▍      | 6961/20000 [15:15<28:55,  7.51it/s]

Samples 139200 Batch_idx 6959 GradUpdates 6960 Loss 4.31141


Steps:  35%|███▍      | 6981/20000 [15:17<28:51,  7.52it/s]

Samples 139600 Batch_idx 6979 GradUpdates 6980 Loss 4.23422


Steps:  35%|███▌      | 7001/20000 [15:20<29:12,  7.42it/s]

Samples 140000 Batch_idx 6999 GradUpdates 7000 Loss 4.30625


Steps:  35%|███▌      | 7021/20000 [15:23<28:48,  7.51it/s]

Samples 140400 Batch_idx 7019 GradUpdates 7020 Loss 4.18929


Steps:  35%|███▌      | 7041/20000 [15:25<28:44,  7.52it/s]

Samples 140800 Batch_idx 7039 GradUpdates 7040 Loss 4.20965


Steps:  35%|███▌      | 7061/20000 [15:28<28:41,  7.51it/s]

Samples 141200 Batch_idx 7059 GradUpdates 7060 Loss 4.14697


Steps:  35%|███▌      | 7081/20000 [15:30<28:38,  7.52it/s]

Samples 141600 Batch_idx 7079 GradUpdates 7080 Loss 4.25016


Steps:  36%|███▌      | 7101/20000 [15:33<28:37,  7.51it/s]

Samples 142000 Batch_idx 7099 GradUpdates 7100 Loss 4.18419


Steps:  36%|███▌      | 7121/20000 [15:35<28:33,  7.52it/s]

Samples 142400 Batch_idx 7119 GradUpdates 7120 Loss 4.21803


Steps:  36%|███▌      | 7141/20000 [15:38<28:34,  7.50it/s]

Samples 142800 Batch_idx 7139 GradUpdates 7140 Loss 4.19804


Steps:  36%|███▌      | 7161/20000 [15:40<28:29,  7.51it/s]

Samples 143200 Batch_idx 7159 GradUpdates 7160 Loss 4.11539


Steps:  36%|███▌      | 7181/20000 [15:43<28:25,  7.51it/s]

Samples 143600 Batch_idx 7179 GradUpdates 7180 Loss 4.14150


Steps:  36%|███▌      | 7201/20000 [15:45<28:27,  7.50it/s]

Samples 144000 Batch_idx 7199 GradUpdates 7200 Loss 4.09438


Steps:  36%|███▌      | 7221/20000 [15:48<28:22,  7.51it/s]

Samples 144400 Batch_idx 7219 GradUpdates 7220 Loss 4.15904


Steps:  36%|███▌      | 7241/20000 [15:51<28:35,  7.44it/s]

Samples 144800 Batch_idx 7239 GradUpdates 7240 Loss 4.24063


Steps:  36%|███▋      | 7261/20000 [15:53<28:34,  7.43it/s]

Samples 145200 Batch_idx 7259 GradUpdates 7260 Loss 4.20230


Steps:  36%|███▋      | 7281/20000 [15:56<28:18,  7.49it/s]

Samples 145600 Batch_idx 7279 GradUpdates 7280 Loss 4.20099


Steps:  37%|███▋      | 7301/20000 [15:58<28:13,  7.50it/s]

Samples 146000 Batch_idx 7299 GradUpdates 7300 Loss 4.04466


Steps:  37%|███▋      | 7321/20000 [16:01<28:07,  7.51it/s]

Samples 146400 Batch_idx 7319 GradUpdates 7320 Loss 4.07507


Steps:  37%|███▋      | 7341/20000 [16:03<28:06,  7.51it/s]

Samples 146800 Batch_idx 7339 GradUpdates 7340 Loss 4.03556


Steps:  37%|███▋      | 7361/20000 [16:06<28:02,  7.51it/s]

Samples 147200 Batch_idx 7359 GradUpdates 7360 Loss 4.19226


Steps:  37%|███▋      | 7381/20000 [16:08<28:01,  7.50it/s]

Samples 147600 Batch_idx 7379 GradUpdates 7380 Loss 4.08107


Steps:  37%|███▋      | 7401/20000 [16:11<27:54,  7.52it/s]

Samples 148000 Batch_idx 7399 GradUpdates 7400 Loss 4.05707


Steps:  37%|███▋      | 7421/20000 [16:13<27:50,  7.53it/s]

Samples 148400 Batch_idx 7419 GradUpdates 7420 Loss 4.07653


Steps:  37%|███▋      | 7441/20000 [16:16<27:50,  7.52it/s]

Samples 148800 Batch_idx 7439 GradUpdates 7440 Loss 4.04694


Steps:  37%|███▋      | 7461/20000 [16:19<27:46,  7.52it/s]

Samples 149200 Batch_idx 7459 GradUpdates 7460 Loss 4.06812


Steps:  37%|███▋      | 7481/20000 [16:21<27:44,  7.52it/s]

Samples 149600 Batch_idx 7479 GradUpdates 7480 Loss 4.03545


Steps:  38%|███▊      | 7501/20000 [16:24<27:57,  7.45it/s]

Samples 150000 Batch_idx 7499 GradUpdates 7500 Loss 4.12392


Steps:  38%|███▊      | 7521/20000 [16:26<27:40,  7.52it/s]

Samples 150400 Batch_idx 7519 GradUpdates 7520 Loss 4.07291


Steps:  38%|███▊      | 7541/20000 [16:29<27:34,  7.53it/s]

Samples 150800 Batch_idx 7539 GradUpdates 7540 Loss 3.97317


Steps:  38%|███▊      | 7561/20000 [16:31<27:36,  7.51it/s]

Samples 151200 Batch_idx 7559 GradUpdates 7560 Loss 3.96789


Steps:  38%|███▊      | 7581/20000 [16:34<27:32,  7.52it/s]

Samples 151600 Batch_idx 7579 GradUpdates 7580 Loss 4.03722


Steps:  38%|███▊      | 7601/20000 [16:36<27:42,  7.46it/s]

Samples 152000 Batch_idx 7599 GradUpdates 7600 Loss 3.96136


Steps:  38%|███▊      | 7621/20000 [16:39<27:37,  7.47it/s]

Samples 152400 Batch_idx 7619 GradUpdates 7620 Loss 3.99121


Steps:  38%|███▊      | 7641/20000 [16:42<27:26,  7.51it/s]

Samples 152800 Batch_idx 7639 GradUpdates 7640 Loss 3.94543


Steps:  38%|███▊      | 7661/20000 [16:44<27:27,  7.49it/s]

Samples 153200 Batch_idx 7659 GradUpdates 7660 Loss 3.96240


Steps:  38%|███▊      | 7681/20000 [16:47<27:19,  7.51it/s]

Samples 153600 Batch_idx 7679 GradUpdates 7680 Loss 4.01875


Steps:  39%|███▊      | 7701/20000 [16:49<27:18,  7.51it/s]

Samples 154000 Batch_idx 7699 GradUpdates 7700 Loss 3.95680


Steps:  39%|███▊      | 7721/20000 [16:52<27:13,  7.52it/s]

Samples 154400 Batch_idx 7719 GradUpdates 7720 Loss 3.89715


Steps:  39%|███▊      | 7741/20000 [16:54<27:14,  7.50it/s]

Samples 154800 Batch_idx 7739 GradUpdates 7740 Loss 3.94726


Steps:  39%|███▉      | 7761/20000 [16:57<27:17,  7.47it/s]

Samples 155200 Batch_idx 7759 GradUpdates 7760 Loss 3.91429


Steps:  39%|███▉      | 7781/20000 [16:59<26:59,  7.54it/s]

Samples 155600 Batch_idx 7779 GradUpdates 7780 Loss 3.94014


Steps:  39%|███▉      | 7801/20000 [17:02<27:02,  7.52it/s]

Samples 156000 Batch_idx 7799 GradUpdates 7800 Loss 3.95795


Steps:  39%|███▉      | 7821/20000 [17:05<27:02,  7.51it/s]

Samples 156400 Batch_idx 7819 GradUpdates 7820 Loss 3.85292


Steps:  39%|███▉      | 7841/20000 [17:07<26:57,  7.52it/s]

Samples 156800 Batch_idx 7839 GradUpdates 7840 Loss 3.87846


Steps:  39%|███▉      | 7861/20000 [17:10<26:56,  7.51it/s]

Samples 157200 Batch_idx 7859 GradUpdates 7860 Loss 3.93694


Steps:  39%|███▉      | 7881/20000 [17:12<26:51,  7.52it/s]

Samples 157600 Batch_idx 7879 GradUpdates 7880 Loss 3.90470


Steps:  40%|███▉      | 7901/20000 [17:15<26:49,  7.52it/s]

Samples 158000 Batch_idx 7899 GradUpdates 7900 Loss 3.84889


Steps:  40%|███▉      | 7921/20000 [17:17<26:46,  7.52it/s]

Samples 158400 Batch_idx 7919 GradUpdates 7920 Loss 3.90450


Steps:  40%|███▉      | 7941/20000 [17:20<26:44,  7.52it/s]

Samples 158800 Batch_idx 7939 GradUpdates 7940 Loss 3.85987


Steps:  40%|███▉      | 7961/20000 [17:22<26:33,  7.56it/s]

Samples 159200 Batch_idx 7959 GradUpdates 7960 Loss 3.89910


Steps:  40%|███▉      | 7981/20000 [17:25<26:35,  7.54it/s]

Samples 159600 Batch_idx 7979 GradUpdates 7980 Loss 3.85032


Steps:  40%|████      | 8000/20000 [17:27<27:13,  7.34it/s]

Samples 160000 Batch_idx 7999 GradUpdates 8000 Loss 3.89116
Samples 160020 Batch_idx 8000 GradUpdates 8001 Loss 3.87746


Eval Steps: 100%|██████████| 25/25 [00:04<00:00,  5.82it/s]
Steps:  40%|████      | 8021/20000 [17:37<26:54,  7.42it/s]  

Samples 160400 Batch_idx 8019 GradUpdates 8020 Loss 3.83426


Steps:  40%|████      | 8041/20000 [17:39<26:35,  7.50it/s]

Samples 160800 Batch_idx 8039 GradUpdates 8040 Loss 3.82718


Steps:  40%|████      | 8061/20000 [17:42<26:45,  7.44it/s]

Samples 161200 Batch_idx 8059 GradUpdates 8060 Loss 3.81424


Steps:  40%|████      | 8081/20000 [17:44<26:31,  7.49it/s]

Samples 161600 Batch_idx 8079 GradUpdates 8080 Loss 3.79248


Steps:  41%|████      | 8101/20000 [17:47<26:25,  7.51it/s]

Samples 162000 Batch_idx 8099 GradUpdates 8100 Loss 3.81727


Steps:  41%|████      | 8121/20000 [17:49<26:25,  7.49it/s]

Samples 162400 Batch_idx 8119 GradUpdates 8120 Loss 3.78821


Steps:  41%|████      | 8141/20000 [17:52<26:19,  7.51it/s]

Samples 162800 Batch_idx 8139 GradUpdates 8140 Loss 3.89051


Steps:  41%|████      | 8161/20000 [17:55<26:21,  7.49it/s]

Samples 163200 Batch_idx 8159 GradUpdates 8160 Loss 3.82460


Steps:  41%|████      | 8181/20000 [17:57<26:24,  7.46it/s]

Samples 163600 Batch_idx 8179 GradUpdates 8180 Loss 3.85016


Steps:  41%|████      | 8201/20000 [18:00<26:18,  7.47it/s]

Samples 164000 Batch_idx 8199 GradUpdates 8200 Loss 3.84796


Steps:  41%|████      | 8221/20000 [18:02<26:16,  7.47it/s]

Samples 164400 Batch_idx 8219 GradUpdates 8220 Loss 3.82655


Steps:  41%|████      | 8240/20000 [18:05<26:49,  7.31it/s]

Samples 164800 Batch_idx 8239 GradUpdates 8240 Loss 3.82382


Steps:  41%|████▏     | 8261/20000 [18:08<25:57,  7.54it/s]

Samples 165200 Batch_idx 8259 GradUpdates 8260 Loss 3.78396


Steps:  41%|████▏     | 8281/20000 [18:10<25:54,  7.54it/s]

Samples 165600 Batch_idx 8279 GradUpdates 8280 Loss 3.76365


Steps:  42%|████▏     | 8301/20000 [18:13<25:55,  7.52it/s]

Samples 166000 Batch_idx 8299 GradUpdates 8300 Loss 3.74389


Steps:  42%|████▏     | 8321/20000 [18:15<26:39,  7.30it/s]

Samples 166400 Batch_idx 8319 GradUpdates 8320 Loss 3.76042


Steps:  42%|████▏     | 8341/20000 [18:18<25:52,  7.51it/s]

Samples 166800 Batch_idx 8339 GradUpdates 8340 Loss 3.77327


Steps:  42%|████▏     | 8361/20000 [18:21<26:02,  7.45it/s]

Samples 167200 Batch_idx 8359 GradUpdates 8360 Loss 3.74721


Steps:  42%|████▏     | 8381/20000 [18:23<25:47,  7.51it/s]

Samples 167600 Batch_idx 8379 GradUpdates 8380 Loss 3.73428


Steps:  42%|████▏     | 8401/20000 [18:26<25:46,  7.50it/s]

Samples 168000 Batch_idx 8399 GradUpdates 8400 Loss 3.82031


Steps:  42%|████▏     | 8421/20000 [18:28<25:41,  7.51it/s]

Samples 168400 Batch_idx 8419 GradUpdates 8420 Loss 3.70980


Steps:  42%|████▏     | 8441/20000 [18:31<25:37,  7.52it/s]

Samples 168800 Batch_idx 8439 GradUpdates 8440 Loss 3.67107


Steps:  42%|████▏     | 8461/20000 [18:33<25:35,  7.52it/s]

Samples 169200 Batch_idx 8459 GradUpdates 8460 Loss 3.62186


Steps:  42%|████▏     | 8481/20000 [18:36<25:33,  7.51it/s]

Samples 169600 Batch_idx 8479 GradUpdates 8480 Loss 3.70634


Steps:  43%|████▎     | 8501/20000 [18:38<25:31,  7.51it/s]

Samples 170000 Batch_idx 8499 GradUpdates 8500 Loss 3.75017


Steps:  43%|████▎     | 8521/20000 [18:41<25:26,  7.52it/s]

Samples 170400 Batch_idx 8519 GradUpdates 8520 Loss 3.71492


Steps:  43%|████▎     | 8541/20000 [18:43<25:25,  7.51it/s]

Samples 170800 Batch_idx 8539 GradUpdates 8540 Loss 3.72482


Steps:  43%|████▎     | 8561/20000 [18:46<25:23,  7.51it/s]

Samples 171200 Batch_idx 8559 GradUpdates 8560 Loss 3.67586


Steps:  43%|████▎     | 8581/20000 [18:49<25:22,  7.50it/s]

Samples 171600 Batch_idx 8579 GradUpdates 8580 Loss 3.66427


Steps:  43%|████▎     | 8601/20000 [18:51<25:33,  7.43it/s]

Samples 172000 Batch_idx 8599 GradUpdates 8600 Loss 3.65766


Steps:  43%|████▎     | 8621/20000 [18:54<25:15,  7.51it/s]

Samples 172400 Batch_idx 8619 GradUpdates 8620 Loss 3.62349


Steps:  43%|████▎     | 8641/20000 [18:56<25:12,  7.51it/s]

Samples 172800 Batch_idx 8639 GradUpdates 8640 Loss 3.62943


Steps:  43%|████▎     | 8661/20000 [18:59<25:10,  7.50it/s]

Samples 173200 Batch_idx 8659 GradUpdates 8660 Loss 3.59407


Steps:  43%|████▎     | 8681/20000 [19:01<25:07,  7.51it/s]

Samples 173600 Batch_idx 8679 GradUpdates 8680 Loss 3.69770


Steps:  44%|████▎     | 8701/20000 [19:04<25:05,  7.50it/s]

Samples 174000 Batch_idx 8699 GradUpdates 8700 Loss 3.63499


Steps:  44%|████▎     | 8721/20000 [19:06<25:01,  7.51it/s]

Samples 174400 Batch_idx 8719 GradUpdates 8720 Loss 3.62171


Steps:  44%|████▎     | 8741/20000 [19:09<24:57,  7.52it/s]

Samples 174800 Batch_idx 8739 GradUpdates 8740 Loss 3.67771


Steps:  44%|████▍     | 8761/20000 [19:11<25:03,  7.47it/s]

Samples 175200 Batch_idx 8759 GradUpdates 8760 Loss 3.62192


Steps:  44%|████▍     | 8781/20000 [19:14<24:52,  7.52it/s]

Samples 175600 Batch_idx 8779 GradUpdates 8780 Loss 3.59575


Steps:  44%|████▍     | 8801/20000 [19:17<24:53,  7.50it/s]

Samples 176000 Batch_idx 8799 GradUpdates 8800 Loss 3.64543


Steps:  44%|████▍     | 8821/20000 [19:19<24:48,  7.51it/s]

Samples 176400 Batch_idx 8819 GradUpdates 8820 Loss 3.54952


Steps:  44%|████▍     | 8841/20000 [19:22<24:45,  7.51it/s]

Samples 176800 Batch_idx 8839 GradUpdates 8840 Loss 3.50991


Steps:  44%|████▍     | 8861/20000 [19:24<24:57,  7.44it/s]

Samples 177200 Batch_idx 8859 GradUpdates 8860 Loss 3.60369


Steps:  44%|████▍     | 8881/20000 [19:27<24:39,  7.51it/s]

Samples 177600 Batch_idx 8879 GradUpdates 8880 Loss 3.58786


Steps:  45%|████▍     | 8901/20000 [19:29<24:42,  7.49it/s]

Samples 178000 Batch_idx 8899 GradUpdates 8900 Loss 3.58072


Steps:  45%|████▍     | 8921/20000 [19:32<24:34,  7.51it/s]

Samples 178400 Batch_idx 8919 GradUpdates 8920 Loss 3.59074


Steps:  45%|████▍     | 8941/20000 [19:34<24:34,  7.50it/s]

Samples 178800 Batch_idx 8939 GradUpdates 8940 Loss 3.63531


Steps:  45%|████▍     | 8961/20000 [19:37<24:28,  7.52it/s]

Samples 179200 Batch_idx 8959 GradUpdates 8960 Loss 3.61390


Steps:  45%|████▍     | 8981/20000 [19:40<24:26,  7.51it/s]

Samples 179600 Batch_idx 8979 GradUpdates 8980 Loss 3.53475


Steps:  45%|████▌     | 9001/20000 [19:42<24:25,  7.51it/s]

Samples 180000 Batch_idx 8999 GradUpdates 9000 Loss 3.52041


Steps:  45%|████▌     | 9021/20000 [19:45<24:19,  7.52it/s]

Samples 180400 Batch_idx 9019 GradUpdates 9020 Loss 3.58825


Steps:  45%|████▌     | 9041/20000 [19:47<24:22,  7.49it/s]

Samples 180800 Batch_idx 9039 GradUpdates 9040 Loss 3.58802


Steps:  45%|████▌     | 9061/20000 [19:50<24:14,  7.52it/s]

Samples 181200 Batch_idx 9059 GradUpdates 9060 Loss 3.50646


Steps:  45%|████▌     | 9081/20000 [19:52<24:08,  7.54it/s]

Samples 181600 Batch_idx 9079 GradUpdates 9080 Loss 3.53938


Steps:  46%|████▌     | 9101/20000 [19:55<24:26,  7.43it/s]

Samples 182000 Batch_idx 9099 GradUpdates 9100 Loss 3.56821


Steps:  46%|████▌     | 9121/20000 [19:57<24:07,  7.52it/s]

Samples 182400 Batch_idx 9119 GradUpdates 9120 Loss 3.52154


Steps:  46%|████▌     | 9141/20000 [20:00<24:04,  7.52it/s]

Samples 182800 Batch_idx 9139 GradUpdates 9140 Loss 3.48750


Steps:  46%|████▌     | 9161/20000 [20:02<24:02,  7.51it/s]

Samples 183200 Batch_idx 9159 GradUpdates 9160 Loss 3.50086


Steps:  46%|████▌     | 9181/20000 [20:05<23:57,  7.52it/s]

Samples 183600 Batch_idx 9179 GradUpdates 9180 Loss 3.48191


Steps:  46%|████▌     | 9201/20000 [20:08<24:00,  7.50it/s]

Samples 184000 Batch_idx 9199 GradUpdates 9200 Loss 3.49600


Steps:  46%|████▌     | 9221/20000 [20:10<23:54,  7.51it/s]

Samples 184400 Batch_idx 9219 GradUpdates 9220 Loss 3.48396


Steps:  46%|████▌     | 9241/20000 [20:13<23:51,  7.51it/s]

Samples 184800 Batch_idx 9239 GradUpdates 9240 Loss 3.47563


Steps:  46%|████▋     | 9261/20000 [20:15<23:50,  7.51it/s]

Samples 185200 Batch_idx 9259 GradUpdates 9260 Loss 3.52250


Steps:  46%|████▋     | 9281/20000 [20:18<23:49,  7.50it/s]

Samples 185600 Batch_idx 9279 GradUpdates 9280 Loss 3.47705


Steps:  47%|████▋     | 9301/20000 [20:20<23:45,  7.50it/s]

Samples 186000 Batch_idx 9299 GradUpdates 9300 Loss 3.49292


Steps:  47%|████▋     | 9321/20000 [20:23<23:43,  7.50it/s]

Samples 186400 Batch_idx 9319 GradUpdates 9320 Loss 3.50061


Steps:  47%|████▋     | 9341/20000 [20:25<23:41,  7.50it/s]

Samples 186800 Batch_idx 9339 GradUpdates 9340 Loss 3.44077


Steps:  47%|████▋     | 9361/20000 [20:28<23:51,  7.43it/s]

Samples 187200 Batch_idx 9359 GradUpdates 9360 Loss 3.48278


Steps:  47%|████▋     | 9381/20000 [20:30<23:36,  7.50it/s]

Samples 187600 Batch_idx 9379 GradUpdates 9380 Loss 3.47161


Steps:  47%|████▋     | 9401/20000 [20:33<23:32,  7.50it/s]

Samples 188000 Batch_idx 9399 GradUpdates 9400 Loss 3.40982


Steps:  47%|████▋     | 9421/20000 [20:36<23:31,  7.50it/s]

Samples 188400 Batch_idx 9419 GradUpdates 9420 Loss 3.54491


Steps:  47%|████▋     | 9441/20000 [20:38<23:31,  7.48it/s]

Samples 188800 Batch_idx 9439 GradUpdates 9440 Loss 3.58346


Steps:  47%|████▋     | 9461/20000 [20:41<23:19,  7.53it/s]

Samples 189200 Batch_idx 9459 GradUpdates 9460 Loss 3.39757


Steps:  47%|████▋     | 9481/20000 [20:43<23:20,  7.51it/s]

Samples 189600 Batch_idx 9479 GradUpdates 9480 Loss 3.39686


Steps:  48%|████▊     | 9501/20000 [20:46<23:23,  7.48it/s]

Samples 190000 Batch_idx 9499 GradUpdates 9500 Loss 3.49474


Steps:  48%|████▊     | 9521/20000 [20:48<23:16,  7.51it/s]

Samples 190400 Batch_idx 9519 GradUpdates 9520 Loss 3.38937


Steps:  48%|████▊     | 9541/20000 [20:51<23:12,  7.51it/s]

Samples 190800 Batch_idx 9539 GradUpdates 9540 Loss 3.34113


Steps:  48%|████▊     | 9561/20000 [20:53<23:11,  7.50it/s]

Samples 191200 Batch_idx 9559 GradUpdates 9560 Loss 3.47526


Steps:  48%|████▊     | 9581/20000 [20:56<23:06,  7.52it/s]

Samples 191600 Batch_idx 9579 GradUpdates 9580 Loss 3.44729


Steps:  48%|████▊     | 9601/20000 [20:59<23:17,  7.44it/s]

Samples 192000 Batch_idx 9599 GradUpdates 9600 Loss 3.46552


Steps:  48%|████▊     | 9621/20000 [21:01<22:57,  7.54it/s]

Samples 192400 Batch_idx 9619 GradUpdates 9620 Loss 3.44879


Steps:  48%|████▊     | 9641/20000 [21:04<22:57,  7.52it/s]

Samples 192800 Batch_idx 9639 GradUpdates 9640 Loss 3.46614


Steps:  48%|████▊     | 9661/20000 [21:06<22:52,  7.53it/s]

Samples 193200 Batch_idx 9659 GradUpdates 9660 Loss 3.36753


Steps:  48%|████▊     | 9681/20000 [21:09<22:51,  7.52it/s]

Samples 193600 Batch_idx 9679 GradUpdates 9680 Loss 3.44571


Steps:  49%|████▊     | 9701/20000 [21:11<22:48,  7.52it/s]

Samples 194000 Batch_idx 9699 GradUpdates 9700 Loss 3.35818


Steps:  49%|████▊     | 9721/20000 [21:14<22:49,  7.51it/s]

Samples 194400 Batch_idx 9719 GradUpdates 9720 Loss 3.39339


Steps:  49%|████▊     | 9741/20000 [21:16<22:47,  7.50it/s]

Samples 194800 Batch_idx 9739 GradUpdates 9740 Loss 3.36610


Steps:  49%|████▉     | 9761/20000 [21:19<22:45,  7.50it/s]

Samples 195200 Batch_idx 9759 GradUpdates 9760 Loss 3.43205


Steps:  49%|████▉     | 9781/20000 [21:21<22:43,  7.50it/s]

Samples 195600 Batch_idx 9779 GradUpdates 9780 Loss 3.41360


Steps:  49%|████▉     | 9801/20000 [21:24<22:41,  7.49it/s]

Samples 196000 Batch_idx 9799 GradUpdates 9800 Loss 3.37262


Steps:  49%|████▉     | 9821/20000 [21:27<22:40,  7.48it/s]

Samples 196400 Batch_idx 9819 GradUpdates 9820 Loss 3.38913


Steps:  49%|████▉     | 9841/20000 [21:29<22:34,  7.50it/s]

Samples 196800 Batch_idx 9839 GradUpdates 9840 Loss 3.34326


Steps:  49%|████▉     | 9861/20000 [21:32<22:43,  7.43it/s]

Samples 197200 Batch_idx 9859 GradUpdates 9860 Loss 3.38998


Steps:  49%|████▉     | 9881/20000 [21:34<22:21,  7.54it/s]

Samples 197600 Batch_idx 9879 GradUpdates 9880 Loss 3.36815


Steps:  50%|████▉     | 9901/20000 [21:37<22:19,  7.54it/s]

Samples 198000 Batch_idx 9899 GradUpdates 9900 Loss 3.37209


Steps:  50%|████▉     | 9921/20000 [21:39<22:22,  7.51it/s]

Samples 198400 Batch_idx 9919 GradUpdates 9920 Loss 3.35835


Steps:  50%|████▉     | 9941/20000 [21:42<22:19,  7.51it/s]

Samples 198800 Batch_idx 9939 GradUpdates 9940 Loss 3.30909


Steps:  50%|████▉     | 9961/20000 [21:44<22:18,  7.50it/s]

Samples 199200 Batch_idx 9959 GradUpdates 9960 Loss 3.35349


Steps:  50%|████▉     | 9981/20000 [21:47<22:13,  7.51it/s]

Samples 199600 Batch_idx 9979 GradUpdates 9980 Loss 3.40675


Steps:  50%|█████     | 10000/20000 [21:50<22:56,  7.27it/s]

Samples 200000 Batch_idx 9999 GradUpdates 10000 Loss 3.31246
Samples 200020 Batch_idx 10000 GradUpdates 10001 Loss 3.26105


Eval Steps: 100%|██████████| 25/25 [00:04<00:00,  5.66it/s]
Steps:  50%|█████     | 10021/20000 [21:59<22:26,  7.41it/s]  

Samples 200400 Batch_idx 10019 GradUpdates 10020 Loss 3.31910


Steps:  50%|█████     | 10041/20000 [22:01<22:26,  7.39it/s]

Samples 200800 Batch_idx 10039 GradUpdates 10040 Loss 3.29331


Steps:  50%|█████     | 10061/20000 [22:04<22:03,  7.51it/s]

Samples 201200 Batch_idx 10059 GradUpdates 10060 Loss 3.28000


Steps:  50%|█████     | 10081/20000 [22:06<22:00,  7.51it/s]

Samples 201600 Batch_idx 10079 GradUpdates 10080 Loss 3.34043


Steps:  51%|█████     | 10101/20000 [22:09<21:58,  7.51it/s]

Samples 202000 Batch_idx 10099 GradUpdates 10100 Loss 3.36671


Steps:  51%|█████     | 10121/20000 [22:12<21:56,  7.50it/s]

Samples 202400 Batch_idx 10119 GradUpdates 10120 Loss 3.27379


Steps:  51%|█████     | 10141/20000 [22:14<21:51,  7.52it/s]

Samples 202800 Batch_idx 10139 GradUpdates 10140 Loss 3.28831


Steps:  51%|█████     | 10161/20000 [22:17<21:54,  7.48it/s]

Samples 203200 Batch_idx 10159 GradUpdates 10160 Loss 3.31464


Steps:  51%|█████     | 10181/20000 [22:19<22:04,  7.41it/s]

Samples 203600 Batch_idx 10179 GradUpdates 10180 Loss 3.28280


Steps:  51%|█████     | 10201/20000 [22:22<21:51,  7.47it/s]

Samples 204000 Batch_idx 10199 GradUpdates 10200 Loss 3.26124


Steps:  51%|█████     | 10221/20000 [22:24<21:44,  7.50it/s]

Samples 204400 Batch_idx 10219 GradUpdates 10220 Loss 3.32686


Steps:  51%|█████     | 10241/20000 [22:27<21:41,  7.50it/s]

Samples 204800 Batch_idx 10239 GradUpdates 10240 Loss 3.29557


Steps:  51%|█████▏    | 10261/20000 [22:29<21:55,  7.40it/s]

Samples 205200 Batch_idx 10259 GradUpdates 10260 Loss 3.27564


Steps:  51%|█████▏    | 10281/20000 [22:32<21:36,  7.49it/s]

Samples 205600 Batch_idx 10279 GradUpdates 10280 Loss 3.34306


Steps:  52%|█████▏    | 10301/20000 [22:35<22:02,  7.34it/s]

Samples 206000 Batch_idx 10299 GradUpdates 10300 Loss 3.36568


Steps:  52%|█████▏    | 10321/20000 [22:38<21:29,  7.51it/s]

Samples 206400 Batch_idx 10319 GradUpdates 10320 Loss 3.28441


Steps:  52%|█████▏    | 10341/20000 [22:40<21:26,  7.51it/s]

Samples 206800 Batch_idx 10339 GradUpdates 10340 Loss 3.30729


Steps:  52%|█████▏    | 10361/20000 [22:43<21:27,  7.49it/s]

Samples 207200 Batch_idx 10359 GradUpdates 10360 Loss 3.24875


Steps:  52%|█████▏    | 10381/20000 [22:45<21:24,  7.49it/s]

Samples 207600 Batch_idx 10379 GradUpdates 10380 Loss 3.24871


Steps:  52%|█████▏    | 10401/20000 [22:48<21:25,  7.47it/s]

Samples 208000 Batch_idx 10399 GradUpdates 10400 Loss 3.25941


Steps:  52%|█████▏    | 10421/20000 [22:51<21:18,  7.50it/s]

Samples 208400 Batch_idx 10419 GradUpdates 10420 Loss 3.27286


Steps:  52%|█████▏    | 10441/20000 [22:53<21:14,  7.50it/s]

Samples 208800 Batch_idx 10439 GradUpdates 10440 Loss 3.30695


Steps:  52%|█████▏    | 10461/20000 [22:56<21:10,  7.51it/s]

Samples 209200 Batch_idx 10459 GradUpdates 10460 Loss 3.23185


Steps:  52%|█████▏    | 10481/20000 [22:58<21:07,  7.51it/s]

Samples 209600 Batch_idx 10479 GradUpdates 10480 Loss 3.27866


Steps:  53%|█████▎    | 10501/20000 [23:01<21:04,  7.51it/s]

Samples 210000 Batch_idx 10499 GradUpdates 10500 Loss 3.24260


Steps:  53%|█████▎    | 10521/20000 [23:03<21:06,  7.49it/s]

Samples 210400 Batch_idx 10519 GradUpdates 10520 Loss 3.27306


Steps:  53%|█████▎    | 10541/20000 [23:06<21:01,  7.50it/s]

Samples 210800 Batch_idx 10539 GradUpdates 10540 Loss 3.27924


Steps:  53%|█████▎    | 10561/20000 [23:08<20:55,  7.52it/s]

Samples 211200 Batch_idx 10559 GradUpdates 10560 Loss 3.27536


Steps:  53%|█████▎    | 10581/20000 [23:11<20:50,  7.53it/s]

Samples 211600 Batch_idx 10579 GradUpdates 10580 Loss 3.22903


Steps:  53%|█████▎    | 10601/20000 [23:14<20:47,  7.53it/s]

Samples 212000 Batch_idx 10599 GradUpdates 10600 Loss 3.24014


Steps:  53%|█████▎    | 10621/20000 [23:16<20:49,  7.51it/s]

Samples 212400 Batch_idx 10619 GradUpdates 10620 Loss 3.22631


Steps:  53%|█████▎    | 10641/20000 [23:19<20:50,  7.48it/s]

Samples 212800 Batch_idx 10639 GradUpdates 10640 Loss 3.20526


Steps:  53%|█████▎    | 10661/20000 [23:21<20:56,  7.43it/s]

Samples 213200 Batch_idx 10659 GradUpdates 10660 Loss 3.23422


Steps:  53%|█████▎    | 10681/20000 [23:24<20:41,  7.51it/s]

Samples 213600 Batch_idx 10679 GradUpdates 10680 Loss 3.19055


Steps:  54%|█████▎    | 10701/20000 [23:26<20:40,  7.50it/s]

Samples 214000 Batch_idx 10699 GradUpdates 10700 Loss 3.24216


Steps:  54%|█████▎    | 10721/20000 [23:29<20:36,  7.50it/s]

Samples 214400 Batch_idx 10719 GradUpdates 10720 Loss 3.23233


Steps:  54%|█████▎    | 10741/20000 [23:31<20:35,  7.50it/s]

Samples 214800 Batch_idx 10739 GradUpdates 10740 Loss 3.27537


Steps:  54%|█████▍    | 10761/20000 [23:34<20:30,  7.51it/s]

Samples 215200 Batch_idx 10759 GradUpdates 10760 Loss 3.22742


Steps:  54%|█████▍    | 10781/20000 [23:37<20:27,  7.51it/s]

Samples 215600 Batch_idx 10779 GradUpdates 10780 Loss 3.20992


Steps:  54%|█████▍    | 10801/20000 [23:39<20:24,  7.51it/s]

Samples 216000 Batch_idx 10799 GradUpdates 10800 Loss 3.25363


Steps:  54%|█████▍    | 10821/20000 [23:42<20:21,  7.52it/s]

Samples 216400 Batch_idx 10819 GradUpdates 10820 Loss 3.22011


Steps:  54%|█████▍    | 10841/20000 [23:44<20:20,  7.50it/s]

Samples 216800 Batch_idx 10839 GradUpdates 10840 Loss 3.17603


Steps:  54%|█████▍    | 10861/20000 [23:47<20:16,  7.51it/s]

Samples 217200 Batch_idx 10859 GradUpdates 10860 Loss 3.16874


Steps:  54%|█████▍    | 10881/20000 [23:49<20:14,  7.51it/s]

Samples 217600 Batch_idx 10879 GradUpdates 10880 Loss 3.24151


Steps:  55%|█████▍    | 10901/20000 [23:52<20:22,  7.44it/s]

Samples 218000 Batch_idx 10899 GradUpdates 10900 Loss 3.16968


Steps:  55%|█████▍    | 10921/20000 [23:54<20:08,  7.51it/s]

Samples 218400 Batch_idx 10919 GradUpdates 10920 Loss 3.14725


Steps:  55%|█████▍    | 10941/20000 [23:57<20:08,  7.49it/s]

Samples 218800 Batch_idx 10939 GradUpdates 10940 Loss 3.19123


Steps:  55%|█████▍    | 10961/20000 [23:59<20:03,  7.51it/s]

Samples 219200 Batch_idx 10959 GradUpdates 10960 Loss 3.20111


Steps:  55%|█████▍    | 10981/20000 [24:02<20:02,  7.50it/s]

Samples 219600 Batch_idx 10979 GradUpdates 10980 Loss 3.16334


Steps:  55%|█████▌    | 11001/20000 [24:05<19:58,  7.51it/s]

Samples 220000 Batch_idx 10999 GradUpdates 11000 Loss 3.16185


Steps:  55%|█████▌    | 11021/20000 [24:07<19:55,  7.51it/s]

Samples 220400 Batch_idx 11019 GradUpdates 11020 Loss 3.21007


Steps:  55%|█████▌    | 11041/20000 [24:10<19:49,  7.53it/s]

Samples 220800 Batch_idx 11039 GradUpdates 11040 Loss 3.24108


Steps:  55%|█████▌    | 11061/20000 [24:12<19:45,  7.54it/s]

Samples 221200 Batch_idx 11059 GradUpdates 11060 Loss 3.21728


Steps:  55%|█████▌    | 11081/20000 [24:15<19:48,  7.50it/s]

Samples 221600 Batch_idx 11079 GradUpdates 11080 Loss 3.11741


Steps:  56%|█████▌    | 11101/20000 [24:17<19:45,  7.51it/s]

Samples 222000 Batch_idx 11099 GradUpdates 11100 Loss 3.13402


Steps:  56%|█████▌    | 11121/20000 [24:20<19:49,  7.47it/s]

Samples 222400 Batch_idx 11119 GradUpdates 11120 Loss 3.22915


Steps:  56%|█████▌    | 11141/20000 [24:22<19:40,  7.50it/s]

Samples 222800 Batch_idx 11139 GradUpdates 11140 Loss 3.11439


Steps:  56%|█████▌    | 11161/20000 [24:25<19:51,  7.42it/s]

Samples 223200 Batch_idx 11159 GradUpdates 11160 Loss 3.14484


Steps:  56%|█████▌    | 11181/20000 [24:28<19:36,  7.50it/s]

Samples 223600 Batch_idx 11179 GradUpdates 11180 Loss 3.13981


Steps:  56%|█████▌    | 11201/20000 [24:30<19:37,  7.48it/s]

Samples 224000 Batch_idx 11199 GradUpdates 11200 Loss 3.15595


Steps:  56%|█████▌    | 11221/20000 [24:33<19:29,  7.51it/s]

Samples 224400 Batch_idx 11219 GradUpdates 11220 Loss 3.19253


Steps:  56%|█████▌    | 11241/20000 [24:35<19:27,  7.50it/s]

Samples 224800 Batch_idx 11239 GradUpdates 11240 Loss 3.20000


Steps:  56%|█████▋    | 11261/20000 [24:38<19:23,  7.51it/s]

Samples 225200 Batch_idx 11259 GradUpdates 11260 Loss 3.17251


Steps:  56%|█████▋    | 11281/20000 [24:40<19:21,  7.51it/s]

Samples 225600 Batch_idx 11279 GradUpdates 11280 Loss 3.12739


Steps:  57%|█████▋    | 11301/20000 [24:43<19:19,  7.50it/s]

Samples 226000 Batch_idx 11299 GradUpdates 11300 Loss 3.16975


Steps:  57%|█████▋    | 11321/20000 [24:45<19:34,  7.39it/s]

Samples 226400 Batch_idx 11319 GradUpdates 11320 Loss 3.15727


Steps:  57%|█████▋    | 11341/20000 [24:48<19:13,  7.50it/s]

Samples 226800 Batch_idx 11339 GradUpdates 11340 Loss 3.11445


Steps:  57%|█████▋    | 11361/20000 [24:50<19:11,  7.50it/s]

Samples 227200 Batch_idx 11359 GradUpdates 11360 Loss 3.14077


Steps:  57%|█████▋    | 11381/20000 [24:53<19:15,  7.46it/s]

Samples 227600 Batch_idx 11379 GradUpdates 11380 Loss 3.13248


Steps:  57%|█████▋    | 11401/20000 [24:56<19:17,  7.43it/s]

Samples 228000 Batch_idx 11399 GradUpdates 11400 Loss 3.09238


Steps:  57%|█████▋    | 11421/20000 [24:58<19:04,  7.49it/s]

Samples 228400 Batch_idx 11419 GradUpdates 11420 Loss 3.11619


Steps:  57%|█████▋    | 11441/20000 [25:01<19:04,  7.48it/s]

Samples 228800 Batch_idx 11439 GradUpdates 11440 Loss 3.18346


Steps:  57%|█████▋    | 11461/20000 [25:03<18:57,  7.50it/s]

Samples 229200 Batch_idx 11459 GradUpdates 11460 Loss 3.12300


Steps:  57%|█████▋    | 11481/20000 [25:06<18:55,  7.50it/s]

Samples 229600 Batch_idx 11479 GradUpdates 11480 Loss 3.13840


Steps:  58%|█████▊    | 11501/20000 [25:08<18:53,  7.50it/s]

Samples 230000 Batch_idx 11499 GradUpdates 11500 Loss 3.13994


Steps:  58%|█████▊    | 11521/20000 [25:12<21:11,  6.67it/s]  

Samples 230400 Batch_idx 11519 GradUpdates 11520 Loss 3.09741


Steps:  58%|█████▊    | 11541/20000 [25:15<20:29,  6.88it/s]

Samples 230800 Batch_idx 11539 GradUpdates 11540 Loss 3.08524


Steps:  58%|█████▊    | 11561/20000 [25:17<18:42,  7.52it/s]

Samples 231200 Batch_idx 11559 GradUpdates 11560 Loss 3.09524


Steps:  58%|█████▊    | 11581/20000 [25:20<18:43,  7.50it/s]

Samples 231600 Batch_idx 11579 GradUpdates 11580 Loss 3.09938


Steps:  58%|█████▊    | 11601/20000 [25:23<18:34,  7.54it/s]

Samples 232000 Batch_idx 11599 GradUpdates 11600 Loss 3.12414


Steps:  58%|█████▊    | 11621/20000 [25:25<18:35,  7.51it/s]

Samples 232400 Batch_idx 11619 GradUpdates 11620 Loss 3.10900


Steps:  58%|█████▊    | 11641/20000 [25:28<18:30,  7.53it/s]

Samples 232800 Batch_idx 11639 GradUpdates 11640 Loss 3.12609


Steps:  58%|█████▊    | 11661/20000 [25:30<18:29,  7.52it/s]

Samples 233200 Batch_idx 11659 GradUpdates 11660 Loss 3.07474


Steps:  58%|█████▊    | 11681/20000 [25:33<18:28,  7.51it/s]

Samples 233600 Batch_idx 11679 GradUpdates 11680 Loss 3.09425


Steps:  59%|█████▊    | 11701/20000 [25:35<18:23,  7.52it/s]

Samples 234000 Batch_idx 11699 GradUpdates 11700 Loss 3.11184


Steps:  59%|█████▊    | 11721/20000 [25:38<18:20,  7.52it/s]

Samples 234400 Batch_idx 11719 GradUpdates 11720 Loss 3.09177


Steps:  59%|█████▊    | 11741/20000 [25:40<18:18,  7.52it/s]

Samples 234800 Batch_idx 11739 GradUpdates 11740 Loss 3.12500


Steps:  59%|█████▉    | 11761/20000 [25:43<18:16,  7.51it/s]

Samples 235200 Batch_idx 11759 GradUpdates 11760 Loss 3.12119


Steps:  59%|█████▉    | 11781/20000 [25:45<18:16,  7.50it/s]

Samples 235600 Batch_idx 11779 GradUpdates 11780 Loss 3.10865


Steps:  59%|█████▉    | 11801/20000 [25:48<18:10,  7.52it/s]

Samples 236000 Batch_idx 11799 GradUpdates 11800 Loss 3.13445


Steps:  59%|█████▉    | 11821/20000 [25:51<18:21,  7.42it/s]

Samples 236400 Batch_idx 11819 GradUpdates 11820 Loss 3.06053


Steps:  59%|█████▉    | 11841/20000 [25:53<18:06,  7.51it/s]

Samples 236800 Batch_idx 11839 GradUpdates 11840 Loss 3.01088


Steps:  59%|█████▉    | 11861/20000 [25:56<18:04,  7.51it/s]

Samples 237200 Batch_idx 11859 GradUpdates 11860 Loss 3.11653


Steps:  59%|█████▉    | 11881/20000 [25:58<17:59,  7.52it/s]

Samples 237600 Batch_idx 11879 GradUpdates 11880 Loss 3.08927


Steps:  60%|█████▉    | 11901/20000 [26:01<17:54,  7.54it/s]

Samples 238000 Batch_idx 11899 GradUpdates 11900 Loss 3.06000


Steps:  60%|█████▉    | 11921/20000 [26:03<18:02,  7.46it/s]

Samples 238400 Batch_idx 11919 GradUpdates 11920 Loss 3.00689


Steps:  60%|█████▉    | 11941/20000 [26:06<17:54,  7.50it/s]

Samples 238800 Batch_idx 11939 GradUpdates 11940 Loss 3.08952


Steps:  60%|█████▉    | 11961/20000 [26:08<17:50,  7.51it/s]

Samples 239200 Batch_idx 11959 GradUpdates 11960 Loss 3.08906


Steps:  60%|█████▉    | 11981/20000 [26:11<17:46,  7.52it/s]

Samples 239600 Batch_idx 11979 GradUpdates 11980 Loss 3.04738


Steps:  60%|██████    | 12000/20000 [26:13<18:11,  7.33it/s]

Samples 240000 Batch_idx 11999 GradUpdates 12000 Loss 3.07945
Samples 240020 Batch_idx 12000 GradUpdates 12001 Loss 2.99164


Eval Steps: 100%|██████████| 25/25 [00:04<00:00,  6.05it/s]
Steps:  60%|██████    | 12021/20000 [26:22<17:53,  7.43it/s]  

Samples 240400 Batch_idx 12019 GradUpdates 12020 Loss 3.01804


Steps:  60%|██████    | 12041/20000 [26:25<17:40,  7.51it/s]

Samples 240800 Batch_idx 12039 GradUpdates 12040 Loss 3.04408


Steps:  60%|██████    | 12061/20000 [26:27<17:37,  7.50it/s]

Samples 241200 Batch_idx 12059 GradUpdates 12060 Loss 3.10174


Steps:  60%|██████    | 12081/20000 [26:30<17:33,  7.51it/s]

Samples 241600 Batch_idx 12079 GradUpdates 12080 Loss 3.07842


Steps:  61%|██████    | 12101/20000 [26:32<17:31,  7.51it/s]

Samples 242000 Batch_idx 12099 GradUpdates 12100 Loss 2.99727


Steps:  61%|██████    | 12121/20000 [26:35<17:28,  7.52it/s]

Samples 242400 Batch_idx 12119 GradUpdates 12120 Loss 3.07604


Steps:  61%|██████    | 12141/20000 [26:37<17:25,  7.52it/s]

Samples 242800 Batch_idx 12139 GradUpdates 12140 Loss 2.99397


Steps:  61%|██████    | 12161/20000 [26:40<17:23,  7.51it/s]

Samples 243200 Batch_idx 12159 GradUpdates 12160 Loss 3.10711


Steps:  61%|██████    | 12181/20000 [26:42<17:20,  7.51it/s]

Samples 243600 Batch_idx 12179 GradUpdates 12180 Loss 3.01028


Steps:  61%|██████    | 12201/20000 [26:45<17:18,  7.51it/s]

Samples 244000 Batch_idx 12199 GradUpdates 12200 Loss 3.00696


Steps:  61%|██████    | 12221/20000 [26:48<17:08,  7.57it/s]

Samples 244400 Batch_idx 12219 GradUpdates 12220 Loss 3.06397


Steps:  61%|██████    | 12241/20000 [26:50<17:10,  7.53it/s]

Samples 244800 Batch_idx 12239 GradUpdates 12240 Loss 3.00436


Steps:  61%|██████▏   | 12261/20000 [26:53<17:18,  7.45it/s]

Samples 245200 Batch_idx 12259 GradUpdates 12260 Loss 3.02252


Steps:  61%|██████▏   | 12281/20000 [26:55<17:04,  7.53it/s]

Samples 245600 Batch_idx 12279 GradUpdates 12280 Loss 3.02402


Steps:  62%|██████▏   | 12301/20000 [26:58<17:01,  7.54it/s]

Samples 246000 Batch_idx 12299 GradUpdates 12300 Loss 3.00739


Steps:  62%|██████▏   | 12321/20000 [27:00<16:59,  7.53it/s]

Samples 246400 Batch_idx 12319 GradUpdates 12320 Loss 3.00912


Steps:  62%|██████▏   | 12341/20000 [27:03<21:30,  5.94it/s]

Samples 246800 Batch_idx 12339 GradUpdates 12340 Loss 3.05936


Steps:  62%|██████▏   | 12361/20000 [27:06<16:55,  7.52it/s]

Samples 247200 Batch_idx 12359 GradUpdates 12360 Loss 3.05917


Steps:  62%|██████▏   | 12381/20000 [27:08<16:51,  7.53it/s]

Samples 247600 Batch_idx 12379 GradUpdates 12380 Loss 3.04785


Steps:  62%|██████▏   | 12401/20000 [27:11<16:51,  7.51it/s]

Samples 248000 Batch_idx 12399 GradUpdates 12400 Loss 3.04109


Steps:  62%|██████▏   | 12421/20000 [27:13<16:46,  7.53it/s]

Samples 248400 Batch_idx 12419 GradUpdates 12420 Loss 2.98271


Steps:  62%|██████▏   | 12441/20000 [27:16<16:46,  7.51it/s]

Samples 248800 Batch_idx 12439 GradUpdates 12440 Loss 3.03868


Steps:  62%|██████▏   | 12461/20000 [27:19<16:53,  7.44it/s]

Samples 249200 Batch_idx 12459 GradUpdates 12460 Loss 3.05651


Steps:  62%|██████▏   | 12481/20000 [27:21<16:40,  7.51it/s]

Samples 249600 Batch_idx 12479 GradUpdates 12480 Loss 2.99332


Steps:  63%|██████▎   | 12501/20000 [27:24<16:39,  7.50it/s]

Samples 250000 Batch_idx 12499 GradUpdates 12500 Loss 3.05672


Steps:  63%|██████▎   | 12521/20000 [27:26<16:36,  7.51it/s]

Samples 250400 Batch_idx 12519 GradUpdates 12520 Loss 2.99683


Steps:  63%|██████▎   | 12541/20000 [27:29<16:33,  7.51it/s]

Samples 250800 Batch_idx 12539 GradUpdates 12540 Loss 3.02809


Steps:  63%|██████▎   | 12561/20000 [27:31<16:30,  7.51it/s]

Samples 251200 Batch_idx 12559 GradUpdates 12560 Loss 3.02673


Steps:  63%|██████▎   | 12581/20000 [27:34<16:28,  7.51it/s]

Samples 251600 Batch_idx 12579 GradUpdates 12580 Loss 2.95879


Steps:  63%|██████▎   | 12601/20000 [27:36<16:23,  7.52it/s]

Samples 252000 Batch_idx 12599 GradUpdates 12600 Loss 3.01208


Steps:  63%|██████▎   | 12621/20000 [27:39<16:23,  7.50it/s]

Samples 252400 Batch_idx 12619 GradUpdates 12620 Loss 3.00593


Steps:  63%|██████▎   | 12641/20000 [27:41<16:20,  7.51it/s]

Samples 252800 Batch_idx 12639 GradUpdates 12640 Loss 2.94131


Steps:  63%|██████▎   | 12661/20000 [27:44<16:17,  7.51it/s]

Samples 253200 Batch_idx 12659 GradUpdates 12660 Loss 2.97680


Steps:  63%|██████▎   | 12681/20000 [27:47<16:15,  7.50it/s]

Samples 253600 Batch_idx 12679 GradUpdates 12680 Loss 3.03047


Steps:  64%|██████▎   | 12701/20000 [27:49<16:13,  7.50it/s]

Samples 254000 Batch_idx 12699 GradUpdates 12700 Loss 3.03326


Steps:  64%|██████▎   | 12721/20000 [27:52<16:09,  7.51it/s]

Samples 254400 Batch_idx 12719 GradUpdates 12720 Loss 2.97880


Steps:  64%|██████▎   | 12741/20000 [27:54<16:17,  7.43it/s]

Samples 254800 Batch_idx 12739 GradUpdates 12740 Loss 2.95263


Steps:  64%|██████▍   | 12761/20000 [27:57<16:04,  7.50it/s]

Samples 255200 Batch_idx 12759 GradUpdates 12760 Loss 3.03187


Steps:  64%|██████▍   | 12781/20000 [27:59<16:01,  7.51it/s]

Samples 255600 Batch_idx 12779 GradUpdates 12780 Loss 2.94080


Steps:  64%|██████▍   | 12801/20000 [28:02<15:59,  7.51it/s]

Samples 256000 Batch_idx 12799 GradUpdates 12800 Loss 3.05372


Steps:  64%|██████▍   | 12821/20000 [28:04<15:56,  7.50it/s]

Samples 256400 Batch_idx 12819 GradUpdates 12820 Loss 2.99629


Steps:  64%|██████▍   | 12841/20000 [28:07<15:54,  7.50it/s]

Samples 256800 Batch_idx 12839 GradUpdates 12840 Loss 2.95576


Steps:  64%|██████▍   | 12861/20000 [28:10<15:52,  7.49it/s]

Samples 257200 Batch_idx 12859 GradUpdates 12860 Loss 2.97434


Steps:  64%|██████▍   | 12881/20000 [28:12<15:48,  7.50it/s]

Samples 257600 Batch_idx 12879 GradUpdates 12880 Loss 3.00654


Steps:  65%|██████▍   | 12901/20000 [28:15<15:46,  7.50it/s]

Samples 258000 Batch_idx 12899 GradUpdates 12900 Loss 2.92443


Steps:  65%|██████▍   | 12921/20000 [28:17<15:43,  7.51it/s]

Samples 258400 Batch_idx 12919 GradUpdates 12920 Loss 2.95995


Steps:  65%|██████▍   | 12941/20000 [28:20<15:41,  7.50it/s]

Samples 258800 Batch_idx 12939 GradUpdates 12940 Loss 3.03413


Steps:  65%|██████▍   | 12961/20000 [28:22<15:38,  7.50it/s]

Samples 259200 Batch_idx 12959 GradUpdates 12960 Loss 2.92777


Steps:  65%|██████▍   | 12981/20000 [28:25<15:42,  7.45it/s]

Samples 259600 Batch_idx 12979 GradUpdates 12980 Loss 2.97312


Steps:  65%|██████▌   | 13001/20000 [28:27<15:32,  7.50it/s]

Samples 260000 Batch_idx 12999 GradUpdates 13000 Loss 2.97497


Steps:  65%|██████▌   | 13021/20000 [28:30<15:40,  7.42it/s]

Samples 260400 Batch_idx 13019 GradUpdates 13020 Loss 2.97471


Steps:  65%|██████▌   | 13041/20000 [28:32<15:27,  7.50it/s]

Samples 260800 Batch_idx 13039 GradUpdates 13040 Loss 2.98175


Steps:  65%|██████▌   | 13061/20000 [28:35<15:25,  7.50it/s]

Samples 261200 Batch_idx 13059 GradUpdates 13060 Loss 2.98881


Steps:  65%|██████▌   | 13081/20000 [28:38<15:21,  7.51it/s]

Samples 261600 Batch_idx 13079 GradUpdates 13080 Loss 2.91323


Steps:  66%|██████▌   | 13101/20000 [28:40<15:20,  7.49it/s]

Samples 262000 Batch_idx 13099 GradUpdates 13100 Loss 2.89018


Steps:  66%|██████▌   | 13121/20000 [28:43<15:17,  7.50it/s]

Samples 262400 Batch_idx 13119 GradUpdates 13120 Loss 2.99053


Steps:  66%|██████▌   | 13141/20000 [28:45<15:14,  7.50it/s]

Samples 262800 Batch_idx 13139 GradUpdates 13140 Loss 2.98891


Steps:  66%|██████▌   | 13161/20000 [28:48<15:11,  7.50it/s]

Samples 263200 Batch_idx 13159 GradUpdates 13160 Loss 3.02906


Steps:  66%|██████▌   | 13181/20000 [28:50<15:08,  7.51it/s]

Samples 263600 Batch_idx 13179 GradUpdates 13180 Loss 2.98291


Steps:  66%|██████▌   | 13201/20000 [28:53<15:06,  7.50it/s]

Samples 264000 Batch_idx 13199 GradUpdates 13200 Loss 2.95179


Steps:  66%|██████▌   | 13221/20000 [28:55<15:03,  7.50it/s]

Samples 264400 Batch_idx 13219 GradUpdates 13220 Loss 2.91390


Steps:  66%|██████▌   | 13241/20000 [28:58<15:01,  7.50it/s]

Samples 264800 Batch_idx 13239 GradUpdates 13240 Loss 2.96117


Steps:  66%|██████▋   | 13261/20000 [29:01<14:58,  7.50it/s]

Samples 265200 Batch_idx 13259 GradUpdates 13260 Loss 2.88722


Steps:  66%|██████▋   | 13281/20000 [29:03<14:55,  7.50it/s]

Samples 265600 Batch_idx 13279 GradUpdates 13280 Loss 2.99781


Steps:  67%|██████▋   | 13301/20000 [29:06<15:02,  7.43it/s]

Samples 266000 Batch_idx 13299 GradUpdates 13300 Loss 2.98242


Steps:  67%|██████▋   | 13321/20000 [29:08<14:50,  7.50it/s]

Samples 266400 Batch_idx 13319 GradUpdates 13320 Loss 2.94538


Steps:  67%|██████▋   | 13341/20000 [29:11<14:59,  7.41it/s]

Samples 266800 Batch_idx 13339 GradUpdates 13340 Loss 2.97274


Steps:  67%|██████▋   | 13361/20000 [29:13<14:44,  7.50it/s]

Samples 267200 Batch_idx 13359 GradUpdates 13360 Loss 2.94642


Steps:  67%|██████▋   | 13381/20000 [29:16<14:40,  7.52it/s]

Samples 267600 Batch_idx 13379 GradUpdates 13380 Loss 2.99473


Steps:  67%|██████▋   | 13401/20000 [29:18<14:38,  7.51it/s]

Samples 268000 Batch_idx 13399 GradUpdates 13400 Loss 2.94364


Steps:  67%|██████▋   | 13421/20000 [29:21<14:35,  7.51it/s]

Samples 268400 Batch_idx 13419 GradUpdates 13420 Loss 2.95561


Steps:  67%|██████▋   | 13441/20000 [29:23<14:40,  7.45it/s]

Samples 268800 Batch_idx 13439 GradUpdates 13440 Loss 2.95846


Steps:  67%|██████▋   | 13461/20000 [29:26<14:31,  7.51it/s]

Samples 269200 Batch_idx 13459 GradUpdates 13460 Loss 2.90941


Steps:  67%|██████▋   | 13481/20000 [29:29<14:30,  7.49it/s]

Samples 269600 Batch_idx 13479 GradUpdates 13480 Loss 2.93551


Steps:  68%|██████▊   | 13501/20000 [29:31<14:26,  7.50it/s]

Samples 270000 Batch_idx 13499 GradUpdates 13500 Loss 2.89467


Steps:  68%|██████▊   | 13521/20000 [29:34<14:22,  7.51it/s]

Samples 270400 Batch_idx 13519 GradUpdates 13520 Loss 2.94427


Steps:  68%|██████▊   | 13541/20000 [29:36<14:20,  7.50it/s]

Samples 270800 Batch_idx 13539 GradUpdates 13540 Loss 2.94234


Steps:  68%|██████▊   | 13561/20000 [29:39<14:26,  7.43it/s]

Samples 271200 Batch_idx 13559 GradUpdates 13560 Loss 2.92255


Steps:  68%|██████▊   | 13581/20000 [29:41<14:14,  7.51it/s]

Samples 271600 Batch_idx 13579 GradUpdates 13580 Loss 2.91951


Steps:  68%|██████▊   | 13601/20000 [29:44<14:12,  7.51it/s]

Samples 272000 Batch_idx 13599 GradUpdates 13600 Loss 2.90784


Steps:  68%|██████▊   | 13621/20000 [29:46<14:12,  7.48it/s]

Samples 272400 Batch_idx 13619 GradUpdates 13620 Loss 2.95962


Steps:  68%|██████▊   | 13641/20000 [29:49<14:07,  7.50it/s]

Samples 272800 Batch_idx 13639 GradUpdates 13640 Loss 2.89936


Steps:  68%|██████▊   | 13661/20000 [29:52<14:04,  7.51it/s]

Samples 273200 Batch_idx 13659 GradUpdates 13660 Loss 2.96336


Steps:  68%|██████▊   | 13681/20000 [29:54<14:01,  7.51it/s]

Samples 273600 Batch_idx 13679 GradUpdates 13680 Loss 2.95620


Steps:  69%|██████▊   | 13701/20000 [29:57<14:00,  7.49it/s]

Samples 274000 Batch_idx 13699 GradUpdates 13700 Loss 2.96770


Steps:  69%|██████▊   | 13721/20000 [29:59<13:57,  7.50it/s]

Samples 274400 Batch_idx 13719 GradUpdates 13720 Loss 2.90683


Steps:  69%|██████▊   | 13741/20000 [30:02<14:02,  7.43it/s]

Samples 274800 Batch_idx 13739 GradUpdates 13740 Loss 2.96482


Steps:  69%|██████▉   | 13761/20000 [30:04<13:51,  7.50it/s]

Samples 275200 Batch_idx 13759 GradUpdates 13760 Loss 2.99700


Steps:  69%|██████▉   | 13781/20000 [30:07<13:48,  7.50it/s]

Samples 275600 Batch_idx 13779 GradUpdates 13780 Loss 2.82861


Steps:  69%|██████▉   | 13801/20000 [30:09<13:46,  7.50it/s]

Samples 276000 Batch_idx 13799 GradUpdates 13800 Loss 2.98811


Steps:  69%|██████▉   | 13821/20000 [30:12<13:43,  7.50it/s]

Samples 276400 Batch_idx 13819 GradUpdates 13820 Loss 2.93811


Steps:  69%|██████▉   | 13841/20000 [30:14<13:50,  7.41it/s]

Samples 276800 Batch_idx 13839 GradUpdates 13840 Loss 2.93764


Steps:  69%|██████▉   | 13861/20000 [30:17<13:38,  7.50it/s]

Samples 277200 Batch_idx 13859 GradUpdates 13860 Loss 2.93448


Steps:  69%|██████▉   | 13881/20000 [30:20<13:35,  7.50it/s]

Samples 277600 Batch_idx 13879 GradUpdates 13880 Loss 2.90389


Steps:  70%|██████▉   | 13901/20000 [30:22<13:33,  7.50it/s]

Samples 278000 Batch_idx 13899 GradUpdates 13900 Loss 2.94544


Steps:  70%|██████▉   | 13921/20000 [30:25<13:30,  7.50it/s]

Samples 278400 Batch_idx 13919 GradUpdates 13920 Loss 2.86319


Steps:  70%|██████▉   | 13941/20000 [30:27<13:27,  7.50it/s]

Samples 278800 Batch_idx 13939 GradUpdates 13940 Loss 2.90613


Steps:  70%|██████▉   | 13961/20000 [30:30<13:27,  7.48it/s]

Samples 279200 Batch_idx 13959 GradUpdates 13960 Loss 2.96254


Steps:  70%|██████▉   | 13981/20000 [30:32<13:22,  7.50it/s]

Samples 279600 Batch_idx 13979 GradUpdates 13980 Loss 2.93780


Steps:  70%|███████   | 14000/20000 [30:35<13:40,  7.31it/s]

Samples 280000 Batch_idx 13999 GradUpdates 14000 Loss 2.89010
Samples 280020 Batch_idx 14000 GradUpdates 14001 Loss 2.87365


Eval Steps: 100%|██████████| 25/25 [00:04<00:00,  5.81it/s]
Steps:  70%|███████   | 14021/20000 [30:44<13:29,  7.39it/s]  

Samples 280400 Batch_idx 14019 GradUpdates 14020 Loss 2.93704


Steps:  70%|███████   | 14041/20000 [30:47<13:14,  7.50it/s]

Samples 280800 Batch_idx 14039 GradUpdates 14040 Loss 2.88345


Steps:  70%|███████   | 14061/20000 [30:49<13:11,  7.50it/s]

Samples 281200 Batch_idx 14059 GradUpdates 14060 Loss 2.94884


Steps:  70%|███████   | 14081/20000 [30:52<13:08,  7.50it/s]

Samples 281600 Batch_idx 14079 GradUpdates 14080 Loss 2.90273


Steps:  71%|███████   | 14101/20000 [30:54<13:08,  7.49it/s]

Samples 282000 Batch_idx 14099 GradUpdates 14100 Loss 2.90224


Steps:  71%|███████   | 14121/20000 [30:57<13:03,  7.50it/s]

Samples 282400 Batch_idx 14119 GradUpdates 14120 Loss 2.99580


Steps:  71%|███████   | 14141/20000 [30:59<13:00,  7.50it/s]

Samples 282800 Batch_idx 14139 GradUpdates 14140 Loss 2.91072


Steps:  71%|███████   | 14161/20000 [31:02<12:58,  7.50it/s]

Samples 283200 Batch_idx 14159 GradUpdates 14160 Loss 2.87821


Steps:  71%|███████   | 14181/20000 [31:04<12:56,  7.50it/s]

Samples 283600 Batch_idx 14179 GradUpdates 14180 Loss 2.87345


Steps:  71%|███████   | 14201/20000 [31:07<12:53,  7.50it/s]

Samples 284000 Batch_idx 14199 GradUpdates 14200 Loss 2.92510


Steps:  71%|███████   | 14221/20000 [31:10<12:51,  7.49it/s]

Samples 284400 Batch_idx 14219 GradUpdates 14220 Loss 2.88576


Steps:  71%|███████   | 14241/20000 [31:12<12:46,  7.51it/s]

Samples 284800 Batch_idx 14239 GradUpdates 14240 Loss 2.87833


Steps:  71%|███████▏  | 14261/20000 [31:15<12:58,  7.37it/s]

Samples 285200 Batch_idx 14259 GradUpdates 14260 Loss 2.87556


Steps:  71%|███████▏  | 14281/20000 [31:17<12:45,  7.47it/s]

Samples 285600 Batch_idx 14279 GradUpdates 14280 Loss 2.90444


Steps:  72%|███████▏  | 14301/20000 [31:20<12:39,  7.50it/s]

Samples 286000 Batch_idx 14299 GradUpdates 14300 Loss 2.91835


Steps:  72%|███████▏  | 14321/20000 [31:22<12:38,  7.49it/s]

Samples 286400 Batch_idx 14319 GradUpdates 14320 Loss 2.89576


Steps:  72%|███████▏  | 14341/20000 [31:25<12:33,  7.51it/s]

Samples 286800 Batch_idx 14339 GradUpdates 14340 Loss 2.89999


Steps:  72%|███████▏  | 14361/20000 [31:27<12:30,  7.51it/s]

Samples 287200 Batch_idx 14359 GradUpdates 14360 Loss 2.92618


Steps:  72%|███████▏  | 14381/20000 [31:30<12:31,  7.48it/s]

Samples 287600 Batch_idx 14379 GradUpdates 14380 Loss 2.90030


Steps:  72%|███████▏  | 14401/20000 [31:33<12:29,  7.47it/s]

Samples 288000 Batch_idx 14399 GradUpdates 14400 Loss 2.90665


Steps:  72%|███████▏  | 14421/20000 [31:36<12:23,  7.51it/s]

Samples 288400 Batch_idx 14419 GradUpdates 14420 Loss 2.88078


Steps:  72%|███████▏  | 14441/20000 [31:38<12:20,  7.51it/s]

Samples 288800 Batch_idx 14439 GradUpdates 14440 Loss 2.87761


Steps:  72%|███████▏  | 14461/20000 [31:41<12:18,  7.50it/s]

Samples 289200 Batch_idx 14459 GradUpdates 14460 Loss 2.87406


Steps:  72%|███████▏  | 14481/20000 [31:43<12:15,  7.51it/s]

Samples 289600 Batch_idx 14479 GradUpdates 14480 Loss 2.85924


Steps:  73%|███████▎  | 14501/20000 [31:46<12:14,  7.49it/s]

Samples 290000 Batch_idx 14499 GradUpdates 14500 Loss 2.83175


Steps:  73%|███████▎  | 14521/20000 [31:48<12:17,  7.43it/s]

Samples 290400 Batch_idx 14519 GradUpdates 14520 Loss 2.84369


Steps:  73%|███████▎  | 14541/20000 [31:51<12:08,  7.50it/s]

Samples 290800 Batch_idx 14539 GradUpdates 14540 Loss 2.89647


Steps:  73%|███████▎  | 14561/20000 [31:53<12:06,  7.49it/s]

Samples 291200 Batch_idx 14559 GradUpdates 14560 Loss 2.90069


Steps:  73%|███████▎  | 14581/20000 [31:56<12:00,  7.52it/s]

Samples 291600 Batch_idx 14579 GradUpdates 14580 Loss 2.82972


Steps:  73%|███████▎  | 14601/20000 [31:59<12:00,  7.50it/s]

Samples 292000 Batch_idx 14599 GradUpdates 14600 Loss 2.90786


Steps:  73%|███████▎  | 14621/20000 [32:01<11:57,  7.49it/s]

Samples 292400 Batch_idx 14619 GradUpdates 14620 Loss 2.85065


Steps:  73%|███████▎  | 14641/20000 [32:04<11:54,  7.50it/s]

Samples 292800 Batch_idx 14639 GradUpdates 14640 Loss 2.89441


Steps:  73%|███████▎  | 14661/20000 [32:06<11:52,  7.50it/s]

Samples 293200 Batch_idx 14659 GradUpdates 14660 Loss 2.88677


Steps:  73%|███████▎  | 14681/20000 [32:09<11:49,  7.50it/s]

Samples 293600 Batch_idx 14679 GradUpdates 14680 Loss 2.85265


Steps:  74%|███████▎  | 14701/20000 [32:11<11:47,  7.49it/s]

Samples 294000 Batch_idx 14699 GradUpdates 14700 Loss 2.89351


Steps:  74%|███████▎  | 14721/20000 [32:14<11:44,  7.49it/s]

Samples 294400 Batch_idx 14719 GradUpdates 14720 Loss 2.87240


Steps:  74%|███████▎  | 14741/20000 [32:16<11:43,  7.47it/s]

Samples 294800 Batch_idx 14739 GradUpdates 14740 Loss 2.93564


Steps:  74%|███████▍  | 14761/20000 [32:19<11:39,  7.49it/s]

Samples 295200 Batch_idx 14759 GradUpdates 14760 Loss 2.90401


Steps:  74%|███████▍  | 14781/20000 [32:22<11:35,  7.51it/s]

Samples 295600 Batch_idx 14779 GradUpdates 14780 Loss 2.77810


Steps:  74%|███████▍  | 14801/20000 [32:24<11:40,  7.42it/s]

Samples 296000 Batch_idx 14799 GradUpdates 14800 Loss 2.91190


Steps:  74%|███████▍  | 14821/20000 [32:27<11:31,  7.49it/s]

Samples 296400 Batch_idx 14819 GradUpdates 14820 Loss 2.89862


Steps:  74%|███████▍  | 14841/20000 [32:29<11:28,  7.49it/s]

Samples 296800 Batch_idx 14839 GradUpdates 14840 Loss 2.91420


Steps:  74%|███████▍  | 14861/20000 [32:32<11:25,  7.50it/s]

Samples 297200 Batch_idx 14859 GradUpdates 14860 Loss 2.87570


Steps:  74%|███████▍  | 14881/20000 [32:34<11:23,  7.49it/s]

Samples 297600 Batch_idx 14879 GradUpdates 14880 Loss 2.96012


Steps:  75%|███████▍  | 14901/20000 [32:37<11:20,  7.50it/s]

Samples 298000 Batch_idx 14899 GradUpdates 14900 Loss 2.85967


Steps:  75%|███████▍  | 14921/20000 [32:39<11:19,  7.48it/s]

Samples 298400 Batch_idx 14919 GradUpdates 14920 Loss 2.87552


Steps:  75%|███████▍  | 14941/20000 [32:42<11:15,  7.49it/s]

Samples 298800 Batch_idx 14939 GradUpdates 14940 Loss 2.90204


Steps:  75%|███████▍  | 14961/20000 [32:45<11:10,  7.52it/s]

Samples 299200 Batch_idx 14959 GradUpdates 14960 Loss 2.87883


Steps:  75%|███████▍  | 14981/20000 [32:47<11:10,  7.49it/s]

Samples 299600 Batch_idx 14979 GradUpdates 14980 Loss 2.80791


Steps:  75%|███████▌  | 15001/20000 [32:50<11:06,  7.50it/s]

Samples 300000 Batch_idx 14999 GradUpdates 15000 Loss 2.87483


Steps:  75%|███████▌  | 15021/20000 [32:52<11:04,  7.50it/s]

Samples 300400 Batch_idx 15019 GradUpdates 15020 Loss 2.85493


Steps:  75%|███████▌  | 15041/20000 [32:55<11:03,  7.47it/s]

Samples 300800 Batch_idx 15039 GradUpdates 15040 Loss 2.88264


Steps:  75%|███████▌  | 15061/20000 [32:57<11:06,  7.41it/s]

Samples 301200 Batch_idx 15059 GradUpdates 15060 Loss 2.83475


Steps:  75%|███████▌  | 15081/20000 [33:00<10:56,  7.49it/s]

Samples 301600 Batch_idx 15079 GradUpdates 15080 Loss 2.90782


Steps:  76%|███████▌  | 15101/20000 [33:02<10:53,  7.50it/s]

Samples 302000 Batch_idx 15099 GradUpdates 15100 Loss 2.84736


Steps:  76%|███████▌  | 15121/20000 [33:05<10:58,  7.41it/s]

Samples 302400 Batch_idx 15119 GradUpdates 15120 Loss 2.88251


Steps:  76%|███████▌  | 15141/20000 [33:08<10:48,  7.49it/s]

Samples 302800 Batch_idx 15139 GradUpdates 15140 Loss 2.87655


Steps:  76%|███████▌  | 15161/20000 [33:10<10:47,  7.48it/s]

Samples 303200 Batch_idx 15159 GradUpdates 15160 Loss 2.88088


Steps:  76%|███████▌  | 15181/20000 [33:13<10:42,  7.50it/s]

Samples 303600 Batch_idx 15179 GradUpdates 15180 Loss 2.90380


Steps:  76%|███████▌  | 15201/20000 [33:15<10:39,  7.50it/s]

Samples 304000 Batch_idx 15199 GradUpdates 15200 Loss 2.82572


Steps:  76%|███████▌  | 15221/20000 [33:18<10:37,  7.49it/s]

Samples 304400 Batch_idx 15219 GradUpdates 15220 Loss 2.86301


Steps:  76%|███████▌  | 15241/20000 [33:20<10:34,  7.50it/s]

Samples 304800 Batch_idx 15239 GradUpdates 15240 Loss 2.88094


Steps:  76%|███████▋  | 15261/20000 [33:23<10:31,  7.50it/s]

Samples 305200 Batch_idx 15259 GradUpdates 15260 Loss 2.87578


Steps:  76%|███████▋  | 15281/20000 [33:25<10:29,  7.50it/s]

Samples 305600 Batch_idx 15279 GradUpdates 15280 Loss 2.87570


Steps:  77%|███████▋  | 15301/20000 [33:28<10:25,  7.51it/s]

Samples 306000 Batch_idx 15299 GradUpdates 15300 Loss 2.84753


Steps:  77%|███████▋  | 15321/20000 [33:31<10:23,  7.50it/s]

Samples 306400 Batch_idx 15319 GradUpdates 15320 Loss 2.84618


Steps:  77%|███████▋  | 15341/20000 [33:33<10:27,  7.43it/s]

Samples 306800 Batch_idx 15339 GradUpdates 15340 Loss 2.82978


Steps:  77%|███████▋  | 15361/20000 [33:36<10:19,  7.49it/s]

Samples 307200 Batch_idx 15359 GradUpdates 15360 Loss 2.86729


Steps:  77%|███████▋  | 15381/20000 [33:38<10:14,  7.52it/s]

Samples 307600 Batch_idx 15379 GradUpdates 15380 Loss 2.85645


Steps:  77%|███████▋  | 15401/20000 [33:41<10:13,  7.50it/s]

Samples 308000 Batch_idx 15399 GradUpdates 15400 Loss 2.83431


Steps:  77%|███████▋  | 15421/20000 [33:43<10:11,  7.49it/s]

Samples 308400 Batch_idx 15419 GradUpdates 15420 Loss 2.85537


Steps:  77%|███████▋  | 15441/20000 [33:46<10:08,  7.49it/s]

Samples 308800 Batch_idx 15439 GradUpdates 15440 Loss 2.89894


Steps:  77%|███████▋  | 15461/20000 [33:48<10:05,  7.49it/s]

Samples 309200 Batch_idx 15459 GradUpdates 15460 Loss 2.82146


Steps:  77%|███████▋  | 15481/20000 [33:51<10:02,  7.50it/s]

Samples 309600 Batch_idx 15479 GradUpdates 15480 Loss 2.86480


Steps:  78%|███████▊  | 15501/20000 [33:53<09:59,  7.50it/s]

Samples 310000 Batch_idx 15499 GradUpdates 15500 Loss 2.77172


Steps:  78%|███████▊  | 15521/20000 [33:56<09:57,  7.50it/s]

Samples 310400 Batch_idx 15519 GradUpdates 15520 Loss 2.81458


Steps:  78%|███████▊  | 15541/20000 [33:59<09:54,  7.50it/s]

Samples 310800 Batch_idx 15539 GradUpdates 15540 Loss 2.81602


Steps:  78%|███████▊  | 15561/20000 [34:01<09:52,  7.50it/s]

Samples 311200 Batch_idx 15559 GradUpdates 15560 Loss 2.82931


Steps:  78%|███████▊  | 15581/20000 [34:04<09:49,  7.50it/s]

Samples 311600 Batch_idx 15579 GradUpdates 15580 Loss 2.83473


Steps:  78%|███████▊  | 15601/20000 [34:06<09:47,  7.49it/s]

Samples 312000 Batch_idx 15599 GradUpdates 15600 Loss 2.82756


Steps:  78%|███████▊  | 15621/20000 [34:09<09:49,  7.42it/s]

Samples 312400 Batch_idx 15619 GradUpdates 15620 Loss 2.86272


Steps:  78%|███████▊  | 15641/20000 [34:11<09:45,  7.44it/s]

Samples 312800 Batch_idx 15639 GradUpdates 15640 Loss 2.84165


Steps:  78%|███████▊  | 15661/20000 [34:14<09:40,  7.47it/s]

Samples 313200 Batch_idx 15659 GradUpdates 15660 Loss 2.86919


Steps:  78%|███████▊  | 15681/20000 [34:16<09:37,  7.48it/s]

Samples 313600 Batch_idx 15679 GradUpdates 15680 Loss 2.83480


Steps:  79%|███████▊  | 15701/20000 [34:19<09:33,  7.49it/s]

Samples 314000 Batch_idx 15699 GradUpdates 15700 Loss 2.83294


Steps:  79%|███████▊  | 15721/20000 [34:22<09:31,  7.49it/s]

Samples 314400 Batch_idx 15719 GradUpdates 15720 Loss 2.82697


Steps:  79%|███████▊  | 15741/20000 [34:24<09:30,  7.46it/s]

Samples 314800 Batch_idx 15739 GradUpdates 15740 Loss 2.81129


Steps:  79%|███████▉  | 15761/20000 [34:27<09:26,  7.48it/s]

Samples 315200 Batch_idx 15759 GradUpdates 15760 Loss 2.84368


Steps:  79%|███████▉  | 15781/20000 [34:29<09:23,  7.49it/s]

Samples 315600 Batch_idx 15779 GradUpdates 15780 Loss 2.81615


Steps:  79%|███████▉  | 15801/20000 [34:32<09:21,  7.47it/s]

Samples 316000 Batch_idx 15799 GradUpdates 15800 Loss 2.80123


Steps:  79%|███████▉  | 15821/20000 [34:34<09:18,  7.49it/s]

Samples 316400 Batch_idx 15819 GradUpdates 15820 Loss 2.79717


Steps:  79%|███████▉  | 15841/20000 [34:37<09:15,  7.48it/s]

Samples 316800 Batch_idx 15839 GradUpdates 15840 Loss 2.82986


Steps:  79%|███████▉  | 15861/20000 [34:39<09:13,  7.48it/s]

Samples 317200 Batch_idx 15859 GradUpdates 15860 Loss 2.83505


Steps:  79%|███████▉  | 15881/20000 [34:42<09:10,  7.49it/s]

Samples 317600 Batch_idx 15879 GradUpdates 15880 Loss 2.89619


Steps:  80%|███████▉  | 15901/20000 [34:45<09:13,  7.41it/s]

Samples 318000 Batch_idx 15899 GradUpdates 15900 Loss 2.85415


Steps:  80%|███████▉  | 15921/20000 [34:47<09:05,  7.48it/s]

Samples 318400 Batch_idx 15919 GradUpdates 15920 Loss 2.77629


Steps:  80%|███████▉  | 15941/20000 [34:50<09:02,  7.48it/s]

Samples 318800 Batch_idx 15939 GradUpdates 15940 Loss 2.81128


Steps:  80%|███████▉  | 15961/20000 [34:52<09:00,  7.47it/s]

Samples 319200 Batch_idx 15959 GradUpdates 15960 Loss 2.78248


Steps:  80%|███████▉  | 15981/20000 [34:55<08:57,  7.47it/s]

Samples 319600 Batch_idx 15979 GradUpdates 15980 Loss 2.82283


Steps:  80%|████████  | 16000/20000 [34:57<09:07,  7.31it/s]

Samples 320000 Batch_idx 15999 GradUpdates 16000 Loss 2.80535
Samples 320020 Batch_idx 16000 GradUpdates 16001 Loss 2.81524


Eval Steps: 100%|██████████| 25/25 [00:04<00:00,  5.82it/s]
Steps:  80%|████████  | 16021/20000 [35:07<08:58,  7.39it/s]  

Samples 320400 Batch_idx 16019 GradUpdates 16020 Loss 2.85145


Steps:  80%|████████  | 16041/20000 [35:09<08:49,  7.48it/s]

Samples 320800 Batch_idx 16039 GradUpdates 16040 Loss 2.78400


Steps:  80%|████████  | 16061/20000 [35:12<08:46,  7.48it/s]

Samples 321200 Batch_idx 16059 GradUpdates 16060 Loss 2.81824


Steps:  80%|████████  | 16081/20000 [35:14<08:44,  7.48it/s]

Samples 321600 Batch_idx 16079 GradUpdates 16080 Loss 2.82885


Steps:  81%|████████  | 16101/20000 [35:17<08:41,  7.48it/s]

Samples 322000 Batch_idx 16099 GradUpdates 16100 Loss 2.81474


Steps:  81%|████████  | 16121/20000 [35:20<08:38,  7.48it/s]

Samples 322400 Batch_idx 16119 GradUpdates 16120 Loss 2.87259


Steps:  81%|████████  | 16141/20000 [35:22<08:35,  7.49it/s]

Samples 322800 Batch_idx 16139 GradUpdates 16140 Loss 2.79727


Steps:  81%|████████  | 16161/20000 [35:25<08:34,  7.47it/s]

Samples 323200 Batch_idx 16159 GradUpdates 16160 Loss 2.82223


Steps:  81%|████████  | 16181/20000 [35:27<08:37,  7.37it/s]

Samples 323600 Batch_idx 16179 GradUpdates 16180 Loss 2.83216


Steps:  81%|████████  | 16201/20000 [35:30<08:28,  7.48it/s]

Samples 324000 Batch_idx 16199 GradUpdates 16200 Loss 2.79501


Steps:  81%|████████  | 16221/20000 [35:32<08:25,  7.48it/s]

Samples 324400 Batch_idx 16219 GradUpdates 16220 Loss 2.82008


Steps:  81%|████████  | 16241/20000 [35:35<08:22,  7.48it/s]

Samples 324800 Batch_idx 16239 GradUpdates 16240 Loss 2.79947


Steps:  81%|████████▏ | 16261/20000 [35:37<08:26,  7.38it/s]

Samples 325200 Batch_idx 16259 GradUpdates 16260 Loss 2.83976


Steps:  81%|████████▏ | 16281/20000 [35:40<08:17,  7.47it/s]

Samples 325600 Batch_idx 16279 GradUpdates 16280 Loss 2.81760


Steps:  82%|████████▏ | 16301/20000 [35:43<08:15,  7.47it/s]

Samples 326000 Batch_idx 16299 GradUpdates 16300 Loss 2.83580


Steps:  82%|████████▏ | 16321/20000 [35:45<08:12,  7.47it/s]

Samples 326400 Batch_idx 16319 GradUpdates 16320 Loss 2.78636


Steps:  82%|████████▏ | 16341/20000 [35:48<08:08,  7.48it/s]

Samples 326800 Batch_idx 16339 GradUpdates 16340 Loss 2.81649


Steps:  82%|████████▏ | 16361/20000 [35:50<08:06,  7.49it/s]

Samples 327200 Batch_idx 16359 GradUpdates 16360 Loss 2.84358


Steps:  82%|████████▏ | 16381/20000 [35:53<08:03,  7.48it/s]

Samples 327600 Batch_idx 16379 GradUpdates 16380 Loss 2.78856


Steps:  82%|████████▏ | 16401/20000 [35:55<08:01,  7.47it/s]

Samples 328000 Batch_idx 16399 GradUpdates 16400 Loss 2.82192


Steps:  82%|████████▏ | 16421/20000 [35:58<07:58,  7.47it/s]

Samples 328400 Batch_idx 16419 GradUpdates 16420 Loss 2.80137


Steps:  82%|████████▏ | 16441/20000 [36:01<08:25,  7.04it/s]

Samples 328800 Batch_idx 16439 GradUpdates 16440 Loss 2.82405


Steps:  82%|████████▏ | 16461/20000 [36:04<07:52,  7.49it/s]

Samples 329200 Batch_idx 16459 GradUpdates 16460 Loss 2.73966


Steps:  82%|████████▏ | 16481/20000 [36:06<07:49,  7.50it/s]

Samples 329600 Batch_idx 16479 GradUpdates 16480 Loss 2.81304


Steps:  83%|████████▎ | 16501/20000 [36:09<07:47,  7.48it/s]

Samples 330000 Batch_idx 16499 GradUpdates 16500 Loss 2.78378


Steps:  83%|████████▎ | 16521/20000 [36:11<07:43,  7.51it/s]

Samples 330400 Batch_idx 16519 GradUpdates 16520 Loss 2.77797


Steps:  83%|████████▎ | 16541/20000 [36:14<07:41,  7.50it/s]

Samples 330800 Batch_idx 16539 GradUpdates 16540 Loss 2.80693


Steps:  83%|████████▎ | 16561/20000 [36:16<07:42,  7.43it/s]

Samples 331200 Batch_idx 16559 GradUpdates 16560 Loss 2.77214


Steps:  83%|████████▎ | 16581/20000 [36:19<07:35,  7.50it/s]

Samples 331600 Batch_idx 16579 GradUpdates 16580 Loss 2.82200


Steps:  83%|████████▎ | 16601/20000 [36:21<07:32,  7.50it/s]

Samples 332000 Batch_idx 16599 GradUpdates 16600 Loss 2.76453


Steps:  83%|████████▎ | 16621/20000 [36:24<07:31,  7.49it/s]

Samples 332400 Batch_idx 16619 GradUpdates 16620 Loss 2.83554


Steps:  83%|████████▎ | 16641/20000 [36:26<07:28,  7.49it/s]

Samples 332800 Batch_idx 16639 GradUpdates 16640 Loss 2.78713


Steps:  83%|████████▎ | 16661/20000 [36:29<07:25,  7.50it/s]

Samples 333200 Batch_idx 16659 GradUpdates 16660 Loss 2.80570


Steps:  83%|████████▎ | 16681/20000 [36:32<07:22,  7.51it/s]

Samples 333600 Batch_idx 16679 GradUpdates 16680 Loss 2.79961


Steps:  84%|████████▎ | 16701/20000 [36:34<07:19,  7.50it/s]

Samples 334000 Batch_idx 16699 GradUpdates 16700 Loss 2.73342


Steps:  84%|████████▎ | 16721/20000 [36:37<07:16,  7.50it/s]

Samples 334400 Batch_idx 16719 GradUpdates 16720 Loss 2.77928


Steps:  84%|████████▎ | 16741/20000 [36:39<07:14,  7.50it/s]

Samples 334800 Batch_idx 16739 GradUpdates 16740 Loss 2.78248


Steps:  84%|████████▍ | 16761/20000 [36:42<07:12,  7.50it/s]

Samples 335200 Batch_idx 16759 GradUpdates 16760 Loss 2.77215


Steps:  84%|████████▍ | 16781/20000 [36:44<07:08,  7.51it/s]

Samples 335600 Batch_idx 16779 GradUpdates 16780 Loss 2.79633


Steps:  84%|████████▍ | 16801/20000 [36:47<07:06,  7.50it/s]

Samples 336000 Batch_idx 16799 GradUpdates 16800 Loss 2.77678


Steps:  84%|████████▍ | 16821/20000 [36:49<07:03,  7.50it/s]

Samples 336400 Batch_idx 16819 GradUpdates 16820 Loss 2.77001


Steps:  84%|████████▍ | 16841/20000 [36:52<07:06,  7.41it/s]

Samples 336800 Batch_idx 16839 GradUpdates 16840 Loss 2.78522


Steps:  84%|████████▍ | 16861/20000 [36:55<06:58,  7.50it/s]

Samples 337200 Batch_idx 16859 GradUpdates 16860 Loss 2.78394


Steps:  84%|████████▍ | 16881/20000 [36:57<06:55,  7.51it/s]

Samples 337600 Batch_idx 16879 GradUpdates 16880 Loss 2.75192


Steps:  85%|████████▍ | 16901/20000 [37:00<06:53,  7.50it/s]

Samples 338000 Batch_idx 16899 GradUpdates 16900 Loss 2.79296


Steps:  85%|████████▍ | 16921/20000 [37:02<06:50,  7.50it/s]

Samples 338400 Batch_idx 16919 GradUpdates 16920 Loss 2.80672


Steps:  85%|████████▍ | 16941/20000 [37:05<06:53,  7.39it/s]

Samples 338800 Batch_idx 16939 GradUpdates 16940 Loss 2.82893


Steps:  85%|████████▍ | 16961/20000 [37:07<06:45,  7.49it/s]

Samples 339200 Batch_idx 16959 GradUpdates 16960 Loss 2.75367


Steps:  85%|████████▍ | 16981/20000 [37:10<06:43,  7.49it/s]

Samples 339600 Batch_idx 16979 GradUpdates 16980 Loss 2.81711


Steps:  85%|████████▌ | 17001/20000 [37:12<06:40,  7.49it/s]

Samples 340000 Batch_idx 16999 GradUpdates 17000 Loss 2.76374


Steps:  85%|████████▌ | 17021/20000 [37:15<06:36,  7.52it/s]

Samples 340400 Batch_idx 17019 GradUpdates 17020 Loss 2.79574


Steps:  85%|████████▌ | 17041/20000 [37:18<06:36,  7.47it/s]

Samples 340800 Batch_idx 17039 GradUpdates 17040 Loss 2.80548


Steps:  85%|████████▌ | 17061/20000 [37:20<06:31,  7.51it/s]

Samples 341200 Batch_idx 17059 GradUpdates 17060 Loss 2.80268


Steps:  85%|████████▌ | 17081/20000 [37:23<06:28,  7.51it/s]

Samples 341600 Batch_idx 17079 GradUpdates 17080 Loss 2.82847


Steps:  86%|████████▌ | 17101/20000 [37:25<06:26,  7.50it/s]

Samples 342000 Batch_idx 17099 GradUpdates 17100 Loss 2.79502


Steps:  86%|████████▌ | 17121/20000 [37:28<06:27,  7.42it/s]

Samples 342400 Batch_idx 17119 GradUpdates 17120 Loss 2.77473


Steps:  86%|████████▌ | 17141/20000 [37:30<06:20,  7.51it/s]

Samples 342800 Batch_idx 17139 GradUpdates 17140 Loss 2.81713


Steps:  86%|████████▌ | 17161/20000 [37:33<06:18,  7.50it/s]

Samples 343200 Batch_idx 17159 GradUpdates 17160 Loss 2.85493


Steps:  86%|████████▌ | 17181/20000 [37:35<06:16,  7.49it/s]

Samples 343600 Batch_idx 17179 GradUpdates 17180 Loss 2.75069


Steps:  86%|████████▌ | 17201/20000 [37:38<06:16,  7.43it/s]

Samples 344000 Batch_idx 17199 GradUpdates 17200 Loss 2.81772


Steps:  86%|████████▌ | 17221/20000 [37:41<06:10,  7.49it/s]

Samples 344400 Batch_idx 17219 GradUpdates 17220 Loss 2.82727


Steps:  86%|████████▌ | 17241/20000 [37:43<06:08,  7.50it/s]

Samples 344800 Batch_idx 17239 GradUpdates 17240 Loss 2.81654


Steps:  86%|████████▋ | 17261/20000 [37:46<06:05,  7.49it/s]

Samples 345200 Batch_idx 17259 GradUpdates 17260 Loss 2.75059


Steps:  86%|████████▋ | 17281/20000 [37:48<06:02,  7.50it/s]

Samples 345600 Batch_idx 17279 GradUpdates 17280 Loss 2.73713


Steps:  87%|████████▋ | 17301/20000 [37:51<06:00,  7.49it/s]

Samples 346000 Batch_idx 17299 GradUpdates 17300 Loss 2.83440


Steps:  87%|████████▋ | 17321/20000 [37:53<05:57,  7.49it/s]

Samples 346400 Batch_idx 17319 GradUpdates 17320 Loss 2.84610


Steps:  87%|████████▋ | 17341/20000 [37:56<05:54,  7.49it/s]

Samples 346800 Batch_idx 17339 GradUpdates 17340 Loss 2.82859


Steps:  87%|████████▋ | 17361/20000 [37:58<05:52,  7.49it/s]

Samples 347200 Batch_idx 17359 GradUpdates 17360 Loss 2.83919


Steps:  87%|████████▋ | 17381/20000 [38:01<05:49,  7.50it/s]

Samples 347600 Batch_idx 17379 GradUpdates 17380 Loss 2.77587


Steps:  87%|████████▋ | 17401/20000 [38:03<05:50,  7.42it/s]

Samples 348000 Batch_idx 17399 GradUpdates 17400 Loss 2.77278


Steps:  87%|████████▋ | 17421/20000 [38:06<05:44,  7.50it/s]

Samples 348400 Batch_idx 17419 GradUpdates 17420 Loss 2.79764


Steps:  87%|████████▋ | 17441/20000 [38:09<05:41,  7.49it/s]

Samples 348800 Batch_idx 17439 GradUpdates 17440 Loss 2.73902


Steps:  87%|████████▋ | 17461/20000 [38:11<05:39,  7.49it/s]

Samples 349200 Batch_idx 17459 GradUpdates 17460 Loss 2.80636


Steps:  87%|████████▋ | 17481/20000 [38:14<05:36,  7.49it/s]

Samples 349600 Batch_idx 17479 GradUpdates 17480 Loss 2.78458


Steps:  88%|████████▊ | 17501/20000 [38:16<05:33,  7.49it/s]

Samples 350000 Batch_idx 17499 GradUpdates 17500 Loss 2.70789


Steps:  88%|████████▊ | 17521/20000 [38:19<05:30,  7.49it/s]

Samples 350400 Batch_idx 17519 GradUpdates 17520 Loss 2.78135


Steps:  88%|████████▊ | 17541/20000 [38:21<05:28,  7.49it/s]

Samples 350800 Batch_idx 17539 GradUpdates 17540 Loss 2.73360


Steps:  88%|████████▊ | 17561/20000 [38:24<05:25,  7.49it/s]

Samples 351200 Batch_idx 17559 GradUpdates 17560 Loss 2.77987


Steps:  88%|████████▊ | 17581/20000 [38:26<05:22,  7.49it/s]

Samples 351600 Batch_idx 17579 GradUpdates 17580 Loss 2.80126


Steps:  88%|████████▊ | 17601/20000 [38:29<05:20,  7.50it/s]

Samples 352000 Batch_idx 17599 GradUpdates 17600 Loss 2.72914


Steps:  88%|████████▊ | 17621/20000 [38:32<05:17,  7.49it/s]

Samples 352400 Batch_idx 17619 GradUpdates 17620 Loss 2.79235


Steps:  88%|████████▊ | 17641/20000 [38:34<05:14,  7.50it/s]

Samples 352800 Batch_idx 17639 GradUpdates 17640 Loss 2.82808


Steps:  88%|████████▊ | 17661/20000 [38:37<05:15,  7.42it/s]

Samples 353200 Batch_idx 17659 GradUpdates 17660 Loss 2.76970


Steps:  88%|████████▊ | 17681/20000 [38:39<05:09,  7.50it/s]

Samples 353600 Batch_idx 17679 GradUpdates 17680 Loss 2.79410


Steps:  89%|████████▊ | 17701/20000 [38:42<05:06,  7.50it/s]

Samples 354000 Batch_idx 17699 GradUpdates 17700 Loss 2.77464


Steps:  89%|████████▊ | 17721/20000 [38:44<05:03,  7.50it/s]

Samples 354400 Batch_idx 17719 GradUpdates 17720 Loss 2.76683


Steps:  89%|████████▊ | 17741/20000 [38:47<05:01,  7.50it/s]

Samples 354800 Batch_idx 17739 GradUpdates 17740 Loss 2.75239


Steps:  89%|████████▉ | 17761/20000 [38:49<04:58,  7.50it/s]

Samples 355200 Batch_idx 17759 GradUpdates 17760 Loss 2.79712


Steps:  89%|████████▉ | 17781/20000 [38:52<04:55,  7.50it/s]

Samples 355600 Batch_idx 17779 GradUpdates 17780 Loss 2.75667


Steps:  89%|████████▉ | 17801/20000 [38:55<04:53,  7.50it/s]

Samples 356000 Batch_idx 17799 GradUpdates 17800 Loss 2.79378


Steps:  89%|████████▉ | 17821/20000 [38:57<04:51,  7.48it/s]

Samples 356400 Batch_idx 17819 GradUpdates 17820 Loss 2.79976


Steps:  89%|████████▉ | 17841/20000 [39:00<04:48,  7.49it/s]

Samples 356800 Batch_idx 17839 GradUpdates 17840 Loss 2.80407


Steps:  89%|████████▉ | 17861/20000 [39:02<04:45,  7.50it/s]

Samples 357200 Batch_idx 17859 GradUpdates 17860 Loss 2.76313


Steps:  89%|████████▉ | 17881/20000 [39:05<04:42,  7.50it/s]

Samples 357600 Batch_idx 17879 GradUpdates 17880 Loss 2.74212


Steps:  90%|████████▉ | 17901/20000 [39:07<04:39,  7.50it/s]

Samples 358000 Batch_idx 17899 GradUpdates 17900 Loss 2.76967


Steps:  90%|████████▉ | 17921/20000 [39:10<04:37,  7.50it/s]

Samples 358400 Batch_idx 17919 GradUpdates 17920 Loss 2.79806


Steps:  90%|████████▉ | 17941/20000 [39:12<04:37,  7.43it/s]

Samples 358800 Batch_idx 17939 GradUpdates 17940 Loss 2.79117


Steps:  90%|████████▉ | 17961/20000 [39:15<04:32,  7.49it/s]

Samples 359200 Batch_idx 17959 GradUpdates 17960 Loss 2.72729


Steps:  90%|████████▉ | 17981/20000 [39:17<04:29,  7.50it/s]

Samples 359600 Batch_idx 17979 GradUpdates 17980 Loss 2.82772


Steps:  90%|█████████ | 18000/20000 [39:20<04:32,  7.33it/s]

Samples 360000 Batch_idx 17999 GradUpdates 18000 Loss 2.77983
Samples 360020 Batch_idx 18000 GradUpdates 18001 Loss 2.73955


Eval Steps: 100%|██████████| 25/25 [00:04<00:00,  5.86it/s]
Steps:  90%|█████████ | 18021/20000 [39:29<04:26,  7.42it/s]  

Samples 360400 Batch_idx 18019 GradUpdates 18020 Loss 2.80733


Steps:  90%|█████████ | 18041/20000 [39:32<04:20,  7.51it/s]

Samples 360800 Batch_idx 18039 GradUpdates 18040 Loss 2.78796


Steps:  90%|█████████ | 18061/20000 [39:35<04:18,  7.50it/s]

Samples 361200 Batch_idx 18059 GradUpdates 18060 Loss 2.74401


Steps:  90%|█████████ | 18081/20000 [39:37<04:15,  7.50it/s]

Samples 361600 Batch_idx 18079 GradUpdates 18080 Loss 2.72435


Steps:  91%|█████████ | 18101/20000 [39:40<04:13,  7.50it/s]

Samples 362000 Batch_idx 18099 GradUpdates 18100 Loss 2.76322


Steps:  91%|█████████ | 18121/20000 [39:42<04:10,  7.51it/s]

Samples 362400 Batch_idx 18119 GradUpdates 18120 Loss 2.77622


Steps:  91%|█████████ | 18141/20000 [39:45<04:07,  7.51it/s]

Samples 362800 Batch_idx 18139 GradUpdates 18140 Loss 2.81585


Steps:  91%|█████████ | 18161/20000 [39:47<04:05,  7.50it/s]

Samples 363200 Batch_idx 18159 GradUpdates 18160 Loss 2.82980


Steps:  91%|█████████ | 18181/20000 [39:50<04:02,  7.50it/s]

Samples 363600 Batch_idx 18179 GradUpdates 18180 Loss 2.77793


Steps:  91%|█████████ | 18201/20000 [39:52<03:59,  7.51it/s]

Samples 364000 Batch_idx 18199 GradUpdates 18200 Loss 2.76675


Steps:  91%|█████████ | 18221/20000 [39:55<03:57,  7.50it/s]

Samples 364400 Batch_idx 18219 GradUpdates 18220 Loss 2.72819


Steps:  91%|█████████ | 18241/20000 [39:57<03:54,  7.51it/s]

Samples 364800 Batch_idx 18239 GradUpdates 18240 Loss 2.82749


Steps:  91%|█████████▏| 18261/20000 [40:00<03:54,  7.41it/s]

Samples 365200 Batch_idx 18259 GradUpdates 18260 Loss 2.78378


Steps:  91%|█████████▏| 18281/20000 [40:03<03:49,  7.50it/s]

Samples 365600 Batch_idx 18279 GradUpdates 18280 Loss 2.78070


Steps:  92%|█████████▏| 18301/20000 [40:05<03:46,  7.50it/s]

Samples 366000 Batch_idx 18299 GradUpdates 18300 Loss 2.74161


Steps:  92%|█████████▏| 18321/20000 [40:08<03:43,  7.50it/s]

Samples 366400 Batch_idx 18319 GradUpdates 18320 Loss 2.79171


Steps:  92%|█████████▏| 18341/20000 [40:10<03:41,  7.49it/s]

Samples 366800 Batch_idx 18339 GradUpdates 18340 Loss 2.74426


Steps:  92%|█████████▏| 18361/20000 [40:13<03:38,  7.49it/s]

Samples 367200 Batch_idx 18359 GradUpdates 18360 Loss 2.73553


Steps:  92%|█████████▏| 18381/20000 [40:15<03:35,  7.51it/s]

Samples 367600 Batch_idx 18379 GradUpdates 18380 Loss 2.79740


Steps:  92%|█████████▏| 18401/20000 [40:18<03:34,  7.46it/s]

Samples 368000 Batch_idx 18399 GradUpdates 18400 Loss 2.83890


Steps:  92%|█████████▏| 18421/20000 [40:20<03:31,  7.48it/s]

Samples 368400 Batch_idx 18419 GradUpdates 18420 Loss 2.75035


Steps:  92%|█████████▏| 18441/20000 [40:23<03:28,  7.48it/s]

Samples 368800 Batch_idx 18439 GradUpdates 18440 Loss 2.73062


Steps:  92%|█████████▏| 18461/20000 [40:26<03:25,  7.48it/s]

Samples 369200 Batch_idx 18459 GradUpdates 18460 Loss 2.82245


Steps:  92%|█████████▏| 18480/20000 [40:28<03:28,  7.29it/s]

Samples 369600 Batch_idx 18479 GradUpdates 18480 Loss 2.74060


Steps:  93%|█████████▎| 18501/20000 [40:31<03:20,  7.47it/s]

Samples 370000 Batch_idx 18499 GradUpdates 18500 Loss 2.76201


Steps:  93%|█████████▎| 18521/20000 [40:34<03:17,  7.49it/s]

Samples 370400 Batch_idx 18519 GradUpdates 18520 Loss 2.79116


Steps:  93%|█████████▎| 18541/20000 [40:37<03:14,  7.49it/s]

Samples 370800 Batch_idx 18539 GradUpdates 18540 Loss 2.76313


Steps:  93%|█████████▎| 18561/20000 [40:39<03:12,  7.49it/s]

Samples 371200 Batch_idx 18559 GradUpdates 18560 Loss 2.75580


Steps:  93%|█████████▎| 18581/20000 [40:42<03:09,  7.48it/s]

Samples 371600 Batch_idx 18579 GradUpdates 18580 Loss 2.72526


Steps:  93%|█████████▎| 18601/20000 [40:44<03:06,  7.50it/s]

Samples 372000 Batch_idx 18599 GradUpdates 18600 Loss 2.79639


Steps:  93%|█████████▎| 18621/20000 [40:47<03:04,  7.49it/s]

Samples 372400 Batch_idx 18619 GradUpdates 18620 Loss 2.77736


Steps:  93%|█████████▎| 18641/20000 [40:49<03:01,  7.50it/s]

Samples 372800 Batch_idx 18639 GradUpdates 18640 Loss 2.75279


Steps:  93%|█████████▎| 18661/20000 [40:52<02:58,  7.50it/s]

Samples 373200 Batch_idx 18659 GradUpdates 18660 Loss 2.74994


Steps:  93%|█████████▎| 18681/20000 [40:54<02:56,  7.49it/s]

Samples 373600 Batch_idx 18679 GradUpdates 18680 Loss 2.74085


Steps:  94%|█████████▎| 18701/20000 [40:57<02:54,  7.46it/s]

Samples 374000 Batch_idx 18699 GradUpdates 18700 Loss 2.76626


Steps:  94%|█████████▎| 18721/20000 [41:00<02:50,  7.48it/s]

Samples 374400 Batch_idx 18719 GradUpdates 18720 Loss 2.77706


Steps:  94%|█████████▎| 18741/20000 [41:02<02:47,  7.50it/s]

Samples 374800 Batch_idx 18739 GradUpdates 18740 Loss 2.79182


Steps:  94%|█████████▍| 18761/20000 [41:05<02:45,  7.49it/s]

Samples 375200 Batch_idx 18759 GradUpdates 18760 Loss 2.76972


Steps:  94%|█████████▍| 18781/20000 [41:07<02:42,  7.49it/s]

Samples 375600 Batch_idx 18779 GradUpdates 18780 Loss 2.71815


Steps:  94%|█████████▍| 18801/20000 [41:10<02:40,  7.48it/s]

Samples 376000 Batch_idx 18799 GradUpdates 18800 Loss 2.77508


Steps:  94%|█████████▍| 18821/20000 [41:12<02:38,  7.43it/s]

Samples 376400 Batch_idx 18819 GradUpdates 18820 Loss 2.81056


Steps:  94%|█████████▍| 18841/20000 [41:15<02:34,  7.48it/s]

Samples 376800 Batch_idx 18839 GradUpdates 18840 Loss 2.77514


Steps:  94%|█████████▍| 18861/20000 [41:17<02:32,  7.47it/s]

Samples 377200 Batch_idx 18859 GradUpdates 18860 Loss 2.78277


Steps:  94%|█████████▍| 18881/20000 [41:20<02:29,  7.49it/s]

Samples 377600 Batch_idx 18879 GradUpdates 18880 Loss 2.76600


Steps:  95%|█████████▍| 18901/20000 [41:23<02:26,  7.48it/s]

Samples 378000 Batch_idx 18899 GradUpdates 18900 Loss 2.76907


Steps:  95%|█████████▍| 18921/20000 [41:25<02:24,  7.49it/s]

Samples 378400 Batch_idx 18919 GradUpdates 18920 Loss 2.82946


Steps:  95%|█████████▍| 18941/20000 [41:28<02:21,  7.49it/s]

Samples 378800 Batch_idx 18939 GradUpdates 18940 Loss 2.76307


Steps:  95%|█████████▍| 18961/20000 [41:30<02:18,  7.48it/s]

Samples 379200 Batch_idx 18959 GradUpdates 18960 Loss 2.75529


Steps:  95%|█████████▍| 18981/20000 [41:33<02:16,  7.48it/s]

Samples 379600 Batch_idx 18979 GradUpdates 18980 Loss 2.74478


Steps:  95%|█████████▌| 19001/20000 [41:35<02:13,  7.48it/s]

Samples 380000 Batch_idx 18999 GradUpdates 19000 Loss 2.74819


Steps:  95%|█████████▌| 19021/20000 [41:38<02:11,  7.47it/s]

Samples 380400 Batch_idx 19019 GradUpdates 19020 Loss 2.76623


Steps:  95%|█████████▌| 19041/20000 [41:40<02:08,  7.46it/s]

Samples 380800 Batch_idx 19039 GradUpdates 19040 Loss 2.72643


Steps:  95%|█████████▌| 19061/20000 [41:43<02:05,  7.47it/s]

Samples 381200 Batch_idx 19059 GradUpdates 19060 Loss 2.76450


Steps:  95%|█████████▌| 19081/20000 [41:46<02:03,  7.47it/s]

Samples 381600 Batch_idx 19079 GradUpdates 19080 Loss 2.74343


Steps:  96%|█████████▌| 19101/20000 [41:48<02:01,  7.40it/s]

Samples 382000 Batch_idx 19099 GradUpdates 19100 Loss 2.68289


Steps:  96%|█████████▌| 19121/20000 [41:51<01:57,  7.48it/s]

Samples 382400 Batch_idx 19119 GradUpdates 19120 Loss 2.75865


Steps:  96%|█████████▌| 19141/20000 [41:53<01:54,  7.48it/s]

Samples 382800 Batch_idx 19139 GradUpdates 19140 Loss 2.76065


Steps:  96%|█████████▌| 19161/20000 [41:56<01:52,  7.49it/s]

Samples 383200 Batch_idx 19159 GradUpdates 19160 Loss 2.73727


Steps:  96%|█████████▌| 19181/20000 [41:58<01:49,  7.46it/s]

Samples 383600 Batch_idx 19179 GradUpdates 19180 Loss 2.71421


Steps:  96%|█████████▌| 19201/20000 [42:01<01:46,  7.48it/s]

Samples 384000 Batch_idx 19199 GradUpdates 19200 Loss 2.73549


Steps:  96%|█████████▌| 19221/20000 [42:03<01:44,  7.48it/s]

Samples 384400 Batch_idx 19219 GradUpdates 19220 Loss 2.73159


Steps:  96%|█████████▌| 19241/20000 [42:06<01:41,  7.49it/s]

Samples 384800 Batch_idx 19239 GradUpdates 19240 Loss 2.73159


Steps:  96%|█████████▋| 19261/20000 [42:09<01:38,  7.48it/s]

Samples 385200 Batch_idx 19259 GradUpdates 19260 Loss 2.77376


Steps:  96%|█████████▋| 19281/20000 [42:11<01:35,  7.50it/s]

Samples 385600 Batch_idx 19279 GradUpdates 19280 Loss 2.76974


Steps:  97%|█████████▋| 19301/20000 [42:14<01:33,  7.49it/s]

Samples 386000 Batch_idx 19299 GradUpdates 19300 Loss 2.76366


Steps:  97%|█████████▋| 19321/20000 [42:16<01:30,  7.49it/s]

Samples 386400 Batch_idx 19319 GradUpdates 19320 Loss 2.72918


Steps:  97%|█████████▋| 19341/20000 [42:19<01:27,  7.49it/s]

Samples 386800 Batch_idx 19339 GradUpdates 19340 Loss 2.78770


Steps:  97%|█████████▋| 19361/20000 [42:21<01:26,  7.42it/s]

Samples 387200 Batch_idx 19359 GradUpdates 19360 Loss 2.76319


Steps:  97%|█████████▋| 19381/20000 [42:24<01:22,  7.50it/s]

Samples 387600 Batch_idx 19379 GradUpdates 19380 Loss 2.76791


Steps:  97%|█████████▋| 19401/20000 [42:27<01:20,  7.46it/s]

Samples 388000 Batch_idx 19399 GradUpdates 19400 Loss 2.76214


Steps:  97%|█████████▋| 19421/20000 [42:29<01:17,  7.49it/s]

Samples 388400 Batch_idx 19419 GradUpdates 19420 Loss 2.73450


Steps:  97%|█████████▋| 19441/20000 [42:32<01:14,  7.49it/s]

Samples 388800 Batch_idx 19439 GradUpdates 19440 Loss 2.73658


Steps:  97%|█████████▋| 19461/20000 [42:34<01:11,  7.50it/s]

Samples 389200 Batch_idx 19459 GradUpdates 19460 Loss 2.72720


Steps:  97%|█████████▋| 19481/20000 [42:37<01:09,  7.50it/s]

Samples 389600 Batch_idx 19479 GradUpdates 19480 Loss 2.75420


Steps:  98%|█████████▊| 19501/20000 [42:39<01:06,  7.50it/s]

Samples 390000 Batch_idx 19499 GradUpdates 19500 Loss 2.78278


Steps:  98%|█████████▊| 19521/20000 [42:42<01:03,  7.50it/s]

Samples 390400 Batch_idx 19519 GradUpdates 19520 Loss 2.71111


Steps:  98%|█████████▊| 19541/20000 [42:44<01:01,  7.51it/s]

Samples 390800 Batch_idx 19539 GradUpdates 19540 Loss 2.75701


Steps:  98%|█████████▊| 19561/20000 [42:47<00:58,  7.49it/s]

Samples 391200 Batch_idx 19559 GradUpdates 19560 Loss 2.79572


Steps:  98%|█████████▊| 19581/20000 [42:50<00:55,  7.49it/s]

Samples 391600 Batch_idx 19579 GradUpdates 19580 Loss 2.79800


Steps:  98%|█████████▊| 19601/20000 [42:52<00:53,  7.50it/s]

Samples 392000 Batch_idx 19599 GradUpdates 19600 Loss 2.80573


Steps:  98%|█████████▊| 19621/20000 [42:55<00:51,  7.31it/s]

Samples 392400 Batch_idx 19619 GradUpdates 19620 Loss 2.79015


Steps:  98%|█████████▊| 19641/20000 [42:57<00:47,  7.49it/s]

Samples 392800 Batch_idx 19639 GradUpdates 19640 Loss 2.69897


Steps:  98%|█████████▊| 19661/20000 [43:00<00:45,  7.50it/s]

Samples 393200 Batch_idx 19659 GradUpdates 19660 Loss 2.75229


Steps:  98%|█████████▊| 19681/20000 [43:02<00:42,  7.50it/s]

Samples 393600 Batch_idx 19679 GradUpdates 19680 Loss 2.70710


Steps:  99%|█████████▊| 19701/20000 [43:05<00:39,  7.50it/s]

Samples 394000 Batch_idx 19699 GradUpdates 19700 Loss 2.75166


Steps:  99%|█████████▊| 19721/20000 [43:07<00:37,  7.48it/s]

Samples 394400 Batch_idx 19719 GradUpdates 19720 Loss 2.70420


Steps:  99%|█████████▊| 19741/20000 [43:10<00:34,  7.50it/s]

Samples 394800 Batch_idx 19739 GradUpdates 19740 Loss 2.73647


Steps:  99%|█████████▉| 19761/20000 [43:12<00:31,  7.54it/s]

Samples 395200 Batch_idx 19759 GradUpdates 19760 Loss 2.73862


Steps:  99%|█████████▉| 19781/20000 [43:15<00:29,  7.47it/s]

Samples 395600 Batch_idx 19779 GradUpdates 19780 Loss 2.71935


Steps:  99%|█████████▉| 19801/20000 [43:18<00:26,  7.50it/s]

Samples 396000 Batch_idx 19799 GradUpdates 19800 Loss 2.74745


Steps:  99%|█████████▉| 19821/20000 [43:20<00:23,  7.50it/s]

Samples 396400 Batch_idx 19819 GradUpdates 19820 Loss 2.72459


Steps:  99%|█████████▉| 19841/20000 [43:23<00:21,  7.50it/s]

Samples 396800 Batch_idx 19839 GradUpdates 19840 Loss 2.72869


Steps:  99%|█████████▉| 19861/20000 [43:25<00:18,  7.50it/s]

Samples 397200 Batch_idx 19859 GradUpdates 19860 Loss 2.73476


Steps:  99%|█████████▉| 19881/20000 [43:28<00:15,  7.50it/s]

Samples 397600 Batch_idx 19879 GradUpdates 19880 Loss 2.74850


Steps: 100%|█████████▉| 19901/20000 [43:30<00:13,  7.43it/s]

Samples 398000 Batch_idx 19899 GradUpdates 19900 Loss 2.65928


Steps: 100%|█████████▉| 19921/20000 [43:33<00:10,  7.48it/s]

Samples 398400 Batch_idx 19919 GradUpdates 19920 Loss 2.73439


Steps: 100%|█████████▉| 19941/20000 [43:35<00:07,  7.49it/s]

Samples 398800 Batch_idx 19939 GradUpdates 19940 Loss 2.68386


Steps: 100%|█████████▉| 19961/20000 [43:38<00:05,  7.53it/s]

Samples 399200 Batch_idx 19959 GradUpdates 19960 Loss 2.76759


Steps: 100%|█████████▉| 19981/20000 [43:41<00:02,  7.53it/s]

Samples 399600 Batch_idx 19979 GradUpdates 19980 Loss 2.74373


Steps: 100%|█████████▉| 19999/20000 [43:43<00:00,  7.88it/s]

Samples 400000 Batch_idx 19999 GradUpdates 20000 Loss 2.71305


Eval Steps: 100%|██████████| 25/25 [00:04<00:00,  5.80it/s]
2025-04-17 03:52:32 - INFO - Saved config to /home/raymond/.temp/e2e_sae/e2e_sae/scripts/train_tlens_saes/out/seed-0_lpcoeff-3.0_logits-kl-1.0_lr-0.001_ratio-50.0_blocks.4.hook_resid_pre_2025-04-17_03-08-42/final_config.yaml
2025-04-17 03:52:32 - INFO - Saved model to /home/raymond/.temp/e2e_sae/e2e_sae/scripts/train_tlens_saes/out/seed-0_lpcoeff-3.0_logits-kl-1.0_lr-0.001_ratio-50.0_blocks.4.hook_resid_pre_2025-04-17_03-08-42/samples_400000.pt
Steps: 100%|█████████▉| 19999/20000 [43:50<00:00,  7.60it/s]
Batches: 100%|██████████| 97/97 [00:05<00:00, 18.03it/s]
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
grad_norm,█▅▄▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
grad_updates,▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇████
loss,█▅▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/eval/in_to_orig/blocks.0.hook_resid_post,▁▁▁▁▁▁▁▁▁▁▁
loss/eval/in_to_orig/blocks.1.hook_resid_post,▁▁▁▁▁▁▁▁▁▁▁
loss/eval/in_to_orig/blocks.2.hook_resid_post,▁▁▁▁▁▁▁▁▁▁▁
loss/eval/in_to_orig/blocks.3.hook_resid_post,▁▁▁▁▁▁▁▁▁▁▁
loss/eval/in_to_orig/blocks.4.hook_resid_post,█▄▂▁▁▁▁▁▁▁▁
loss/eval/in_to_orig/blocks.5.hook_resid_post,█▄▂▁▁▁▁▁▁▁▁
loss/eval/in_to_orig/blocks.6.hook_resid_post,█▄▂▁▁▁▁▁▁▁▁

0,1
grad_norm,0.1086
grad_updates,20000.0
loss,2.71305
loss/eval/in_to_orig/blocks.0.hook_resid_post,0.0
loss/eval/in_to_orig/blocks.1.hook_resid_post,0.0
loss/eval/in_to_orig/blocks.2.hook_resid_post,0.0
loss/eval/in_to_orig/blocks.3.hook_resid_post,0.0
loss/eval/in_to_orig/blocks.4.hook_resid_post,0.60561
loss/eval/in_to_orig/blocks.5.hook_resid_post,0.58854
loss/eval/in_to_orig/blocks.6.hook_resid_post,0.57048
