* https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt
* https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter7/section6_pt.ipynb

In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate>=0.26.0
#!apt install git-lfs



In [29]:
!transformers-cli env

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.

- `transformers` version: 4.48.0
- Platform: Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.35
- Python version: 3.12.2
- Huggingface_hub version: 0.27.1
- Safetensors version: 0.5.2
- Accelerate version: 1.3.0
- Accelerate config: 	not found
- PyTorch version (GPU?): 2.5.1 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using distributed or parallel set-up in script?: <fill in>
- Using GPU in script?: <fill in>
- GPU type: NVIDIA GeForce RTX 4080 Laptop GPU



# Gather the data

In [2]:
def any_keyword_in_string(string, keywords):
    for keyword in keywords:
        if keyword in string:
            return True
    return False

In [3]:
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]
example_1 = "import numpy as np"
example_2 = "import pandas as pd"

print(
    any_keyword_in_string(example_1, filters), any_keyword_in_string(example_2, filters)
)

False True


In [4]:
from collections import defaultdict

from datasets import Dataset
from tqdm import tqdm


def filter_streaming_dataset(dataset, filters):
    filtered_dict = defaultdict(list)
    total = 0
    for sample in tqdm(iter(dataset)):
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            for k, v in sample.items():
                filtered_dict[k].append(v)
    print(f"{len(filtered_dict['content'])/total:.2%} of data after filtering.")
    return Dataset.from_dict(filtered_dict)

In [None]:
# This cell will take a very long time to execute, so you should skip it and go to
# the next one!
from datasets import load_dataset

split = "train"  # "valid"
filters = ["pandas", "sklearn", "matplotlib", "seaborn"]

data = load_dataset(f"transformersbook/codeparrot-{split}", split=split, streaming=True)
filtered_data = filter_streaming_dataset(data, filters)

In [5]:
from datasets import DatasetDict, load_dataset

ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train")
ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation")

raw_datasets = DatasetDict(
    {
        "train": ds_train,  # .shuffle().select(range(50000)),
        "valid": ds_valid,  # .shuffle().select(range(500))
    }
)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 606720
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 3322
    })
})

In [7]:
for key in raw_datasets["train"][0]:
    print(f"{key.upper()}: {raw_datasets['train'][0][key][:200]}")

REPO_NAME: kmike/scikit-learn
PATH: sklearn/utils/__init__.py
COPIES: 3
SIZE: 10094
CONTENT: """
The :mod:`sklearn.utils` module includes various utilites.
"""

from collections import Sequence

import numpy as np
from scipy.sparse import issparse

from .murmurhash import murm
LICENSE: bsd-3-clause


In [8]:
# Sample the train and valid data to quickly test the training loop.
sample_raw_datasets = DatasetDict(
    {
        "train": ds_train.shuffle().select(range(50000)),
        "valid": ds_valid.shuffle().select(range(500)),
    }
)

sample_raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 50000
    })
    valid: Dataset({
        features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
        num_rows: 500
    })
})

In [9]:
for key in sample_raw_datasets["train"][0]:
    print(f"{key.upper()}: {sample_raw_datasets['train'][0][key][:200]}")

REPO_NAME: mwv/scikit-learn
PATH: sklearn/metrics/classification.py
COPIES: 95
SIZE: 67713
CONTENT: """Metrics to assess performance on classification task given classe prediction

Functions named as ``*_score`` return a scalar value to maximize: the higher
the better

Function named as ``*_error`` 
LICENSE: bsd-3-clause


# Prepare the dataset

In [10]:
from transformers import AutoTokenizer

context_length = 128
tokenizer = AutoTokenizer.from_pretrained(
    "huggingface-course/code-search-net-tokenizer"
)

outputs = tokenizer(
    raw_datasets["train"][:2]["content"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 34
Input chunk lengths: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 117, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 41]
Chunk mapping: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [11]:
def tokenize(element):
    outputs = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = sample_raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
# tokenized_datasets = raw_datasets.map(
#    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
# )
tokenized_datasets

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1378510
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 13347
    })
})

# Initialize a new model

In [12]:
from transformers import AutoConfig, AutoTokenizer, GPT2LMHeadModel

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [13]:
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 124.2M parameters


In [None]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`abs
# https://stackoverflow.com/a/78610845/8492021
!pip install accelerate
!pip install transformers

In [None]:
import accelerate

print(accelerate.__version__)

In [None]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="codeparrot-ds-sample",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

In [None]:
trainer.train()

In [None]:
trainer.push_to_hub()

# Generate code with a pipeline

In [None]:
import torch
from transformers import pipeline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# pipe = pipeline(
#    "text-generation", model="renwei2024/codeparrot-ds-sample", device=device
# )
pipe = pipeline(
    "text-generation", model="huggingface-course/codeparrot-ds", device=device
)

In [None]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

In [None]:
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create dataframe from x and y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

In [None]:
txt = """\
# dataframe with profession, income and name
df = pd.DataFrame({'profession': x, 'income':y, 'name': z})

# calculate the mean income per profession
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

In [None]:
txt = """
# import random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# fit random forest model with 300 estimators on X, y:
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

# Train with Accelerate

In [14]:
keytoken_ids = []
for keyword in [
    "plt",
    "pd",
    "sk",
    "fit",
    "predict",
    " plt",
    " pd",
    " sk",
    " fit",
    " predict",
    "testtest",
]:
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) == 1:
        keytoken_ids.append(ids[0])
    else:
        print(f"Keyword has not single token: {keyword}")

Keyword has not single token: testtest


In [15]:
import torch
from torch.nn import CrossEntropyLoss


def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduce=False)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    # Calculate and scale weighting
    weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
        axis=[0, 2]
    )
    weights = alpha * (1.0 + weights)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

In [16]:
from torch.utils.data.dataloader import DataLoader

tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=32, shuffle=True)
eval_dataloader = DataLoader(tokenized_datasets["valid"], batch_size=32)

In [17]:
weight_decay = 0.1


def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [18]:
# https://discuss.huggingface.co/t/chapter-7-questions/11746/107?u=renwei2024
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])
        losses.append(accelerator.gather(outputs.loss.view(-1)))
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()

In [19]:
model = GPT2LMHeadModel(config)

In [20]:
from torch.optim import AdamW

optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [21]:
from accelerate import Accelerator

# `accelerator = Accelerator(fp16=True)` leads to this TypeError:
#   TypeError: Accelerator.__init__() got an unexpected keyword argument 'fp16'
accelerator = Accelerator(mixed_precision="fp16")

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [22]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=200,  # 1_000
    num_training_steps=num_training_steps,
)

In [23]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "codeparrot-ds-sample-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'renwei2024/codeparrot-ds-sample-accelerate'

In [24]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [42]:
# Create a remote repo.
from huggingface_hub import HfApi

# Authenticate with your Hugging Face token
api = HfApi()

# Create the model repository on Hugging Face Hub
api.create_repo(repo_id=repo_name)

print(f"Repository '{repo_name}' created successfully!")

Repository 'renwei2024/codeparrot-ds-sample-accelerate' created successfully!


In [25]:
output_dir = "codeparrot-ds-sample-accelerate"
repo = Repository(local_dir=output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/home/renwei/repos/github/renweizhukov/jupyter-lab-notebook/hugging-face-nlp-course/codeparrot-ds-sample-accelerate is already a clone of https://huggingface.co/renwei2024/codeparrot-ds-sample-accelerate. Make sure you pull the latest changes with `repo.git_pull()`.


In [26]:
!pwd
!ls -alh

/home/renwei/repos/github/renweizhukov/jupyter-lab-notebook/hugging-face-nlp-course
total 106M
drwxr-xr-x 4 renwei renwei 4.0K Jan 28 12:33  .
drwxr-xr-x 5 renwei renwei 4.0K Jan 28 11:19  ..
drwxr-xr-x 2 renwei renwei 4.0K Jan 28 11:20  .ipynb_checkpoints
-rw-r--r-- 1 renwei renwei 3.5K Jan 28 12:09 '=0.26.0'
-rw-r--r-- 1 renwei renwei  53M Jan 27 18:35  NeMo-issues-fixed.jsonl
-rw-r--r-- 1 renwei renwei  53M Jan 27 18:35  NeMo-issues.jsonl
-rw-r--r-- 1 renwei renwei  33K Jan 27 18:35  build-new-tokenizer.ipynb
drwxr-xr-x 3 renwei renwei 4.0K Jan 28 11:26  codeparrot-ds-sample-accelerate
-rw-r--r-- 1 renwei renwei  36K Jan 28 11:24  datasets-library-create-your-owne-dataset.ipynb
-rw-r--r-- 1 renwei renwei 100K Jan 27 18:35  datasets-library-time-to-slice-and-dice.ipynb
-rw-r--r-- 1 renwei renwei  45K Jan 27 18:35  semantic-search-with-faiss.ipynb
-rw-r--r-- 1 renwei renwei  46K Jan 28 12:33  train-causal-language-model.ipynb


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
evaluate()

(10.959161758422852, 57478.2421875)

In [28]:
from tqdm.notebook import tqdm

samples_per_step = 32  # Should be the batch size of `train_dataloader`.
gradient_accumulation_steps = 8
eval_steps = 1_250  # 5_000

model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=num_training_steps
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 100 == 0:
            accelerator.print(
                {
                    "samples": step * samples_per_step,
                    "steps": completed_steps,
                    "loss/train": loss.item() * gradient_accumulation_steps,
                }
            )
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (
            step % (eval_steps * gradient_accumulation_steps)
        ) == 0 or step == num_training_steps:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )

  0%|          | 0/43079 [00:00<?, ?it/s]



{'samples': 3200, 'steps': 12, 'loss/train': 88.76261901855469}
{'samples': 6400, 'steps': 24, 'loss/train': 120.48477935791016}
{'samples': 9600, 'steps': 37, 'loss/train': 86.93660736083984}
{'samples': 12800, 'steps': 49, 'loss/train': 72.72986602783203}
{'samples': 16000, 'steps': 62, 'loss/train': 66.69828033447266}
{'samples': 19200, 'steps': 74, 'loss/train': 71.48916625976562}
{'samples': 22400, 'steps': 87, 'loss/train': 55.479026794433594}
{'samples': 25600, 'steps': 99, 'loss/train': 78.30088806152344}
{'samples': 28800, 'steps': 112, 'loss/train': 58.12162780761719}
{'samples': 32000, 'steps': 124, 'loss/train': 65.84077453613281}
{'samples': 35200, 'steps': 137, 'loss/train': 50.55690002441406}
{'samples': 38400, 'steps': 149, 'loss/train': 63.242576599121094}
{'samples': 41600, 'steps': 162, 'loss/train': 72.86725616455078}
{'samples': 44800, 'steps': 174, 'loss/train': 59.62398147583008}
{'samples': 48000, 'steps': 187, 'loss/train': 52.5299186706543}
{'samples': 51200, 



{'samples': 323200, 'steps': 1262, 'loss/train': 27.817899703979492}
{'samples': 326400, 'steps': 1274, 'loss/train': 22.731399536132812}
{'samples': 329600, 'steps': 1287, 'loss/train': 26.90929412841797}
{'samples': 332800, 'steps': 1299, 'loss/train': 35.88505554199219}
{'samples': 336000, 'steps': 1312, 'loss/train': 26.756013870239258}
{'samples': 339200, 'steps': 1324, 'loss/train': 23.115341186523438}
{'samples': 342400, 'steps': 1337, 'loss/train': 27.553447723388672}
{'samples': 345600, 'steps': 1349, 'loss/train': 22.172344207763672}
{'samples': 348800, 'steps': 1362, 'loss/train': 21.983013153076172}
{'samples': 352000, 'steps': 1374, 'loss/train': 25.516576766967773}
{'samples': 355200, 'steps': 1387, 'loss/train': 24.908145904541016}
{'samples': 358400, 'steps': 1399, 'loss/train': 25.049896240234375}
{'samples': 361600, 'steps': 1412, 'loss/train': 23.811092376708984}
{'samples': 364800, 'steps': 1424, 'loss/train': 29.2843074798584}
{'samples': 368000, 'steps': 1437, 'lo



{'samples': 643200, 'steps': 2512, 'loss/train': 22.02446746826172}
{'samples': 646400, 'steps': 2524, 'loss/train': 20.129112243652344}
{'samples': 649600, 'steps': 2537, 'loss/train': 17.414730072021484}
{'samples': 652800, 'steps': 2549, 'loss/train': 19.14794921875}
{'samples': 656000, 'steps': 2562, 'loss/train': 23.21855354309082}
{'samples': 659200, 'steps': 2574, 'loss/train': 19.828880310058594}
{'samples': 662400, 'steps': 2587, 'loss/train': 33.73101043701172}
{'samples': 665600, 'steps': 2599, 'loss/train': 19.425827026367188}
{'samples': 668800, 'steps': 2612, 'loss/train': 21.813602447509766}
{'samples': 672000, 'steps': 2624, 'loss/train': 25.331449508666992}
{'samples': 675200, 'steps': 2637, 'loss/train': 19.86335563659668}
{'samples': 678400, 'steps': 2649, 'loss/train': 19.77545928955078}
{'samples': 681600, 'steps': 2662, 'loss/train': 23.488609313964844}
{'samples': 684800, 'steps': 2674, 'loss/train': 18.18423080444336}
{'samples': 688000, 'steps': 2687, 'loss/tra



{'samples': 963200, 'steps': 3762, 'loss/train': 18.7039852142334}
{'samples': 966400, 'steps': 3774, 'loss/train': 18.308473587036133}
{'samples': 969600, 'steps': 3787, 'loss/train': 19.59339141845703}
{'samples': 972800, 'steps': 3799, 'loss/train': 21.16069984436035}
{'samples': 976000, 'steps': 3812, 'loss/train': 19.145275115966797}
{'samples': 979200, 'steps': 3824, 'loss/train': 18.267242431640625}
{'samples': 982400, 'steps': 3837, 'loss/train': 20.043155670166016}
{'samples': 985600, 'steps': 3849, 'loss/train': 20.359773635864258}
{'samples': 988800, 'steps': 3862, 'loss/train': 22.513940811157227}
{'samples': 992000, 'steps': 3874, 'loss/train': 18.009235382080078}
{'samples': 995200, 'steps': 3887, 'loss/train': 19.075660705566406}
{'samples': 998400, 'steps': 3899, 'loss/train': 23.866302490234375}
{'samples': 1001600, 'steps': 3912, 'loss/train': 18.74401092529297}
{'samples': 1004800, 'steps': 3924, 'loss/train': 19.85009765625}
{'samples': 1008000, 'steps': 3937, 'loss



{'samples': 1283200, 'steps': 5012, 'loss/train': 15.818708419799805}
{'samples': 1286400, 'steps': 5024, 'loss/train': 13.763473510742188}
{'samples': 1289600, 'steps': 5037, 'loss/train': 21.070140838623047}
{'samples': 1292800, 'steps': 5049, 'loss/train': 19.028566360473633}
{'samples': 1296000, 'steps': 5062, 'loss/train': 19.372900009155273}
{'samples': 1299200, 'steps': 5074, 'loss/train': 19.606698989868164}
{'samples': 1302400, 'steps': 5087, 'loss/train': 18.30612564086914}
{'samples': 1305600, 'steps': 5099, 'loss/train': 21.161514282226562}
{'samples': 1308800, 'steps': 5112, 'loss/train': 19.043243408203125}
{'samples': 1312000, 'steps': 5124, 'loss/train': 17.697551727294922}
{'samples': 1315200, 'steps': 5137, 'loss/train': 19.70696449279785}
{'samples': 1318400, 'steps': 5149, 'loss/train': 22.47640609741211}
{'samples': 1321600, 'steps': 5162, 'loss/train': 22.29026985168457}
{'samples': 1324800, 'steps': 5174, 'loss/train': 28.1436767578125}
{'samples': 1328000, 'step