In [1]:
from transformers import AutoModelForCausalLM
from peft import get_peft_config, get_peft_model, PrefixTuningConfig, TaskType, PeftType,LoraConfig
import torch
from datasets import load_dataset
import os
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
from datasets import load_dataset

device = "cuda"
model_name_or_path = "bigscience/bloomz-1b7"
tokenizer_name_or_path = "bigscience/bloomz-1b7"
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)


dataset_name = "twitter_complaints"
checkpoint_name = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace(
    "/", "_"
)
text_column = "Tweet text"
label_column = "text_label"
max_length = 64
lr = 3e-2
num_epochs = 1000
batch_size = 8

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.__version__

'2.0.0+cu118'

In [3]:
from datasets import load_dataset

dataset = load_dataset("ought/raft", dataset_name)

classes = [k.replace("_", " ") for k in dataset["train"].features["Label"].names]
print(classes)
dataset = dataset.map(
    lambda x: {"text_label": [classes[label] for label in x["Label"]]},
    batched=True,
    num_proc=1,
)
print(dataset)
dataset["train"][0]

Found cached dataset raft (C:/Users/1/.cache/huggingface/datasets/ought___raft/twitter_complaints/1.1.0/79c4de1312c1e3730043f7db07179c914f48403101f7124e2fe336f6f54d9f84)
100%|██████████| 2/2 [00:00<00:00, 186.70it/s]
Loading cached processed dataset at C:\Users\1\.cache\huggingface\datasets\ought___raft\twitter_complaints\1.1.0\79c4de1312c1e3730043f7db07179c914f48403101f7124e2fe336f6f54d9f84\cache-c03af01af789f5a4.arrow
Loading cached processed dataset at C:\Users\1\.cache\huggingface\datasets\ought___raft\twitter_complaints\1.1.0\79c4de1312c1e3730043f7db07179c914f48403101f7124e2fe336f6f54d9f84\cache-1b61bb471c5a6285.arrow


['Unlabeled', 'complaint', 'no complaint']
DatasetDict({
    train: Dataset({
        features: ['Tweet text', 'ID', 'Label', 'text_label'],
        num_rows: 50
    })
    test: Dataset({
        features: ['Tweet text', 'ID', 'Label', 'text_label'],
        num_rows: 3399
    })
})


{'Tweet text': '@HMRCcustomers No this is my first job',
 'ID': 0,
 'Label': 2,
 'text_label': 'no complaint'}

In [4]:
# data preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
target_max_length = max([len(tokenizer(class_label)["input_ids"]) for class_label in classes])
print(target_max_length)


def preprocess_function(examples):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
        # print(i, sample_input_ids, label_input_ids)
        model_inputs["input_ids"][i] = sample_input_ids + label_input_ids
        labels["input_ids"][i] = [-100] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
    # print(model_inputs)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        labels["input_ids"][i] = [-100] * (max_length - len(sample_input_ids)) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["train"]


train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)


3


                                                                                          

In [5]:
def test_preprocess_function(examples):
    batch_size = len(examples[text_column])
    inputs = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    model_inputs = tokenizer(inputs)
    # print(model_inputs)
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (max_length - len(sample_input_ids)) + model_inputs[
            "attention_mask"
        ][i]
        model_inputs["input_ids"][i] = torch.tensor(model_inputs["input_ids"][i][:max_length])
        model_inputs["attention_mask"][i] = torch.tensor(model_inputs["attention_mask"][i][:max_length])
    return model_inputs


test_dataset = dataset["test"].map(
    test_preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

test_dataloader = DataLoader(test_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
next(iter(test_dataloader))

                                                                                           

{'input_ids': tensor([[     3,      3,      3,      3,      3,      3,      3,      3,      3,
               3,      3,      3,      3,      3,      3,      3,      3,      3,
               3,      3,      3,      3,      3,      3,      3,      3,      3,
          227985,   5484,    915,   2566,  74757,  64626,  12384,  44639,    613,
           52282,   2670,  79920,   3344,   1002,    368,  17646,  14472,   8348,
             664,    718,      4,  19036,     17,  31849,     17,   6312,     76,
              44,  62470,     56,     91,     50,  14839,     21,  77658,    915,
             210],
         [     3,      3,      3,      3,      3,      3,      3,      3,      3,
               3,      3,      3,      3,      3,      3,      3,      3,      3,
               3,      3,      3,      3,      3,      3,      3,      3,      3,
               3,      3,      3,      3, 227985,   5484,    915,    405, 187059,
            2256,    664,   2550,  18833,  18607, 162467,      4, 

In [6]:
# creating model
peft_config.fan_in_fan_out = False
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model



trainable params: 1572864 || all params: 1723981824 || trainable%: 0.09123437254985815


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BloomForCausalLM(
      (transformer): BloomModel(
        (word_embeddings): Embedding(250880, 2048)
        (word_embeddings_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (h): ModuleList(
          (0-23): 24 x BloomBlock(
            (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (self_attention): BloomAttention(
              (query_key_value): MergedLinear(
                in_features=2048, out_features=6144, bias=True
                (lora_dropout): Dropout(p=0.1, inplace=False)
                (lora_A): Linear(in_features=2048, out_features=16, bias=False)
                (lora_B): Conv1d(16, 4096, kernel_size=(1,), stride=(1,), groups=2, bias=False)
              )
              (dense): Linear(in_features=2048, out_features=2048, bias=True)
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (post_attention_lay

In [7]:
# data preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [8]:


model.eval()
i = 16
inputs = tokenizer(f'{text_column} : {dataset["test"][i]["Tweet text"]} Label : ', return_tensors="pt")
print(dataset["test"][i]["Tweet text"])
print(dataset["test"][i]["text_label"])
print(inputs)

output_before_ft=model.generate(**inputs)

print(output_before_ft)
print(tokenizer.batch_decode(output_before_ft.detach().cpu().numpy(), skip_special_tokens=True))

Input length of input_ids is 34, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Hey @nytimes your link to cancel my subscription isn't working and nobody is answering the chat. Please don't play that kind of stupid game.
Unlabeled
{'input_ids': tensor([[227985,   5484,    915,  54078,   2566,   7782,  24502,   2632,   8989,
            427,  36992,   2670, 140711,  21994,  10789,    530,  88399,    632,
         183542,    368,  44799,     17,  29901,   5926,   7229,    861,  11596,
            461,  78851,  14775,     17,  77658,    915,    210]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[227985,   5484,    915,  54078,   2566,   7782,  24502,   2632,   8989,
            427,  36992,   2670, 140711,  21994,  10789,    530,  88399,    632,
         183542,    368,  44799,     17,  29901,   5926,   7229,    861,  11596,
            461,  78851,  14775,     17,  77658,    915,    210,   2566]])
["Tweet text : Hey @nytimes your link to cancel my subscription i

In [9]:
# model
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

In [10]:
# training and evaluation
model = model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        #         print(batch)
        #         print(batch["input_ids"].shape)
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 7/7 [00:13<00:00,  1.89s/it]
100%|██████████| 7/7 [00:02<00:00,  2.75it/s]


epoch=0: train_ppl=tensor(2147057.7500, device='cuda:0') train_epoch_loss=tensor(14.5796, device='cuda:0') eval_ppl=tensor(17055.5391, device='cuda:0') eval_epoch_loss=tensor(9.7442, device='cuda:0')


100%|██████████| 7/7 [00:12<00:00,  1.72s/it]
100%|██████████| 7/7 [00:01<00:00,  3.74it/s]


epoch=1: train_ppl=tensor(2247.4985, device='cuda:0') train_epoch_loss=tensor(7.7176, device='cuda:0') eval_ppl=tensor(70189.1406, device='cuda:0') eval_epoch_loss=tensor(11.1589, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.55s/it]
100%|██████████| 7/7 [00:02<00:00,  3.03it/s]


epoch=2: train_ppl=tensor(768.9080, device='cuda:0') train_epoch_loss=tensor(6.6450, device='cuda:0') eval_ppl=tensor(179.1304, device='cuda:0') eval_epoch_loss=tensor(5.1881, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.60s/it]
100%|██████████| 7/7 [00:02<00:00,  2.35it/s]


epoch=3: train_ppl=tensor(156.2501, device='cuda:0') train_epoch_loss=tensor(5.0515, device='cuda:0') eval_ppl=tensor(126.2592, device='cuda:0') eval_epoch_loss=tensor(4.8383, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.64s/it]
100%|██████████| 7/7 [00:02<00:00,  3.22it/s]


epoch=4: train_ppl=tensor(114.6100, device='cuda:0') train_epoch_loss=tensor(4.7415, device='cuda:0') eval_ppl=tensor(91.6228, device='cuda:0') eval_epoch_loss=tensor(4.5177, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.68s/it]
100%|██████████| 7/7 [00:02<00:00,  2.94it/s]


epoch=5: train_ppl=tensor(94.5421, device='cuda:0') train_epoch_loss=tensor(4.5490, device='cuda:0') eval_ppl=tensor(72.7907, device='cuda:0') eval_epoch_loss=tensor(4.2876, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.68s/it]
100%|██████████| 7/7 [00:02<00:00,  3.05it/s]


epoch=6: train_ppl=tensor(73.7979, device='cuda:0') train_epoch_loss=tensor(4.3013, device='cuda:0') eval_ppl=tensor(64.8535, device='cuda:0') eval_epoch_loss=tensor(4.1721, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.61s/it]
100%|██████████| 7/7 [00:02<00:00,  2.88it/s]


epoch=7: train_ppl=tensor(57.5551, device='cuda:0') train_epoch_loss=tensor(4.0527, device='cuda:0') eval_ppl=tensor(50.0166, device='cuda:0') eval_epoch_loss=tensor(3.9124, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.57s/it]
100%|██████████| 7/7 [00:02<00:00,  3.41it/s]


epoch=8: train_ppl=tensor(40.9635, device='cuda:0') train_epoch_loss=tensor(3.7127, device='cuda:0') eval_ppl=tensor(33.1957, device='cuda:0') eval_epoch_loss=tensor(3.5024, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.63s/it]
100%|██████████| 7/7 [00:02<00:00,  3.03it/s]


epoch=9: train_ppl=tensor(27.8738, device='cuda:0') train_epoch_loss=tensor(3.3277, device='cuda:0') eval_ppl=tensor(22.1042, device='cuda:0') eval_epoch_loss=tensor(3.0958, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.64s/it]
100%|██████████| 7/7 [00:02<00:00,  3.40it/s]


epoch=10: train_ppl=tensor(19.4574, device='cuda:0') train_epoch_loss=tensor(2.9682, device='cuda:0') eval_ppl=tensor(15.2874, device='cuda:0') eval_epoch_loss=tensor(2.7270, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.66s/it]
100%|██████████| 7/7 [00:02<00:00,  2.86it/s]


epoch=11: train_ppl=tensor(12.5436, device='cuda:0') train_epoch_loss=tensor(2.5292, device='cuda:0') eval_ppl=tensor(10.3358, device='cuda:0') eval_epoch_loss=tensor(2.3356, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.70s/it]
100%|██████████| 7/7 [00:04<00:00,  1.52it/s]


epoch=12: train_ppl=tensor(8.8557, device='cuda:0') train_epoch_loss=tensor(2.1811, device='cuda:0') eval_ppl=tensor(7.6037, device='cuda:0') eval_epoch_loss=tensor(2.0286, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.65s/it]
100%|██████████| 7/7 [00:04<00:00,  1.44it/s]


epoch=13: train_ppl=tensor(6.5369, device='cuda:0') train_epoch_loss=tensor(1.8775, device='cuda:0') eval_ppl=tensor(6.0813, device='cuda:0') eval_epoch_loss=tensor(1.8052, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.53s/it]
100%|██████████| 7/7 [00:05<00:00,  1.35it/s]


epoch=14: train_ppl=tensor(5.7905, device='cuda:0') train_epoch_loss=tensor(1.7562, device='cuda:0') eval_ppl=tensor(5.8329, device='cuda:0') eval_epoch_loss=tensor(1.7635, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.47s/it]
100%|██████████| 7/7 [00:04<00:00,  1.49it/s]


epoch=15: train_ppl=tensor(5.5424, device='cuda:0') train_epoch_loss=tensor(1.7124, device='cuda:0') eval_ppl=tensor(5.1194, device='cuda:0') eval_epoch_loss=tensor(1.6330, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.43s/it]
100%|██████████| 7/7 [00:04<00:00,  1.55it/s]


epoch=16: train_ppl=tensor(5.4518, device='cuda:0') train_epoch_loss=tensor(1.6959, device='cuda:0') eval_ppl=tensor(5.1051, device='cuda:0') eval_epoch_loss=tensor(1.6302, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.50s/it]
100%|██████████| 7/7 [00:04<00:00,  1.56it/s]


epoch=17: train_ppl=tensor(5.0996, device='cuda:0') train_epoch_loss=tensor(1.6292, device='cuda:0') eval_ppl=tensor(5.0111, device='cuda:0') eval_epoch_loss=tensor(1.6117, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.45s/it]
100%|██████████| 7/7 [00:04<00:00,  1.61it/s]


epoch=18: train_ppl=tensor(4.8039, device='cuda:0') train_epoch_loss=tensor(1.5694, device='cuda:0') eval_ppl=tensor(4.7597, device='cuda:0') eval_epoch_loss=tensor(1.5602, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.47s/it]
100%|██████████| 7/7 [00:04<00:00,  1.53it/s]


epoch=19: train_ppl=tensor(4.6394, device='cuda:0') train_epoch_loss=tensor(1.5346, device='cuda:0') eval_ppl=tensor(5.3169, device='cuda:0') eval_epoch_loss=tensor(1.6709, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.41it/s]


epoch=20: train_ppl=tensor(4.7480, device='cuda:0') train_epoch_loss=tensor(1.5577, device='cuda:0') eval_ppl=tensor(4.7718, device='cuda:0') eval_epoch_loss=tensor(1.5627, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.47s/it]
100%|██████████| 7/7 [00:03<00:00,  1.77it/s]


epoch=21: train_ppl=tensor(5.1500, device='cuda:0') train_epoch_loss=tensor(1.6390, device='cuda:0') eval_ppl=tensor(4.8477, device='cuda:0') eval_epoch_loss=tensor(1.5785, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.44s/it]
100%|██████████| 7/7 [00:05<00:00,  1.35it/s]


epoch=22: train_ppl=tensor(5.9320, device='cuda:0') train_epoch_loss=tensor(1.7804, device='cuda:0') eval_ppl=tensor(5.4078, device='cuda:0') eval_epoch_loss=tensor(1.6878, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.47s/it]
100%|██████████| 7/7 [00:04<00:00,  1.46it/s]


epoch=23: train_ppl=tensor(5.3055, device='cuda:0') train_epoch_loss=tensor(1.6687, device='cuda:0') eval_ppl=tensor(5.3855, device='cuda:0') eval_epoch_loss=tensor(1.6837, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.53s/it]
100%|██████████| 7/7 [00:04<00:00,  1.52it/s]


epoch=24: train_ppl=tensor(4.9551, device='cuda:0') train_epoch_loss=tensor(1.6004, device='cuda:0') eval_ppl=tensor(4.5630, device='cuda:0') eval_epoch_loss=tensor(1.5180, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.51s/it]
100%|██████████| 7/7 [00:04<00:00,  1.48it/s]


epoch=25: train_ppl=tensor(4.8137, device='cuda:0') train_epoch_loss=tensor(1.5715, device='cuda:0') eval_ppl=tensor(6.3486, device='cuda:0') eval_epoch_loss=tensor(1.8482, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.68it/s]


epoch=26: train_ppl=tensor(5.0006, device='cuda:0') train_epoch_loss=tensor(1.6096, device='cuda:0') eval_ppl=tensor(4.6270, device='cuda:0') eval_epoch_loss=tensor(1.5319, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.52s/it]
100%|██████████| 7/7 [00:03<00:00,  1.82it/s]


epoch=27: train_ppl=tensor(4.6597, device='cuda:0') train_epoch_loss=tensor(1.5390, device='cuda:0') eval_ppl=tensor(4.5980, device='cuda:0') eval_epoch_loss=tensor(1.5256, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.43s/it]
100%|██████████| 7/7 [00:04<00:00,  1.45it/s]


epoch=28: train_ppl=tensor(4.3587, device='cuda:0') train_epoch_loss=tensor(1.4722, device='cuda:0') eval_ppl=tensor(4.3045, device='cuda:0') eval_epoch_loss=tensor(1.4597, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.51s/it]
100%|██████████| 7/7 [00:04<00:00,  1.64it/s]


epoch=29: train_ppl=tensor(4.5376, device='cuda:0') train_epoch_loss=tensor(1.5124, device='cuda:0') eval_ppl=tensor(4.4625, device='cuda:0') eval_epoch_loss=tensor(1.4957, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.49s/it]
100%|██████████| 7/7 [00:04<00:00,  1.56it/s]


epoch=30: train_ppl=tensor(4.4709, device='cuda:0') train_epoch_loss=tensor(1.4976, device='cuda:0') eval_ppl=tensor(4.1939, device='cuda:0') eval_epoch_loss=tensor(1.4336, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.46it/s]


epoch=31: train_ppl=tensor(4.5131, device='cuda:0') train_epoch_loss=tensor(1.5070, device='cuda:0') eval_ppl=tensor(4.4524, device='cuda:0') eval_epoch_loss=tensor(1.4935, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.49s/it]
100%|██████████| 7/7 [00:04<00:00,  1.61it/s]


epoch=32: train_ppl=tensor(6.2146, device='cuda:0') train_epoch_loss=tensor(1.8269, device='cuda:0') eval_ppl=tensor(6.1830, device='cuda:0') eval_epoch_loss=tensor(1.8218, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.61s/it]
100%|██████████| 7/7 [00:04<00:00,  1.54it/s]


epoch=33: train_ppl=tensor(5.4666, device='cuda:0') train_epoch_loss=tensor(1.6987, device='cuda:0') eval_ppl=tensor(4.5849, device='cuda:0') eval_epoch_loss=tensor(1.5228, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.47s/it]
100%|██████████| 7/7 [00:05<00:00,  1.36it/s]


epoch=34: train_ppl=tensor(5.4275, device='cuda:0') train_epoch_loss=tensor(1.6915, device='cuda:0') eval_ppl=tensor(4.9841, device='cuda:0') eval_epoch_loss=tensor(1.6062, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.67it/s]


epoch=35: train_ppl=tensor(4.3363, device='cuda:0') train_epoch_loss=tensor(1.4670, device='cuda:0') eval_ppl=tensor(4.6774, device='cuda:0') eval_epoch_loss=tensor(1.5428, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.50s/it]
100%|██████████| 7/7 [00:04<00:00,  1.44it/s]


epoch=36: train_ppl=tensor(4.9734, device='cuda:0') train_epoch_loss=tensor(1.6041, device='cuda:0') eval_ppl=tensor(5.1057, device='cuda:0') eval_epoch_loss=tensor(1.6304, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.46s/it]
100%|██████████| 7/7 [00:04<00:00,  1.60it/s]


epoch=37: train_ppl=tensor(4.9514, device='cuda:0') train_epoch_loss=tensor(1.5997, device='cuda:0') eval_ppl=tensor(6.2263, device='cuda:0') eval_epoch_loss=tensor(1.8288, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.46s/it]
100%|██████████| 7/7 [00:04<00:00,  1.61it/s]


epoch=38: train_ppl=tensor(5.8657, device='cuda:0') train_epoch_loss=tensor(1.7691, device='cuda:0') eval_ppl=tensor(5.6813, device='cuda:0') eval_epoch_loss=tensor(1.7372, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.41s/it]
100%|██████████| 7/7 [00:04<00:00,  1.45it/s]


epoch=39: train_ppl=tensor(5.4912, device='cuda:0') train_epoch_loss=tensor(1.7032, device='cuda:0') eval_ppl=tensor(5.1025, device='cuda:0') eval_epoch_loss=tensor(1.6297, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:03<00:00,  1.77it/s]


epoch=40: train_ppl=tensor(5.1769, device='cuda:0') train_epoch_loss=tensor(1.6442, device='cuda:0') eval_ppl=tensor(5.2572, device='cuda:0') eval_epoch_loss=tensor(1.6596, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.50s/it]
100%|██████████| 7/7 [00:04<00:00,  1.57it/s]


epoch=41: train_ppl=tensor(5.9999, device='cuda:0') train_epoch_loss=tensor(1.7917, device='cuda:0') eval_ppl=tensor(5.6231, device='cuda:0') eval_epoch_loss=tensor(1.7269, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.47s/it]
100%|██████████| 7/7 [00:04<00:00,  1.72it/s]


epoch=42: train_ppl=tensor(6.1190, device='cuda:0') train_epoch_loss=tensor(1.8114, device='cuda:0') eval_ppl=tensor(5.7710, device='cuda:0') eval_epoch_loss=tensor(1.7528, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.46s/it]
100%|██████████| 7/7 [00:04<00:00,  1.58it/s]


epoch=43: train_ppl=tensor(6.1302, device='cuda:0') train_epoch_loss=tensor(1.8132, device='cuda:0') eval_ppl=tensor(4.7781, device='cuda:0') eval_epoch_loss=tensor(1.5640, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.51s/it]
100%|██████████| 7/7 [00:04<00:00,  1.72it/s]


epoch=44: train_ppl=tensor(4.6870, device='cuda:0') train_epoch_loss=tensor(1.5448, device='cuda:0') eval_ppl=tensor(6.4075, device='cuda:0') eval_epoch_loss=tensor(1.8575, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.41s/it]
100%|██████████| 7/7 [00:04<00:00,  1.63it/s]


epoch=45: train_ppl=tensor(5.0556, device='cuda:0') train_epoch_loss=tensor(1.6205, device='cuda:0') eval_ppl=tensor(4.9763, device='cuda:0') eval_epoch_loss=tensor(1.6047, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.70it/s]


epoch=46: train_ppl=tensor(4.4660, device='cuda:0') train_epoch_loss=tensor(1.4965, device='cuda:0') eval_ppl=tensor(4.0125, device='cuda:0') eval_epoch_loss=tensor(1.3894, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.58s/it]
100%|██████████| 7/7 [00:04<00:00,  1.52it/s]


epoch=47: train_ppl=tensor(4.2064, device='cuda:0') train_epoch_loss=tensor(1.4366, device='cuda:0') eval_ppl=tensor(4.1948, device='cuda:0') eval_epoch_loss=tensor(1.4339, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.52s/it]
100%|██████████| 7/7 [00:05<00:00,  1.40it/s]


epoch=48: train_ppl=tensor(4.1184, device='cuda:0') train_epoch_loss=tensor(1.4155, device='cuda:0') eval_ppl=tensor(4.2873, device='cuda:0') eval_epoch_loss=tensor(1.4556, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.50s/it]
100%|██████████| 7/7 [00:04<00:00,  1.57it/s]


epoch=49: train_ppl=tensor(4.8836, device='cuda:0') train_epoch_loss=tensor(1.5859, device='cuda:0') eval_ppl=tensor(5.2454, device='cuda:0') eval_epoch_loss=tensor(1.6573, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.40s/it]
100%|██████████| 7/7 [00:04<00:00,  1.61it/s]


epoch=50: train_ppl=tensor(3.9049, device='cuda:0') train_epoch_loss=tensor(1.3622, device='cuda:0') eval_ppl=tensor(3.7045, device='cuda:0') eval_epoch_loss=tensor(1.3095, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.60it/s]


epoch=51: train_ppl=tensor(3.6509, device='cuda:0') train_epoch_loss=tensor(1.2950, device='cuda:0') eval_ppl=tensor(3.3386, device='cuda:0') eval_epoch_loss=tensor(1.2056, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.33s/it]
100%|██████████| 7/7 [00:04<00:00,  1.57it/s]


epoch=52: train_ppl=tensor(3.1934, device='cuda:0') train_epoch_loss=tensor(1.1611, device='cuda:0') eval_ppl=tensor(3.5021, device='cuda:0') eval_epoch_loss=tensor(1.2534, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.50s/it]
100%|██████████| 7/7 [00:04<00:00,  1.47it/s]


epoch=53: train_ppl=tensor(3.1739, device='cuda:0') train_epoch_loss=tensor(1.1550, device='cuda:0') eval_ppl=tensor(3.8897, device='cuda:0') eval_epoch_loss=tensor(1.3583, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.50s/it]
100%|██████████| 7/7 [00:04<00:00,  1.65it/s]


epoch=54: train_ppl=tensor(2.9979, device='cuda:0') train_epoch_loss=tensor(1.0979, device='cuda:0') eval_ppl=tensor(3.6220, device='cuda:0') eval_epoch_loss=tensor(1.2870, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.56s/it]
100%|██████████| 7/7 [00:04<00:00,  1.59it/s]


epoch=55: train_ppl=tensor(2.6886, device='cuda:0') train_epoch_loss=tensor(0.9890, device='cuda:0') eval_ppl=tensor(2.2668, device='cuda:0') eval_epoch_loss=tensor(0.8184, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.56it/s]


epoch=56: train_ppl=tensor(2.4755, device='cuda:0') train_epoch_loss=tensor(0.9064, device='cuda:0') eval_ppl=tensor(2.2911, device='cuda:0') eval_epoch_loss=tensor(0.8290, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.46s/it]
100%|██████████| 7/7 [00:04<00:00,  1.42it/s]


epoch=57: train_ppl=tensor(2.6490, device='cuda:0') train_epoch_loss=tensor(0.9742, device='cuda:0') eval_ppl=tensor(2.6530, device='cuda:0') eval_epoch_loss=tensor(0.9757, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.50s/it]
100%|██████████| 7/7 [00:04<00:00,  1.53it/s]


epoch=58: train_ppl=tensor(3.1932, device='cuda:0') train_epoch_loss=tensor(1.1610, device='cuda:0') eval_ppl=tensor(3.3214, device='cuda:0') eval_epoch_loss=tensor(1.2004, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:05<00:00,  1.39it/s]


epoch=59: train_ppl=tensor(26.9913, device='cuda:0') train_epoch_loss=tensor(3.2955, device='cuda:0') eval_ppl=tensor(12.3063, device='cuda:0') eval_epoch_loss=tensor(2.5101, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.44s/it]
100%|██████████| 7/7 [00:04<00:00,  1.42it/s]


epoch=60: train_ppl=tensor(10.6606, device='cuda:0') train_epoch_loss=tensor(2.3666, device='cuda:0') eval_ppl=tensor(9.1403, device='cuda:0') eval_epoch_loss=tensor(2.2127, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.37s/it]
100%|██████████| 7/7 [00:04<00:00,  1.56it/s]


epoch=61: train_ppl=tensor(8.2999, device='cuda:0') train_epoch_loss=tensor(2.1162, device='cuda:0') eval_ppl=tensor(6.8725, device='cuda:0') eval_epoch_loss=tensor(1.9275, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.41s/it]
100%|██████████| 7/7 [00:04<00:00,  1.60it/s]


epoch=62: train_ppl=tensor(6.4960, device='cuda:0') train_epoch_loss=tensor(1.8712, device='cuda:0') eval_ppl=tensor(5.7877, device='cuda:0') eval_epoch_loss=tensor(1.7557, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.43s/it]
100%|██████████| 7/7 [00:04<00:00,  1.53it/s]


epoch=63: train_ppl=tensor(5.5613, device='cuda:0') train_epoch_loss=tensor(1.7158, device='cuda:0') eval_ppl=tensor(4.7855, device='cuda:0') eval_epoch_loss=tensor(1.5656, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.31s/it]
100%|██████████| 7/7 [00:04<00:00,  1.63it/s]


epoch=64: train_ppl=tensor(3.7847, device='cuda:0') train_epoch_loss=tensor(1.3310, device='cuda:0') eval_ppl=tensor(3.8232, device='cuda:0') eval_epoch_loss=tensor(1.3411, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.43s/it]
100%|██████████| 7/7 [00:04<00:00,  1.66it/s]


epoch=65: train_ppl=tensor(3.3962, device='cuda:0') train_epoch_loss=tensor(1.2226, device='cuda:0') eval_ppl=tensor(4.8260, device='cuda:0') eval_epoch_loss=tensor(1.5740, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.60it/s]


epoch=66: train_ppl=tensor(3.8934, device='cuda:0') train_epoch_loss=tensor(1.3593, device='cuda:0') eval_ppl=tensor(3.5248, device='cuda:0') eval_epoch_loss=tensor(1.2598, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.47s/it]
100%|██████████| 7/7 [00:04<00:00,  1.49it/s]


epoch=67: train_ppl=tensor(3.6442, device='cuda:0') train_epoch_loss=tensor(1.2931, device='cuda:0') eval_ppl=tensor(3.5831, device='cuda:0') eval_epoch_loss=tensor(1.2762, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.47s/it]
100%|██████████| 7/7 [00:05<00:00,  1.38it/s]


epoch=68: train_ppl=tensor(3.4483, device='cuda:0') train_epoch_loss=tensor(1.2379, device='cuda:0') eval_ppl=tensor(3.8145, device='cuda:0') eval_epoch_loss=tensor(1.3388, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.46s/it]
100%|██████████| 7/7 [00:04<00:00,  1.41it/s]


epoch=69: train_ppl=tensor(3.8284, device='cuda:0') train_epoch_loss=tensor(1.3424, device='cuda:0') eval_ppl=tensor(3.6712, device='cuda:0') eval_epoch_loss=tensor(1.3005, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.51it/s]


epoch=70: train_ppl=tensor(3.1541, device='cuda:0') train_epoch_loss=tensor(1.1487, device='cuda:0') eval_ppl=tensor(4.8301e+14, device='cuda:0') eval_epoch_loss=tensor(33.8111, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.39s/it]
100%|██████████| 7/7 [00:04<00:00,  1.49it/s]


epoch=71: train_ppl=tensor(1.5866e+12, device='cuda:0') train_epoch_loss=tensor(28.0926, device='cuda:0') eval_ppl=tensor(1.2473e+09, device='cuda:0') eval_epoch_loss=tensor(20.9443, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.45s/it]
100%|██████████| 7/7 [00:04<00:00,  1.59it/s]


epoch=72: train_ppl=tensor(36319508., device='cuda:0') train_epoch_loss=tensor(17.4079, device='cuda:0') eval_ppl=tensor(1031608.7500, device='cuda:0') eval_epoch_loss=tensor(13.8466, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.57s/it]
100%|██████████| 7/7 [00:04<00:00,  1.48it/s]


epoch=73: train_ppl=tensor(192373.3750, device='cuda:0') train_epoch_loss=tensor(12.1672, device='cuda:0') eval_ppl=tensor(30076.4199, device='cuda:0') eval_epoch_loss=tensor(10.3115, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.44s/it]
100%|██████████| 7/7 [00:04<00:00,  1.52it/s]


epoch=74: train_ppl=tensor(9749.2637, device='cuda:0') train_epoch_loss=tensor(9.1849, device='cuda:0') eval_ppl=tensor(5167.2437, device='cuda:0') eval_epoch_loss=tensor(8.5501, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.55it/s]


epoch=75: train_ppl=tensor(5957.6050, device='cuda:0') train_epoch_loss=tensor(8.6924, device='cuda:0') eval_ppl=tensor(9904.8037, device='cuda:0') eval_epoch_loss=tensor(9.2008, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.49s/it]
100%|██████████| 7/7 [00:04<00:00,  1.50it/s]


epoch=76: train_ppl=tensor(6389.8403, device='cuda:0') train_epoch_loss=tensor(8.7625, device='cuda:0') eval_ppl=tensor(4273.9570, device='cuda:0') eval_epoch_loss=tensor(8.3603, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.52s/it]
100%|██████████| 7/7 [00:04<00:00,  1.56it/s]


epoch=77: train_ppl=tensor(2778.2258, device='cuda:0') train_epoch_loss=tensor(7.9296, device='cuda:0') eval_ppl=tensor(1921.2511, device='cuda:0') eval_epoch_loss=tensor(7.5607, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.50s/it]
100%|██████████| 7/7 [00:04<00:00,  1.61it/s]


epoch=78: train_ppl=tensor(2006.3694, device='cuda:0') train_epoch_loss=tensor(7.6041, device='cuda:0') eval_ppl=tensor(1310.5887, device='cuda:0') eval_epoch_loss=tensor(7.1782, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.53s/it]
100%|██████████| 7/7 [00:04<00:00,  1.51it/s]


epoch=79: train_ppl=tensor(1960.7566, device='cuda:0') train_epoch_loss=tensor(7.5811, device='cuda:0') eval_ppl=tensor(1250.5719, device='cuda:0') eval_epoch_loss=tensor(7.1314, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.46s/it]
100%|██████████| 7/7 [00:04<00:00,  1.55it/s]


epoch=80: train_ppl=tensor(1560.7786, device='cuda:0') train_epoch_loss=tensor(7.3529, device='cuda:0') eval_ppl=tensor(893.8795, device='cuda:0') eval_epoch_loss=tensor(6.7956, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.50s/it]
100%|██████████| 7/7 [00:04<00:00,  1.42it/s]


epoch=81: train_ppl=tensor(832.9060, device='cuda:0') train_epoch_loss=tensor(6.7249, device='cuda:0') eval_ppl=tensor(757.5599, device='cuda:0') eval_epoch_loss=tensor(6.6301, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.52it/s]


epoch=82: train_ppl=tensor(725.0619, device='cuda:0') train_epoch_loss=tensor(6.5863, device='cuda:0') eval_ppl=tensor(797.9865, device='cuda:0') eval_epoch_loss=tensor(6.6821, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.46s/it]
100%|██████████| 7/7 [00:04<00:00,  1.68it/s]


epoch=83: train_ppl=tensor(698.9012, device='cuda:0') train_epoch_loss=tensor(6.5495, device='cuda:0') eval_ppl=tensor(743.6677, device='cuda:0') eval_epoch_loss=tensor(6.6116, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.60it/s]


epoch=84: train_ppl=tensor(1413.1799, device='cuda:0') train_epoch_loss=tensor(7.2536, device='cuda:0') eval_ppl=tensor(1051.6860, device='cuda:0') eval_epoch_loss=tensor(6.9581, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.45s/it]
100%|██████████| 7/7 [00:04<00:00,  1.57it/s]


epoch=85: train_ppl=tensor(1607.7856, device='cuda:0') train_epoch_loss=tensor(7.3826, device='cuda:0') eval_ppl=tensor(462.8915, device='cuda:0') eval_epoch_loss=tensor(6.1375, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.49s/it]
100%|██████████| 7/7 [00:04<00:00,  1.52it/s]


epoch=86: train_ppl=tensor(451.7462, device='cuda:0') train_epoch_loss=tensor(6.1131, device='cuda:0') eval_ppl=tensor(309.7563, device='cuda:0') eval_epoch_loss=tensor(5.7358, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.41s/it]
100%|██████████| 7/7 [00:04<00:00,  1.53it/s]


epoch=87: train_ppl=tensor(21381.9141, device='cuda:0') train_epoch_loss=tensor(9.9703, device='cuda:0') eval_ppl=tensor(1111.9598, device='cuda:0') eval_epoch_loss=tensor(7.0139, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.49s/it]
100%|██████████| 7/7 [00:04<00:00,  1.57it/s]


epoch=88: train_ppl=tensor(2620.5852, device='cuda:0') train_epoch_loss=tensor(7.8712, device='cuda:0') eval_ppl=tensor(251481.4375, device='cuda:0') eval_epoch_loss=tensor(12.4351, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.48it/s]


epoch=89: train_ppl=tensor(14316.0107, device='cuda:0') train_epoch_loss=tensor(9.5691, device='cuda:0') eval_ppl=tensor(2109.6038, device='cuda:0') eval_epoch_loss=tensor(7.6543, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.58s/it]
100%|██████████| 7/7 [00:06<00:00,  1.11it/s]


epoch=90: train_ppl=tensor(2041.8296, device='cuda:0') train_epoch_loss=tensor(7.6216, device='cuda:0') eval_ppl=tensor(1607.7979, device='cuda:0') eval_epoch_loss=tensor(7.3826, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.47s/it]
100%|██████████| 7/7 [00:04<00:00,  1.57it/s]


epoch=91: train_ppl=tensor(1269.8362, device='cuda:0') train_epoch_loss=tensor(7.1466, device='cuda:0') eval_ppl=tensor(1150.0791, device='cuda:0') eval_epoch_loss=tensor(7.0476, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.47s/it]
100%|██████████| 7/7 [00:04<00:00,  1.62it/s]


epoch=92: train_ppl=tensor(853.0311, device='cuda:0') train_epoch_loss=tensor(6.7488, device='cuda:0') eval_ppl=tensor(862.1588, device='cuda:0') eval_epoch_loss=tensor(6.7594, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.42s/it]
100%|██████████| 7/7 [00:04<00:00,  1.55it/s]


epoch=93: train_ppl=tensor(832.2104, device='cuda:0') train_epoch_loss=tensor(6.7241, device='cuda:0') eval_ppl=tensor(747.7632, device='cuda:0') eval_epoch_loss=tensor(6.6171, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.50s/it]
100%|██████████| 7/7 [00:04<00:00,  1.55it/s]


epoch=94: train_ppl=tensor(818.4894, device='cuda:0') train_epoch_loss=tensor(6.7075, device='cuda:0') eval_ppl=tensor(605.9000, device='cuda:0') eval_epoch_loss=tensor(6.4067, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.48s/it]
100%|██████████| 7/7 [00:04<00:00,  1.56it/s]


epoch=95: train_ppl=tensor(425.6521, device='cuda:0') train_epoch_loss=tensor(6.0536, device='cuda:0') eval_ppl=tensor(501.3121, device='cuda:0') eval_epoch_loss=tensor(6.2172, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.52s/it]
100%|██████████| 7/7 [00:04<00:00,  1.70it/s]


epoch=96: train_ppl=tensor(416.8454, device='cuda:0') train_epoch_loss=tensor(6.0327, device='cuda:0') eval_ppl=tensor(357.4402, device='cuda:0') eval_epoch_loss=tensor(5.8790, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.51s/it]
100%|██████████| 7/7 [00:04<00:00,  1.48it/s]


epoch=97: train_ppl=tensor(330.0089, device='cuda:0') train_epoch_loss=tensor(5.7991, device='cuda:0') eval_ppl=tensor(356.8124, device='cuda:0') eval_epoch_loss=tensor(5.8772, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.65s/it]
100%|██████████| 7/7 [00:05<00:00,  1.28it/s]


epoch=98: train_ppl=tensor(326.6364, device='cuda:0') train_epoch_loss=tensor(5.7888, device='cuda:0') eval_ppl=tensor(351.0346, device='cuda:0') eval_epoch_loss=tensor(5.8609, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.64s/it]
100%|██████████| 7/7 [00:04<00:00,  1.51it/s]


epoch=99: train_ppl=tensor(282.8800, device='cuda:0') train_epoch_loss=tensor(5.6450, device='cuda:0') eval_ppl=tensor(292.4944, device='cuda:0') eval_epoch_loss=tensor(5.6784, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.63s/it]
100%|██████████| 7/7 [00:05<00:00,  1.21it/s]


epoch=100: train_ppl=tensor(194.3111, device='cuda:0') train_epoch_loss=tensor(5.2695, device='cuda:0') eval_ppl=tensor(290.1393, device='cuda:0') eval_epoch_loss=tensor(5.6704, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.40s/it]
100%|██████████| 7/7 [00:04<00:00,  1.57it/s]


epoch=101: train_ppl=tensor(245.5312, device='cuda:0') train_epoch_loss=tensor(5.5034, device='cuda:0') eval_ppl=tensor(288.6603, device='cuda:0') eval_epoch_loss=tensor(5.6653, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.50s/it]
100%|██████████| 7/7 [00:04<00:00,  1.52it/s]


epoch=102: train_ppl=tensor(288.0820, device='cuda:0') train_epoch_loss=tensor(5.6632, device='cuda:0') eval_ppl=tensor(260.9435, device='cuda:0') eval_epoch_loss=tensor(5.5643, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.56s/it]
100%|██████████| 7/7 [00:04<00:00,  1.65it/s]


epoch=103: train_ppl=tensor(195.6709, device='cuda:0') train_epoch_loss=tensor(5.2764, device='cuda:0') eval_ppl=tensor(296.5519, device='cuda:0') eval_epoch_loss=tensor(5.6922, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.54s/it]
100%|██████████| 7/7 [00:04<00:00,  1.55it/s]


epoch=104: train_ppl=tensor(306.8519, device='cuda:0') train_epoch_loss=tensor(5.7264, device='cuda:0') eval_ppl=tensor(462.5837, device='cuda:0') eval_epoch_loss=tensor(6.1368, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.35s/it]
100%|██████████| 7/7 [00:04<00:00,  1.64it/s]


epoch=105: train_ppl=tensor(341.3700, device='cuda:0') train_epoch_loss=tensor(5.8330, device='cuda:0') eval_ppl=tensor(434.4532, device='cuda:0') eval_epoch_loss=tensor(6.0741, device='cuda:0')


100%|██████████| 7/7 [1:03:42<00:00, 546.03s/it] 
100%|██████████| 7/7 [00:48<00:00,  6.87s/it]


epoch=106: train_ppl=tensor(230.8731, device='cuda:0') train_epoch_loss=tensor(5.4419, device='cuda:0') eval_ppl=tensor(407.2453, device='cuda:0') eval_epoch_loss=tensor(6.0094, device='cuda:0')


100%|██████████| 7/7 [1:13:23<00:00, 629.11s/it]
100%|██████████| 7/7 [00:21<00:00,  3.03s/it]


epoch=107: train_ppl=tensor(327.7372, device='cuda:0') train_epoch_loss=tensor(5.7922, device='cuda:0') eval_ppl=tensor(424.4761, device='cuda:0') eval_epoch_loss=tensor(6.0509, device='cuda:0')


100%|██████████| 7/7 [57:28<00:00, 492.69s/it]
100%|██████████| 7/7 [01:15<00:00, 10.84s/it]


epoch=108: train_ppl=tensor(264.3441, device='cuda:0') train_epoch_loss=tensor(5.5773, device='cuda:0') eval_ppl=tensor(187.5572, device='cuda:0') eval_epoch_loss=tensor(5.2341, device='cuda:0')


100%|██████████| 7/7 [1:28:26<00:00, 758.05s/it]
100%|██████████| 7/7 [00:55<00:00,  7.91s/it]


epoch=109: train_ppl=tensor(156.6410, device='cuda:0') train_epoch_loss=tensor(5.0540, device='cuda:0') eval_ppl=tensor(182.8863, device='cuda:0') eval_epoch_loss=tensor(5.2089, device='cuda:0')


100%|██████████| 7/7 [1:17:22<00:00, 663.15s/it] 
100%|██████████| 7/7 [00:41<00:00,  5.91s/it]


epoch=110: train_ppl=tensor(146.5101, device='cuda:0') train_epoch_loss=tensor(4.9871, device='cuda:0') eval_ppl=tensor(172.6053, device='cuda:0') eval_epoch_loss=tensor(5.1510, device='cuda:0')


100%|██████████| 7/7 [1:06:11<00:00, 567.43s/it]
100%|██████████| 7/7 [00:34<00:00,  4.90s/it]


epoch=111: train_ppl=tensor(118.3200, device='cuda:0') train_epoch_loss=tensor(4.7734, device='cuda:0') eval_ppl=tensor(169.8196, device='cuda:0') eval_epoch_loss=tensor(5.1347, device='cuda:0')


100%|██████████| 7/7 [1:10:44<00:00, 606.41s/it] 
100%|██████████| 7/7 [00:43<00:00,  6.16s/it]


epoch=112: train_ppl=tensor(128.8045, device='cuda:0') train_epoch_loss=tensor(4.8583, device='cuda:0') eval_ppl=tensor(171.2877, device='cuda:0') eval_epoch_loss=tensor(5.1433, device='cuda:0')


100%|██████████| 7/7 [1:19:16<00:00, 679.46s/it] 
100%|██████████| 7/7 [01:10<00:00, 10.14s/it]


epoch=113: train_ppl=tensor(105.5585, device='cuda:0') train_epoch_loss=tensor(4.6593, device='cuda:0') eval_ppl=tensor(143.7852, device='cuda:0') eval_epoch_loss=tensor(4.9683, device='cuda:0')


100%|██████████| 7/7 [56:09<00:00, 481.29s/it]   
100%|██████████| 7/7 [00:31<00:00,  4.45s/it]


epoch=114: train_ppl=tensor(116.1653, device='cuda:0') train_epoch_loss=tensor(4.7550, device='cuda:0') eval_ppl=tensor(131.2563, device='cuda:0') eval_epoch_loss=tensor(4.8772, device='cuda:0')


100%|██████████| 7/7 [1:23:43<00:00, 717.65s/it] 
100%|██████████| 7/7 [00:39<00:00,  5.61s/it]


epoch=115: train_ppl=tensor(104.4153, device='cuda:0') train_epoch_loss=tensor(4.6484, device='cuda:0') eval_ppl=tensor(125.1445, device='cuda:0') eval_epoch_loss=tensor(4.8295, device='cuda:0')


100%|██████████| 7/7 [1:12:53<00:00, 624.85s/it] 
100%|██████████| 7/7 [01:16<00:00, 10.96s/it]


epoch=116: train_ppl=tensor(96.5783, device='cuda:0') train_epoch_loss=tensor(4.5704, device='cuda:0') eval_ppl=tensor(108.7549, device='cuda:0') eval_epoch_loss=tensor(4.6891, device='cuda:0')


100%|██████████| 7/7 [1:20:29<00:00, 689.98s/it] 
100%|██████████| 7/7 [00:51<00:00,  7.30s/it]


epoch=117: train_ppl=tensor(84.0641, device='cuda:0') train_epoch_loss=tensor(4.4316, device='cuda:0') eval_ppl=tensor(164.1669, device='cuda:0') eval_epoch_loss=tensor(5.1009, device='cuda:0')


100%|██████████| 7/7 [1:07:06<00:00, 575.26s/it]
100%|██████████| 7/7 [00:28<00:00,  4.07s/it]


epoch=118: train_ppl=tensor(91.2698, device='cuda:0') train_epoch_loss=tensor(4.5138, device='cuda:0') eval_ppl=tensor(117.1255, device='cuda:0') eval_epoch_loss=tensor(4.7632, device='cuda:0')


100%|██████████| 7/7 [1:19:54<00:00, 684.96s/it] 
100%|██████████| 7/7 [00:39<00:00,  5.69s/it]


epoch=119: train_ppl=tensor(88.0853, device='cuda:0') train_epoch_loss=tensor(4.4783, device='cuda:0') eval_ppl=tensor(115.5842, device='cuda:0') eval_epoch_loss=tensor(4.7500, device='cuda:0')


100%|██████████| 7/7 [1:10:44<00:00, 606.39s/it]
100%|██████████| 7/7 [00:59<00:00,  8.46s/it]


epoch=120: train_ppl=tensor(82.2141, device='cuda:0') train_epoch_loss=tensor(4.4093, device='cuda:0') eval_ppl=tensor(94.4162, device='cuda:0') eval_epoch_loss=tensor(4.5477, device='cuda:0')


100%|██████████| 7/7 [1:02:33<00:00, 536.27s/it] 
100%|██████████| 7/7 [00:38<00:00,  5.45s/it]


epoch=121: train_ppl=tensor(76.4990, device='cuda:0') train_epoch_loss=tensor(4.3373, device='cuda:0') eval_ppl=tensor(90.6872, device='cuda:0') eval_epoch_loss=tensor(4.5074, device='cuda:0')


100%|██████████| 7/7 [48:46<00:00, 418.13s/it]
100%|██████████| 7/7 [00:43<00:00,  6.16s/it]


epoch=122: train_ppl=tensor(54.8191, device='cuda:0') train_epoch_loss=tensor(4.0040, device='cuda:0') eval_ppl=tensor(86.9493, device='cuda:0') eval_epoch_loss=tensor(4.4653, device='cuda:0')


100%|██████████| 7/7 [55:35<00:00, 476.56s/it]  
100%|██████████| 7/7 [01:11<00:00, 10.16s/it]


epoch=123: train_ppl=tensor(52.6746, device='cuda:0') train_epoch_loss=tensor(3.9641, device='cuda:0') eval_ppl=tensor(90.1590, device='cuda:0') eval_epoch_loss=tensor(4.5016, device='cuda:0')


100%|██████████| 7/7 [1:15:40<00:00, 648.59s/it] 
100%|██████████| 7/7 [00:05<00:00,  1.27it/s]


epoch=124: train_ppl=tensor(43.6062, device='cuda:0') train_epoch_loss=tensor(3.7752, device='cuda:0') eval_ppl=tensor(71.1353, device='cuda:0') eval_epoch_loss=tensor(4.2646, device='cuda:0')


100%|██████████| 7/7 [00:08<00:00,  1.18s/it]
100%|██████████| 7/7 [00:04<00:00,  1.44it/s]


epoch=125: train_ppl=tensor(52.7181, device='cuda:0') train_epoch_loss=tensor(3.9650, device='cuda:0') eval_ppl=tensor(67.1254, device='cuda:0') eval_epoch_loss=tensor(4.2066, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.49s/it]
100%|██████████| 7/7 [00:05<00:00,  1.34it/s]


epoch=126: train_ppl=tensor(44.5158, device='cuda:0') train_epoch_loss=tensor(3.7958, device='cuda:0') eval_ppl=tensor(55.9668, device='cuda:0') eval_epoch_loss=tensor(4.0248, device='cuda:0')


100%|██████████| 7/7 [00:10<00:00,  1.53s/it]
100%|██████████| 7/7 [00:04<00:00,  1.44it/s]


epoch=127: train_ppl=tensor(41.6829, device='cuda:0') train_epoch_loss=tensor(3.7301, device='cuda:0') eval_ppl=tensor(47.1066, device='cuda:0') eval_epoch_loss=tensor(3.8524, device='cuda:0')


100%|██████████| 7/7 [00:11<00:00,  1.57s/it]
100%|██████████| 7/7 [00:16<00:00,  2.43s/it]


epoch=128: train_ppl=tensor(26.5127, device='cuda:0') train_epoch_loss=tensor(3.2776, device='cuda:0') eval_ppl=tensor(46.7074, device='cuda:0') eval_epoch_loss=tensor(3.8439, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.73it/s]
100%|██████████| 7/7 [00:01<00:00,  5.95it/s]


epoch=129: train_ppl=tensor(26.3599, device='cuda:0') train_epoch_loss=tensor(3.2718, device='cuda:0') eval_ppl=tensor(44.5336, device='cuda:0') eval_epoch_loss=tensor(3.7962, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.33it/s]
100%|██████████| 7/7 [00:01<00:00,  4.43it/s]


epoch=130: train_ppl=tensor(32.4588, device='cuda:0') train_epoch_loss=tensor(3.4800, device='cuda:0') eval_ppl=tensor(41.8781, device='cuda:0') eval_epoch_loss=tensor(3.7348, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.98it/s]
100%|██████████| 7/7 [00:01<00:00,  4.83it/s]


epoch=131: train_ppl=tensor(21.1635, device='cuda:0') train_epoch_loss=tensor(3.0523, device='cuda:0') eval_ppl=tensor(39.5001, device='cuda:0') eval_epoch_loss=tensor(3.6763, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.80it/s]
100%|██████████| 7/7 [00:01<00:00,  3.89it/s]


epoch=132: train_ppl=tensor(20.6082, device='cuda:0') train_epoch_loss=tensor(3.0257, device='cuda:0') eval_ppl=tensor(35.3787, device='cuda:0') eval_epoch_loss=tensor(3.5661, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.54it/s]
100%|██████████| 7/7 [00:01<00:00,  4.92it/s]


epoch=133: train_ppl=tensor(27.4504, device='cuda:0') train_epoch_loss=tensor(3.3124, device='cuda:0') eval_ppl=tensor(33.0193, device='cuda:0') eval_epoch_loss=tensor(3.4971, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.66it/s]
100%|██████████| 7/7 [00:01<00:00,  5.97it/s]


epoch=134: train_ppl=tensor(31.2349, device='cuda:0') train_epoch_loss=tensor(3.4415, device='cuda:0') eval_ppl=tensor(32.3861, device='cuda:0') eval_epoch_loss=tensor(3.4777, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.59it/s]
100%|██████████| 7/7 [00:01<00:00,  4.29it/s]


epoch=135: train_ppl=tensor(24.6361, device='cuda:0') train_epoch_loss=tensor(3.2042, device='cuda:0') eval_ppl=tensor(31.4180, device='cuda:0') eval_epoch_loss=tensor(3.4474, device='cuda:0')


100%|██████████| 7/7 [00:05<00:00,  1.32it/s]
100%|██████████| 7/7 [00:01<00:00,  5.15it/s]


epoch=136: train_ppl=tensor(24.6455, device='cuda:0') train_epoch_loss=tensor(3.2046, device='cuda:0') eval_ppl=tensor(29.9343, device='cuda:0') eval_epoch_loss=tensor(3.3990, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.47it/s]
100%|██████████| 7/7 [00:01<00:00,  5.20it/s]


epoch=137: train_ppl=tensor(23.5283, device='cuda:0') train_epoch_loss=tensor(3.1582, device='cuda:0') eval_ppl=tensor(29.1420, device='cuda:0') eval_epoch_loss=tensor(3.3722, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.31it/s]
100%|██████████| 7/7 [00:01<00:00,  5.75it/s]


epoch=138: train_ppl=tensor(21.4466, device='cuda:0') train_epoch_loss=tensor(3.0656, device='cuda:0') eval_ppl=tensor(27.8836, device='cuda:0') eval_epoch_loss=tensor(3.3280, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.61it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=139: train_ppl=tensor(16.8838, device='cuda:0') train_epoch_loss=tensor(2.8264, device='cuda:0') eval_ppl=tensor(27.8093, device='cuda:0') eval_epoch_loss=tensor(3.3254, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.61it/s]
100%|██████████| 7/7 [00:01<00:00,  5.62it/s]


epoch=140: train_ppl=tensor(18.6088, device='cuda:0') train_epoch_loss=tensor(2.9236, device='cuda:0') eval_ppl=tensor(30.9546, device='cuda:0') eval_epoch_loss=tensor(3.4325, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.57it/s]
100%|██████████| 7/7 [00:01<00:00,  5.44it/s]


epoch=141: train_ppl=tensor(22.2948, device='cuda:0') train_epoch_loss=tensor(3.1044, device='cuda:0') eval_ppl=tensor(27.0198, device='cuda:0') eval_epoch_loss=tensor(3.2966, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.50it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=142: train_ppl=tensor(23.4757, device='cuda:0') train_epoch_loss=tensor(3.1560, device='cuda:0') eval_ppl=tensor(25.7007, device='cuda:0') eval_epoch_loss=tensor(3.2465, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=143: train_ppl=tensor(15.6382, device='cuda:0') train_epoch_loss=tensor(2.7497, device='cuda:0') eval_ppl=tensor(23.8874, device='cuda:0') eval_epoch_loss=tensor(3.1734, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.66it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=144: train_ppl=tensor(18.6753, device='cuda:0') train_epoch_loss=tensor(2.9272, device='cuda:0') eval_ppl=tensor(23.4970, device='cuda:0') eval_epoch_loss=tensor(3.1569, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.38it/s]


epoch=145: train_ppl=tensor(18.6349, device='cuda:0') train_epoch_loss=tensor(2.9250, device='cuda:0') eval_ppl=tensor(26.1426, device='cuda:0') eval_epoch_loss=tensor(3.2636, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.53it/s]
100%|██████████| 7/7 [00:01<00:00,  5.21it/s]


epoch=146: train_ppl=tensor(20.2580, device='cuda:0') train_epoch_loss=tensor(3.0085, device='cuda:0') eval_ppl=tensor(25.0693, device='cuda:0') eval_epoch_loss=tensor(3.2216, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.36it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=147: train_ppl=tensor(14.5909, device='cuda:0') train_epoch_loss=tensor(2.6804, device='cuda:0') eval_ppl=tensor(22.0161, device='cuda:0') eval_epoch_loss=tensor(3.0918, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.91it/s]
100%|██████████| 7/7 [00:04<00:00,  1.53it/s]


epoch=148: train_ppl=tensor(17.2339, device='cuda:0') train_epoch_loss=tensor(2.8469, device='cuda:0') eval_ppl=tensor(20.8976, device='cuda:0') eval_epoch_loss=tensor(3.0396, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.55it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=149: train_ppl=tensor(17.3833, device='cuda:0') train_epoch_loss=tensor(2.8555, device='cuda:0') eval_ppl=tensor(19.6981, device='cuda:0') eval_epoch_loss=tensor(2.9805, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=150: train_ppl=tensor(14.6203, device='cuda:0') train_epoch_loss=tensor(2.6824, device='cuda:0') eval_ppl=tensor(17.9491, device='cuda:0') eval_epoch_loss=tensor(2.8875, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.53it/s]
100%|██████████| 7/7 [00:01<00:00,  4.83it/s]


epoch=151: train_ppl=tensor(14.3766, device='cuda:0') train_epoch_loss=tensor(2.6656, device='cuda:0') eval_ppl=tensor(15.8578, device='cuda:0') eval_epoch_loss=tensor(2.7637, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.47it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=152: train_ppl=tensor(14.8325, device='cuda:0') train_epoch_loss=tensor(2.6968, device='cuda:0') eval_ppl=tensor(17.5758, device='cuda:0') eval_epoch_loss=tensor(2.8665, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=153: train_ppl=tensor(13.9416, device='cuda:0') train_epoch_loss=tensor(2.6349, device='cuda:0') eval_ppl=tensor(15.6263, device='cuda:0') eval_epoch_loss=tensor(2.7490, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=154: train_ppl=tensor(10.5515, device='cuda:0') train_epoch_loss=tensor(2.3563, device='cuda:0') eval_ppl=tensor(14.6163, device='cuda:0') eval_epoch_loss=tensor(2.6821, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.81it/s]


epoch=155: train_ppl=tensor(12.2156, device='cuda:0') train_epoch_loss=tensor(2.5027, device='cuda:0') eval_ppl=tensor(14.0427, device='cuda:0') eval_epoch_loss=tensor(2.6421, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=156: train_ppl=tensor(9.8010, device='cuda:0') train_epoch_loss=tensor(2.2825, device='cuda:0') eval_ppl=tensor(13.5657, device='cuda:0') eval_epoch_loss=tensor(2.6075, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.95it/s]


epoch=157: train_ppl=tensor(12.0428, device='cuda:0') train_epoch_loss=tensor(2.4885, device='cuda:0') eval_ppl=tensor(14.2585, device='cuda:0') eval_epoch_loss=tensor(2.6574, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  4.48it/s]


epoch=158: train_ppl=tensor(15.4568, device='cuda:0') train_epoch_loss=tensor(2.7380, device='cuda:0') eval_ppl=tensor(13.0819, device='cuda:0') eval_epoch_loss=tensor(2.5712, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.87it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=159: train_ppl=tensor(13.4884, device='cuda:0') train_epoch_loss=tensor(2.6018, device='cuda:0') eval_ppl=tensor(13.1807, device='cuda:0') eval_epoch_loss=tensor(2.5788, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=160: train_ppl=tensor(10.7734, device='cuda:0') train_epoch_loss=tensor(2.3771, device='cuda:0') eval_ppl=tensor(12.2050, device='cuda:0') eval_epoch_loss=tensor(2.5018, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=161: train_ppl=tensor(4729.7832, device='cuda:0') train_epoch_loss=tensor(8.4616, device='cuda:0') eval_ppl=tensor(1.1145e+08, device='cuda:0') eval_epoch_loss=tensor(18.5291, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=162: train_ppl=tensor(2886703., device='cuda:0') train_epoch_loss=tensor(14.8756, device='cuda:0') eval_ppl=tensor(3717.2678, device='cuda:0') eval_epoch_loss=tensor(8.2207, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.84it/s]


epoch=163: train_ppl=tensor(8744.9775, device='cuda:0') train_epoch_loss=tensor(9.0762, device='cuda:0') eval_ppl=tensor(7837.7812, device='cuda:0') eval_epoch_loss=tensor(8.9667, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=164: train_ppl=tensor(5638.1538, device='cuda:0') train_epoch_loss=tensor(8.6373, device='cuda:0') eval_ppl=tensor(2123.6399, device='cuda:0') eval_epoch_loss=tensor(7.6609, device='cuda:0')


100%|██████████| 7/7 [00:05<00:00,  1.35it/s]
100%|██████████| 7/7 [00:02<00:00,  2.86it/s]


epoch=165: train_ppl=tensor(1618.7377, device='cuda:0') train_epoch_loss=tensor(7.3894, device='cuda:0') eval_ppl=tensor(956.0134, device='cuda:0') eval_epoch_loss=tensor(6.8628, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.23it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=166: train_ppl=tensor(680.7069, device='cuda:0') train_epoch_loss=tensor(6.5231, device='cuda:0') eval_ppl=tensor(406.7025, device='cuda:0') eval_epoch_loss=tensor(6.0081, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.65it/s]
100%|██████████| 7/7 [00:01<00:00,  5.96it/s]


epoch=167: train_ppl=tensor(311.6686, device='cuda:0') train_epoch_loss=tensor(5.7419, device='cuda:0') eval_ppl=tensor(215.5875, device='cuda:0') eval_epoch_loss=tensor(5.3734, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.97it/s]


epoch=168: train_ppl=tensor(183.7753, device='cuda:0') train_epoch_loss=tensor(5.2137, device='cuda:0') eval_ppl=tensor(141.6619, device='cuda:0') eval_epoch_loss=tensor(4.9534, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.64it/s]
100%|██████████| 7/7 [00:01<00:00,  5.96it/s]


epoch=169: train_ppl=tensor(120.4049, device='cuda:0') train_epoch_loss=tensor(4.7909, device='cuda:0') eval_ppl=tensor(107.0402, device='cuda:0') eval_epoch_loss=tensor(4.6732, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.98it/s]
100%|██████████| 7/7 [00:01<00:00,  5.25it/s]


epoch=170: train_ppl=tensor(86.3314, device='cuda:0') train_epoch_loss=tensor(4.4582, device='cuda:0') eval_ppl=tensor(78.4089, device='cuda:0') eval_epoch_loss=tensor(4.3619, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.47it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=171: train_ppl=tensor(69.0507, device='cuda:0') train_epoch_loss=tensor(4.2348, device='cuda:0') eval_ppl=tensor(59.6059, device='cuda:0') eval_epoch_loss=tensor(4.0878, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.36it/s]
100%|██████████| 7/7 [00:01<00:00,  4.94it/s]


epoch=172: train_ppl=tensor(49.0972, device='cuda:0') train_epoch_loss=tensor(3.8938, device='cuda:0') eval_ppl=tensor(47.1028, device='cuda:0') eval_epoch_loss=tensor(3.8523, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.54it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=173: train_ppl=tensor(40.9924, device='cuda:0') train_epoch_loss=tensor(3.7134, device='cuda:0') eval_ppl=tensor(37.9117, device='cuda:0') eval_epoch_loss=tensor(3.6353, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.63it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=174: train_ppl=tensor(32.0560, device='cuda:0') train_epoch_loss=tensor(3.4675, device='cuda:0') eval_ppl=tensor(31.0182, device='cuda:0') eval_epoch_loss=tensor(3.4346, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.66it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=175: train_ppl=tensor(28.4849, device='cuda:0') train_epoch_loss=tensor(3.3494, device='cuda:0') eval_ppl=tensor(24.8487, device='cuda:0') eval_epoch_loss=tensor(3.2128, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.71it/s]
100%|██████████| 7/7 [00:01<00:00,  5.74it/s]


epoch=176: train_ppl=tensor(20.2302, device='cuda:0') train_epoch_loss=tensor(3.0072, device='cuda:0') eval_ppl=tensor(17.4857, device='cuda:0') eval_epoch_loss=tensor(2.8614, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=177: train_ppl=tensor(17.8069, device='cuda:0') train_epoch_loss=tensor(2.8796, device='cuda:0') eval_ppl=tensor(14.6701, device='cuda:0') eval_epoch_loss=tensor(2.6858, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.65it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=178: train_ppl=tensor(14.6483, device='cuda:0') train_epoch_loss=tensor(2.6843, device='cuda:0') eval_ppl=tensor(12.1484, device='cuda:0') eval_epoch_loss=tensor(2.4972, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.66it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=179: train_ppl=tensor(11.8603, device='cuda:0') train_epoch_loss=tensor(2.4732, device='cuda:0') eval_ppl=tensor(11.4125, device='cuda:0') eval_epoch_loss=tensor(2.4347, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.65it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=180: train_ppl=tensor(10.1906, device='cuda:0') train_epoch_loss=tensor(2.3215, device='cuda:0') eval_ppl=tensor(9.7964, device='cuda:0') eval_epoch_loss=tensor(2.2820, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.49it/s]
100%|██████████| 7/7 [00:01<00:00,  5.49it/s]


epoch=181: train_ppl=tensor(8.8372, device='cuda:0') train_epoch_loss=tensor(2.1790, device='cuda:0') eval_ppl=tensor(8.8149, device='cuda:0') eval_epoch_loss=tensor(2.1764, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=182: train_ppl=tensor(8.2998, device='cuda:0') train_epoch_loss=tensor(2.1162, device='cuda:0') eval_ppl=tensor(8.1098, device='cuda:0') eval_epoch_loss=tensor(2.0931, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=183: train_ppl=tensor(7.1051, device='cuda:0') train_epoch_loss=tensor(1.9608, device='cuda:0') eval_ppl=tensor(7.0391, device='cuda:0') eval_epoch_loss=tensor(1.9515, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=184: train_ppl=tensor(6.5367, device='cuda:0') train_epoch_loss=tensor(1.8774, device='cuda:0') eval_ppl=tensor(6.7596, device='cuda:0') eval_epoch_loss=tensor(1.9110, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=185: train_ppl=tensor(6.1194, device='cuda:0') train_epoch_loss=tensor(1.8115, device='cuda:0') eval_ppl=tensor(6.3233, device='cuda:0') eval_epoch_loss=tensor(1.8442, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=186: train_ppl=tensor(5.9053, device='cuda:0') train_epoch_loss=tensor(1.7758, device='cuda:0') eval_ppl=tensor(6.0458, device='cuda:0') eval_epoch_loss=tensor(1.7994, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.82it/s]


epoch=187: train_ppl=tensor(5.8165, device='cuda:0') train_epoch_loss=tensor(1.7607, device='cuda:0') eval_ppl=tensor(5.6975, device='cuda:0') eval_epoch_loss=tensor(1.7400, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=188: train_ppl=tensor(5.7422, device='cuda:0') train_epoch_loss=tensor(1.7478, device='cuda:0') eval_ppl=tensor(5.5942, device='cuda:0') eval_epoch_loss=tensor(1.7217, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=189: train_ppl=tensor(5.4696, device='cuda:0') train_epoch_loss=tensor(1.6992, device='cuda:0') eval_ppl=tensor(5.6377, device='cuda:0') eval_epoch_loss=tensor(1.7295, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.95it/s]


epoch=190: train_ppl=tensor(5.7363, device='cuda:0') train_epoch_loss=tensor(1.7468, device='cuda:0') eval_ppl=tensor(5.8146, device='cuda:0') eval_epoch_loss=tensor(1.7604, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.76it/s]


epoch=191: train_ppl=tensor(5.5163, device='cuda:0') train_epoch_loss=tensor(1.7077, device='cuda:0') eval_ppl=tensor(5.3265, device='cuda:0') eval_epoch_loss=tensor(1.6727, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.62it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=192: train_ppl=tensor(5.2055, device='cuda:0') train_epoch_loss=tensor(1.6497, device='cuda:0') eval_ppl=tensor(5.4654, device='cuda:0') eval_epoch_loss=tensor(1.6984, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.43it/s]
100%|██████████| 7/7 [00:01<00:00,  5.27it/s]


epoch=193: train_ppl=tensor(5.3265, device='cuda:0') train_epoch_loss=tensor(1.6727, device='cuda:0') eval_ppl=tensor(5.3215, device='cuda:0') eval_epoch_loss=tensor(1.6718, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.05it/s]
100%|██████████| 7/7 [00:01<00:00,  5.27it/s]


epoch=194: train_ppl=tensor(5.2095, device='cuda:0') train_epoch_loss=tensor(1.6505, device='cuda:0') eval_ppl=tensor(5.2199, device='cuda:0') eval_epoch_loss=tensor(1.6525, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.31it/s]
100%|██████████| 7/7 [00:01<00:00,  5.56it/s]


epoch=195: train_ppl=tensor(5.1002, device='cuda:0') train_epoch_loss=tensor(1.6293, device='cuda:0') eval_ppl=tensor(5.5785, device='cuda:0') eval_epoch_loss=tensor(1.7189, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.61it/s]
100%|██████████| 7/7 [00:09<00:00,  1.35s/it]


epoch=196: train_ppl=tensor(5.0758, device='cuda:0') train_epoch_loss=tensor(1.6245, device='cuda:0') eval_ppl=tensor(5.0966, device='cuda:0') eval_epoch_loss=tensor(1.6286, device='cuda:0')


100%|██████████| 7/7 [00:08<00:00,  1.19s/it]
100%|██████████| 7/7 [00:01<00:00,  4.07it/s]


epoch=197: train_ppl=tensor(4.9946, device='cuda:0') train_epoch_loss=tensor(1.6084, device='cuda:0') eval_ppl=tensor(5.0776, device='cuda:0') eval_epoch_loss=tensor(1.6248, device='cuda:0')


100%|██████████| 7/7 [00:09<00:00,  1.41s/it]
100%|██████████| 7/7 [00:01<00:00,  3.92it/s]


epoch=198: train_ppl=tensor(5.0999, device='cuda:0') train_epoch_loss=tensor(1.6292, device='cuda:0') eval_ppl=tensor(5.1122, device='cuda:0') eval_epoch_loss=tensor(1.6316, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.64it/s]
100%|██████████| 7/7 [00:01<00:00,  3.79it/s]


epoch=199: train_ppl=tensor(4.9479, device='cuda:0') train_epoch_loss=tensor(1.5990, device='cuda:0') eval_ppl=tensor(4.9795, device='cuda:0') eval_epoch_loss=tensor(1.6053, device='cuda:0')


100%|██████████| 7/7 [00:08<00:00,  1.25s/it]
100%|██████████| 7/7 [00:01<00:00,  3.53it/s]


epoch=200: train_ppl=tensor(4.9841, device='cuda:0') train_epoch_loss=tensor(1.6062, device='cuda:0') eval_ppl=tensor(5.1310, device='cuda:0') eval_epoch_loss=tensor(1.6353, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.44it/s]
100%|██████████| 7/7 [00:01<00:00,  4.53it/s]


epoch=201: train_ppl=tensor(4.7981, device='cuda:0') train_epoch_loss=tensor(1.5682, device='cuda:0') eval_ppl=tensor(5.3788, device='cuda:0') eval_epoch_loss=tensor(1.6825, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.07it/s]
100%|██████████| 7/7 [00:01<00:00,  4.59it/s]


epoch=202: train_ppl=tensor(7.0602, device='cuda:0') train_epoch_loss=tensor(1.9545, device='cuda:0') eval_ppl=tensor(44.7433, device='cuda:0') eval_epoch_loss=tensor(3.8009, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.15it/s]
100%|██████████| 7/7 [00:01<00:00,  5.73it/s]


epoch=203: train_ppl=tensor(55.0091, device='cuda:0') train_epoch_loss=tensor(4.0075, device='cuda:0') eval_ppl=tensor(17.2294, device='cuda:0') eval_epoch_loss=tensor(2.8466, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.45it/s]
100%|██████████| 7/7 [00:01<00:00,  5.23it/s]


epoch=204: train_ppl=tensor(10.4278, device='cuda:0') train_epoch_loss=tensor(2.3445, device='cuda:0') eval_ppl=tensor(8.9410, device='cuda:0') eval_epoch_loss=tensor(2.1906, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.61it/s]
100%|██████████| 7/7 [00:01<00:00,  5.82it/s]


epoch=205: train_ppl=tensor(9.9083, device='cuda:0') train_epoch_loss=tensor(2.2934, device='cuda:0') eval_ppl=tensor(9.9462, device='cuda:0') eval_epoch_loss=tensor(2.2972, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.29it/s]
100%|██████████| 7/7 [00:01<00:00,  5.23it/s]


epoch=206: train_ppl=tensor(8.0042, device='cuda:0') train_epoch_loss=tensor(2.0800, device='cuda:0') eval_ppl=tensor(7.9223, device='cuda:0') eval_epoch_loss=tensor(2.0697, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.46it/s]
100%|██████████| 7/7 [00:01<00:00,  5.74it/s]


epoch=207: train_ppl=tensor(7.1478, device='cuda:0') train_epoch_loss=tensor(1.9668, device='cuda:0') eval_ppl=tensor(6.9733, device='cuda:0') eval_epoch_loss=tensor(1.9421, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.55it/s]
100%|██████████| 7/7 [00:01<00:00,  5.71it/s]


epoch=208: train_ppl=tensor(6.7635, device='cuda:0') train_epoch_loss=tensor(1.9115, device='cuda:0') eval_ppl=tensor(6.8305, device='cuda:0') eval_epoch_loss=tensor(1.9214, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.52it/s]
100%|██████████| 7/7 [00:01<00:00,  5.95it/s]


epoch=209: train_ppl=tensor(6.2292, device='cuda:0') train_epoch_loss=tensor(1.8293, device='cuda:0') eval_ppl=tensor(6.1639, device='cuda:0') eval_epoch_loss=tensor(1.8187, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.63it/s]
100%|██████████| 7/7 [00:01<00:00,  5.84it/s]


epoch=210: train_ppl=tensor(5.9026, device='cuda:0') train_epoch_loss=tensor(1.7754, device='cuda:0') eval_ppl=tensor(5.8496, device='cuda:0') eval_epoch_loss=tensor(1.7664, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.65it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=211: train_ppl=tensor(5.5781, device='cuda:0') train_epoch_loss=tensor(1.7188, device='cuda:0') eval_ppl=tensor(5.9152, device='cuda:0') eval_epoch_loss=tensor(1.7775, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=212: train_ppl=tensor(5.5356, device='cuda:0') train_epoch_loss=tensor(1.7112, device='cuda:0') eval_ppl=tensor(5.6746, device='cuda:0') eval_epoch_loss=tensor(1.7360, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.58it/s]
100%|██████████| 7/7 [00:01<00:00,  5.78it/s]


epoch=213: train_ppl=tensor(5.5380, device='cuda:0') train_epoch_loss=tensor(1.7116, device='cuda:0') eval_ppl=tensor(5.5362, device='cuda:0') eval_epoch_loss=tensor(1.7113, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.61it/s]
100%|██████████| 7/7 [00:01<00:00,  5.61it/s]


epoch=214: train_ppl=tensor(5.4259, device='cuda:0') train_epoch_loss=tensor(1.6912, device='cuda:0') eval_ppl=tensor(5.4631, device='cuda:0') eval_epoch_loss=tensor(1.6980, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.42it/s]
100%|██████████| 7/7 [00:01<00:00,  3.99it/s]


epoch=215: train_ppl=tensor(5.4166, device='cuda:0') train_epoch_loss=tensor(1.6895, device='cuda:0') eval_ppl=tensor(5.3779, device='cuda:0') eval_epoch_loss=tensor(1.6823, device='cuda:0')


100%|██████████| 7/7 [00:06<00:00,  1.03it/s]
100%|██████████| 7/7 [00:02<00:00,  2.92it/s]


epoch=216: train_ppl=tensor(5.5998, device='cuda:0') train_epoch_loss=tensor(1.7227, device='cuda:0') eval_ppl=tensor(5.4068, device='cuda:0') eval_epoch_loss=tensor(1.6877, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.76it/s]
100%|██████████| 7/7 [00:01<00:00,  3.94it/s]


epoch=217: train_ppl=tensor(5.3604, device='cuda:0') train_epoch_loss=tensor(1.6790, device='cuda:0') eval_ppl=tensor(5.3069, device='cuda:0') eval_epoch_loss=tensor(1.6690, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.78it/s]
100%|██████████| 7/7 [00:01<00:00,  3.98it/s]


epoch=218: train_ppl=tensor(5.3282, device='cuda:0') train_epoch_loss=tensor(1.6730, device='cuda:0') eval_ppl=tensor(5.2069, device='cuda:0') eval_epoch_loss=tensor(1.6500, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.80it/s]
100%|██████████| 7/7 [00:01<00:00,  3.89it/s]


epoch=219: train_ppl=tensor(5.1977, device='cuda:0') train_epoch_loss=tensor(1.6482, device='cuda:0') eval_ppl=tensor(5.2697, device='cuda:0') eval_epoch_loss=tensor(1.6620, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.76it/s]
100%|██████████| 7/7 [00:01<00:00,  4.00it/s]


epoch=220: train_ppl=tensor(5.2598, device='cuda:0') train_epoch_loss=tensor(1.6601, device='cuda:0') eval_ppl=tensor(5.1661, device='cuda:0') eval_epoch_loss=tensor(1.6421, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.77it/s]
100%|██████████| 7/7 [00:01<00:00,  3.98it/s]


epoch=221: train_ppl=tensor(5.0788, device='cuda:0') train_epoch_loss=tensor(1.6251, device='cuda:0') eval_ppl=tensor(5.1261, device='cuda:0') eval_epoch_loss=tensor(1.6343, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.78it/s]
100%|██████████| 7/7 [00:01<00:00,  4.00it/s]


epoch=222: train_ppl=tensor(5.2984, device='cuda:0') train_epoch_loss=tensor(1.6674, device='cuda:0') eval_ppl=tensor(5.1988, device='cuda:0') eval_epoch_loss=tensor(1.6484, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.75it/s]
100%|██████████| 7/7 [00:01<00:00,  3.96it/s]


epoch=223: train_ppl=tensor(5.0280, device='cuda:0') train_epoch_loss=tensor(1.6150, device='cuda:0') eval_ppl=tensor(5.0679, device='cuda:0') eval_epoch_loss=tensor(1.6229, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.80it/s]
100%|██████████| 7/7 [00:01<00:00,  4.05it/s]


epoch=224: train_ppl=tensor(4.9639, device='cuda:0') train_epoch_loss=tensor(1.6022, device='cuda:0') eval_ppl=tensor(5.1378, device='cuda:0') eval_epoch_loss=tensor(1.6366, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.42it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=225: train_ppl=tensor(4.8728, device='cuda:0') train_epoch_loss=tensor(1.5837, device='cuda:0') eval_ppl=tensor(5.3060, device='cuda:0') eval_epoch_loss=tensor(1.6688, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=226: train_ppl=tensor(4.9433, device='cuda:0') train_epoch_loss=tensor(1.5980, device='cuda:0') eval_ppl=tensor(5.1440, device='cuda:0') eval_epoch_loss=tensor(1.6378, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.71it/s]
100%|██████████| 7/7 [00:01<00:00,  5.94it/s]


epoch=227: train_ppl=tensor(4.9264, device='cuda:0') train_epoch_loss=tensor(1.5946, device='cuda:0') eval_ppl=tensor(5.0747, device='cuda:0') eval_epoch_loss=tensor(1.6243, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=228: train_ppl=tensor(4.9432, device='cuda:0') train_epoch_loss=tensor(1.5980, device='cuda:0') eval_ppl=tensor(5.0562, device='cuda:0') eval_epoch_loss=tensor(1.6206, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=229: train_ppl=tensor(4.9404, device='cuda:0') train_epoch_loss=tensor(1.5974, device='cuda:0') eval_ppl=tensor(4.9904, device='cuda:0') eval_epoch_loss=tensor(1.6075, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=230: train_ppl=tensor(5.0619, device='cuda:0') train_epoch_loss=tensor(1.6217, device='cuda:0') eval_ppl=tensor(5.0000, device='cuda:0') eval_epoch_loss=tensor(1.6094, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=231: train_ppl=tensor(4.8737, device='cuda:0') train_epoch_loss=tensor(1.5839, device='cuda:0') eval_ppl=tensor(5.1193, device='cuda:0') eval_epoch_loss=tensor(1.6330, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.94it/s]


epoch=232: train_ppl=tensor(4.8906, device='cuda:0') train_epoch_loss=tensor(1.5873, device='cuda:0') eval_ppl=tensor(5.0081, device='cuda:0') eval_epoch_loss=tensor(1.6111, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=233: train_ppl=tensor(4.8959, device='cuda:0') train_epoch_loss=tensor(1.5884, device='cuda:0') eval_ppl=tensor(5.0407, device='cuda:0') eval_epoch_loss=tensor(1.6175, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=234: train_ppl=tensor(4.9138, device='cuda:0') train_epoch_loss=tensor(1.5921, device='cuda:0') eval_ppl=tensor(4.8893, device='cuda:0') eval_epoch_loss=tensor(1.5870, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=235: train_ppl=tensor(4.9134, device='cuda:0') train_epoch_loss=tensor(1.5920, device='cuda:0') eval_ppl=tensor(4.8702, device='cuda:0') eval_epoch_loss=tensor(1.5831, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=236: train_ppl=tensor(4.9317, device='cuda:0') train_epoch_loss=tensor(1.5957, device='cuda:0') eval_ppl=tensor(4.9726, device='cuda:0') eval_epoch_loss=tensor(1.6039, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=237: train_ppl=tensor(4.7639, device='cuda:0') train_epoch_loss=tensor(1.5611, device='cuda:0') eval_ppl=tensor(4.8889, device='cuda:0') eval_epoch_loss=tensor(1.5870, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=238: train_ppl=tensor(4.8728, device='cuda:0') train_epoch_loss=tensor(1.5837, device='cuda:0') eval_ppl=tensor(5.0716, device='cuda:0') eval_epoch_loss=tensor(1.6237, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=239: train_ppl=tensor(4.8709, device='cuda:0') train_epoch_loss=tensor(1.5833, device='cuda:0') eval_ppl=tensor(4.9872, device='cuda:0') eval_epoch_loss=tensor(1.6069, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=240: train_ppl=tensor(4.8858, device='cuda:0') train_epoch_loss=tensor(1.5863, device='cuda:0') eval_ppl=tensor(4.8938, device='cuda:0') eval_epoch_loss=tensor(1.5880, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=241: train_ppl=tensor(4.7205, device='cuda:0') train_epoch_loss=tensor(1.5519, device='cuda:0') eval_ppl=tensor(4.7694, device='cuda:0') eval_epoch_loss=tensor(1.5622, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=242: train_ppl=tensor(4.7300, device='cuda:0') train_epoch_loss=tensor(1.5539, device='cuda:0') eval_ppl=tensor(4.7706, device='cuda:0') eval_epoch_loss=tensor(1.5625, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=243: train_ppl=tensor(4.7436, device='cuda:0') train_epoch_loss=tensor(1.5568, device='cuda:0') eval_ppl=tensor(4.9057, device='cuda:0') eval_epoch_loss=tensor(1.5904, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=244: train_ppl=tensor(4.7235, device='cuda:0') train_epoch_loss=tensor(1.5526, device='cuda:0') eval_ppl=tensor(4.6729, device='cuda:0') eval_epoch_loss=tensor(1.5418, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.66it/s]
100%|██████████| 7/7 [00:01<00:00,  5.82it/s]


epoch=245: train_ppl=tensor(4.7665, device='cuda:0') train_epoch_loss=tensor(1.5616, device='cuda:0') eval_ppl=tensor(4.8524, device='cuda:0') eval_epoch_loss=tensor(1.5795, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=246: train_ppl=tensor(4.5961, device='cuda:0') train_epoch_loss=tensor(1.5252, device='cuda:0') eval_ppl=tensor(4.6752, device='cuda:0') eval_epoch_loss=tensor(1.5423, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=247: train_ppl=tensor(4.6196, device='cuda:0') train_epoch_loss=tensor(1.5303, device='cuda:0') eval_ppl=tensor(4.8249, device='cuda:0') eval_epoch_loss=tensor(1.5738, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=248: train_ppl=tensor(4.6872, device='cuda:0') train_epoch_loss=tensor(1.5448, device='cuda:0') eval_ppl=tensor(4.6567, device='cuda:0') eval_epoch_loss=tensor(1.5383, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=249: train_ppl=tensor(4.7370, device='cuda:0') train_epoch_loss=tensor(1.5554, device='cuda:0') eval_ppl=tensor(4.6365, device='cuda:0') eval_epoch_loss=tensor(1.5340, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=250: train_ppl=tensor(4.5956, device='cuda:0') train_epoch_loss=tensor(1.5251, device='cuda:0') eval_ppl=tensor(4.7927, device='cuda:0') eval_epoch_loss=tensor(1.5671, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=251: train_ppl=tensor(4.6789, device='cuda:0') train_epoch_loss=tensor(1.5431, device='cuda:0') eval_ppl=tensor(4.6568, device='cuda:0') eval_epoch_loss=tensor(1.5383, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=252: train_ppl=tensor(4.9126, device='cuda:0') train_epoch_loss=tensor(1.5918, device='cuda:0') eval_ppl=tensor(4.7010, device='cuda:0') eval_epoch_loss=tensor(1.5478, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=253: train_ppl=tensor(4.6794, device='cuda:0') train_epoch_loss=tensor(1.5432, device='cuda:0') eval_ppl=tensor(4.5674, device='cuda:0') eval_epoch_loss=tensor(1.5189, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=254: train_ppl=tensor(4.5954, device='cuda:0') train_epoch_loss=tensor(1.5251, device='cuda:0') eval_ppl=tensor(4.9675, device='cuda:0') eval_epoch_loss=tensor(1.6029, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=255: train_ppl=tensor(4.8208, device='cuda:0') train_epoch_loss=tensor(1.5729, device='cuda:0') eval_ppl=tensor(4.6638, device='cuda:0') eval_epoch_loss=tensor(1.5398, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=256: train_ppl=tensor(4.6748, device='cuda:0') train_epoch_loss=tensor(1.5422, device='cuda:0') eval_ppl=tensor(4.5105, device='cuda:0') eval_epoch_loss=tensor(1.5064, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=257: train_ppl=tensor(4.4090, device='cuda:0') train_epoch_loss=tensor(1.4836, device='cuda:0') eval_ppl=tensor(4.8167, device='cuda:0') eval_epoch_loss=tensor(1.5721, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=258: train_ppl=tensor(4.5941, device='cuda:0') train_epoch_loss=tensor(1.5248, device='cuda:0') eval_ppl=tensor(4.7256, device='cuda:0') eval_epoch_loss=tensor(1.5530, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=259: train_ppl=tensor(4.6011, device='cuda:0') train_epoch_loss=tensor(1.5263, device='cuda:0') eval_ppl=tensor(4.6552, device='cuda:0') eval_epoch_loss=tensor(1.5380, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=260: train_ppl=tensor(4.6310, device='cuda:0') train_epoch_loss=tensor(1.5328, device='cuda:0') eval_ppl=tensor(4.9082, device='cuda:0') eval_epoch_loss=tensor(1.5909, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.95it/s]


epoch=261: train_ppl=tensor(4.7607, device='cuda:0') train_epoch_loss=tensor(1.5604, device='cuda:0') eval_ppl=tensor(4.7838, device='cuda:0') eval_epoch_loss=tensor(1.5652, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=262: train_ppl=tensor(4.4793, device='cuda:0') train_epoch_loss=tensor(1.4995, device='cuda:0') eval_ppl=tensor(5.1271, device='cuda:0') eval_epoch_loss=tensor(1.6345, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=263: train_ppl=tensor(4.9297, device='cuda:0') train_epoch_loss=tensor(1.5953, device='cuda:0') eval_ppl=tensor(4.8419, device='cuda:0') eval_epoch_loss=tensor(1.5773, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=264: train_ppl=tensor(4.6755, device='cuda:0') train_epoch_loss=tensor(1.5423, device='cuda:0') eval_ppl=tensor(4.7187, device='cuda:0') eval_epoch_loss=tensor(1.5515, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.85it/s]


epoch=265: train_ppl=tensor(4.5341, device='cuda:0') train_epoch_loss=tensor(1.5116, device='cuda:0') eval_ppl=tensor(5.2514, device='cuda:0') eval_epoch_loss=tensor(1.6585, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=266: train_ppl=tensor(4.9735, device='cuda:0') train_epoch_loss=tensor(1.6041, device='cuda:0') eval_ppl=tensor(4.8615, device='cuda:0') eval_epoch_loss=tensor(1.5813, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=267: train_ppl=tensor(4.7792, device='cuda:0') train_epoch_loss=tensor(1.5643, device='cuda:0') eval_ppl=tensor(4.8746, device='cuda:0') eval_epoch_loss=tensor(1.5840, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=268: train_ppl=tensor(4.9600, device='cuda:0') train_epoch_loss=tensor(1.6014, device='cuda:0') eval_ppl=tensor(5.0958, device='cuda:0') eval_epoch_loss=tensor(1.6284, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.85it/s]


epoch=269: train_ppl=tensor(4.8863, device='cuda:0') train_epoch_loss=tensor(1.5864, device='cuda:0') eval_ppl=tensor(4.8407, device='cuda:0') eval_epoch_loss=tensor(1.5770, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=270: train_ppl=tensor(5.0162, device='cuda:0') train_epoch_loss=tensor(1.6127, device='cuda:0') eval_ppl=tensor(5.2698, device='cuda:0') eval_epoch_loss=tensor(1.6620, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=271: train_ppl=tensor(4.6021, device='cuda:0') train_epoch_loss=tensor(1.5265, device='cuda:0') eval_ppl=tensor(4.7942, device='cuda:0') eval_epoch_loss=tensor(1.5674, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=272: train_ppl=tensor(5.2471, device='cuda:0') train_epoch_loss=tensor(1.6577, device='cuda:0') eval_ppl=tensor(5.0339, device='cuda:0') eval_epoch_loss=tensor(1.6162, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=273: train_ppl=tensor(5.6672, device='cuda:0') train_epoch_loss=tensor(1.7347, device='cuda:0') eval_ppl=tensor(5.8758, device='cuda:0') eval_epoch_loss=tensor(1.7708, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=274: train_ppl=tensor(5.7142, device='cuda:0') train_epoch_loss=tensor(1.7430, device='cuda:0') eval_ppl=tensor(6.2029, device='cuda:0') eval_epoch_loss=tensor(1.8250, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.79it/s]


epoch=275: train_ppl=tensor(5.6510, device='cuda:0') train_epoch_loss=tensor(1.7318, device='cuda:0') eval_ppl=tensor(5.5988, device='cuda:0') eval_epoch_loss=tensor(1.7225, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.61it/s]
100%|██████████| 7/7 [00:01<00:00,  5.82it/s]


epoch=276: train_ppl=tensor(5.3457, device='cuda:0') train_epoch_loss=tensor(1.6763, device='cuda:0') eval_ppl=tensor(5.0282, device='cuda:0') eval_epoch_loss=tensor(1.6151, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.56it/s]
100%|██████████| 7/7 [00:01<00:00,  5.82it/s]


epoch=277: train_ppl=tensor(5.8078, device='cuda:0') train_epoch_loss=tensor(1.7592, device='cuda:0') eval_ppl=tensor(5.6584, device='cuda:0') eval_epoch_loss=tensor(1.7331, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.63it/s]
100%|██████████| 7/7 [00:01<00:00,  5.79it/s]


epoch=278: train_ppl=tensor(5.6002, device='cuda:0') train_epoch_loss=tensor(1.7228, device='cuda:0') eval_ppl=tensor(5.1820, device='cuda:0') eval_epoch_loss=tensor(1.6452, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.65it/s]
100%|██████████| 7/7 [00:01<00:00,  5.82it/s]


epoch=279: train_ppl=tensor(5.5167, device='cuda:0') train_epoch_loss=tensor(1.7078, device='cuda:0') eval_ppl=tensor(4.5513, device='cuda:0') eval_epoch_loss=tensor(1.5154, device='cuda:0')


100%|██████████| 7/7 [00:05<00:00,  1.20it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=280: train_ppl=tensor(4.7229, device='cuda:0') train_epoch_loss=tensor(1.5524, device='cuda:0') eval_ppl=tensor(4.6903, device='cuda:0') eval_epoch_loss=tensor(1.5455, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=281: train_ppl=tensor(4.4706, device='cuda:0') train_epoch_loss=tensor(1.4975, device='cuda:0') eval_ppl=tensor(4.2901, device='cuda:0') eval_epoch_loss=tensor(1.4563, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=282: train_ppl=tensor(4.4576, device='cuda:0') train_epoch_loss=tensor(1.4946, device='cuda:0') eval_ppl=tensor(4.2372, device='cuda:0') eval_epoch_loss=tensor(1.4439, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=283: train_ppl=tensor(4.9761, device='cuda:0') train_epoch_loss=tensor(1.6047, device='cuda:0') eval_ppl=tensor(4.4821, device='cuda:0') eval_epoch_loss=tensor(1.5001, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=284: train_ppl=tensor(4.5256, device='cuda:0') train_epoch_loss=tensor(1.5097, device='cuda:0') eval_ppl=tensor(4.2851, device='cuda:0') eval_epoch_loss=tensor(1.4552, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=285: train_ppl=tensor(4.7024, device='cuda:0') train_epoch_loss=tensor(1.5481, device='cuda:0') eval_ppl=tensor(4.7000, device='cuda:0') eval_epoch_loss=tensor(1.5476, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.66it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=286: train_ppl=tensor(4.9178, device='cuda:0') train_epoch_loss=tensor(1.5929, device='cuda:0') eval_ppl=tensor(4.1395, device='cuda:0') eval_epoch_loss=tensor(1.4206, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=287: train_ppl=tensor(4.5884, device='cuda:0') train_epoch_loss=tensor(1.5235, device='cuda:0') eval_ppl=tensor(4.0724, device='cuda:0') eval_epoch_loss=tensor(1.4042, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.84it/s]


epoch=288: train_ppl=tensor(4.2387, device='cuda:0') train_epoch_loss=tensor(1.4443, device='cuda:0') eval_ppl=tensor(4.3179, device='cuda:0') eval_epoch_loss=tensor(1.4628, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.65it/s]
100%|██████████| 7/7 [00:01<00:00,  5.81it/s]


epoch=289: train_ppl=tensor(4.2055, device='cuda:0') train_epoch_loss=tensor(1.4364, device='cuda:0') eval_ppl=tensor(4.2192, device='cuda:0') eval_epoch_loss=tensor(1.4396, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.85it/s]


epoch=290: train_ppl=tensor(4.4973, device='cuda:0') train_epoch_loss=tensor(1.5035, device='cuda:0') eval_ppl=tensor(4.3013, device='cuda:0') eval_epoch_loss=tensor(1.4589, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=291: train_ppl=tensor(4.3682, device='cuda:0') train_epoch_loss=tensor(1.4743, device='cuda:0') eval_ppl=tensor(4.4019, device='cuda:0') eval_epoch_loss=tensor(1.4820, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=292: train_ppl=tensor(4.3392, device='cuda:0') train_epoch_loss=tensor(1.4677, device='cuda:0') eval_ppl=tensor(4.3312, device='cuda:0') eval_epoch_loss=tensor(1.4658, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.66it/s]
100%|██████████| 7/7 [00:01<00:00,  5.84it/s]


epoch=293: train_ppl=tensor(4.3518, device='cuda:0') train_epoch_loss=tensor(1.4706, device='cuda:0') eval_ppl=tensor(4.2246, device='cuda:0') eval_epoch_loss=tensor(1.4409, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.94it/s]


epoch=294: train_ppl=tensor(4.4009, device='cuda:0') train_epoch_loss=tensor(1.4818, device='cuda:0') eval_ppl=tensor(4.2463, device='cuda:0') eval_epoch_loss=tensor(1.4460, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=295: train_ppl=tensor(4.4954, device='cuda:0') train_epoch_loss=tensor(1.5031, device='cuda:0') eval_ppl=tensor(4.6500, device='cuda:0') eval_epoch_loss=tensor(1.5369, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.66it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=296: train_ppl=tensor(4.2684, device='cuda:0') train_epoch_loss=tensor(1.4512, device='cuda:0') eval_ppl=tensor(4.6351, device='cuda:0') eval_epoch_loss=tensor(1.5336, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=297: train_ppl=tensor(5.2544, device='cuda:0') train_epoch_loss=tensor(1.6591, device='cuda:0') eval_ppl=tensor(5.3665, device='cuda:0') eval_epoch_loss=tensor(1.6802, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=298: train_ppl=tensor(4.9401, device='cuda:0') train_epoch_loss=tensor(1.5974, device='cuda:0') eval_ppl=tensor(4.6928, device='cuda:0') eval_epoch_loss=tensor(1.5460, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.85it/s]


epoch=299: train_ppl=tensor(4.4342, device='cuda:0') train_epoch_loss=tensor(1.4894, device='cuda:0') eval_ppl=tensor(4.2238, device='cuda:0') eval_epoch_loss=tensor(1.4407, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=300: train_ppl=tensor(4.3955, device='cuda:0') train_epoch_loss=tensor(1.4806, device='cuda:0') eval_ppl=tensor(4.3003, device='cuda:0') eval_epoch_loss=tensor(1.4587, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.81it/s]


epoch=301: train_ppl=tensor(4.1767, device='cuda:0') train_epoch_loss=tensor(1.4295, device='cuda:0') eval_ppl=tensor(4.2501, device='cuda:0') eval_epoch_loss=tensor(1.4470, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=302: train_ppl=tensor(4.3841, device='cuda:0') train_epoch_loss=tensor(1.4780, device='cuda:0') eval_ppl=tensor(4.4486, device='cuda:0') eval_epoch_loss=tensor(1.4926, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.66it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=303: train_ppl=tensor(4.3241, device='cuda:0') train_epoch_loss=tensor(1.4642, device='cuda:0') eval_ppl=tensor(4.3305, device='cuda:0') eval_epoch_loss=tensor(1.4657, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=304: train_ppl=tensor(4.3716, device='cuda:0') train_epoch_loss=tensor(1.4751, device='cuda:0') eval_ppl=tensor(4.2801, device='cuda:0') eval_epoch_loss=tensor(1.4540, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=305: train_ppl=tensor(4.2885, device='cuda:0') train_epoch_loss=tensor(1.4559, device='cuda:0') eval_ppl=tensor(4.5334, device='cuda:0') eval_epoch_loss=tensor(1.5115, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=306: train_ppl=tensor(4.4508, device='cuda:0') train_epoch_loss=tensor(1.4931, device='cuda:0') eval_ppl=tensor(4.5012, device='cuda:0') eval_epoch_loss=tensor(1.5043, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=307: train_ppl=tensor(4.3476, device='cuda:0') train_epoch_loss=tensor(1.4696, device='cuda:0') eval_ppl=tensor(4.2859, device='cuda:0') eval_epoch_loss=tensor(1.4553, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=308: train_ppl=tensor(4.7140, device='cuda:0') train_epoch_loss=tensor(1.5505, device='cuda:0') eval_ppl=tensor(4.3404, device='cuda:0') eval_epoch_loss=tensor(1.4680, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.84it/s]


epoch=309: train_ppl=tensor(4.5628, device='cuda:0') train_epoch_loss=tensor(1.5179, device='cuda:0') eval_ppl=tensor(4.3723, device='cuda:0') eval_epoch_loss=tensor(1.4753, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.84it/s]


epoch=310: train_ppl=tensor(4.2638, device='cuda:0') train_epoch_loss=tensor(1.4502, device='cuda:0') eval_ppl=tensor(4.1153, device='cuda:0') eval_epoch_loss=tensor(1.4147, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=311: train_ppl=tensor(4.1798, device='cuda:0') train_epoch_loss=tensor(1.4303, device='cuda:0') eval_ppl=tensor(4.0864, device='cuda:0') eval_epoch_loss=tensor(1.4077, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=312: train_ppl=tensor(4.2534, device='cuda:0') train_epoch_loss=tensor(1.4477, device='cuda:0') eval_ppl=tensor(4.1080, device='cuda:0') eval_epoch_loss=tensor(1.4129, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=313: train_ppl=tensor(4.2907, device='cuda:0') train_epoch_loss=tensor(1.4565, device='cuda:0') eval_ppl=tensor(3.9402, device='cuda:0') eval_epoch_loss=tensor(1.3712, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.85it/s]


epoch=314: train_ppl=tensor(4.3360, device='cuda:0') train_epoch_loss=tensor(1.4670, device='cuda:0') eval_ppl=tensor(4.0697, device='cuda:0') eval_epoch_loss=tensor(1.4036, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=315: train_ppl=tensor(4.2116, device='cuda:0') train_epoch_loss=tensor(1.4378, device='cuda:0') eval_ppl=tensor(4.0339, device='cuda:0') eval_epoch_loss=tensor(1.3947, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=316: train_ppl=tensor(4.2387, device='cuda:0') train_epoch_loss=tensor(1.4443, device='cuda:0') eval_ppl=tensor(4.1327, device='cuda:0') eval_epoch_loss=tensor(1.4189, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=317: train_ppl=tensor(4.2138, device='cuda:0') train_epoch_loss=tensor(1.4384, device='cuda:0') eval_ppl=tensor(4.0569, device='cuda:0') eval_epoch_loss=tensor(1.4004, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=318: train_ppl=tensor(4.0033, device='cuda:0') train_epoch_loss=tensor(1.3871, device='cuda:0') eval_ppl=tensor(4.0373, device='cuda:0') eval_epoch_loss=tensor(1.3956, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=319: train_ppl=tensor(4.0936, device='cuda:0') train_epoch_loss=tensor(1.4094, device='cuda:0') eval_ppl=tensor(3.9499, device='cuda:0') eval_epoch_loss=tensor(1.3737, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.81it/s]


epoch=320: train_ppl=tensor(4.0112, device='cuda:0') train_epoch_loss=tensor(1.3891, device='cuda:0') eval_ppl=tensor(3.9182, device='cuda:0') eval_epoch_loss=tensor(1.3656, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=321: train_ppl=tensor(4.0750, device='cuda:0') train_epoch_loss=tensor(1.4049, device='cuda:0') eval_ppl=tensor(6.3028, device='cuda:0') eval_epoch_loss=tensor(1.8410, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=322: train_ppl=tensor(18.9873, device='cuda:0') train_epoch_loss=tensor(2.9438, device='cuda:0') eval_ppl=tensor(4.6472, device='cuda:0') eval_epoch_loss=tensor(1.5363, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=323: train_ppl=tensor(4.3955, device='cuda:0') train_epoch_loss=tensor(1.4806, device='cuda:0') eval_ppl=tensor(4.2160, device='cuda:0') eval_epoch_loss=tensor(1.4389, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.85it/s]


epoch=324: train_ppl=tensor(4.3592, device='cuda:0') train_epoch_loss=tensor(1.4723, device='cuda:0') eval_ppl=tensor(4.0481, device='cuda:0') eval_epoch_loss=tensor(1.3982, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.94it/s]


epoch=325: train_ppl=tensor(4.0304, device='cuda:0') train_epoch_loss=tensor(1.3939, device='cuda:0') eval_ppl=tensor(4.1472, device='cuda:0') eval_epoch_loss=tensor(1.4224, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=326: train_ppl=tensor(4.0646, device='cuda:0') train_epoch_loss=tensor(1.4023, device='cuda:0') eval_ppl=tensor(3.9169, device='cuda:0') eval_epoch_loss=tensor(1.3653, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.65it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=327: train_ppl=tensor(4.3146, device='cuda:0') train_epoch_loss=tensor(1.4620, device='cuda:0') eval_ppl=tensor(4.1895, device='cuda:0') eval_epoch_loss=tensor(1.4326, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=328: train_ppl=tensor(4.0440, device='cuda:0') train_epoch_loss=tensor(1.3972, device='cuda:0') eval_ppl=tensor(3.9775, device='cuda:0') eval_epoch_loss=tensor(1.3806, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=329: train_ppl=tensor(4.1344, device='cuda:0') train_epoch_loss=tensor(1.4193, device='cuda:0') eval_ppl=tensor(3.9449, device='cuda:0') eval_epoch_loss=tensor(1.3724, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=330: train_ppl=tensor(4.0507, device='cuda:0') train_epoch_loss=tensor(1.3989, device='cuda:0') eval_ppl=tensor(4.3031, device='cuda:0') eval_epoch_loss=tensor(1.4593, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=331: train_ppl=tensor(4.0651, device='cuda:0') train_epoch_loss=tensor(1.4024, device='cuda:0') eval_ppl=tensor(3.8897, device='cuda:0') eval_epoch_loss=tensor(1.3583, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.94it/s]


epoch=332: train_ppl=tensor(3.9659, device='cuda:0') train_epoch_loss=tensor(1.3777, device='cuda:0') eval_ppl=tensor(3.8987, device='cuda:0') eval_epoch_loss=tensor(1.3606, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=333: train_ppl=tensor(3.8730, device='cuda:0') train_epoch_loss=tensor(1.3540, device='cuda:0') eval_ppl=tensor(3.8613, device='cuda:0') eval_epoch_loss=tensor(1.3510, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=334: train_ppl=tensor(4.1270, device='cuda:0') train_epoch_loss=tensor(1.4176, device='cuda:0') eval_ppl=tensor(3.9561, device='cuda:0') eval_epoch_loss=tensor(1.3752, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=335: train_ppl=tensor(3.9909, device='cuda:0') train_epoch_loss=tensor(1.3840, device='cuda:0') eval_ppl=tensor(3.8308, device='cuda:0') eval_epoch_loss=tensor(1.3431, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=336: train_ppl=tensor(3.8588, device='cuda:0') train_epoch_loss=tensor(1.3503, device='cuda:0') eval_ppl=tensor(4.0786, device='cuda:0') eval_epoch_loss=tensor(1.4058, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.59it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=337: train_ppl=tensor(3.9442, device='cuda:0') train_epoch_loss=tensor(1.3722, device='cuda:0') eval_ppl=tensor(3.8482, device='cuda:0') eval_epoch_loss=tensor(1.3476, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.61it/s]
100%|██████████| 7/7 [00:01<00:00,  5.60it/s]


epoch=338: train_ppl=tensor(3.9919, device='cuda:0') train_epoch_loss=tensor(1.3843, device='cuda:0') eval_ppl=tensor(3.7856, device='cuda:0') eval_epoch_loss=tensor(1.3312, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.43it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=339: train_ppl=tensor(3.9431, device='cuda:0') train_epoch_loss=tensor(1.3720, device='cuda:0') eval_ppl=tensor(4.0312, device='cuda:0') eval_epoch_loss=tensor(1.3941, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.63it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=340: train_ppl=tensor(3.7987, device='cuda:0') train_epoch_loss=tensor(1.3347, device='cuda:0') eval_ppl=tensor(3.6965, device='cuda:0') eval_epoch_loss=tensor(1.3074, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.20it/s]


epoch=341: train_ppl=tensor(3.8176, device='cuda:0') train_epoch_loss=tensor(1.3396, device='cuda:0') eval_ppl=tensor(3.7233, device='cuda:0') eval_epoch_loss=tensor(1.3146, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.97it/s]
100%|██████████| 7/7 [00:02<00:00,  3.31it/s]


epoch=342: train_ppl=tensor(3.6917, device='cuda:0') train_epoch_loss=tensor(1.3061, device='cuda:0') eval_ppl=tensor(3.7829, device='cuda:0') eval_epoch_loss=tensor(1.3305, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=343: train_ppl=tensor(3.9441, device='cuda:0') train_epoch_loss=tensor(1.3722, device='cuda:0') eval_ppl=tensor(3.8452, device='cuda:0') eval_epoch_loss=tensor(1.3468, device='cuda:0')


100%|██████████| 7/7 [00:05<00:00,  1.36it/s]
100%|██████████| 7/7 [00:02<00:00,  3.49it/s]


epoch=344: train_ppl=tensor(3.8283, device='cuda:0') train_epoch_loss=tensor(1.3424, device='cuda:0') eval_ppl=tensor(3.7098, device='cuda:0') eval_epoch_loss=tensor(1.3110, device='cuda:0')


100%|██████████| 7/7 [00:05<00:00,  1.29it/s]
100%|██████████| 7/7 [00:02<00:00,  3.49it/s]


epoch=345: train_ppl=tensor(3.7372, device='cuda:0') train_epoch_loss=tensor(1.3183, device='cuda:0') eval_ppl=tensor(3.8067, device='cuda:0') eval_epoch_loss=tensor(1.3368, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.79it/s]
100%|██████████| 7/7 [00:01<00:00,  3.89it/s]


epoch=346: train_ppl=tensor(3.8106, device='cuda:0') train_epoch_loss=tensor(1.3378, device='cuda:0') eval_ppl=tensor(3.7741, device='cuda:0') eval_epoch_loss=tensor(1.3282, device='cuda:0')


100%|██████████| 7/7 [00:05<00:00,  1.39it/s]
100%|██████████| 7/7 [00:01<00:00,  3.99it/s]


epoch=347: train_ppl=tensor(3.7944, device='cuda:0') train_epoch_loss=tensor(1.3335, device='cuda:0') eval_ppl=tensor(3.6485, device='cuda:0') eval_epoch_loss=tensor(1.2943, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.64it/s]
100%|██████████| 7/7 [00:01<00:00,  3.82it/s]


epoch=348: train_ppl=tensor(3.9097, device='cuda:0') train_epoch_loss=tensor(1.3634, device='cuda:0') eval_ppl=tensor(3.6590, device='cuda:0') eval_epoch_loss=tensor(1.2972, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.45it/s]
100%|██████████| 7/7 [00:02<00:00,  3.37it/s]


epoch=349: train_ppl=tensor(3.6795, device='cuda:0') train_epoch_loss=tensor(1.3028, device='cuda:0') eval_ppl=tensor(4.0028, device='cuda:0') eval_epoch_loss=tensor(1.3870, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.81it/s]
100%|██████████| 7/7 [00:01<00:00,  4.87it/s]


epoch=350: train_ppl=tensor(3.6226, device='cuda:0') train_epoch_loss=tensor(1.2872, device='cuda:0') eval_ppl=tensor(3.6816, device='cuda:0') eval_epoch_loss=tensor(1.3034, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.52it/s]
100%|██████████| 7/7 [00:01<00:00,  5.57it/s]


epoch=351: train_ppl=tensor(3.8322, device='cuda:0') train_epoch_loss=tensor(1.3434, device='cuda:0') eval_ppl=tensor(3.6693, device='cuda:0') eval_epoch_loss=tensor(1.3000, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.36it/s]
100%|██████████| 7/7 [00:01<00:00,  5.79it/s]


epoch=352: train_ppl=tensor(3.7696, device='cuda:0') train_epoch_loss=tensor(1.3270, device='cuda:0') eval_ppl=tensor(3.9425, device='cuda:0') eval_epoch_loss=tensor(1.3718, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.52it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=353: train_ppl=tensor(3.6958, device='cuda:0') train_epoch_loss=tensor(1.3072, device='cuda:0') eval_ppl=tensor(3.7408, device='cuda:0') eval_epoch_loss=tensor(1.3193, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=354: train_ppl=tensor(3.6711, device='cuda:0') train_epoch_loss=tensor(1.3005, device='cuda:0') eval_ppl=tensor(3.8862, device='cuda:0') eval_epoch_loss=tensor(1.3574, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=355: train_ppl=tensor(3.8362, device='cuda:0') train_epoch_loss=tensor(1.3445, device='cuda:0') eval_ppl=tensor(3.8441, device='cuda:0') eval_epoch_loss=tensor(1.3465, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=356: train_ppl=tensor(3.7541, device='cuda:0') train_epoch_loss=tensor(1.3228, device='cuda:0') eval_ppl=tensor(3.7612, device='cuda:0') eval_epoch_loss=tensor(1.3247, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=357: train_ppl=tensor(3.6144, device='cuda:0') train_epoch_loss=tensor(1.2849, device='cuda:0') eval_ppl=tensor(3.8194, device='cuda:0') eval_epoch_loss=tensor(1.3401, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=358: train_ppl=tensor(3.7721, device='cuda:0') train_epoch_loss=tensor(1.3276, device='cuda:0') eval_ppl=tensor(3.7449, device='cuda:0') eval_epoch_loss=tensor(1.3204, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.70it/s]
100%|██████████| 7/7 [00:02<00:00,  2.69it/s]


epoch=359: train_ppl=tensor(3.6901, device='cuda:0') train_epoch_loss=tensor(1.3057, device='cuda:0') eval_ppl=tensor(3.6654, device='cuda:0') eval_epoch_loss=tensor(1.2989, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.09it/s]
100%|██████████| 7/7 [00:01<00:00,  4.85it/s]


epoch=360: train_ppl=tensor(3.7465, device='cuda:0') train_epoch_loss=tensor(1.3208, device='cuda:0') eval_ppl=tensor(3.6990, device='cuda:0') eval_epoch_loss=tensor(1.3081, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.06it/s]
100%|██████████| 7/7 [00:01<00:00,  5.84it/s]


epoch=361: train_ppl=tensor(3.6192, device='cuda:0') train_epoch_loss=tensor(1.2862, device='cuda:0') eval_ppl=tensor(3.5664, device='cuda:0') eval_epoch_loss=tensor(1.2715, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=362: train_ppl=tensor(3.5012, device='cuda:0') train_epoch_loss=tensor(1.2531, device='cuda:0') eval_ppl=tensor(3.5859, device='cuda:0') eval_epoch_loss=tensor(1.2770, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=363: train_ppl=tensor(3.6116, device='cuda:0') train_epoch_loss=tensor(1.2841, device='cuda:0') eval_ppl=tensor(3.5728, device='cuda:0') eval_epoch_loss=tensor(1.2734, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=364: train_ppl=tensor(3.5115, device='cuda:0') train_epoch_loss=tensor(1.2560, device='cuda:0') eval_ppl=tensor(3.4934, device='cuda:0') eval_epoch_loss=tensor(1.2509, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.23it/s]
100%|██████████| 7/7 [00:01<00:00,  5.53it/s]


epoch=365: train_ppl=tensor(3.5912, device='cuda:0') train_epoch_loss=tensor(1.2785, device='cuda:0') eval_ppl=tensor(3.5553, device='cuda:0') eval_epoch_loss=tensor(1.2684, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.54it/s]
100%|██████████| 7/7 [00:01<00:00,  5.59it/s]


epoch=366: train_ppl=tensor(3.5035, device='cuda:0') train_epoch_loss=tensor(1.2538, device='cuda:0') eval_ppl=tensor(3.5910, device='cuda:0') eval_epoch_loss=tensor(1.2784, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.55it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=367: train_ppl=tensor(3.5115, device='cuda:0') train_epoch_loss=tensor(1.2560, device='cuda:0') eval_ppl=tensor(3.4561, device='cuda:0') eval_epoch_loss=tensor(1.2401, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.81it/s]


epoch=368: train_ppl=tensor(3.4655, device='cuda:0') train_epoch_loss=tensor(1.2428, device='cuda:0') eval_ppl=tensor(3.4871, device='cuda:0') eval_epoch_loss=tensor(1.2491, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.60it/s]
100%|██████████| 7/7 [00:01<00:00,  5.61it/s]


epoch=369: train_ppl=tensor(3.6106, device='cuda:0') train_epoch_loss=tensor(1.2839, device='cuda:0') eval_ppl=tensor(3.5278, device='cuda:0') eval_epoch_loss=tensor(1.2607, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.58it/s]
100%|██████████| 7/7 [00:01<00:00,  4.79it/s]


epoch=370: train_ppl=tensor(3.4821, device='cuda:0') train_epoch_loss=tensor(1.2476, device='cuda:0') eval_ppl=tensor(3.4715, device='cuda:0') eval_epoch_loss=tensor(1.2446, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.64it/s]
100%|██████████| 7/7 [00:01<00:00,  5.05it/s]


epoch=371: train_ppl=tensor(3.5225, device='cuda:0') train_epoch_loss=tensor(1.2592, device='cuda:0') eval_ppl=tensor(3.4770, device='cuda:0') eval_epoch_loss=tensor(1.2462, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.88it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=372: train_ppl=tensor(3.4593, device='cuda:0') train_epoch_loss=tensor(1.2411, device='cuda:0') eval_ppl=tensor(3.4114, device='cuda:0') eval_epoch_loss=tensor(1.2271, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.61it/s]
100%|██████████| 7/7 [00:01<00:00,  5.35it/s]


epoch=373: train_ppl=tensor(3.5516, device='cuda:0') train_epoch_loss=tensor(1.2674, device='cuda:0') eval_ppl=tensor(3.8275, device='cuda:0') eval_epoch_loss=tensor(1.3422, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.45it/s]
100%|██████████| 7/7 [00:01<00:00,  5.68it/s]


epoch=374: train_ppl=tensor(3.7501, device='cuda:0') train_epoch_loss=tensor(1.3218, device='cuda:0') eval_ppl=tensor(3.7354, device='cuda:0') eval_epoch_loss=tensor(1.3179, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.63it/s]
100%|██████████| 7/7 [00:01<00:00,  5.58it/s]


epoch=375: train_ppl=tensor(3.6231, device='cuda:0') train_epoch_loss=tensor(1.2873, device='cuda:0') eval_ppl=tensor(3.5689, device='cuda:0') eval_epoch_loss=tensor(1.2723, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.51it/s]
100%|██████████| 7/7 [00:01<00:00,  5.61it/s]


epoch=376: train_ppl=tensor(3.5892, device='cuda:0') train_epoch_loss=tensor(1.2779, device='cuda:0') eval_ppl=tensor(3.6536, device='cuda:0') eval_epoch_loss=tensor(1.2957, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.58it/s]
100%|██████████| 7/7 [00:01<00:00,  5.00it/s]


epoch=377: train_ppl=tensor(3.5950, device='cuda:0') train_epoch_loss=tensor(1.2795, device='cuda:0') eval_ppl=tensor(3.4161, device='cuda:0') eval_epoch_loss=tensor(1.2285, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.55it/s]
100%|██████████| 7/7 [00:01<00:00,  5.72it/s]


epoch=378: train_ppl=tensor(3.4426, device='cuda:0') train_epoch_loss=tensor(1.2362, device='cuda:0') eval_ppl=tensor(3.4090, device='cuda:0') eval_epoch_loss=tensor(1.2264, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.50it/s]
100%|██████████| 7/7 [00:01<00:00,  5.50it/s]


epoch=379: train_ppl=tensor(3.4944, device='cuda:0') train_epoch_loss=tensor(1.2512, device='cuda:0') eval_ppl=tensor(3.5061, device='cuda:0') eval_epoch_loss=tensor(1.2545, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.53it/s]
100%|██████████| 7/7 [00:01<00:00,  5.75it/s]


epoch=380: train_ppl=tensor(3.6101, device='cuda:0') train_epoch_loss=tensor(1.2837, device='cuda:0') eval_ppl=tensor(3.3756, device='cuda:0') eval_epoch_loss=tensor(1.2166, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.50it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=381: train_ppl=tensor(3.8466, device='cuda:0') train_epoch_loss=tensor(1.3472, device='cuda:0') eval_ppl=tensor(3.3772, device='cuda:0') eval_epoch_loss=tensor(1.2171, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.54it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=382: train_ppl=tensor(3.7088, device='cuda:0') train_epoch_loss=tensor(1.3107, device='cuda:0') eval_ppl=tensor(3.8742, device='cuda:0') eval_epoch_loss=tensor(1.3543, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.65it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=383: train_ppl=tensor(3.8095, device='cuda:0') train_epoch_loss=tensor(1.3375, device='cuda:0') eval_ppl=tensor(3.2778, device='cuda:0') eval_epoch_loss=tensor(1.1872, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.35it/s]
100%|██████████| 7/7 [00:01<00:00,  5.32it/s]


epoch=384: train_ppl=tensor(3.3619, device='cuda:0') train_epoch_loss=tensor(1.2125, device='cuda:0') eval_ppl=tensor(3.7890, device='cuda:0') eval_epoch_loss=tensor(1.3321, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.49it/s]
100%|██████████| 7/7 [00:01<00:00,  5.37it/s]


epoch=385: train_ppl=tensor(3.4495, device='cuda:0') train_epoch_loss=tensor(1.2382, device='cuda:0') eval_ppl=tensor(3.2471, device='cuda:0') eval_epoch_loss=tensor(1.1778, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.55it/s]
100%|██████████| 7/7 [00:01<00:00,  5.77it/s]


epoch=386: train_ppl=tensor(3.3169, device='cuda:0') train_epoch_loss=tensor(1.1990, device='cuda:0') eval_ppl=tensor(3.3651, device='cuda:0') eval_epoch_loss=tensor(1.2135, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.61it/s]
100%|██████████| 7/7 [00:01<00:00,  4.87it/s]


epoch=387: train_ppl=tensor(3.5014, device='cuda:0') train_epoch_loss=tensor(1.2531, device='cuda:0') eval_ppl=tensor(3.2207, device='cuda:0') eval_epoch_loss=tensor(1.1696, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.84it/s]
100%|██████████| 7/7 [00:01<00:00,  4.22it/s]


epoch=388: train_ppl=tensor(3.4642, device='cuda:0') train_epoch_loss=tensor(1.2425, device='cuda:0') eval_ppl=tensor(3.2356, device='cuda:0') eval_epoch_loss=tensor(1.1742, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.10it/s]
100%|██████████| 7/7 [00:01<00:00,  5.15it/s]


epoch=389: train_ppl=tensor(3.5934, device='cuda:0') train_epoch_loss=tensor(1.2791, device='cuda:0') eval_ppl=tensor(3.4729, device='cuda:0') eval_epoch_loss=tensor(1.2450, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.63it/s]
100%|██████████| 7/7 [00:01<00:00,  5.83it/s]


epoch=390: train_ppl=tensor(3.8310, device='cuda:0') train_epoch_loss=tensor(1.3431, device='cuda:0') eval_ppl=tensor(3.3575, device='cuda:0') eval_epoch_loss=tensor(1.2112, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.64it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=391: train_ppl=tensor(3.4855, device='cuda:0') train_epoch_loss=tensor(1.2486, device='cuda:0') eval_ppl=tensor(3.3604, device='cuda:0') eval_epoch_loss=tensor(1.2121, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.64it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=392: train_ppl=tensor(3.7299, device='cuda:0') train_epoch_loss=tensor(1.3164, device='cuda:0') eval_ppl=tensor(3.3376, device='cuda:0') eval_epoch_loss=tensor(1.2053, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.84it/s]


epoch=393: train_ppl=tensor(3.4670, device='cuda:0') train_epoch_loss=tensor(1.2433, device='cuda:0') eval_ppl=tensor(3.3359, device='cuda:0') eval_epoch_loss=tensor(1.2048, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=394: train_ppl=tensor(3.9022, device='cuda:0') train_epoch_loss=tensor(1.3615, device='cuda:0') eval_ppl=tensor(3.4133, device='cuda:0') eval_epoch_loss=tensor(1.2277, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.78it/s]


epoch=395: train_ppl=tensor(3.9153, device='cuda:0') train_epoch_loss=tensor(1.3649, device='cuda:0') eval_ppl=tensor(3.5578, device='cuda:0') eval_epoch_loss=tensor(1.2692, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.82it/s]


epoch=396: train_ppl=tensor(3.4937, device='cuda:0') train_epoch_loss=tensor(1.2510, device='cuda:0') eval_ppl=tensor(3.3252, device='cuda:0') eval_epoch_loss=tensor(1.2015, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.77it/s]


epoch=397: train_ppl=tensor(3.6602, device='cuda:0') train_epoch_loss=tensor(1.2975, device='cuda:0') eval_ppl=tensor(3.4509, device='cuda:0') eval_epoch_loss=tensor(1.2386, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=398: train_ppl=tensor(13.5551, device='cuda:0') train_epoch_loss=tensor(2.6068, device='cuda:0') eval_ppl=tensor(284.9440, device='cuda:0') eval_epoch_loss=tensor(5.6523, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.39it/s]
100%|██████████| 7/7 [00:01<00:00,  5.48it/s]


epoch=399: train_ppl=tensor(36.0038, device='cuda:0') train_epoch_loss=tensor(3.5836, device='cuda:0') eval_ppl=tensor(12.5606, device='cuda:0') eval_epoch_loss=tensor(2.5306, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.47it/s]
100%|██████████| 7/7 [00:01<00:00,  5.10it/s]


epoch=400: train_ppl=tensor(8.2576, device='cuda:0') train_epoch_loss=tensor(2.1111, device='cuda:0') eval_ppl=tensor(7.4948, device='cuda:0') eval_epoch_loss=tensor(2.0142, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.55it/s]
100%|██████████| 7/7 [00:01<00:00,  4.47it/s]


epoch=401: train_ppl=tensor(6.3010, device='cuda:0') train_epoch_loss=tensor(1.8407, device='cuda:0') eval_ppl=tensor(5.2916, device='cuda:0') eval_epoch_loss=tensor(1.6661, device='cuda:0')


100%|██████████| 7/7 [00:06<00:00,  1.06it/s]
100%|██████████| 7/7 [00:02<00:00,  3.49it/s]


epoch=402: train_ppl=tensor(4.7689, device='cuda:0') train_epoch_loss=tensor(1.5621, device='cuda:0') eval_ppl=tensor(4.5893, device='cuda:0') eval_epoch_loss=tensor(1.5237, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.98it/s]
100%|██████████| 7/7 [00:01<00:00,  5.81it/s]


epoch=403: train_ppl=tensor(4.3795, device='cuda:0') train_epoch_loss=tensor(1.4769, device='cuda:0') eval_ppl=tensor(4.2043, device='cuda:0') eval_epoch_loss=tensor(1.4361, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.78it/s]


epoch=404: train_ppl=tensor(4.4240, device='cuda:0') train_epoch_loss=tensor(1.4870, device='cuda:0') eval_ppl=tensor(4.2392, device='cuda:0') eval_epoch_loss=tensor(1.4444, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.51it/s]
100%|██████████| 7/7 [00:01<00:00,  5.93it/s]


epoch=405: train_ppl=tensor(4.1170, device='cuda:0') train_epoch_loss=tensor(1.4151, device='cuda:0') eval_ppl=tensor(4.0911, device='cuda:0') eval_epoch_loss=tensor(1.4088, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.96it/s]
100%|██████████| 7/7 [00:01<00:00,  5.65it/s]


epoch=406: train_ppl=tensor(4.2520, device='cuda:0') train_epoch_loss=tensor(1.4474, device='cuda:0') eval_ppl=tensor(4.2296, device='cuda:0') eval_epoch_loss=tensor(1.4421, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.09it/s]
100%|██████████| 7/7 [00:01<00:00,  5.81it/s]


epoch=407: train_ppl=tensor(4.1580, device='cuda:0') train_epoch_loss=tensor(1.4250, device='cuda:0') eval_ppl=tensor(4.2492, device='cuda:0') eval_epoch_loss=tensor(1.4467, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=408: train_ppl=tensor(4.0478, device='cuda:0') train_epoch_loss=tensor(1.3982, device='cuda:0') eval_ppl=tensor(4.2340, device='cuda:0') eval_epoch_loss=tensor(1.4432, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=409: train_ppl=tensor(4.0415, device='cuda:0') train_epoch_loss=tensor(1.3966, device='cuda:0') eval_ppl=tensor(4.1791, device='cuda:0') eval_epoch_loss=tensor(1.4301, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.70it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=410: train_ppl=tensor(4.0993, device='cuda:0') train_epoch_loss=tensor(1.4108, device='cuda:0') eval_ppl=tensor(4.1655, device='cuda:0') eval_epoch_loss=tensor(1.4268, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.86it/s]


epoch=411: train_ppl=tensor(4.0974, device='cuda:0') train_epoch_loss=tensor(1.4103, device='cuda:0') eval_ppl=tensor(4.1371, device='cuda:0') eval_epoch_loss=tensor(1.4200, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.71it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=412: train_ppl=tensor(4.0282, device='cuda:0') train_epoch_loss=tensor(1.3933, device='cuda:0') eval_ppl=tensor(4.0180, device='cuda:0') eval_epoch_loss=tensor(1.3908, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.62it/s]
100%|██████████| 7/7 [00:01<00:00,  5.44it/s]


epoch=413: train_ppl=tensor(4.1967, device='cuda:0') train_epoch_loss=tensor(1.4343, device='cuda:0') eval_ppl=tensor(4.0398, device='cuda:0') eval_epoch_loss=tensor(1.3962, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.20it/s]
100%|██████████| 7/7 [00:01<00:00,  4.92it/s]


epoch=414: train_ppl=tensor(3.9197, device='cuda:0') train_epoch_loss=tensor(1.3660, device='cuda:0') eval_ppl=tensor(4.0731, device='cuda:0') eval_epoch_loss=tensor(1.4044, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.41it/s]
100%|██████████| 7/7 [00:01<00:00,  5.73it/s]


epoch=415: train_ppl=tensor(3.9721, device='cuda:0') train_epoch_loss=tensor(1.3793, device='cuda:0') eval_ppl=tensor(3.9495, device='cuda:0') eval_epoch_loss=tensor(1.3736, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.71it/s]
100%|██████████| 7/7 [00:01<00:00,  5.92it/s]


epoch=416: train_ppl=tensor(4.0425, device='cuda:0') train_epoch_loss=tensor(1.3969, device='cuda:0') eval_ppl=tensor(4.0330, device='cuda:0') eval_epoch_loss=tensor(1.3945, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.16it/s]
100%|██████████| 7/7 [00:01<00:00,  5.52it/s]


epoch=417: train_ppl=tensor(4.0559, device='cuda:0') train_epoch_loss=tensor(1.4002, device='cuda:0') eval_ppl=tensor(4.1206, device='cuda:0') eval_epoch_loss=tensor(1.4160, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.58it/s]
100%|██████████| 7/7 [00:01<00:00,  5.85it/s]


epoch=418: train_ppl=tensor(3.9544, device='cuda:0') train_epoch_loss=tensor(1.3748, device='cuda:0') eval_ppl=tensor(4.0196, device='cuda:0') eval_epoch_loss=tensor(1.3912, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.65it/s]
100%|██████████| 7/7 [00:01<00:00,  5.08it/s]


epoch=419: train_ppl=tensor(3.9257, device='cuda:0') train_epoch_loss=tensor(1.3676, device='cuda:0') eval_ppl=tensor(4.1166, device='cuda:0') eval_epoch_loss=tensor(1.4150, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.45it/s]
100%|██████████| 7/7 [00:01<00:00,  3.93it/s]


epoch=420: train_ppl=tensor(3.9919, device='cuda:0') train_epoch_loss=tensor(1.3843, device='cuda:0') eval_ppl=tensor(4.2494, device='cuda:0') eval_epoch_loss=tensor(1.4468, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.74it/s]
100%|██████████| 7/7 [00:01<00:00,  5.72it/s]


epoch=421: train_ppl=tensor(3.8928, device='cuda:0') train_epoch_loss=tensor(1.3591, device='cuda:0') eval_ppl=tensor(4.0301, device='cuda:0') eval_epoch_loss=tensor(1.3938, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.65it/s]
100%|██████████| 7/7 [00:01<00:00,  5.37it/s]


epoch=422: train_ppl=tensor(3.8684, device='cuda:0') train_epoch_loss=tensor(1.3529, device='cuda:0') eval_ppl=tensor(3.9766, device='cuda:0') eval_epoch_loss=tensor(1.3804, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.43it/s]
100%|██████████| 7/7 [00:01<00:00,  5.42it/s]


epoch=423: train_ppl=tensor(3.8646, device='cuda:0') train_epoch_loss=tensor(1.3519, device='cuda:0') eval_ppl=tensor(4.0437, device='cuda:0') eval_epoch_loss=tensor(1.3972, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.64it/s]
100%|██████████| 7/7 [00:01<00:00,  5.81it/s]


epoch=424: train_ppl=tensor(3.8586, device='cuda:0') train_epoch_loss=tensor(1.3503, device='cuda:0') eval_ppl=tensor(3.9065, device='cuda:0') eval_epoch_loss=tensor(1.3626, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  4.70it/s]


epoch=425: train_ppl=tensor(3.8328, device='cuda:0') train_epoch_loss=tensor(1.3436, device='cuda:0') eval_ppl=tensor(3.9196, device='cuda:0') eval_epoch_loss=tensor(1.3660, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.97it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=426: train_ppl=tensor(3.8288, device='cuda:0') train_epoch_loss=tensor(1.3425, device='cuda:0') eval_ppl=tensor(3.9981, device='cuda:0') eval_epoch_loss=tensor(1.3858, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.09it/s]
100%|██████████| 7/7 [00:01<00:00,  5.83it/s]


epoch=427: train_ppl=tensor(3.8959, device='cuda:0') train_epoch_loss=tensor(1.3599, device='cuda:0') eval_ppl=tensor(3.8885, device='cuda:0') eval_epoch_loss=tensor(1.3580, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.64it/s]
100%|██████████| 7/7 [00:01<00:00,  4.80it/s]


epoch=428: train_ppl=tensor(3.8770, device='cuda:0') train_epoch_loss=tensor(1.3551, device='cuda:0') eval_ppl=tensor(3.8632, device='cuda:0') eval_epoch_loss=tensor(1.3515, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=429: train_ppl=tensor(3.8566, device='cuda:0') train_epoch_loss=tensor(1.3498, device='cuda:0') eval_ppl=tensor(3.8860, device='cuda:0') eval_epoch_loss=tensor(1.3574, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.94it/s]


epoch=430: train_ppl=tensor(3.8308, device='cuda:0') train_epoch_loss=tensor(1.3431, device='cuda:0') eval_ppl=tensor(3.8611, device='cuda:0') eval_epoch_loss=tensor(1.3509, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.96it/s]


epoch=431: train_ppl=tensor(3.7461, device='cuda:0') train_epoch_loss=tensor(1.3207, device='cuda:0') eval_ppl=tensor(3.7703, device='cuda:0') eval_epoch_loss=tensor(1.3272, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.73it/s]
100%|██████████| 7/7 [00:01<00:00,  3.60it/s]


epoch=432: train_ppl=tensor(3.6892, device='cuda:0') train_epoch_loss=tensor(1.3054, device='cuda:0') eval_ppl=tensor(3.7753, device='cuda:0') eval_epoch_loss=tensor(1.3285, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.85it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=433: train_ppl=tensor(3.7293, device='cuda:0') train_epoch_loss=tensor(1.3162, device='cuda:0') eval_ppl=tensor(3.8576, device='cuda:0') eval_epoch_loss=tensor(1.3500, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.21it/s]
100%|██████████| 7/7 [00:01<00:00,  4.77it/s]


epoch=434: train_ppl=tensor(3.7136, device='cuda:0') train_epoch_loss=tensor(1.3120, device='cuda:0') eval_ppl=tensor(3.6777, device='cuda:0') eval_epoch_loss=tensor(1.3023, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.69it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=435: train_ppl=tensor(3.6444, device='cuda:0') train_epoch_loss=tensor(1.2932, device='cuda:0') eval_ppl=tensor(3.7241, device='cuda:0') eval_epoch_loss=tensor(1.3148, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.02it/s]
100%|██████████| 7/7 [00:01<00:00,  4.72it/s]


epoch=436: train_ppl=tensor(3.7544, device='cuda:0') train_epoch_loss=tensor(1.3229, device='cuda:0') eval_ppl=tensor(3.8096, device='cuda:0') eval_epoch_loss=tensor(1.3375, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.14it/s]
100%|██████████| 7/7 [00:01<00:00,  5.28it/s]


epoch=437: train_ppl=tensor(3.5817, device='cuda:0') train_epoch_loss=tensor(1.2758, device='cuda:0') eval_ppl=tensor(3.6358, device='cuda:0') eval_epoch_loss=tensor(1.2908, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.36it/s]
100%|██████████| 7/7 [00:01<00:00,  4.87it/s]


epoch=438: train_ppl=tensor(3.7225, device='cuda:0') train_epoch_loss=tensor(1.3144, device='cuda:0') eval_ppl=tensor(3.7456, device='cuda:0') eval_epoch_loss=tensor(1.3206, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.61it/s]
100%|██████████| 7/7 [00:01<00:00,  4.86it/s]


epoch=439: train_ppl=tensor(3.5995, device='cuda:0') train_epoch_loss=tensor(1.2808, device='cuda:0') eval_ppl=tensor(3.6307, device='cuda:0') eval_epoch_loss=tensor(1.2894, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.83it/s]
100%|██████████| 7/7 [00:01<00:00,  3.96it/s]


epoch=440: train_ppl=tensor(3.7055, device='cuda:0') train_epoch_loss=tensor(1.3098, device='cuda:0') eval_ppl=tensor(3.6497, device='cuda:0') eval_epoch_loss=tensor(1.2946, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.54it/s]
100%|██████████| 7/7 [00:01<00:00,  5.66it/s]


epoch=441: train_ppl=tensor(3.6379, device='cuda:0') train_epoch_loss=tensor(1.2914, device='cuda:0') eval_ppl=tensor(3.6204, device='cuda:0') eval_epoch_loss=tensor(1.2866, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=442: train_ppl=tensor(3.6024, device='cuda:0') train_epoch_loss=tensor(1.2816, device='cuda:0') eval_ppl=tensor(3.6360, device='cuda:0') eval_epoch_loss=tensor(1.2909, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=443: train_ppl=tensor(3.6230, device='cuda:0') train_epoch_loss=tensor(1.2873, device='cuda:0') eval_ppl=tensor(3.6141, device='cuda:0') eval_epoch_loss=tensor(1.2848, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.64it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=444: train_ppl=tensor(3.5612, device='cuda:0') train_epoch_loss=tensor(1.2701, device='cuda:0') eval_ppl=tensor(3.5834, device='cuda:0') eval_epoch_loss=tensor(1.2763, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.56it/s]
100%|██████████| 7/7 [00:01<00:00,  5.87it/s]


epoch=445: train_ppl=tensor(3.5225, device='cuda:0') train_epoch_loss=tensor(1.2592, device='cuda:0') eval_ppl=tensor(3.5673, device='cuda:0') eval_epoch_loss=tensor(1.2718, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.64it/s]
100%|██████████| 7/7 [00:01<00:00,  5.89it/s]


epoch=446: train_ppl=tensor(3.7132, device='cuda:0') train_epoch_loss=tensor(1.3119, device='cuda:0') eval_ppl=tensor(3.5924, device='cuda:0') eval_epoch_loss=tensor(1.2788, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  3.87it/s]


epoch=447: train_ppl=tensor(3.5390, device='cuda:0') train_epoch_loss=tensor(1.2639, device='cuda:0') eval_ppl=tensor(3.5536, device='cuda:0') eval_epoch_loss=tensor(1.2680, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.11it/s]
100%|██████████| 7/7 [00:01<00:00,  5.72it/s]


epoch=448: train_ppl=tensor(3.4833, device='cuda:0') train_epoch_loss=tensor(1.2480, device='cuda:0') eval_ppl=tensor(3.5653, device='cuda:0') eval_epoch_loss=tensor(1.2713, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.66it/s]
100%|██████████| 7/7 [00:01<00:00,  5.85it/s]


epoch=449: train_ppl=tensor(3.4336, device='cuda:0') train_epoch_loss=tensor(1.2336, device='cuda:0') eval_ppl=tensor(3.4887, device='cuda:0') eval_epoch_loss=tensor(1.2495, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.64it/s]
100%|██████████| 7/7 [00:01<00:00,  5.78it/s]


epoch=450: train_ppl=tensor(3.5128, device='cuda:0') train_epoch_loss=tensor(1.2564, device='cuda:0') eval_ppl=tensor(3.5288, device='cuda:0') eval_epoch_loss=tensor(1.2609, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.62it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=451: train_ppl=tensor(3.4649, device='cuda:0') train_epoch_loss=tensor(1.2427, device='cuda:0') eval_ppl=tensor(3.4417, device='cuda:0') eval_epoch_loss=tensor(1.2360, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.66it/s]
100%|██████████| 7/7 [00:01<00:00,  5.75it/s]


epoch=452: train_ppl=tensor(3.4673, device='cuda:0') train_epoch_loss=tensor(1.2434, device='cuda:0') eval_ppl=tensor(3.5282, device='cuda:0') eval_epoch_loss=tensor(1.2608, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch=453: train_ppl=tensor(3.3871, device='cuda:0') train_epoch_loss=tensor(1.2200, device='cuda:0') eval_ppl=tensor(3.4824, device='cuda:0') eval_epoch_loss=tensor(1.2477, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.67it/s]
100%|██████████| 7/7 [00:01<00:00,  5.91it/s]


epoch=454: train_ppl=tensor(3.3687, device='cuda:0') train_epoch_loss=tensor(1.2145, device='cuda:0') eval_ppl=tensor(3.3763, device='cuda:0') eval_epoch_loss=tensor(1.2168, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.66it/s]
100%|██████████| 7/7 [00:01<00:00,  5.88it/s]


epoch=455: train_ppl=tensor(3.2750, device='cuda:0') train_epoch_loss=tensor(1.1863, device='cuda:0') eval_ppl=tensor(3.4310, device='cuda:0') eval_epoch_loss=tensor(1.2328, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.84it/s]


epoch=456: train_ppl=tensor(3.4826, device='cuda:0') train_epoch_loss=tensor(1.2478, device='cuda:0') eval_ppl=tensor(3.2764, device='cuda:0') eval_epoch_loss=tensor(1.1867, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.68it/s]
100%|██████████| 7/7 [00:01<00:00,  5.31it/s]


epoch=457: train_ppl=tensor(3.5206, device='cuda:0') train_epoch_loss=tensor(1.2586, device='cuda:0') eval_ppl=tensor(3.2116, device='cuda:0') eval_epoch_loss=tensor(1.1668, device='cuda:0')


100%|██████████| 7/7 [00:02<00:00,  2.44it/s]
100%|██████████| 7/7 [00:05<00:00,  1.21it/s]


epoch=458: train_ppl=tensor(3.7897, device='cuda:0') train_epoch_loss=tensor(1.3323, device='cuda:0') eval_ppl=tensor(4.1006, device='cuda:0') eval_epoch_loss=tensor(1.4111, device='cuda:0')


100%|██████████| 7/7 [00:07<00:00,  1.01s/it]
100%|██████████| 7/7 [00:03<00:00,  1.96it/s]


epoch=459: train_ppl=tensor(4.4814, device='cuda:0') train_epoch_loss=tensor(1.4999, device='cuda:0') eval_ppl=tensor(4.9567, device='cuda:0') eval_epoch_loss=tensor(1.6007, device='cuda:0')


100%|██████████| 7/7 [00:08<00:00,  1.24s/it]
100%|██████████| 7/7 [00:03<00:00,  2.11it/s]


epoch=460: train_ppl=tensor(4.6895, device='cuda:0') train_epoch_loss=tensor(1.5453, device='cuda:0') eval_ppl=tensor(3.5188, device='cuda:0') eval_epoch_loss=tensor(1.2581, device='cuda:0')


100%|██████████| 7/7 [00:04<00:00,  1.63it/s]
100%|██████████| 7/7 [00:01<00:00,  3.75it/s]


epoch=461: train_ppl=tensor(3.5697, device='cuda:0') train_epoch_loss=tensor(1.2725, device='cuda:0') eval_ppl=tensor(3.3189, device='cuda:0') eval_epoch_loss=tensor(1.1996, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  1.93it/s]
100%|██████████| 7/7 [00:01<00:00,  5.03it/s]


epoch=462: train_ppl=tensor(3.2057, device='cuda:0') train_epoch_loss=tensor(1.1649, device='cuda:0') eval_ppl=tensor(3.3006, device='cuda:0') eval_epoch_loss=tensor(1.1941, device='cuda:0')


100%|██████████| 7/7 [00:03<00:00,  2.31it/s]
100%|██████████| 7/7 [00:01<00:00,  4.66it/s]


epoch=463: train_ppl=tensor(3.4236, device='cuda:0') train_epoch_loss=tensor(1.2307, device='cuda:0') eval_ppl=tensor(3.3208, device='cuda:0') eval_epoch_loss=tensor(1.2002, device='cuda:0')


100%|██████████| 7/7 [00:08<00:00,  1.27s/it]
100%|██████████| 7/7 [00:09<00:00,  1.32s/it]


epoch=464: train_ppl=tensor(3.5100, device='cuda:0') train_epoch_loss=tensor(1.2556, device='cuda:0') eval_ppl=tensor(3.2613, device='cuda:0') eval_epoch_loss=tensor(1.1821, device='cuda:0')


100%|██████████| 7/7 [00:15<00:00,  2.20s/it]
100%|██████████| 7/7 [00:04<00:00,  1.73it/s]


epoch=465: train_ppl=tensor(3.0378, device='cuda:0') train_epoch_loss=tensor(1.1111, device='cuda:0') eval_ppl=tensor(3.7286, device='cuda:0') eval_epoch_loss=tensor(1.3160, device='cuda:0')


100%|██████████| 7/7 [00:14<00:00,  2.10s/it]
  0%|          | 0/7 [00:01<?, ?it/s]


KeyboardInterrupt: 

In [11]:
1

1

In [12]:
# print accuracy
correct = 0
total = 0
for pred, true in zip(eval_preds, dataset["train"]["text_label"]):
    if pred.strip() == true.strip():
        correct += 1
    total += 1
accuracy = correct / total * 100
print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{dataset['train']['text_label'][:10]=}")

ZeroDivisionError: division by zero

In [13]:
model.eval()
i = 7
inputs = tokenizer(f'{text_column} : {dataset["test"][i]["Tweet text"]} Label : ', return_tensors="pt")
print(dataset["test"][i]["Tweet text"])
print(inputs)

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3
    )
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

@mombot @mcrartgallery This is the most fucking stupid thing I've ever heard of. Next step is make women wear burka… https://t.co/m8dfs0isUb
{'input_ids': tensor([[227985,   5484,    915,   2566,     80,   2068,    479,   2566,     80,
           1376,    878, 147587,   3904,    632,    368,   6084,  65673,  78851,
          11736,  15527,  19082,  33151,    461,     17,  45575,  17887,    632,
           5219,  14216,  68870,   5967,   1841,   4346,  87843,     17,   1594,
          14512,     27,     71,   8184,     19,    290,  63748,  77658,    915,
            210]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[227985,   5484,    915,   2566,     80,   2068,    479,   2566,     80,
           1376,    878, 147587,   3904,    632,    368,   6084,  65673,  78851,
          11736,  15527,  19082,  33151,    461,     17,  45575,  17887,    632,


In [14]:
# saving model
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
model.save_pretrained(peft_model_id)

In [15]:
print(peft_model_id)

bigscience/bloomz-1b7_LORA_CAUSAL_LM


In [16]:
ckpt = f"{peft_model_id}/adapter_model.bin"
!du -h $ckpt

'du' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���


In [5]:
from peft import PeftModel, PeftConfig,LoraConfig

peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=Trhe, r=8, lora_alpha=32, lora_dropout=0.1)


peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"

config = PeftConfig.from_pretrained(peft_model_id)
model_ft = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
model_ft = PeftModel.from_pretrained(model_ft, peft_model_id)




In [7]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BloomForCausalLM(
      (transformer): BloomModel(
        (word_embeddings): Embedding(250880, 2048)
        (word_embeddings_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (h): ModuleList(
          (0-23): 24 x BloomBlock(
            (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (self_attention): BloomAttention(
              (query_key_value): MergedLinear(
                in_features=2048, out_features=6144, bias=True
                (lora_dropout): Dropout(p=0.1, inplace=False)
                (lora_A): Linear(in_features=2048, out_features=16, bias=False)
                (lora_B): Conv1d(16, 4096, kernel_size=(1,), stride=(1,), groups=2, bias=False)
              )
              (dense): Linear(in_features=2048, out_features=2048, bias=True)
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (post_attention_lay

In [8]:
model_ft.to(device)
model_ft.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

i = 4
inputs = tokenizer(f'{text_column} : {dataset["test"][i]["Tweet text"]} Label : ', return_tensors="pt")
print(dataset["test"][i]["Tweet text"])
print(inputs)

with torch.no_grad():
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model_ft.generate(
        input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], max_new_tokens=10, eos_token_id=3
    )
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

RuntimeError: Expected 4-dimensional input for 4-dimensional weight [4096, 8, 1, 1], but got 3-dimensional input of size [1, 16, 2048] instead