# Summarizing (w/ GPT-2)

## Packages

In [1]:
from utils.json_utils import read_json, write_json
from datasets import Dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Tokenizing

In [2]:
sentences_w_subjects_tokenized = read_json("9_non_lemmatized_tokenized_sentences_black_clover.json")
sentences_w_subjects_tokenized

[{'subjects': ['Yuno'],
  'tokens': ['a',
   'priest',
   'takes',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'inside',
   'and',
   'discovers',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'names',
   'to',
   'be',
   'Yuno',
   'and',
   'Asta',
   '.']},
 {'subjects': ['Asta'],
  'tokens': ['a',
   'priest',
   'takes',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'inside',
   'and',
   'discovers',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'names',
   'to',
   'be',
   'Yuno',
   'and',
   'Asta',
   '.']},
 {'subjects': ['Lily'],
  'tokens': ['Fifteen',
   'years',
   'later',
   ',',
   'Asta',
   'proposes',
   'to',
   'Sister',
   'Lily',
   ',',
   'who',
   'refuses',
   'repeatedly',
   '.']},
 {'subjects': ['Asta'],
  'tokens': ['Fifteen',
   'years',
   'later',
   ',',
   'Asta',
   'proposes',
   'to',
   'Sister',
   'Lily',
   

In [3]:
sentences_w_subjects_tokenized = [
    {
        "subjects" : sentence["subjects"],
        "tokens"  : " ".join(sentence["tokens"])
    } 
    for sentence in sentences_w_subjects_tokenized
]
sentences_w_subjects_tokenized

[{'subjects': ['Yuno'],
  'tokens': 'a priest takes two babies abandoned outside his church inside and discovers two babies abandoned outside his church names to be Yuno and Asta .'},
 {'subjects': ['Asta'],
  'tokens': 'a priest takes two babies abandoned outside his church inside and discovers two babies abandoned outside his church names to be Yuno and Asta .'},
 {'subjects': ['Lily'],
  'tokens': 'Fifteen years later , Asta proposes to Sister Lily , who refuses repeatedly .'},
 {'subjects': ['Asta'],
  'tokens': 'Fifteen years later , Asta proposes to Sister Lily , who refuses repeatedly .'},
 {'subjects': ['Yuno'],
  'tokens': 'Yuno and the other orphans criticize Asta and point out Yuno lack of magic .'},
 {'subjects': ['Asta'],
  'tokens': 'Yuno and the other orphans criticize Asta and point out Yuno lack of magic .'},
 {'subjects': ['Yuno'],
  'tokens': 'Asta tries to show off Asta skills , but Yuno outshines Asta with Asta magic .'},
 {'subjects': ['Asta'],
  'tokens': 'Asta t

In [4]:
max_length = 0
for sentence in sentences_w_subjects_tokenized:
    if len(sentence["tokens"]) > max_length:
        max_length = len(sentence["tokens"])
max_length

610

In [5]:
varied_set_adjectives = read_json("14_varied_set_adjectives_definitions.json")
#varied_set_adjectives

In [6]:
training_sents = list(map(lambda x : x["tokens"], sentences_w_subjects_tokenized))
#training_sents

## Transformer

In [7]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

2023-01-26 15:46:07.339643: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-26 15:46:07.411895: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-26 15:46:07.411908: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-26 15:46:07.801867: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [8]:
model_checkpoint = "bert-base-cased"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [10]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [11]:
tokenize_fn = lambda doc : tokenizer(
        doc,
        truncation=True,
        max_length=1024
    )

tokenized_dataset = list(map(tokenize_fn, training_sents))

In [12]:
tokenized_dataset = pd.DataFrame(tokenized_dataset)
tokenized_dataset = Dataset.from_pandas(tokenized_dataset)

In [13]:
model = AutoModelForMaskedLM.from_pretrained(
    model_checkpoint, pad_token_id=tokenizer.eos_token_id)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
device = f"cuda:{torch.cuda.current_device()}" if torch.cuda.is_available() else "cpu"

In [15]:
training_args = TrainingArguments(
    f"{model_checkpoint}-finetuned-sents-coref-black-clover",
    num_train_epochs=1,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,    
    learning_rate=2e-5,
    weight_decay=0.01,
    do_eval=True, # eval en validation set
    evaluation_strategy="steps", # eval en validation set
    eval_steps=100,
    save_steps=100, # checkpoint model every 500 steps
    logging_dir='./logs', # logging
    logging_strategy="steps",
    logging_steps=1,
    fp16=False, # float16 en training (only on CUDA)
    push_to_hub=False,
)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset, #.select(range(0, 128)),
    eval_dataset=tokenized_dataset, #.select(range(0, 128)),
)

In [17]:
train_output = trainer.train()

***** Running training *****
  Num examples = 5989
  Num Epochs = 1
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 599
  0%|          | 0/599 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 1/599 [00:03<32:21,  3.25s/it]

{'loss': 1.3977, 'learning_rate': 1.996661101836394e-05, 'epoch': 0.0}


  0%|          | 2/599 [00:05<26:53,  2.70s/it]

{'loss': 1.3895, 'learning_rate': 1.993322203672788e-05, 'epoch': 0.0}


  1%|          | 3/599 [00:07<21:28,  2.16s/it]

{'loss': 1.1783, 'learning_rate': 1.989983305509182e-05, 'epoch': 0.01}


  1%|          | 4/599 [00:09<22:26,  2.26s/it]

{'loss': 0.8765, 'learning_rate': 1.986644407345576e-05, 'epoch': 0.01}


  1%|          | 5/599 [00:12<25:40,  2.59s/it]

{'loss': 0.7257, 'learning_rate': 1.98330550918197e-05, 'epoch': 0.01}


  1%|          | 6/599 [00:14<24:34,  2.49s/it]

{'loss': 0.8388, 'learning_rate': 1.979966611018364e-05, 'epoch': 0.01}


  1%|          | 7/599 [00:17<23:15,  2.36s/it]

{'loss': 0.7703, 'learning_rate': 1.976627712854758e-05, 'epoch': 0.01}


  1%|▏         | 8/599 [00:18<20:30,  2.08s/it]

{'loss': 0.9715, 'learning_rate': 1.973288814691152e-05, 'epoch': 0.01}


  2%|▏         | 9/599 [00:20<20:13,  2.06s/it]

{'loss': 0.6549, 'learning_rate': 1.969949916527546e-05, 'epoch': 0.02}


  2%|▏         | 10/599 [00:22<20:58,  2.14s/it]

{'loss': 0.6644, 'learning_rate': 1.96661101836394e-05, 'epoch': 0.02}


  2%|▏         | 11/599 [00:24<20:11,  2.06s/it]

{'loss': 0.5672, 'learning_rate': 1.963272120200334e-05, 'epoch': 0.02}


  2%|▏         | 12/599 [00:26<18:39,  1.91s/it]

{'loss': 0.5608, 'learning_rate': 1.959933222036728e-05, 'epoch': 0.02}


  2%|▏         | 13/599 [00:28<18:41,  1.91s/it]

{'loss': 0.4516, 'learning_rate': 1.956594323873122e-05, 'epoch': 0.02}


  2%|▏         | 14/599 [00:29<17:34,  1.80s/it]

{'loss': 0.5116, 'learning_rate': 1.953255425709516e-05, 'epoch': 0.02}


  3%|▎         | 15/599 [00:32<20:35,  2.12s/it]

{'loss': 0.3003, 'learning_rate': 1.94991652754591e-05, 'epoch': 0.03}


  3%|▎         | 16/599 [00:34<19:18,  1.99s/it]

{'loss': 0.3358, 'learning_rate': 1.946577629382304e-05, 'epoch': 0.03}


  3%|▎         | 17/599 [00:36<20:14,  2.09s/it]

{'loss': 0.1727, 'learning_rate': 1.943238731218698e-05, 'epoch': 0.03}


  3%|▎         | 18/599 [00:38<19:03,  1.97s/it]

{'loss': 0.1503, 'learning_rate': 1.939899833055092e-05, 'epoch': 0.03}


  3%|▎         | 19/599 [00:39<17:30,  1.81s/it]

{'loss': 0.1691, 'learning_rate': 1.936560934891486e-05, 'epoch': 0.03}


  3%|▎         | 20/599 [00:41<18:28,  1.91s/it]

{'loss': 0.1187, 'learning_rate': 1.93322203672788e-05, 'epoch': 0.03}


  4%|▎         | 21/599 [00:44<21:19,  2.21s/it]

{'loss': 0.0588, 'learning_rate': 1.929883138564274e-05, 'epoch': 0.04}


  4%|▎         | 22/599 [00:46<20:27,  2.13s/it]

{'loss': 0.0431, 'learning_rate': 1.926544240400668e-05, 'epoch': 0.04}


  4%|▍         | 23/599 [00:49<20:52,  2.17s/it]

{'loss': 0.0214, 'learning_rate': 1.923205342237062e-05, 'epoch': 0.04}


  4%|▍         | 24/599 [00:51<20:20,  2.12s/it]

{'loss': 0.0133, 'learning_rate': 1.919866444073456e-05, 'epoch': 0.04}


  4%|▍         | 25/599 [00:53<20:22,  2.13s/it]

{'loss': 0.0107, 'learning_rate': 1.9165275459098497e-05, 'epoch': 0.04}


  4%|▍         | 26/599 [00:55<19:50,  2.08s/it]

{'loss': 0.0625, 'learning_rate': 1.9131886477462437e-05, 'epoch': 0.04}


  5%|▍         | 27/599 [00:57<20:20,  2.13s/it]

{'loss': 0.0051, 'learning_rate': 1.9098497495826377e-05, 'epoch': 0.05}


  5%|▍         | 28/599 [00:59<19:39,  2.07s/it]

{'loss': 0.0082, 'learning_rate': 1.9065108514190317e-05, 'epoch': 0.05}


  5%|▍         | 29/599 [01:01<19:08,  2.01s/it]

{'loss': 0.1344, 'learning_rate': 1.9031719532554257e-05, 'epoch': 0.05}


  5%|▌         | 30/599 [01:02<17:37,  1.86s/it]

{'loss': 0.0057, 'learning_rate': 1.8998330550918197e-05, 'epoch': 0.05}


  5%|▌         | 31/599 [01:04<18:36,  1.97s/it]

{'loss': 0.0145, 'learning_rate': 1.8964941569282137e-05, 'epoch': 0.05}


  5%|▌         | 32/599 [01:07<21:34,  2.28s/it]

{'loss': 0.0044, 'learning_rate': 1.8931552587646077e-05, 'epoch': 0.05}


  6%|▌         | 33/599 [01:10<22:26,  2.38s/it]

{'loss': 0.0142, 'learning_rate': 1.8898163606010017e-05, 'epoch': 0.06}


  6%|▌         | 34/599 [01:12<21:17,  2.26s/it]

{'loss': 0.0051, 'learning_rate': 1.8864774624373957e-05, 'epoch': 0.06}


  6%|▌         | 35/599 [01:14<21:05,  2.24s/it]

{'loss': 0.0703, 'learning_rate': 1.8831385642737897e-05, 'epoch': 0.06}


  6%|▌         | 36/599 [01:17<21:07,  2.25s/it]

{'loss': 0.0027, 'learning_rate': 1.8797996661101837e-05, 'epoch': 0.06}


  6%|▌         | 37/599 [01:18<20:19,  2.17s/it]

{'loss': 0.0052, 'learning_rate': 1.8764607679465777e-05, 'epoch': 0.06}


  6%|▋         | 38/599 [01:20<18:47,  2.01s/it]

{'loss': 0.0073, 'learning_rate': 1.8731218697829717e-05, 'epoch': 0.06}


  7%|▋         | 39/599 [01:22<17:53,  1.92s/it]

{'loss': 0.0022, 'learning_rate': 1.8697829716193657e-05, 'epoch': 0.07}


  7%|▋         | 40/599 [01:25<20:03,  2.15s/it]

{'loss': 0.0014, 'learning_rate': 1.8664440734557597e-05, 'epoch': 0.07}


  7%|▋         | 41/599 [01:30<28:36,  3.08s/it]

{'loss': 0.0021, 'learning_rate': 1.8631051752921537e-05, 'epoch': 0.07}


  7%|▋         | 42/599 [01:31<24:37,  2.65s/it]

{'loss': 0.0025, 'learning_rate': 1.8597662771285477e-05, 'epoch': 0.07}


  7%|▋         | 43/599 [01:34<23:43,  2.56s/it]

{'loss': 0.0011, 'learning_rate': 1.8564273789649417e-05, 'epoch': 0.07}


  7%|▋         | 44/599 [01:36<22:29,  2.43s/it]

{'loss': 0.0017, 'learning_rate': 1.8530884808013357e-05, 'epoch': 0.07}


  8%|▊         | 45/599 [01:38<20:38,  2.24s/it]

{'loss': 0.0016, 'learning_rate': 1.8497495826377297e-05, 'epoch': 0.08}


  8%|▊         | 46/599 [01:39<18:44,  2.03s/it]

{'loss': 0.0012, 'learning_rate': 1.8464106844741237e-05, 'epoch': 0.08}


  8%|▊         | 47/599 [01:41<17:01,  1.85s/it]

{'loss': 0.0029, 'learning_rate': 1.8430717863105177e-05, 'epoch': 0.08}


  8%|▊         | 48/599 [01:43<17:25,  1.90s/it]

{'loss': 0.002, 'learning_rate': 1.8397328881469117e-05, 'epoch': 0.08}


  8%|▊         | 49/599 [01:44<16:14,  1.77s/it]

{'loss': 0.0009, 'learning_rate': 1.8363939899833057e-05, 'epoch': 0.08}


  8%|▊         | 50/599 [01:46<15:37,  1.71s/it]

{'loss': 0.0014, 'learning_rate': 1.8330550918196996e-05, 'epoch': 0.08}


  9%|▊         | 51/599 [01:48<16:20,  1.79s/it]

{'loss': 0.0012, 'learning_rate': 1.8297161936560936e-05, 'epoch': 0.09}


  9%|▊         | 52/599 [01:50<16:54,  1.86s/it]

{'loss': 0.0577, 'learning_rate': 1.8263772954924876e-05, 'epoch': 0.09}


  9%|▉         | 53/599 [01:51<15:06,  1.66s/it]

{'loss': 0.0017, 'learning_rate': 1.8230383973288816e-05, 'epoch': 0.09}


  9%|▉         | 54/599 [01:54<19:05,  2.10s/it]

{'loss': 0.0026, 'learning_rate': 1.8196994991652756e-05, 'epoch': 0.09}


  9%|▉         | 55/599 [01:56<17:26,  1.92s/it]

{'loss': 0.0009, 'learning_rate': 1.8163606010016696e-05, 'epoch': 0.09}


  9%|▉         | 56/599 [01:59<20:28,  2.26s/it]

{'loss': 0.0015, 'learning_rate': 1.8130217028380636e-05, 'epoch': 0.09}


 10%|▉         | 57/599 [02:01<21:54,  2.42s/it]

{'loss': 0.0018, 'learning_rate': 1.8096828046744576e-05, 'epoch': 0.1}


 10%|▉         | 58/599 [02:04<21:48,  2.42s/it]

{'loss': 0.0017, 'learning_rate': 1.8063439065108516e-05, 'epoch': 0.1}


 10%|▉         | 59/599 [02:08<25:17,  2.81s/it]

{'loss': 0.0014, 'learning_rate': 1.8030050083472456e-05, 'epoch': 0.1}


 10%|█         | 60/599 [02:10<25:22,  2.82s/it]

{'loss': 0.0013, 'learning_rate': 1.7996661101836396e-05, 'epoch': 0.1}


 10%|█         | 61/599 [02:12<21:45,  2.43s/it]

{'loss': 0.0043, 'learning_rate': 1.7963272120200336e-05, 'epoch': 0.1}


 10%|█         | 62/599 [02:14<21:53,  2.45s/it]

{'loss': 0.0011, 'learning_rate': 1.7929883138564276e-05, 'epoch': 0.1}


 11%|█         | 63/599 [02:16<20:38,  2.31s/it]

{'loss': 0.0054, 'learning_rate': 1.7896494156928216e-05, 'epoch': 0.11}


 11%|█         | 64/599 [02:18<19:32,  2.19s/it]

{'loss': 0.0535, 'learning_rate': 1.7863105175292156e-05, 'epoch': 0.11}


 11%|█         | 65/599 [02:21<21:40,  2.44s/it]

{'loss': 0.001, 'learning_rate': 1.7829716193656096e-05, 'epoch': 0.11}


 11%|█         | 66/599 [02:23<19:36,  2.21s/it]

{'loss': 0.0013, 'learning_rate': 1.7796327212020036e-05, 'epoch': 0.11}


 11%|█         | 67/599 [02:25<17:53,  2.02s/it]

{'loss': 0.0008, 'learning_rate': 1.7762938230383976e-05, 'epoch': 0.11}


 11%|█▏        | 68/599 [02:27<19:02,  2.15s/it]

{'loss': 0.0017, 'learning_rate': 1.7729549248747916e-05, 'epoch': 0.11}


 12%|█▏        | 69/599 [02:29<17:46,  2.01s/it]

{'loss': 0.0009, 'learning_rate': 1.7696160267111856e-05, 'epoch': 0.12}


 12%|█▏        | 70/599 [02:31<17:46,  2.02s/it]

{'loss': 0.0007, 'learning_rate': 1.7662771285475796e-05, 'epoch': 0.12}


 12%|█▏        | 71/599 [02:33<17:19,  1.97s/it]

{'loss': 0.0005, 'learning_rate': 1.7629382303839736e-05, 'epoch': 0.12}


 12%|█▏        | 72/599 [02:35<17:57,  2.05s/it]

{'loss': 0.0015, 'learning_rate': 1.7595993322203676e-05, 'epoch': 0.12}


 12%|█▏        | 73/599 [02:37<18:14,  2.08s/it]

{'loss': 0.0019, 'learning_rate': 1.7562604340567616e-05, 'epoch': 0.12}


 12%|█▏        | 74/599 [02:42<25:31,  2.92s/it]

{'loss': 0.0006, 'learning_rate': 1.7529215358931556e-05, 'epoch': 0.12}


 13%|█▎        | 75/599 [02:44<22:29,  2.57s/it]

{'loss': 0.0007, 'learning_rate': 1.7495826377295492e-05, 'epoch': 0.13}


 13%|█▎        | 76/599 [02:45<20:36,  2.36s/it]

{'loss': 0.001, 'learning_rate': 1.7462437395659432e-05, 'epoch': 0.13}


 13%|█▎        | 77/599 [02:47<19:34,  2.25s/it]

{'loss': 0.0006, 'learning_rate': 1.7429048414023372e-05, 'epoch': 0.13}


 13%|█▎        | 78/599 [02:49<17:25,  2.01s/it]

{'loss': 0.0016, 'learning_rate': 1.7395659432387312e-05, 'epoch': 0.13}


 13%|█▎        | 79/599 [02:53<23:11,  2.68s/it]

{'loss': 0.001, 'learning_rate': 1.7362270450751252e-05, 'epoch': 0.13}


 13%|█▎        | 80/599 [02:55<20:53,  2.42s/it]

{'loss': 0.0006, 'learning_rate': 1.7328881469115192e-05, 'epoch': 0.13}


 14%|█▎        | 81/599 [02:57<20:21,  2.36s/it]

{'loss': 0.0044, 'learning_rate': 1.7295492487479132e-05, 'epoch': 0.14}


 14%|█▎        | 82/599 [02:59<19:23,  2.25s/it]

{'loss': 0.0003, 'learning_rate': 1.7262103505843072e-05, 'epoch': 0.14}


 14%|█▍        | 83/599 [03:01<17:15,  2.01s/it]

{'loss': 0.0024, 'learning_rate': 1.7228714524207012e-05, 'epoch': 0.14}


 14%|█▍        | 84/599 [03:02<16:09,  1.88s/it]

{'loss': 0.0007, 'learning_rate': 1.7195325542570952e-05, 'epoch': 0.14}


 14%|█▍        | 85/599 [03:04<17:03,  1.99s/it]

{'loss': 0.0008, 'learning_rate': 1.7161936560934892e-05, 'epoch': 0.14}


 14%|█▍        | 86/599 [03:07<18:16,  2.14s/it]

{'loss': 0.0006, 'learning_rate': 1.7128547579298832e-05, 'epoch': 0.14}


 15%|█▍        | 87/599 [03:09<19:09,  2.25s/it]

{'loss': 0.0006, 'learning_rate': 1.7095158597662772e-05, 'epoch': 0.15}


 15%|█▍        | 88/599 [03:12<19:04,  2.24s/it]

{'loss': 0.0008, 'learning_rate': 1.7061769616026712e-05, 'epoch': 0.15}


 15%|█▍        | 89/599 [03:14<19:16,  2.27s/it]

{'loss': 0.001, 'learning_rate': 1.7028380634390652e-05, 'epoch': 0.15}


 15%|█▌        | 90/599 [03:16<18:24,  2.17s/it]

{'loss': 0.0006, 'learning_rate': 1.6994991652754592e-05, 'epoch': 0.15}


 15%|█▌        | 91/599 [03:18<17:57,  2.12s/it]

{'loss': 0.0011, 'learning_rate': 1.696160267111853e-05, 'epoch': 0.15}


 15%|█▌        | 92/599 [03:20<18:25,  2.18s/it]

{'loss': 0.0005, 'learning_rate': 1.692821368948247e-05, 'epoch': 0.15}


 16%|█▌        | 93/599 [03:22<18:03,  2.14s/it]

{'loss': 0.0052, 'learning_rate': 1.689482470784641e-05, 'epoch': 0.16}


 16%|█▌        | 94/599 [03:24<17:43,  2.11s/it]

{'loss': 0.0006, 'learning_rate': 1.686143572621035e-05, 'epoch': 0.16}


 16%|█▌        | 95/599 [03:27<17:53,  2.13s/it]

{'loss': 0.0007, 'learning_rate': 1.682804674457429e-05, 'epoch': 0.16}


 16%|█▌        | 96/599 [03:28<16:15,  1.94s/it]

{'loss': 0.0018, 'learning_rate': 1.679465776293823e-05, 'epoch': 0.16}


 16%|█▌        | 97/599 [03:30<16:05,  1.92s/it]

{'loss': 0.0008, 'learning_rate': 1.676126878130217e-05, 'epoch': 0.16}


 16%|█▋        | 98/599 [03:32<15:39,  1.88s/it]

{'loss': 0.0028, 'learning_rate': 1.672787979966611e-05, 'epoch': 0.16}


 17%|█▋        | 99/599 [03:34<17:35,  2.11s/it]

{'loss': 0.0007, 'learning_rate': 1.669449081803005e-05, 'epoch': 0.17}


 17%|█▋        | 100/599 [03:37<18:41,  2.25s/it]***** Running Evaluation *****
  Num examples = 5989
  Batch size = 10


{'loss': 0.0005, 'learning_rate': 1.666110183639399e-05, 'epoch': 0.17}


                                                 
 17%|█▋        | 100/599 [08:38<18:41,  2.25s/it]Saving model checkpoint to bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-100
Configuration saved in bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-100/config.json


{'eval_loss': 0.0024367414880543947, 'eval_runtime': 301.1663, 'eval_samples_per_second': 19.886, 'eval_steps_per_second': 1.989, 'epoch': 0.17}


Model weights saved in bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-100/pytorch_model.bin
 17%|█▋        | 101/599 [08:41<12:50:27, 92.83s/it]

{'loss': 0.0005, 'learning_rate': 1.662771285475793e-05, 'epoch': 0.17}


 17%|█▋        | 102/599 [08:44<9:05:27, 65.85s/it] 

{'loss': 0.0354, 'learning_rate': 1.659432387312187e-05, 'epoch': 0.17}


 17%|█▋        | 103/599 [08:46<6:26:17, 46.73s/it]

{'loss': 0.0008, 'learning_rate': 1.656093489148581e-05, 'epoch': 0.17}


 17%|█▋        | 104/599 [08:48<4:34:49, 33.31s/it]

{'loss': 0.0004, 'learning_rate': 1.652754590984975e-05, 'epoch': 0.17}


 18%|█▊        | 105/599 [08:50<3:17:07, 23.94s/it]

{'loss': 0.0042, 'learning_rate': 1.649415692821369e-05, 'epoch': 0.18}


 18%|█▊        | 106/599 [08:53<2:24:13, 17.55s/it]

{'loss': 0.0012, 'learning_rate': 1.646076794657763e-05, 'epoch': 0.18}


 18%|█▊        | 107/599 [08:56<1:48:06, 13.18s/it]

{'loss': 0.0004, 'learning_rate': 1.642737896494157e-05, 'epoch': 0.18}


 18%|█▊        | 108/599 [08:59<1:22:12, 10.05s/it]

{'loss': 0.0302, 'learning_rate': 1.639398998330551e-05, 'epoch': 0.18}


 18%|█▊        | 109/599 [09:04<1:10:46,  8.67s/it]

{'loss': 0.0013, 'learning_rate': 1.636060100166945e-05, 'epoch': 0.18}


 18%|█▊        | 110/599 [09:06<55:26,  6.80s/it]  

{'loss': 0.0004, 'learning_rate': 1.632721202003339e-05, 'epoch': 0.18}


 19%|█▊        | 111/599 [09:09<44:47,  5.51s/it]

{'loss': 0.0011, 'learning_rate': 1.629382303839733e-05, 'epoch': 0.19}


 19%|█▊        | 112/599 [09:12<37:55,  4.67s/it]

{'loss': 0.0004, 'learning_rate': 1.626043405676127e-05, 'epoch': 0.19}


 19%|█▉        | 113/599 [09:14<33:18,  4.11s/it]

{'loss': 0.0048, 'learning_rate': 1.622704507512521e-05, 'epoch': 0.19}


 19%|█▉        | 114/599 [09:17<29:15,  3.62s/it]

{'loss': 0.0007, 'learning_rate': 1.619365609348915e-05, 'epoch': 0.19}


 19%|█▉        | 115/599 [09:19<26:17,  3.26s/it]

{'loss': 0.0008, 'learning_rate': 1.616026711185309e-05, 'epoch': 0.19}


 19%|█▉        | 116/599 [09:21<22:20,  2.78s/it]

{'loss': 0.0007, 'learning_rate': 1.612687813021703e-05, 'epoch': 0.19}


 20%|█▉        | 117/599 [09:23<20:15,  2.52s/it]

{'loss': 0.0005, 'learning_rate': 1.609348914858097e-05, 'epoch': 0.2}


 20%|█▉        | 118/599 [09:25<19:04,  2.38s/it]

{'loss': 0.0005, 'learning_rate': 1.606010016694491e-05, 'epoch': 0.2}


 20%|█▉        | 119/599 [09:27<18:35,  2.32s/it]

{'loss': 0.0026, 'learning_rate': 1.602671118530885e-05, 'epoch': 0.2}


 20%|██        | 120/599 [09:30<19:10,  2.40s/it]

{'loss': 0.0007, 'learning_rate': 1.599332220367279e-05, 'epoch': 0.2}


 20%|██        | 121/599 [09:32<17:53,  2.25s/it]

{'loss': 0.0013, 'learning_rate': 1.595993322203673e-05, 'epoch': 0.2}


 20%|██        | 122/599 [09:34<17:01,  2.14s/it]

{'loss': 0.0005, 'learning_rate': 1.592654424040067e-05, 'epoch': 0.2}


 21%|██        | 123/599 [09:36<17:21,  2.19s/it]

{'loss': 0.0006, 'learning_rate': 1.589315525876461e-05, 'epoch': 0.21}


 21%|██        | 124/599 [09:38<16:44,  2.11s/it]

{'loss': 0.0018, 'learning_rate': 1.585976627712855e-05, 'epoch': 0.21}


 21%|██        | 125/599 [09:39<15:04,  1.91s/it]

{'loss': 0.001, 'learning_rate': 1.5826377295492487e-05, 'epoch': 0.21}


 21%|██        | 126/599 [09:41<15:28,  1.96s/it]

{'loss': 0.0007, 'learning_rate': 1.5792988313856427e-05, 'epoch': 0.21}


 21%|██        | 127/599 [09:44<16:48,  2.14s/it]

{'loss': 0.0008, 'learning_rate': 1.5759599332220367e-05, 'epoch': 0.21}


 21%|██▏       | 128/599 [09:46<17:09,  2.19s/it]

{'loss': 0.0009, 'learning_rate': 1.5726210350584307e-05, 'epoch': 0.21}


 22%|██▏       | 129/599 [09:48<16:43,  2.13s/it]

{'loss': 0.0005, 'learning_rate': 1.5692821368948247e-05, 'epoch': 0.22}


 22%|██▏       | 130/599 [09:51<17:39,  2.26s/it]

{'loss': 0.0005, 'learning_rate': 1.5659432387312187e-05, 'epoch': 0.22}


 22%|██▏       | 131/599 [09:53<18:16,  2.34s/it]

{'loss': 0.0004, 'learning_rate': 1.5626043405676127e-05, 'epoch': 0.22}


 22%|██▏       | 132/599 [09:56<19:10,  2.46s/it]

{'loss': 0.0005, 'learning_rate': 1.5592654424040067e-05, 'epoch': 0.22}


 22%|██▏       | 133/599 [09:58<17:07,  2.21s/it]

{'loss': 0.002, 'learning_rate': 1.5559265442404007e-05, 'epoch': 0.22}


 22%|██▏       | 134/599 [09:59<15:57,  2.06s/it]

{'loss': 0.0008, 'learning_rate': 1.5525876460767947e-05, 'epoch': 0.22}


 23%|██▎       | 135/599 [10:02<17:01,  2.20s/it]

{'loss': 0.0003, 'learning_rate': 1.5492487479131887e-05, 'epoch': 0.23}


 23%|██▎       | 136/599 [10:04<16:04,  2.08s/it]

{'loss': 0.0006, 'learning_rate': 1.5459098497495827e-05, 'epoch': 0.23}


 23%|██▎       | 137/599 [10:05<14:56,  1.94s/it]

{'loss': 0.0018, 'learning_rate': 1.5425709515859767e-05, 'epoch': 0.23}


 23%|██▎       | 138/599 [10:07<14:38,  1.90s/it]

{'loss': 0.0004, 'learning_rate': 1.5392320534223707e-05, 'epoch': 0.23}


 23%|██▎       | 139/599 [10:09<14:14,  1.86s/it]

{'loss': 0.0005, 'learning_rate': 1.5358931552587647e-05, 'epoch': 0.23}


 23%|██▎       | 140/599 [10:10<13:50,  1.81s/it]

{'loss': 0.0033, 'learning_rate': 1.5325542570951587e-05, 'epoch': 0.23}


 24%|██▎       | 141/599 [10:12<13:58,  1.83s/it]

{'loss': 0.0008, 'learning_rate': 1.5292153589315527e-05, 'epoch': 0.24}


 24%|██▎       | 142/599 [10:14<14:37,  1.92s/it]

{'loss': 0.0006, 'learning_rate': 1.5258764607679466e-05, 'epoch': 0.24}


 24%|██▍       | 143/599 [10:17<16:10,  2.13s/it]

{'loss': 0.0008, 'learning_rate': 1.5225375626043406e-05, 'epoch': 0.24}


 24%|██▍       | 144/599 [10:19<15:10,  2.00s/it]

{'loss': 0.0006, 'learning_rate': 1.5191986644407346e-05, 'epoch': 0.24}


 24%|██▍       | 145/599 [10:21<14:34,  1.93s/it]

{'loss': 0.0006, 'learning_rate': 1.5158597662771286e-05, 'epoch': 0.24}


 24%|██▍       | 146/599 [10:22<14:02,  1.86s/it]

{'loss': 0.0021, 'learning_rate': 1.5125208681135226e-05, 'epoch': 0.24}


 25%|██▍       | 147/599 [10:24<13:29,  1.79s/it]

{'loss': 0.0006, 'learning_rate': 1.5091819699499166e-05, 'epoch': 0.25}


 25%|██▍       | 148/599 [10:26<14:57,  1.99s/it]

{'loss': 0.0004, 'learning_rate': 1.5058430717863106e-05, 'epoch': 0.25}


 25%|██▍       | 149/599 [10:28<14:16,  1.90s/it]

{'loss': 0.0006, 'learning_rate': 1.5025041736227046e-05, 'epoch': 0.25}


 25%|██▌       | 150/599 [10:30<15:09,  2.03s/it]

{'loss': 0.0016, 'learning_rate': 1.4991652754590986e-05, 'epoch': 0.25}


 25%|██▌       | 151/599 [10:33<15:57,  2.14s/it]

{'loss': 0.0005, 'learning_rate': 1.4958263772954926e-05, 'epoch': 0.25}


 25%|██▌       | 152/599 [10:36<18:31,  2.49s/it]

{'loss': 0.0004, 'learning_rate': 1.4924874791318866e-05, 'epoch': 0.25}


 26%|██▌       | 153/599 [10:39<18:37,  2.51s/it]

{'loss': 0.0004, 'learning_rate': 1.4891485809682806e-05, 'epoch': 0.26}


 26%|██▌       | 154/599 [10:41<17:23,  2.34s/it]

{'loss': 0.001, 'learning_rate': 1.4858096828046746e-05, 'epoch': 0.26}


 26%|██▌       | 155/599 [10:43<17:02,  2.30s/it]

{'loss': 0.0004, 'learning_rate': 1.4824707846410686e-05, 'epoch': 0.26}


 26%|██▌       | 156/599 [10:45<16:05,  2.18s/it]

{'loss': 0.0005, 'learning_rate': 1.4791318864774626e-05, 'epoch': 0.26}


 26%|██▌       | 157/599 [10:47<15:48,  2.15s/it]

{'loss': 0.0014, 'learning_rate': 1.4757929883138566e-05, 'epoch': 0.26}


 26%|██▋       | 158/599 [10:48<14:04,  1.92s/it]

{'loss': 0.0007, 'learning_rate': 1.4724540901502506e-05, 'epoch': 0.26}


 27%|██▋       | 159/599 [10:50<13:20,  1.82s/it]

{'loss': 0.0005, 'learning_rate': 1.4691151919866444e-05, 'epoch': 0.27}


 27%|██▋       | 160/599 [10:54<18:20,  2.51s/it]

{'loss': 0.0003, 'learning_rate': 1.4657762938230384e-05, 'epoch': 0.27}


 27%|██▋       | 161/599 [10:56<17:13,  2.36s/it]

{'loss': 0.0014, 'learning_rate': 1.4624373956594324e-05, 'epoch': 0.27}


 27%|██▋       | 162/599 [11:01<22:24,  3.08s/it]

{'loss': 0.0004, 'learning_rate': 1.4590984974958264e-05, 'epoch': 0.27}


 27%|██▋       | 163/599 [11:04<22:23,  3.08s/it]

{'loss': 0.0009, 'learning_rate': 1.4557595993322204e-05, 'epoch': 0.27}


 27%|██▋       | 164/599 [11:07<23:40,  3.27s/it]

{'loss': 0.0004, 'learning_rate': 1.4524207011686144e-05, 'epoch': 0.27}


 28%|██▊       | 165/599 [11:10<22:00,  3.04s/it]

{'loss': 0.0003, 'learning_rate': 1.4490818030050084e-05, 'epoch': 0.28}


 28%|██▊       | 166/599 [11:13<21:32,  2.99s/it]

{'loss': 0.0012, 'learning_rate': 1.4457429048414024e-05, 'epoch': 0.28}


 28%|██▊       | 167/599 [11:16<21:26,  2.98s/it]

{'loss': 0.0004, 'learning_rate': 1.4424040066777964e-05, 'epoch': 0.28}


 28%|██▊       | 168/599 [11:18<20:30,  2.85s/it]

{'loss': 0.0004, 'learning_rate': 1.4390651085141904e-05, 'epoch': 0.28}


 28%|██▊       | 169/599 [11:20<18:21,  2.56s/it]

{'loss': 0.0009, 'learning_rate': 1.4357262103505844e-05, 'epoch': 0.28}


 28%|██▊       | 170/599 [11:22<17:23,  2.43s/it]

{'loss': 0.0002, 'learning_rate': 1.4323873121869784e-05, 'epoch': 0.28}


 29%|██▊       | 171/599 [11:25<16:52,  2.37s/it]

{'loss': 0.0008, 'learning_rate': 1.4290484140233725e-05, 'epoch': 0.29}


 29%|██▊       | 172/599 [11:27<17:26,  2.45s/it]

{'loss': 0.0003, 'learning_rate': 1.4257095158597665e-05, 'epoch': 0.29}


 29%|██▉       | 173/599 [11:29<16:27,  2.32s/it]

{'loss': 0.0013, 'learning_rate': 1.4223706176961605e-05, 'epoch': 0.29}


 29%|██▉       | 174/599 [11:31<15:16,  2.16s/it]

{'loss': 0.0003, 'learning_rate': 1.4190317195325545e-05, 'epoch': 0.29}


 29%|██▉       | 175/599 [11:36<22:12,  3.14s/it]

{'loss': 0.0005, 'learning_rate': 1.4156928213689482e-05, 'epoch': 0.29}


 29%|██▉       | 176/599 [11:40<23:20,  3.31s/it]

{'loss': 0.0005, 'learning_rate': 1.4123539232053422e-05, 'epoch': 0.29}


 30%|██▉       | 177/599 [11:42<21:01,  2.99s/it]

{'loss': 0.0002, 'learning_rate': 1.4090150250417362e-05, 'epoch': 0.3}


 30%|██▉       | 178/599 [11:44<19:09,  2.73s/it]

{'loss': 0.0011, 'learning_rate': 1.4056761268781302e-05, 'epoch': 0.3}


 30%|██▉       | 179/599 [11:47<18:21,  2.62s/it]

{'loss': 0.0005, 'learning_rate': 1.4023372287145244e-05, 'epoch': 0.3}


 30%|███       | 180/599 [11:49<17:30,  2.51s/it]

{'loss': 0.0003, 'learning_rate': 1.3989983305509183e-05, 'epoch': 0.3}


 30%|███       | 181/599 [11:52<17:45,  2.55s/it]

{'loss': 0.0007, 'learning_rate': 1.3956594323873123e-05, 'epoch': 0.3}


 30%|███       | 182/599 [11:54<16:44,  2.41s/it]

{'loss': 0.0005, 'learning_rate': 1.3923205342237063e-05, 'epoch': 0.3}


 31%|███       | 183/599 [11:56<16:52,  2.43s/it]

{'loss': 0.0005, 'learning_rate': 1.3889816360601003e-05, 'epoch': 0.31}


 31%|███       | 184/599 [11:58<14:54,  2.15s/it]

{'loss': 0.0004, 'learning_rate': 1.3856427378964943e-05, 'epoch': 0.31}


 31%|███       | 185/599 [12:00<14:24,  2.09s/it]

{'loss': 0.0004, 'learning_rate': 1.3823038397328883e-05, 'epoch': 0.31}


 31%|███       | 186/599 [12:02<14:43,  2.14s/it]

{'loss': 0.0008, 'learning_rate': 1.3789649415692823e-05, 'epoch': 0.31}


 31%|███       | 187/599 [12:06<17:45,  2.59s/it]

{'loss': 0.0004, 'learning_rate': 1.3756260434056763e-05, 'epoch': 0.31}


 31%|███▏      | 188/599 [12:09<19:12,  2.80s/it]

{'loss': 0.0004, 'learning_rate': 1.3722871452420703e-05, 'epoch': 0.31}


 32%|███▏      | 189/599 [12:11<18:21,  2.69s/it]

{'loss': 0.0002, 'learning_rate': 1.3689482470784643e-05, 'epoch': 0.32}


 32%|███▏      | 190/599 [12:17<23:31,  3.45s/it]

{'loss': 0.0002, 'learning_rate': 1.3656093489148583e-05, 'epoch': 0.32}


 32%|███▏      | 191/599 [12:19<20:27,  3.01s/it]

{'loss': 0.0006, 'learning_rate': 1.3622704507512523e-05, 'epoch': 0.32}


 32%|███▏      | 192/599 [12:21<18:45,  2.77s/it]

{'loss': 0.0004, 'learning_rate': 1.3589315525876461e-05, 'epoch': 0.32}


 32%|███▏      | 193/599 [12:23<16:50,  2.49s/it]

{'loss': 0.0004, 'learning_rate': 1.3555926544240401e-05, 'epoch': 0.32}


 32%|███▏      | 194/599 [12:25<16:05,  2.38s/it]

{'loss': 0.0004, 'learning_rate': 1.3522537562604341e-05, 'epoch': 0.32}


 33%|███▎      | 195/599 [12:27<14:49,  2.20s/it]

{'loss': 0.0002, 'learning_rate': 1.3489148580968281e-05, 'epoch': 0.33}


 33%|███▎      | 196/599 [12:28<14:07,  2.10s/it]

{'loss': 0.0002, 'learning_rate': 1.3455759599332221e-05, 'epoch': 0.33}


 33%|███▎      | 197/599 [12:30<13:53,  2.07s/it]

{'loss': 0.0001, 'learning_rate': 1.3422370617696161e-05, 'epoch': 0.33}


 33%|███▎      | 198/599 [12:33<14:40,  2.19s/it]

{'loss': 0.0002, 'learning_rate': 1.3388981636060101e-05, 'epoch': 0.33}


 33%|███▎      | 199/599 [12:36<16:36,  2.49s/it]

{'loss': 0.0002, 'learning_rate': 1.3355592654424041e-05, 'epoch': 0.33}


 33%|███▎      | 200/599 [12:39<16:30,  2.48s/it]***** Running Evaluation *****
  Num examples = 5989
  Batch size = 10


{'loss': 0.0004, 'learning_rate': 1.3322203672787981e-05, 'epoch': 0.33}


                                                 
 33%|███▎      | 200/599 [17:37<16:30,  2.48s/it]Saving model checkpoint to bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-200
Configuration saved in bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-200/config.json


{'eval_loss': 0.0005477193044498563, 'eval_runtime': 298.1311, 'eval_samples_per_second': 20.088, 'eval_steps_per_second': 2.009, 'epoch': 0.33}


Model weights saved in bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-200/pytorch_model.bin
 34%|███▎      | 201/599 [17:39<10:10:02, 91.97s/it]

{'loss': 0.0004, 'learning_rate': 1.3288814691151921e-05, 'epoch': 0.34}


 34%|███▎      | 202/599 [17:42<7:12:03, 65.30s/it] 

{'loss': 0.0003, 'learning_rate': 1.3255425709515861e-05, 'epoch': 0.34}


 34%|███▍      | 203/599 [17:45<5:06:02, 46.37s/it]

{'loss': 0.0003, 'learning_rate': 1.3222036727879801e-05, 'epoch': 0.34}


 34%|███▍      | 204/599 [17:47<3:38:11, 33.14s/it]

{'loss': 0.0005, 'learning_rate': 1.3188647746243741e-05, 'epoch': 0.34}


 34%|███▍      | 205/599 [17:49<2:35:38, 23.70s/it]

{'loss': 0.0002, 'learning_rate': 1.3155258764607681e-05, 'epoch': 0.34}


 34%|███▍      | 206/599 [17:50<1:52:32, 17.18s/it]

{'loss': 0.0006, 'learning_rate': 1.3121869782971621e-05, 'epoch': 0.34}


 35%|███▍      | 207/599 [17:53<1:24:14, 12.89s/it]

{'loss': 0.0003, 'learning_rate': 1.308848080133556e-05, 'epoch': 0.35}


 35%|███▍      | 208/599 [17:56<1:03:38,  9.77s/it]

{'loss': 0.0005, 'learning_rate': 1.3055091819699499e-05, 'epoch': 0.35}


 35%|███▍      | 209/599 [17:58<48:54,  7.52s/it]  

{'loss': 0.0002, 'learning_rate': 1.3021702838063439e-05, 'epoch': 0.35}


 35%|███▌      | 210/599 [18:00<37:26,  5.78s/it]

{'loss': 0.0005, 'learning_rate': 1.2988313856427379e-05, 'epoch': 0.35}


 35%|███▌      | 211/599 [18:02<29:52,  4.62s/it]

{'loss': 0.0004, 'learning_rate': 1.2954924874791319e-05, 'epoch': 0.35}


 35%|███▌      | 212/599 [18:04<25:37,  3.97s/it]

{'loss': 0.0004, 'learning_rate': 1.2921535893155259e-05, 'epoch': 0.35}


 36%|███▌      | 213/599 [18:06<21:22,  3.32s/it]

{'loss': 0.0003, 'learning_rate': 1.2888146911519199e-05, 'epoch': 0.36}


 36%|███▌      | 214/599 [18:08<18:51,  2.94s/it]

{'loss': 0.0005, 'learning_rate': 1.2854757929883139e-05, 'epoch': 0.36}


 36%|███▌      | 215/599 [18:12<20:07,  3.14s/it]

{'loss': 0.0002, 'learning_rate': 1.2821368948247079e-05, 'epoch': 0.36}


 36%|███▌      | 216/599 [18:14<18:11,  2.85s/it]

{'loss': 0.0002, 'learning_rate': 1.2787979966611019e-05, 'epoch': 0.36}


 36%|███▌      | 217/599 [18:16<16:22,  2.57s/it]

{'loss': 0.0004, 'learning_rate': 1.2754590984974959e-05, 'epoch': 0.36}


 36%|███▋      | 218/599 [18:17<14:32,  2.29s/it]

{'loss': 0.0002, 'learning_rate': 1.2721202003338899e-05, 'epoch': 0.36}


 37%|███▋      | 219/599 [18:20<14:20,  2.27s/it]

{'loss': 0.0006, 'learning_rate': 1.2687813021702839e-05, 'epoch': 0.37}


 37%|███▋      | 220/599 [18:21<12:51,  2.04s/it]

{'loss': 0.0017, 'learning_rate': 1.2654424040066779e-05, 'epoch': 0.37}


 37%|███▋      | 221/599 [18:23<13:00,  2.07s/it]

{'loss': 0.0003, 'learning_rate': 1.262103505843072e-05, 'epoch': 0.37}


 37%|███▋      | 222/599 [18:25<12:24,  1.98s/it]

{'loss': 0.0004, 'learning_rate': 1.258764607679466e-05, 'epoch': 0.37}


 37%|███▋      | 223/599 [18:28<13:39,  2.18s/it]

{'loss': 0.0005, 'learning_rate': 1.25542570951586e-05, 'epoch': 0.37}


 37%|███▋      | 224/599 [18:30<13:44,  2.20s/it]

{'loss': 0.0003, 'learning_rate': 1.252086811352254e-05, 'epoch': 0.37}


 38%|███▊      | 225/599 [18:32<13:41,  2.20s/it]

{'loss': 0.0003, 'learning_rate': 1.2487479131886477e-05, 'epoch': 0.38}


 38%|███▊      | 226/599 [18:35<14:35,  2.35s/it]

{'loss': 0.0002, 'learning_rate': 1.2454090150250417e-05, 'epoch': 0.38}


 38%|███▊      | 227/599 [18:39<18:53,  3.05s/it]

{'loss': 0.0002, 'learning_rate': 1.2420701168614357e-05, 'epoch': 0.38}


 38%|███▊      | 228/599 [18:42<17:34,  2.84s/it]

{'loss': 0.0004, 'learning_rate': 1.2387312186978297e-05, 'epoch': 0.38}


 38%|███▊      | 229/599 [18:44<16:11,  2.62s/it]

{'loss': 0.0002, 'learning_rate': 1.2353923205342238e-05, 'epoch': 0.38}


 38%|███▊      | 230/599 [18:46<15:06,  2.46s/it]

{'loss': 0.0001, 'learning_rate': 1.2320534223706178e-05, 'epoch': 0.38}


 39%|███▊      | 231/599 [18:48<14:19,  2.33s/it]

{'loss': 0.0005, 'learning_rate': 1.2287145242070118e-05, 'epoch': 0.39}


 39%|███▊      | 232/599 [18:50<13:15,  2.17s/it]

{'loss': 0.0004, 'learning_rate': 1.2253756260434058e-05, 'epoch': 0.39}


 39%|███▉      | 233/599 [18:52<13:01,  2.14s/it]

{'loss': 0.0002, 'learning_rate': 1.2220367278797998e-05, 'epoch': 0.39}


 39%|███▉      | 234/599 [18:54<12:22,  2.03s/it]

{'loss': 0.0005, 'learning_rate': 1.2186978297161938e-05, 'epoch': 0.39}


 39%|███▉      | 235/599 [18:56<13:36,  2.24s/it]

{'loss': 0.0002, 'learning_rate': 1.2153589315525878e-05, 'epoch': 0.39}


 39%|███▉      | 236/599 [18:58<13:04,  2.16s/it]

{'loss': 0.0003, 'learning_rate': 1.2120200333889818e-05, 'epoch': 0.39}


 40%|███▉      | 237/599 [19:01<13:12,  2.19s/it]

{'loss': 0.0003, 'learning_rate': 1.2086811352253758e-05, 'epoch': 0.4}


 40%|███▉      | 238/599 [19:03<13:09,  2.19s/it]

{'loss': 0.0002, 'learning_rate': 1.2053422370617698e-05, 'epoch': 0.4}


 40%|███▉      | 239/599 [19:06<14:03,  2.34s/it]

{'loss': 0.0003, 'learning_rate': 1.2020033388981638e-05, 'epoch': 0.4}


 40%|████      | 240/599 [19:08<14:53,  2.49s/it]

{'loss': 0.0002, 'learning_rate': 1.1986644407345578e-05, 'epoch': 0.4}


 40%|████      | 241/599 [19:11<14:55,  2.50s/it]

{'loss': 0.0004, 'learning_rate': 1.1953255425709518e-05, 'epoch': 0.4}


 40%|████      | 242/599 [19:16<18:39,  3.14s/it]

{'loss': 0.0002, 'learning_rate': 1.1919866444073456e-05, 'epoch': 0.4}


 41%|████      | 243/599 [19:18<17:38,  2.97s/it]

{'loss': 0.0004, 'learning_rate': 1.1886477462437396e-05, 'epoch': 0.41}


 41%|████      | 244/599 [19:21<16:42,  2.83s/it]

{'loss': 0.0002, 'learning_rate': 1.1853088480801336e-05, 'epoch': 0.41}


 41%|████      | 245/599 [19:24<17:21,  2.94s/it]

{'loss': 0.0006, 'learning_rate': 1.1819699499165276e-05, 'epoch': 0.41}


 41%|████      | 246/599 [19:26<16:30,  2.81s/it]

{'loss': 0.0002, 'learning_rate': 1.1786310517529216e-05, 'epoch': 0.41}


 41%|████      | 247/599 [19:29<15:46,  2.69s/it]

{'loss': 0.0003, 'learning_rate': 1.1752921535893156e-05, 'epoch': 0.41}


 41%|████▏     | 248/599 [19:31<15:40,  2.68s/it]

{'loss': 0.0012, 'learning_rate': 1.1719532554257096e-05, 'epoch': 0.41}


 42%|████▏     | 249/599 [19:33<14:04,  2.41s/it]

{'loss': 0.0003, 'learning_rate': 1.1686143572621036e-05, 'epoch': 0.42}


 42%|████▏     | 250/599 [19:36<14:38,  2.52s/it]

{'loss': 0.0002, 'learning_rate': 1.1652754590984976e-05, 'epoch': 0.42}


 42%|████▏     | 251/599 [19:39<14:55,  2.57s/it]

{'loss': 0.0006, 'learning_rate': 1.1619365609348916e-05, 'epoch': 0.42}


 42%|████▏     | 252/599 [19:41<15:06,  2.61s/it]

{'loss': 0.0001, 'learning_rate': 1.1585976627712856e-05, 'epoch': 0.42}


 42%|████▏     | 253/599 [19:44<15:13,  2.64s/it]

{'loss': 0.0002, 'learning_rate': 1.1552587646076796e-05, 'epoch': 0.42}


 42%|████▏     | 254/599 [19:47<15:10,  2.64s/it]

{'loss': 0.0003, 'learning_rate': 1.1519198664440736e-05, 'epoch': 0.42}


 43%|████▎     | 255/599 [19:49<13:51,  2.42s/it]

{'loss': 0.0002, 'learning_rate': 1.1485809682804676e-05, 'epoch': 0.43}


 43%|████▎     | 256/599 [19:50<12:26,  2.18s/it]

{'loss': 0.0002, 'learning_rate': 1.1452420701168616e-05, 'epoch': 0.43}


 43%|████▎     | 257/599 [19:52<12:26,  2.18s/it]

{'loss': 0.0002, 'learning_rate': 1.1419031719532556e-05, 'epoch': 0.43}


 43%|████▎     | 258/599 [19:54<11:50,  2.08s/it]

{'loss': 0.0002, 'learning_rate': 1.1385642737896494e-05, 'epoch': 0.43}


 43%|████▎     | 259/599 [19:56<11:32,  2.04s/it]

{'loss': 0.0002, 'learning_rate': 1.1352253756260434e-05, 'epoch': 0.43}


 43%|████▎     | 260/599 [19:58<11:35,  2.05s/it]

{'loss': 0.0002, 'learning_rate': 1.1318864774624374e-05, 'epoch': 0.43}


 44%|████▎     | 261/599 [20:01<13:26,  2.39s/it]

{'loss': 0.0004, 'learning_rate': 1.1285475792988314e-05, 'epoch': 0.44}


 44%|████▎     | 262/599 [20:04<13:31,  2.41s/it]

{'loss': 0.0004, 'learning_rate': 1.1252086811352254e-05, 'epoch': 0.44}


 44%|████▍     | 263/599 [20:06<13:10,  2.35s/it]

{'loss': 0.0002, 'learning_rate': 1.1218697829716194e-05, 'epoch': 0.44}


 44%|████▍     | 264/599 [20:08<11:41,  2.10s/it]

{'loss': 0.0007, 'learning_rate': 1.1185308848080134e-05, 'epoch': 0.44}


 44%|████▍     | 265/599 [20:10<12:07,  2.18s/it]

{'loss': 0.0002, 'learning_rate': 1.1151919866444074e-05, 'epoch': 0.44}


 44%|████▍     | 266/599 [20:12<12:37,  2.27s/it]

{'loss': 0.0002, 'learning_rate': 1.1118530884808014e-05, 'epoch': 0.44}


 45%|████▍     | 267/599 [20:15<12:12,  2.21s/it]

{'loss': 0.0002, 'learning_rate': 1.1085141903171954e-05, 'epoch': 0.45}


 45%|████▍     | 268/599 [20:16<11:23,  2.06s/it]

{'loss': 0.001, 'learning_rate': 1.1051752921535894e-05, 'epoch': 0.45}


 45%|████▍     | 269/599 [20:18<11:04,  2.01s/it]

{'loss': 0.0009, 'learning_rate': 1.1018363939899834e-05, 'epoch': 0.45}


 45%|████▌     | 270/599 [20:20<10:04,  1.84s/it]

{'loss': 0.0002, 'learning_rate': 1.0984974958263774e-05, 'epoch': 0.45}


 45%|████▌     | 271/599 [20:21<09:44,  1.78s/it]

{'loss': 0.0006, 'learning_rate': 1.0951585976627715e-05, 'epoch': 0.45}


 45%|████▌     | 272/599 [20:23<10:10,  1.87s/it]

{'loss': 0.0001, 'learning_rate': 1.0918196994991655e-05, 'epoch': 0.45}


 46%|████▌     | 273/599 [20:26<10:50,  1.99s/it]

{'loss': 0.0001, 'learning_rate': 1.0884808013355595e-05, 'epoch': 0.46}


 46%|████▌     | 274/599 [20:27<10:07,  1.87s/it]

{'loss': 0.0002, 'learning_rate': 1.0851419031719535e-05, 'epoch': 0.46}


 46%|████▌     | 275/599 [20:29<09:18,  1.72s/it]

{'loss': 0.0005, 'learning_rate': 1.0818030050083472e-05, 'epoch': 0.46}


 46%|████▌     | 276/599 [20:30<09:21,  1.74s/it]

{'loss': 0.0002, 'learning_rate': 1.0784641068447412e-05, 'epoch': 0.46}


 46%|████▌     | 277/599 [20:32<09:03,  1.69s/it]

{'loss': 0.0004, 'learning_rate': 1.0751252086811352e-05, 'epoch': 0.46}


 46%|████▋     | 278/599 [20:34<09:22,  1.75s/it]

{'loss': 0.0002, 'learning_rate': 1.0717863105175292e-05, 'epoch': 0.46}


 47%|████▋     | 279/599 [20:36<09:52,  1.85s/it]

{'loss': 0.0003, 'learning_rate': 1.0684474123539233e-05, 'epoch': 0.47}


 47%|████▋     | 280/599 [20:38<10:01,  1.88s/it]

{'loss': 0.0002, 'learning_rate': 1.0651085141903173e-05, 'epoch': 0.47}


 47%|████▋     | 281/599 [20:41<11:15,  2.12s/it]

{'loss': 0.0003, 'learning_rate': 1.0617696160267113e-05, 'epoch': 0.47}


 47%|████▋     | 282/599 [20:43<11:41,  2.21s/it]

{'loss': 0.0002, 'learning_rate': 1.0584307178631053e-05, 'epoch': 0.47}


 47%|████▋     | 283/599 [20:45<11:48,  2.24s/it]

{'loss': 0.0002, 'learning_rate': 1.0550918196994993e-05, 'epoch': 0.47}


 47%|████▋     | 284/599 [20:47<11:27,  2.18s/it]

{'loss': 0.0003, 'learning_rate': 1.0517529215358933e-05, 'epoch': 0.47}


 48%|████▊     | 285/599 [20:49<10:38,  2.03s/it]

{'loss': 0.0002, 'learning_rate': 1.0484140233722873e-05, 'epoch': 0.48}


 48%|████▊     | 286/599 [20:52<11:57,  2.29s/it]

{'loss': 0.0002, 'learning_rate': 1.0450751252086813e-05, 'epoch': 0.48}


 48%|████▊     | 287/599 [20:53<10:46,  2.07s/it]

{'loss': 0.0006, 'learning_rate': 1.0417362270450753e-05, 'epoch': 0.48}


 48%|████▊     | 288/599 [20:56<11:07,  2.15s/it]

{'loss': 0.0003, 'learning_rate': 1.0383973288814693e-05, 'epoch': 0.48}


 48%|████▊     | 289/599 [20:58<11:25,  2.21s/it]

{'loss': 0.0002, 'learning_rate': 1.0350584307178633e-05, 'epoch': 0.48}


 48%|████▊     | 290/599 [21:00<11:02,  2.14s/it]

{'loss': 0.0002, 'learning_rate': 1.0317195325542573e-05, 'epoch': 0.48}


 49%|████▊     | 291/599 [21:03<12:17,  2.39s/it]

{'loss': 0.0002, 'learning_rate': 1.0283806343906513e-05, 'epoch': 0.49}


 49%|████▊     | 292/599 [21:06<13:20,  2.61s/it]

{'loss': 0.0001, 'learning_rate': 1.0250417362270451e-05, 'epoch': 0.49}


 49%|████▉     | 293/599 [21:09<12:51,  2.52s/it]

{'loss': 0.0002, 'learning_rate': 1.0217028380634391e-05, 'epoch': 0.49}


 49%|████▉     | 294/599 [21:10<12:00,  2.36s/it]

{'loss': 0.0002, 'learning_rate': 1.0183639398998331e-05, 'epoch': 0.49}


 49%|████▉     | 295/599 [21:13<12:04,  2.38s/it]

{'loss': 0.0002, 'learning_rate': 1.0150250417362271e-05, 'epoch': 0.49}


 49%|████▉     | 296/599 [21:15<12:03,  2.39s/it]

{'loss': 0.0002, 'learning_rate': 1.0116861435726211e-05, 'epoch': 0.49}


 50%|████▉     | 297/599 [21:17<11:34,  2.30s/it]

{'loss': 0.0002, 'learning_rate': 1.0083472454090151e-05, 'epoch': 0.5}


 50%|████▉     | 298/599 [21:19<10:22,  2.07s/it]

{'loss': 0.0001, 'learning_rate': 1.0050083472454091e-05, 'epoch': 0.5}


 50%|████▉     | 299/599 [21:22<11:58,  2.39s/it]

{'loss': 0.0002, 'learning_rate': 1.001669449081803e-05, 'epoch': 0.5}


 50%|█████     | 300/599 [21:24<10:32,  2.12s/it]***** Running Evaluation *****
  Num examples = 5989
  Batch size = 10


{'loss': 0.0003, 'learning_rate': 9.98330550918197e-06, 'epoch': 0.5}


                                                 
 50%|█████     | 300/599 [26:35<10:32,  2.12s/it]Saving model checkpoint to bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-300
Configuration saved in bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-300/config.json


{'eval_loss': 0.0005063707358203828, 'eval_runtime': 311.6559, 'eval_samples_per_second': 19.217, 'eval_steps_per_second': 1.922, 'epoch': 0.5}


Model weights saved in bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-300/pytorch_model.bin
 50%|█████     | 301/599 [26:38<7:55:57, 95.83s/it]

{'loss': 0.0005, 'learning_rate': 9.94991652754591e-06, 'epoch': 0.5}


 50%|█████     | 302/599 [26:40<5:35:02, 67.69s/it]

{'loss': 0.0001, 'learning_rate': 9.91652754590985e-06, 'epoch': 0.5}


 51%|█████     | 303/599 [26:42<3:56:32, 47.95s/it]

{'loss': 0.0002, 'learning_rate': 9.88313856427379e-06, 'epoch': 0.51}


 51%|█████     | 304/599 [26:44<2:47:35, 34.09s/it]

{'loss': 0.0003, 'learning_rate': 9.84974958263773e-06, 'epoch': 0.51}


 51%|█████     | 305/599 [26:46<2:00:07, 24.52s/it]

{'loss': 0.0002, 'learning_rate': 9.81636060100167e-06, 'epoch': 0.51}


 51%|█████     | 306/599 [26:48<1:26:41, 17.75s/it]

{'loss': 0.0012, 'learning_rate': 9.78297161936561e-06, 'epoch': 0.51}


 51%|█████▏    | 307/599 [26:50<1:02:58, 12.94s/it]

{'loss': 0.0004, 'learning_rate': 9.74958263772955e-06, 'epoch': 0.51}


 51%|█████▏    | 308/599 [26:51<46:31,  9.59s/it]  

{'loss': 0.0001, 'learning_rate': 9.71619365609349e-06, 'epoch': 0.51}


 52%|█████▏    | 309/599 [26:53<34:27,  7.13s/it]

{'loss': 0.0003, 'learning_rate': 9.68280467445743e-06, 'epoch': 0.52}


 52%|█████▏    | 310/599 [26:55<27:22,  5.68s/it]

{'loss': 0.0004, 'learning_rate': 9.64941569282137e-06, 'epoch': 0.52}


 52%|█████▏    | 311/599 [26:57<22:35,  4.71s/it]

{'loss': 0.0002, 'learning_rate': 9.61602671118531e-06, 'epoch': 0.52}


 52%|█████▏    | 312/599 [27:00<18:54,  3.95s/it]

{'loss': 0.0003, 'learning_rate': 9.582637729549249e-06, 'epoch': 0.52}


 52%|█████▏    | 313/599 [27:02<16:53,  3.54s/it]

{'loss': 0.0002, 'learning_rate': 9.549248747913189e-06, 'epoch': 0.52}


 52%|█████▏    | 314/599 [27:04<14:05,  2.97s/it]

{'loss': 0.0003, 'learning_rate': 9.515859766277129e-06, 'epoch': 0.52}


 53%|█████▎    | 315/599 [27:06<12:24,  2.62s/it]

{'loss': 0.0003, 'learning_rate': 9.482470784641069e-06, 'epoch': 0.53}


 53%|█████▎    | 316/599 [27:08<12:00,  2.55s/it]

{'loss': 0.0002, 'learning_rate': 9.449081803005009e-06, 'epoch': 0.53}


 53%|█████▎    | 317/599 [27:10<11:03,  2.35s/it]

{'loss': 0.0002, 'learning_rate': 9.415692821368949e-06, 'epoch': 0.53}


 53%|█████▎    | 318/599 [27:12<10:15,  2.19s/it]

{'loss': 0.0001, 'learning_rate': 9.382303839732888e-06, 'epoch': 0.53}


 53%|█████▎    | 319/599 [27:14<10:35,  2.27s/it]

{'loss': 0.0002, 'learning_rate': 9.348914858096828e-06, 'epoch': 0.53}


 53%|█████▎    | 320/599 [27:16<09:51,  2.12s/it]

{'loss': 0.0002, 'learning_rate': 9.315525876460768e-06, 'epoch': 0.53}


 54%|█████▎    | 321/599 [27:18<09:39,  2.08s/it]

{'loss': 0.0003, 'learning_rate': 9.282136894824708e-06, 'epoch': 0.54}


 54%|█████▍    | 322/599 [27:20<08:48,  1.91s/it]

{'loss': 0.0002, 'learning_rate': 9.248747913188648e-06, 'epoch': 0.54}


 54%|█████▍    | 323/599 [27:21<07:59,  1.74s/it]

{'loss': 0.0002, 'learning_rate': 9.215358931552588e-06, 'epoch': 0.54}


 54%|█████▍    | 324/599 [27:23<08:56,  1.95s/it]

{'loss': 0.0003, 'learning_rate': 9.181969949916528e-06, 'epoch': 0.54}


 54%|█████▍    | 325/599 [27:25<08:15,  1.81s/it]

{'loss': 0.0004, 'learning_rate': 9.148580968280468e-06, 'epoch': 0.54}


 54%|█████▍    | 326/599 [27:27<08:19,  1.83s/it]

{'loss': 0.0002, 'learning_rate': 9.115191986644408e-06, 'epoch': 0.54}


 55%|█████▍    | 327/599 [27:28<08:10,  1.80s/it]

{'loss': 0.0001, 'learning_rate': 9.081803005008348e-06, 'epoch': 0.55}


 55%|█████▍    | 328/599 [27:31<08:39,  1.92s/it]

{'loss': 0.07, 'learning_rate': 9.048414023372288e-06, 'epoch': 0.55}


 55%|█████▍    | 329/599 [27:33<08:52,  1.97s/it]

{'loss': 0.0003, 'learning_rate': 9.015025041736228e-06, 'epoch': 0.55}


 55%|█████▌    | 330/599 [27:35<09:11,  2.05s/it]

{'loss': 0.0002, 'learning_rate': 8.981636060100168e-06, 'epoch': 0.55}


 55%|█████▌    | 331/599 [27:38<10:07,  2.27s/it]

{'loss': 0.0001, 'learning_rate': 8.948247078464108e-06, 'epoch': 0.55}


 55%|█████▌    | 332/599 [27:40<09:41,  2.18s/it]

{'loss': 0.0003, 'learning_rate': 8.914858096828048e-06, 'epoch': 0.55}


 56%|█████▌    | 333/599 [27:42<09:56,  2.24s/it]

{'loss': 0.0002, 'learning_rate': 8.881469115191988e-06, 'epoch': 0.56}


 56%|█████▌    | 334/599 [27:44<09:34,  2.17s/it]

{'loss': 0.0003, 'learning_rate': 8.848080133555928e-06, 'epoch': 0.56}


 56%|█████▌    | 335/599 [27:46<09:18,  2.12s/it]

{'loss': 0.0002, 'learning_rate': 8.814691151919868e-06, 'epoch': 0.56}


 56%|█████▌    | 336/599 [27:49<09:43,  2.22s/it]

{'loss': 0.0002, 'learning_rate': 8.781302170283808e-06, 'epoch': 0.56}


 56%|█████▋    | 337/599 [27:51<09:39,  2.21s/it]

{'loss': 0.0001, 'learning_rate': 8.747913188647746e-06, 'epoch': 0.56}


 56%|█████▋    | 338/599 [27:53<09:41,  2.23s/it]

{'loss': 0.0002, 'learning_rate': 8.714524207011686e-06, 'epoch': 0.56}


 57%|█████▋    | 339/599 [27:56<10:44,  2.48s/it]

{'loss': 0.0003, 'learning_rate': 8.681135225375626e-06, 'epoch': 0.57}


 57%|█████▋    | 340/599 [27:58<09:53,  2.29s/it]

{'loss': 0.0003, 'learning_rate': 8.647746243739566e-06, 'epoch': 0.57}


 57%|█████▋    | 341/599 [28:00<09:05,  2.11s/it]

{'loss': 0.0004, 'learning_rate': 8.614357262103506e-06, 'epoch': 0.57}


 57%|█████▋    | 342/599 [28:02<09:43,  2.27s/it]

{'loss': 0.0004, 'learning_rate': 8.580968280467446e-06, 'epoch': 0.57}


 57%|█████▋    | 343/599 [28:04<09:24,  2.20s/it]

{'loss': 0.0006, 'learning_rate': 8.547579298831386e-06, 'epoch': 0.57}


 57%|█████▋    | 344/599 [28:07<09:33,  2.25s/it]

{'loss': 0.0002, 'learning_rate': 8.514190317195326e-06, 'epoch': 0.57}


 58%|█████▊    | 345/599 [28:09<09:10,  2.17s/it]

{'loss': 0.0001, 'learning_rate': 8.480801335559266e-06, 'epoch': 0.58}


 58%|█████▊    | 346/599 [28:10<08:17,  1.97s/it]

{'loss': 0.0003, 'learning_rate': 8.447412353923206e-06, 'epoch': 0.58}


 58%|█████▊    | 347/599 [28:12<08:30,  2.02s/it]

{'loss': 0.0001, 'learning_rate': 8.414023372287146e-06, 'epoch': 0.58}


 58%|█████▊    | 348/599 [28:14<08:29,  2.03s/it]

{'loss': 0.0006, 'learning_rate': 8.380634390651086e-06, 'epoch': 0.58}


 58%|█████▊    | 349/599 [28:16<08:28,  2.03s/it]

{'loss': 0.0002, 'learning_rate': 8.347245409015026e-06, 'epoch': 0.58}


 58%|█████▊    | 350/599 [28:19<09:01,  2.17s/it]

{'loss': 0.0003, 'learning_rate': 8.313856427378966e-06, 'epoch': 0.58}


 59%|█████▊    | 351/599 [28:21<08:21,  2.02s/it]

{'loss': 0.0006, 'learning_rate': 8.280467445742906e-06, 'epoch': 0.59}


 59%|█████▉    | 352/599 [28:22<08:07,  1.97s/it]

{'loss': 0.0004, 'learning_rate': 8.247078464106846e-06, 'epoch': 0.59}


 59%|█████▉    | 353/599 [28:24<08:00,  1.95s/it]

{'loss': 0.0003, 'learning_rate': 8.213689482470786e-06, 'epoch': 0.59}


 59%|█████▉    | 354/599 [28:26<07:49,  1.92s/it]

{'loss': 0.0007, 'learning_rate': 8.180300500834726e-06, 'epoch': 0.59}


 59%|█████▉    | 355/599 [28:29<08:33,  2.10s/it]

{'loss': 0.0003, 'learning_rate': 8.146911519198665e-06, 'epoch': 0.59}


 59%|█████▉    | 356/599 [28:30<08:06,  2.00s/it]

{'loss': 0.0003, 'learning_rate': 8.113522537562605e-06, 'epoch': 0.59}


 60%|█████▉    | 357/599 [28:32<07:59,  1.98s/it]

{'loss': 0.0002, 'learning_rate': 8.080133555926545e-06, 'epoch': 0.6}


 60%|█████▉    | 358/599 [28:34<07:40,  1.91s/it]

{'loss': 0.0004, 'learning_rate': 8.046744574290485e-06, 'epoch': 0.6}


 60%|█████▉    | 359/599 [28:36<07:36,  1.90s/it]

{'loss': 0.0002, 'learning_rate': 8.013355592654425e-06, 'epoch': 0.6}


 60%|██████    | 360/599 [28:39<08:35,  2.16s/it]

{'loss': 0.0002, 'learning_rate': 7.979966611018365e-06, 'epoch': 0.6}


 60%|██████    | 361/599 [28:41<08:28,  2.14s/it]

{'loss': 0.0001, 'learning_rate': 7.946577629382305e-06, 'epoch': 0.6}


 60%|██████    | 362/599 [28:44<09:11,  2.33s/it]

{'loss': 0.0003, 'learning_rate': 7.913188647746244e-06, 'epoch': 0.6}


 61%|██████    | 363/599 [28:46<08:54,  2.27s/it]

{'loss': 0.0002, 'learning_rate': 7.879799666110184e-06, 'epoch': 0.61}


 61%|██████    | 364/599 [28:50<11:43,  2.99s/it]

{'loss': 0.0005, 'learning_rate': 7.846410684474123e-06, 'epoch': 0.61}


 61%|██████    | 365/599 [28:52<10:20,  2.65s/it]

{'loss': 0.0002, 'learning_rate': 7.813021702838063e-06, 'epoch': 0.61}


 61%|██████    | 366/599 [28:55<09:55,  2.56s/it]

{'loss': 0.0002, 'learning_rate': 7.779632721202003e-06, 'epoch': 0.61}


 61%|██████▏   | 367/599 [28:56<08:49,  2.28s/it]

{'loss': 0.0006, 'learning_rate': 7.746243739565943e-06, 'epoch': 0.61}


 61%|██████▏   | 368/599 [29:00<09:57,  2.58s/it]

{'loss': 0.0001, 'learning_rate': 7.712854757929883e-06, 'epoch': 0.61}


 62%|██████▏   | 369/599 [29:02<09:50,  2.57s/it]

{'loss': 0.0002, 'learning_rate': 7.679465776293823e-06, 'epoch': 0.62}


 62%|██████▏   | 370/599 [29:04<09:28,  2.48s/it]

{'loss': 0.0003, 'learning_rate': 7.646076794657763e-06, 'epoch': 0.62}


 62%|██████▏   | 371/599 [29:07<09:06,  2.40s/it]

{'loss': 0.0002, 'learning_rate': 7.612687813021703e-06, 'epoch': 0.62}


 62%|██████▏   | 372/599 [29:09<09:20,  2.47s/it]

{'loss': 0.0003, 'learning_rate': 7.579298831385643e-06, 'epoch': 0.62}


 62%|██████▏   | 373/599 [29:12<09:25,  2.50s/it]

{'loss': 0.0002, 'learning_rate': 7.545909849749583e-06, 'epoch': 0.62}


 62%|██████▏   | 374/599 [29:14<08:55,  2.38s/it]

{'loss': 0.0003, 'learning_rate': 7.512520868113523e-06, 'epoch': 0.62}


 63%|██████▎   | 375/599 [29:16<08:24,  2.25s/it]

{'loss': 0.0004, 'learning_rate': 7.479131886477463e-06, 'epoch': 0.63}


 63%|██████▎   | 376/599 [29:18<08:15,  2.22s/it]

{'loss': 0.0002, 'learning_rate': 7.445742904841403e-06, 'epoch': 0.63}


 63%|██████▎   | 377/599 [29:20<08:24,  2.27s/it]

{'loss': 0.0002, 'learning_rate': 7.412353923205343e-06, 'epoch': 0.63}


 63%|██████▎   | 378/599 [29:23<08:22,  2.27s/it]

{'loss': 0.0002, 'learning_rate': 7.378964941569283e-06, 'epoch': 0.63}


 63%|██████▎   | 379/599 [29:25<08:12,  2.24s/it]

{'loss': 0.0002, 'learning_rate': 7.345575959933222e-06, 'epoch': 0.63}


 63%|██████▎   | 380/599 [29:27<08:21,  2.29s/it]

{'loss': 0.0003, 'learning_rate': 7.312186978297162e-06, 'epoch': 0.63}


 64%|██████▎   | 381/599 [29:29<07:37,  2.10s/it]

{'loss': 0.0004, 'learning_rate': 7.278797996661102e-06, 'epoch': 0.64}


 64%|██████▍   | 382/599 [29:31<07:51,  2.17s/it]

{'loss': 0.0003, 'learning_rate': 7.245409015025042e-06, 'epoch': 0.64}


 64%|██████▍   | 383/599 [29:35<09:10,  2.55s/it]

{'loss': 0.0003, 'learning_rate': 7.212020033388982e-06, 'epoch': 0.64}


 64%|██████▍   | 384/599 [29:38<10:30,  2.93s/it]

{'loss': 0.0002, 'learning_rate': 7.178631051752922e-06, 'epoch': 0.64}


 64%|██████▍   | 385/599 [29:41<09:49,  2.76s/it]

{'loss': 0.0002, 'learning_rate': 7.145242070116863e-06, 'epoch': 0.64}


 64%|██████▍   | 386/599 [29:44<10:30,  2.96s/it]

{'loss': 0.0002, 'learning_rate': 7.111853088480803e-06, 'epoch': 0.64}


 65%|██████▍   | 387/599 [29:47<10:00,  2.83s/it]

{'loss': 0.0002, 'learning_rate': 7.078464106844741e-06, 'epoch': 0.65}


 65%|██████▍   | 388/599 [29:49<09:24,  2.68s/it]

{'loss': 0.0003, 'learning_rate': 7.045075125208681e-06, 'epoch': 0.65}


 65%|██████▍   | 389/599 [29:52<09:10,  2.62s/it]

{'loss': 0.0002, 'learning_rate': 7.011686143572622e-06, 'epoch': 0.65}


 65%|██████▌   | 390/599 [29:53<08:17,  2.38s/it]

{'loss': 0.0005, 'learning_rate': 6.978297161936562e-06, 'epoch': 0.65}


 65%|██████▌   | 391/599 [29:55<07:36,  2.20s/it]

{'loss': 0.0001, 'learning_rate': 6.944908180300502e-06, 'epoch': 0.65}


 65%|██████▌   | 392/599 [29:57<07:08,  2.07s/it]

{'loss': 0.0001, 'learning_rate': 6.911519198664442e-06, 'epoch': 0.65}


 66%|██████▌   | 393/599 [29:59<06:48,  1.98s/it]

{'loss': 0.0002, 'learning_rate': 6.878130217028382e-06, 'epoch': 0.66}


 66%|██████▌   | 394/599 [30:01<06:36,  1.93s/it]

{'loss': 0.0001, 'learning_rate': 6.844741235392322e-06, 'epoch': 0.66}


 66%|██████▌   | 395/599 [30:02<06:26,  1.90s/it]

{'loss': 0.0002, 'learning_rate': 6.8113522537562615e-06, 'epoch': 0.66}


 66%|██████▌   | 396/599 [30:07<09:29,  2.80s/it]

{'loss': 0.0003, 'learning_rate': 6.777963272120201e-06, 'epoch': 0.66}


 66%|██████▋   | 397/599 [30:12<11:37,  3.45s/it]

{'loss': 0.0002, 'learning_rate': 6.744574290484141e-06, 'epoch': 0.66}


 66%|██████▋   | 398/599 [30:15<10:41,  3.19s/it]

{'loss': 0.0005, 'learning_rate': 6.711185308848081e-06, 'epoch': 0.66}


 67%|██████▋   | 399/599 [30:17<09:52,  2.96s/it]

{'loss': 0.0002, 'learning_rate': 6.6777963272120206e-06, 'epoch': 0.67}


 67%|██████▋   | 400/599 [30:20<09:27,  2.85s/it]***** Running Evaluation *****
  Num examples = 5989
  Batch size = 10


{'loss': 0.0003, 'learning_rate': 6.6444073455759605e-06, 'epoch': 0.67}


                                                 
 67%|██████▋   | 400/599 [35:27<09:27,  2.85s/it]Saving model checkpoint to bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-400
Configuration saved in bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-400/config.json


{'eval_loss': 0.00025847391225397587, 'eval_runtime': 307.6509, 'eval_samples_per_second': 19.467, 'eval_steps_per_second': 1.947, 'epoch': 0.67}


Model weights saved in bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-400/pytorch_model.bin
 67%|██████▋   | 401/599 [35:30<5:13:24, 94.97s/it]

{'loss': 0.0001, 'learning_rate': 6.6110183639399005e-06, 'epoch': 0.67}


 67%|██████▋   | 402/599 [35:32<3:40:40, 67.21s/it]

{'loss': 0.0002, 'learning_rate': 6.5776293823038405e-06, 'epoch': 0.67}


 67%|██████▋   | 403/599 [35:34<2:35:51, 47.71s/it]

{'loss': 0.0003, 'learning_rate': 6.54424040066778e-06, 'epoch': 0.67}


 67%|██████▋   | 404/599 [35:36<1:50:19, 33.94s/it]

{'loss': 0.0004, 'learning_rate': 6.5108514190317195e-06, 'epoch': 0.67}


 68%|██████▊   | 405/599 [35:39<1:19:03, 24.45s/it]

{'loss': 0.0001, 'learning_rate': 6.4774624373956595e-06, 'epoch': 0.68}


 68%|██████▊   | 406/599 [35:41<57:09, 17.77s/it]  

{'loss': 0.0003, 'learning_rate': 6.4440734557595995e-06, 'epoch': 0.68}


 68%|██████▊   | 407/599 [35:44<43:07, 13.48s/it]

{'loss': 0.0001, 'learning_rate': 6.4106844741235394e-06, 'epoch': 0.68}


 68%|██████▊   | 408/599 [35:46<31:54, 10.02s/it]

{'loss': 0.0002, 'learning_rate': 6.377295492487479e-06, 'epoch': 0.68}


 68%|██████▊   | 409/599 [35:48<23:48,  7.52s/it]

{'loss': 0.0002, 'learning_rate': 6.343906510851419e-06, 'epoch': 0.68}


 68%|██████▊   | 410/599 [35:49<18:05,  5.74s/it]

{'loss': 0.0003, 'learning_rate': 6.31051752921536e-06, 'epoch': 0.68}


 69%|██████▊   | 411/599 [35:51<14:23,  4.59s/it]

{'loss': 0.0442, 'learning_rate': 6.2771285475793e-06, 'epoch': 0.69}


 69%|██████▉   | 412/599 [35:53<11:39,  3.74s/it]

{'loss': 0.0002, 'learning_rate': 6.243739565943238e-06, 'epoch': 0.69}


 69%|██████▉   | 413/599 [35:55<09:48,  3.16s/it]

{'loss': 0.0002, 'learning_rate': 6.210350584307178e-06, 'epoch': 0.69}


 69%|██████▉   | 414/599 [35:56<08:11,  2.66s/it]

{'loss': 0.0001, 'learning_rate': 6.176961602671119e-06, 'epoch': 0.69}


 69%|██████▉   | 415/599 [35:58<07:32,  2.46s/it]

{'loss': 0.0001, 'learning_rate': 6.143572621035059e-06, 'epoch': 0.69}


 69%|██████▉   | 416/599 [36:00<06:38,  2.18s/it]

{'loss': 0.0001, 'learning_rate': 6.110183639398999e-06, 'epoch': 0.69}


 70%|██████▉   | 417/599 [36:03<07:11,  2.37s/it]

{'loss': 0.0003, 'learning_rate': 6.076794657762939e-06, 'epoch': 0.7}


 70%|██████▉   | 418/599 [36:04<06:36,  2.19s/it]

{'loss': 0.0002, 'learning_rate': 6.043405676126879e-06, 'epoch': 0.7}


 70%|██████▉   | 419/599 [36:06<05:53,  1.96s/it]

{'loss': 0.0001, 'learning_rate': 6.010016694490819e-06, 'epoch': 0.7}


 70%|███████   | 420/599 [36:08<05:45,  1.93s/it]

{'loss': 0.0002, 'learning_rate': 5.976627712854759e-06, 'epoch': 0.7}


 70%|███████   | 421/599 [36:10<05:42,  1.92s/it]

{'loss': 0.0001, 'learning_rate': 5.943238731218698e-06, 'epoch': 0.7}


 70%|███████   | 422/599 [36:12<05:37,  1.91s/it]

{'loss': 0.0002, 'learning_rate': 5.909849749582638e-06, 'epoch': 0.7}


 71%|███████   | 423/599 [36:14<05:56,  2.03s/it]

{'loss': 0.0001, 'learning_rate': 5.876460767946578e-06, 'epoch': 0.71}


 71%|███████   | 424/599 [36:16<05:38,  1.93s/it]

{'loss': 0.0003, 'learning_rate': 5.843071786310518e-06, 'epoch': 0.71}


 71%|███████   | 425/599 [36:17<05:30,  1.90s/it]

{'loss': 0.0007, 'learning_rate': 5.809682804674458e-06, 'epoch': 0.71}


 71%|███████   | 426/599 [36:19<05:18,  1.84s/it]

{'loss': 0.0003, 'learning_rate': 5.776293823038398e-06, 'epoch': 0.71}


 71%|███████▏  | 427/599 [36:21<05:30,  1.92s/it]

{'loss': 0.0002, 'learning_rate': 5.742904841402338e-06, 'epoch': 0.71}


 71%|███████▏  | 428/599 [36:23<05:39,  1.99s/it]

{'loss': 0.0007, 'learning_rate': 5.709515859766278e-06, 'epoch': 0.71}


 72%|███████▏  | 429/599 [36:25<05:26,  1.92s/it]

{'loss': 0.0002, 'learning_rate': 5.676126878130217e-06, 'epoch': 0.72}


 72%|███████▏  | 430/599 [36:27<05:21,  1.90s/it]

{'loss': 0.0002, 'learning_rate': 5.642737896494157e-06, 'epoch': 0.72}


 72%|███████▏  | 431/599 [36:28<05:00,  1.79s/it]

{'loss': 0.0001, 'learning_rate': 5.609348914858097e-06, 'epoch': 0.72}


 72%|███████▏  | 432/599 [36:30<04:50,  1.74s/it]

{'loss': 0.0008, 'learning_rate': 5.575959933222037e-06, 'epoch': 0.72}


 72%|███████▏  | 433/599 [36:32<04:46,  1.73s/it]

{'loss': 0.0002, 'learning_rate': 5.542570951585977e-06, 'epoch': 0.72}


 72%|███████▏  | 434/599 [36:33<04:30,  1.64s/it]

{'loss': 0.0003, 'learning_rate': 5.509181969949917e-06, 'epoch': 0.72}


 73%|███████▎  | 435/599 [36:35<04:37,  1.69s/it]

{'loss': 0.0005, 'learning_rate': 5.475792988313858e-06, 'epoch': 0.73}


 73%|███████▎  | 436/599 [36:37<04:38,  1.71s/it]

{'loss': 0.0002, 'learning_rate': 5.4424040066777976e-06, 'epoch': 0.73}


 73%|███████▎  | 437/599 [36:38<04:33,  1.69s/it]

{'loss': 0.0004, 'learning_rate': 5.409015025041736e-06, 'epoch': 0.73}


 73%|███████▎  | 438/599 [36:40<04:36,  1.72s/it]

{'loss': 0.0004, 'learning_rate': 5.375626043405676e-06, 'epoch': 0.73}


 73%|███████▎  | 439/599 [36:42<04:27,  1.67s/it]

{'loss': 0.0002, 'learning_rate': 5.342237061769617e-06, 'epoch': 0.73}


 73%|███████▎  | 440/599 [36:44<04:48,  1.81s/it]

{'loss': 0.0002, 'learning_rate': 5.308848080133557e-06, 'epoch': 0.73}


 74%|███████▎  | 441/599 [36:46<04:42,  1.79s/it]

{'loss': 0.0003, 'learning_rate': 5.2754590984974965e-06, 'epoch': 0.74}


 74%|███████▍  | 442/599 [36:49<05:57,  2.28s/it]

{'loss': 0.0002, 'learning_rate': 5.2420701168614365e-06, 'epoch': 0.74}


 74%|███████▍  | 443/599 [36:51<05:26,  2.09s/it]

{'loss': 0.0003, 'learning_rate': 5.2086811352253765e-06, 'epoch': 0.74}


 74%|███████▍  | 444/599 [36:52<04:58,  1.92s/it]

{'loss': 0.0001, 'learning_rate': 5.1752921535893164e-06, 'epoch': 0.74}


 74%|███████▍  | 445/599 [36:54<04:44,  1.85s/it]

{'loss': 0.0001, 'learning_rate': 5.141903171953256e-06, 'epoch': 0.74}


 74%|███████▍  | 446/599 [36:56<04:34,  1.79s/it]

{'loss': 0.0002, 'learning_rate': 5.1085141903171955e-06, 'epoch': 0.74}


 75%|███████▍  | 447/599 [36:57<04:28,  1.77s/it]

{'loss': 0.0002, 'learning_rate': 5.0751252086811355e-06, 'epoch': 0.75}


 75%|███████▍  | 448/599 [37:00<05:08,  2.05s/it]

{'loss': 0.0001, 'learning_rate': 5.0417362270450755e-06, 'epoch': 0.75}


 75%|███████▍  | 449/599 [37:02<05:20,  2.14s/it]

{'loss': 0.0002, 'learning_rate': 5.008347245409015e-06, 'epoch': 0.75}


 75%|███████▌  | 450/599 [37:04<05:17,  2.13s/it]

{'loss': 0.0032, 'learning_rate': 4.974958263772955e-06, 'epoch': 0.75}


 75%|███████▌  | 451/599 [37:06<04:44,  1.92s/it]

{'loss': 0.0002, 'learning_rate': 4.941569282136895e-06, 'epoch': 0.75}


 75%|███████▌  | 452/599 [37:08<05:00,  2.05s/it]

{'loss': 0.0001, 'learning_rate': 4.908180300500835e-06, 'epoch': 0.75}


 76%|███████▌  | 453/599 [37:11<05:10,  2.12s/it]

{'loss': 0.0002, 'learning_rate': 4.874791318864775e-06, 'epoch': 0.76}


 76%|███████▌  | 454/599 [37:12<04:55,  2.04s/it]

{'loss': 0.0002, 'learning_rate': 4.841402337228715e-06, 'epoch': 0.76}


 76%|███████▌  | 455/599 [37:16<06:12,  2.59s/it]

{'loss': 0.0001, 'learning_rate': 4.808013355592655e-06, 'epoch': 0.76}


 76%|███████▌  | 456/599 [37:18<05:43,  2.40s/it]

{'loss': 0.0001, 'learning_rate': 4.774624373956594e-06, 'epoch': 0.76}


 76%|███████▋  | 457/599 [37:20<05:20,  2.26s/it]

{'loss': 0.0002, 'learning_rate': 4.741235392320534e-06, 'epoch': 0.76}


 76%|███████▋  | 458/599 [37:22<05:04,  2.16s/it]

{'loss': 0.0002, 'learning_rate': 4.707846410684474e-06, 'epoch': 0.76}


 77%|███████▋  | 459/599 [37:24<05:00,  2.14s/it]

{'loss': 0.0003, 'learning_rate': 4.674457429048414e-06, 'epoch': 0.77}


 77%|███████▋  | 460/599 [37:26<04:55,  2.13s/it]

{'loss': 0.0002, 'learning_rate': 4.641068447412354e-06, 'epoch': 0.77}


 77%|███████▋  | 461/599 [37:31<06:28,  2.82s/it]

{'loss': 0.0004, 'learning_rate': 4.607679465776294e-06, 'epoch': 0.77}


 77%|███████▋  | 462/599 [37:33<05:56,  2.60s/it]

{'loss': 0.0004, 'learning_rate': 4.574290484140234e-06, 'epoch': 0.77}


 77%|███████▋  | 463/599 [37:35<05:43,  2.52s/it]

{'loss': 0.0004, 'learning_rate': 4.540901502504174e-06, 'epoch': 0.77}


 77%|███████▋  | 464/599 [37:37<05:00,  2.22s/it]

{'loss': 0.0001, 'learning_rate': 4.507512520868114e-06, 'epoch': 0.77}


 78%|███████▊  | 465/599 [37:38<04:41,  2.10s/it]

{'loss': 0.0003, 'learning_rate': 4.474123539232054e-06, 'epoch': 0.78}


 78%|███████▊  | 466/599 [37:40<04:32,  2.05s/it]

{'loss': 0.0001, 'learning_rate': 4.440734557595994e-06, 'epoch': 0.78}


 78%|███████▊  | 467/599 [37:42<04:18,  1.96s/it]

{'loss': 0.0001, 'learning_rate': 4.407345575959934e-06, 'epoch': 0.78}


 78%|███████▊  | 468/599 [37:45<04:50,  2.21s/it]

{'loss': 0.0001, 'learning_rate': 4.373956594323873e-06, 'epoch': 0.78}


 78%|███████▊  | 469/599 [37:47<04:54,  2.26s/it]

{'loss': 0.0001, 'learning_rate': 4.340567612687813e-06, 'epoch': 0.78}


 78%|███████▊  | 470/599 [37:49<04:27,  2.08s/it]

{'loss': 0.0002, 'learning_rate': 4.307178631051753e-06, 'epoch': 0.78}


 79%|███████▊  | 471/599 [37:51<04:29,  2.11s/it]

{'loss': 0.0004, 'learning_rate': 4.273789649415693e-06, 'epoch': 0.79}


 79%|███████▉  | 472/599 [37:53<04:12,  1.99s/it]

{'loss': 0.0003, 'learning_rate': 4.240400667779633e-06, 'epoch': 0.79}


 79%|███████▉  | 473/599 [37:55<04:06,  1.96s/it]

{'loss': 0.0002, 'learning_rate': 4.207011686143573e-06, 'epoch': 0.79}


 79%|███████▉  | 474/599 [37:56<03:46,  1.81s/it]

{'loss': 0.0002, 'learning_rate': 4.173622704507513e-06, 'epoch': 0.79}


 79%|███████▉  | 475/599 [37:58<03:57,  1.92s/it]

{'loss': 0.0001, 'learning_rate': 4.140233722871453e-06, 'epoch': 0.79}


 79%|███████▉  | 476/599 [38:00<04:02,  1.97s/it]

{'loss': 0.0002, 'learning_rate': 4.106844741235393e-06, 'epoch': 0.79}


 80%|███████▉  | 477/599 [38:03<04:27,  2.19s/it]

{'loss': 0.0003, 'learning_rate': 4.073455759599333e-06, 'epoch': 0.8}


 80%|███████▉  | 478/599 [38:05<04:01,  1.99s/it]

{'loss': 0.0001, 'learning_rate': 4.040066777963273e-06, 'epoch': 0.8}


 80%|███████▉  | 479/599 [38:06<03:47,  1.89s/it]

{'loss': 0.0002, 'learning_rate': 4.006677796327213e-06, 'epoch': 0.8}


 80%|████████  | 480/599 [38:08<03:44,  1.88s/it]

{'loss': 0.0002, 'learning_rate': 3.973288814691153e-06, 'epoch': 0.8}


 80%|████████  | 481/599 [38:10<03:29,  1.78s/it]

{'loss': 0.0005, 'learning_rate': 3.939899833055092e-06, 'epoch': 0.8}


 80%|████████  | 482/599 [38:11<03:22,  1.73s/it]

{'loss': 0.0002, 'learning_rate': 3.906510851419032e-06, 'epoch': 0.8}


 81%|████████  | 483/599 [38:13<03:10,  1.64s/it]

{'loss': 0.0002, 'learning_rate': 3.873121869782972e-06, 'epoch': 0.81}


 81%|████████  | 484/599 [38:15<03:13,  1.68s/it]

{'loss': 0.0002, 'learning_rate': 3.839732888146912e-06, 'epoch': 0.81}


 81%|████████  | 485/599 [38:16<03:15,  1.71s/it]

{'loss': 0.0001, 'learning_rate': 3.8063439065108516e-06, 'epoch': 0.81}


 81%|████████  | 486/599 [38:19<03:30,  1.87s/it]

{'loss': 0.0002, 'learning_rate': 3.7729549248747916e-06, 'epoch': 0.81}


 81%|████████▏ | 487/599 [38:20<03:19,  1.78s/it]

{'loss': 0.0001, 'learning_rate': 3.7395659432387315e-06, 'epoch': 0.81}


 81%|████████▏ | 488/599 [38:22<03:15,  1.76s/it]

{'loss': 0.0008, 'learning_rate': 3.7061769616026715e-06, 'epoch': 0.81}


 82%|████████▏ | 489/599 [38:24<03:08,  1.71s/it]

{'loss': 0.0004, 'learning_rate': 3.672787979966611e-06, 'epoch': 0.82}


 82%|████████▏ | 490/599 [38:25<03:09,  1.73s/it]

{'loss': 0.0001, 'learning_rate': 3.639398998330551e-06, 'epoch': 0.82}


 82%|████████▏ | 491/599 [38:28<03:22,  1.88s/it]

{'loss': 0.0003, 'learning_rate': 3.606010016694491e-06, 'epoch': 0.82}


 82%|████████▏ | 492/599 [38:29<03:10,  1.78s/it]

{'loss': 0.0002, 'learning_rate': 3.5726210350584314e-06, 'epoch': 0.82}


 82%|████████▏ | 493/599 [38:31<03:08,  1.78s/it]

{'loss': 0.0001, 'learning_rate': 3.5392320534223705e-06, 'epoch': 0.82}


 82%|████████▏ | 494/599 [38:33<03:08,  1.80s/it]

{'loss': 0.0002, 'learning_rate': 3.505843071786311e-06, 'epoch': 0.82}


 83%|████████▎ | 495/599 [38:34<03:04,  1.77s/it]

{'loss': 0.0005, 'learning_rate': 3.472454090150251e-06, 'epoch': 0.83}


 83%|████████▎ | 496/599 [38:36<03:10,  1.85s/it]

{'loss': 0.0001, 'learning_rate': 3.439065108514191e-06, 'epoch': 0.83}


 83%|████████▎ | 497/599 [38:38<03:03,  1.80s/it]

{'loss': 0.0001, 'learning_rate': 3.4056761268781308e-06, 'epoch': 0.83}


 83%|████████▎ | 498/599 [38:40<02:55,  1.74s/it]

{'loss': 0.0002, 'learning_rate': 3.3722871452420703e-06, 'epoch': 0.83}


 83%|████████▎ | 499/599 [38:41<02:39,  1.59s/it]

{'loss': 0.0002, 'learning_rate': 3.3388981636060103e-06, 'epoch': 0.83}


 83%|████████▎ | 500/599 [38:43<02:51,  1.73s/it]***** Running Evaluation *****
  Num examples = 5989
  Batch size = 10


{'loss': 0.0003, 'learning_rate': 3.3055091819699502e-06, 'epoch': 0.83}


                                                 
 83%|████████▎ | 500/599 [43:32<02:51,  1.73s/it]Saving model checkpoint to bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-500
Configuration saved in bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-500/config.json


{'eval_loss': 9.931849490385503e-05, 'eval_runtime': 289.389, 'eval_samples_per_second': 20.695, 'eval_steps_per_second': 2.07, 'epoch': 0.83}


Model weights saved in bert-base-cased-finetuned-sents-coref-black-clover/checkpoint-500/pytorch_model.bin
 84%|████████▎ | 501/599 [43:35<2:24:58, 88.76s/it]

{'loss': 0.0006, 'learning_rate': 3.27212020033389e-06, 'epoch': 0.84}


 84%|████████▍ | 502/599 [43:36<1:41:03, 62.51s/it]

{'loss': 0.0002, 'learning_rate': 3.2387312186978297e-06, 'epoch': 0.84}


 84%|████████▍ | 503/599 [43:38<1:10:53, 44.31s/it]

{'loss': 0.0002, 'learning_rate': 3.2053422370617697e-06, 'epoch': 0.84}


 84%|████████▍ | 504/599 [43:40<49:51, 31.49s/it]  

{'loss': 0.0012, 'learning_rate': 3.1719532554257097e-06, 'epoch': 0.84}


 84%|████████▍ | 505/599 [43:41<35:25, 22.62s/it]

{'loss': 0.0002, 'learning_rate': 3.13856427378965e-06, 'epoch': 0.84}


 84%|████████▍ | 506/599 [43:43<25:13, 16.27s/it]

{'loss': 0.0001, 'learning_rate': 3.105175292153589e-06, 'epoch': 0.84}


 85%|████████▍ | 507/599 [43:45<18:23, 11.99s/it]

{'loss': 0.0002, 'learning_rate': 3.0717863105175296e-06, 'epoch': 0.85}


 85%|████████▍ | 508/599 [43:47<13:49,  9.12s/it]

{'loss': 0.0002, 'learning_rate': 3.0383973288814695e-06, 'epoch': 0.85}


 85%|████████▍ | 509/599 [43:49<10:31,  7.02s/it]

{'loss': 0.0001, 'learning_rate': 3.0050083472454095e-06, 'epoch': 0.85}


 85%|████████▌ | 510/599 [43:51<08:07,  5.47s/it]

{'loss': 0.0002, 'learning_rate': 2.971619365609349e-06, 'epoch': 0.85}


 85%|████████▌ | 511/599 [43:53<06:16,  4.27s/it]

{'loss': 0.0001, 'learning_rate': 2.938230383973289e-06, 'epoch': 0.85}


 85%|████████▌ | 512/599 [43:55<05:08,  3.55s/it]

{'loss': 0.0001, 'learning_rate': 2.904841402337229e-06, 'epoch': 0.85}


 86%|████████▌ | 513/599 [43:57<04:25,  3.09s/it]

{'loss': 0.0005, 'learning_rate': 2.871452420701169e-06, 'epoch': 0.86}


 86%|████████▌ | 514/599 [43:59<04:02,  2.85s/it]

{'loss': 0.0001, 'learning_rate': 2.8380634390651085e-06, 'epoch': 0.86}


 86%|████████▌ | 515/599 [44:01<03:29,  2.50s/it]

{'loss': 0.0001, 'learning_rate': 2.8046744574290484e-06, 'epoch': 0.86}


 86%|████████▌ | 516/599 [44:02<03:09,  2.29s/it]

{'loss': 0.0002, 'learning_rate': 2.7712854757929884e-06, 'epoch': 0.86}


 86%|████████▋ | 517/599 [44:05<03:06,  2.27s/it]

{'loss': 0.0002, 'learning_rate': 2.737896494156929e-06, 'epoch': 0.86}


 86%|████████▋ | 518/599 [44:07<03:06,  2.30s/it]

{'loss': 0.0002, 'learning_rate': 2.704507512520868e-06, 'epoch': 0.86}


 87%|████████▋ | 519/599 [44:09<02:52,  2.16s/it]

{'loss': 0.0002, 'learning_rate': 2.6711185308848083e-06, 'epoch': 0.87}


 87%|████████▋ | 520/599 [44:10<02:37,  1.99s/it]

{'loss': 0.0003, 'learning_rate': 2.6377295492487483e-06, 'epoch': 0.87}


 87%|████████▋ | 521/599 [44:12<02:27,  1.89s/it]

{'loss': 0.0008, 'learning_rate': 2.6043405676126882e-06, 'epoch': 0.87}


 87%|████████▋ | 522/599 [44:14<02:25,  1.88s/it]

{'loss': 0.0002, 'learning_rate': 2.570951585976628e-06, 'epoch': 0.87}


 87%|████████▋ | 523/599 [44:16<02:23,  1.88s/it]

{'loss': 0.0002, 'learning_rate': 2.5375626043405677e-06, 'epoch': 0.87}


 87%|████████▋ | 524/599 [44:18<02:19,  1.86s/it]

{'loss': 0.0001, 'learning_rate': 2.5041736227045077e-06, 'epoch': 0.87}


 88%|████████▊ | 525/599 [44:20<02:22,  1.93s/it]

{'loss': 0.0001, 'learning_rate': 2.4707846410684477e-06, 'epoch': 0.88}


 88%|████████▊ | 526/599 [44:21<02:08,  1.75s/it]

{'loss': 0.0002, 'learning_rate': 2.4373956594323876e-06, 'epoch': 0.88}


 88%|████████▊ | 527/599 [44:23<02:17,  1.91s/it]

{'loss': 0.0007, 'learning_rate': 2.4040066777963276e-06, 'epoch': 0.88}


 88%|████████▊ | 528/599 [44:26<02:29,  2.10s/it]

{'loss': 0.0002, 'learning_rate': 2.370617696160267e-06, 'epoch': 0.88}


 88%|████████▊ | 529/599 [44:30<03:07,  2.67s/it]

{'loss': 0.0002, 'learning_rate': 2.337228714524207e-06, 'epoch': 0.88}


 88%|████████▊ | 530/599 [44:32<02:44,  2.38s/it]

{'loss': 0.0001, 'learning_rate': 2.303839732888147e-06, 'epoch': 0.88}


 89%|████████▊ | 531/599 [44:33<02:25,  2.13s/it]

{'loss': 0.0001, 'learning_rate': 2.270450751252087e-06, 'epoch': 0.89}


 89%|████████▉ | 532/599 [44:36<02:39,  2.38s/it]

{'loss': 0.0002, 'learning_rate': 2.237061769616027e-06, 'epoch': 0.89}


 89%|████████▉ | 533/599 [44:38<02:27,  2.23s/it]

{'loss': 0.0001, 'learning_rate': 2.203672787979967e-06, 'epoch': 0.89}


 89%|████████▉ | 534/599 [44:40<02:18,  2.14s/it]

{'loss': 0.0007, 'learning_rate': 2.1702838063439065e-06, 'epoch': 0.89}


 89%|████████▉ | 535/599 [44:42<02:10,  2.05s/it]

{'loss': 0.0002, 'learning_rate': 2.1368948247078465e-06, 'epoch': 0.89}


 89%|████████▉ | 536/599 [44:43<01:59,  1.90s/it]

{'loss': 0.0003, 'learning_rate': 2.1035058430717864e-06, 'epoch': 0.89}


 90%|████████▉ | 537/599 [44:46<02:03,  1.99s/it]

{'loss': 0.0002, 'learning_rate': 2.0701168614357264e-06, 'epoch': 0.9}


 90%|████████▉ | 538/599 [44:48<02:03,  2.03s/it]

{'loss': 0.0002, 'learning_rate': 2.0367278797996664e-06, 'epoch': 0.9}


 90%|████████▉ | 539/599 [44:49<01:51,  1.86s/it]

{'loss': 0.0002, 'learning_rate': 2.0033388981636063e-06, 'epoch': 0.9}


 90%|█████████ | 540/599 [44:51<01:44,  1.77s/it]

{'loss': 0.0002, 'learning_rate': 1.969949916527546e-06, 'epoch': 0.9}


 90%|█████████ | 541/599 [44:53<01:45,  1.82s/it]

{'loss': 0.0002, 'learning_rate': 1.936560934891486e-06, 'epoch': 0.9}


 90%|█████████ | 542/599 [44:55<01:52,  1.97s/it]

{'loss': 0.0073, 'learning_rate': 1.9031719532554258e-06, 'epoch': 0.9}


 91%|█████████ | 543/599 [44:57<01:52,  2.01s/it]

{'loss': 0.0002, 'learning_rate': 1.8697829716193658e-06, 'epoch': 0.91}


 91%|█████████ | 544/599 [44:59<01:55,  2.11s/it]

{'loss': 0.0001, 'learning_rate': 1.8363939899833055e-06, 'epoch': 0.91}


 91%|█████████ | 545/599 [45:01<01:49,  2.04s/it]

{'loss': 0.0002, 'learning_rate': 1.8030050083472455e-06, 'epoch': 0.91}


 91%|█████████ | 546/599 [45:03<01:39,  1.87s/it]

{'loss': 0.0002, 'learning_rate': 1.7696160267111852e-06, 'epoch': 0.91}


 91%|█████████▏| 547/599 [45:05<01:40,  1.93s/it]

{'loss': 0.0005, 'learning_rate': 1.7362270450751254e-06, 'epoch': 0.91}


 91%|█████████▏| 548/599 [45:07<01:39,  1.95s/it]

{'loss': 0.0002, 'learning_rate': 1.7028380634390654e-06, 'epoch': 0.91}


 92%|█████████▏| 549/599 [45:09<01:44,  2.08s/it]

{'loss': 0.0001, 'learning_rate': 1.6694490818030051e-06, 'epoch': 0.92}


 92%|█████████▏| 550/599 [45:12<01:46,  2.18s/it]

{'loss': 0.0007, 'learning_rate': 1.636060100166945e-06, 'epoch': 0.92}


 92%|█████████▏| 551/599 [45:13<01:36,  2.00s/it]

{'loss': 0.0002, 'learning_rate': 1.6026711185308849e-06, 'epoch': 0.92}


 92%|█████████▏| 552/599 [45:15<01:33,  2.00s/it]

{'loss': 0.0002, 'learning_rate': 1.569282136894825e-06, 'epoch': 0.92}


 92%|█████████▏| 553/599 [45:17<01:30,  1.96s/it]

{'loss': 0.0002, 'learning_rate': 1.5358931552587648e-06, 'epoch': 0.92}


 92%|█████████▏| 554/599 [45:19<01:25,  1.89s/it]

{'loss': 0.0002, 'learning_rate': 1.5025041736227048e-06, 'epoch': 0.92}


 93%|█████████▎| 555/599 [45:21<01:23,  1.91s/it]

{'loss': 0.0002, 'learning_rate': 1.4691151919866445e-06, 'epoch': 0.93}


 93%|█████████▎| 556/599 [45:23<01:28,  2.05s/it]

{'loss': 0.0002, 'learning_rate': 1.4357262103505845e-06, 'epoch': 0.93}


 93%|█████████▎| 557/599 [45:27<01:46,  2.53s/it]

{'loss': 0.0001, 'learning_rate': 1.4023372287145242e-06, 'epoch': 0.93}


 93%|█████████▎| 558/599 [45:28<01:31,  2.22s/it]

{'loss': 0.0004, 'learning_rate': 1.3689482470784644e-06, 'epoch': 0.93}


 93%|█████████▎| 559/599 [45:32<01:44,  2.61s/it]

{'loss': 0.0001, 'learning_rate': 1.3355592654424042e-06, 'epoch': 0.93}


 93%|█████████▎| 560/599 [45:34<01:31,  2.35s/it]

{'loss': 0.0002, 'learning_rate': 1.3021702838063441e-06, 'epoch': 0.93}


 94%|█████████▎| 561/599 [45:36<01:26,  2.28s/it]

{'loss': 0.0001, 'learning_rate': 1.2687813021702839e-06, 'epoch': 0.94}


 94%|█████████▍| 562/599 [45:38<01:20,  2.18s/it]

{'loss': 0.0058, 'learning_rate': 1.2353923205342238e-06, 'epoch': 0.94}


 94%|█████████▍| 563/599 [45:39<01:15,  2.09s/it]

{'loss': 0.001, 'learning_rate': 1.2020033388981638e-06, 'epoch': 0.94}


 94%|█████████▍| 564/599 [45:41<01:07,  1.94s/it]

{'loss': 0.0003, 'learning_rate': 1.1686143572621036e-06, 'epoch': 0.94}


 94%|█████████▍| 565/599 [45:43<01:01,  1.82s/it]

{'loss': 0.0001, 'learning_rate': 1.1352253756260435e-06, 'epoch': 0.94}


 94%|█████████▍| 566/599 [45:44<00:57,  1.74s/it]

{'loss': 0.0003, 'learning_rate': 1.1018363939899835e-06, 'epoch': 0.94}


 95%|█████████▍| 567/599 [45:46<00:55,  1.75s/it]

{'loss': 0.0004, 'learning_rate': 1.0684474123539232e-06, 'epoch': 0.95}


 95%|█████████▍| 568/599 [45:48<00:56,  1.81s/it]

{'loss': 0.0002, 'learning_rate': 1.0350584307178632e-06, 'epoch': 0.95}


 95%|█████████▍| 569/599 [45:52<01:13,  2.45s/it]

{'loss': 0.0001, 'learning_rate': 1.0016694490818032e-06, 'epoch': 0.95}


 95%|█████████▌| 570/599 [45:53<01:02,  2.16s/it]

{'loss': 0.0002, 'learning_rate': 9.68280467445743e-07, 'epoch': 0.95}


 95%|█████████▌| 571/599 [45:55<00:55,  1.97s/it]

{'loss': 0.0002, 'learning_rate': 9.348914858096829e-07, 'epoch': 0.95}


 95%|█████████▌| 572/599 [45:56<00:49,  1.84s/it]

{'loss': 0.0001, 'learning_rate': 9.015025041736227e-07, 'epoch': 0.95}


 96%|█████████▌| 573/599 [45:58<00:47,  1.81s/it]

{'loss': 0.0002, 'learning_rate': 8.681135225375627e-07, 'epoch': 0.96}


 96%|█████████▌| 574/599 [46:00<00:48,  1.96s/it]

{'loss': 0.0002, 'learning_rate': 8.347245409015026e-07, 'epoch': 0.96}


 96%|█████████▌| 575/599 [46:02<00:47,  2.00s/it]

{'loss': 0.0003, 'learning_rate': 8.013355592654424e-07, 'epoch': 0.96}


 96%|█████████▌| 576/599 [46:04<00:41,  1.80s/it]

{'loss': 0.0002, 'learning_rate': 7.679465776293824e-07, 'epoch': 0.96}


 96%|█████████▋| 577/599 [46:06<00:44,  2.01s/it]

{'loss': 0.0001, 'learning_rate': 7.345575959933223e-07, 'epoch': 0.96}


 96%|█████████▋| 578/599 [46:08<00:39,  1.87s/it]

{'loss': 0.0002, 'learning_rate': 7.011686143572621e-07, 'epoch': 0.96}


 97%|█████████▋| 579/599 [46:09<00:34,  1.73s/it]

{'loss': 0.0003, 'learning_rate': 6.677796327212021e-07, 'epoch': 0.97}


 97%|█████████▋| 580/599 [46:11<00:33,  1.79s/it]

{'loss': 0.0047, 'learning_rate': 6.343906510851419e-07, 'epoch': 0.97}


 97%|█████████▋| 581/599 [46:13<00:32,  1.78s/it]

{'loss': 0.0001, 'learning_rate': 6.010016694490819e-07, 'epoch': 0.97}


 97%|█████████▋| 582/599 [46:16<00:36,  2.15s/it]

{'loss': 0.0003, 'learning_rate': 5.676126878130218e-07, 'epoch': 0.97}


 97%|█████████▋| 583/599 [46:18<00:31,  1.99s/it]

{'loss': 0.0004, 'learning_rate': 5.342237061769616e-07, 'epoch': 0.97}


 97%|█████████▋| 584/599 [46:19<00:28,  1.88s/it]

{'loss': 0.0002, 'learning_rate': 5.008347245409016e-07, 'epoch': 0.97}


 98%|█████████▊| 585/599 [46:22<00:28,  2.06s/it]

{'loss': 0.0002, 'learning_rate': 4.6744574290484144e-07, 'epoch': 0.98}


 98%|█████████▊| 586/599 [46:23<00:24,  1.89s/it]

{'loss': 0.0002, 'learning_rate': 4.3405676126878136e-07, 'epoch': 0.98}


 98%|█████████▊| 587/599 [46:26<00:24,  2.03s/it]

{'loss': 0.0001, 'learning_rate': 4.006677796327212e-07, 'epoch': 0.98}


 98%|█████████▊| 588/599 [46:28<00:23,  2.16s/it]

{'loss': 0.0002, 'learning_rate': 3.672787979966611e-07, 'epoch': 0.98}


 98%|█████████▊| 589/599 [46:29<00:19,  1.95s/it]

{'loss': 0.0004, 'learning_rate': 3.3388981636060104e-07, 'epoch': 0.98}


 98%|█████████▊| 590/599 [46:32<00:19,  2.17s/it]

{'loss': 0.0005, 'learning_rate': 3.0050083472454095e-07, 'epoch': 0.98}


 99%|█████████▊| 591/599 [46:34<00:16,  2.11s/it]

{'loss': 0.0004, 'learning_rate': 2.671118530884808e-07, 'epoch': 0.99}


 99%|█████████▉| 592/599 [46:35<00:13,  1.89s/it]

{'loss': 0.0005, 'learning_rate': 2.3372287145242072e-07, 'epoch': 0.99}


 99%|█████████▉| 593/599 [46:38<00:11,  1.97s/it]

{'loss': 0.004, 'learning_rate': 2.003338898163606e-07, 'epoch': 0.99}


 99%|█████████▉| 594/599 [46:40<00:09,  1.96s/it]

{'loss': 0.0005, 'learning_rate': 1.6694490818030052e-07, 'epoch': 0.99}


 99%|█████████▉| 595/599 [46:42<00:08,  2.10s/it]

{'loss': 0.0003, 'learning_rate': 1.335559265442404e-07, 'epoch': 0.99}


 99%|█████████▉| 596/599 [46:44<00:05,  1.93s/it]

{'loss': 0.0003, 'learning_rate': 1.001669449081803e-07, 'epoch': 0.99}


100%|█████████▉| 597/599 [46:46<00:04,  2.01s/it]

{'loss': 0.0002, 'learning_rate': 6.67779632721202e-08, 'epoch': 1.0}


100%|█████████▉| 598/599 [46:48<00:02,  2.12s/it]

{'loss': 0.0001, 'learning_rate': 3.33889816360601e-08, 'epoch': 1.0}


100%|██████████| 599/599 [46:50<00:00,  1.92s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 599/599 [46:50<00:00,  4.69s/it]

{'loss': 0.0003, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 2810.0789, 'train_samples_per_second': 2.131, 'train_steps_per_second': 0.213, 'train_loss': 0.02317726491183841, 'epoch': 1.0}





## Summarize

In [18]:
import numpy as np

In [19]:
def predict_mask(input_str):
    """Tomamos el camino largo en lugar de usar pipeline
    """
    inputs = tokenizer(input_str, return_tensors="pt")
    mask_index = np.where(inputs['input_ids'] == tokenizer.mask_token_id)
    # .eval() to set dropout and batch normalization layers to evaluation mode
    model.eval()
    outputs = model(**inputs)
    top_5_predictions = torch.softmax(outputs.logits[mask_index], dim=1).topk(5)
    predicted = []
    for i in range(5):
        token = tokenizer.decode(top_5_predictions.indices[0, i])
        prob = top_5_predictions.values[0, i]
        predicted.append({
            "token": token, 
            "prob": prob.item()
        })
    return predicted

In [20]:
predicted_by_sentence_by_character = []
for sentence in sentences_w_subjects_tokenized:
    predicted_by_character = dict()
    for subject in sentence["subjects"]:
        print(sentence["tokens"] + " " + subject + " can be described as [MASK].")
        print()
        predicted = predict_mask(sentence["tokens"] + " " + subject + " can be described as [MASK].")
        predicted_by_character[subject] = predicted
        for i in range(0, len(predicted)):
            print(f" {i+1}) {predicted[i]['token']:<20} {predicted[i]['prob']:.3f}")
        print()
    predicted_by_sentence_by_character.append(predicted_by_character)

write_json(predicted_by_sentence_by_character, "27_sent_predicted_by_char_bert_sent_coref.json")

a priest takes two babies abandoned outside his church inside and discovers two babies abandoned outside his church names to be Yuno and Asta . Yuno can be described as [MASK].

 1) [CLS]                0.032
 2) well                 0.024
 3) possessed            0.017
 4) depressed            0.010
 5) mute                 0.007

a priest takes two babies abandoned outside his church inside and discovers two babies abandoned outside his church names to be Yuno and Asta . Asta can be described as [MASK].

 1) [CLS]                0.064
 2) well                 0.028
 3) possessed            0.016
 4) depressed            0.008
 5) normal               0.005

Fifteen years later , Asta proposes to Sister Lily , who refuses repeatedly . Lily can be described as [MASK].

 1) well                 0.015
 2) [CLS]                0.013
 3) female               0.008
 4) depressed            0.008
 5) pregnant             0.007

Fifteen years later , Asta proposes to Sister Lily , who refuses

In [21]:
predicted_by_sentence_by_character

[{'Yuno': [{'token': '[CLS]', 'prob': 0.03198141232132912},
   {'token': 'well', 'prob': 0.024102821946144104},
   {'token': 'possessed', 'prob': 0.016506711021065712},
   {'token': 'depressed', 'prob': 0.01045365072786808},
   {'token': 'mute', 'prob': 0.006880167406052351}]},
 {'Asta': [{'token': '[CLS]', 'prob': 0.06378690898418427},
   {'token': 'well', 'prob': 0.027626674622297287},
   {'token': 'possessed', 'prob': 0.015696875751018524},
   {'token': 'depressed', 'prob': 0.007910224609076977},
   {'token': 'normal', 'prob': 0.005311070941388607}]},
 {'Lily': [{'token': 'well', 'prob': 0.014730491675436497},
   {'token': '[CLS]', 'prob': 0.013179894536733627},
   {'token': 'female', 'prob': 0.008496018126606941},
   {'token': 'depressed', 'prob': 0.007985766045749187},
   {'token': 'pregnant', 'prob': 0.0073789628222584724}]},
 {'Asta': [{'token': '[CLS]', 'prob': 0.02613426186144352},
   {'token': 'well', 'prob': 0.018935659900307655},
   {'token': 'depressed', 'prob': 0.01357097

In [22]:
predicted_by_sentence_by_character = []
for sentence in sentences_w_subjects_tokenized:
    predicted_by_character = dict()
    for subject in sentence["subjects"]:
        print(subject + " can be described as [MASK].")
        print()
        predicted = predict_mask(subject + " can be described as [MASK].")
        predicted_by_character[subject] = predicted
        for i in range(0, len(predicted)):
            print(f" {i+1}) {predicted[i]['token']:<20} {predicted[i]['prob']:.3f}")
        print()
    predicted_by_sentence_by_character.append(predicted_by_character)

write_json(predicted_by_sentence_by_character, "27_predicted_by_char_bert_sent_coref.json")

Yuno can be described as [MASK].

 1) well                 0.061
 2) [CLS]                0.044
 3) depressed            0.007
 4) possessed            0.005
 5) drunk                0.005

Asta can be described as [MASK].

 1) well                 0.097
 2) [CLS]                0.053
 3) follows              0.007
 4) opposed              0.005
 5) normal               0.003

Lily can be described as [MASK].

 1) well                 0.030
 2) Lily                 0.030
 3) [CLS]                0.026
 4) female               0.008
 5) feminine             0.007

Asta can be described as [MASK].

 1) well                 0.097
 2) [CLS]                0.053
 3) follows              0.007
 4) opposed              0.005
 5) normal               0.003

Yuno can be described as [MASK].

 1) well                 0.061
 2) [CLS]                0.044
 3) depressed            0.007
 4) possessed            0.005
 5) drunk                0.005

Asta can be described as [MASK].

 1) well        

In [23]:
predicted_by_sentence_by_character

[{'Yuno': [{'token': 'well', 'prob': 0.061498939990997314},
   {'token': '[CLS]', 'prob': 0.04403240606188774},
   {'token': 'depressed', 'prob': 0.006903379689902067},
   {'token': 'possessed', 'prob': 0.004973285365849733},
   {'token': 'drunk', 'prob': 0.004861949011683464}]},
 {'Asta': [{'token': 'well', 'prob': 0.09666887670755386},
   {'token': '[CLS]', 'prob': 0.052952952682971954},
   {'token': 'follows', 'prob': 0.006803152617067099},
   {'token': 'opposed', 'prob': 0.004629872739315033},
   {'token': 'normal', 'prob': 0.0033788199070841074}]},
 {'Lily': [{'token': 'well', 'prob': 0.030192334204912186},
   {'token': 'Lily', 'prob': 0.02950207330286503},
   {'token': '[CLS]', 'prob': 0.025733662769198418},
   {'token': 'female', 'prob': 0.00785995926707983},
   {'token': 'feminine', 'prob': 0.0066382004879415035}]},
 {'Asta': [{'token': 'well', 'prob': 0.09666887670755386},
   {'token': '[CLS]', 'prob': 0.052952952682971954},
   {'token': 'follows', 'prob': 0.006803152617067099

In [25]:
def predict_mask_x(input_str, x):
    """Tomamos el camino largo en lugar de usar pipeline
    """
    inputs = tokenizer(input_str, return_tensors="pt")
    mask_index = np.where(inputs['input_ids'] == tokenizer.mask_token_id)
    # .eval() to set dropout and batch normalization layers to evaluation mode
    model.eval()
    outputs = model(**inputs)
    top_x_predictions = torch.softmax(outputs.logits[mask_index], dim=1).topk(x)
    predicted = []
    for i in range(x):
        token = tokenizer.decode(top_x_predictions.indices[0, i])
        prob = top_x_predictions.values[0, i]
        predicted.append({
            "token": token, 
            "prob": prob.item()
        })
    return predicted

In [29]:
subject = "Yami"
predicted = predict_mask_x(subject + " can be described as [MASK].", 27)
predicted_by_character[subject] = predicted
for i in range(0, len(predicted)):
    print(f" {i+1}) {predicted[i]['token']:<20} {predicted[i]['prob']:.3f}")
print()

 1) [CLS]                0.045
 2) well                 0.041
 3) depressed            0.004
 4) normal               0.004
 5) sound                0.004
 6) opposed              0.004
 7) floating             0.004
 8) possessed            0.003
 9) falling              0.003
 10) distinct             0.003
 11) weak                 0.003
 12) hollow               0.003
 13) stable               0.002
 14) follows              0.002
 15) nothing              0.002
 16) either               0.002
 17) simple               0.002
 18) strong               0.002
 19) drunk                0.002
 20) [UNK]                0.002
 21) flying               0.002
 22) independent          0.002
 23) impossible           0.002
 24) real                 0.002
 25) smoke                0.002
 26) mobile               0.001
 27) true                 0.001

