In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import pandas
from transformers import BertModel
from transformers import AutoTokenizer
from typing import Dict, List
from util import load_training_data

In [3]:
# Paths to the argument and label TSV files of the training and validation
# splits. They are relative paths, so they are resolved against the kernel's
# working directory.
train_arg_path = "../data/arguments-training.tsv"
train_label_path = "../data/labels-training.tsv"
validation_arg_path = "../data/arguments-validation.tsv"
validation_label_path = "../data/labels-validation.tsv"

In [20]:
# Read the argument table and the label table of the training split.
df_train_arguments, df_train_labels = load_training_data(
    train_arg_path,
    train_label_path,
)

# Only one value category is modelled here: keep the join key plus that label
# column, then attach the label to every argument row.
df_train_labels = df_train_labels[["Argument ID", "Self-direction: action"]]
df_train_arguments = pandas.merge(df_train_arguments, df_train_labels, on='Argument ID')
df_train_arguments.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,Self-direction: action
0,A01001,Entrapment should be legalized,in favor of,if entrapment can serve to more easily capture...,0
1,A01002,We should ban human cloning,in favor of,we should ban human cloning as it will only ca...,0
2,A01003,We should abandon marriage,against,marriage is the ultimate commitment to someone...,1
3,A01004,We should ban naturopathy,against,it provides a useful income for some people,0
4,A01005,We should ban fast food,in favor of,fast food should be banned because it is reall...,0


In [22]:
# Read the argument table and the label table of the validation split
# (load_training_data is reused here with the validation paths).
df_vali_arguments, df_vali_labels = load_training_data(validation_arg_path, validation_label_path)

# Keep the join key plus the single target label, then attach the label to
# every argument row.
df_vali_labels = df_vali_labels[["Argument ID", "Self-direction: action"]]
df_vali_arguments = df_vali_arguments.merge(df_vali_labels, on='Argument ID')
df_vali_arguments.head()

Unnamed: 0,Argument ID,Conclusion,Stance,Premise,Self-direction: action
0,A01001,Entrapment should be legalized,in favor of,if entrapment can serve to more easily capture...,0
1,A01012,The use of public defenders should be mandatory,in favor of,the use of public defenders should be mandator...,0
2,A02001,Payday loans should be banned,in favor of,payday loans create a more impoverished societ...,0
3,A02002,Surrogacy should be banned,against,Surrogacy should not be banned as it is the wo...,1
4,A02009,Entrapment should be legalized,against,entrapment is gravely immoral and,0


In [6]:
def generate_input(dataset: pandas.core.frame.DataFrame) -> (List[str], List[str], List[int]):
    """Split an argument table into parallel lists of model inputs and labels.

    Args:
        dataset (pandas.DataFrame): Table providing the columns "Premise",
            "Stance", "Conclusion" and "Self-direction: action".

    Returns:
        Three lists with one entry per row: the premises, the conclusions
        prefixed with their stance ("<stance>: <conclusion>"), and the
        "Self-direction: action" labels.
    """
    premises = dataset["Premise"].tolist()
    # Prepend the stance so the second text segment carries both pieces of information.
    stance_prefixed = dataset["Stance"] + ": " + dataset["Conclusion"]
    targets = dataset["Self-direction: action"].tolist()

    return premises, stance_prefixed.tolist(), targets

# Number of examples per mini-batch; passed to chunk/chunk_multi when the
# training and validation data are split into batches.
batch_size = 8

def chunk(lst, n):
    """Lazily split lst into consecutive slices of at most n items.

    The final slice is shorter when len(lst) is not a multiple of n.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

def chunk_multi(lst1, lst2, n):
    """Yield aligned n-sized slices of two sequences as (slice1, slice2) pairs.

    The slice positions are driven by len(lst1); lst2 is sliced at the same
    offsets.
    """
    for start in range(0, len(lst1), n):
        stop = start + n
        yield lst1[start:stop], lst2[start:stop]

In [7]:
def encode_labels(labels: List[int]) -> torch.LongTensor:
    """Turns the batch of labels into a tensor of integer class indices.

    Args:
        labels (List[int]): List of all labels in the batch. Each entry is
            passed through ``int()`` before being stored.

    Returns:
        torch.LongTensor: 1-D int64 tensor holding all labels in the batch.
    """
    # Integer (long) dtype, because these labels are used as class-index
    # targets for the NLL loss in the training loop.
    return torch.tensor([int(label) for label in labels], dtype=torch.long)

In [8]:
# Huggingface tokenizer

class BatchTokenizer:
    """Tokenizes and pads a batch of (premise, conclusion) sentence pairs."""

    def __init__(self, model_name: str = "prajjwal1/bert-small"):
        """Initializes the tokenizer

        Args:
            model_name (str, optional): Name of the pretrained checkpoint handed
                to ``AutoTokenizer.from_pretrained``. Defaults to
                "prajjwal1/bert-small".
        """
        self.hf_tokenizer = AutoTokenizer.from_pretrained(model_name)

    def get_sep_token(self,):
        """Returns the separator token of the wrapped HuggingFace tokenizer."""
        return self.hf_tokenizer.sep_token

    def __call__(self, prem_batch: List[str], conc_batch: List[str]) -> Dict:
        """Uses the huggingface tokenizer to tokenize and pad a batch of pairs.

        We return a dictionary of tensors per the huggingface model specification.

        Args:
            prem_batch (List[str]): The premise strings (first text segment)
            conc_batch (List[str]): The conclusion strings (second text
                segment), aligned by position with ``prem_batch``

        Returns:
            Dict: The dictionary-like token specification provided by
                HuggingFace. Token type ids are explicitly not requested.
        """
        # The HF tokenizer will PAD for us, and additionally combine
        # the two sentences delimited by the [SEP] token.
        return self.hf_tokenizer(
            prem_batch,
            conc_batch,
            padding=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )
    

# Example usage of the batch tokenizer: the first list holds the premises,
# the second list holds the matching hypotheses.
tokenizer = BatchTokenizer()
x = tokenizer(
    ["this is the premise.", "This is also a premise"],
    ["this is the hypothesis", "This is a second hypothesis"],
)
print(x)
# Decode the ids back to text to inspect the [CLS]/[SEP]/[PAD] layout.
tokenizer.hf_tokenizer.batch_decode(x["input_ids"])


{'input_ids': tensor([[  101,  2023,  2003,  1996, 18458,  1012,   102,  2023,  2003,  1996,
         10744,   102,     0],
        [  101,  2023,  2003,  2036,  1037, 18458,   102,  2023,  2003,  1037,
          2117, 10744,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


['[CLS] this is the premise. [SEP] this is the hypothesis [SEP] [PAD]',
 '[CLS] this is also a premise [SEP] this is a second hypothesis [SEP]']

In [9]:
# Build the model here
class HVDClassifier(torch.nn.Module):
    """Pretrained BERT encoder followed by a small feed-forward classification head.

    The pooled BERT output is projected to ``hidden_size``, passed through a
    ReLU, projected to ``output_size`` and converted to log-probabilities.
    """

    def __init__(
        self,
        output_size: int,
        hidden_size: int,
        freeze_bert: bool = True,
        model_name: str = "prajjwal1/bert-small",
    ):
        """Builds the encoder and the classification head.

        Args:
            output_size (int): Number of classes to predict.
            hidden_size (int): Width of the hidden layer in the classifier head.
            freeze_bert (bool, optional): If True, BERT's parameters get
                ``requires_grad = False`` so only the head is trained.
                Defaults to True.
            model_name (str, optional): Checkpoint name handed to
                ``BertModel.from_pretrained``. Defaults to "prajjwal1/bert-small".
        """
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        # Initialize BERT, which we use instead of a single embedding layer.
        self.bert = BertModel.from_pretrained(model_name)
        # Updating all BERT parameters can be slow and memory intensive, so by
        # default they are frozen and only the classification head is updated.
        # Notice that the learning rate should probably be smaller when BERT
        # is fine-tuned as well.
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        self.bert_hidden_dimension = self.bert.config.hidden_size
        # Extra hidden layer projecting from the BERT hidden dimension to hidden_size.
        self.hidden_layer = torch.nn.Linear(self.bert_hidden_dimension, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.classifier = torch.nn.Linear(self.hidden_size, self.output_size)
        # Normalize over the class axis, which is the last axis of the
        # batch_size x 1 x output_size tensor produced in forward().
        self.log_softmax = torch.nn.LogSoftmax(dim=-1)

    def encode_text(
        self,
        symbols: Dict
    ) -> torch.Tensor:
        """Encode the (batch of) token sequence(s) with BERT.

        BERT encodes context and gives us a single vector describing each
        sequence through its ``pooler_output``.

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: The pooled BERT representation with a singleton axis
                inserted at position 1, i.e. a tensor of the form
                batch_size x 1 x bert_hidden_dimension
        """
        encoded_sequence = self.bert(**symbols)
        return torch.unsqueeze(encoded_sequence.pooler_output, 1)

    def forward(
        self,
        symbols: Dict,
    ) -> torch.Tensor:
        """Compute per-class log-probabilities for a tokenized batch.

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: Log-probabilities of the form
                batch_size x 1 x output_size
        """
        encoded_sents = self.encode_text(symbols)
        hidden = self.relu(self.hidden_layer(encoded_sents))
        return self.log_softmax(self.classifier(hidden))

In [10]:
# For making predictions at test time
def predict(model: torch.nn.Module, sents: torch.Tensor) -> List:
    """Predict one class index per example of a batch.

    Args:
        model (torch.nn.Module): Model whose output has the classes on the
            last axis (batch_size x 1 x num_classes for the classifier above).
        sents (torch.Tensor): The batch passed to ``model`` as-is.

    Returns:
        List: The predicted class index of every example, in batch order.
    """
    # No gradients are needed for inference, so skip building the autograd graph.
    with torch.no_grad():
        logits = model(sents)
    # Flatten with reshape(-1) rather than a bare squeeze(): squeeze() would
    # collapse a batch of one example to a 0-d array, which cannot be listed.
    return list(torch.argmax(logits, dim=-1).reshape(-1).cpu().numpy())

In [11]:
import numpy as np
from numpy import logical_and, sum as t_sum

def precision(predicted_labels, true_labels, which_label=1):
    """
    Precision for `which_label`: true positives / all predictions of that label.
    Returns 0. when `which_label` was never predicted.
    """
    is_predicted = np.array([guess == which_label for guess in predicted_labels])
    is_actual = np.array([gold == which_label for gold in true_labels])
    n_predicted = t_sum(is_predicted)
    if not n_predicted:
        return 0.
    n_true_positive = t_sum(logical_and(is_predicted, is_actual))
    return n_true_positive/n_predicted


def recall(predicted_labels, true_labels, which_label=1):
    """
    Recall for `which_label`: true positives / all examples truly carrying that label.
    Returns 0. when no true label equals `which_label`.
    """
    is_predicted = np.array([guess == which_label for guess in predicted_labels])
    is_actual = np.array([gold == which_label for gold in true_labels])
    n_actual = t_sum(is_actual)
    if not n_actual:
        return 0.
    n_true_positive = t_sum(logical_and(is_predicted, is_actual))
    return n_true_positive/n_actual


def f1_score(
    predicted_labels: List[int],
    true_labels: List[int],
    which_label: int
):
    """
    Harmonic mean of precision and recall for `which_label`.
    Returns 0. when either precision or recall is zero.
    """
    prec = precision(predicted_labels, true_labels, which_label=which_label)
    rec = recall(predicted_labels, true_labels, which_label=which_label)
    # Guard clause: the harmonic mean is taken to be 0 if either component is 0.
    if not (prec and rec):
        return 0.
    return 2*prec*rec/(prec+rec)


def macro_f1(
    predicted_labels: List[int],
    true_labels: List[int],
    possible_labels: List[int]
):
    """
    Macro-averaged F1: the unweighted mean of the per-label F1 scores
    over `possible_labels`.
    """
    running_total = 0
    n_labels = 0
    for label in possible_labels:
        running_total += f1_score(predicted_labels, true_labels, label)
        n_labels += 1
    # Macro average: every label contributes equally.
    return running_total / n_labels

In [12]:
import random
from tqdm import tqdm

def training_loop(
    num_epochs,
    train_features,
    train_labels,
    dev_sents,
    dev_labels,
    optimizer,
    model,
):
    """Train `model` with NLL loss and report the dev macro-F1 after every epoch.

    Args:
        num_epochs: Number of passes over the training batches.
        train_features: Sequence of input batches, each passed to `model` as-is.
        train_labels: Sequence of label tensors aligned with `train_features`.
        dev_sents: Sequence of development input batches (must support len()).
        dev_labels: Sequence of label tensors aligned with `dev_sents`.
        optimizer: Optimizer already bound to the parameters to be updated.
        model: Module whose output, after squeeze(1), is accepted by NLLLoss.

    Returns:
        The same `model` object, after training.
    """
    print("Training...")
    loss_func = torch.nn.NLLLoss()
    batches = list(zip(train_features, train_labels))
    for i in range(num_epochs):
        # Reshuffle at the start of every epoch so the batches are not
        # presented in the same order each time.
        random.shuffle(batches)
        losses = []
        for features, labels in tqdm(batches):
            # Empty the dynamic computation graph
            optimizer.zero_grad()
            preds = model(features).squeeze(1)
            loss = loss_func(preds, labels)
            # Backpropagate the loss through our model
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

        # With no training batches there is no loss to average; report NaN
        # instead of dividing by zero.
        mean_loss = sum(losses)/len(losses) if losses else float("nan")
        print(f"epoch {i}, loss: {mean_loss}")
        # Estimate the f1 score for the development set
        print("Evaluating dev...")
        all_preds = []
        all_labels = []
        # Evaluation needs no gradients.
        with torch.no_grad():
            for sents, labels in tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):
                pred = predict(model, sents)
                all_preds.extend(pred)
                all_labels.extend(list(labels.numpy()))

        dev_f1 = macro_f1(all_preds, all_labels, [0,1])
        print(f"Dev F1 {dev_f1}")

    # Return the trained model
    return model

In [13]:
# Batch and tokenize the data here
tokenizer = BatchTokenizer()

# Training dataset: split into batches, then tokenize the text pairs and
# encode the labels batch by batch.
train_premises, train_conclusions, train_labels = generate_input(df_train_arguments)
train_input_batches = [
    tokenizer(premise_chunk, conclusion_chunk)
    for premise_chunk, conclusion_chunk in chunk_multi(train_premises, train_conclusions, batch_size)
]
train_label_batches = [
    encode_labels(label_chunk) for label_chunk in chunk(train_labels, batch_size)
]

# Validation dataset: same treatment as the training data.
vali_premises, vali_conclusions, vali_labels = generate_input(df_vali_arguments)
vali_input_batches = [
    tokenizer(premise_chunk, conclusion_chunk)
    for premise_chunk, conclusion_chunk in chunk_multi(vali_premises, vali_conclusions, batch_size)
]
vali_label_batches = [
    encode_labels(label_chunk) for label_chunk in chunk(vali_labels, batch_size)
]

{0, 1}

In [None]:
# Seed the RNGs used for weight initialisation and batch shuffling so that a
# re-run of this cell is repeatable.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# You can increase epochs if need be
epochs = 50
# TODO: Find a good learning rate
LR = 0.0001

# Number of distinct label values in the training data = size of the output layer.
possible_labels = len(set(train_labels))
model = HVDClassifier(output_size=possible_labels, hidden_size=1024)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)

# training_loop returns the model it was given; binding the result keeps the
# cell from echoing the full module repr as its output.
model = training_loop(
    epochs,
    train_input_batches,
    train_label_batches,
    vali_input_batches,
    vali_label_batches,
    optimizer,
    model,
)

Some weights of the model checkpoint at prajjwal1/bert-small were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training...


100%|█████████████████████████████████████████| 653/653 [02:49<00:00,  3.85it/s]


epoch 0, loss: 0.5317879814489147
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:11<00:00,  3.33it/s]


Dev F1 0.4617653666321883


100%|█████████████████████████████████████████| 653/653 [02:52<00:00,  3.79it/s]


epoch 1, loss: 0.49956278189172426
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:12<00:00,  3.28it/s]


Dev F1 0.5341889225939204


100%|█████████████████████████████████████████| 653/653 [02:57<00:00,  3.68it/s]


epoch 2, loss: 0.48487321974832287
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:21<00:00,  2.92it/s]


Dev F1 0.5665051694180758


100%|█████████████████████████████████████████| 653/653 [03:08<00:00,  3.47it/s]


epoch 3, loss: 0.47402301507147676
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:21<00:00,  2.90it/s]


Dev F1 0.5990411444756677


100%|█████████████████████████████████████████| 653/653 [03:49<00:00,  2.84it/s]


epoch 4, loss: 0.46459997384873136
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:41<00:00,  2.33it/s]


Dev F1 0.6169342061141567


100%|█████████████████████████████████████████| 653/653 [03:41<00:00,  2.94it/s]


epoch 5, loss: 0.45576198794415496
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:34<00:00,  2.50it/s]


Dev F1 0.6326299967655397


100%|█████████████████████████████████████████| 653/653 [03:34<00:00,  3.04it/s]


epoch 6, loss: 0.4472402047114889
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:29<00:00,  2.64it/s]


Dev F1 0.6401811103532191


100%|█████████████████████████████████████████| 653/653 [03:02<00:00,  3.58it/s]


epoch 7, loss: 0.4389742987221863
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:19<00:00,  2.97it/s]


Dev F1 0.6515543608489844


100%|█████████████████████████████████████████| 653/653 [03:12<00:00,  3.39it/s]


epoch 8, loss: 0.43071828586880867
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:24<00:00,  2.80it/s]


Dev F1 0.6579513537804741


100%|█████████████████████████████████████████| 653/653 [03:30<00:00,  3.11it/s]


epoch 9, loss: 0.4225710630702114
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:32<00:00,  2.55it/s]


Dev F1 0.670804039727709


100%|█████████████████████████████████████████| 653/653 [03:41<00:00,  2.95it/s]


epoch 10, loss: 0.41421483181542496
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:32<00:00,  2.57it/s]


Dev F1 0.6789348710990502


100%|█████████████████████████████████████████| 653/653 [03:50<00:00,  2.84it/s]


epoch 11, loss: 0.4060540354195954
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:34<00:00,  2.50it/s]


Dev F1 0.6801577271816971


100%|█████████████████████████████████████████| 653/653 [03:26<00:00,  3.17it/s]


epoch 12, loss: 0.3983169285791597
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:25<00:00,  2.78it/s]


Dev F1 0.687722139739956


100%|█████████████████████████████████████████| 653/653 [03:25<00:00,  3.17it/s]


epoch 13, loss: 0.3906218952500756
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:30<00:00,  2.63it/s]


Dev F1 0.690360824161588


100%|█████████████████████████████████████████| 653/653 [03:35<00:00,  3.03it/s]


epoch 14, loss: 0.3830114602686047
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:28<00:00,  2.68it/s]


Dev F1 0.690360824161588


100%|█████████████████████████████████████████| 653/653 [03:40<00:00,  2.97it/s]


epoch 15, loss: 0.37530806167464803
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:28<00:00,  2.69it/s]


Dev F1 0.698637449079927


100%|█████████████████████████████████████████| 653/653 [03:16<00:00,  3.32it/s]


epoch 16, loss: 0.3678165830509198
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:16<00:00,  3.10it/s]


Dev F1 0.7066607988495772


100%|█████████████████████████████████████████| 653/653 [03:09<00:00,  3.45it/s]


epoch 17, loss: 0.3604345865658902
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:18<00:00,  3.01it/s]


Dev F1 0.7083893720936905


100%|█████████████████████████████████████████| 653/653 [03:05<00:00,  3.52it/s]


epoch 18, loss: 0.35328173583906836
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:16<00:00,  3.09it/s]


Dev F1 0.7112290649413795


100%|█████████████████████████████████████████| 653/653 [03:02<00:00,  3.58it/s]


epoch 19, loss: 0.3460898664630208
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:15<00:00,  3.12it/s]


Dev F1 0.7133656948758726


100%|█████████████████████████████████████████| 653/653 [03:01<00:00,  3.60it/s]


epoch 20, loss: 0.3389067663946638
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:16<00:00,  3.09it/s]


Dev F1 0.7150419188824465


100%|█████████████████████████████████████████| 653/653 [03:01<00:00,  3.60it/s]


epoch 21, loss: 0.3319924236824351
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:17<00:00,  3.06it/s]


Dev F1 0.715590036098203


100%|█████████████████████████████████████████| 653/653 [03:01<00:00,  3.60it/s]


epoch 22, loss: 0.3250760250335494
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:15<00:00,  3.16it/s]


Dev F1 0.7217583407083064


100%|█████████████████████████████████████████| 653/653 [02:58<00:00,  3.65it/s]


epoch 23, loss: 0.31857785702202396
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:16<00:00,  3.10it/s]


Dev F1 0.7267475836873525


100%|█████████████████████████████████████████| 653/653 [02:57<00:00,  3.67it/s]


epoch 24, loss: 0.3115660981603172
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:15<00:00,  3.13it/s]


Dev F1 0.7283673078929975


100%|█████████████████████████████████████████| 653/653 [03:00<00:00,  3.61it/s]


epoch 25, loss: 0.30519783749654433
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:15<00:00,  3.13it/s]


Dev F1 0.7343512323242123


100%|█████████████████████████████████████████| 653/653 [02:59<00:00,  3.64it/s]


epoch 26, loss: 0.2985752019097039
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:14<00:00,  3.19it/s]


Dev F1 0.7354297693920335


100%|█████████████████████████████████████████| 653/653 [03:02<00:00,  3.58it/s]


epoch 27, loss: 0.2922356881167038
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:12<00:00,  3.27it/s]


Dev F1 0.7423493451619094


100%|█████████████████████████████████████████| 653/653 [02:58<00:00,  3.66it/s]


epoch 28, loss: 0.2854333849806356
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:16<00:00,  3.08it/s]


Dev F1 0.7424849054640039


100%|█████████████████████████████████████████| 653/653 [03:07<00:00,  3.48it/s]


epoch 29, loss: 0.27929163035105165
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:19<00:00,  2.97it/s]


Dev F1 0.7461690006223117


100%|█████████████████████████████████████████| 653/653 [03:11<00:00,  3.42it/s]


epoch 30, loss: 0.273258054485633
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:20<00:00,  2.94it/s]


Dev F1 0.7488515698896847


100%|█████████████████████████████████████████| 653/653 [03:14<00:00,  3.36it/s]


epoch 31, loss: 0.267089158634243
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:20<00:00,  2.96it/s]


Dev F1 0.745678459471563


100%|█████████████████████████████████████████| 653/653 [03:01<00:00,  3.60it/s]


epoch 32, loss: 0.2611766932232685
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:15<00:00,  3.14it/s]


Dev F1 0.7514316766560036


100%|█████████████████████████████████████████| 653/653 [03:02<00:00,  3.58it/s]


epoch 33, loss: 0.2554468457101435
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:16<00:00,  3.11it/s]


Dev F1 0.749423244204077


100%|█████████████████████████████████████████| 653/653 [03:00<00:00,  3.61it/s]


epoch 34, loss: 0.2490699692354017
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:13<00:00,  3.24it/s]


Dev F1 0.7525781810779829


100%|█████████████████████████████████████████| 653/653 [02:51<00:00,  3.80it/s]


epoch 35, loss: 0.24352899896498673
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:12<00:00,  3.29it/s]


Dev F1 0.7579090271014498


100%|█████████████████████████████████████████| 653/653 [03:01<00:00,  3.59it/s]


epoch 36, loss: 0.2372367927764655
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:25<00:00,  2.77it/s]


Dev F1 0.7657246967591795


100%|█████████████████████████████████████████| 653/653 [03:36<00:00,  3.02it/s]


epoch 37, loss: 0.23155208421884615
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:36<00:00,  2.46it/s]


Dev F1 0.7673348363003536


100%|█████████████████████████████████████████| 653/653 [03:58<00:00,  2.73it/s]


epoch 38, loss: 0.225945650036896
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:42<00:00,  2.30it/s]


Dev F1 0.7642509852090691


100%|█████████████████████████████████████████| 653/653 [04:11<00:00,  2.59it/s]


epoch 39, loss: 0.2199943646038761
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:39<00:00,  2.38it/s]


Dev F1 0.7606903718623113


100%|█████████████████████████████████████████| 653/653 [03:44<00:00,  2.91it/s]


epoch 40, loss: 0.2144251423834425
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:29<00:00,  2.65it/s]


Dev F1 0.7641145572180055


100%|█████████████████████████████████████████| 653/653 [03:27<00:00,  3.15it/s]


epoch 41, loss: 0.20910864349067423
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:18<00:00,  3.01it/s]


Dev F1 0.7616024739877385


100%|█████████████████████████████████████████| 653/653 [03:56<00:00,  2.76it/s]


epoch 42, loss: 0.2033314569107154
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:34<00:00,  2.51it/s]


Dev F1 0.7633506867143885


100%|█████████████████████████████████████████| 653/653 [03:19<00:00,  3.28it/s]


epoch 43, loss: 0.19771328383254794
Evaluating dev...


100%|█████████████████████████████████████████| 237/237 [01:16<00:00,  3.11it/s]


Dev F1 0.7642509852090691


100%|█████████████████████████████████████████| 653/653 [03:01<00:00,  3.60it/s]


epoch 44, loss: 0.1924489459936018
Evaluating dev...


 78%|████████████████████████████████▏        | 186/237 [00:51<00:13,  3.76it/s]