In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip3 install conllu
!pip3 install transformers
!pip3 install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting conllu
  Downloading conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 30.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 78.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 66.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 tran

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, RandomSampler, DataLoader

In [4]:
from collections import defaultdict
from datasets import Dataset
from datasets import load_dataset
from typing import List

In [5]:
from transformers import DistilBertTokenizer, DistilBertModel

In [6]:
from tqdm import notebook

In [7]:
from typing import NamedTuple

class DependencyParse(NamedTuple):

    text: str
    tokens: List[str]
    heads: List[str]
    deprel: List[str]

    @classmethod
    def from_huggingface_dict(cls, data_dict):
        return cls(data_dict["text"], data_dict["tokens"], data_dict["head"], data_dict["deprel"])

In [8]:
def get_parses(subset: str = "en_gum", split = 'train') -> List[DependencyParse]:
    dataset = load_dataset("universal_dependencies", subset, split=split)
    parses = [DependencyParse.from_huggingface_dict(data_dict) for data_dict in dataset]
    return parses

In [9]:
dataset_train = get_parses(split = 'train')
dataset_valid = get_parses(split = 'validation')

Downloading builder script:   0%|          | 0.00/87.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.33M [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading and preparing dataset universal_dependencies/en_gum to /root/.cache/huggingface/datasets/universal_dependencies/en_gum/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/199k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/206k [00:00<?, ?B/s]

   

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #2:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split:   0%|          | 0/4287 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/784 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/890 [00:00<?, ? examples/s]

Dataset universal_dependencies downloaded and prepared to /root/.cache/huggingface/datasets/universal_dependencies/en_gum/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7. Subsequent calls will reuse this data.




In [10]:
print(dataset_train[1])
print(dataset_valid[1])

DependencyParse(text='Insights from Eye-Tracking', tokens=['Insights', 'from', 'Eye-Tracking'], heads=['0', '3', '1'], deprel=['root', 'case', 'nmod'])
DependencyParse(text='Research on adult-learned second language (L2) has provided considerable insight into the neurocognitive mechanisms underlying the learning and processing of L2 grammar [1] – [11].', tokens=['Research', 'on', 'adult-learned', 'second', 'language', '(', 'L2', ')', 'has', 'provided', 'considerable', 'insight', 'into', 'the', 'neurocognitive', 'mechanisms', 'underlying', 'the', 'learning', 'and', 'processing', 'of', 'L2', 'grammar', '[', '1', ']', '–', '[', '11', ']', '.'], heads=['10', '5', '5', '5', '1', '7', '5', '7', '10', '0', '12', '10', '16', '16', '16', '12', '16', '19', '17', '21', '19', '24', '24', '19', '26', '10', '26', '30', '30', '26', '30', '10'], deprel=['nsubj', 'case', 'amod', 'amod', 'nmod', 'punct', 'appos', 'punct', 'aux', 'root', 'amod', 'obj', 'case', 'det', 'amod', 'nmod', 'acl', 'det', 'obj', 

In [11]:
def get_labels(dataset):
    rel_pos_labels = []
    dep_labels = []
    for data in dataset:
        rel_pos = []
        dep_labels.append(data.deprel)
        for i,head in enumerate(data.heads):
            if head == '0':
                rel_pos.append(str(0))
            else:
                rel_pos.append(str(int(head)-i-1))
        rel_pos_labels.append(rel_pos)
    rel_pos_vocab = ['unk']+sorted(list(set([rel_p for rel_ps in rel_pos_labels for rel_p in rel_ps])))
    
    dep_label_vocab = ['unk']+sorted(list(set([dep_l for dep_ls in dep_labels for dep_l in dep_ls])))
    
    return (rel_pos_labels, dep_labels), (rel_pos_vocab, dep_label_vocab)

In [12]:
(pos_labels,dep_labels),(pos_vocab,dep_label_vocab) = get_labels(dataset_train)
(val_pos_labels,val_dep_labels),(val_pos_vocab,val_dep_label_vocab) = get_labels(dataset_valid)

In [13]:
dep_label_vocab = {dp: i for i,dp in enumerate(dep_label_vocab)}
dep_id_to_label_vocab = {i: dp for i,dp in enumerate(dep_label_vocab)}
pos_vocab = {pos: i for i,pos in enumerate(pos_vocab)}
pos_id_to_label_vocab = {i: pos for i,pos in enumerate(pos_vocab)}

val_dep_label_vocab = {dp: i for i,dp in enumerate(val_dep_label_vocab)}
val_pos_vocab = {pos: i for i,pos in enumerate(val_pos_vocab)}

In [14]:
dep_labels_n = [[dep_label_vocab[dp] for dp in deps] for deps in dep_labels]
pos_labels_n = [[pos_vocab[pos] for pos in pos_ls] for pos_ls in pos_labels]

val_dep_labels_n = [[val_dep_label_vocab[dp] for dp in deps] for deps in val_dep_labels]
val_pos_labels_n = [[val_pos_vocab[pos] for pos in pos_ls] for pos_ls in val_pos_labels]

In [15]:
n_pos = len(pos_vocab)
n_deps = len(dep_label_vocab)

val_n_pos = len(val_pos_vocab)
val_n_deps = len(val_dep_label_vocab)

In [16]:
train_sentences = [data.text for data in dataset_train]
valid_sentences = [data.text for data in dataset_valid]

In [17]:
max_seq_length = 126

In [18]:
batch_size = 32

In [19]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [20]:
def tokenize_dataset(sentences):
  input_ids = []
  attention_masks = []
  for text in train_sentences:
      encoded_dict = tokenizer.encode_plus(
          text,
          add_special_tokens=True,
          max_length=max_seq_length,
          pad_to_max_length=True,
          return_attention_mask=True,
          return_tensors='pt'
      )
      input_ids.append(encoded_dict['input_ids'])
      attention_masks.append(encoded_dict['attention_mask'])
  return input_ids,attention_masks

In [21]:
train_input_ids,train_attention_masks = tokenize_dataset(train_sentences)
valid_input_ids,valid_attention_masks = tokenize_dataset(valid_sentences)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [22]:
# labels:   0 1 2 0 3 4 0 0 0 
# tokens: cls 1 2 * 4 5 * * sep
def filter_subwords(input_ids):
  pos_labels_updated = []
  dep_labels_updated = []

  for j,sample in enumerate(input_ids):  
    remove_next = False
    k = 0
    p_labs = []
    d_labs = []
    for batch in sample:
      for i,ind in enumerate(batch):
        remove = False
        if remove_next:
          remove = True
          remove_next = False
        token = tokenizer.convert_ids_to_tokens(ind.item())
        if token == '-':
          remove=True
          remove_next = True
        elif token.startswith('##') or token == '[CLS]' or token == '[SEP]':
          remove = True
        elif token == '[PAD]':
          break
        if remove:
          p_labs.append(0)
          d_labs.append(0)
        else:
          if k >= len(pos_labels_n[j]):
            break
          p_labs.append(pos_labels_n[j][k])
          d_labs.append(dep_labels_n[j][k])
          k+=1
    while k<len(pos_labels_n[j]):
      p_labs.append(pos_labels_n[j][k])
      d_labs.append(dep_labels_n[j][k])
      k+=1
    pos_labels_updated.append(p_labs)
    dep_labels_updated.append(d_labs)
  return pos_labels_updated,dep_labels_updated

In [23]:
pos_labels_updated,dep_labels_updated = filter_subwords(train_input_ids)
val_pos_labels_updated,val_dep_labels_updated = filter_subwords(valid_input_ids)

In [43]:
sample_out = [data.text for data in dataset_train[:10]]
sample_pos = pos_labels[:10]
sample_dep = dep_labels[:10]

In [46]:
for i in range(10):
  print('{}\t{}\t{}'.format(sample_out[i],str(sample_pos[i]),str(sample_dep[i])))

Aesthetic Appreciation and Spanish Art:	['1', '0', '2', '1', '-3', '-4']	['amod', 'root', 'cc', 'amod', 'conj', 'punct']
Insights from Eye-Tracking	['0', '1', '-2']	['root', 'case', 'nmod']
Claire Bailey-Ross claire.bailey-ross@port.ac.uk University of Portsmouth, United Kingdom	['0', '-1', '-2', '-3', '1', '-2', '2', '1', '-8']	['root', 'flat', 'list', 'list', 'case', 'nmod', 'punct', 'amod', 'list']
Andrew Beresford a.m.beresford@durham.ac.uk Durham University, United Kingdom	['0', '-1', '-2', '1', '-4', '2', '1', '-7']	['root', 'flat', 'list', 'compound', 'list', 'punct', 'amod', 'list']
Daniel Smith daniel.smith2@durham.ac.uk Durham University, United Kingdom	['0', '-1', '-2', '1', '-4', '2', '1', '-7']	['root', 'flat', 'list', 'compound', 'list', 'punct', 'amod', 'list']
Claire Warwick c.l.h.warwick@durham.ac.uk Durham University, United Kingdom	['0', '-1', '-2', '1', '-4', '2', '1', '-7']	['root', 'flat', 'list', 'compound', 'list', 'punct', 'amod', 'list']
How do people look at 

In [24]:
print(pos_labels_n[75])          #before filtering subwords
print(pos_labels_updated[75])    #after filtering subwords
print(dataset_train[75].tokens)  #original tokens
print([tokenizer.convert_ids_to_tokens(ind.item()) for ind in train_input_ids[75][0]])   #bert tokenized tokens

[92, 81, 84, 1, 92, 81, 119, 81, 92, 81, 34, 1, 92, 81, 80, 1, 84, 83, 81, 120, 92, 81, 23, 81, 12, 92, 81, 45, 6, 81, 12, 1, 1, 81, 23, 92, 81, 65, 71, 112, 1, 81, 81, 7, 81, 12, 1, 81, 45, 1, 92, 81, 34, 1, 81, 4, 92, 81, 23, 1, 81, 10, 81, 12, 1, 92, 81, 56, 81, 12, 92, 81, 22, 81, 12, 1, 81, 45, 81, 12, 1, 81, 2, 81, 12, 1, 81, 7, 1, 81, 23, 1, 81, 14, 103, 92, 81, 34, 75]
[0, 92, 81, 84, 1, 92, 81, 0, 0, 119, 81, 92, 81, 0, 0, 0, 34, 1, 92, 81, 80, 1, 84, 83, 81, 120, 92, 81, 23, 81, 12, 0, 0, 0, 92, 81, 45, 6, 81, 12, 1, 1, 81, 23, 92, 81, 65, 71, 112, 1, 81, 81, 7, 0, 81, 12, 1, 81, 45, 0, 0, 1, 0, 0, 92, 81, 34, 1, 81, 4, 0, 92, 81, 23, 1, 81, 10, 0, 81, 12, 1, 92, 81, 56, 81, 12, 0, 0, 0, 92, 81, 22, 0, 0, 81, 12, 1, 81, 45, 0, 0, 81, 12, 1, 81, 2, 0, 81, 12, 1, 81, 7, 1, 81, 23, 1, 81, 14, 0, 0, 103, 92, 81, 34, 75, 0]
['In', 'Spanish', 'poetry', ',', 'the', 'syntactic', 'configurations', 'under', 'which', 'enjambment', 'takes', 'place', 'have', 'been', 'described', 'extensiv

In [25]:
padded_pos_labels = [t+[pos_vocab['unk']]*(max_seq_length - len(t)) for t in pos_labels_updated]
padded_dep_labels = [t+[dep_label_vocab['unk']]*(max_seq_length - len(t)) for t in dep_labels_updated]

val_padded_pos_labels = [t+[val_pos_vocab['unk']]*(max_seq_length - len(t)) for t in val_pos_labels_updated]
val_padded_dep_labels = [t+[val_dep_label_vocab['unk']]*(max_seq_length - len(t)) for t in val_dep_labels_updated]

In [26]:
train_inputs = torch.cat(train_input_ids, dim=0)
train_masks = torch.cat(train_attention_masks, dim=0)

val_inputs = torch.cat(valid_input_ids, dim=0)
val_masks = torch.cat(valid_attention_masks, dim=0)

In [27]:
train_pos_labels = torch.tensor(padded_pos_labels)
train_dep_labels = torch.tensor(padded_dep_labels)

val_pos_labels = torch.tensor(val_padded_pos_labels)
val_dep_labels = torch.tensor(val_padded_dep_labels)

In [28]:
train_data = TensorDataset(train_inputs, train_masks, train_pos_labels, train_dep_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [29]:
valid_data = TensorDataset(val_inputs, val_masks, val_pos_labels, val_dep_labels)
valid_sampler = RandomSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

In [30]:
def freeze_model(model):
    for param in model.parameters():
        param.requires_grad = False
        
def unfreeze_model(model):
    for param in model.parameters():
        param.requires_grad = True

In [31]:
class BERTParser(nn.Module):
    def __init__(self, bert, hidden_size, num_pos_labels, num_dep_labels):
        super().__init__()
        self.bert = bert
        self.pos_predictor = nn.Linear(hidden_size,num_pos_labels)
        self.dep_predictor = nn.Linear(hidden_size, num_dep_labels)
        
    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids,attention_mask=attention_mask)[0]
        pos_logits = self.pos_predictor(out)
        dep_logits = self.dep_predictor(out)
        return pos_logits, dep_logits

In [32]:
bert = DistilBertModel.from_pretrained('distilbert-base-uncased',output_attentions=True)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
hidden_size = bert.config.hidden_size
learning_rate = 1e-4
num_epochs = 3
max_norm = 1.0
lmbdas={.25, .5, .75}

In [34]:
device = 'cuda' if torch.cuda.device_count() > 0 else 'cpu'

In [35]:
def get_correct(predicted,golden):
    correct = 0
    n_non_zero = 0
    for i,sample in enumerate(golden):
      for j,label in enumerate(sample):
        if label != 0:
          n_non_zero += 1
          if label == predicted[i][j]:
            correct+=1
    return correct,n_non_zero

In [36]:
for lmbda in lmbdas:
  torch.manual_seed(0)

  fine_tune_model = BERTParser(bert, hidden_size, n_pos, n_deps)
  fine_tune_model.to(device)
  # unfreeze
  unfreeze_model(fine_tune_model)
  # set optimizer and criterion
  params_to_update = []
  for name,param in fine_tune_model.named_parameters():
      if param.requires_grad == True:
          params_to_update.append(param)

  optimizer = optim.Adam(params_to_update, lr=learning_rate, eps=1e-08)
  p_loss = nn.CrossEntropyLoss(ignore_index=0).to(device)
  d_loss = nn.CrossEntropyLoss(ignore_index=0).to(device)

  fine_tune_model.zero_grad()

  for i in notebook.tqdm(range(num_epochs), desc="Epoch"):
      epoch_iterator = notebook.tqdm(train_dataloader, desc="Iteration")
      n_batches = 0
      tr_loss = 0.0
      correct_pos = 0
      correct_dep = 0
      n_pos_non_zero = 0
      n_dep_non_zero = 0
      
      for step, batch in enumerate(epoch_iterator):
          b_input_ids = batch[0].to(device)
          b_input_mask = batch[1].to(device)
          b_pos_labels = batch[2].to(device)
          b_dep_labels = batch[3].to(device)

          fine_tune_model.train()
          
          pos_logits, dep_logits = fine_tune_model(b_input_ids, b_input_mask)

          pos_logits_ = pos_logits.view( -1, pos_logits.size(-1) )
          dep_logits_ = dep_logits.view(-1,dep_logits.size(-1))

          pos_loss = p_loss(pos_logits_, b_pos_labels.view(-1))
          dep_loss = d_loss(dep_logits_, b_dep_labels.view(-1))
          
          correct,non_z = get_correct(torch.argmax(pos_logits, dim=2),b_pos_labels)
          correct_pos+=correct
          n_pos_non_zero+=non_z

          correct,non_z = get_correct(torch.argmax(dep_logits, dim=2),b_dep_labels)
          correct_dep+=correct
          n_dep_non_zero+=non_z
          

          loss = lmbda * pos_loss + (1-lmbda)*dep_loss
          loss.backward()

          nn.utils.clip_grad_norm_(fine_tune_model.parameters(), 1.0)

          tr_loss += loss.item()
          n_batches += 1
          optimizer.step()
          fine_tune_model.zero_grad()
          
      accuracy_pos = correct_pos / n_pos_non_zero
      accuracy_dep = correct_dep / n_dep_non_zero
      print("Rel Pos Accuracy = {}".format(accuracy_pos))
      print("Dep Accuracy = {}".format(accuracy_dep))
      print('train loss for lambda = {} after {} epoch = {:.4f}'.format(lmbda, i+1, (tr_loss/n_batches)))
  PATH = 'drive/MyDrive/Colab files nyu/bert-parser-'+str(lmbda)+'.pt'
  torch.save(fine_tune_model, PATH)

  # Eval!
  eval_loss = 0.0
  nb_eval_steps = 0
  preds = []
  all_labels = []

  correct_pos = 0
  correct_dep = 0
  n_pos_non_zero = 0
  n_dep_non_zero = 0

  for batch in notebook.tqdm(valid_dataloader, desc="Evaluating"):
      fine_tune_model.eval()
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_pos_labels = batch[2].to(device)
      b_dep_labels = batch[3].to(device)

      with torch.no_grad():
          pos_logits, dep_logits = fine_tune_model(b_input_ids, b_input_mask)

          pos_logits_ = pos_logits.view( -1, pos_logits.size(-1) )
          dep_logits_ = dep_logits.view(-1,dep_logits.size(-1))

          pos_loss = p_loss(pos_logits_, b_pos_labels.view(-1))
          dep_loss = d_loss(dep_logits_, b_dep_labels.view(-1))

          loss = lmbda * pos_loss + (1-lmbda) * dep_loss

      correct,non_z = get_correct(torch.argmax(pos_logits, dim=2),b_pos_labels)
      correct_pos+=correct
      n_pos_non_zero+=non_z

      correct,non_z = get_correct(torch.argmax(dep_logits, dim=2),b_dep_labels)
      correct_dep += correct
      n_dep_non_zero += non_z

      eval_loss += loss.item()

      nb_eval_steps += 1

  eval_loss = eval_loss / nb_eval_steps
  accuracy_pos = correct_pos / n_pos_non_zero
  accuracy_dep = correct_dep / n_dep_non_zero
  print("Rel Pos Accuracy = {}".format(accuracy_pos))
  print("Dep Accuracy = {}".format(accuracy_dep))
  print('Validation loss for lambda value {} = {:.4f}'.format(lmbda,eval_loss))

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/134 [00:00<?, ?it/s]

Rel Pos Accuracy = 0.42733413957806526
Dep Accuracy = 0.6531070961752239
train loss for lambda = 0.25 after 1 epoch = 1.6877


Iteration:   0%|          | 0/134 [00:00<?, ?it/s]

Rel Pos Accuracy = 0.6018006132346294
Dep Accuracy = 0.8444802775436411
train loss for lambda = 0.25 after 2 epoch = 0.8413


Iteration:   0%|          | 0/134 [00:00<?, ?it/s]

Rel Pos Accuracy = 0.665945932739644
Dep Accuracy = 0.8861484711889667
train loss for lambda = 0.25 after 3 epoch = 0.6271


Evaluating:   0%|          | 0/134 [00:00<?, ?it/s]

Rel Pos Accuracy = 0.7134410769475086
Dep Accuracy = 0.9221363042230122
Validation loss for lambda value 0.25 = 0.4652


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/134 [00:00<?, ?it/s]

Rel Pos Accuracy = 0.7156032787285765
Dep Accuracy = 0.907562819902029
train loss for lambda = 0.5 after 1 epoch = 0.7530


Iteration:   0%|          | 0/134 [00:00<?, ?it/s]

Rel Pos Accuracy = 0.7666165817666533
Dep Accuracy = 0.9259232112971989
train loss for lambda = 0.5 after 2 epoch = 0.5506


Iteration:   0%|          | 0/134 [00:00<?, ?it/s]

Rel Pos Accuracy = 0.7948473632132518
Dep Accuracy = 0.9374671699588326
train loss for lambda = 0.5 after 3 epoch = 0.4646


Evaluating:   0%|          | 0/134 [00:00<?, ?it/s]

Rel Pos Accuracy = 0.8350985206630752
Dep Accuracy = 0.9512344095478922
Validation loss for lambda value 0.5 = 0.3657


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/134 [00:00<?, ?it/s]

Rel Pos Accuracy = 0.8162739277555857
Dep Accuracy = 0.9430864514237549
train loss for lambda = 0.75 after 1 epoch = 0.5764


Iteration:   0%|          | 0/134 [00:00<?, ?it/s]

Rel Pos Accuracy = 0.8435518745190017
Dep Accuracy = 0.9485102796203321
train loss for lambda = 0.75 after 2 epoch = 0.4454


Iteration:   0%|          | 0/134 [00:00<?, ?it/s]

Rel Pos Accuracy = 0.8595179633769439
Dep Accuracy = 0.9531889422313434
train loss for lambda = 0.75 after 3 epoch = 0.3892


Evaluating:   0%|          | 0/134 [00:00<?, ?it/s]

Rel Pos Accuracy = 0.8894833925801053
Dep Accuracy = 0.9570613601104311
Validation loss for lambda value 0.75 = 0.3133
