# Neural Dependency Parsing




Code Written by:


*   **Riccardo Rosalen** | **Student ID: 2055530** | **E-Mail Address: riccardo.rosalen@studenti.unipd.it**
*   **Federico Violin** | **Student ID: 2061746** | **E-Mail Address: federico.violin.1@studenti.unipd.it**



#Installing Necessary Libraries




*   **datasets**-->huggingface library with dataset
*   **conllu**-->aux library for processing CoNLL-U format
*   **transformers**-->library containing pre-trained BERT model


In [None]:
!pip install datasets  # huggingface library with dataset
!pip install conllu    # aux library for processing CoNLL-U format
!pip install transformers #library containing pre-trained BERT model

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Collec

In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from functools import partial
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, BertModel

# ArcEager Parser

In the task of dependency parsing we can use different strategies to produce the final dependency tree.

The ArcEager parser differs from the ArcStandard Parser by 2 substantial concepts:
* the Reduce operator;
* the fact that leftArc and rightArc operations are made between the first element of the buffer and the last element of the stack, instead of being between the last two elements of the stack.

So the operators for ArcEager parser are 4:
* **Left Arc**:(for any dependency label l) adds the arc (b, l, s) to A, where s is
the node on top of the stack and b is the first node in the buffer, and pops the stack. It has
as a precondition that the token s is not the artificial root node 0 and does not already
have a head.
* **Right Arc**:(for any dependency label l) adds the arc (s, l, b) to A, where s is
the node on top of the stack and b is the first node in the buffer, and pushes the node b
onto the stack.
* **Shift**: removes the first node in the buffer and pushes it onto the stack.
* **Reduce**: pops the stack; it is subject to the precondition that the top
token already has a head.

In [None]:
class ArcEager:
  """Unlabeled arc-eager transition system.

  Words are identified by their position in ``sentence``. The first position
  is moved to the stack at construction time (the callers in this notebook put
  the artificial ``<ROOT>`` token there). ``arcs[i]`` is the head assigned to
  word ``i`` so far, or -1 while no head has been assigned.

  The transition methods do not verify their own preconditions (except
  ``reduce``, which never empties the stack): callers must only request moves
  that are legal in the current configuration.
  """

  def __init__(self, sentence):
    #Sentence to compute dependencies for
    self.sentence = sentence

    #Buffer containing the positions of the words still to be processed
    self.buffer = list(range(len(self.sentence)))

    #The stack starts empty
    self.stack = []

    #Head assigned to each word, -1 meaning "no head yet"
    self.arcs = [-1] * len(self.sentence)

    #Move the root to the stack. This is done for every non-empty sentence, so
    #that a sentence made of the root alone starts in its final configuration
    #instead of a configuration no transition can ever complete.
    if self.buffer:
      self.shift()

  #LeftArc:last element of the stack depends on the first element in the buffer
  #Remove from the stack the last element (pop)
  def left_arc(self):
    dependent = self.stack.pop()
    self.arcs[dependent] = self.buffer[0]

  #RightArc:the first element in the buffer depends on last element of the stack
  #Append to the stack the first element of the buffer (shift)
  def right_arc(self):
    self.arcs[self.buffer[0]] = self.stack[-1]
    self.shift()

  #Shift:move the first element in the buffer to the last position of the stack
  def shift(self):
    self.stack.append(self.buffer.pop(0))

  #Reduce:remove (pop) the last element of the stack.
  #The bottom element is never removed, so the call is a no-op on a stack of size 1
  def reduce(self):
    if len(self.stack) > 1:
      self.stack.pop()

  #Check the ending conditions for the ArcEager parsing:
  #only one element left on the stack and nothing left in the buffer
  def is_tree_final(self):
    return len(self.stack) == 1 and len(self.buffer) == 0

  ####NOT USED
  #Prints the configuration of the sentence at any time is requested
  def print_configuration(self):
    s = [self.sentence[i] for i in self.stack]
    b = [self.sentence[i] for i in self.buffer]
    print(s, b)
    print(self.arcs)


#ArcEager's Oracle

The oracle is a component used to guide the training of the parser using ground truth. Basically it implements functions that check if the main operators can be applied.

In [None]:
class Oracle_ArcEager:
  """Oracle that tells which arc-eager move agrees with a gold tree.

  ``parser`` must expose ``stack``, ``buffer`` and ``arcs``; ``gold_tree[i]``
  is the gold head of word ``i``. Each ``is_*_gold`` method inspects the
  current parser configuration without modifying it.
  """

  #Initialize the Oracle passing a parser instance and the gold tree
  def __init__(self, parser, gold_tree):
    self.parser = parser
    self.gold = gold_tree

  #Check if the left_arc operator can be applied
  def is_left_arc_gold(self):
    pending = self.parser.buffer

    #The head of the arc would be the first buffer element: without it there
    #is nothing to attach the stack top to
    if not pending:
      return False

    head = pending[0]
    dependent = self.parser.stack[-1]

    #The gold tree must contain the arc, and the candidate head must not have
    #already received its own gold head
    return (self.gold[dependent] == head
            and self.parser.arcs[head] != self.gold[head]
            and head != -1)

  #Check if the right_arc operator can be applied
  def is_right_arc_gold(self):
    #If buffer is empty we can't do any association
    if not self.parser.buffer:
      return False

    head = self.parser.stack[-1]
    dependent = self.parser.buffer[0]

    #The move is correct when the gold tree contains the arc head -> dependent
    return self.gold[dependent] == head

  #Check if the reduce operator can be applied
  def is_reduce_gold(self):
    stack = self.parser.stack
    pending = self.parser.buffer

    #Nothing left to attach: the only thing left to do is emptying the stack
    if not pending and stack[-1] != -1:
      return True

    #An arc between stack top and buffer front has priority over reduce
    if self.is_left_arc_gold() or self.is_right_arc_gold():
      return False

    #Reduce when the buffer front is linked, in either direction, to an
    #element lying below the stack top
    return any(
      self.gold[pending[0]] == lower or self.gold[lower] == pending[0]
      for lower in stack[:-1]
    )

  #Check if shift operator can be applied
  def is_shift_gold(self):
    #Shift is the move to use exactly when no other operator applies
    return not (self.is_left_arc_gold() or self.is_right_arc_gold() or self.is_reduce_gold())


In [None]:
#This function checks if a dependency tree is projective, i.e. if none of its
#arcs crosses another arc
def is_projective(tree):
  """Return True when no two arcs of ``tree`` cross each other.

  ``tree[i]`` is the head of word ``i``; entries equal to -1 (words without a
  head) do not define an arc of their own.
  """
  size = len(tree)
  for dependent, head in enumerate(tree):
    if head == -1:
      continue
    lo, hi = min(dependent, head), max(dependent, head)

    #A word on the left of the arc must not have its head strictly inside it
    if any(lo < tree[k] < hi for k in range(lo)):
      return False
    #A word covered by the arc must have its head within the arc, ends included
    if any(not lo <= tree[k] <= hi for k in range(lo + 1, hi)):
      return False
    #A word on the right of the arc must not have its head strictly inside it
    if any(lo < tree[k] < hi for k in range(hi + 1, size)):
      return False

  return True

def create_dict(dataset, threshold=3):
  """Build the word -> index vocabulary used for the embeddings.

  Every item of ``dataset`` must provide its words under the 'tokens' key.
  Indices 0, 1 and 2 are given to "<pad>", "<ROOT>" and "<unk>"; words seen at
  least ``threshold`` times get the following indices, in order of first
  appearance.
  """
  # number of occurrences of each word
  occurrences = {}
  for sample in dataset:
    for token in sample['tokens']:
      occurrences[token] = occurrences.get(token, 0) + 1

  # special entries first; "<unk>" stands for the words left out of the list
  vocabulary = {"<pad>": 0, "<ROOT>": 1, "<unk>": 2}

  frequent = [token for token, count in occurrences.items() if count >= threshold]
  vocabulary.update(zip(frequent, range(3, 3 + len(frequent))))

  return vocabulary

##Dataset
In this section, we import a dataset from HuggingFace's Universal Dependencies repository.
Most of the datasets present a big issue: for some sentences, the "head" column contains the value "None" among the words' head indices. To overcome this issue we implemented the *remove_none_ids* function, which discards those sentences.

In [None]:
def remove_none_ids(dataset):
    """Return the sentences of ``dataset`` whose 'head' list has no "None" entry.

    The comparison is made against the string "None". The result is a plain
    list holding the retained items in their original order.
    """
    return [
        sentence
        for sentence in dataset
        if all(head != "None" for head in sentence['head'])
    ]

In [None]:
#Load the three splits of the es_ancora treebank. Many datasets contain "None"
#among the heads: the sentences where it is present are deleted as soon as
#each split is loaded
train_dataset, val_dataset, test_dataset = (
    remove_none_ids(load_dataset('universal_dependencies', 'es_ancora', split=split_name))
    for split_name in ("train", "validation", "test")
)

# remove non-projective sentences (training split only): heads in the gold tree
# are strings, we convert them to int and prepend -1 as the head of the root
train_dataset = [
    sample
    for sample in train_dataset
    if is_projective([-1] + [int(head) for head in sample["head"]])
]

Downloading and preparing dataset universal_dependencies/es_ancora to /root/.cache/huggingface/datasets/universal_dependencies/es_ancora/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/5.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/604k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/608k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/14305 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1654 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1721 [00:00<?, ? examples/s]

Dataset universal_dependencies downloaded and prepared to /root/.cache/huggingface/datasets/universal_dependencies/es_ancora/2.7.0/1ac001f0e8a0021f19388e810c94599f3ac13cc45d6b5b8c69f7847b2188bdf7. Subsequent calls will reuse this data.




In [None]:
print("Number of samples:")
#(train is the number of samples without the non-projective)
for label, split_data in (("Train:\t", train_dataset), ("Dev:\t", val_dataset), ("Test:\t", test_dataset)):
  print(label, len(split_data))

Number of samples:
Train:	 12321
Dev:	 1654
Test:	 1721


## Create training data and iterable dataloaders



Recall that to run the arc-eager parser we need a **classifier** that looks at some of the content of the current parser configuration and selects an appropriate action.  In order to train the classifier, we need to convert the gold trees in our treebank into several pairs of the form configuration/gold action.  This is what we do in this section.

First of all, we need to preprocess the training set. We remove non-projective trees.  We also create a dictionary of word/index pairs, to be used later when creating word embeddings.  Words that have less than three occurrences are not encoded and will later be mapped to special token `<unk>`.

In [None]:
# create the dictionary (word -> embedding index) from the training split only;
# create_dict is called with its default frequency threshold, so words that are
# too rare in the training data get no entry and are later encoded as "<unk>"
emb_dictionary = create_dict(train_dataset)

The next function is used to process our data and create the actual training samples.

For each sentence in the dataset, we use our oracle to compute the canonical action sequence leading to the gold tree.  We then pair configurations and canonical actions.  Since our neural classifier will look only into $\sigma_1$, $\sigma_2$ and $\beta_1$, we do not have to record the full parser configuration.

In [None]:
def process_sample(sample, get_gold_path = False):
  """Encode one sentence and, optionally, compute its oracle transition sequence.

  Args:
    sample: dataset item with the words under "tokens" and the gold heads
      (stored as strings) under "head".
    get_gold_path: when True (training), run the arc-eager oracle and record
      one configuration/move pair per parsing step.

  Returns:
    A tuple ``(enc_sentence, gold_path, gold_moves, gold)``:
      * enc_sentence: embedding indices of "<ROOT>" followed by the words;
      * gold_path: one ``[s2, s1, b1]`` triple of word positions per step
        (second and first element from the stack top, first buffer element,
        -1 when the buffer is empty); empty unless ``get_gold_path``;
      * gold_moves: oracle move of each step, parallel to ``gold_path``:
        0 = left-arc, 1 = right-arc, 2 = reduce, 3 = shift;
      * gold: gold head of every position, with -1 for the root.
  """
  sentence = ["<ROOT>"] + sample["tokens"]
  gold = [-1] + [int(i) for i in sample["head"] if i != 'None']  #heads in the gold tree are strings, we convert them to int

  # embedding ids of sentence words; words without an entry map to "<unk>"
  unk_id = emb_dictionary["<unk>"]
  enc_sentence = [emb_dictionary.get(word, unk_id) for word in sentence]

  # gold_path and gold_moves are parallel arrays whose elements refer to parsing steps
  gold_path = []   # record two topmost stack tokens and first buffer token for current step
  gold_moves = []  # oracle (canonical) move for current step: 0 left-arc, 1 right-arc, 2 reduce, 3 shift

  if get_gold_path:  # only for training
    parser = ArcEager(sentence)
    oracle = Oracle_ArcEager(parser, gold)

    while not parser.is_tree_final():

      # save configuration. With a single element on the stack the index
      # len(stack)-2 evaluates to -1, so that element is recorded twice; the
      # networks build their inference-time configurations in the same way
      configuration = [parser.stack[len(parser.stack)-2], parser.stack[len(parser.stack)-1]]
      configuration.append(parser.buffer[0] if parser.buffer else -1)
      gold_path.append(configuration)

      # save gold move
      if oracle.is_left_arc_gold():
        parser.left_arc()
        gold_moves.append(0)
      elif oracle.is_right_arc_gold():
        parser.right_arc()
        gold_moves.append(1)
      elif oracle.is_reduce_gold():
        parser.reduce()
        gold_moves.append(2)
      else:
        # the oracle defines shift as "none of the other moves applies", which
        # is exactly this branch: no need to evaluate the three checks again
        parser.shift()
        gold_moves.append(3)

  return enc_sentence, gold_path, gold_moves, gold

The next function is used to batch the data.

In [None]:
def prepare_batch(batch_data, get_gold_path=False):
  """Collate function: process every sample and regroup the results by field.

  Returns four parallel lists (sentences, paths, moves, trees), each holding
  one element per sentence of the batch.
  """
  processed = [process_sample(sample, get_gold_path=get_gold_path) for sample in batch_data]
  # process_sample yields a 4-field tuple; gather field k of every sentence
  sentences, paths, moves, trees = ([item[field] for item in processed] for field in range(4))
  return sentences, paths, moves, trees

Finally, we create dataloaders for train, development and test sets.

In [None]:
BATCH_SIZE = 32

# Training batches are shuffled and carry the oracle configurations/moves
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=partial(prepare_batch, get_gold_path=True),
)

# Validation and test batches keep the dataset order and skip the oracle
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=partial(prepare_batch),
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=partial(prepare_batch),
)

## Create neural network model

The main differences between the training program presented below and  Kiperwasser and Goldberg, 2016 are as follows:

* original model uses PoS_tags
* original model also considers third top-most element of the stack
* original model uses hinge loss and dynamic oracle / training

We are now ready to train our parser on the dataset.  We start with the definition of some parameters.


In [None]:
# Hyperparameters read by BiLSTMNet when it is instantiated
EMBEDDING_SIZE = 100  # dimension of the word embeddings
LSTM_SIZE = 200       # hidden size of the LSTM (the network is bidirectional)
LSTM_LAYERS = 1       # number of stacked LSTM layers
MLP_SIZE = 200        # hidden size of the feedforward classifier
DROPOUT = 0.2         # dropout probability

Next, we create our model. It consists of a Bi-LSTM to represent words together with their contexts and a feedforward network to predict the next move of the parser.

In [None]:
class BiLSTMNet(nn.Module):
  """Bi-LSTM encoder followed by a feedforward classifier over parser moves.

  Each parser configuration is described by three word positions (two from the
  stack, one from the buffer). The Bi-LSTM outputs at those positions are
  concatenated and scored against the four arc-eager moves:
  0 = left-arc, 1 = right-arc, 2 = reduce, 3 = shift.

  ``forward`` returns unnormalized scores (logits), which is what
  ``nn.CrossEntropyLoss`` expects; ``mlp`` returns probabilities.
  """

  def __init__(self, device):
    super().__init__()
    self.device = device
    self.embeddings = nn.Embedding(len(emb_dictionary), EMBEDDING_SIZE, padding_idx=emb_dictionary["<pad>"])

    # initialize bi-LSTM. nn.LSTM applies its dropout only between stacked
    # layers: with a single layer it has no effect and only triggers a warning,
    # so it is passed only when there is more than one layer
    self.lstm = nn.LSTM(EMBEDDING_SIZE, LSTM_SIZE, num_layers=LSTM_LAYERS, bidirectional=True,
                        dropout=DROPOUT if LSTM_LAYERS > 1 else 0.0)

    # initialize feedforward: three positions, each with forward+backward states
    self.w1 = torch.nn.Linear(6*LSTM_SIZE, MLP_SIZE, bias=True)
    self.activation = torch.nn.Tanh()
    #Here we put output length equal to 4 because it has 4 outputs: left-arc,
    #right-arc, reduce or shift
    self.w2 = torch.nn.Linear(MLP_SIZE, 4, bias=True)
    self.softmax = torch.nn.Softmax(dim=-1)

    self.dropout = torch.nn.Dropout(DROPOUT)

  def forward(self, x, paths):
    """Score every recorded configuration of every sentence of the batch.

    Args:
      x: list of sentences, each a list of embedding indices.
      paths: for each sentence, the list of configurations to score.

    Returns:
      Tensor with one row of 4 logits per configuration, sentence by sentence.
    """
    # get the embeddings
    x = [self.dropout(self.embeddings(torch.tensor(i, device=self.device))) for i in x]

    # run the bi-lstm
    h = self.lstm_pass(x)

    # for each parser configuration that we need to score we arrange from the
    # output of the bi-lstm the correct input for the feedforward
    mlp_input = self.get_mlp_input(paths, h)

    # logits, not probabilities: the cross-entropy loss normalizes on its own,
    # feeding it softmax outputs would squash the scores and cap the training
    return self.mlp_logits(mlp_input)

  def lstm_pass(self, x):
    """Run the Bi-LSTM on a list of variable-length embedded sentences."""
    x = torch.nn.utils.rnn.pack_sequence(x, enforce_sorted=False)
    h, (h_0, c_0) = self.lstm(x)
    h, h_sizes = torch.nn.utils.rnn.pad_packed_sequence(h) # size h: (length_sentences, batch, output_hidden_units)
    return h

  def get_mlp_input(self, configurations, h):
    """Concatenate the encoder states of the positions named by each configuration.

    ``h`` is indexed as ``h[position][sentence]``; a position equal to -1 is
    replaced by a zero vector.
    """
    mlp_input = []
    zero_tensor = torch.zeros(2*LSTM_SIZE, requires_grad=False).to(self.device)
    for i, sentence_configurations in enumerate(configurations): # for every sentence in the batch
      for conf in sentence_configurations: # for each configuration of a sentence
        mlp_input.append(torch.cat([zero_tensor if k == -1 else h[k][i] for k in conf[:3]]))
    return torch.stack(mlp_input).to(self.device)

  def mlp_logits(self, x):
    """Feedforward classifier; returns the unnormalized score of each move."""
    return self.w2(self.dropout(self.activation(self.w1(self.dropout(x)))))

  def mlp(self, x):
    """Feedforward classifier; returns the probability of each move."""
    return self.softmax(self.mlp_logits(x))

  # we use this function at inference time. We run the parser and at each step
  # we pick as next move the one with the highest score assigned by the model
  def infere(self, x):
    """Parse a batch of encoded sentences; returns the predicted heads of each."""
    parsers = [ArcEager(i) for i in x]

    x = [self.embeddings(torch.tensor(i, device=self.device)) for i in x]

    h = self.lstm_pass(x)

    while not self.parsed_all(parsers):
      # get the current configuration and score next moves
      configurations = self.get_configurations(parsers)
      mlp_input = self.get_mlp_input(configurations, h)
      mlp_out = self.mlp(mlp_input)
      # take the next parsing step
      self.parse_step(parsers, mlp_out)

    # return the predicted dependency tree
    return [parser.arcs for parser in parsers]

  def get_configurations(self, parsers):
    """Return, for each parser, a one-element list with its current configuration.

    Finished parsers get the placeholder ``[-1, -1, -1]`` so that the result
    stays parallel to ``parsers``.
    """
    configurations = []

    for parser in parsers:
      if parser.is_tree_final():
        conf = [-1, -1, -1]
      else:
        # with a single element on the stack, len(stack)-2 evaluates to -1 and
        # that element fills both stack slots, as in the training configurations
        conf = [parser.stack[len(parser.stack)-2], parser.stack[len(parser.stack)-1]]
        conf.append(parser.buffer[0] if len(parser.buffer) > 0 else -1)
      configurations.append([conf])

    return configurations

  def parsed_all(self, parsers):
    """True when every parser has reached its final configuration."""
    return all(parser.is_tree_final() for parser in parsers)

  @staticmethod
  def _right_arc_or_shift(parser, scores):
    """Apply the better scored move between right-arc (1) and shift (3).

    Used when the root is on top of the stack: left-arc is not allowed there
    and reduce would leave the configuration unchanged.
    """
    if scores[1] > scores[3]:
      parser.right_arc()
    else:
      parser.shift()

  # In this function we select and perform the next move according to the scores obtained.
  # We need to be careful and select correct moves, e.g. don't do a shift if the buffer
  # is empty or a left arc if the stack top is the ROOT. For clarity sake we didn't implement
  # these checks in the parser so we must do them here.
  def parse_step(self, parsers, moves):
    """Advance every unfinished parser by one move.

    Args:
      parsers: the parsers of the batch.
      moves: tensor with one row of 4 scores per parser (same order).

    The best scored move is applied when the configuration allows it;
    otherwise a fallback is chosen among the allowed moves using the scores of
    that same parser. Every branch either pops the stack or consumes the
    buffer, so repeated calls always terminate.
    """
    best_moves = moves.argmax(-1).tolist()
    for i, parser in enumerate(parsers):
      if parser.is_tree_final():
        continue

      scores = moves[i]
      top_is_root = parser.stack[len(parser.stack)-1] == 0

      if len(parser.buffer) == 0:
        # nothing left to attach: the stack can only be emptied
        parser.reduce()
      elif best_moves[i] == 0:
        # the root cannot be the dependent of a left arc
        if top_is_root:
          self._right_arc_or_shift(parser, scores)
        else:
          parser.left_arc()
      elif best_moves[i] == 1:
        parser.right_arc()
      elif best_moves[i] == 2:
        # the root is never removed from the stack
        if top_is_root:
          self._right_arc_or_shift(parser, scores)
        else:
          parser.reduce()
      elif len(parser.buffer) > 1:
        parser.shift()
      elif top_is_root:
        # shifting the last word would leave it without a head
        parser.right_arc()
      else:
        # last buffer word: take the best move among left-arc, right-arc, reduce
        fallback = int(scores[:3].argmax(-1))
        if fallback == 0:
          parser.left_arc()
        elif fallback == 1:
          parser.right_arc()
        else:
          parser.reduce()

##BERT Network

In [None]:
# Hyperparameters read by BERTNet when it is instantiated.
# NOTE(review): MLP_SIZE and DROPOUT are the same globals that BiLSTMNet reads
# at construction time. When the notebook is run top to bottom, a BiLSTMNet
# created after this cell uses DROPOUT = 0.3 rather than the 0.2 set in the
# BiLSTM section — confirm this is intended.
BERT_SIZE=768   # size of the BERT hidden states
MLP_SIZE = 200  # hidden size of the feedforward classifier
DROPOUT = 0.3   # dropout probability

In [None]:
class BERTNet(nn.Module):
  """Pre-trained BERT encoder followed by a feedforward classifier over parser moves.

  Each parser configuration is described by three word positions (two from the
  stack, one from the buffer). The BERT states of those words are concatenated
  and scored against the four arc-eager moves:
  0 = left-arc, 1 = right-arc, 2 = reduce, 3 = shift.

  ``forward`` returns unnormalized scores (logits), which is what
  ``nn.CrossEntropyLoss`` expects; ``mlp`` returns probabilities.
  """

  def __init__(self, device, model_name="bert-base-uncased"):
    """
    Args:
      device: torch device on which inputs and auxiliary tensors are created.
      model_name: checkpoint loaded for both the encoder and its tokenizer.
        NOTE(review): the notebook trains on the Spanish es_ancora treebank;
        a Spanish or multilingual BERT checkpoint can be passed here.
    """
    super().__init__()
    self.device = device

    # initialize BERT; all its parameters are fine-tuned with the classifier
    self.model = BertModel.from_pretrained(model_name)
    self.model.requires_grad_()  # Set requires_grad to True for all parameters
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.tokenizer.add_tokens(["<ROOT>", "<unk>", "<pad>"])
    self.model.resize_token_embeddings(len(self.tokenizer))
    self.hidden_size = self.model.config.hidden_size

    # index -> word table, built once: sentences reach the network as lists of
    # emb_dictionary indices and must be turned back into words for BERT.
    # setdefault keeps the first word should two words share an index
    self.id_to_word = {}
    for word, index in emb_dictionary.items():
      self.id_to_word.setdefault(index, word)

    # initialize feedforward
    self.w1 = torch.nn.Linear(3*self.hidden_size, MLP_SIZE, bias=True)
    self.activation = torch.nn.Tanh()
    #Here we put output length equal to 4 because it has 4 outputs: left-arc,
    #right-arc, reduce or shift
    self.w2 = torch.nn.Linear(MLP_SIZE, 4, bias=True)
    self.softmax = torch.nn.Softmax(dim=-1)

    # not applied by the classifier; kept so that the set of parameters of the
    # module does not change
    self.norm1 = nn.LayerNorm(MLP_SIZE)
    self.norm2 = nn.LayerNorm(4)

    self.dropout = torch.nn.Dropout(DROPOUT)

  def get_key_from_value(self, dictionary, target_value):
    """Return the first key of ``dictionary`` mapped to ``target_value``, else None."""
    for key, value in dictionary.items():
      if value == target_value:
        return key
    return None

  def get_the_tokenization(self, x):
    """Encode a batch of sentences with BERT, one vector per word.

    Args:
      x: list of sentences, each a list of emb_dictionary indices.

    Returns:
      Tensor of size (batch, longest sentence, hidden size) in which position
      ``k`` of sentence ``b`` holds the BERT state of the first word piece of
      word ``k``. Padding positions, and words for which the tokenizer yields
      no piece (or whose pieces were truncated), hold zeros.

    Requires a fast tokenizer, since the word/piece alignment comes from
    ``word_ids``.
    """
    words = [[self.id_to_word.get(index, "<unk>") for index in sentence] for sentence in x]

    # Sentences are passed as lists of words so that the tokenizer keeps track
    # of which word every piece comes from. No [CLS]/[SEP] is added, and the
    # word states are gathered explicitly below: BERT positions do not match
    # word positions as soon as a word is split into several pieces.
    encoded = self.tokenizer(words, is_split_into_words=True, add_special_tokens=False,
                             padding=True, truncation=True, return_tensors="pt")
    input_ids = encoded["input_ids"].to(self.device)
    attention_mask = encoded["attention_mask"].to(self.device)
    hidden = self.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

    longest = max(len(sentence) for sentence in words)
    word_states = hidden.new_zeros((len(words), longest, hidden.size(-1)))
    for b in range(len(words)):
      # position of the first piece of every word of sentence b
      first_piece = {}
      for position, word_index in enumerate(encoded.word_ids(batch_index=b)):
        if word_index is not None and word_index not in first_piece:
          first_piece[word_index] = position
      if first_piece:
        targets = torch.tensor(list(first_piece.keys()), device=hidden.device)
        sources = torch.tensor(list(first_piece.values()), device=hidden.device)
        word_states[b, targets] = hidden[b, sources]

    return word_states

  def forward(self, x, paths):
    """Score every recorded configuration of every sentence of the batch.

    Args:
      x: list of sentences, each a list of emb_dictionary indices.
      paths: for each sentence, the list of configurations to score.

    Returns:
      Tensor with one row of 4 logits per configuration, sentence by sentence.
    """
    # get the contextual word representations
    x = self.dropout(self.get_the_tokenization(x)).to(self.device)

    # arrange them as (position, sentence, features)
    h = x.permute(1,0,2)

    # for each parser configuration that we need to score we arrange from the
    # encoder output the correct input for the feedforward
    mlp_input = self.get_mlp_input(paths, h)

    # logits, not probabilities: the cross-entropy loss normalizes on its own,
    # feeding it softmax outputs would squash the scores and cap the training
    return self.mlp_logits(mlp_input)

  def get_mlp_input(self, configurations, h):
    """Concatenate the encoder states of the positions named by each configuration.

    ``h`` is indexed as ``h[position][sentence]``; a position equal to -1 is
    replaced by a zero vector.
    """
    mlp_input = []
    zero_tensor = h.new_zeros(h.size(-1))
    for i, sentence_configurations in enumerate(configurations): # for every sentence in the batch
      for conf in sentence_configurations: # for each configuration of a sentence
        mlp_input.append(torch.cat([zero_tensor if k == -1 else h[k][i] for k in conf[:3]]))
    return torch.stack(mlp_input).to(self.device)

  def mlp_logits(self, x):
    """Feedforward classifier; returns the unnormalized score of each move."""
    x = self.dropout(x)
    x = self.activation(self.w1(x))
    x = self.dropout(x)
    return self.w2(x)

  def mlp(self, x):
    """Feedforward classifier; returns the probability of each move."""
    return self.softmax(self.mlp_logits(x))

  # we use this function at inference time. We run the parser and at each step
  # we pick as next move the one with the highest score assigned by the model
  def infere(self, x):
    """Parse a batch of encoded sentences; returns the predicted heads of each."""
    parsers = [ArcEager(i) for i in x]

    # no dropout here: predictions must not depend on the train/eval mode
    x = self.get_the_tokenization(x).to(self.device)

    h = x.permute(1,0,2)
    while not self.parsed_all(parsers):
      # get the current configuration and score next moves
      configurations = self.get_configurations(parsers)
      mlp_input = self.get_mlp_input(configurations, h)
      mlp_out = self.mlp(mlp_input)
      # take the next parsing step
      self.parse_step(parsers, mlp_out)

    # return the predicted dependency tree
    return [parser.arcs for parser in parsers]

  def get_configurations(self, parsers):
    """Return, for each parser, a one-element list with its current configuration.

    Finished parsers get the placeholder ``[-1, -1, -1]`` so that the result
    stays parallel to ``parsers``.
    """
    configurations = []

    for parser in parsers:
      if parser.is_tree_final():
        conf = [-1, -1, -1]
      else:
        # with a single element on the stack, len(stack)-2 evaluates to -1 and
        # that element fills both stack slots, as in the training configurations
        conf = [parser.stack[len(parser.stack)-2], parser.stack[len(parser.stack)-1]]
        conf.append(parser.buffer[0] if len(parser.buffer) > 0 else -1)
      configurations.append([conf])

    return configurations

  def parsed_all(self, parsers):
    """True when every parser has reached its final configuration."""
    return all(parser.is_tree_final() for parser in parsers)

  @staticmethod
  def _right_arc_or_shift(parser, scores):
    """Apply the better scored move between right-arc (1) and shift (3).

    Used when the root is on top of the stack: left-arc is not allowed there
    and reduce would leave the configuration unchanged.
    """
    if scores[1] > scores[3]:
      parser.right_arc()
    else:
      parser.shift()

  # In this function we select and perform the next move according to the scores obtained.
  # We need to be careful and select correct moves, e.g. don't do a shift if the buffer
  # is empty or a left arc if the stack top is the ROOT. For clarity sake we didn't implement
  # these checks in the parser so we must do them here.
  def parse_step(self, parsers, moves):
    """Advance every unfinished parser by one move.

    Args:
      parsers: the parsers of the batch.
      moves: tensor with one row of 4 scores per parser (same order).

    The best scored move is applied when the configuration allows it;
    otherwise a fallback is chosen among the allowed moves using the scores of
    that same parser. Every branch either pops the stack or consumes the
    buffer, so repeated calls always terminate.
    """
    best_moves = moves.argmax(-1).tolist()
    for i, parser in enumerate(parsers):
      if parser.is_tree_final():
        continue

      scores = moves[i]
      top_is_root = parser.stack[len(parser.stack)-1] == 0

      if len(parser.buffer) == 0:
        # nothing left to attach: the stack can only be emptied
        parser.reduce()
      elif best_moves[i] == 0:
        # the root cannot be the dependent of a left arc
        if top_is_root:
          self._right_arc_or_shift(parser, scores)
        else:
          parser.left_arc()
      elif best_moves[i] == 1:
        parser.right_arc()
      elif best_moves[i] == 2:
        # the root is never removed from the stack
        if top_is_root:
          self._right_arc_or_shift(parser, scores)
        else:
          parser.reduce()
      elif len(parser.buffer) > 1:
        parser.shift()
      elif top_is_root:
        # shifting the last word would leave it without a head
        parser.right_arc()
      else:
        # last buffer word: take the best move among left-arc, right-arc, reduce
        fallback = int(scores[:3].argmax(-1))
        if fallback == 0:
          parser.left_arc()
        elif fallback == 1:
          parser.right_arc()
        else:
          parser.reduce()


##Train and Test
Now that we have defined all our components, we are ready to train and test our model.

First we define our evaluation function. We use UAS (Unlabeled Attachment Score), which is the percentage of words that are assigned the correct head.

In [None]:
def evaluate(gold, preds):
  """Compute the fraction of correctly predicted heads.

  Args:
    gold: list of gold trees, one list of heads per sentence.
    preds: list of predicted trees, parallel to ``gold``.

  Returns:
    correct / total over all positions except position 0 of each sentence
    (that position is skipped and never scored); 0.0 when there is nothing
    to score.
  """
  total = 0
  correct = 0

  for g, p in zip(gold, preds):
    scored = range(1, len(g))
    total += len(scored)
    correct += sum(g[i] == p[i] for i in scored)

  # an empty evaluation set (or sentences made of position 0 only) would
  # otherwise raise ZeroDivisionError
  return correct / total if total else 0.0

Next, we define our train loop and the test function to run inference.

In [None]:
def train(model, dataloader, criterion, optimizer):
  """Run one training epoch and return the average loss per batch.

  Args:
    model: network called as ``model(sentences, paths)``, returning one row of
      scores per parser configuration.
    dataloader: iterable of ``(sentences, paths, moves, trees)`` batches.
    criterion: loss called as ``criterion(scores, labels)``.
    optimizer: optimizer holding the parameters of ``model``.

  Returns:
    The mean of the batch losses, or 0.0 when the dataloader yields no batch.
  """
  model.train()
  total_loss = 0.0
  count = 0

  for sentences, paths, moves, trees in dataloader:
    optimizer.zero_grad()

    out = model(sentences, paths)

    # flatten the per-sentence move lists into one label per configuration.
    # The labels are created on the device of the scores, so the function does
    # not rely on a global `device` being defined by the time it runs
    labels = torch.tensor(
      [move for sentence_moves in moves for move in sentence_moves],
      dtype=torch.long,
      device=out.device,
    )
    loss = criterion(out, labels)

    count += 1
    total_loss += loss.item()

    loss.backward()
    optimizer.step()

  return total_loss / count if count else 0.0

def test(model, dataloader):
  """Parse every batch of ``dataloader`` with ``model`` and score the result.

  The model is switched to evaluation mode and run without gradient tracking;
  the returned value is what ``evaluate`` computes on the gold and predicted
  trees collected over all batches.
  """
  model.eval()

  gold, preds = [], []

  with torch.no_grad():
    for sentences, _paths, _moves, trees in dataloader:
      gold.extend(trees)
      preds.extend(model.infere(sentences))

  return evaluate(gold, preds)

We run the training of our neural dependency parser with BERT.

# BERT Training Specific Parameters

To allow the consecutive execution of the whole notebook (and also to easily control all the hyperparameters) below there are the ones used for the BERT-based pipeline

In [None]:
# Training hyperparameters for the BERT-based parser
EPOCHS = 15   # number of passes over the training set
LR = 0.0001   # learning rate

#BERT Training

In [None]:
# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

model = BERTNet(device).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# One line per epoch: average training loss and score on the validation split
epoch_report = "Epoch: {:3d} | avg_train_loss: {:5.3f} | dev_uas: {:5.3f} |"
for epoch in range(EPOCHS):
  avg_train_loss = train(model, train_dataloader, criterion, optimizer)
  val_uas = test(model, val_dataloader)
  print(epoch_report.format(epoch, avg_train_loss, val_uas))

Device: cuda


Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Epoch:   0 | avg_train_loss: 0.965 | dev_uas: 0.661 |
Epoch:   1 | avg_train_loss: 0.891 | dev_uas: 0.714 |
Epoch:   2 | avg_train_loss: 0.870 | dev_uas: 0.731 |
Epoch:   3 | avg_train_loss: 0.858 | dev_uas: 0.728 |
Epoch:   4 | avg_train_loss: 0.849 | dev_uas: 0.748 |
Epoch:   5 | avg_train_loss: 0.838 | dev_uas: 0.755 |
Epoch:   6 | avg_train_loss: 0.831 | dev_uas: 0.760 |
Epoch:   7 | avg_train_loss: 0.824 | dev_uas: 0.759 |
Epoch:   8 | avg_train_loss: 0.822 | dev_uas: 0.768 |
Epoch:   9 | avg_train_loss: 0.822 | dev_uas: 0.759 |
Epoch:  10 | avg_train_loss: 0.818 | dev_uas: 0.770 |
Epoch:  11 | avg_train_loss: 0.814 | dev_uas: 0.769 |
Epoch:  12 | avg_train_loss: 0.813 | dev_uas: 0.777 |
Epoch:  13 | avg_train_loss: 0.809 | dev_uas: 0.770 |
Epoch:  14 | avg_train_loss: 0.808 | dev_uas: 0.774 |


In [None]:
# Score the model trained in the previous cell on the test split
test_uas = test(model, test_dataloader)
print("test_uas: {:5.3f}".format( test_uas))

test_uas: 0.784


# BiLSTM Specific Training Parameters



In [None]:
# Training hyperparameters for the BiLSTM parser
# (these rebind the EPOCHS and LR names set for the BERT run)
EPOCHS = 15  # number of passes over the training set
LR = 0.001   # learning rate

#BiLSTM Training

In [None]:
# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
model = BiLSTMNet(device)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)


for epoch in range(EPOCHS):
  avg_train_loss = train(model, train_dataloader, criterion, optimizer)
  # the validation loader is named val_dataloader; no dev_dataloader is ever defined
  val_uas = test(model, val_dataloader)
  print("Epoch: {:3d} | avg_train_loss: {:5.3f} | dev_uas: {:5.3f} |".format( epoch, avg_train_loss, val_uas))

And we test on the test set.

In [None]:
# Score the model trained in the previous cell on the test split
test_uas = test(model, test_dataloader)
print("test_uas: {:5.3f}".format( test_uas))