In [1]:
import os
import json
import argparse
import numpy as np
from statistics import mean
from collections import Counter
from typing import List, Tuple
from dataclasses import dataclass, field

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

import transformers
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoModelForMaskedLM, 
    AutoModelForSequenceClassification, 
    AutoModelForTokenClassification
)

from tqdm import tqdm

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Dataset class

In [2]:
def pad_1d(tensor: torch.Tensor, length: int, pad_value: int = 0):
    if tensor.size(0) >= length:
        return tensor[:length]
    else:
        return torch.cat([tensor, torch.full((length - tensor.size(0),), pad_value, dtype=tensor.dtype)], dim=0)

def pad_2d(tensor: torch.Tensor, lengths: Tuple[int, int], pad_value: int = 0):
    if tensor.size(0) >= lengths[0]:
        tensor = tensor[:lengths[0], :]
    if tensor.size(1) >= lengths[1]:
        tensor = tensor[:, :lengths[1]]

    if tensor.size(0) < lengths[0]:
        tensor = torch.cat([tensor, torch.full((lengths[0] - tensor.size(0), tensor.size(1)), pad_value, dtype=tensor.dtype)], dim=0)
    if tensor.size(1) < lengths[1]:
        tensor = torch.cat([tensor, torch.full((tensor.size(0), lengths[1] - tensor.size(1)), pad_value, dtype=tensor.dtype)], dim=1)
    
    return tensor

def collate_fn(batch):
    max_word_length = max(sentence["pos_tag_ids"].size(0) for sentence in batch)
    max_subword_length = max(sentence["subword_ids"].size(0) for sentence in batch)

    return {
        "subword_ids": torch.stack(
            [pad_1d(sentence["subword_ids"], max_subword_length, 0) for sentence in batch],
            dim=0
        ),
        "pos_tag_ids": torch.stack(
            [pad_1d(sentence["pos_tag_ids"], max_word_length, -1) for sentence in batch],
            dim=0
        ),
        "dependencies": torch.stack(
            [pad_1d(sentence["dependencies"], max_word_length, -1) for sentence in batch],
            dim=0
        ),
        "dep_relation_ids": torch.stack(
            [pad_1d(sentence["dep_relation_ids"], max_word_length, -1) for sentence in batch],
            dim=0
        ),
        "subword_to_word_map": torch.stack(
            [pad_1d(sentence["subword_to_word_map"], max_subword_length, -1) for sentence in batch],
            dim=0
        ),
        "word_to_subword_map": torch.stack(
            [pad_1d(sentence["word_to_subword_map"], max_word_length, -1) for sentence in batch],
            dim=0
        ),
        "attention_mask": torch.stack(
            [pad_1d(torch.ones(sentence["subword_ids"].size(0), dtype=torch.bool), max_subword_length, False) for sentence in batch],
            dim=0
        )
    }

@dataclass
class Sentence:
    words: List[str] = field(default_factory=list) 
    subwords: List[str] = field(default_factory=list) 
    subword_ids: torch.LongTensor = None
    subword_to_word_map: List[int] = field(default_factory=list)
    word_to_subword_map: List[int] = field(default_factory=list)
    pos_tags: List[str] = field(default_factory=list) 
    pos_tag_ids: torch.LongTensor = None
    dependencies: List[int] = field(default_factory=list)
    dep_relations: List[str] = field(default_factory=list)

class ConlluDataset(Dataset):
    def __init__(self, path: str, tokenizer: AutoTokenizer, pos_ids_to_str: List[str] = None, dep_ids_to_str: List[str] = None, verbose=True):
        self.sentences = []
        sentence = Sentence()
        space_before = False
        self.dep_relations = []

        for line in open(path):
            line = line.strip()
            if line.startswith("#"):
                continue
            if len(line) == 0:
                if len(sentence.words) > 0:
                    self.sentences.append(sentence)
                    sentence = Sentence()
                continue
            items = line.split("\t")
            if not items[0].isdigit():
                continue
            word = ("" if space_before else " ") + items[1].strip()
            pos_tag = f"POS={items[3].strip()}" + ("" if items[5].strip() == "_" else f"|{items[5].strip()}")
            dependency = int(items[6].strip())
            dep_rel = items[7].strip()

            sentence.words.append(word)
            sentence.pos_tags.append(pos_tag)
            sentence.dependencies.append(dependency)
            sentence.dep_relations.append(dep_rel)
            self.dep_relations.append(dep_rel)
            space_before = "SpaceAfter=No" not in items[-1]
        
        if len(sentence.words) > 0:
            self.sentences.append(sentence)

        for sentence in self.sentences:
            encoding = tokenizer(sentence.words, add_special_tokens=True, is_split_into_words=True)
            sentence.subword_ids = torch.LongTensor(encoding.input_ids)
            sentence.subwords = tokenizer.convert_ids_to_tokens(encoding.input_ids)
            sentence.subword_to_word_map = encoding.word_ids()
            sentence.word_to_subword_map = torch.LongTensor([
                subword_index 
                for subword_index, word_index in enumerate(sentence.subword_to_word_map)
                if word_index is not None and word_index != sentence.subword_to_word_map[subword_index - 1]
            ])
            sentence.subword_to_word_map = torch.LongTensor([
                word_index if word_index is not None else -1 for word_index in sentence.subword_to_word_map
            ])
        
        if pos_ids_to_str is None:
            self.pos_ids_to_str = [
                pos_tag 
                for pos_tag, count in Counter(tag for sentence in self.sentences for tag in sentence.pos_tags).most_common()
            ]
        else:
            self.pos_ids_to_str = pos_ids_to_str

        self.pos_str_to_ids = {tag: i for i, tag in enumerate(self.pos_ids_to_str)}
        for sentence in self.sentences:
            sentence.pos_tag_ids = torch.LongTensor([self.pos_str_to_ids.get(tag, 0) for tag in sentence.pos_tags])

        if dep_ids_to_str is None:
            self.dep_ids_to_str = [dep_rel for dep_rel, count in Counter(self.dep_relations).most_common()]
        else:
            self.dep_ids_to_str = dep_ids_to_str
        self.dep_str_to_ids = {dep_rel: i for i, dep_rel in enumerate(self.dep_ids_to_str)}
        for sentence in self.sentences:
            sentence.dep_relation_ids = torch.LongTensor([self.dep_str_to_ids.get(dep_rel, 0) for dep_rel in sentence.dep_relations])

    def state_dict(self):
        return {
            "pos_vocabulary": self.pos_ids_to_str,
            "dep_vocabulary": self.dep_ids_to_str,
        }

    # load state dict
    def load_state_dict(self, state_dict):
        self.pos_ids_to_str = state_dict["pos_vocabulary"]
        self.pos_str_to_ids = {tag: i for i, tag in enumerate(self.pos_ids_to_str)}

        for sentence in self.sentences:
            sentence.pos_tag_ids = torch.LongTensor([self.pos_str_to_ids[tag] for tag in sentence.pos_tags])
            sentence.dep_relation_ids = torch.LongTensor([self.dep_str_to_ids[dep_rel] for dep_rel in sentence.dep_relations])

    def __getitem__(self, index: int):
        sentence = self.sentences[index]
        return {
            "words": sentence.words,
            "subword_ids": sentence.subword_ids,
            "pos_tag_ids": sentence.pos_tag_ids,
            "dependencies": torch.LongTensor(sentence.dependencies),
            "dep_relation_ids": torch.LongTensor(sentence.dep_relation_ids),
            "subword_to_word_map": sentence.subword_to_word_map,
            "word_to_subword_map": sentence.word_to_subword_map
        }

    def __len__(self):
        return len(self.sentences)

# Pool

In [3]:
def pool_subword_to_word(subword_outputs, word_to_subword_map):
    batch_size, word_seq_len = word_to_subword_map.shape
    batch_indices = torch.arange(batch_size, device=subword_outputs.device).view(batch_size, 1).expand(batch_size, word_seq_len)
    
    word_outputs = subword_outputs[batch_indices, word_to_subword_map, :]
    return word_outputs

# Combined Model

In [4]:
class JointModel(nn.Module):
    def __init__(self, pretrained_model_name, num_pos_labels, num_dep_labels, max_seq_length=150):
        super(JointModel, self).__init__()
        self.base_model = AutoModel.from_pretrained(pretrained_model_name, trust_remote_code=True)
        self.hidden_size = self.base_model.config.hidden_size
        self.max_seq_length = max_seq_length
        
        self.pos_classifier = nn.Linear(self.hidden_size, num_pos_labels)
        
        self.dep_relation_classifier = nn.Linear(self.hidden_size, num_dep_labels)
        
        self.dep_head_projection = nn.Linear(self.hidden_size, 128)
        self.dep_dependent_projection = nn.Linear(self.hidden_size, 128)
        
        self.root_embedding = nn.Parameter(torch.randn(1, 1, self.hidden_size))
    
    def forward(self, input_ids, attention_mask, word_to_subword_map=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
        
        pos_logits = self.pos_classifier(sequence_output)
        dep_relation_logits = self.dep_relation_classifier(sequence_output)

        if word_to_subword_map is not None:
            word_representations = pool_subword_to_word(sequence_output, word_to_subword_map)
            pos_logits = pool_subword_to_word(pos_logits, word_to_subword_map)
            dep_relation_logits = pool_subword_to_word(dep_relation_logits, word_to_subword_map)
            
            batch_size, seq_len, _ = word_representations.shape
            root_emb_expanded = self.root_embedding.expand(batch_size, -1, -1)
            word_representations_with_root = torch.cat([root_emb_expanded, word_representations], dim=1)
            
            head_representations = self.dep_head_projection(word_representations_with_root)  # [batch, seq_len+1, 128]
            dependent_representations = self.dep_dependent_projection(word_representations)  # [batch, seq_len, 128]
            dep_head_scores = torch.bmm(dependent_representations, head_representations.transpose(1, 2))
            
            return pos_logits, dep_relation_logits, dep_head_scores
        else:
            return pos_logits, dep_relation_logits, None

# Training loop

In [5]:
def train_joint_model(model, train_loader, val_loader, device, epochs=5):
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=50, 
        num_training_steps=len(train_loader) * epochs
    )
    
    for epoch in range(epochs):
        model.train()
        train_pos_loss = 0.0
        train_dep_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Training]")
        
        for batch in progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            pos_logits, dep_relation_logits, dep_head_scores = model(
                input_ids=batch["subword_ids"],
                attention_mask=batch["attention_mask"],
                word_to_subword_map=batch["word_to_subword_map"]
            )
            
            pos_mask = batch["pos_tag_ids"] != -1
            dep_mask = batch["dep_relation_ids"] != -1
            
            pos_loss = F.cross_entropy(
                pos_logits.view(-1, pos_logits.size(-1))[pos_mask.view(-1)],
                batch["pos_tag_ids"].view(-1)[pos_mask.view(-1)]
            )
            
            dep_relation_loss = F.cross_entropy(
                dep_relation_logits.view(-1, dep_relation_logits.size(-1))[dep_mask.view(-1)],
                batch["dep_relation_ids"].view(-1)[dep_mask.view(-1)]
            )
            
            dep_head_loss = F.cross_entropy(
                dep_head_scores.view(-1, dep_head_scores.size(-1))[dep_mask.view(-1)],
                batch["dependencies"].view(-1)[dep_mask.view(-1)]
            )
            
            loss = pos_loss + dep_relation_loss + dep_head_loss
            
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            
            train_pos_loss += pos_loss.item()
            train_dep_loss += (dep_relation_loss + dep_head_loss).item()

            progress_bar.set_postfix(pos_loss=pos_loss.item(), dep_loss=(dep_relation_loss + dep_head_loss).item())
        
        avg_train_pos_loss = train_pos_loss / len(train_loader)
        avg_train_dep_loss = train_dep_loss / len(train_loader)
        
        model.eval()
        val_pos_loss = 0.0
        val_dep_loss = 0.0
        pos_correct = 0
        dep_rel_correct = 0
        dep_head_correct = 0
        total_pos = 0
        total_dep = 0
        
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Validation]"):
                batch = {k: v.to(device) for k, v in batch.items()}
                
                pos_logits, dep_relation_logits, dep_head_scores = model(
                    input_ids=batch["subword_ids"],
                    attention_mask=batch["attention_mask"],
                    word_to_subword_map=batch["word_to_subword_map"]
                )
                
                pos_mask = batch["pos_tag_ids"] != -1
                dep_mask = batch["dep_relation_ids"] != -1
                
                pos_loss = F.cross_entropy(
                    pos_logits.view(-1, pos_logits.size(-1))[pos_mask.view(-1)],
                    batch["pos_tag_ids"].view(-1)[pos_mask.view(-1)]
                )

                dep_relation_loss = F.cross_entropy(
                    dep_relation_logits.view(-1, dep_relation_logits.size(-1))[dep_mask.view(-1)],
                    batch["dep_relation_ids"].view(-1)[dep_mask.view(-1)]
                )

                dep_head_loss = F.cross_entropy(
                    dep_head_scores.view(-1, dep_head_scores.size(-1))[dep_mask.view(-1)], batch["dependencies"].view(-1)[dep_mask.view(-1)]
                )

                val_pos_loss += pos_loss.item()
                val_dep_loss += (dep_relation_loss + dep_head_loss).item()

                _, pos_predictions = pos_logits.max(dim=-1)
                _, dep_relation_predictions = dep_relation_logits.max(dim=-1)
                _, dep_head_predictions = dep_head_scores.max(dim=-1)
                
                pos_correct += (pos_predictions[pos_mask] == batch["pos_tag_ids"][pos_mask]).sum().item()
                total_pos += pos_mask.sum().item()
                
                dep_rel_correct += (dep_relation_predictions[dep_mask] == batch["dep_relation_ids"][dep_mask]).sum().item()
                
                dep_head_correct += (dep_head_predictions[dep_mask] == batch["dependencies"][dep_mask]).sum().item()
                
                total_dep += dep_mask.sum().item()
        
        avg_val_pos_loss = val_pos_loss / len(val_loader)
        avg_val_dep_loss = val_dep_loss / len(val_loader)
        pos_accuracy = pos_correct / total_pos if total_pos > 0 else 0
        las = dep_rel_correct / total_dep if total_dep > 0 else 0
        uas = dep_head_correct / total_dep if total_dep > 0 else 0
        
        print(f"Epoch {epoch+1}: Avg Train POS Loss = {avg_train_pos_loss:.4f} | Avg Train DEP Loss = {avg_train_dep_loss:.4f} | Avg Val POS Loss = {avg_val_pos_loss:.4f} | Avg Val DEP Loss = {avg_val_dep_loss:.4f}")
        #print(f"Epoch {epoch+1}: POS Accuracy = {pos_accuracy:.4f} | LAS = {las:.4f} | UAS = {uas:.4f}")
    
    return model

# Predict output 

In [6]:
def predict(model, input_path, output_path, tokenizer, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model.eval()
    dataset = ConlluDataset(
        input_path, 
        tokenizer, 
        pos_ids_to_str=state_dict['pos_vocabulary'], 
        dep_ids_to_str=state_dict['dep_vocabulary']
    )
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=64,
        shuffle=False,
        collate_fn=collate_fn
    )
    
    with torch.no_grad(), open(output_path, "w") as output_file:
        for batch in tqdm(dataloader, desc="Predicting"):
            batch = {k: v.to(device) for k, v in batch.items()}
            pos_logits, dep_relation_logits, dep_head_scores = model(
                input_ids=batch["subword_ids"],
                attention_mask=batch["attention_mask"],
                word_to_subword_map=batch["word_to_subword_map"]
            )
            
            # Get the most likely class for each position
            _, pos_predictions = pos_logits.max(dim=-1)
            _, dep_relation_predictions = dep_relation_logits.max(dim=-1)
            _, dep_head_predictions = dep_head_scores.max(dim=-1)
            
            for i in range(batch["pos_tag_ids"].size(0)):
                mask = batch["pos_tag_ids"][i] != -1
                sent_length = mask.sum().item()
                
                pos_tags = []
                for j in range(sent_length):
                    pred_idx = pos_predictions[i, j].item()
                    if 0 <= pred_idx < len(dataset.pos_ids_to_str):
                        pos_tags.append(dataset.pos_ids_to_str[pred_idx])
                    else:
                        pos_tags.append(dataset.pos_ids_to_str[0])
                
                dependencies = []
                for j in range(sent_length):
                    head_idx = dep_head_predictions[i, j].item()
                    if head_idx > sent_length:
                        head_idx = 0
                    dependent_idx = j + 1
                    
                    dependencies.append([head_idx, dependent_idx])
                
                output_file.write(json.dumps({
                    "pos_tags": pos_tags,
                    "dependencies": dependencies
                }) + "\n")
    
    print(f"Predictions saved to {output_path}")

# Predict on test

In [13]:
def predict_on_test(model, test_file_path, output_path, tokenizer, train_dataset, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model.eval()
    
    # Read test sentences
    test_sentences = []
    current_sentence = []
    
    with open(test_file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('#'):
                continue
            elif not line:
                if current_sentence:
                    test_sentences.append(current_sentence)
                    current_sentence = []
            else:
                parts = line.split('\t')
                if parts[0].isdigit():
                    # Only store word and SpaceAfter info
                    word = parts[1].strip()
                    space_after = "SpaceAfter=No" not in line
                    current_sentence.append((word, space_after))
    
    # Add the last sentence if needed
    if current_sentence:
        test_sentences.append(current_sentence)
    
    with open(output_path, "w") as output_file:
        for sentence in tqdm(test_sentences, desc="Predicting"):
            # Prepare input for the model
            words = [word for word, _ in sentence]
            encoding = tokenizer(words, add_special_tokens=True, is_split_into_words=True, return_tensors="pt")
            
            input_ids = encoding.input_ids.to(device)
            attention_mask = encoding.attention_mask.to(device)
            
            # Create word to subword map
            word_to_subword_map = []
            for word_idx, word_id in enumerate(encoding.word_ids(batch_index=0)):
                if word_id is not None and (word_idx == 0 or encoding.word_ids(batch_index=0)[word_idx-1] != word_id):
                    word_to_subword_map.append(word_idx)
            
            word_to_subword_map = torch.tensor([word_to_subword_map]).to(device)
            
            # Make prediction
            with torch.no_grad():
                pos_logits, dep_relation_logits, dep_head_scores = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    word_to_subword_map=word_to_subword_map
                )
                
                _, pos_predictions = pos_logits.max(dim=-1)
                _, dep_relation_predictions = dep_relation_logits.max(dim=-1)
                _, dep_head_predictions = dep_head_scores.max(dim=-1)
            
            # Convert predictions to output format
            sent_length = len(sentence)
            pos_tags = [train_dataset.pos_ids_to_str[pos_predictions[0, j].item()] 
                        for j in range(sent_length)]
            
            dependencies = []
            for j in range(sent_length):
                # Get head prediction (1-indexed)
                head_idx = dep_head_predictions[0, j].item()
                if head_idx >= sent_length:
                    head_idx = 0  # Set to root if out of bounds
                dependent_idx = j + 1  # 1-indexed
                
                # Format exactly as [head_idx, dependent_idx]
                dependencies.append([head_idx, dependent_idx])
            
            # Output in the exact required format
            result = {
                "pos_tags": pos_tags,
                "dependencies": dependencies
            }
            
            output_file.write(json.dumps(result) + "\n")
    
    print(f"Predictions saved to {output_path}")

# Run the model

In [10]:
gradient_clipping = 5
if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("ltg/norbert3-large")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
    # Load datasets
    train_data = ConlluDataset('/fp/projects01/ec403/IN5550/obligatories/2/no_bokmaal-ud-train.conllu', tokenizer)
    state_dict = train_data.state_dict()
    val_data = ConlluDataset('/fp/projects01/ec403/IN5550/obligatories/2/no_bokmaal-ud-dev.conllu', tokenizer)
    
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=32,
        shuffle=True,
        drop_last=True,
        collate_fn=collate_fn
    )
    
    val_loader = torch.utils.data.DataLoader(
        val_data,
        batch_size=32,
        shuffle=False,
        collate_fn=collate_fn
    )
    
    #Joint model 
    joint_model = JointModel(
        pretrained_model_name="ltg/norbert3-large",
        num_pos_labels=len(train_data.pos_ids_to_str),
        num_dep_labels=len(train_data.dep_ids_to_str)
    ).to(device)
    
    # Train joint model
    joint_model = train_joint_model(joint_model, train_loader, val_loader, device, epochs=10)
    
    # Predict using the joint model
    predict(
        model=joint_model,
        input_path='/fp/projects01/ec403/IN5550/obligatories/2/no_bokmaal-ud-dev.conllu',
        output_path='predictions.jsonl',
        tokenizer=tokenizer,
        device=device
    )
# Models:
# ltg/norbert3-xs
# xlm-roberta-base
# bert-base-german-cased
# Path:
# /fp/projects01/ec403/IN5550/obligatories/2/no_bokmaal-ud-dev.jsonl and /fp/projects01/ec403/IN5550/obligatories/2/no_bokmaal-ud-dev.conllu


Epoch 1/10 [Training]: 100%|██████████| 490/490 [00:42<00:00, 11.51it/s, dep_loss=1.71, pos_loss=0.2]   
Epoch 1/10 [Validation]: 100%|██████████| 76/76 [00:01<00:00, 40.45it/s]


Epoch 1: Avg Train POS Loss = 1.0993 | Avg Train DEP Loss = 12.6982 | Avg Val POS Loss = 4.0859 | Avg Val DEP Loss = 2.7880


Epoch 2/10 [Training]: 100%|██████████| 490/490 [00:42<00:00, 11.63it/s, dep_loss=0.404, pos_loss=0.151] 
Epoch 2/10 [Validation]: 100%|██████████| 76/76 [00:01<00:00, 40.62it/s]


Epoch 2: Avg Train POS Loss = 0.1528 | Avg Train DEP Loss = 0.8445 | Avg Val POS Loss = 4.5922 | Avg Val DEP Loss = 2.9640


Epoch 3/10 [Training]: 100%|██████████| 490/490 [00:42<00:00, 11.53it/s, dep_loss=0.407, pos_loss=0.0621]
Epoch 3/10 [Validation]: 100%|██████████| 76/76 [00:01<00:00, 40.49it/s]


Epoch 3: Avg Train POS Loss = 0.0871 | Avg Train DEP Loss = 0.4933 | Avg Val POS Loss = 5.0928 | Avg Val DEP Loss = 3.2441


Epoch 4/10 [Training]: 100%|██████████| 490/490 [00:42<00:00, 11.53it/s, dep_loss=0.423, pos_loss=0.0318]
Epoch 4/10 [Validation]: 100%|██████████| 76/76 [00:01<00:00, 40.38it/s]


Epoch 4: Avg Train POS Loss = 0.0553 | Avg Train DEP Loss = 0.3188 | Avg Val POS Loss = 5.4173 | Avg Val DEP Loss = 3.3963


Epoch 5/10 [Training]: 100%|██████████| 490/490 [00:42<00:00, 11.52it/s, dep_loss=0.112, pos_loss=0.0154] 
Epoch 5/10 [Validation]: 100%|██████████| 76/76 [00:01<00:00, 40.36it/s]


Epoch 5: Avg Train POS Loss = 0.0357 | Avg Train DEP Loss = 0.2070 | Avg Val POS Loss = 5.9961 | Avg Val DEP Loss = 3.8768


Epoch 6/10 [Training]: 100%|██████████| 490/490 [00:42<00:00, 11.49it/s, dep_loss=0.148, pos_loss=0.0292]  
Epoch 6/10 [Validation]: 100%|██████████| 76/76 [00:01<00:00, 40.41it/s]


Epoch 6: Avg Train POS Loss = 0.0236 | Avg Train DEP Loss = 0.1433 | Avg Val POS Loss = 6.1778 | Avg Val DEP Loss = 3.9649


Epoch 7/10 [Training]: 100%|██████████| 490/490 [00:42<00:00, 11.48it/s, dep_loss=0.0764, pos_loss=0.0128] 
Epoch 7/10 [Validation]: 100%|██████████| 76/76 [00:01<00:00, 40.45it/s]


Epoch 7: Avg Train POS Loss = 0.0152 | Avg Train DEP Loss = 0.0908 | Avg Val POS Loss = 6.5748 | Avg Val DEP Loss = 4.2632


Epoch 8/10 [Training]: 100%|██████████| 490/490 [00:42<00:00, 11.50it/s, dep_loss=0.0818, pos_loss=0.015]   
Epoch 8/10 [Validation]: 100%|██████████| 76/76 [00:01<00:00, 40.40it/s]


Epoch 8: Avg Train POS Loss = 0.0097 | Avg Train DEP Loss = 0.0545 | Avg Val POS Loss = 7.0951 | Avg Val DEP Loss = 4.7886


Epoch 9/10 [Training]: 100%|██████████| 490/490 [00:42<00:00, 11.49it/s, dep_loss=0.0442, pos_loss=0.00289] 
Epoch 9/10 [Validation]: 100%|██████████| 76/76 [00:01<00:00, 40.44it/s]


Epoch 9: Avg Train POS Loss = 0.0060 | Avg Train DEP Loss = 0.0288 | Avg Val POS Loss = 7.7022 | Avg Val DEP Loss = 5.2440


Epoch 10/10 [Training]: 100%|██████████| 490/490 [00:42<00:00, 11.56it/s, dep_loss=0.004, pos_loss=0.00268]   
Epoch 10/10 [Validation]: 100%|██████████| 76/76 [00:01<00:00, 40.42it/s]


Epoch 10: Avg Train POS Loss = 0.0034 | Avg Train DEP Loss = 0.0141 | Avg Val POS Loss = 7.9397 | Avg Val DEP Loss = 5.4044


TypeError: predict_on_test() missing 1 required positional argument: 'state_dict'

In [14]:
predict_on_test(
        model=joint_model,
        test_file_path='/fp/projects01/ec403/IN5550/obligatories/2/test.conllu',
        output_path='test_predictions.jsonl',
        tokenizer=tokenizer,
        train_dataset=train_data,  
        device=device
    )

Predicting: 100%|██████████| 1939/1939 [00:21<00:00, 89.49it/s]

Predictions saved to test_predictions.jsonl





# Evaluer output

In [9]:
def pos_accuracy(gold: List[str], prediction: List[str]) -> float:
    if len(gold) != len(prediction):
        return 0.0
    return mean(1 if g == p else 0 for g, p in zip(gold, prediction))


def unlabeled_attachment_score(gold: List[Tuple[int, int]], prediction: List[Tuple[int, int]]) -> float:
    gold = set(tuple(dependency) for dependency in gold)
    prediction = set(tuple(dependency) for dependency in prediction)

    precision = len(gold & prediction) / len(prediction)
    recall = len(gold & prediction) / len(gold)
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0

    return f1


def labeled_attachment_score(gold: List[Tuple[int, int, str, str]], prediction: List[Tuple[int, int, str, str]]) -> float:
    gold = set(gold)
    prediction = set(prediction)

    precision = len(gold & prediction) / len(prediction)
    recall = len(gold & prediction) / len(gold)
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0

    return f1


def create_labeled_dependencies(pos_tags: List[str], dependencies: List[Tuple[int, int]]) -> List[Tuple[int, int, str, str]]:
    assert min(min(head, dependency) for head, dependency in dependencies) >= 0, "Dependency index out of bounds"
    assert max(max(head, dependency) for head, dependency in dependencies) <= len(pos_tags), "Dependency index out of bounds"
    
    pos_tags = ["ROOT"] + pos_tags
    
    return [(head, dependency, pos_tags[head], pos_tags[dependency]) for head, dependency in dependencies]


def sentence_metrics(sentence_gold, sentence_prediction):
    pos_acc = pos_accuracy(sentence_gold["pos_tags"], sentence_prediction["pos_tags"])
    uas = unlabeled_attachment_score(sentence_gold["dependencies"], sentence_prediction["dependencies"])
    las = labeled_attachment_score(
        create_labeled_dependencies(sentence_gold["pos_tags"], sentence_gold["dependencies"]),
        create_labeled_dependencies(sentence_prediction["pos_tags"], sentence_prediction["dependencies"])
    )

    return {
        "POS accuracy": pos_acc,
        "Unlabeled attachment score": uas,
        "Labeled attachment score": las
    }


def dataset_metrics(gold_path: str, prediction_path: str, verbose=True):
    gold_sentences = [json.loads(line) for line in open(gold_path)]
    prediction_sentences = [json.loads(line) for line in open(prediction_path)]

    assert len(gold_sentences) == len(prediction_sentences), "Number of sentences do not match"

    metrics = [sentence_metrics(gold, prediction) for gold, prediction in zip(gold_sentences, prediction_sentences)]

    metrics = {
        "POS accuracy": mean(metric["POS accuracy"] for metric in metrics),
        "Unlabeled attachment score": mean(metric["Unlabeled attachment score"] for metric in metrics),
        "Labeled attachment score": mean(metric["Labeled attachment score"] for metric in metrics)
    }

    if verbose:
        print("METRICS")
        for metric, value in metrics.items():
            print(f"{metric}: {value:.2%}")

    return metrics

In [10]:
dataset_metrics("/fp/projects01/ec403/IN5550/obligatories/2/no_bokmaal-ud-dev.jsonl", "predictions.jsonl")


METRICS
POS accuracy: 97.02%
Unlabeled attachment score: 95.66%
Labeled attachment score: 90.01%


{'POS accuracy': 0.9701674328484712,
 'Unlabeled attachment score': 0.9566305955258916,
 'Labeled attachment score': 0.9001210377290003}