## Load Dataset

In [1]:
!pip install sklearn_crfsuite



In [2]:

!git clone https://github.com/adapter-hub/adapters.git
!cd adapters

fatal: destination path 'adapters' already exists and is not an empty directory.


In [3]:
!pip install datasets conllu



In [4]:
!pip install adapters



In [5]:
import torch
from transformers import AutoModel, AutoTokenizer
from adapters import AutoAdapterModel
from transformers import AutoTokenizer
from adapters import AdapterTrainer
from transformers import TrainingArguments
from transformers import AutoConfig
from adapters import AdapterConfig
from transformers import TrainingArguments, Trainer
from datasets import Dataset, concatenate_datasets

In [6]:
%mkdir Ug/

In [8]:
import conllu

def extract_corpus(conllu_path):
    """Extracts transliterated sentences from a CONLLU file."""
    translit_sentences = []
    original_sentences = []
    with open(conllu_path, "r", encoding="utf-8") as f:
        data = conllu.parse(f.read())

    for sentence in data:
        original = sentence.metadata.get('text', '')
        translit = sentence.metadata.get('translit', '')
        if translit:
            translit_sentences.append(translit.strip())
        if original:
            original_sentences.append(original.strip())
    return original_sentences, translit_sentences

def original_corpus(files):
    corpus = {}
    for split, file_path in files.items():
        original, translit = extract_corpus(file_path)
        corpus[split] = {"original": original, "translit": translit}

    for split in ["train", "dev", "test"]:
        with open(f"Ug/ug_{split}_original.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(corpus[split]["original"]) + "\n")
        with open(f"Ug/ug_{split}_transliterated.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(corpus[split]["translit"]) + "\n")

    return corpus["train"]["original"], corpus["train"]["translit"]

files = {
        "train": "ug_udt-ud-train.conllu",
        "dev": "ug_udt-ud-dev.conllu",
        "test": "ug_udt-ud-test.conllu"
    }
original_texts, transliterated_texts = original_corpus(files)

In [9]:
original_texts[5]

'كۆچەت ئوبدان كۆكلەۋاتقاندا، بىر كاككۈك ئۇچۇپ كېلىپ دەپتۇ:'

In [10]:
transliterated_texts[5]

'köchet obdan köklewatqanda, bir kakkük uchup këlip deptu:'

In [11]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
additional_tokens = ["<new_token1>", "<new_token2>"]
tokenizer.add_tokens(additional_tokens)

def tokenize(text):
    return tokenizer(
        text,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [12]:
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import Dataset, DataLoader
import torch

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.encodings = tokenizer(
            texts,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

    def __len__(self):
        return self.encodings["input_ids"].shape[0]

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx]
        }

# DataLoader
original_dataset = TextDataset(original_texts, tokenizer)
transliterated_dataset = TextDataset(transliterated_texts, tokenizer)

original_dataloader = DataLoader(original_dataset, batch_size=16, shuffle=True)
transliterated_dataloader = DataLoader(transliterated_dataset, batch_size=16, shuffle=True)

# MLM Components
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)
mlm_loss = torch.nn.CrossEntropyLoss()

## Model Training

In [13]:
import torch
from adapters import AutoAdapterModel, Stack, AdapterConfig
from transformers import DataCollatorForLanguageModeling, AutoTokenizer
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR

# Model Initialization
model = AutoAdapterModel.from_pretrained("bert-base-multilingual-cased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Adapter Configuration
model.freeze_model()

# Add adapters
lang_config = AdapterConfig.load(
    "pfeiffer",
    reduction_factor=16,
    leave_out=[11]
)
model.add_adapter("la_s", config=lang_config)
model.add_adapter("la_t", config=lang_config)


for param in model.heads.parameters():
    param.requires_grad = False

for name, param in model.named_parameters():
    if "adapter" not in name:
        assert not param.requires_grad, f"Base param {name} is trainable!"

# Training Setup
scaler = torch.cuda.amp.GradScaler()


def train_adapter(adapter_name, dataloader, num_epochs=1):
    model.train_adapter(adapter_name)
    model.set_active_adapters(Stack(adapter_name))
    optimizer = AdamW(
        model.parameters(),
        lr=1e-4,
        weight_decay=0.01
    )

    scheduler = LinearLR(
        optimizer,
        start_factor=0.3,
        total_iters=len(dataloader)*num_epochs//4
    )

    accumulation_steps = 4
    for epoch in range(num_epochs):
        model.train()

        for i, batch in enumerate(dataloader):
            with torch.cuda.amp.autocast():
                masked_inputs = data_collator([{"input_ids": ids} for ids in batch["input_ids"]])

                outputs = model(
                    input_ids=masked_inputs["input_ids"],
                    attention_mask=masked_inputs["attention_mask"]
                )

                loss = torch.nn.CrossEntropyLoss(ignore_index=-100)(
                    outputs.logits.view(-1, tokenizer.vocab_size),
                    masked_inputs["labels"].view(-1)
                )
                loss = loss / accumulation_steps

            scaler.scale(loss).backward()

            if (i + 1) % accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

        eval_loss = evaluate_mlm(dataloader)
        print(f"Epoch {epoch+1} | Loss: {eval_loss:.4f}")

# Evaluation
def evaluate_mlm(dataloader):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            masked_inputs = data_collator([{"input_ids": ids} for ids in batch["input_ids"]])

            outputs = model(
                input_ids=masked_inputs["input_ids"],
                attention_mask=masked_inputs["attention_mask"]
            )

            loss = torch.nn.CrossEntropyLoss(ignore_index=-100)(
                outputs.logits.view(-1, tokenizer.vocab_size),
                masked_inputs["labels"].view(-1)
            )
            total_loss += loss.item()

    return total_loss / len(dataloader)

# Train la_s
train_adapter("la_s", original_dataloader, 5)

# Train la_t
train_adapter("la_t", transliterated_dataloader, 5)

model.save_adapter("./la_s", "la_s", with_head=False)
model.save_adapter("./la_t", "la_t", with_head=False)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch 1 | Loss: 5.1926
Epoch 2 | Loss: 4.6305
Epoch 3 | Loss: 4.7053
Epoch 4 | Loss: 5.1842
Epoch 5 | Loss: 3.9855
Epoch 1 | Loss: 6.5837
Epoch 2 | Loss: 6.0928
Epoch 3 | Loss: 6.5337
Epoch 4 | Loss: 7.0732
Epoch 5 | Loss: 6.7741


In [14]:
#  adapter configurations
print("\nla_s config:", model.get_adapter("la_s"))
print("la_t config:", model.get_adapter("la_t"))


la_s config: {0: {'output_adapter': Adapter(
  (non_linearity): Activation_Function_Class(
    (f): ReLU()
  )
  (adapter_down): Sequential(
    (0): Linear(in_features=768, out_features=48, bias=True)
    (1): Activation_Function_Class(
      (f): ReLU()
    )
  )
  (adapter_up): Linear(in_features=48, out_features=768, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)}, 1: {'output_adapter': Adapter(
  (non_linearity): Activation_Function_Class(
    (f): ReLU()
  )
  (adapter_down): Sequential(
    (0): Linear(in_features=768, out_features=48, bias=True)
    (1): Activation_Function_Class(
      (f): ReLU()
    )
  )
  (adapter_up): Linear(in_features=48, out_features=768, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)}, 2: {'output_adapter': Adapter(
  (non_linearity): Activation_Function_Class(
    (f): ReLU()
  )
  (adapter_down): Sequential(
    (0): Linear(in_features=768, out_features=48, bias=True)
    (1): Activation_Function_Class(
      (f): ReLU()
    )
  )

In [15]:
model.active_adapters

Stack[la_t]

In [16]:
# currently active adapters
print("Active adapters:", model.active_adapters)

Active adapters: Stack[la_t]


In [17]:
SFE_MODEL = model

In [None]:
import torch
from transformers import DataCollatorForLanguageModeling

def test_mlm_with_adapter(text, adapter_name=None):
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"].clone()
    attention_mask = inputs["attention_mask"].clone()
    masked_indices = (input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    if adapter_name:
        model.set_active_adapters(Stack(adapter_name))

    model.eval()

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
    for idx in masked_indices:
        predicted_token_id = torch.argmax(outputs.logits[0, idx], dim=-1).item()
        input_ids[0, idx] = predicted_token_id

    original_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    predicted_tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())

    original_text = tokenizer.convert_tokens_to_string(original_tokens)
    predicted_text = tokenizer.convert_tokens_to_string(predicted_tokens)

    return {
        "original_tokens": original_tokens,
        "predicted_tokens": predicted_tokens,
        "original_text": original_text,
        "predicted_text": predicted_text
    }

test_text = "نەشپۈت بەش يىلدا ، ئۆرۈك تۆت يىلدا مېۋە بېرىدۇ دېگەننى ئاڭلىمىغانمىدىڭ ؟"
result = test_mlm_with_adapter(test_text, "la_s")

print("original_tokens:", result["original_tokens"])
print("predicted_tokens:", result["predicted_tokens"])
print("original_text:", result["original_text"])
print("predicted_text:", result["predicted_text"])


original_tokens: ['[CLS]', '[UNK]', '[UNK]', 'ي', '##ى', '##لد', '##ا', '،', '[UNK]', 'ت', '##ۆ', '##ت', 'ي', '##ى', '##لد', '##ا', '[UNK]', '[UNK]', '[UNK]', 'ئ', '##ا', '##ڭ', '##لى', '##مى', '##غان', '##مى', '##دى', '##ڭ', '؟', '[SEP]']
predicted_tokens: ['[CLS]', '[UNK]', '[UNK]', 'ي', '##ى', '##لد', '##ا', '،', '[UNK]', 'ت', '##ۆ', '##ت', 'ي', '##ى', '##لد', '##ا', '[UNK]', '[UNK]', '[UNK]', 'ئ', '##ا', '##ڭ', '##لى', '##مى', '##غان', '##مى', '##دى', '##ڭ', '؟', '[SEP]']
original_text: [CLS] [UNK] [UNK] يىلدا ، [UNK] تۆت يىلدا [UNK] [UNK] [UNK] ئاڭلىمىغانمىدىڭ ؟ [SEP]
predicted_text: [CLS] [UNK] [UNK] يىلدا ، [UNK] تۆت يىلدا [UNK] [UNK] [UNK] ئاڭلىمىغانمىدىڭ ؟ [SEP]


## AdapterFusionPLus

In [19]:
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

class AdapterFusionPlus(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 2)
        )
    def forward(self, outputs_s, outputs_t):
        combined = torch.cat([outputs_s, outputs_t], dim=-1)  # [B, T, 2*H]
        raw_weights = self.mlp(combined)                      # [B, T, 2]
        weights = torch.softmax(raw_weights, dim=-1)          # [B, T, 2]

        w_s = weights[:, :, 0].unsqueeze(-1)  # [B, T, 1]
        w_t = weights[:, :, 1].unsqueeze(-1)  # [B, T, 1]

        fused_output = w_s * outputs_s + w_t * outputs_t      # [B, T, H]
        return fused_output

fusion_layer = AdapterFusionPlus(model.config.hidden_size)

def forward_with_fusion(input_ids):
    outputs_s = model(input_ids, adapter_names="la_s")
    outputs_t = model(input_ids, adapter_names="la_t")
    fused_output = fusion_layer(outputs_s.last_hidden_state, outputs_t.last_hidden_state)
    return fused_output

## POS

In [None]:
from collections import defaultdict
import torch
import torch.nn as nn
import re
from transformers import AutoModelForTokenClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn_crfsuite import CRF, metrics
from conllu import parse_incr
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder


def build_pos_vocab(conllu_data_path):
    pos_vocab = defaultdict(int)

    with open(conllu_data_path, "r") as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue

            parts = line.strip().split("\t")

            if "-" in parts[0]:
                continue
            pos_tag = parts[3]
            pos_vocab[pos_tag] += 1

    pos_vocab["[PAD]"] = 0  # For padding
    pos_vocab["[UNK]"] = 0  # For unknown tags

    return pos_vocab


conllu_pth = 'ug_udt-ud-train.conllu'

all_pos_tags = []
with open(conllu_pth, "r") as f:
    for line in f:
        if line.startswith("#") or not line.strip():
            continue
        parts = line.strip().split("\t")
        if "-" in parts[0]:
            continue
        all_pos_tags.append(parts[3])  # or parts[4]

le = LabelEncoder()
le.fit(all_pos_tags + ["[PAD]", "[UNK]"])

num_pos_tags = len(le.classes_)

pos_vocab = build_pos_vocab(conllu_pth)
num_pos_tags = len(pos_vocab)
print(f"Number of POS tags: {num_pos_tags}")

sorted_tags = sorted(
    [tag for tag in pos_vocab.keys() if tag not in ["[PAD]", "[UNK]"]],
    key=lambda x: pos_vocab[x],
    reverse=True
)

sorted_tags += ["[PAD]", "[UNK]"]

#  label2id mapping
pos_label2id = {tag: idx for idx, tag in enumerate(sorted_tags)}
id2pos_label = {idx: tag for tag, idx in pos_label2id.items()}

print(pos_label2id)


def load_conllu_data(file_path):
    """Loads CoNLL-U formatted data and extracts sentences with POS tags."""
    data_file = open(file_path, "r", encoding="utf-8")
    ud_treebank = []

    for tokenlist in parse_incr(data_file):
        tokens, tags = [], []
        for token in tokenlist:
            tokens.append(token["form"])
            tags.append(token["upostag"])
        ud_treebank.append((tokens, tags))

    return ud_treebank

conllu_pth = 'ug_udt-ud-train.conllu'
ud_treebank = load_conllu_data(conllu_pth)


def extract_features(sentence, index):
    """Extracts linguistic features from a given sentence at a specific index."""
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'is_alphanumeric': bool(re.match(r'^(?=.*[0-9]$)(?=.*[a-zA-Z])', sentence[index])),
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
    }

def transform_to_dataset(tagged_sentences):
    """Transforms tokenized sentences into feature sets and corresponding labels."""
    X, y = [], []
    for sentence, tags in tagged_sentences:
        X.append([extract_features(sentence, i) for i in range(len(sentence))])
        y.append(tags)
    return X, y


Number of POS tags: 15
{'NOUN': 0, 'PUNCT': 1, 'VERB': 2, 'PRON': 3, 'NUM': 4, 'ADJ': 5, 'ADV': 6, 'PROPN': 7, 'INTJ': 8, 'AUX': 9, 'CCONJ': 10, 'ADP': 11, 'PART': 12, '[PAD]': 13, '[UNK]': 14}


In [21]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

class POSDataset(Dataset):
    """Custom dataset for POS tagging with BERT tokenizer."""
    def __init__(self, ud_treebank, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.features = []
        self.labels = []
        self.sentences = []

        all_tags = {tag for _, tags in ud_treebank for tag in tags}
        self.label_map = {tag: i for i, tag in enumerate(sorted(all_tags))}
        self.id2label = {i: tag for tag, i in self.label_map.items()}

        for tokens, tags in ud_treebank:
            self.sentences.append(tokens)
            inputs = tokenizer(
                tokens,
                is_split_into_words=True,
                padding="max_length",
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )

            word_ids = inputs.word_ids()
            previous_word_id = None
            label_ids = []

            for word_id in word_ids:
                if word_id is None:
                    label_ids.append(-100)
                elif word_id != previous_word_id:
                    label_ids.append(self.label_map[tags[word_id]])
                else:
                    label_ids.append(-100)
                previous_word_id = word_id

            self.features.append({k: v.squeeze(0) for k, v in inputs.items()})
            self.labels.append(torch.tensor(label_ids))

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        readable_labels = [self.id2label[label.item()] for label in self.labels[idx] if label.item() != -100]

        return {
            "sentence": sentence,
            "input_ids": self.features[idx]["input_ids"],
            "attention_mask": self.features[idx]["attention_mask"],
            "labels_readable": readable_labels,
            "labels": self.labels[idx]
        }

dataset = POSDataset(ud_treebank, tokenizer)

sample = dataset[0]
print("Sentence:", " ".join(sample["sentence"]))
print("Labels:", sample["labels_readable"])


Sentence: نەشپۈت بەش يىلدا ، ئۆرۈك تۆت يىلدا مېۋە بېرىدۇ دېگەننى ئاڭلىمىغانمىدىڭ ؟
Labels: ['NOUN', 'NUM', 'NOUN', 'PUNCT', 'NOUN', 'NUM', 'NOUN', 'NOUN', 'VERB', 'VERB', 'VERB', 'PUNCT']


## POS USING AdapterFusionPlus

In [None]:
class POSClassifier(nn.Module):
    """Final classifier for POS tagging."""
    def __init__(self, hidden_size, num_labels):
        super().__init__()
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, fused_output):
        return self.classifier(fused_output)

class POSFusionModel(nn.Module):
    """POS tagging model with AdapterFusion+."""
    def __init__(self, base_model, fusion_layer, classifier):
        super().__init__()
        self.base_model = base_model
        self.fusion_layer = fusion_layer
        self.classifier = classifier
        self.base_model.config.output_hidden_states = True

    def forward(self, input_ids, attention_mask):
        outputs_s = self.base_model(input_ids=input_ids, attention_mask=attention_mask, adapter_names="la_s")
        outputs_t = self.base_model(input_ids=input_ids, attention_mask=attention_mask, adapter_names="la_t")
        fused = self.fusion_layer(outputs_s.hidden_states[-1], outputs_t.hidden_states[-1])
        return self.classifier(fused)


def train_pos_model(model, dataloader, num_epochs=3, lr=1e-4):
    optimizer = AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(ignore_index=-100)
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch["input_ids"].cuda(), batch["attention_mask"].cuda(), batch["labels"].cuda()

            logits = model(input_ids, attention_mask)
            loss = criterion(logits.view(-1, logits.shape[-1]), labels.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss / len(dataloader):.4f}")



def evaluate_pos_model(model, dataloader):
    model.eval()
    total_correct, total_tokens = 0, 0
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = (
                batch["input_ids"].cuda(),
                batch["attention_mask"].cuda(),
                batch["labels"].cuda(),
            )

            logits = model(input_ids, attention_mask)
            predictions = torch.argmax(logits, dim=-1)
            # print("labels",labels )
            # print("predictions", predictions)
            mask = labels != -100
            correct = (predictions == labels) & mask
            total_correct += correct.sum().item()
            total_tokens += mask.sum().item()
            y_true.extend(labels[mask].cpu().numpy().tolist())
            y_pred.extend(predictions[mask].cpu().numpy().tolist())

    print(f"POS Tagging Accuracy: {total_correct / total_tokens:.4f}")
    return y_true, y_pred

def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]

    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)

    return {"input_ids": input_ids.cuda(), "attention_mask": attention_mask.cuda(), "labels": labels.cuda()}

train_dataset = POSDataset(load_conllu_data(conllu_pth), tokenizer)
test_dataset = POSDataset(load_conllu_data("ug_udt-ud-test.conllu"), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

fusion_layer = AdapterFusionPlus(768).cuda()
pos_classifier = POSClassifier(768, len(train_dataset.label_map)).cuda()
model1 = model
pos_model = POSFusionModel(model1, fusion_layer, pos_classifier).cuda()

train_pos_model(pos_model, train_loader, num_epochs=15)
y_true, y_pred = evaluate_pos_model(pos_model, test_loader)

Epoch 1/15 - Loss: 2.3302
Epoch 2/15 - Loss: 1.9530
Epoch 3/15 - Loss: 1.7612
Epoch 4/15 - Loss: 1.6558
Epoch 5/15 - Loss: 1.5756
Epoch 6/15 - Loss: 1.5216
Epoch 7/15 - Loss: 1.4770
Epoch 8/15 - Loss: 1.4215
Epoch 9/15 - Loss: 1.3773
Epoch 10/15 - Loss: 1.3355
Epoch 11/15 - Loss: 1.3042
Epoch 12/15 - Loss: 1.2700
Epoch 13/15 - Loss: 1.2301
Epoch 14/15 - Loss: 1.2147
Epoch 15/15 - Loss: 1.1782
POS Tagging Accuracy: 0.3319


In [31]:
def predict_pos(sentence):
    inputs = tokenizer(sentence.split(), is_split_into_words=True, padding="max_length", truncation=True, return_tensors="pt").to("cuda")

    with torch.no_grad():
        logits = pos_model(inputs["input_ids"], inputs["attention_mask"])

    predictions = torch.argmax(logits, dim=-1).squeeze().tolist()
    pred_labels = [id2pos_label[i] for i in predictions if i != -100]

    return list(zip(sentence.split(), pred_labels))


test_sentence = "نەشپۈت بەش يىلدا، ئۆرۈك تۆت يىلدا مېۋە بېرىدۇ دېگەننى ئاڭلىمىغانمىدىڭ؟"
# test_sentence = "neshpüt besh yilda, örük töt yilda mëwe bëridu dëgenni anglimighanmiding?"
print(predict_pos(test_sentence))

[('نەشپۈت', 'ADP'), ('بەش', 'ADV'), ('يىلدا،', 'ADV'), ('ئۆرۈك', 'ADV'), ('تۆت', 'ADV'), ('يىلدا', 'ADV'), ('مېۋە', 'ADV'), ('بېرىدۇ', 'ADP'), ('دېگەننى', 'ADV'), ('ئاڭلىمىغانمىدىڭ؟', 'ADV')]


In [32]:
from sklearn.metrics import f1_score, classification_report

print("## AdapterFusion Model Evaluation ##")

f1 = f1_score(y_true, y_pred, average="weighted")
print("F1 score on Dev Data:", f1)

print("\nClass-wise scores:")
print(classification_report(y_true, y_pred, target_names=train_dataset.label_map.keys()))

## AdapterFusion Model Evaluation ##
F1 score on Dev Data: 0.23745444350995232

Class-wise scores:
              precision    recall  f1-score   support

         ADJ       0.00      0.00      0.00        15
         ADP       0.00      0.00      0.00         9
         ADV       0.00      0.00      0.00         7
         AUX       0.00      0.00      0.00         7
       CCONJ       0.00      0.00      0.00         4
        INTJ       0.00      0.00      0.00         2
        NOUN       0.49      0.88      0.63        88
         NUM       0.00      0.00      0.00         4
        PART       0.00      0.00      0.00        12
        PRON       0.00      0.00      0.00        39
       PROPN       0.00      0.00      0.00        45
       PUNCT       0.00      0.00      0.00         0
        VERB       0.00      0.00      0.00         0

    accuracy                           0.33       232
   macro avg       0.04      0.07      0.05       232
weighted avg       0.18      0.33  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## POS USING AdapterFusion

In [None]:
from adapters.composition import Fuse
from sklearn.metrics import f1_score, accuracy_score, classification_report

train_data = load_conllu_data(conllu_pth)
dev_data = load_conllu_data("ug_udt-ud-test.conllu")

# POS label mapping
all_tags = sorted({tag for _, tags in train_data+dev_data for tag in tags})
id2label = {i: tag for i, tag in enumerate(all_tags)}
label2id = {tag: i for i, tag in id2label.items()}

# Tokenization and Dataset
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

train_dataset = POSDataset(train_data, tokenizer)
dev_dataset = POSDataset(dev_data, tokenizer)

def compute_metrics(p):
    predictions = np.argmax(p.predictions, axis=2)
    true_labels = p.label_ids
    mask = true_labels != -100
    y_true = true_labels[mask].flatten()
    y_pred = predictions[mask].flatten()

    return {
        "f1": f1_score(y_true, y_pred, average="weighted"),
        "accuracy": accuracy_score(y_true, y_pred)
    }

In [34]:
import numpy as np

model = AutoAdapterModel.from_pretrained("bert-base-multilingual-cased")

# Load two adapters
model.load_adapter("la_s", load_as="source_script")
model.load_adapter("la_t", load_as="transliterated")

adapter_setup = Fuse("source_script", "transliterated")
model.add_adapter_fusion(adapter_setup)

# POS classification head
model.add_tagging_head("pos_head", num_labels=len(id2label))

# Activate fusion
model.train_adapter_fusion(adapter_setup)

#  Training
training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    logging_dir="./logs",
    output_dir="./pos_fusion",
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgarimat[0m ([33mgarimat-indian-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,2.63874,0.211462,0.344828
2,No log,2.833968,0.223766,0.353448
3,No log,2.803625,0.237575,0.37931
4,No log,2.806741,0.228702,0.353448
5,No log,2.83617,0.234743,0.353448
6,No log,2.845398,0.239098,0.349138
7,No log,2.849963,0.238172,0.349138
8,No log,2.859708,0.237999,0.344828
9,No log,2.86648,0.236882,0.340517
10,No log,2.862419,0.240032,0.349138


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=160, training_loss=1.5366881370544434, metrics={'train_runtime': 113.4621, 'train_samples_per_second': 2.82, 'train_steps_per_second': 1.41, 'total_flos': 26852007198720.0, 'train_loss': 1.5366881370544434, 'epoch': 10.0})

In [35]:
# Evaluation and Prediction
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def predict_pos(sentence):
    tokenized = tokenizer(
        sentence.split(),
        is_split_into_words=True,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=128
    ).to(device)

    with torch.no_grad():
        outputs = model(**tokenized)

    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().tolist()
    word_ids = tokenized.word_ids(batch_index=0)

    pred_tags = []
    current_word = None
    for i, word_idx in enumerate(word_ids):
        if word_idx != current_word and word_idx is not None:
            pred_tags.append(id2label[predictions[i]])
            current_word = word_idx

    return list(zip(sentence.split(), pred_tags))

# test_sentence = "نەشپۈت بەش يىلدا، ئۆرۈك تۆت يىلدا مېۋە بېرىدۇ دېگەننى ئاڭلىمىغانمىدىڭ؟"
# test_sentence = "neshpüt besh yilda, örük töt yilda mëwe bëridu dëgenni anglimighanmiding ?"
test_sentence = "سەن شۇ چاققىچە تاقەت قىلىپ تۇرالامسەن!"
print(predict_pos(test_sentence))

# Evaluation Report
def get_true_pred(model, dataset):
    all_true = []
    all_pred = []
    model.eval()

    for item in dataset:
        inputs = {
            "input_ids": item["input_ids"].unsqueeze(0).to(device),
            "attention_mask": item["attention_mask"].unsqueeze(0).to(device)
        }

        with torch.no_grad():
            outputs = model(**inputs)

        preds = torch.argmax(outputs.logits, dim=-1).squeeze().cpu().tolist()
        labels = item["labels"].tolist()

        # mask ignoring -100 labels
        mask = [label != -100 for label in labels]
        all_true.extend([labels[i] for i, m in enumerate(mask) if m])
        all_pred.extend([preds[i] for i, m in enumerate(mask) if m])

    return all_true, all_pred

# predictions and labels
y_true, y_pred = get_true_pred(model, dev_dataset)

# classification report
print("## AdapterFusion Model Evaluation ##")
print("F1 score on Dev Data:", f1_score(y_true, y_pred, average="weighted"))
print("\nClass-wise scores:")
print(classification_report(
    [id2label[t] for t in y_true],
    [id2label[p] for p in y_pred],
    digits=3,
    zero_division=0
))

[('سەن', 'INTJ'), ('شۇ', 'INTJ'), ('چاققىچە', 'INTJ'), ('تاقەت', 'INTJ'), ('قىلىپ', 'INTJ'), ('تۇرالامسەن!', 'PUNCT')]
## AdapterFusion Model Evaluation ##
F1 score on Dev Data: 0.2400323275862069

Class-wise scores:
              precision    recall  f1-score   support

         ADJ      0.000     0.000     0.000        15
         ADP      0.000     0.000     0.000         9
         ADV      0.000     0.000     0.000         7
         AUX      0.000     0.000     0.000         7
       CCONJ      0.000     0.000     0.000         4
         DET      0.000     0.000     0.000         2
        INTJ      0.482     0.920     0.633        88
        NOUN      0.000     0.000     0.000         4
         NUM      0.000     0.000     0.000        12
        PART      0.000     0.000     0.000        39
        PRON      0.000     0.000     0.000        45
       PROPN      0.000     0.000     0.000         0
       PUNCT      0.000     0.000     0.000         0

    accuracy             

# Dependency Parsing AdapterFusionPlus

In [36]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer, AdamW
from adapters import AutoAdapterModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class AdapterFusionPlus(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(2 * hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 2)
        )
    def forward(self, outputs_s, outputs_t):
        combined = torch.cat([outputs_s, outputs_t], dim=-1)   # [B, T, 2*H]
        raw_weights = self.mlp(combined)                       # [B, T, 2]
        weights = torch.softmax(raw_weights, dim=-1)           # [B, T, 2]
        w_s = weights[:, :, 0].unsqueeze(-1)                   # [B, T, 1]
        w_t = weights[:, :, 1].unsqueeze(-1)                   # [B, T, 1]
        fused_output = w_s * outputs_s + w_t * outputs_t       # [B, T, H]
        return fused_output

def create_dep_label_mapping(file_path):
    """
    Reads a CoNLL-U file and generates a unique mapping for dependency labels.
    """
    dep_labels_set = set()
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.startswith("#") and line.strip():
                parts = line.split("\t")
                if len(parts) == 10:
                    dep_labels_set.add(parts[7])
    dep_label2id = {label: idx for idx, label in enumerate(sorted(dep_labels_set))}
    dep_id2label = {idx: label for label, idx in dep_label2id.items()}
    return dep_label2id, dep_id2label

conllu_pth = "ug_udt-ud-train.conllu"
file_path = conllu_pth
dep_label2id, dep_id2label = create_dep_label_mapping(file_path)
NUM_DEP_LABELS = len(dep_label2id)
print(f"Dependency Labels Mapping: {dep_label2id}")


Dependency Labels Mapping: {'acl': 0, 'advcl': 1, 'advmod': 2, 'amod': 3, 'appos': 4, 'aux': 5, 'case': 6, 'cc': 7, 'ccomp': 8, 'compound': 9, 'compound:lvc': 10, 'compound:redup': 11, 'conj': 12, 'cop': 13, 'det': 14, 'discourse': 15, 'dislocated': 16, 'flat': 17, 'mark': 18, 'nmod': 19, 'nmod:poss': 20, 'nsubj': 21, 'nummod': 22, 'obj': 23, 'obl': 24, 'obl:tmod': 25, 'orphan': 26, 'parataxis': 27, 'punct': 28, 'root': 29, 'vocative': 30, 'xcomp': 31}


In [37]:

class ConlluDataset(Dataset):
    def __init__(self, file_path, tokenizer, dep_label2id, max_length=128):
        self.sentences = self.load_conllu(file_path, dep_label2id)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def load_conllu(self, file_path, dep_label2id):

        sentences = []
        with open(file_path, "r", encoding="utf-8") as f:
            sentence = []
            for line in f:
                line = line.strip()
                if line == "":
                    if sentence:
                        sentences.append(sentence)
                        sentence = []
                elif not line.startswith("#"):
                    parts = line.split("\t")
                    if len(parts) == 10:
                        token_id, token, lemma, upos, xpos, feats, head, dep_rel, deps, misc = parts
                        label = dep_label2id.get(dep_rel, -1)
                        sentence.append((token, upos, int(head), label))
            if sentence:
                sentences.append(sentence)
        return sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        tokens = [word[0] for word in sentence]
        dep_heads = [word[2] for word in sentence]
        dep_labels = [word[3] for word in sentence]

        encoding = self.tokenizer(
            tokens,
            is_split_into_words=True,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
            return_offsets_mapping=True
        )
        word_starts = torch.zeros(len(encoding["input_ids"][0]), dtype=torch.long)
        word_ids = encoding.word_ids()
        for i, word_idx in enumerate(word_ids):
            if word_idx is not None and (i == 0 or word_ids[i-1] != word_idx):
                word_starts[i] = 1

        padded_dep_labels = torch.full((self.max_length,), -1, dtype=torch.long)
        padded_dep_heads = torch.full((self.max_length,), -1, dtype=torch.long)
        seq_len = len(dep_labels)
        padded_dep_labels[:seq_len] = torch.tensor(dep_labels, dtype=torch.long)
        padded_dep_heads[:seq_len] = torch.tensor(dep_heads, dtype=torch.long)

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "dep_heads": padded_dep_heads,
            "dep_labels": padded_dep_labels,
            "word_starts": word_starts       # shape: [max_length]
        }

def collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    dep_heads = torch.stack([item['dep_heads'] for item in batch])
    dep_labels = torch.stack([item['dep_labels'] for item in batch])
    word_starts = torch.stack([item['word_starts'] for item in batch])
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "dep_heads": dep_heads,
        "dep_labels": dep_labels,
        "word_starts": word_starts
    }


In [38]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

base_model = AutoAdapterModel.from_pretrained("bert-base-multilingual-cased").to(device)
base_model.load_adapter("la_s")
base_model.load_adapter("la_t")

class DependencyParser(nn.Module):

    def __init__(self, hidden_size, num_labels):
        super().__init__()
        self.classifier = nn.Linear(hidden_size, num_labels)
    def forward(self, fused_output):
        return self.classifier(fused_output)

class DependencyParserFusionModel(nn.Module):

    def __init__(self, base_model, fusion_layer, classifier):
        super().__init__()
        self.base_model = base_model
        self.fusion_layer = fusion_layer
        self.classifier = classifier
        self.base_model.config.output_hidden_states = True

    def forward(self, input_ids, attention_mask):
        # Activate and get outputs from the first adapter
        self.base_model.set_active_adapters(["la_s"])
        outputs_s = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        # Activate and get outputs from the second adapter
        self.base_model.set_active_adapters(["la_t"])
        outputs_t = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        # Fuse the outputs from both adapters
        fused = self.fusion_layer(outputs_s.hidden_states[-1], outputs_t.hidden_states[-1])
        # classification logits: shape [B, seq_len, NUM_DEP_LABELS]
        return self.classifier(fused)

fusion_layer = AdapterFusionPlus(base_model.config.hidden_size).to(device)
dependency_parser_classifier = DependencyParser(base_model.config.hidden_size, NUM_DEP_LABELS).to(device)
DepPar_model = DependencyParserFusionModel(base_model, fusion_layer, dependency_parser_classifier).to(device)


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def train_DepPar_model(model, dataloader, num_epochs=3, lr=1e-4):
    optimizer = AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["dep_labels"].to(device)
            logits = model(input_ids, attention_mask)
            loss = criterion(logits.view(-1, logits.shape[-1]), labels.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss / len(dataloader):.4f}")

def evaluate_DepPar_model(model, dataloader):
    model.eval()
    total_correct, total_tokens = 0, 0
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["dep_labels"].to(device)
            logits = model(input_ids, attention_mask)
            predictions = torch.argmax(logits, dim=-1)
            mask = labels != -1
            correct = (predictions == labels) & mask
            total_correct += correct.sum().item()
            total_tokens += mask.sum().item()
            y_true.extend(labels[mask].cpu().tolist())
            y_pred.extend(predictions[mask].cpu().tolist())
    las = total_correct / total_tokens if total_tokens > 0 else 0
    print(f"Dependency Parser LAS: {las:.4f}")
    return y_true, y_pred

train_dataset = ConlluDataset(conllu_pth, tokenizer, dep_label2id)
val_dataset = ConlluDataset("ug_udt-ud-test.conllu", tokenizer, dep_label2id)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

train_DepPar_model(DepPar_model, train_loader, num_epochs=10)
_ = evaluate_DepPar_model(DepPar_model, val_loader)

Epoch 1/10 - Loss: 2.4748
Epoch 2/10 - Loss: 2.1343
Epoch 3/10 - Loss: 1.8472
Epoch 4/10 - Loss: 1.5950
Epoch 5/10 - Loss: 1.2657
Epoch 6/10 - Loss: 1.0588
Epoch 7/10 - Loss: 0.9096
Epoch 8/10 - Loss: 0.9226
Epoch 9/10 - Loss: 0.5313
Epoch 10/10 - Loss: 1.0226
Dependency Parser LAS: 0.1076


In [41]:
def parse_sentence_dep_fusion(model, tokenizer, sentence, dep_id2label, max_length=128):

    original_tokens = sentence.strip().split()

    encoding = tokenizer(
        original_tokens,
        is_split_into_words=True,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
        max_length=max_length,
        return_offsets_mapping=True
    )

    word_ids = encoding.word_ids(0)
    word_starts = torch.zeros(len(encoding["input_ids"][0]), dtype=torch.long)
    model_words = []
    previous_word_idx = None
    for pos, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue
        if word_idx != previous_word_idx:
            word_starts[pos] = 1
            model_words.append(original_tokens[word_idx])
            previous_word_idx = word_idx

    word_starts = word_starts.unsqueeze(0)
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask)  # shape: [1, seq_len, NUM_DEP_LABELS]
        predictions = torch.argmax(logits, dim=-1).squeeze(0).cpu().tolist()

    word_pred_labels = []
    for pos, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue
        if pos == 0 or (word_ids[pos - 1] != word_idx):
            word_pred_labels.append(predictions[pos])

    simulated_heads = []
    for i in range(len(model_words)):
        if i == 0:
            simulated_heads.append("ROOT")
        else:
            simulated_heads.append(model_words[i - 1])

    print("Dependency Parsing Result:")
    print("OrigToken\tParsedToken\tPredicted Head\tDependency Label")
    for i in range(len(model_words)):
        token = original_tokens[i] if i < len(original_tokens) else "UNK"
        parsed_token = model_words[i]
        pred_label = dep_id2label.get(word_pred_labels[i], "UNKNOWN")
        pred_head = simulated_heads[i]
        print(f"{token}\t\t{parsed_token}\t\t{pred_head}\t\t\t{pred_label}")

test_sentence = "نەشپۈت بەش يىلدا، ئۆرۈك تۆت يىلدا مېۋە بېرىدۇ دېگەننى ئاڭلىمىغانمىدىڭ ؟"
parse_sentence_dep_fusion(DepPar_model, tokenizer, test_sentence, dep_id2label)


Dependency Parsing Result:
OrigToken	ParsedToken	Predicted Head	Dependency Label
نەشپۈت		نەشپۈت		ROOT			nummod
بەش		بەش		نەشپۈت			punct
يىلدا،		يىلدا،		بەش			punct
ئۆرۈك		ئۆرۈك		يىلدا،			conj
تۆت		تۆت		ئۆرۈك			obj
يىلدا		يىلدا		تۆت			punct
مېۋە		مېۋە		يىلدا			punct
بېرىدۇ		بېرىدۇ		مېۋە			punct
دېگەننى		دېگەننى		بېرىدۇ			punct
ئاڭلىمىغانمىدىڭ		ئاڭلىمىغانمىدىڭ		دېگەننى			compound:redup
؟		؟		ئاڭلىمىغانمىدىڭ			parataxis


# Dependency Parsing AdapterFusion

In [None]:
import torch
from torch.utils.data import Dataset
from adapters.composition import Fuse
from sklearn.metrics import f1_score, accuracy_score
import numpy as np


def create_dep_label_mapping(file_path):
    mapping = {
        'acl': 0, 'advcl': 1, 'advmod': 2, 'amod': 3, 'ccomp': 4,
        'compound': 5, 'conj': 6, 'cop': 7, 'det': 8, 'discourse': 9,
        'dislocated': 10, 'flat': 11, 'nmod': 12, 'nsubj': 13, 'nummod': 14,
        'obj': 15, 'obl': 16, 'orphan': 17, 'parataxis': 18, 'punct': 19,
        'root': 20, 'vocative': 21, 'xcomp': 22
    }
    id2label = {v: k for k, v in mapping.items()}
    return mapping, id2label

def load_conllu_data(path, head_offset=64):
    data = []
    with open(path, encoding="utf-8") as f:
        lines = f.read().strip().split("\n")
    sentence_lines = []
    for line in lines:
        if line.strip() == "":
            if sentence_lines:
                data.append(parse_sentence(sentence_lines, head_offset))
                sentence_lines = []
        else:
            sentence_lines.append(line)
    if sentence_lines:
        data.append(parse_sentence(sentence_lines, head_offset))
    return data

def parse_sentence(lines, head_offset):
    sent_text = None
    tokens = []
    dep_heads = []
    dep_labels = []

    for line in lines:
        if line.startswith("# text ="):
            sent_text = line[len("# text ="):].strip()
            break
    for line in lines:
        if line.startswith("#"):
            continue
        parts = line.split("\t")
        if len(parts) < 8:
            continue
        if "-" in parts[0]:
            continue
        tokens.append(parts[1])
        head = int(parts[6])
        dep_heads.append(head)
        dep_labels.append(parts[7])
    if sent_text is None:
        sent_text = " ".join(tokens)
    return {
        "sentence": sent_text,
        "dep_heads": dep_heads,
        "dep_labels": dep_labels,
        "tokens": tokens
    }

class ConlluDataset(Dataset):
    def __init__(self, file_path, tokenizer, dep_label2id):
        self.data = load_conllu_data(file_path)
        self.tokenizer = tokenizer
        self.dep_label2id = dep_label2id

    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        example = self.data[idx]
        encoding = self.tokenizer(
            example["sentence"].split(),
            is_split_into_words=True,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=128
        )
        label_ids = [self.dep_label2id.get(label, -100) for label in example["dep_labels"]]

        max_length = encoding["input_ids"].shape[-1]
        label_tensor = torch.full((max_length,), -100, dtype=torch.long)
        label_tensor[:len(label_ids)] = torch.tensor(label_ids, dtype=torch.long)
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label_tensor
        }

conllu_pth = "ug_udt-ud-train.conllu"
dev_conllu   = "ug_udt-ud-test.conllu"

# dependency label mapping.
dep_label2id, dep_id2label = create_dep_label_mapping(conllu_pth)
print(f"Initial Dependency Labels Mapping: {dep_label2id}")

# tokenizer and create datasets.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
train_dataset = ConlluDataset(conllu_pth, tokenizer, dep_label2id)
val_dataset = ConlluDataset(dev_conllu, tokenizer, dep_label2id)

all_train_labels = set()
for example in train_dataset.data:
    for label in example["dep_labels"]:
        all_train_labels.add(label)
print("All labels in training set:", all_train_labels)

for label in all_train_labels:
    if label not in dep_label2id:
        dep_label2id[label] = len(dep_label2id)
dep_id2label = {v: k for k, v in dep_label2id.items()}
NUM_DEP_LABELS = len(dep_label2id)
print("Updated Dependency Labels Mapping:", dep_label2id)

# compute_metrics.
def compute_metrics(p):
    predictions = np.argmax(p.predictions, axis=2)
    true_labels = p.label_ids  
    mask = true_labels != -100  
    y_true = true_labels[mask].flatten()
    y_pred = predictions[mask].flatten()
    return {
        "f1": f1_score(y_true, y_pred, average="weighted"),
        "accuracy": accuracy_score(y_true, y_pred)
    }


model = AutoAdapterModel.from_pretrained("bert-base-multilingual-cased")
# add adapters.
model.load_adapter("la_s", load_as="source_script")
model.load_adapter("la_t", load_as="transliterated")
# add adapter fusion.
adapter_setup = Fuse("source_script", "transliterated")
model.add_adapter_fusion(adapter_setup)
# Add dependency tagging head.
model.add_tagging_head("dep_head", num_labels=NUM_DEP_LABELS)
# Activate fusion and the tagging head.
model.train_adapter_fusion(adapter_setup)
model.set_active_adapters(adapter_setup)
if hasattr(model, "set_active_head"):
    model.set_active_head("dep_head")
else:
    model.active_head = "dep_head"

def collate_fn(batch):
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.stack([item["labels"] for item in batch])
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }


training_args = TrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    logging_dir="./logs",
    output_dir="./dep_fusion",
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
)

trainer.train()


Initial Dependency Labels Mapping: {'acl': 0, 'advcl': 1, 'advmod': 2, 'amod': 3, 'ccomp': 4, 'compound': 5, 'conj': 6, 'cop': 7, 'det': 8, 'discourse': 9, 'dislocated': 10, 'flat': 11, 'nmod': 12, 'nsubj': 13, 'nummod': 14, 'obj': 15, 'obl': 16, 'orphan': 17, 'parataxis': 18, 'punct': 19, 'root': 20, 'vocative': 21, 'xcomp': 22}
All labels in training set: {'nmod:poss', 'nsubj', 'parataxis', 'discourse', 'cop', 'conj', 'advmod', 'dislocated', 'det', 'case', 'mark', 'vocative', 'flat', 'obl', 'orphan', 'punct', 'obl:tmod', 'cc', 'xcomp', 'aux', 'compound:redup', 'ccomp', 'nummod', 'nmod', 'obj', 'amod', 'root', 'acl', 'compound:lvc', 'advcl', 'appos', 'compound'}
Updated Dependency Labels Mapping: {'acl': 0, 'advcl': 1, 'advmod': 2, 'amod': 3, 'ccomp': 4, 'compound': 5, 'conj': 6, 'cop': 7, 'det': 8, 'discourse': 9, 'dislocated': 10, 'flat': 11, 'nmod': 12, 'nsubj': 13, 'nummod': 14, 'obj': 15, 'obl': 16, 'orphan': 17, 'parataxis': 18, 'punct': 19, 'root': 20, 'vocative': 21, 'xcomp': 

Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,3.13203,0.052066,0.174888
2,No log,3.087168,0.052066,0.174888
3,No log,3.068566,0.052066,0.174888
4,No log,3.073404,0.052066,0.174888
5,No log,3.07758,0.052265,0.174888
6,No log,3.086957,0.052066,0.174888
7,No log,3.082647,0.052066,0.174888
8,No log,3.082368,0.052265,0.174888
9,No log,3.084063,0.052265,0.174888
10,No log,3.08354,0.052466,0.174888


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=160, training_loss=2.726246643066406, metrics={'train_runtime': 133.7486, 'train_samples_per_second': 2.393, 'train_steps_per_second': 1.196, 'total_flos': 26855409008640.0, 'train_loss': 2.726246643066406, 'epoch': 10.0})

In [43]:
def infer_dependency_parsing(sentence, tokenizer, model, dep_id2label, max_length=128):
    encoding = tokenizer(
        sentence.split(),
        is_split_into_words=True,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length
    )
    input_ids = encoding["input_ids"].to(model.device)
    attention_mask = encoding["attention_mask"].to(model.device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, dim=-1)[0]

    word_ids = encoding.word_ids()
    predicted_labels = {}
    previous_word_idx = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue
        if word_idx != previous_word_idx:
            label_id = predictions[idx].item()
            predicted_labels[word_idx] = dep_id2label.get(label_id, "UNK")
            previous_word_idx = word_idx

    words = sentence.split()
    results = []
    for i, word in enumerate(words):
        orig_token = word
        parsed_token = word
        pred_head = "ROOT" if i == 0 else words[i-1]
        dep_label = predicted_labels.get(i, "UNK")
        results.append((orig_token, parsed_token, pred_head, dep_label))
    return results

sample_sentence = "نەشپۈت بەش يىلدا، ئۆرۈك تۆت يىلدا مېۋە بېرىدۇ دېگەننى ئاڭلىمىغانمىدىڭ ؟"
results = infer_dependency_parsing(sample_sentence, tokenizer, model, dep_id2label)

print("Dependency Parsing Result:")
print("OrigToken\t\tParsedToken\t\tPredicted Head\t\tDependency Label")
for orig, parsed, head, label in results:
    print(f"{orig}\t\t{parsed}\t\t{head}\t\t{label}")


Dependency Parsing Result:
OrigToken		ParsedToken		Predicted Head		Dependency Label
نەشپۈت		نەشپۈت		ROOT		punct
بەش		بەش		نەشپۈت		punct
يىلدا،		يىلدا،		بەش		punct
ئۆرۈك		ئۆرۈك		يىلدا،		punct
تۆت		تۆت		ئۆرۈك		punct
يىلدا		يىلدا		تۆت		punct
مېۋە		مېۋە		يىلدا		punct
بېرىدۇ		بېرىدۇ		مېۋە		punct
دېگەننى		دېگەننى		بېرىدۇ		punct
ئاڭلىمىغانمىدىڭ		ئاڭلىمىغانمىدىڭ		دېگەننى		punct
؟		؟		ئاڭلىمىغانمىدىڭ		punct
