# Data prep

In [1]:
from pathlib import Path
glaux_dir = Path("/cluster/tufts/tuftsai/pnadel01/greek-bert/wsd/glaux/xml")
sentences = {}

In [2]:
xml_files = list(glaux_dir.rglob('*.xml'))
len(xml_files)

1421

In [9]:
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

def parse_xml(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'lxml-xml')
        
    root = ET.fromstring(str(soup))
    
    # Iterate through sentences
    for sentence in root.findall('.//sentence'):
        sentence_id = sentence.get('id')
        document_id = sentence.get('document_id', '')
        subdoc = sentence.get('subdoc', '')
        
        # Create a full GLAUx ID (you may need to adjust this)
        # The exact ID format might vary - check your WSD files
        glaux_id = sentence_id
        if document_id:
            glaux_id = f"{document_id}_{sentence_id}"
        
        # Extract words
        words = []
        lemmas = []
        pos_tags = []
        
        for word in sentence.findall('.//word'):
            form = word.get('form', '')
            lemma = word.get('lemma', '')
            postag = word.get('postag', '')
            
            if form:
                words.append(form)
                lemmas.append(lemma)
                pos_tags.append(postag)
        
        # Create sentence text
        text = ' '.join(words)
        
        # Store sentence data
        sentences[glaux_id] = {
            'text': text,
            'words': words,
            'lemmas': lemmas,
            'pos': pos_tags,
            'file': str(file_path.name),
            'document_id': document_id,
            'subdoc': subdoc
        }

In [10]:
for xml_file in xml_files:
    tree = parse_xml(xml_file)

In [11]:
import json

output_path = glaux_dir / 'glaux_sentences.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(sentences, f, ensure_ascii=False, indent=2)

# Modeling

In [2]:
from dataclasses import dataclass

@dataclass
class WSDConfig:
    """Configuration for WSD training"""
    model_name: str = "/cluster/tufts/tuftsai/pnadel01/greek-bert/hf_format" 
    max_length: int = 512
    batch_size: int = 16
    learning_rate: float = 2e-5
    num_epochs: int = 10
    warmup_steps: int = 100
    weight_decay: float = 0.01
    dropout: float = 0.1
    test_size: float = 0.2
    random_state: int = 42

In [3]:
import json
from pathlib import Path

glaux_dir = Path("/cluster/tufts/tuftsai/pnadel01/greek-bert/wsd/glaux/xml")
with open(glaux_dir / 'glaux_sentences.json', 'r', encoding='utf-8') as f:
    sentences = json.load(f)

In [4]:
glaux_id2word_id = {}
for glaux_id, sent_data in sentences.items():
    word_ids = sent_data['word_ids']
    for word_id in word_ids:
        glaux_id2word_id[word_id] = glaux_id

glaux_id2word_id['100117129']
sentences['0012-001_7306']

{'text': 'τοὶ γὰρ ἄριστοι μάρτυροι ἔσσονται καὶ ἐπίσκοποι ἁρμονιάων ·',
 'words': ['τοὶ',
  'γὰρ',
  'ἄριστοι',
  'μάρτυροι',
  'ἔσσονται',
  'καὶ',
  'ἐπίσκοποι',
  'ἁρμονιάων',
  '·'],
 'lemmas': ['ὁ',
  'γάρ',
  'ἀγαθός',
  'μάρτυς',
  'εἰμί',
  'καί',
  'ἐπίσκοπος',
  'ἁρμονία',
  '·'],
 'pos': ['l-p---mn-',
  'g--------',
  'a-p---mns',
  'n-p---mn-',
  'v3pfim---',
  'b--------',
  'a-p---mn-',
  'n-p---fg-',
  'u--------'],
 'file': '0012-001.xml',
 'document_id': '0012-001',
 'subdoc': '7716',
 'word_ids': ['100117122',
  '100117123',
  '100117124',
  '100117125',
  '100117126',
  '100117127',
  '100117128',
  '100117129',
  '100117130']}

In [20]:
with open(glaux_dir / 'glaux_id2word_id.json', 'w', encoding='utf-8') as f:
    json.dump(glaux_id2word_id, f, ensure_ascii=False, indent=2)

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class WSDDataset(Dataset):
    def __init__(self, data_path, glaux_data, tokenizer, target_word, max_length=512):
        self.tokenizer = tokenizer
        self.target_word = target_word
        self.max_length = max_length

        df = pd.read_csv(data_path, sep="\t", header=None)

        if len(df.columns) == 2:
            df.columns = ['glaux_id', 'sense']
        else:
            df.columns = ['glaux_id', 'sense', 'subsense']

        self.data = []
        missing_ids = []

        for idx, row in df.iterrows():
            word_id = str(row["glaux_id"])
            glaux_id = glaux_id2word_id.get(word_id, None)
            sense = row["sense"]

            if glaux_id in glaux_data:
                sentense = glaux_data[glaux_id]
                self.data.append({
                    'glaux_id': glaux_id,
                    'text': sentense['text'],
                    'sense': sense,
                    'target_position': sentense['word_ids'].index(word_id)
                })
            else:
                missing_ids.append(glaux_id)
        
        if missing_ids:
            print(f"Warning: {len(missing_ids)} glaux_ids not found in glaux data.")
        print(f"Loaded {len(self.data)} examples for target word '{self.target_word}'.")

        self.sense_to_id = {sense: idx for idx, sense in enumerate(sorted(set(df['sense'])))}
        self.id_to_sense = {idx: sense for sense, idx in self.sense_to_id.items()}
        self.num_senses = len(self.sense_to_id)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        sentence = item['text']
        sense_label = self.sense_to_id[item['sense']]
        target_position = item['target_position']

        encoding = self.tokenizer(
            sentence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'target_position': target_position,
            'label': torch.tensor(sense_label, dtype=torch.long),
            'glaux_id': item['glaux_id']
        }

In [8]:
from transformers import AutoTokenizer, AutoModel

config = WSDConfig()
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
model = AutoModel.from_pretrained(config.model_name).to('cuda' if torch.cuda.is_available() else 'cpu')

Some weights of BertModel were not initialized from the model checkpoint at /cluster/tufts/tuftsai/pnadel01/greek-bert/hf_format and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
ds = WSDDataset(
    data_path='/cluster/tufts/tuftsai/pnadel01/greek-bert/wsd/ancient-greek-wsd-data/harmonia_glaux.txt',
    tokenizer=tokenizer,
    glaux_data=sentences,
    target_word='ἁρμονιά',
    max_length=config.max_length
)

Loaded 538 examples for target word 'ἁρμονιά'.


In [10]:
train_size = int((1 - config.test_size) * len(ds))
val_size = len(ds) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(
    ds,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(config.random_state)
)

train_loader = DataLoader(
    train_dataset,
    batch_size=config.batch_size,
    shuffle=True,
    num_workers=0  # Set to 0 for debugging, increase for speed
)

val_loader = DataLoader(
    val_dataset,
    batch_size=config.batch_size,
    shuffle=False,
    num_workers=0
)

In [12]:
import torch.nn as nn

class WSDClassifier(nn.Module):    
    def __init__(self, bert_model, num_senses: int, dropout: float = 0.1):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_senses)
        
    def forward(self, input_ids, attention_mask, target_position):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        
        sequence_output = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
        batch_size = sequence_output.size(0)
        
        target_embeddings = sequence_output[
            torch.arange(batch_size, device=sequence_output.device),
            target_position
        ]  # [batch_size, hidden_size]
        
        target_embeddings = self.dropout(target_embeddings)
        logits = self.classifier(target_embeddings)
        
        return logits

In [13]:
wsd_classifier = WSDClassifier(bert_model=model, num_senses=ds.num_senses, dropout=config.dropout)
wsd_classifier = wsd_classifier.to('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
for batch in train_loader:
    input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
    attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
    target_positions = batch['target_position'].to('cuda' if torch.cuda.is_available() else 'cpu')
    labels = batch['label'].to('cuda' if torch.cuda.is_available() else 'cpu')

    logits = wsd_classifier(input_ids, attention_mask, target_positions)

    print(logits.shape)
    break

torch.Size([16, 3])


In [15]:
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

class Trainer:
    def __init__(self, model, train_loader, val_loader, config):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.config = config

        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self.config.learning_rate,
            weight_decay=self.config.weight_decay
        )

        total_steps = len(train_loader) * config.num_epochs
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=config.warmup_steps,
            num_training_steps=total_steps
        )
        
        self.criterion = nn.CrossEntropyLoss()
        self.best_val_acc = 0.0
        self.history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    
    def train_epoch(self):
        self.model.train()
        total_loss = 0.0
        correct = 0
        total = 0

        pbar = tqdm(self.train_loader, desc='Training')
        for batch in pbar:
            input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
            attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
            target_positions = batch['target_position'].to('cuda' if torch.cuda.is_available() else 'cpu')
            labels = batch['label'].to('cuda' if torch.cuda.is_available() else 'cpu')

            self.optimizer.zero_grad()
            logits = self.model(input_ids, attention_mask, target_positions)
            loss = self.criterion(logits, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()
            self.scheduler.step()

            predictions = torch.argmax(logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
            total_loss += loss.item()

            pbar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{correct/total:.4f}'
            })

        avg_loss = total_loss / len(self.train_loader)
        accuracy = correct / total

        return avg_loss, accuracy

    def evaluate(self):
        self.model.eval()
        total_loss = 0.0
        correct = 0
        total = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            pbar = tqdm(self.val_loader, desc='Evaluating')
            for batch in pbar:
                input_ids = batch['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
                attention_mask = batch['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
                target_positions = batch['target_position'].to('cuda' if torch.cuda.is_available() else 'cpu')
                labels = batch['label'].to('cuda' if torch.cuda.is_available() else 'cpu')

                logits = self.model(input_ids, attention_mask, target_positions)
                loss = self.criterion(logits, labels)

                predictions = torch.argmax(logits, dim=1)
                correct += (predictions == labels).sum().item()
                total += labels.size(0)
                total_loss += loss.item()

                all_preds.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

                pbar.set_postfix({
                    'loss': f'{loss.item():.4f}',
                    'acc': f'{correct/total:.4f}'
                })

        avg_loss = total_loss / len(self.val_loader)
        accuracy = correct / total

        return avg_loss, accuracy, all_preds, all_labels

    def train(self, save_dir="./wsd_model"):
        save_path = Path(save_dir)
        save_path.mkdir(parents=True, exist_ok=True)

        for epoch in range(1, self.config.num_epochs + 1):
            train_loss, train_acc = self.train_epoch()
            print(f"Epoch {epoch}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}")

            val_loss, val_acc, val_preds, val_labels = self.evaluate()
            print(f"Epoch {epoch}: Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")
            
            self.history['train_loss'].append(train_loss)
            self.history['train_acc'].append(train_acc)
            self.history['val_loss'].append(val_loss)
            self.history['val_acc'].append(val_acc)

            if val_acc > self.best_val_acc:
                self.best_val_acc = val_acc
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'val_acc': val_acc,
                    'config': self.config
                }, save_path / 'best_model.pt')
                print(f"Saved best model with Val Acc={val_acc:.4f} at epoch {epoch}.")
            
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'config': self.config,
            'history': self.history
        }, save_path / 'final_model.pt')

        with open(save_path / 'history.json', 'w') as f:
            json.dump(self.history, f, indent=2)

        print("Training complete. Best Val Acc: {:.4f}".format(self.best_val_acc))
        return self.history

In [16]:
trainer = Trainer(wsd_classifier, train_loader, val_loader, config)
history = trainer.train(save_dir="./wsd_model")

Training: 100%|██████████| 27/27 [00:05<00:00,  4.66it/s, loss=1.0059, acc=0.3163]


Epoch 1: Train Loss=1.2122, Train Acc=0.3163


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 15.04it/s, loss=1.3049, acc=0.3704]


Epoch 1: Val Loss=1.0620, Val Acc=0.3704
Saved best model with Val Acc=0.3704 at epoch 1.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.39it/s, loss=0.9814, acc=0.5116]


Epoch 2: Train Loss=0.9137, Train Acc=0.5116


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 15.16it/s, loss=1.1166, acc=0.4537]


Epoch 2: Val Loss=0.8579, Val Acc=0.4537
Saved best model with Val Acc=0.4537 at epoch 2.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.38it/s, loss=0.7336, acc=0.6256]


Epoch 3: Train Loss=0.7891, Train Acc=0.6256


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 15.06it/s, loss=1.0189, acc=0.5556]


Epoch 3: Val Loss=0.8207, Val Acc=0.5556
Saved best model with Val Acc=0.5556 at epoch 3.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.38it/s, loss=0.3729, acc=0.8093]


Epoch 4: Train Loss=0.5757, Train Acc=0.8093


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.77it/s, loss=0.9308, acc=0.7037]


Epoch 4: Val Loss=0.6801, Val Acc=0.7037
Saved best model with Val Acc=0.7037 at epoch 4.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.30it/s, loss=0.5809, acc=0.9070]


Epoch 5: Train Loss=0.3331, Train Acc=0.9070


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.67it/s, loss=0.8580, acc=0.7407]


Epoch 5: Val Loss=0.5875, Val Acc=0.7407
Saved best model with Val Acc=0.7407 at epoch 5.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.30it/s, loss=0.1285, acc=0.9605]


Epoch 6: Train Loss=0.1783, Train Acc=0.9605


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.70it/s, loss=0.6224, acc=0.8148]


Epoch 6: Val Loss=0.4595, Val Acc=0.8148
Saved best model with Val Acc=0.8148 at epoch 6.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.37it/s, loss=0.1115, acc=0.9907]


Epoch 7: Train Loss=0.0821, Train Acc=0.9907


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 15.03it/s, loss=0.6325, acc=0.8056]


Epoch 7: Val Loss=0.4925, Val Acc=0.8056


Training: 100%|██████████| 27/27 [00:05<00:00,  5.35it/s, loss=0.0688, acc=0.9953]


Epoch 8: Train Loss=0.0452, Train Acc=0.9953


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.82it/s, loss=0.6320, acc=0.8056]


Epoch 8: Val Loss=0.5456, Val Acc=0.8056


Training: 100%|██████████| 27/27 [00:05<00:00,  5.33it/s, loss=0.0104, acc=0.9953]


Epoch 9: Train Loss=0.0271, Train Acc=0.9953


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.76it/s, loss=0.6488, acc=0.8241]


Epoch 9: Val Loss=0.5510, Val Acc=0.8241
Saved best model with Val Acc=0.8241 at epoch 9.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.33it/s, loss=0.0122, acc=0.9977]


Epoch 10: Train Loss=0.0188, Train Acc=0.9977


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.84it/s, loss=0.6841, acc=0.8148]


Epoch 10: Val Loss=0.5798, Val Acc=0.8148
Training complete. Best Val Acc: 0.8241


In [17]:
ari_config = WSDConfig(model_name="Jacobo/aristoBERTo")
ari_tokenizer = AutoTokenizer.from_pretrained(ari_config.model_name)
ari_model = AutoModel.from_pretrained(ari_config.model_name).to('cuda' if torch.cuda.is_available() else 'cpu')

Some weights of BertModel were not initialized from the model checkpoint at Jacobo/aristoBERTo and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
ari_wsd_classifier = WSDClassifier(bert_model=ari_model, num_senses=ds.num_senses, dropout=ari_config.dropout)
ari_wsd_classifier = ari_wsd_classifier.to('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:
ari_trainer = Trainer(ari_wsd_classifier, train_loader, val_loader, ari_config)
history = ari_trainer.train(save_dir="./ari_wsd_model")

Training: 100%|██████████| 27/27 [00:05<00:00,  5.38it/s, loss=1.0686, acc=0.4209]


Epoch 1: Train Loss=1.0208, Train Acc=0.4209


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 15.03it/s, loss=0.7826, acc=0.5000]


Epoch 1: Val Loss=0.9187, Val Acc=0.5000
Saved best model with Val Acc=0.5000 at epoch 1.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.40it/s, loss=0.8848, acc=0.4512]


Epoch 2: Train Loss=0.9413, Train Acc=0.4512


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 15.19it/s, loss=0.8487, acc=0.4722]


Epoch 2: Val Loss=0.8728, Val Acc=0.4722


Training: 100%|██████████| 27/27 [00:05<00:00,  5.39it/s, loss=0.9569, acc=0.5767]


Epoch 3: Train Loss=0.8256, Train Acc=0.5767


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.93it/s, loss=0.7905, acc=0.5278]


Epoch 3: Val Loss=0.8194, Val Acc=0.5278
Saved best model with Val Acc=0.5278 at epoch 3.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.37it/s, loss=0.5684, acc=0.7047]


Epoch 4: Train Loss=0.6891, Train Acc=0.7047


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 15.10it/s, loss=0.8290, acc=0.5093]


Epoch 4: Val Loss=0.8125, Val Acc=0.5093


Training: 100%|██████████| 27/27 [00:05<00:00,  5.35it/s, loss=0.3994, acc=0.8605]


Epoch 5: Train Loss=0.4999, Train Acc=0.8605


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.94it/s, loss=0.5971, acc=0.6759]


Epoch 5: Val Loss=0.7001, Val Acc=0.6759
Saved best model with Val Acc=0.6759 at epoch 5.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.36it/s, loss=0.3193, acc=0.9535]


Epoch 6: Train Loss=0.3405, Train Acc=0.9535


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.83it/s, loss=0.6933, acc=0.6481]


Epoch 6: Val Loss=0.6553, Val Acc=0.6481


Training: 100%|██████████| 27/27 [00:05<00:00,  5.32it/s, loss=0.1584, acc=0.9721]


Epoch 7: Train Loss=0.2092, Train Acc=0.9721


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.81it/s, loss=0.6292, acc=0.7315]


Epoch 7: Val Loss=0.5636, Val Acc=0.7315
Saved best model with Val Acc=0.7315 at epoch 7.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.31it/s, loss=0.1061, acc=0.9791]


Epoch 8: Train Loss=0.1408, Train Acc=0.9791


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.69it/s, loss=0.7203, acc=0.7500]


Epoch 8: Val Loss=0.5468, Val Acc=0.7500
Saved best model with Val Acc=0.7500 at epoch 8.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.31it/s, loss=0.0494, acc=0.9953]


Epoch 9: Train Loss=0.0990, Train Acc=0.9953


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.55it/s, loss=0.6530, acc=0.7593]


Epoch 9: Val Loss=0.5021, Val Acc=0.7593
Saved best model with Val Acc=0.7593 at epoch 9.


Training: 100%|██████████| 27/27 [00:05<00:00,  5.32it/s, loss=0.0419, acc=0.9930]


Epoch 10: Train Loss=0.0742, Train Acc=0.9930


Evaluating: 100%|██████████| 7/7 [00:00<00:00, 14.81it/s, loss=0.6573, acc=0.7870]


Epoch 10: Val Loss=0.4881, Val Acc=0.7870
Saved best model with Val Acc=0.7870 at epoch 10.
Training complete. Best Val Acc: 0.7870
