## Data and code prep

In [1]:
!pip install transformers peft python-docx openai anthropic bitsandbytes
!wget https://tufts.box.com/shared/static/bq3nsjeg9pg4wylt4716pjagh0cz5a7t.xml -O tlg0012.tlg002.perseus-grc2.xml
!wget https://tufts.box.com/shared/static/humiph3c9g148u3vxrgw6rpr0f9vjcyp.docx -O "Aligned Odyssey 5.docx"

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting anthropic
  Downloading anthropic-0.45.2-py3-none-any.whl.metadata (23 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (fr

In [2]:
%%writefile wla.py
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from docx import Document
from bs4 import BeautifulSoup
import re
import random
from typing import List, Tuple, NamedTuple, Set
from dataclasses import dataclass
from tqdm import tqdm
import openai
import anthropic
from enum import Enum

PROMPT = """
# Word Alignment Task
You are a helpful AI assistant uniquely skilled in aligning words between two languages. You will be given a sentence in one language and a sentence in another language. You will be asked to align the words in the two sentences. Multiple words in Language 1 may be aligned to single words in Language 2, but multiple words in Language 2 may NOT be aligned to multiple words in Language 1.
Simply return your alignment with no other comments or explanations. If you are unable to align a word, please leave it unaligned. If you are unsure about a word, please leave it unaligned.

## Example
{example}

## New text to align
### Language 1
{lang1}
### Language 2
{lang2}
### Alignment
""".strip()

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

@dataclass
class Alignment:
    english: str
    greek: str
    start_idx: int
    end_idx: int

class MatchType(Enum):
    COMPLETE = "complete"
    PARTIAL = "partial"
    WRONG = "wrong"
    UNMATCHED_GOLD = "unmatched_gold"

@dataclass
class AlignmentMatch:
    pred_alignment: Alignment
    gold_alignment: Alignment | None
    match_type: MatchType
    overlap_score: float = 0.0
    notes: str = ""

class AlignmentScore(NamedTuple):
    complete: int
    partial: int
    wrong: int
    precision: float
    recall: float
    f1: float
    accuracy: float
    weighted_accuracy: float
    total_alignments: int
    total_gold: int
    detailed_matches: List[AlignmentMatch]
    unmatched_gold: List[Alignment]

class Evaluator:
    def __init__(self) -> None:
        pass

    def parse_alignment(self, text: str) -> List[Alignment]:
        alignments = []
        current_pos = 0

        # Split by spaces but keep brackets together
        tokens = re.findall(r'\S+(?:\[[^\]]*\])?', text)

        for token in tokens:
            match = re.match(r'(.*?)\[(.*?)\]', token)
            if match:
                english, greek = match.groups()
                # Handle empty alignments marked with 0
                if greek == '0':
                    continue
                alignments.append(Alignment(
                    english=english.strip('.,?!'),
                    greek=greek.strip('.,?!'),
                    start_idx=current_pos,
                    end_idx=current_pos + len(english)
                ))
            current_pos += len(token) + 1  # +1 for space

        return alignments

    def calculate_overlap(self, align1: Alignment, align2: Alignment) -> float:
        """Calculate the overlap between two alignments."""
        start = max(align1.start_idx, align2.start_idx)
        end = min(align1.end_idx, align2.end_idx)
        if start >= end:
            return 0.0

        overlap_length = end - start
        total_length = max(align1.end_idx, align2.end_idx) - min(align1.start_idx, align2.start_idx)
        return overlap_length / total_length

    def evaluate_alignments(self, pred_text: str, gold_text: str) -> AlignmentScore:
        """Evaluate predicted alignments against gold standard."""
        pred_aligns = self.parse_alignment(pred_text)
        gold_aligns = self.parse_alignment(gold_text)

        complete = 0
        partial = 0
        wrong = 0

        detailed_matches: List[AlignmentMatch] = []
        matched_gold: Set[int] = set()

        # Track matched gold alignments to avoid double-counting
        matched_gold = set()

        for pred in pred_aligns:
            best_match = None
            best_score = 0
            best_gold_idx = None

            for i, gold in enumerate(gold_aligns):
                if i in matched_gold:
                    continue

                # Check for exact match
                if (pred.english == gold.english and pred.greek == gold.greek):
                    complete += 1
                    matched_gold.add(i)
                    best_match = None
                    detailed_matches.append(AlignmentMatch(
                        pred_alignment=pred,
                        gold_alignment=gold,
                        match_type=MatchType.COMPLETE,
                        overlap_score=1.0,
                        notes="Exact match"
                        ))
                    break

                # Check for partial match
                overlap = self.calculate_overlap(pred, gold)
                if overlap > 0:
                    greek_match = ((pred.greek in gold.greek) or (gold.greek in pred.greek))
                    if greek_match and (overlap > best_score):
                        best_score = overlap
                        best_match = gold
                        best_gold_idx = i

            if best_match is not None:
                partial += 1
                matched_gold.add(best_gold_idx)
                notes = []
                if pred.english != best_match.english:
                    notes.append(f"English text differs: '{pred.english}' vs '{best_match.english}'")
                if pred.greek != best_match.greek:
                    notes.append(f"Greek text differs: '{pred.greek}' vs '{best_match.greek}'")

                detailed_matches.append(AlignmentMatch(
                    pred_alignment=pred,
                    gold_alignment=best_match,
                    match_type=MatchType.PARTIAL,
                    overlap_score=best_score,
                    notes="; ".join(notes)
                ))

            elif best_match is None and complete == 0:
                wrong += 1
                detailed_matches.append(AlignmentMatch(
                    pred_alignment=pred,
                    gold_alignment=None,
                    match_type=MatchType.WRONG,
                    notes="No match found"
                ))

        unmatched_gold = [
            gold for i, gold in enumerate(gold_aligns)
            if i not in matched_gold
        ]

        for gold in unmatched_gold:
            detailed_matches.append(AlignmentMatch(
                pred_alignment=None,
                gold_alignment=gold,
                match_type=MatchType.UNMATCHED_GOLD,
                notes="Gold standard alignment not found in prediction"
            ))

        total_pred = len(pred_aligns)
        total_gold = len(gold_aligns)

        precision = (complete + 0.5 * partial) / total_pred if total_pred > 0 else 0
        recall = (complete + 0.5 * partial) / total_gold if total_gold > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        accuracy = complete / total_gold if total_gold > 0 else 0
        weighted_accuracy = (complete + 0.5 * partial) / total_gold if total_gold > 0 else 0

        return AlignmentScore(
            complete=complete,
            partial=partial,
            wrong=wrong,
            precision=precision,
            recall=recall,
            f1=f1,
            accuracy=accuracy,
            weighted_accuracy=weighted_accuracy,
            total_alignments=total_pred,
            total_gold=total_gold,
            detailed_matches=detailed_matches,
            unmatched_gold=unmatched_gold
        )

    def format_score(score: AlignmentScore) -> str:
        """Format alignment scores as a readable string."""
        return f"""Alignment Evaluation Results:
    Complete matches: {score.complete}
    Partial matches: {score.partial}
    Wrong matches: {score.wrong}
    Precision: {score.precision:.3f}
    Recall: {score.recall:.3f}
    F1 Score: {score.f1:.3f}"""


class WordLevelAligner:
    def __init__(self,
                 model_path: str,
                 parrish_path:str="./Aligned Odyssey 5.docx",
                 xml_path:str="tlg0012.tlg002.perseus-grc2.xml",
                 default_oai_model='gpt-4o',
                 default_anthropic_model='claude-3-5-sonnet-20241022'
                 ) -> None:
        self.model_path = model_path
        if self.model_path == 'openai':
            print("Using OpenAI API. Make sure to use os.environ['OPENAI_API_KEY'] to set your API key.")
            self.client = openai.Client()
        elif self.model_path == 'anthropic':
            print("Using Anthropic API. Make sure to use os.environ['ANTHROPIC_API_KEY'] to set your API key.")
            self.client = anthropic.Anthropic()
        elif isinstance(self.model_path, tuple):
            self.model = model_path[0]
            self.tokenizer = model_path[1]
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config, device_map="auto")

        self.parrish_path = parrish_path
        self.xml_path = xml_path
        self.doc = Document(parrish_path)
        self.xml = open(xml_path).read()
        self.default_oai_model = default_oai_model
        self.default_anthropic_model = default_anthropic_model

    def get_examples(self) -> List[Tuple[str, str, str]]:
        """
        Formats examples from the source text. Hrded coded some values to get the first 200 examples.
        Returns a list of tuples with the following structure: (Language 1, Language 2, Alignment)
        """
        first_200 = [d.text for d in self.doc.paragraphs[3:203]]
        od = BeautifulSoup(self.xml, "xml")
        line_tags = od.find('div', attrs={'n':'5'}).find_all('l')
        examples = [(re.sub(r'\s+', ' ', lt.text.strip()), re.sub('\[.*?\]','', first_200[int(lt['n'])-1]).strip(), first_200[int(lt['n'])-1].strip()) for lt in line_tags[:200]]
        return examples

    def format_example(self, example: Tuple[str, str, str]) -> str:
        """
        Puts the example into the prompt format.
        Returns a string in the format for the prompt.
        """
        formatted_example = []
        for ex in example:
            formatted_example.append(f"""
            ### Language 1\n{ex[0].strip()}\n### Language 2\n{ex[1].strip()}\n### Alignment\n{ex[2]}
            """.strip())
        return '\n'.join(formatted_example).strip()

    def fill_prompt(self, examples: List[Tuple[str, str, str]], test: Tuple[str, str, str], n_shot: int) -> Tuple[str, str]:
        """
        Fills the prompt with the examples and the test.
        Returns a tuple with the filled prompt and the correct answer.
        """
        example = self.format_example(random.sample(examples, n_shot))
        filled_prompt = re.sub('\{example\}', example, PROMPT)
        filled_prompt = re.sub('\{lang1\}', test[0], filled_prompt)
        filled_prompt = re.sub('\{lang2\}', test[1], filled_prompt)
        correct = test[2]
        return filled_prompt, correct

    def align_words(self, prompt: str, max_new_tokens:int=1024) -> str:
        """
        Aligns the words in the two languages.
        Returns the alignment.
        """
        messages = [
            {'role':'user', 'content':prompt}
        ]

        if self.model_path == 'openai':
            pred = self.client.chat.completions.create(
                model=self.default_oai_model,
                messages=messages,
            )
            pred = pred.choices[0].message.content
        elif self.model_path == 'anthropic':
            pred = self.client.messages.create(
                model=self.default_anthropic_model,
                messages=messages,
                max_tokens=max_new_tokens
            )
            pred = pred.content[0].text
        else:
            input_ids = self.tokenizer.apply_chat_template(messages, return_tensors="pt").to('cuda')
            outputs = self.model.generate(input_ids, max_new_tokens=max_new_tokens, pad_token_id=self.tokenizer.eos_token_id)
            raw_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            pred = re.split("### Alignmentassistant\n\n", raw_output)[-1].strip()
        return pred

    def evaluate(self, to_test:int=10, n_shot:int=5, max_new_tokens:int=1024) -> List[Tuple[str, str, AlignmentScore]]:
        """
        Evaluates the model on the word alignment task.
        Returns a list of alignment scores zipped with the test examples.
        """
        examples = self.get_examples()

        # Should make this changeable
        test = examples[100:]
        examples = examples[:100]

        _all = [self.fill_prompt(examples, t, n_shot) for t in test[:to_test]]
        all_prompts = [a[0] for a in _all]
        all_correct = [a[1] for a in _all]
        evaluator = Evaluator()

        results = []
        for prompt in tqdm(all_prompts):
            pred = self.align_words(prompt, max_new_tokens)
            correct = all_correct.pop(0)
            score = evaluator.evaluate_alignments(pred, correct)
            results.append((pred, correct, score))
        return results


Writing wla.py


## Evaluating the finetuned model

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

double_quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained("pnadel/llama38B_grc2eng_alignment", quantization_config=double_quant_config).to("cuda")
tokenizer = AutoTokenizer.from_pretrained("pnadel/llama38B_grc2eng_alignment")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

In [4]:
from wla import WordLevelAligner

wla = WordLevelAligner(model_path=(model, tokenizer))

In [5]:
from collections import defaultdict

all_results = defaultdict(list)
for to_test in [20]:
    print("Testing ", to_test, "examples")
    for n in [1,2,3,4,5]:
        print("Testing ", n, "shot")
        results = wla.evaluate(to_test, n)
        all_results[(to_test, n)].extend(results)
        print("Average weighted accuracy: ", sum([r[-1].weighted_accuracy for r in results])/len(results))
        print("Average F1 score: ", sum([r[-1].f1 for r in results])/len(results))

Testing  20 examples
Testing  1 shot


  0%|          | 0/20 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 20/20 [02:55<00:00,  8.78s/it]


Average weighted accuracy:  0.6051884920634921
Average F1 score:  0.5215971807690074
Testing  2 shot


100%|██████████| 20/20 [03:03<00:00,  9.15s/it]


Average weighted accuracy:  0.64921626984127
Average F1 score:  0.5798954161163914
Testing  3 shot


100%|██████████| 20/20 [03:26<00:00, 10.33s/it]


Average weighted accuracy:  0.6874503968253969
Average F1 score:  0.5933678910958322
Testing  4 shot


100%|██████████| 20/20 [03:27<00:00, 10.38s/it]


Average weighted accuracy:  0.7448412698412699
Average F1 score:  0.6511453950676087
Testing  5 shot


100%|██████████| 20/20 [03:38<00:00, 10.94s/it]

Average weighted accuracy:  0.6937400793650794
Average F1 score:  0.6228629132305603





## Extra: Thucydides examples

In [6]:
# Thuc. 5.16.1
thuc_grc1 = """
ἐπειδὴ δὲ καὶ ἡ ἐν Ἀμφιπόλει ἧσσα τοῖς Ἀθηναίοις ἐγεγένητο καὶ ἐτεθνήκει Κλέων τε καὶ Βρασίδας, οἵπερ ἀμφοτέρωθεν μάλιστα ἠναντιοῦντο τῇ εἰρήνῃ, ὁ μὲν διὰ τὸ εὐτυχεῖν τε καὶ τιμᾶσθαι ἐκ τοῦ πολεμεῖν, ὁ δὲ γενομένης ἡσυχίας καταφανέστερος νομίζων ἂν εἶναι κακουργῶν καὶ ἀπιστότερος διαβάλλων, τότε δὴ ἑκατέρᾳ τῇ πόλει σπεύδοντες τὰ μάλιστα τὴν ἡγεμονίαν Πλειστοάναξ τε ὁ Παυσανίου βασιλεὺς Λακεδαιμονίων καὶ Νικίας ὁ Νικηράτου, πλεῖστα τῶν τότε εὖ φερόμενος ἐν στρατηγίαις, πολλῷ δὴ μᾶλλον προυθυμοῦντο, Νικίας μὲν βουλόμενος, ἐν ᾧ ἀπαθὴς ἦν καὶ ἠξιοῦτο, διασώσασθαι τὴν εὐτυχίαν, καὶ ἔς τε τὸ αὐτίκα πόνων πεπαῦσθαι καὶ αὐτὸς καὶ τοὺς πολίτας παῦσαι καὶ τῷ μέλλοντι χρόνῳ καταλιπεῖν ὄνομα ὡς οὐδὲν σφήλας τὴν πόλιν διεγένετο, νομίζων ἐκ τοῦ ἀκινδύνου τοῦτο ξυμβαίνειν καὶ ὅστις ἐλάχιστα τύχῃ αὑτὸν παραδίδωσι, τὸ δὲ ἀκίνδυνον τὴν εἰρήνην παρέχειν, Πλειστοάναξ δὲ ὑπὸ τῶν ἐχθρῶν διαβαλλόμενος περὶ τῆς καθόδου, καὶ ἐς ἐνθυμίαν τοῖς Λακεδαιμονίοις αἰεὶ προβαλλόμενος ὑπ᾽ αὐτῶν, ὁπότε τι πταίσειαν, ὡς διὰ τὴν ἐκείνου κάθοδον παρανομηθεῖσαν ταῦτα ξυμβαίνοι.
""".strip()
thuc_eng1 = """
Now, however, after the Athenian defeat at Amphipolis, and the death of Cleon and Brasidas, who had been the two principal opponents of peace on either side—the latter from the success and honor which war gave him, the former because he thought that, if tranquillity were restored, his crimes would be more open to detection and his slanders less credited—the foremost candidates for power in either city, Pleistoanax, son of Pausanias, king of Lacedaemon, and Nicias, son of Niceratus, the most fortunate general of his time, each desired peace more ardently than ever. Nicias, while still happy and honored, wished to secure his good fortune, to obtain a present release from trouble for himself and his countrymen, and hand down to posterity a name as an ever-successful statesman, and thought the way to do this was to keep out of danger and commit himself as little as possible to fortune, and that peace alone made this keeping out of danger possible. Pleistoanax, again, was assailed by his enemies for his restoration, and regularly held up by them to the prejudice of his countrymen, upon every reverse that befell them, as though his unjust restoration were the cause.
"""

# Thuc. 1.33.3
thuc_grc2 = """
τὸν δὲ πόλεμον, δι᾽ ὅνπερ χρήσιμοι ἂν εἶμεν, εἴ τις ὑμῶν μὴ οἴεται ἔσεσθαι, γνώμης ἁμαρτάνει καὶ οὐκ αἰσθάνεται τοὺς Λακεδαιμονίους φόβῳ τῷ ὑμετέρῳ πολεμησείοντας καὶ τοὺς Κορινθίους δυναμένους παρ᾽ αὐτοῖς καὶ ὑμῖν ἐχθροὺς ὄντας καὶ προκαταλαμβάνοντας ἡμᾶς νῦν ἐς τὴν ὑμετέραν ἐπιχείρησιν, ἵνα μὴ τῷ κοινῷ ἔχθει κατ᾽ αὐτοὺς μετ᾽ ἀλλήλων στῶμεν μηδὲ δυοῖν φθάσαι ἁμάρτωσιν, ἢ κακῶσαι ἡμᾶς ἢ σφᾶς αὐτοὺς βεβαιώσασθαι.
""".strip()
thuc_eng2 = """
But it will be urged that it is only in the case of a war that we shall be found useful. To this we answer that if any of you imagine that the war is far off, he is grievously mistaken, and is blind to the fact that Lacedaemon regards you with jealousy and desires war, and that Corinth is powerful there,—the same, remember, that is your enemy, and is even now trying to subdue us as a preliminary to attacking you. And this she does to prevent our becoming united by a common enmity, and her having us both on her hands, and also to insure getting the start of you in one of two ways, either by crippling our power or by making its strength her own.
""".strip()

# Thuc. 7.69.2
thuc_grc3 = """
ὁ δὲ Νικίας ὑπὸ τῶν παρόντων ἐκπεπληγμένος καὶ ὁρῶν οἷος ὁ κίνδυνος καὶ ὡς ἐγγὺς ἤδη [ἦν], ἐπειδὴ καὶ ὅσον οὐκ ἔμελλον ἀνάγεσθαι, καὶ νομίσας, ὅπερ πάσχουσιν ἐν τοῖς μεγάλοις ἀγῶσι, πάντα τε ἔργῳ ἔτι σφίσιν ἐνδεᾶ εἶναι καὶ λόγῳ αὐτοῖς οὔπω ἱκανὰ εἰρῆσθαι, αὖθις τῶν τριηράρχων ἕνα ἕκαστον ἀνεκάλει, πατρόθεν τε ἐπονομάζων καὶ αὐτοὺς ὀνομαστὶ καὶ φυλήν, ἀξιῶν τό τε καθ᾽ ἑαυτόν, ᾧ ὑπῆρχε λαμπρότητός τι, μὴ προδιδόναι τινὰ καὶ τὰς πατρικὰς ἀρετάς, ὧν ἐπιφανεῖς ἦσαν οἱ πρόγονοι, μὴ ἀφανίζειν, πατρίδος τε τῆς ἐλευθερωτάτης ὑπομιμνῄσκων καὶ τῆς ἐν αὐτῇ ἀνεπιτάκτου πᾶσιν ἐς τὴν δίαιταν ἐξουσίας, ἄλλα τε λέγων ὅσα ἐν τῷ τοιούτῳ ἤδη τοῦ καιροῦ ὄντες ἄνθρωποι οὐ πρὸς τὸ δοκεῖν τινὶ ἀρχαιολογεῖν φυλαξάμενοι εἴποιεν ἄν, καὶ ὑπὲρ ἁπάντων παραπλήσια ἔς τε γυναῖκας καὶ παῖδας καὶ θεοὺς πατρῴους προφερόμενα, ἀλλ᾽ ἐπὶ τῇ παρούσῃ ἐκπλήξει ὠφέλιμα νομίζοντες ἐπιβοῶνται.
""".strip()
thuc_eng3 = """
Meanwhile Nicias, appalled by the position of affairs, realizing the greatness and the nearness of the danger now that they were on the point of putting out from shore, and thinking, as men are apt to think in great crises, that when all has been done they have still something left to do, and when all has been said that they have not yet said enough, again called on the captains one by one, addressing each by his father's name and by his own, and by that of his tribe, and adjured them not to belie their own personal renown, or to obscure the hereditary virtues for which their ancestors were illustrious; he reminded them of their country, the freest of the free, and of the unfettered discretion allowed in it to all to live as they pleased; and added other arguments such as men would use at such a crisis, and which, with little alteration, are made to serve on all occasions alike—appeals to wives, children, and national gods,—without caring whether they are thought common-place, but loudly invoking them in the belief that they will be of use in the consternation of the moment.
""".strip()

# Thuc. 6.100.1
thuc_grc4 = """
ἐπειδὴ δὲ τοῖς Συρακοσίοις ἀρκούντως ἐδόκει ἔχειν ὅσα τε ἐσταυρώθη καὶ ᾠκοδομήθη τοῦ ὑποτειχίσματος, καὶ οἱ Ἀθηναῖοι αὐτοὺς οὐκ ἦλθον κωλύσοντες, φοβούμενοι μὴ σφίσι δίχα γιγνομένοις ῥᾷον μάχωνται, καὶ ἅμα τὴν καθ᾽ αὑτοὺς περιτείχισιν ἐπειγόμενοι, οἱ μὲν Συρακόσιοι φυλὴν μίαν καταλιπόντες φύλακα τοῦ οἰκοδομήματος ἀνεχώρησαν ἐς τὴν πόλιν, οἱ δὲ Ἀθηναῖοι τούς τε ὀχετοὺς αὐτῶν, οἳ ἐς τὴν πόλιν ὑπονομηδὸν ποτοῦ ὕδατος ἠγμένοι ἦσαν, διέφθειραν, καὶ τηρήσαντες τούς τε ἄλλους Συρακοσίους κατὰ σκηνὰς ὄντας ἐν μεσημβρίᾳ καί τινας καὶ ἐς τὴν πόλιν ἀποκεχωρηκότας καὶ τοὺς ἐν τῷ σταυρώματι ἀμελῶς φυλάσσοντας, τριακοσίους μὲν σφῶν αὐτῶν λογάδας καὶ τῶν ψιλῶν τινὰς ἐκλεκτοὺς ὡπλισμένους προύταξαν θεῖν δρόμῳ ἐξαπιναίως πρὸς τὸ ὑποτείχισμα, ἡ δ᾽ ἄλλη στρατιὰ δίχα, ἡ μὲν μετὰ τοῦ ἑτέρου στρατηγοῦ πρὸς τὴν πόλιν, εἰ ἐπιβοηθοῖεν, ἐχώρουν, ἡ δὲ μετὰ τοῦ ἑτέρου πρὸς τὸ σταύρωμα τὸ παρὰ τὴν πυλίδα.
""".strip()
thuc_eng4 = """
The Syracusans now thought the stockades and stonework of their counter-wall sufficiently far advanced; and as the Athenians, afraid of being divided and so fighting at a disadvantage, and intent upon their own wall, did not come out to interrupt them, they left one tribe to guard the new work and went back into the city. Meanwhile the Athenians destroyed their pipes of drinking-water carried underground into the city; and watching until the rest of the Syracusans were in their tents at midday, and some even gone away into the city, and those in the stockade keeping but indifferent guard, appointed three hundred picked men of their own, and some men picked from the light troops and armed for the purpose, to run suddenly as fast as they could to the counterwork, while the rest of the army advanced in two divisions, the one with one of the generals to the city in case of a sortie, the other with the other general to the stockade by the postern gate.
""".strip()

# Thuc. 6.43.1
thuc_grc5 = """
μετὰ δὲ ταῦτα τοσῇδε ἤδη τῇ παρασκευῇ Ἀθηναῖοι ἄραντες ἐκ τῆς Κερκύρας ἐς τὴν Σικελίαν ἐπεραιοῦντο, τριήρεσι μὲν ταῖς πάσαις τέσσαρσι καὶ τριάκοντα καὶ ἑκατόν, καὶ δυοῖν Ῥοδίοιν πεντηκοντόροιν （τούτων Ἀττικαὶ μὲν ἦσαν ἑκατόν, ὧν αἱ μὲν ἑξήκοντα ταχεῖαι, αἱ δ᾽ ἄλλαι στρατιώτιδες, τὸ δὲ ἄλλο ναυτικὸν Χίων καὶ τῶν ἄλλων ξυμμάχων）, ὁπλίταις δὲ τοῖς ξύμπασιν ἑκατὸν καὶ πεντακισχιλίοις （καὶ τούτων Ἀθηναίων μὲν αὐτῶν ἦσαν πεντακόσιοι μὲν καὶ χίλιοι ἐκ καταλόγου, ἑπτακόσιοι δὲ θῆτες ἐπιβάται τῶν νεῶν, ξύμμαχοι δὲ οἱ ἄλλοι ξυνεστράτευον, οἱ μὲν τῶν ὑπηκόων, οἱ δ᾽ Ἀργείων πεντακόσιοι καὶ Μαντινέων καὶ μισθοφόρων πεντήκοντα καὶ διακόσιοι）, τοξόταις δὲ τοῖς πᾶσιν ὀγδοήκοντα καὶ τετρακοσίοις （καὶ τούτων Κρῆτες οἱ ὀγδοήκοντα ἦσαν） καὶ σφενδονήταις Ῥοδίων ἑπτακοσίοις, καὶ Μεγαρεῦσι ψιλοῖς φυγάσιν εἴκοσι καὶ ἑκατόν, καὶ ἱππαγωγῷ μιᾷ τριάκοντα ἀγούσῃ ἱππέας.
""".strip()
thuc_eng5 = """
After this the Athenians weighed from Corcyra, and proceeded to cross to Sicily with an armament now consisting of one hundred and thirty-four galleys in all （besides two Rhodian fifty-oars） of which one hundred were Athenian vessels—sixty men-of-war, and forty troopships—and the remainder from Chios and the other allies; five thousand and one hundred heavy infantry in all, that is to say, fifteen hundred Athenian citizens from the rolls at Athens and seven hundred Thetes shipped as marines, and the rest allied troops, some of them Athenian subjects, and besides these five hundred Argives, and two hundred and fifty Mantineans serving for hire; four hundred and eighty archers in all, eighty of whom were Cretans, seven hundred slingers from Rhodes, one hundred and twenty light-armed exiles from Megara, and one horse-transport carrying thirty horses.
""".strip()

In [10]:
examples = wla.get_examples()
tests = [
    (thuc_grc1, thuc_eng1, ''),
    (thuc_grc2, thuc_eng2, ''),
    (thuc_grc3, thuc_eng3, ''),
    (thuc_grc4, thuc_eng4, ''),
    (thuc_grc5, thuc_eng5, ''),
]

prompts = [wla.fill_prompt(examples, test, n_shot=4)[0] for test in tests]

In [20]:
preds = {}
for i, prompt in enumerate(prompts):
    pred = wla.align_words(prompt)
    preds[i] = pred

In [21]:
import textwrap
for i in range(len(prompts)):
    print(f"Passage #{i+1}")
    print(*textwrap.wrap(preds[i]), sep="\n")
    print('-'*20)

Passage #1
After the Athenian[Ἀθηναίοις] defeat[ἐγεγένητο] at[Ἀμφιπόλει]
Amphipolis[Ἀμφιπόλει], and[καὶ] the[0] death[ἐτεθνήκει] of[0]
Cleon[Κλέων] and[τε καὶ] Brasidas[Βρασίδας], who[οἵπερ] had been[ἦν]
the[0] two[δὲ] principal[μάλιστα] opponents[ἠναντιοῦντο] of[τῇ]
peace[εἰρήνῃ] on[ἀμφοτέρωθεν] either[0] side[0]—the[ὁ] latter[δὲ]
from[διὰ] the[τὸ] success[εὐτυχεῖν] and[τε καὶ] honor[τιμᾶσθαι]
which[0] war[πολεμεῖν] gave[0] him[0], the[ὁ] former[μὲν] because[0]
he[0] thought[νομίζων] that[0], if[0] tranquillity[ἡσυχίας]
were[γενομένης] restored[γενομένης], his[0] crimes[κακουργῶν]
would[ἂν] be[εἶναι] more[μάλιστα] open[καταφανέστερος] to[0]
detection[διαβάλλων] and[δὲ] his[0] slanders[διαβάλλων]
less[ἀπιστότερος] credited[0], the[ὁ] foremost[πλεῖστα] candidates[τὰ]
for[0] power[ἡγεμονίαν] in[ἐν] either[ἑκατέρᾳ] city[πόλει],
Pleistoanax[Πλειστοάναξ], son[0] of[0] Pausanias[Παυσανίου],
king[βασιλεὺς] of[0] Lacedaemon[Λακεδαιμονίων], and[τε καὶ]
Nicias[Νικίας], son[0] of[0] Niceratus[Νικ