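- Unzip the inscription-model archive so that the model is available as the folder `model-best`.
- Prepare a CSV file named `inscriptions.csv` with a `text` column containing the inscriptions in Leiden notation.
- Run this script.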

In [None]:
# CELL 1: Copy model to Colab
# `files` is Colab's upload/download helper; it is reused at the end of the
# script to download the predictions file.
from google.colab import files
# The return value is discarded, so the script relies on the uploaded content
# being available on disk afterwards (the model is later loaded from the
# relative path "model-best").
# NOTE(review): presumably the upload dialog selects individual files rather
# than a directory, so the model may need to be uploaded as an archive and
# extracted into `model-best/` first — confirm.
files.upload()  # Upload model-best/ folder

# CELL 2: Upload inscriptions
# The inference cell opens 'inscriptions.csv' from the working directory, so
# the uploaded file must carry exactly that name.
files.upload()  # Upload inscriptions.csv

# CELL 3: Run inference
import csv
import re
import spacy
import pandas as pd

class LeidenProcessor:
    """Convert Leiden conventions to clean transcription.

    The class is a stateless namespace: call ``LeidenProcessor.process(text)``
    to turn an inscription written with Leiden editorial marks (``[...]``
    restorations, ``abbr(eviation)`` expansions, ``/`` line breaks, ``?``
    uncertainty marks) into a plain, title-cased transcription.

    Note: the abbreviation tables below are consulted only when an
    abbreviation is followed by *empty* parentheses (``Q()``); when the
    parentheses carry an expansion (``Q(uintus)``) that expansion is used
    as-is, and bare abbreviations without parentheses are left untouched.
    """

    # Map abbreviations to their expansions
    # Capitalize proper nouns, keep others lowercase
    ABBREV_PROPER = {  # Names (capitalize)
        'Q': 'Quintus', 'C': 'Caius', 'M': 'Marcus', 'L': 'Lucius',
        'T': 'Titus', 'P': 'Publius', 'D': 'Dis', 'A': 'Aulus',
        'Cn': 'Gnaeus', 'TI': 'Tiberius', 'S': 'Sextus', 'N': 'Numerius',
    }

    ABBREV_COMMON = {  # Common words (lowercase)
        'a': 'animo', 'l': 'libens', 'v': 'votum', 'p': 'posuit',
        's': 'sacrum', 'f': 'fecit', 'm': 'mensis', 'an': 'anno',
        'ann': 'annorum', 'h': 'hic', 'e': 'est', 'pos': 'posuit',
        't': 'tibi', 'd': 'de', 'sit': 'sit'
    }

    # Short function words that stay lowercase unless they open the text.
    _LOWERCASE_WORDS = frozenset({'et', 'de', 'a', 'in', 'ex', 'ab'})

    # Patterns are compiled once instead of on every call to process().
    _DAMAGE_COUNT_RE = re.compile(r'\[\d+\]')        # [3] -> n unknown letters
    _RESTORED_RE = re.compile(r'\[([^\]]*)\]')       # [text] -> text
    _BROKEN_WORD_RE = re.compile(r'([a-z])/([a-z])', re.IGNORECASE)
    _BROKEN_AFTER_BRACKET_RE = re.compile(r'(\])/([a-z])', re.IGNORECASE)
    _ABBREV_RE = re.compile(r'([A-Za-z]+)\(([^)]*)\)')  # X(expansion)
    _WHITESPACE_RE = re.compile(r'\s+')
    _SQUARE_BRACKET_RE = re.compile(r'[\[\]]')

    @staticmethod
    def _expand_abbrev(match):
        """Return the replacement text for one ``abbr(expansion)`` match."""
        abbrev = match.group(1)
        expansion = match.group(2)

        # If expansion provided in parentheses, use it (case preserved)
        if expansion:
            return abbrev + expansion

        # Empty parentheses: try proper-noun abbreviations first
        # (case-sensitive, so 'M' -> 'Marcus' but 'm' -> 'mensis').
        if abbrev in LeidenProcessor.ABBREV_PROPER:
            return LeidenProcessor.ABBREV_PROPER[abbrev]

        # Then common-word abbreviations (case-insensitive)
        common = LeidenProcessor.ABBREV_COMMON.get(abbrev.lower())
        if common is not None:
            return common

        # Return original if not found
        return abbrev

    @staticmethod
    def process(leiden_text):
        """Full pipeline: Leiden → clean transcription.

        Args:
            leiden_text: Inscription text in Leiden notation. ``None`` and
                the empty string are both accepted.

        Returns:
            The cleaned transcription as a single space-separated string;
            ``""`` when the input is empty or ``None``.
        """
        if not leiden_text:
            return ''

        cls = LeidenProcessor

        # Step 1: Remove damage markers [3] (n unknown letters)
        text = cls._DAMAGE_COUNT_RE.sub('', leiden_text)

        # Step 2: Remove question marks and uncertain markers
        text = text.replace('?', '')
        text = cls._RESTORED_RE.sub(r'\1', text)  # [text] → text

        # Step 3: Join words broken across lines, e.g. "Gem/ellian" or
        # "ann]or/um". Only a slash directly between letters (or between an
        # unmatched "]" and a letter) is treated as a word-internal break;
        # a slash surrounded by spaces is handled in step 5.
        text = cls._BROKEN_WORD_RE.sub(r'\1\2', text)
        text = cls._BROKEN_AFTER_BRACKET_RE.sub(r'\1\2', text)

        # Step 4: Expand abbreviations written as X(expansion)
        text = cls._ABBREV_RE.sub(cls._expand_abbrev, text)

        # Step 5: Clean line break markers and multiple spaces
        text = text.replace('/', ' ')
        text = text.replace('\\', ' ')
        text = cls._WHITESPACE_RE.sub(' ', text).strip()

        # Step 6: Remove remaining (unmatched) square brackets
        text = cls._SQUARE_BRACKET_RE.sub('', text)

        # Step 7: Title-case the words, keeping a few function words
        # lowercase. The first word is never forced to lowercase, and a word
        # that already starts with a capital keeps its exact casing:
        # str.capitalize() would lowercase the remaining letters and turn
        # e.g. "IIvir" into "Iivir".
        result = []
        for index, word in enumerate(text.split()):
            if index > 0 and word.lower() in cls._LOWERCASE_WORDS:
                result.append(word.lower())
            elif word[0].isupper():  # Already capitalized (likely a name)
                result.append(word)
            else:
                result.append(word.capitalize())

        return ' '.join(result)

# Load model
nlp = spacy.load("model-best")

# Process CSV: clean every inscription, run the model on the cleaned text and
# collect the recognised entities.
predictions = []
# newline='' lets the csv module do its own line-ending handling, so quoted
# fields that contain line breaks are parsed correctly. 'utf-8-sig' reads
# plain UTF-8 as well and strips a leading byte-order mark that would
# otherwise become part of the first column name.
with open('inscriptions.csv', 'r', encoding='utf-8-sig', newline='') as f:
    for row in csv.DictReader(f):
        # DictReader fills cells missing from a short row with None; treat
        # those like an empty inscription instead of crashing on .strip().
        leiden = (row['text'] or '').strip()
        trans = LeidenProcessor.process(leiden)
        doc = nlp(trans)

        entities = [{"text": ent.text, "label": ent.label_} for ent in doc.ents]
        predictions.append({"leiden": leiden, "text": trans, "entities": entities})

# Export: flatten every prediction into one table row, with the entities
# rendered as "text (LABEL)" items separated by " | ".
rows = []
for pred in predictions:
    entity_labels = [f"{ent['text']} ({ent['label']})" for ent in pred["entities"]]
    rows.append({
        "leiden": pred["leiden"],
        "transcription": pred["text"],
        "entities": " | ".join(entity_labels),
    })

df = pd.DataFrame(rows)

# Write the table without the index column, then pass it to Colab's download helper.
df.to_csv("predictions.csv", index=False)
files.download("predictions.csv")

print(f"✅ Processed {len(predictions)} inscriptions")
print(df.head())