The goal is to fine-tune ModernBERT (https://huggingface.co/blog/modernbert) to return useful metadata for categorizing archaeological reports in Britain.

The source data came from the Archaeology Data Service: 1173 report metadata records returned by a simple search for 'Roman'. The 'Bibliography' and 'Url' columns were removed. The training data is in ads-roman-result.csv

You need a huggingface account and access token. If you set that up earlier, it will still be available here - click the key icon to check.

Get the training data:

In [None]:
# Download the ADS 'Roman' search-result export; wget saves it as ads-roman-result.csv
!wget https://gist.githubusercontent.com/shawngraham/d71c21640e1597d90c02123c290c9472/raw/372b865372872fdd78bf1f222bd2299b3217863b/ads-roman-result.csv

--2024-12-20 18:14:41--  https://gist.githubusercontent.com/shawngraham/d71c21640e1597d90c02123c290c9472/raw/372b865372872fdd78bf1f222bd2299b3217863b/ads-roman-result.csv
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1221046 (1.2M) [text/plain]
Saving to: ‘ads-roman-result.csv’


2024-12-20 18:14:43 (157 MB/s) - ‘ads-roman-result.csv’ saved [1221046/1221046]



Now we use the same preprocessing code to convert the csv into the json data we need. We don't use the jsonl data this time.

In [None]:
# for use with ADS csv download file that mixes comma delimited fields with semicolon delimited lists
import pandas as pd
import json
import re

def parse_delimited_field(field, delimiter=';'):
    """Split a delimited cell into a list of trimmed, non-empty strings.

    Values that ``pd.isna`` reports as missing give an empty list. Anything
    else is converted with ``str`` before being split on ``delimiter``.
    """
    if pd.isna(field):
        return []

    cleaned = []
    for chunk in str(field).split(delimiter):
        chunk = chunk.strip()
        if chunk:  # drop entries that were empty or whitespace only
            cleaned.append(chunk)
    return cleaned

def parse_location_data(location_str):
    """Parse a semicolon-delimited location string into a flat dict.

    Each ``key: value`` part becomes one entry, split on the first colon.
    Parts of the form ``EPSG:<code>:<value>`` are stored under the key
    ``EPSG_<code>`` so that coordinates in several reference systems do not
    overwrite one another. Parts without any colon are ignored.

    Args:
        location_str: Raw cell value; converted with ``str`` before parsing.

    Returns:
        dict mapping stripped keys to stripped values; empty when the input
        is missing according to ``pd.isna``.
    """
    if pd.isna(location_str):
        return {}

    location_data = {}
    for part in str(location_str).split(';'):
        pieces = [piece.strip() for piece in part.split(':', 2)]
        if len(pieces) == 3 and pieces[0] == 'EPSG':
            # "EPSG:<code>:<value>" -> one key per coordinate reference system
            location_data[f'EPSG_{pieces[1]}'] = pieces[2]
        elif len(pieces) >= 2:
            # Ordinary "key: value"; the value keeps any further colons
            key, value = part.split(':', 1)
            location_data[key.strip()] = value.strip()
        # A part with no colon carries no key and is skipped

    return location_data

def parse_period_subject(field):
    """Sort the "Period:" and "Subject:" entries of a cell into two lists.

    The cell is split with ``parse_delimited_field``. Entries starting with
    ``Period:`` or ``Subject:`` are stored without that marker, and all
    other entries are dropped.

    Returns:
        dict with the keys ``'periods'`` and ``'subjects'``, each a list of
        strings (both empty when the cell is missing).
    """
    result = {'periods': [], 'subjects': []}
    if pd.isna(field):
        return result

    prefix_to_bucket = (('Period:', 'periods'), ('Subject:', 'subjects'))
    for item in parse_delimited_field(field):
        for prefix, bucket in prefix_to_bucket:
            if item.startswith(prefix):
                result[bucket].append(item.replace(prefix, '').strip())
                break

    return result

def parse_identifiers(identifier_str):
    """Turn a cell of ``key: value`` identifiers into a dict.

    The cell is split with ``parse_delimited_field``. Every entry containing
    a colon is split on its first colon into a stripped key and a stripped
    value. Entries without a colon are ignored, and a repeated key keeps
    its last value.
    """
    if pd.isna(identifier_str):
        return {}

    pairs = (
        entry.split(':', 1)
        for entry in parse_delimited_field(identifier_str)
        if ':' in entry
    )
    return {name.strip(): content.strip() for name, content in pairs}

def transform_row_to_json(row):
    """Build the structured record for one CSV row.

    ``row`` is indexed by column name. Title and description are copied
    unchanged; the remaining columns go through the parsing helpers.
    """
    record = {'title': row['Title'], 'description': row['Description']}
    record['location'] = parse_location_data(row['Location'])
    record['period_subject'] = parse_period_subject(row['PeriodSubjectIntervention'])
    # NOTE(review): the column name is spelled 'Indentifiers' here; presumably
    # that matches the CSV header - confirm against the export before changing it.
    record['identifiers'] = parse_identifiers(row['Indentifiers'])
    record['people'] = parse_delimited_field(row['People'])
    return record

# this is for creating a dataset for finetuning smol, which is a different experiment
def format_for_training(entry):
    """Render one structured record as a chat-style training example.

    The prompt names the record's 'Named Location' (or 'unknown location')
    followed by its description. The expected answer is the record's
    subjects, periods, people and parish/county, serialised as JSON.

    Returns:
        dict with a single ``"text"`` key holding the system, user and
        assistant segments.
    """
    location = entry['location']
    classified = entry['period_subject']

    # Prompt: where the report is from, then its free-text description
    place = location.get('Named Location', 'unknown location')
    instruction = f"Please categorize this archaeological report metadata from {place}: {entry['description']}"

    # Target: the structured metadata the model should produce
    response = {
        "subjects": classified['subjects'],
        "periods": classified['periods'],
        "work_conducted_by": entry['people'],
        "location": {
            "civil_parish": location.get('Civil Parish', ''),
            "admin_county": location.get('Admin County', ''),
        },
    }

    segments = [
        "<|system|>You are a helpful archaeological assistant trained to categorize archaeological reports.\n",
        f"<|user|>{instruction}\n",
        f"<|assistant|>{json.dumps(response)}<|endoftext|>",
    ]
    return {"text": "".join(segments)}

def process_archaeological_csv(input_file, output_json="processed_data.json", output_training="training_data.jsonl"):
    """Convert an ADS search-result CSV into structured JSON and JSONL training data.

    The CSV is read with pandas, first with explicit quote/escape handling
    and, if that raises, again with pandas defaults. Debug information
    (column names, first row, shape, first two raw lines) is printed and the
    raw file is copied to ``debug_raw.txt``. Each row is then converted with
    ``transform_row_to_json`` and ``format_for_training`` and both results
    are written to disk.

    Args:
        input_file: Path of the CSV file to read (UTF-8).
        output_json: Path of the JSON file receiving the structured records.
        output_training: Path of the JSONL file receiving one training
            example per line.

    Returns:
        Tuple ``(processed_data, training_data)`` of two lists of dicts.

    Raises:
        Exception: Any error raised while reading, transforming or writing
            is re-raised after diagnostics have been printed.
    """
    df = None
    try:
        # Read CSV using pandas, handle quote char issues
        try:
            df = pd.read_csv(input_file,
                             quotechar='"',
                             escapechar='\\',
                             encoding='utf-8',
                             on_bad_lines='warn')
        except Exception as e:
            print(f"Error during initial read with quotes:\n{e}\n trying without quotes")
            try:
                df = pd.read_csv(input_file,
                                 encoding='utf-8',
                                 on_bad_lines='warn')
            except Exception as e:
                print(f"Error during initial read without quotes:\n{e}")
                raise

        # Remove leading/trailing spaces from column names
        df.columns = df.columns.str.strip()

        # Debug: Print column names and first row
        print("\nAvailable columns in CSV:")
        for col in df.columns:
            print(f"- {col}")

        print("\nFirst row of data:")
        if df.empty:
            # df.iloc[0] would raise IndexError on a header-only file
            print("(no data rows)")
        else:
            print(df.iloc[0].to_dict())

        # Save raw CSV content for debugging
        with open('debug_raw.txt', 'w', encoding='utf-8') as f:
            with open(input_file, 'r', encoding='utf-8') as src:
                f.write(src.read())

        print("\nSaved raw CSV content to debug_raw.txt for inspection")

        # Print shape of dataframe
        print(f"\nDataFrame shape: {df.shape}")

        # Print first few lines of raw file
        print("\nFirst few lines of raw file:")
        with open(input_file, 'r', encoding='utf-8') as f:
            print(f.readline())  # Header
            print(f.readline())  # First data row

        # Guard the empty case so a header-only CSV gives empty outputs
        # rather than failing inside apply()/tolist().
        if df.empty:
            processed_data = []
        else:
            processed_data = df.apply(transform_row_to_json, axis=1).tolist()

        training_data = [format_for_training(entry) for entry in processed_data]

        # Output processed data as json
        with open(output_json, "w", encoding="utf-8") as f:
            json.dump(processed_data, f, indent=2)

        # Output training data as jsonl (one JSON object per line)
        with open(output_training, "w", encoding="utf-8") as f:
            for entry in training_data:
                json.dump(entry, f)
                f.write('\n')

        return processed_data, training_data

    except Exception as e:
        print("\nError during processing:")
        print(f"Type of error: {type(e)}")
        print(f"Error message: {str(e)}")
        if df is not None:
            print("\nDataFrame Info:")
            df.info()  # info() prints its own report and returns None
        raise


#run it
if __name__ == "__main__":
    # Convert the downloaded CSV and show one record in each output format.
    # Any failure is reported as a single "Error: ..." line instead of a traceback.
    try:
        processed_data, training_data = process_archaeological_csv('ads-roman-result.csv')

        # Print example of processed data
        print("\nExample of processed JSON:")
        print(json.dumps(processed_data[0], indent=2))

        # Print the matching chat-style training example
        print("\nExample of training format:")
        print(training_data[0]['text'])

    except Exception as e:
        print(f"Error: {str(e)}")

The next block installs some more packages that we will need.

In [None]:
%%capture
# Install the Hugging Face `datasets` package; %%capture hides pip's output
!pip install datasets

The next block creates training examples by extracting entities from the structured data we made in the last step. It looks for several types of entities:

- Organizations (ORG) from "Creator" fields
- Locations (LOC) from location information
- Periods (PER) from time period data
- Subjects (SUBJ) from subject classifications
- Identifiers (ID) from identifier fields

It then formats these examples using either BIO or BILOU tagging schemes, which are standard approaches for NER tasks:

- BIO: Tags tokens as Beginning, Inside, or Outside of an entity
- BILOU: More detailed scheme that marks Beginning, Inside, Last, Outside, or Unit-length entities

This information is crucial for correctly finding what we're after when we use the fine-tuned model. The code turns our training data and these tags into tokens stored in a numerical array. When we then fine-tune, it is like we are tightening the focus onto the part of language that deals with archaeological metadata.


In [None]:
import json
from transformers import AutoTokenizer
import numpy as np

def _locate_entity(text, needle, label, search_from=0):
    """Return an entity dict for the first occurrence of *needle*, or None.

    Empty needles are rejected because ``str.find("")`` returns the search
    position, which would create a zero-length entity.
    """
    if not needle:
        return None
    start = text.find(needle, search_from)
    if start == -1:
        return None
    return {"start": start, "end": start + len(needle), "label": label}


def create_training_examples(data, tokenizer, label_map):
    """Build character-offset NER examples from the processed records.

    For each record the description is extended with three generated
    sentences (" Periods: ... Subjects: ... Identifiers: ...") and entity
    spans are recorded as character offsets into that combined text:

    - ORG: the name after "Creator:" in a ``people`` entry, when it occurs
      in the description (first occurrence only).
    - LOC: every string value of ``location`` that occurs in the
      description (first occurrence only).
    - PER / SUBJ / ID: each period, subject and "key: value" identifier
      inside the generated sentences.

    Args:
        data: Iterable of record dicts. ``description`` and
            ``period_subject`` are required; ``people``, ``location`` and
            ``identifiers`` are optional.
        tokenizer: Not used here; kept so existing callers keep working.
        label_map: Not used here; kept so existing callers keep working.

    Returns:
        List of ``{"text": str, "entities": [{"start", "end", "label"}]}``.
    """
    training_examples = []

    for item in data:
        description = item["description"]
        if not isinstance(description, str):
            # A missing description (None / NaN) is treated as empty text
            description = ""
        entities = []

        # Organization: take everything after "Creator:", without the
        # surrounding whitespace, so the span covers just the name.
        for person in item.get("people") or []:
            if "Creator:" in person:
                org_name = person.split("Creator:", 1)[1].strip()
                entity = _locate_entity(description, org_name, "ORG")
                if entity:
                    entities.append(entity)

        # Location: any location value mentioned verbatim in the description
        for value in (item.get("location") or {}).values():
            if isinstance(value, str):
                entity = _locate_entity(description, value, "LOC")
                if entity:
                    entities.append(entity)

        # Periods and Subjects
        periods = item["period_subject"].get("periods", [])
        subjects = item["period_subject"].get("subjects", [])
        identifiers = item.get("identifiers", {})

        period_text = f" Periods: {', '.join(periods)}. "
        subject_text = f"Subjects: {', '.join(subjects)}. "
        identifier_text = f"Identifiers: {', '.join([f'{k}: {v}' for k, v in identifiers.items()])}."
        full_text = description + period_text + subject_text + identifier_text

        # Each generated sentence is scanned left to right; the cursor moves
        # past every match (plus the ", " separator) so that repeated values
        # map to successive positions.
        cursor = len(description) + len(" Periods: ")
        for period in periods:
            entity = _locate_entity(full_text, period, "PER", cursor)
            if entity:
                entities.append(entity)
                cursor = entity["end"] + len(", ")

        cursor = len(description) + len(period_text) + len("Subjects: ")
        for subject in subjects:
            entity = _locate_entity(full_text, subject, "SUBJ", cursor)
            if entity:
                entities.append(entity)
                cursor = entity["end"] + len(", ")

        cursor = len(description) + len(period_text) + len(subject_text) + len("Identifiers: ")
        for key, value in identifiers.items():
            entity = _locate_entity(full_text, f"{key}: {value}", "ID", cursor)
            if entity:
                entities.append(entity)
                cursor = entity["end"] + len(", ")

        training_examples.append({"text": full_text, "entities": entities})

    return training_examples

def _expand_label_map(label_map, scheme):
    """Expand entity types into position tags ("B-ORG", "I-ORG", ...) with consecutive ids."""
    prefixes = ("B", "I") if scheme == "BIO" else ("B", "I", "L", "U")
    expanded = {"O": 0}  # Outside tag
    for label in label_map:
        if label == "O":
            continue
        for prefix in prefixes:
            expanded[f"{prefix}-{label}"] = len(expanded)
    return expanded


def _is_text_token(mask, offsets):
    """True for a real text token: attended to and covering at least one character."""
    start, end = offsets
    return bool(mask) and start is not None and end is not None and end > start


def format_for_ner(training_examples, tokenizer, label_map, max_length, scheme="BIO", ignore_index=-100):
    """
    Format training examples for NER with BIO or BILOU tagging schemes

    Each example is tokenized to exactly ``max_length`` tokens (padded or
    truncated). Tokens overlapping an entity's character span receive that
    entity's tags and every other text token receives "O". Padding and
    special tokens receive ``ignore_index`` so they can be excluded from
    the loss and from evaluation. When entities overlap, the one starting
    later overwrites the earlier tags.

    Args:
        training_examples: List of examples with text and entities
        tokenizer: HuggingFace tokenizer (must support return_offsets_mapping)
        label_map: Dictionary mapping entity types to IDs; only its keys and
            their order are used
        max_length: Maximum sequence length
        scheme: Tagging scheme, either "BIO" or "BILOU"
        ignore_index: Label written for padding and special tokens. Pass 0
            to label them "O" instead.

    Returns:
        Tuple ``(formatted_data, expanded_label_map)``: a list of dicts with
        ``input_ids``, ``attention_mask`` and ``labels``, and the tag -> id
        mapping used for the labels.

    Raises:
        ValueError: If ``scheme`` is neither "BIO" nor "BILOU".
    """
    if scheme not in ["BIO", "BILOU"]:
        raise ValueError("Scheme must be either 'BIO' or 'BILOU'")

    expanded_label_map = _expand_label_map(label_map, scheme)
    outside = expanded_label_map["O"]

    formatted_data = []
    for example in training_examples:
        text = example["text"]

        # Sort entities by start position for proper sequential labeling
        entities = sorted(example["entities"], key=lambda x: x["start"])

        # Tokenize text and obtain token ids
        encoding = tokenizer(text, return_offsets_mapping=True, padding="max_length",
                             truncation=True, max_length=max_length)
        input_ids = encoding["input_ids"]
        attention_mask = encoding["attention_mask"]
        offset_mapping = encoding["offset_mapping"]

        # Special and padding tokens carry zero-width offsets; only real
        # text tokens take part in labeling.
        text_tokens = [
            idx for idx, (mask, offsets) in enumerate(zip(attention_mask, offset_mapping))
            if _is_text_token(mask, offsets)
        ]
        labels = [ignore_index] * len(input_ids)
        for idx in text_tokens:
            labels[idx] = outside

        for entity in entities:
            start_char = entity["start"]
            end_char = entity["end"]
            label_name = entity["label"]

            if end_char <= start_char:  # zero-length span marks nothing
                continue

            # Tokens overlapping the half-open span [start_char, end_char).
            # The comparison with end_char is strict so the token that
            # begins right after the entity is not included.
            entity_tokens = [
                idx for idx in text_tokens
                if offset_mapping[idx][0] < end_char and offset_mapping[idx][1] > start_char
            ]
            if not entity_tokens:  # e.g. the entity was truncated away
                continue

            if scheme == "BILOU" and len(entity_tokens) == 1:
                labels[entity_tokens[0]] = expanded_label_map[f"U-{label_name}"]
                continue

            last_position = len(entity_tokens) - 1
            for position, token_idx in enumerate(entity_tokens):
                if position == 0:
                    prefix = "B"
                elif scheme == "BILOU" and position == last_position:
                    prefix = "L"
                else:
                    prefix = "I"
                labels[token_idx] = expanded_label_map[f"{prefix}-{label_name}"]

        formatted_data.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        })

    return formatted_data, expanded_label_map

 # Load JSON data
# Read the structured records written by the preprocessing step
with open("processed_data.json", "r") as f:
    data = json.load(f)

# Tokenizer of the base checkpoint that will be fine-tuned
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

# Entity types before expansion into B-/I- (or B-/I-/L-/U-) tags
label_map = {"O": 0, "ORG": 1, "LOC": 2, "PER": 3, "SUBJ": 4, "ID": 5}

# Character-offset entity annotations for every record
training_examples = create_training_examples(data, tokenizer, label_map)

# Token-level labels in both tagging schemes; only the BIO version is saved below
formatted_data_bio, bio_label_map = format_for_ner(
    training_examples, tokenizer, label_map, max_length=512, scheme="BIO"
)
formatted_data_bilou, bilou_label_map = format_for_ner(
    training_examples, tokenizer, label_map, max_length=512, scheme="BILOU"
)

# One .npy file per model input, each stacking the rows of all examples
for array_name in ("input_ids", "attention_mask", "labels"):
    np.save(f"{array_name}.npy", np.array([d[array_name] for d in formatted_data_bio]))

# Save the tag -> id map for training and inference
with open("label_map.json", "w") as f:
    json.dump(bio_label_map, f)  # or bilou_label_map if using BILOU scheme

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Now we're ready to fine tune, once we install one last piece of code we need:

In [None]:
# gotta use this right now since the release of modernbert hasn't been
# updated in the main transformers yet, apparently
%%capture
!pip install git+https://github.com/huggingface/transformers.git

#restart session then continue

In [None]:
### new finetuning
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.model_selection import train_test_split
import json

# 1. Read the tag -> id mapping written by the formatting step
with open("label_map.json", "r") as f:
    label_map = json.load(f)
num_labels = len(label_map)  # This will be 11 for BIO scheme with 5 entity types + O

# 2. Base checkpoint and tokenizer; the classification head is sized to the tag set
model_name = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=dict((str(tag_id), tag) for tag, tag_id in label_map.items()),
    label2id=dict(label_map.items()),
)

# 3. Load the arrays saved by the formatting step
max_length = 512
input_ids, attention_mask, labels = (
    np.load(path, allow_pickle=True)
    for path in ("input_ids.npy", "attention_mask.npy", "labels.npy")
)

# Hold out 20% of the examples for validation; the fixed seed keeps the split repeatable
(
    input_ids_train, input_ids_val,
    attention_mask_train, attention_mask_val,
    labels_train, labels_val,
) = train_test_split(input_ids, attention_mask, labels, test_size=0.2, random_state=42)

# Wrap both splits as Hugging Face Datasets
train_dataset = Dataset.from_dict(dict(
    input_ids=input_ids_train,
    attention_mask=attention_mask_train,
    labels=labels_train,
))

val_dataset = Dataset.from_dict(dict(
    input_ids=input_ids_val,
    attention_mask=attention_mask_val,
    labels=labels_val,
))

# 4. Define custom compute_metrics function for NER evaluation

# Map label ids back to their tag names
label_map_inverse = {tag_id: tag for tag, tag_id in label_map.items()}

def compute_metrics(p):
    """Token-level accuracy over the positions whose gold label is not -100.

    ``p`` unpacks into ``(predictions, labels)``. The predicted class of
    each token is the argmax over the last axis of ``predictions``.
    Returns ``{"accuracy": value}``, with 0 when no position is evaluated.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    correct = 0
    total = 0
    for predicted_row, gold_row in zip(predicted_ids, gold):
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:  # position excluded from evaluation
                continue
            total += 1
            if label_map_inverse[predicted_id] == label_map_inverse[gold_id]:
                correct += 1

    accuracy = correct / total if total > 0 else 0

    return {"accuracy": accuracy}

# 5. Training Arguments
training_args = TrainingArguments(
    output_dir="./bert_ner_bio",          # Updated output directory
    eval_strategy="steps",                # current name of the deprecated `evaluation_strategy`
    eval_steps=50,
    learning_rate=1e-5,
    per_device_train_batch_size=16,       # Increased batch size if memory allows
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    save_strategy="steps",
    save_steps=50,                        # same interval as eval_steps, needed by load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_dir="./logs",                 # Add logging
    logging_steps=50,
    fp16=torch.cuda.is_available(),       # mixed precision needs a CUDA GPU; fall back to fp32 otherwise
    gradient_accumulation_steps=2,        # Accumulate gradients for larger effective batch size
    warmup_steps=500,                    # Add warmup steps
    seed=42,                             # Set random seed for reproducibility
    report_to="none",                    # without this, colab logs the run with 'weights and biases' service, which requires an api etc
)

# 6. Define and Start Training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,          # replaces the deprecated `tokenizer=` argument
    compute_metrics=compute_metrics      # Add metrics computation
)

# 7. Train the Model
trainer.train()

# 8. Save the model and tokenizer
trainer.save_model("./bert_arch_ner_bio_trained")
tokenizer.save_pretrained("./bert_arch_ner_bio_trained")

# 9. Save the label map with the model for inference
with open("./bert_arch_ner_bio_trained/label_map.json", "w") as f:
    json.dump(label_map, f, indent=2)

print("Fine-tuning completed and model saved.")

Some weights of ModernBertForTokenClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
50,4.6125,1.609851,0.627677
100,1.7619,0.490151,0.847943
150,0.5738,0.139436,0.962247
200,0.1668,0.05751,0.982353
250,0.0805,0.038729,0.987293
300,0.0518,0.032361,0.988764
350,0.0401,0.030121,0.989177
400,0.0308,0.029905,0.989786
450,0.0239,0.03007,0.990016
500,0.0155,0.03346,0.990039


Step,Training Loss,Validation Loss,Accuracy
50,4.6125,1.609851,0.627677
100,1.7619,0.490151,0.847943
150,0.5738,0.139436,0.962247
200,0.1668,0.05751,0.982353
250,0.0805,0.038729,0.987293
300,0.0518,0.032361,0.988764
350,0.0401,0.030121,0.989177
400,0.0308,0.029905,0.989786
450,0.0239,0.03007,0.990016
500,0.0155,0.03346,0.990039


Fine-tuning completed and model saved.


And now we can test it out!

In [None]:
#test it out!
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
import numpy as np

# 1. Load the fine-tuned weights; the tokenizer is taken from the base checkpoint
model_name = "./bert_arch_ner_bio_trained"
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
model = AutoModelForTokenClassification.from_pretrained(model_name)

# BIO tag -> id map: "O" is 0, then a B-/I- pair per entity type in this order
label_map = {"O": 0}
for entity_type in ("ORG", "LOC", "PER", "SUBJ", "ID"):
    for prefix in ("B", "I"):
        label_map[f"{prefix}-{entity_type}"] = len(label_map)
id_to_label = dict(zip(label_map.values(), label_map.keys()))

def _close_entity(text, entities, entity_type, start, end):
    """Append the finished span [start, end) of type *entity_type* to *entities*."""
    entities.append({
        "text": text[start:end],
        "start": start,
        "end": end,
        "label": entity_type
    })


def predict_entities(text, tokenizer, model, label_map, max_length):
    """Run the token classifier on *text* and merge BIO tags into entity spans.

    Args:
        text: The string to analyse.
        tokenizer: Tokenizer supporting ``return_offsets_mapping`` and
            ``return_tensors="pt"``.
        model: Token-classification model; its output must expose ``logits``.
        label_map: Mapping from tag name ("O", "B-ORG", "I-ORG", ...) to
            class id. The id -> tag lookup is derived from this argument.
        max_length: Length the input is padded or truncated to.

    Returns:
        List of dicts with ``text``, ``start``, ``end`` (character offsets
        into *text*) and ``label`` (entity type without the B-/I- prefix),
        in order of appearance.
    """
    # Use the argument rather than a module-level lookup table
    id_to_label = {label_id: label for label, label_id in label_map.items()}

    inputs = tokenizer(text, return_offsets_mapping=True, padding="max_length",
                       truncation=True, max_length=max_length, return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # The batch holds a single sequence, so take row 0 of every tensor
    predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()
    offset_mapping = inputs["offset_mapping"][0].tolist()
    token_mask = attention_mask[0].tolist()

    entities = []
    current_type = None   # entity type of the span being built, if any
    span_start = span_end = None

    for label_id, (offset_start, offset_end), is_real in zip(predictions, offset_mapping, token_mask):
        # Padding and special tokens have zero-width offsets and correspond
        # to no characters of the text, so their predictions are ignored.
        if not is_real or offset_start == offset_end:
            continue

        prefix, _, entity_type = id_to_label[label_id].partition('-')

        if prefix == 'I' and current_type is not None and entity_type == current_type:
            span_end = offset_end  # continuation of the open entity
            continue

        # Any other tag ends the open entity. It is saved first, also when
        # the tag is an I- of a different type.
        if current_type is not None:
            _close_entity(text, entities, current_type, span_start, span_end)
            current_type = None

        if prefix == 'B':
            current_type = entity_type
            span_start, span_end = offset_start, offset_end
        # "O" and an I- tag without a matching open entity start nothing

    # Handle last entity if exists
    if current_type is not None:
        _close_entity(text, entities, current_type, span_start, span_end)

    return entities

# 3. Input Text and Run Inference
test_texts = [
    "This archive presents appendices B-I and supplementary material resulting from the programme of archaeological works undertaken during the construction scheme to widen the A1 trunk road between Dishforth and Leeming Bar in North Yorkshire. The Iron Age to early medieval evidence from Healam Bridge, along with other evidence for Roman activity along the route is published in two volumes",
    "This collection comprises images, spreadsheets, reports, vector graphics, and scanned site records and drawings from archaeological recording by Archaeological Research Services at Lower Radbourne Deserted Medieval Village, Warwickshire. The work was undertaken between April and December 2021. Area C32070 was dominated by intercutting features predominantly dated to two broad phases, prehistoric and medieval. The prehistoric features were represented by a large ring ditch, potentially dating to the Early Bronze Age, four smaller potential Bronze Age ring ditches and a series of intercutting drip gullies, probably Iron Age in date.",
    "This collection comprises images and CAD from an archaeological evaluation and watching brief, undertaken by Cotswold Archaeology in August 2018, at Hewmar House, 120 London Road, Gloucester, Gloucestershire. Four archaeological evaluation trenches were excavated and four geotechnical test pits were  observed. Despite the proximity of the site to Wotton Roman cemetery, no evidence for any in situ burials, or indeed any Roman activity, was identified in any of the excavated trenches or test pits. It is likely that the site lay beyond the southern boundary of the cemetery and formed  part of the agricultural hinterland of both Roman and medieval Gloucester until the  construction of Hillfield Villa (later Hewmar House) in the early 19th century. Three linear  garden features, probably planting trenches, associated with Hillfield Villa and a large undated ditch were identified. Evidence for possible quarrying was also identified throughout the site. Periods: POST MEDIEVAL, 1800 - 1850, UNCERTAIN. Subjects: Archaeology, Evaluation, DITCH, GARDEN FEATURE, TRIAL TRENCH."
    ]


max_length = 512
for text in test_texts:
    # Tag one sample and list every entity the model found in it
    predicted_entities = predict_entities(text, tokenizer, model, label_map, max_length)

    print("\nInput Text:", text, "\nExtracted Entities:", sep="\n")
    for entity in predicted_entities:
        details = f"Text: {entity['text']}, Start: {entity['start']}, End: {entity['end']}, Label: {entity['label']}"
        print(f"  - {details}")


Input Text:
This archive presents appendices B-I and supplementary material resulting from the programme of archaeological works undertaken during the construction scheme to widen the A1 trunk road between Dishforth and Leeming Bar in North Yorkshire. The Iron Age to early medieval evidence from Healam Bridge, along with other evidence for Roman activity along the route is published in two volumes

Extracted Entities:

Input Text:
This collection comprises images, spreadsheets, reports, vector graphics, and scanned site records and drawings from archaeological recording by Archaeological Research Services at Lower Radbourne Deserted Medieval Village, Warwickshire. The work was undertaken between April and December 2021. Area C32070 was dominated by intercutting features predominantly dated to two broad phases, prehistoric and medieval. The prehistoric features were represented by a large ring ditch, potentially dating to the Early Bronze Age, four smaller potential Bronze Age ring dit

Save your fine tuned ModernBERT model to huggingface; change `your-username` accordingly

In [None]:
from huggingface_hub import notebook_login
from huggingface_hub import HfApi

# First, login to Hugging Face
notebook_login()

# Save the model and all necessary files
trainer.save_model("./bert_arch_ner_bio_trained")

# Save the tokenizer and label map with the model
tokenizer.save_pretrained("./bert_arch_ner_bio_trained")
with open("./bert_arch_ner_bio_trained/label_map.json", "w") as f:
    json.dump(label_map, f, indent=2)

# Push to hub with a model card
# `token=True` uses the credentials stored by notebook_login(); it replaces
# the deprecated `use_auth_token=True`.
# NOTE(review): `model_card_kwargs` does not appear among the documented
# parameters of transformers' `push_to_hub` - confirm that this metadata
# actually reaches the model card.
model.push_to_hub("your-username/ModernBERT_archae",
    token=True,
    model_card_kwargs={
        "language": "en",
        "license": "mit",
        "tags": ["token-classification", "ner", "archaeology"],
        "datasets": ["custom archaeology dataset"],
        "metrics": ["accuracy"],
    }
)

# Push the tokenizer configuration
tokenizer.push_to_hub("your-username/ModernBERT_archae")

# Initialize the Hugging Face API
api = HfApi()

# Push the label map as a separate file
api.upload_file(
    path_or_fileobj="./bert_arch_ner_bio_trained/label_map.json",
    path_in_repo="label_map.json",
    repo_id="your-username/ModernBERT_archae",
    repo_type="model"
)

example of use

In [None]:
## Get some test data in two formats (CSV and JSON), again taken from ADS search results.
# CSV in which all of the fields have been merged into a single column
!wget https://gist.githubusercontent.com/shawngraham/15c7cf3e2982d645b0c03c745f12e6bf/raw/b06b6333aa14dd7d40bb14aac79b3434db3afdd0/test.csv
# JSON in which everything has been merged into the description field
!wget https://gist.githubusercontent.com/shawngraham/15c7cf3e2982d645b0c03c745f12e6bf/raw/fda5ebd3a9674f4ba1546d27a118938000034d2d/test.json

...remember to change 'your-username' as appropriate below. Notice that we're still using the base ModernBERT to handle basic transformations (base_tokenizer), and our own saved model for the particular task we've trained for.

In [None]:
import torch
import json
import csv
import pandas as pd
from transformers import AutoModelForTokenClassification, AutoTokenizer
import datetime
from pathlib import Path

class ArchaeologyNERProcessor:
    """Tag text with the fine-tuned token classifier and decode BIO labels into spans.

    The tokenizer is loaded from ``base_tokenizer`` and the classifier from
    ``model_name``. Predicted label ids are translated through a fixed BIO
    label map (ORG, LOC, PER, SUBJ, ID) and merged into character-level
    entity spans.
    """

    def __init__(self, model_name="your-username/ModernBERT_archae", base_tokenizer="answerdotai/ModernBERT-base"):
        """Load the tokenizer and model and set up the label lookup tables.

        Args:
            model_name: Hub id or local path of the fine-tuned
                token-classification model.
            base_tokenizer: Hub id or local path of the tokenizer to use.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(base_tokenizer)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)

        # Define label map with BIO scheme
        self.label_map = {
            "O": 0,
            "B-ORG": 1, "I-ORG": 2,
            "B-LOC": 3, "I-LOC": 4,
            "B-PER": 5, "I-PER": 6,
            "B-SUBJ": 7, "I-SUBJ": 8,
            "B-ID": 9, "I-ID": 10,
        }
        self.id_to_label = {v: k for k, v in self.label_map.items()}
        # Inputs longer than this many tokens are truncated, so text beyond
        # the limit is never tagged.
        self.max_length = 512

    @staticmethod
    def _build_entity(text, label, spans):
        """Turn the collected token spans of one entity into an output dict.

        Surrounding whitespace is trimmed and ``start``/``end`` are adjusted
        to match, because token offsets can include the space that precedes
        a word. Returns ``None`` when nothing but whitespace remains.
        """
        start = spans[0][0]
        end = spans[-1][1]
        while start < end and text[start].isspace():
            start += 1
        while end > start and text[end - 1].isspace():
            end -= 1
        if start == end:
            return None
        return {
            "text": text[start:end],
            "start": start,
            "end": end,
            "label": label[2:],  # drop the "B-" prefix
        }

    def _decode_entities(self, text, predictions, offset_mapping):
        """Merge per-token BIO label ids into a list of entity dicts."""
        entities = []
        open_label = None
        open_spans = []

        def close_open_entity():
            # Emit the entity being built (if any) and reset the state.
            nonlocal open_label, open_spans
            if open_label is not None and open_spans:
                entity = self._build_entity(text, open_label, open_spans)
                if entity is not None:
                    entities.append(entity)
            open_label = None
            open_spans = []

        for label_id, (tok_start, tok_end) in zip(predictions, offset_mapping):
            # Zero-width offsets mark tokens that cover no characters of
            # `text` (special tokens, padding). Whatever the model predicts
            # for them must not open, extend or close an entity.
            if tok_start == tok_end:
                continue

            label = self.id_to_label[label_id]

            if label.startswith("B-"):
                close_open_entity()
                open_label = label
                open_spans = [(tok_start, tok_end)]
            elif (
                label.startswith("I-")
                and open_label is not None
                and label[2:] == open_label[2:]
            ):
                open_spans.append((tok_start, tok_end))
            else:
                # 'O', an I- tag of a different type, or an I- tag with no
                # entity open: the entity built so far is complete, so keep
                # it instead of throwing it away. The stray I- tag itself
                # does not start a new entity.
                close_open_entity()

        close_open_entity()
        return entities

    def predict_entities(self, text):
        """Return the entities the model finds in ``text``.

        Args:
            text: The string to analyse. Anything that is not a non-blank
                string (``None``, NaN from pandas, numbers, ``""``) yields
                an empty list instead of being passed to the tokenizer.

        Returns:
            A list of dicts with keys ``text``, ``start``, ``end`` (character
            offsets into ``text``) and ``label`` (without the B-/I- prefix).
        """
        if not isinstance(text, str) or not text.strip():
            return []

        # One sequence per call, so no padding is requested: there is
        # nothing to align it with.
        inputs = self.tokenizer(
            text,
            return_offsets_mapping=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        with torch.no_grad():
            outputs = self.model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
            )

        # Index the single batch element explicitly rather than squeeze(),
        # which would also drop any other size-1 dimension.
        predictions = outputs.logits[0].argmax(dim=-1).tolist()
        offset_mapping = inputs["offset_mapping"][0].tolist()

        return self._decode_entities(text, predictions, offset_mapping)

    def process_file(self, input_path, output_path=None, text_column="description"):
        """Process input file (JSON or CSV) and extract entities.

        Args:
            input_path: Path to a ``.json`` file (one record or a list of
                records) or a ``.csv`` file.
            output_path: Where to write the result (``.json`` or ``.csv``).
                When omitted, a timestamped ``processed_*`` file with the
                input's extension is written next to the input.
            text_column: Key/column holding the text to analyse. Records
                without it are skipped with a warning.

        Returns:
            The list of processed records, each with an added
            ``extracted_entities`` list.

        Raises:
            ValueError: If the input or output extension is not ``.json``
                or ``.csv``.
        """
        input_path = Path(input_path)
        input_kind = input_path.suffix.lower()

        # Determine input file type
        if input_kind == ".json":
            with open(input_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            if isinstance(data, dict):
                data = [data]
        elif input_kind == ".csv":
            data = pd.read_csv(input_path).to_dict("records")
        else:
            raise ValueError("Unsupported file type. Please use .json or .csv")

        # Resolve and validate the output path before running the model, so
        # an unusable destination fails immediately instead of after all the
        # inference work (and instead of reporting a file that was never
        # written).
        if output_path is None:
            timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
            output_path = input_path.parent / f"processed_{timestamp}{input_path.suffix}"
        else:
            output_path = Path(output_path)
        output_kind = output_path.suffix.lower()
        if output_kind not in (".json", ".csv"):
            raise ValueError("Unsupported output file type. Please use .json or .csv")

        # Process each record
        output_data = []
        for record in data:
            if text_column not in record:
                print(f"Warning: '{text_column}' column not found in record, skipping...")
                continue

            record["extracted_entities"] = self.predict_entities(record[text_column])
            output_data.append(record)

        # Save processed data
        if output_kind == ".json":
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(output_data, f, indent=2)
        else:
            # CSV: one row per entity; records without entities keep a
            # single row with empty entity columns.
            flattened_data = []
            for record in output_data:
                base_record = {k: v for k, v in record.items() if k != "extracted_entities"}
                if record["extracted_entities"]:
                    for entity in record["extracted_entities"]:
                        flattened_data.append({
                            **base_record,
                            "entity_text": entity["text"],
                            "entity_start": entity["start"],
                            "entity_end": entity["end"],
                            "entity_label": entity["label"],
                        })
                else:
                    flattened_data.append({
                        **base_record,
                        "entity_text": "",
                        "entity_start": "",
                        "entity_end": "",
                        "entity_label": "",
                    })

            pd.DataFrame(flattened_data).to_csv(output_path, index=False)

        print(f"Processing complete. Output saved to: {output_path}")
        return output_data





In [None]:
# Build the processor once; loading the tokenizer and model happens here.
processor = ArchaeologyNERProcessor()

# JSON in, JSON out (text is read from the default "description" field).
processor.process_file(input_path="test.json", output_path="output.json")

# CSV in, CSV out; text_column names the column that holds the text to analyze.
processor.process_file(input_path="test.csv", output_path="output.csv", text_column="description")

Processing complete. Output saved to: output.json
Processing complete. Output saved to: output.csv


[{'description': 'Report from Old Quarry Field at Emberton "This report documents archaeological findings from a survey led by Granite Digs at Hidden Valley located near Oakhaven. Key results include pottery analysis  details of a potential burial ground and a system of old roads." Ironmill Greenfield Emberton England 21483571 27700:390648 Subject:Archaeology Subject:Sherd Period:-800 - 1800 Subject:Excavations (Archaeology)--England Subject:Ditch Subject:Pit Subject:Strip Map And Sample Subject:Field Observation (Monitoring) Subject:Excavations (Archaeology)--England Period:ROMAN Associated ID: FAKE3 Import RCN: B45-546542 Creator:Starlight Research',
  'extracted_entities': [{'text': 'Sher',
    'start': 352,
    'end': 356,
    'label': 'SUBJ'},
   {'text': ' Import RCN: B45-546542 Creator:Star',
    'start': 590,
    'end': 626,
    'label': 'ID'}]},
 {'description': 'Report from Hidden Valley at Silverton "This report documents archaeological findings from a survey led by Granite 