## Inference

Run inference on the test images using the fine-tuned LoRA model.

In [None]:
import json
from typing import List
from PIL import Image
import torch
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from peft import PeftModel

# Loaded (processor, model) pairs keyed by their load configuration.
# predict_invoice is called once per image, so without this cache the processor,
# base weights and LoRA adapter would be re-read from disk for every file.
# Re-running this cell resets the cache.
_LOADED_MODELS = {}

# Label prefixes that mark a word as part of the invoice number.
_INVOICE_PREFIXES = ('B-INVOICE', 'I-INVOICE')


def _load_processor_and_model(model_path, base_model, num_labels, id2label=None):
    """Return the (processor, model) pair for this configuration, loading it on first use."""
    if id2label and len(id2label) != num_labels:
        raise ValueError(
            f"id2label has {len(id2label)} entries but num_labels is {num_labels}"
        )

    label_key = tuple(sorted(id2label.items())) if id2label else None
    cache_key = (model_path, base_model, num_labels, label_key)
    cached = _LOADED_MODELS.get(cache_key)
    if cached is not None:
        return cached

    # Load processor
    processor = LayoutLMv3Processor.from_pretrained(model_path, apply_ocr=False)

    # Load base model + LoRA adapter
    print(f"Loading LoRA model from {model_path}...")
    label_kwargs = {}
    if id2label:
        # Without an explicit mapping the config only gets generic LABEL_<n> names.
        label_kwargs["id2label"] = dict(id2label)
        label_kwargs["label2id"] = {name: idx for idx, name in id2label.items()}
    base = LayoutLMv3ForTokenClassification.from_pretrained(
        base_model,
        num_labels=num_labels,
        **label_kwargs
    )
    model = PeftModel.from_pretrained(base, model_path)
    model.eval()

    configured_names = model.config.id2label.values()
    if not any(str(name).startswith(_INVOICE_PREFIXES) for name in configured_names):
        import warnings

        warnings.warn(
            "No configured label starts with 'B-INVOICE' or 'I-INVOICE', so "
            "invoice_number will always be None; pass id2label with the tag names "
            "used during training.",
            stacklevel=3,
        )

    _LOADED_MODELS[cache_key] = (processor, model)
    return processor, model


def predict_invoice(
    image_path: str,
    words: List[str],
    boxes: List[List[int]],
    model_path: str = "models/layoutlmv3-lora-invoice-number",
    base_model: str = "microsoft/layoutlmv3-base",
    num_labels: int = 3,
    id2label=None
):
    """
    Run inference on a single invoice.

    The processor and model are loaded once per distinct
    (model_path, base_model, num_labels, id2label) combination and reused on
    later calls.

    Args:
        image_path: Path of the invoice image to open.
        words: OCR words of the invoice, one string per word.
        boxes: One [x0, y0, x1, y1] box per word, parallel to ``words``.
            NOTE(review): presumably already normalised to the 0-1000 range
            LayoutLMv3 expects -- confirm against the data preparation step.
        model_path: Directory holding the saved processor and the LoRA adapter.
        base_model: Name or path of the base LayoutLMv3 checkpoint.
        num_labels: Size of the token-classification head.
        id2label: Optional ``{class_index: tag_name}`` mapping applied to the
            model config. When omitted the config carries generic ``LABEL_<n>``
            names, none of which match the invoice tags, so ``invoice_number``
            is then always ``None``.

    Returns:
        A ``(predicted_labels, invoice_number)`` tuple. ``predicted_labels``
        holds one label per word that survived truncation, taken from the
        word's first sub-token. ``invoice_number`` is the space-joined words
        tagged ``B-INVOICE*`` / ``I-INVOICE*``, or ``None`` if there are none.

    Raises:
        ValueError: If ``id2label`` is given and its size differs from
            ``num_labels``.
    """
    processor, model = _load_processor_and_model(
        model_path, base_model, num_labels, id2label
    )

    # Load image; the context manager releases the file handle once the pixels
    # have been copied by convert().
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Process
    encoding = processor(
        image,
        words,
        boxes=boxes,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )

    # Predict
    with torch.no_grad():
        outputs = model(**encoding)
        predictions = torch.argmax(outputs.logits, dim=2)

    # Convert to labels
    predicted_labels = []
    invoice_tokens = []

    token_boxes = encoding.bbox[0].tolist()
    word_ids = encoding.word_ids(0)

    prev_word_idx = None
    for pred, box, word_idx in zip(predictions[0].tolist(), token_boxes, word_ids):
        # Special and padding tokens have no word index and an all-zero box.
        if word_idx is None or box == [0, 0, 0, 0]:
            continue
        # Only the first sub-token of each word decides the word's label.
        if word_idx == prev_word_idx:
            continue
        label = model.config.id2label[pred]
        predicted_labels.append(label)
        if label.startswith(_INVOICE_PREFIXES):
            invoice_tokens.append(words[word_idx])
        prev_word_idx = word_idx

    invoice_number = ' '.join(invoice_tokens) if invoice_tokens else None
    return predicted_labels, invoice_number


# --- Batch inference over every record in test.json ---
# Reads the test manifest, runs predict_invoice on each listed image and writes
# all predictions to a single JSON file next to the test data.
test_json_path = "../data/SROIE2019/test/test.json"

with open(test_json_path, "r", encoding="utf-8") as f:
    test_data = json.load(f)

results = []

for item in test_data:
    # Each record supplies the image file name plus its OCR words and boxes.
    file_path = f"../data/SROIE2019/test/img/{item['file']}"
    words, boxes = item['words'], item['bboxes']

    predicted_labels, invoice_number = predict_invoice(
        image_path=file_path,
        words=words,
        boxes=boxes,
        model_path="../models/layoutlmv3-lora-invoice-number",
        base_model="microsoft/layoutlmv3-base",
        num_labels=3,
    )

    record = {
        "file": item['file'],
        "predicted_labels": predicted_labels,
        "invoice_number": invoice_number,
    }
    results.append(record)

# Persist one entry per test image, in manifest order.
with open("../data/SROIE2019/test/predictions.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print("Inference complete. Predictions saved to predictions.json")


Loading LoRA model from ../models/layoutlmv3-lora...


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

## Inference on a single image

In [None]:
import json
from PIL import Image
import torch
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from peft import PeftModel

# --- Config ---
file_to_test = "X51005675104.jpg"
model_path = "../models/layoutlmv3-lora-invoice-number"
base_model = "microsoft/layoutlmv3-base"
num_labels = 3

# --- Load test data ---
test_json_path = "../data/SROIE2019/test/test.json"
with open(test_json_path, "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Find the file. next() gets a default so that a missing or misspelled file name
# fails with a descriptive error instead of a bare StopIteration.
item = next((d for d in test_data if d["file"] == file_to_test), None)
if item is None:
    raise ValueError(f"{file_to_test!r} not found in {test_json_path}")
words = item["words"]
boxes = item["bboxes"]
image_path = f"../data/SROIE2019/test/img/{file_to_test}"

# --- Load processor and model ---
# NOTE(review): the base model is built from num_labels only, so
# model.config.id2label holds generic LABEL_<n> names (see the printed output).
# Pass id2label/label2id to from_pretrained to print the training tag names.
processor = LayoutLMv3Processor.from_pretrained(model_path, apply_ocr=False)
base = LayoutLMv3ForTokenClassification.from_pretrained(base_model, num_labels=num_labels)
model = PeftModel.from_pretrained(base, model_path)
model.eval()

# --- Load image and encode ---
# The context manager releases the file handle once convert() has copied the pixels.
with Image.open(image_path) as raw_image:
    image = raw_image.convert("RGB")
encoding = processor(
    image,
    words,
    boxes=boxes,
    truncation=True,
    padding="max_length",
    max_length=512,
    return_tensors="pt"
)

# --- Predict ---
with torch.no_grad():
    outputs = model(**encoding)
    predictions = torch.argmax(outputs.logits, dim=2)

# --- Print words, bbox, prediction ---
token_boxes = encoding.bbox[0].tolist()
word_ids = encoding.word_ids(0)
prev_word_idx = None

print(f"{'WORD':20} {'BBOX':20} {'PREDICTION'}")
print("-" * 60)

for pred, box, word_idx in zip(predictions[0].tolist(), token_boxes, word_ids):
    # Skip special/padding tokens and all but the first sub-token of each word.
    if box != [0, 0, 0, 0] and word_idx is not None and word_idx != prev_word_idx:
        label = model.config.id2label[pred]
        # Pad the bbox to the header's column width so the columns line up.
        print(f"{words[word_idx]:20} {str(boxes[word_idx]):20} {label}")
        prev_word_idx = word_idx


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


WORD                 BBOX                 PREDICTION
------------------------------------------------------------
SYARIKAT             [92, 111, 331, 137] LABEL_0
PERNIAGAAN           [361, 111, 661, 137] LABEL_0
GIN                  [691, 111, 781, 137] LABEL_0
KEE                  [812, 111, 902, 137] LABEL_0
(                    [408, 142, 426, 167] LABEL_0
81109                [426, 142, 523, 167] LABEL_0
-                    [523, 142, 542, 167] LABEL_0
A                    [542, 142, 561, 167] LABEL_0
)                    [561, 142, 581, 167] LABEL_0
NO                   [255, 170, 295, 190] LABEL_0
290,                 [314, 170, 395, 190] LABEL_0
JALAN                [415, 170, 515, 190] LABEL_0
AIR                  [534, 170, 595, 190] LABEL_0
PANAS,               [615, 170, 735, 190] LABEL_0
SETAPAK,             [403, 191, 588, 216] LABEL_0
53200,               [283, 220, 418, 241] LABEL_0
KUALA                [441, 220, 554, 241] LABEL_0
LUMPUR               [576, 220, 712, 