## INFERENCE

In [None]:
from PIL import Image
import torch
from transformers import LayoutLMv3ForQuestionAnswering, AutoProcessor

# 1️⃣ Load the model and processor
model_path = "models/layoutlmv3-finetuned-qa"  # path to your finetuned model
# Pick the best available accelerator rather than hard-coding "mps", which
# raises on machines without Apple-silicon (MPS) support.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"
processor = AutoProcessor.from_pretrained(model_path)
model = LayoutLMv3ForQuestionAnswering.from_pretrained(model_path).to(device)
model.eval()  # inference only: make sure dropout is disabled

# 2️⃣ Load and parse test .txt file
# Each non-empty line is expected to hold 8 comma-separated integer
# coordinates followed by the transcription (x1,y1,...,x4,y4,text).
test_file = "../data/SROIE2019/train/box/X00016469623.txt"
words = []
bboxes = []

with open(test_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Split on the first 8 commas only so that commas inside the
        # transcription (e.g. "1,234.00") stay part of the text.
        parts = line.split(",", 8)
        if len(parts) < 9:
            # Malformed line (missing coordinates or text): skip it.
            continue
        text = parts[8].strip()
        if not text:
            # A box without text contributes no tokens.
            continue
        # Use top-left and bottom-right coordinates for LayoutLMv3
        x0, y0, x2, y2 = map(int, [parts[0], parts[1], parts[4], parts[5]])
        words.append(text)
        bboxes.append([x0, y0, x2, y2])

# 3️⃣ Load image
# The image must be the scan that the annotation file above describes; the
# file name therefore uses the same document ID as the parsed .txt file
# (X00016469623) so the boxes are normalized against the right page size.
image_path = "X00016469623.jpg"  # replace with your image
# Open inside a context manager so the underlying file handle is closed;
# convert() returns an independent in-memory copy.
with Image.open(image_path) as _raw_image:
    image = _raw_image.convert("RGB")
image_width, image_height = image.size

# 4️⃣ Normalize bboxes to [0, 1000]
def normalize_bbox(bbox, width, height):
    """Scale a pixel-space box to the 0-1000 grid and clamp it to that range.

    Args:
        bbox: Sequence ``[x0, y0, x1, y1]`` in pixel coordinates.
        width: Image width in pixels (must be non-zero).
        height: Image height in pixels (must be non-zero).

    Returns:
        A list of four ints, each within ``[0, 1000]``.

    Raises:
        ZeroDivisionError: If ``width`` or ``height`` is zero.
    """

    def _scale(value, extent):
        # Clamp: annotations that spill past the image border would
        # otherwise produce values outside the 0-1000 grid.
        return min(1000, max(0, int(1000 * value / extent)))

    return [
        _scale(bbox[0], width),
        _scale(bbox[1], height),
        _scale(bbox[2], width),
        _scale(bbox[3], height),
    ]

norm_bboxes = [normalize_bbox(b, image_width, image_height) for b in bboxes]

# 5️⃣ Prepare inputs for QA
question = "What is the invoice number?"
# For question answering the processor takes the question as `text` and the
# document words as `text_pair`. Passing `question=` as a keyword does not
# match any processor parameter, so the question would not be encoded.
# NOTE(review): supplying words/boxes assumes the saved processor was
# created with apply_ocr=False — confirm for this checkpoint.
inputs = processor(
    image,
    question,
    words,
    boxes=norm_bboxes,
    truncation=True,  # keep long receipts within the model's max length
    return_tensors="pt",
).to(device)

# 6️⃣ Run inference
with torch.no_grad():
    outputs = model(**inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

# 7️⃣ Decode answer
# Take the highest-scoring start and end positions independently.
start_idx = start_logits.argmax(-1).item()
end_idx = end_logits.argmax(-1).item()
if end_idx < start_idx:
    # The model predicted an inverted span: there is no valid answer.
    answer = ""
else:
    answer_tokens = inputs.input_ids[0][start_idx : end_idx + 1]
    # Drop special tokens and surrounding whitespace so they do not leak
    # into the printed answer.
    answer = processor.tokenizer.decode(
        answer_tokens, skip_special_tokens=True
    ).strip()

print("Predicted Invoice Number:", answer)


ModuleNotFoundError: No module named 'PIL'