In [23]:
import glob
import torch
import ipywidgets as widgets
from ipywidgets import interact
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from IPython.display import Markdown, display
from PIL import Image, ImageDraw, ImageFont
import pytorch_lightning as pl
from transformers import AutoTokenizer
from omegaconf import DictConfig
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report

from core.dataset.token_classification_datamodule import LayoutLMDataset
from core.model.liltxlm_ner_model import LiltXLMNer

In [2]:
def denormalize_bbox(bbox_normed, width, height):
    """Rescale a box from the 0-1000 normalized grid to integer pixel units.

    Args:
        bbox_normed: Indexable with at least four numeric entries. Entries 0
            and 2 are scaled by ``width``; entries 1 and 3 by ``height``.
        width: Target width in pixels.
        height: Target height in pixels.

    Returns:
        A list of four ints; each value is truncated toward zero by ``int``.
    """
    # Positions 0/2 follow the horizontal extent, positions 1/3 the vertical one.
    extents = (width, height, width, height)
    return [
        int((bbox_normed[pos] * extents[pos]) / 1000.0)
        for pos in range(4)
    ]

def infer(inputs, model: "LiltXLMNer", tokenizer: "AutoTokenizer"):
    """Run ``model`` on one un-batched example and return per-token predictions.

    Args:
        inputs: Mapping holding at least ``input_ids``, ``bbox`` and
            ``attention_mask`` tensors for a single example (no batch
            dimension). The mapping itself is left untouched, so the same
            item can safely be passed in again.
        model: Callable module; invoked as ``model(**batch)`` and expected to
            return an object exposing ``.logits``. It is switched to eval
            mode as a side effect.
        tokenizer: Currently unused; kept so existing callers keep working.

    Returns:
        A list with one ``(token_ids, bboxes, label_ids)`` tuple per batch
        row (a single row here). Each element is a plain Python list with one
        entry per token position.
    """
    # Work on a shallow copy: overwriting the caller's entries with batched
    # tensors would make a second call on the same item add yet another
    # leading dimension.
    batch = dict(inputs)
    for key in ("input_ids", "bbox", "attention_mask"):
        # The model expects a leading batch dimension.
        batch[key] = batch[key].unsqueeze(0)

    model.eval()
    with torch.no_grad():
        output = model(**batch)

    # Predicted label id per token = argmax over the class dimension.
    # TODO: I should also keep the logits for the loss function and apply softmax
    # so that I can get the probabilities (confidence values) for each class
    preds = torch.argmax(output.logits, dim=2)

    # Indexing row ``i`` already drops the batch dimension. No extra squeeze:
    # it would collapse a one-token sequence into a scalar / flat list and
    # break per-token iteration downstream.
    return [
        (
            batch["input_ids"][i].tolist(),
            batch["bbox"][i].tolist(),
            preds[i].tolist(),
        )
        for i in range(preds.shape[0])
    ]


In [71]:
# Outline colour for each label id, numbered consecutively from 1.
# Label ids that are not keys of this mapping have no colour assigned.
color_dict = dict(
    enumerate(
        (
            "red",
            "green",
            "blue",
            "yellow",
            "purple",
            "orange",
            "cyan",
            "magenta",
            "brown",
            "lime",
            "pink",
            "gray",
            "olive",
            "teal",
        ),
        start=1,
    )
)

def show_image_with_bboxes(output, image):
    """Display ``image`` with a coloured outline around every labelled box.

    Args:
        output: Iterable of ``(words, bboxs, labels)`` tuples, as returned by
            ``infer``. ``bboxs`` hold boxes on the 0-1000 normalized grid and
            ``labels`` hold label ids.
        image: PIL image to annotate. It is not modified; drawing happens on
            a copy.

    Boxes whose label id is not a key of ``color_dict`` are skipped.
    """
    # Draw on a copy so the caller's image stays clean; otherwise boxes from
    # an earlier call would persist if the same image object is shown again.
    canvas = image.copy()
    draw = ImageDraw.Draw(canvas)

    # No font is loaded here: nothing is rendered as text, and requesting
    # "arial.ttf" raises OSError on machines without that font installed.
    for words, bboxs, labels in output:
        for _word, bbox, label in zip(words, bboxs, labels):
            if label not in color_dict:
                continue
            pixel_box = denormalize_bbox(bbox, canvas.width, canvas.height)
            draw.rectangle(pixel_box, outline=color_dict[label], width=3)

    # Render the annotated copy with matplotlib.
    plt.figure(figsize=(18, 14))
    plt.imshow(canvas)
    plt.show()



In [None]:
# Placeholder locations: replace with the directories holding the annotation
# CSVs and the page images before running this cell.
csv_dir = r'YOUR_CSV_DIR'
image_dir = r'YOUR_IMAGE_DIR'

# glob returns matches in arbitrary, filesystem-dependent order. Sort them so
# the list handed to the dataset is the same on every machine and every run.
csv_files = sorted(glob.glob(csv_dir + '/*.csv'))
tokenizer = AutoTokenizer.from_pretrained("nielsr/lilt-xlm-roberta-base")
label2idx = {
    # YOUR LABELS
}
# The CSV files are passed explicitly via ``csv_files``, hence ``csv_dir=None``.
dataset = LayoutLMDataset(csv_dir=None, image_dir=image_dir, csv_files=csv_files, tokenizer=tokenizer, label2idx=label2idx, is_train=False)

: 

In [70]:
# Sanity-check the tensor shapes of one example. Fetch the item once instead
# of indexing the dataset three times (each lookup rebuilds the whole sample).
sample_item = dataset[100][0]
print(sample_item['input_ids'].shape)
print(sample_item['bbox'].shape)
print(sample_item['attention_mask'].shape)


torch.Size([512])
torch.Size([512, 4])
torch.Size([512])


In [61]:
# Restore the fine-tuned model from a checkpoint file (placeholder path).
# NOTE(review): NUM_LABELS is not defined in any visible cell — it must be set
# before this cell runs; presumably it should match the size of label2idx
# (TODO confirm).
checkpoint_kwargs = {
    "num_labels": NUM_LABELS,
    "learning_rate": 5e-5,
    "label2idx": label2idx,
}
model = LiltXLMNer.load_from_checkpoint(r"MODEL_PATH", **checkpoint_kwargs)

Some weights of LiltForTokenClassification were not initialized from the model checkpoint at nielsr/lilt-xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [62]:
# An explicit slider with continuous_update=False runs inference only when the
# handle is released. The default (0, max) shorthand fires on every
# intermediate position while dragging, queuing one model call per step.
# The initial value is the midpoint, matching what the shorthand would pick.
@interact(
    index=widgets.IntSlider(
        value=(len(dataset) - 1) // 2,
        min=0,
        max=len(dataset) - 1,
        description="index",
        continuous_update=False,
    )
)
def show_image(index):
    """Run inference on ``dataset[index]`` and display the annotated page.

    Args:
        index: Position of the example in ``dataset``.
    """
    item, image = dataset[index]
    output = infer(item, model, tokenizer)
    show_image_with_bboxes(output, image)


interactive(children=(IntSlider(value=12055, description='index', max=24110), Output()), _dom_classes=('widget…