In [17]:
# Base directory for project files, given relative to the notebook's working directory.
path_to_home = "../"
# Alternative base directory, presumably for a mounted Drive folder (e.g. Colab) -
# uncomment to use it instead of the relative path above.
#path_to_home = "./drive/MyDrive/receiptlayoutlm/"

In [18]:
from transformers import LayoutLMv2FeatureExtractor, LayoutXLMTokenizer, LayoutXLMProcessor
from transformers import LayoutLMv2ForTokenClassification

# Image front-end. extract_labels reads 'words', 'boxes' and 'pixel_values' from its output.
feature_extractor = LayoutLMv2FeatureExtractor()
# Tokenizer loaded from the LayoutXLM base checkpoint.
# NOTE(review): the tokenizer-class mismatch warning in this cell's output appears to come
# from this call - confirm that pairing LayoutXLMTokenizer with this checkpoint is intended.
tokenizer_xlm = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
# Combined processor is left disabled; the feature extractor and tokenizer are called separately.
#processor_xlm = LayoutXLMProcessor(feature_extractor, tokenizer_xlm)
# Token-classification model used for inference. use_auth_token=True presumably requires
# locally stored Hugging Face credentials - verify before running on a fresh machine.
model = LayoutLMv2ForTokenClassification.from_pretrained("sibrun/receiptlayoutlm", use_auth_token=True)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LayoutLMv2Tokenizer'. 
The class this function is called from is 'LayoutXLMTokenizer'.


In [41]:
import numpy as np
import torch
import pandas as pd

def labeled_tokens_to_labeled_words(tokens, token_inds, ner_token_labels):
        """Merge sub-word tokens back into words and keep one label per word.

        The three inputs are walked in parallel (stopping at the shortest one).

        Args:
                tokens: token strings, optionally wrapped in "<s>" ... "</s>".
                token_inds: per-token marker; -100 means "continuation piece of the
                        current word", any other value means "first piece of a new word".
                ner_token_labels: per-token predicted label; only the label of the
                        first piece of each word is kept.

        Returns:
                (words, ner_labels): two lists of equal length, where words[k] is the
                string rebuilt from the pieces of the k-th word via
                tokenizer_xlm.convert_tokens_to_string and ner_labels[k] is its label.

        Notes:
                * "<s>" tokens are skipped; the first "</s>" ends the scan.
                * If "</s>" is missing, the pending word is still emitted.
                * With no labelled token at all, both lists are empty.
                * Continuation pieces seen before any word has started are ignored,
                  because there is no word (and no label) to attach them to.
        """
        words = []
        ner_labels = []
        tokens_for_word = []
        for token, word_index, label in zip(tokens, token_inds, ner_token_labels):
                if token == "<s>":
                        continue
                if token == "</s>":
                        break
                if word_index == -100:
                        # Continuation piece: only valid once a word has been opened.
                        if tokens_for_word:
                                tokens_for_word.append(token)
                        continue
                # First piece of a new word: close the previous word, then open this one.
                if tokens_for_word:
                        words.append(tokenizer_xlm.convert_tokens_to_string(tokens_for_word))
                ner_labels.append(label)
                tokens_for_word = [token]
        # Flush the last open word (reached via "</s>" or by running out of tokens).
        if tokens_for_word:
                words.append(tokenizer_xlm.convert_tokens_to_string(tokens_for_word))
        return words, ner_labels

def get_labels_dict(words, ner_labels):
        """Group words by predicted label into one space-joined string per label.

        Args:
                words: list of word strings.
                ner_labels: list of label names, paired with `words` by position.

        Returns:
                dict mapping every label name in model.config.label2id except the first
                one (presumably the "no entity" class - confirm against the model
                config) to the words carrying that label joined by single spaces, or
                None when no word carries the label. Keys follow label2id order.
        """
        df = pd.DataFrame([words, ner_labels], index=["words", "tags"]).T
        labels_dict = {}
        keys = list(model.config.label2id.keys())
        for key in keys[1:]:
                # Select with a boolean mask rather than a formatted query string, so
                # label names containing quotes or other special characters are safe.
                entity_words = df.loc[df["tags"] == key, "words"].tolist()
                entity = " ".join(entity_words)
                labels_dict[key] = entity if len(entity) > 0 else None
        return labels_dict

def extract_labels(image):
        """Run the full pipeline on one image and return the extracted fields.

        Steps: feature extraction (yielding 'words', 'boxes', 'pixel_values'),
        tokenization, model inference, per-token label decoding, regrouping of
        tokens into words, and aggregation of words per label.

        Args:
                image: the image to analyse, passed straight to feature_extractor.

        Returns:
                The dict produced by get_labels_dict for the recognised words.
        """
        ocr_features = feature_extractor(image, return_tensors="pt")
        ocr_words = ocr_features['words']

        # The running word position is passed as the "label" of each word; the
        # tokenizer's 'labels' output is then used downstream to tell word starts
        # from continuation pieces (presumably marked -100 - see
        # labeled_tokens_to_labeled_words).
        word_positions = [list(range(len(ocr_words[0])))]
        encoding = tokenizer_xlm(
                text=ocr_words,
                boxes=ocr_features['boxes'],
                word_labels=word_positions,
                is_split_into_words=True,
                truncation=True,
                return_tensors='pt')

        # 'labels' must be removed before the forward pass; it is bookkeeping only.
        label_tensor = encoding.pop('labels')
        encoding['image'] = ocr_features['pixel_values']

        with torch.no_grad():
                model_output = model(**encoding)

        # Single-image batch: take element 0 everywhere.
        token_logits = model_output.logits.numpy()[0]
        predicted_ids = np.argmax(token_logits, axis=-1).tolist()
        ner_token_labels = [model.config.id2label[class_id] for class_id in predicted_ids]

        input_ids = encoding['input_ids'][0].tolist()
        tokens = [tokenizer_xlm.convert_ids_to_tokens(token_id) for token_id in input_ids]
        token_inds = label_tensor.tolist()[0]

        words, ner_labels = labeled_tokens_to_labeled_words(tokens, token_inds, ner_token_labels)
        return get_labels_dict(words, ner_labels)

In [42]:
from PIL import Image

# Build the sample receipt path from the configured base directory so that changing
# `path_to_home` in the config cell also redirects this lookup.
path_to_image = path_to_home + "data/X00016469622.jpg"
image = Image.open(path_to_image)
# Run OCR + token classification on the receipt and show the extracted fields.
labels_dict = extract_labels(image)
print(labels_dict)

  torch.arange(
  torch.arange(


{'address': 'NO 122.124. JALAN DEDAP 13 81100 JOHOR BAHRU', 'company': None, 'date': None, 'total': '80.90 80.91'}
