In [1]:
import os
os.chdir("../")

In [16]:
from collections import Counter
import warnings
from transformers import LayoutLMv3Tokenizer
from src.Glean.utils import str_utils

class VocabularyBuilder():
    """Vocabulary builder class to generate vocabulary."""
    
    def __init__(self, max_size = 512):
        self._words_counter = Counter()
        self.max_size = max_size
        self._vocabulary = { '<PAD>':0, '<NUMBER>':1, '<RARE>':2 }
        self.built = False

        self.tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
        self.layoutlmv3_vocab = self.tokenizer.get_vocab()
        
    def add(self, word):
        if not str_utils.is_number(word):
            self._words_counter.update([word.lower()])
            
    def load_layoutlmv3_vocabulary(self):
        for token, token_id in self.layoutlmv3_vocab.items():
            self._vocabulary[token] = token_id + len(self._vocabulary)

        print(f"LayoutLMv3 Vocabulary loaded. Size: {len(self._vocabulary)}")
        self.built = True

    def build(self):
        self.load_layoutlmv3_vocabulary()
        if not self.built:
            warnings.warn(
                "The vocabulary is not built. Use VocabularyBuilder.load_layoutlmv3_vocabulary() or add words before building. Returning default vocabulary.", Warning)
        return self._vocabulary

    def get_vocab(self):
        if not self.built:
            warnings.warn(
                "The vocabulary is not built. Use VocabularyBuilder.build(). Returning default vocabulary.", Warning)
            return self._vocabulary
        else:
            return self._vocabulary

In [17]:
vocab = VocabularyBuilder()
vocab.build()

LayoutLMv3 Vocabulary loaded. Size: 50268


{'<PAD>': 0,
 '<NUMBER>': 1,
 '<RARE>': 2,
 '<s>': 3,
 '<pad>': 5,
 '</s>': 7,
 '<unk>': 9,
 '.': 11,
 'Ġthe': 13,
 ',': 15,
 'Ġto': 17,
 'Ġand': 19,
 'Ġof': 21,
 'Ġa': 23,
 'Ġin': 25,
 '-': 27,
 'Ġfor': 29,
 'Ġthat': 31,
 'Ġon': 33,
 'Ġis': 35,
 'âĢ': 37,
 "'s": 39,
 'Ġwith': 41,
 'ĠThe': 43,
 'Ġwas': 45,
 'Ġ"': 47,
 'Ġat': 49,
 'Ġit': 51,
 'Ġas': 53,
 'Ġsaid': 55,
 'Ļ': 57,
 'Ġbe': 59,
 's': 61,
 'Ġby': 63,
 'Ġfrom': 65,
 'Ġare': 67,
 'Ġhave': 69,
 'Ġhas': 71,
 ':': 73,
 'Ġ(': 75,
 'Ġhe': 77,
 'ĠI': 79,
 'Ġhis': 81,
 'Ġwill': 83,
 'Ġan': 85,
 'Ġthis': 87,
 ')': 89,
 'ĠâĢ': 91,
 'Ġnot': 93,
 'Ŀ': 95,
 'Ġyou': 97,
 'ľ': 99,
 'Ġtheir': 101,
 'Ġor': 103,
 'Ġthey': 105,
 'Ġwe': 107,
 'Ġbut': 109,
 'Ġwho': 111,
 'Ġmore': 113,
 'Ġhad': 115,
 'Ġbeen': 117,
 'Ġwere': 119,
 'Ġabout': 121,
 ',"': 123,
 'Ġwhich': 125,
 'Ġup': 127,
 'Ġits': 129,
 'Ġcan': 131,
 'Ġone': 133,
 'Ġout': 135,
 'Ġalso': 137,
 'Ġ$': 139,
 'Ġher': 141,
 'Ġall': 143,
 'Ġafter': 145,
 '."': 147,
 '/': 149,
 'Ġwould': 151,
 

In [3]:
from transformers import LayoutLMv3Tokenizer
LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base-uncased")

OSError: microsoft/layoutlmv3-base-uncased is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [7]:
from transformers import LayoutLMv3Tokenizer

tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")


In [6]:
from transformers import LayoutLMv3Tokenizer

# Specify the model name or path
model_name = "microsoft/layoutlmv3-base"

# Load the LayoutLMv3 tokenizer
tokenizer = LayoutLMv3Tokenizer.from_pretrained(model_name)

# Example usage
text = "This is an example text for tokenization."
tokens = tokenizer(text)

print("Original text:", text)
print("Tokenized text:", tokens)


ValueError: Words must be of type `List[str]` (single pretokenized example), or `List[List[str]]` (batch of pretokenized examples).