In [1]:
from transformers import LongformerTokenizer
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset
from tokenizers import trainers, Tokenizer
from tokenizers.models import Unigram
import ftfy
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

In [3]:
dataset = load_dataset("brwac", data_dir="data", streaming=True)

In [4]:
def batch_iterator(n_samples=None):
    for i, example in enumerate(dataset["train"]):
        if n_samples is not None and i > n_samples:
            break
        text = example["text"]
        text = sum(text["paragraphs"], [])
        text = [example["title"]] + text
        text = "\n".join(text)
        text = ftfy.fix_text(text)
        yield text

In [4]:
tokenizer = Tokenizer(Unigram())

In [5]:
special_tokens = list(longformer_tokenizer.special_tokens_map.values())
special_tokens.remove('<unk>')

In [6]:
trainer = trainers.UnigramTrainer(
    vocab_size=longformer_tokenizer.vocab_size, 
    special_tokens=special_tokens,
    unk_token='<unk>'
)

In [28]:
tokenizer.train_from_iterator(batch_iterator(), trainer)





In [29]:
tokenizer.save('tokenizer.json')

In [30]:
unigram_tokenizer = PreTrainedTokenizerFast.from_pretrained('tokenizer.json')



In [31]:
unigram_tokenizer.add_special_tokens(longformer_tokenizer.special_tokens_map)

0

In [32]:
unigram_tokenizer.save_pretrained('PTLongformerTokenizer')

('PTLongformerTokenizer/tokenizer_config.json',
 'PTLongformerTokenizer/special_tokens_map.json',
 'PTLongformerTokenizer/tokenizer.json')

In [34]:
pt_phrase = "Essa é uma frase de teste para o modelo de longformer."
unigram_tokenizer.tokenize(pt_phrase)

['E',
 's',
 's',
 'a',
 ' é um',
 'a',
 ' f',
 'r',
 'a',
 's',
 'e de te',
 's',
 'te ',
 'p',
 'a',
 'r',
 'a',
 ' o modelo de lon',
 'g',
 'fo',
 'r',
 'me',
 'r',
 '.']