### Process dataset

In [3]:
%load_ext jupyter_black

In [4]:
from transformers import DebertaV2Tokenizer, AutoTokenizer, DebertaV2ForMaskedLM
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset, load_from_disk
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Load dataset

In [6]:
# Load the raw corpus previously saved with `save_to_disk`; the path is relative to the notebook.
dataset = load_from_disk("../data/c4ai-wik")

### Load model and tokenizer

In [None]:
# Custom tokenizer stored locally next to the notebook.
tokenizer = AutoTokenizer.from_pretrained("../model/deberta_pt_tokenizer")

# Size the embeddings with len(tokenizer): it counts added tokens as well, whereas
# tokenizer.vocab_size only reports the base vocabulary, so ids of added tokens
# could otherwise fall outside the embedding matrix.
# ignore_mismatched_sizes=True lets the checkpoint load even though its
# vocabulary-sized weights no longer match the requested shape.
model = DebertaV2ForMaskedLM.from_pretrained(
    "microsoft/deberta-v3-base",
    vocab_size=len(tokenizer),
    ignore_mismatched_sizes=True,
)

### Splitting data by maximum length

In [None]:
# Number of positions the loaded model's config declares.
MAX_POS_EMBD = model.config.max_position_embeddings
# Stride, in tokens, between the starts of consecutive chunks of one text.
WINDOW = 200

In [None]:
from functools import partial

def _tokenizer(sample, tokenizer, window, max_lenght):
    tkn_text = tokenizer(sample['text'], add_special_tokens=False)

    dev_train_dataset = []

    for sample in tkn_text['input_ids']:
        for i in range(0, len(sample), window):
            dev_train_dataset.append(tokenizer.decode(sample[i:max_lenght+i]).strip())
    return {'text': dev_train_dataset}

def _tokenizer_aux(sample, tokenizer):
    tkn_text = tokenizer(sample['text'], return_special_tokens_mask=True)
    return tkn_text

# Bind the fixed arguments so both functions take only the batch, as `dataset.map` passes it.
# MAX_POS_EMBD-2: presumably leaves room for the two special tokens added when the
# chunks are tokenized again by _tokenizer_aux — TODO confirm.
partial_tokenizer = partial(_tokenizer, tokenizer=tokenizer, window=WINDOW, max_lenght=MAX_POS_EMBD-2)
partial_tokenizer_aux = partial(_tokenizer_aux, tokenizer=tokenizer)

### Preprocess dataset

In [None]:
# Re-chunk every text into overlapping windows (see _tokenizer), in batches over 4 worker processes.
# NOTE(review): the mapped function returns only a "text" column whose length differs from the
# input batch; if the dataset has other columns, `remove_columns` is probably needed — confirm.
dataset = dataset.map(partial_tokenizer, batched=True, num_proc=4)

In [None]:
# Persist the re-chunked dataset. Despite the directory name, "text" holds decoded
# string chunks at this point (see _tokenizer's return value), not token ids.
dataset.save_to_disk("../data/c4ai-wik-tokenized")

### Generate input ids

In [None]:
# Tokenize the chunks, requesting the special-tokens mask as well (see _tokenizer_aux).
dataset = dataset.map(partial_tokenizer_aux, batched=True, num_proc=4)

In [None]:
# Persist the result of the tokenization pass.
dataset.save_to_disk("../data/c4ai-wik-tokenized-aux")