In [36]:
import os
from itertools import chain
from collections import defaultdict

from transformers import AutoTokenizer
from datasets import load_from_disk

In [2]:
# Worker-process count handed to the `num_proc` argument of the `dataset.map` calls below.
# NOTE(review): os.cpu_count() may return None when the CPU count cannot be
# determined — confirm that value is acceptable for `num_proc`.
num_procs = os.cpu_count()

In [3]:
# Load the tokenizer identified by "debertinha-v2-tokenizer"; it is the
# module-level `tokenizer` that convert_to_tokens and encode_plus rely on.
tokenizer = AutoTokenizer.from_pretrained("debertinha-v2-tokenizer")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [49]:
# Load the previously saved dataset from the path "ds_subset".
dataset = load_from_disk("ds_subset")

In [50]:
# Display the dataset summary (features and row count) as the cell output.
dataset

Dataset({
    features: ['text'],
    num_rows: 10000
})

In [51]:
# Column names of the loaded dataset (not referenced again in the cells shown here).
column_names = dataset.column_names
# Chunk length used by group_texts. Two positions are held back for the
# '[CLS]' and '[SEP]' tokens that encode_plus wraps around every chunk, so an
# encoded row has at most tokenizer.model_max_length entries.
max_seq_length = tokenizer.model_max_length - 2
# Display the resulting chunk length.
max_seq_length

510

In [52]:
def convert_to_tokens(examples):
    """Tokenize a batch of raw texts into lists of token strings.

    Args:
        examples: Batch mapping whose "text" entry is an iterable of strings.

    Returns:
        A dict with a single "tokens" key. Its value is a list holding, for
        each input text in order, the result of ``tokenizer.tokenize``.
    """
    token_lists = list(map(tokenizer.tokenize, examples["text"]))
    return {"tokens": token_lists}

def group_texts(examples):
    """Concatenate every column of a batch and re-split it into fixed-size chunks.

    Each column of ``examples`` is a list of sequences. The sequences of a
    column are joined end to end and cut into consecutive pieces of
    ``max_seq_length`` items. Whatever is left after the last full piece is
    dropped, so a batch with fewer than ``max_seq_length`` items yields an
    empty list for every column. Padding the remainder instead of dropping it
    would be an alternative if the downstream model supports it.

    Args:
        examples: Batch mapping of column name to a list of sequences.

    Returns:
        A dict with the same keys as ``examples``; each value is the list of
        ``max_seq_length``-sized chunks for that column.
    """
    # Flatten each column into one long list.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    # The first column decides how many items this batch has available.
    first_column = list(examples.keys())[0]
    available = len(flattened[first_column])

    # Round down to a whole number of chunks; the tail is discarded.
    usable = (available // max_seq_length) * max_seq_length
    chunk_starts = range(0, usable, max_seq_length)

    # Slice every column at the same offsets.
    grouped = {}
    for column, items in flattened.items():
        grouped[column] = [items[start : start + max_seq_length] for start in chunk_starts]
    return grouped

def encode_plus(example):
    """Turn a batch of token chunks into model input features.

    Every chunk is wrapped in '[CLS]' ... '[SEP]', converted to ids with the
    module-level ``tokenizer``, and paired with an all-ones attention mask and
    the tokenizer's special-tokens mask.

    NOTE(review): the special tokens are hard-coded literals; this assumes the
    tokenizer's vocabulary uses exactly '[CLS]' and '[SEP]' — confirm.

    Args:
        example: Batch mapping whose "tokens" entry is a list of token lists.

    Returns:
        A ``defaultdict(list)`` with "input_ids", "attention_mask" and
        "special_tokens_mask" lists, one entry per chunk. It has no keys when
        the batch contains no chunks.
    """
    features = defaultdict(list)
    for chunk in example["tokens"]:
        ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + chunk + ['[SEP]'])
        features["input_ids"].append(ids)
        # No padding is applied, so every position is attended to.
        features["attention_mask"].append([1] * len(ids))
        features["special_tokens_mask"].append(
            tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True)
        )
    return features

In [53]:
# Step 1: run convert_to_tokens over the dataset in batches, replacing the raw
# "text" column with a "tokens" column of token strings.
# NOTE(review): this rebinds `dataset` and removes "text", so the cell is
# presumably not re-runnable without reloading the dataset first — confirm.
dataset = dataset.map(
        convert_to_tokens,
        batched=True,
        num_proc=num_procs,
        remove_columns=["text"]
    )

In [54]:
# Step 2: run group_texts over the dataset in batches, so the token lists of
# each batch are concatenated and re-split into chunks of max_seq_length
# tokens. The number of rows changes as a result.
dataset = dataset.map(
        group_texts,
        batched=True,
        num_proc=num_procs,
    )

In [55]:
# Step 3: run encode_plus over the dataset in batches to produce input_ids,
# attention_mask and special_tokens_mask, dropping the intermediate "tokens"
# column.
dataset = dataset.map(
        encode_plus,
        batched=True,
        num_proc=num_procs,
        remove_columns=["tokens"]
    )

Map (num_proc=4):   0%|          | 0/20410 [00:00<?, ? examples/s]

Map (num_proc=4): 100%|██████████| 20410/20410 [00:06<00:00, 3262.59 examples/s]


In [56]:
# Display the encoded dataset summary (features and row count) as the cell output.
dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 20410
})

In [57]:
# Persist the encoded dataset under the path "ds_subset_encoded".
dataset.save_to_disk("ds_subset_encoded")

Saving the dataset (1/1 shards): 100%|██████████| 20410/20410 [00:00<00:00, 204555.22 examples/s]
