This notebook gathers the code samples from the video below, which is part of the [Hugging Face course](https://huggingface.co/course).

In [None]:
#@title
from IPython.display import HTML

# Embed the course video. The HTML object is the cell's last expression, so the
# notebook renders it inline. The literal is split only for readability; the
# adjacent string pieces concatenate into the same markup.
HTML(
    '<iframe width="560" height="315" '
    'src="https://www.youtube.com/embed/DJimQynXZsQ?rel=0&amp;controls=0&amp;showinfo=0" '
    'frameborder="0" allowfullscreen></iframe>'
)

Install the Transformers and Datasets libraries to run this notebook.

In [None]:
! pip install datasets transformers[sentencepiece]

In [1]:
from transformers import BertTokenizerFast

# Per its name, this checkpoint is a bert-base-uncased tokenizer published
# without a normalizer (presumably so input text is not lowercased or stripped
# of accents before tokenization -- confirm on the model card).
tokenizer = BertTokenizerFast.from_pretrained(
    "huggingface-course/bert-base-uncased-tokenizer-without-normalizer"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [2]:
# Plain lowercase English fits this vocabulary: only "tokenizer" is split
# into sub-word pieces.
text = "here is a sentence adapted to our tokenizer"
tokens = tokenizer.tokenize(text)
print(tokens)

['here', 'is', 'a', 'sentence', 'adapted', 'to', 'our', 'token', '##izer']


In [3]:
# A sentence in Bengali script: most words are not covered by the vocabulary
# and come back as [UNK].
text = "এই বাক্যটি আমাদের টোকেনাইজারের উপযুক্ত নয়"
tokens = tokenizer.tokenize(text)
print(tokens)

['এ', '##ই', '[UNK]', 'আ', '##ম', '##া', '##দ', '##ে', '##র', '[UNK]', '[UNK]', '[UNK]']


In [4]:
# Accented and upper-case words are unknown to this tokenizer and are
# replaced by [UNK].
text = "this tokenizer does not know àccënts and CAPITAL LETTERS"
tokens = tokenizer.tokenize(text)
print(tokens)

['this', 'token', '##izer', 'does', 'not', 'know', '[UNK]', 'and', '[UNK]', '[UNK]']


In [5]:
# Domain-specific (medical) terms are known only as fragments, so each one is
# broken into several sub-word pieces.
text = "the medical vocabulary is divided into many sub-token: paracetamol, phrayngitis"
tokens = tokenizer.tokenize(text)
print(tokens)

['the', 'medical', 'vocabulary', 'is', 'divided', 'into', 'many', 'sub', '-', 'token', ':', 'para', '##ce', '##tam', '##ol', ',', 'ph', '##ray', '##ng', '##itis']


In [10]:
from datasets import load_dataset

# Load the corpus by its repository id; the progress output below shows the
# train, validation and test splits being generated.
raw_datasets = load_dataset(
    path="Shuu12121/java-treesitter-dedupe_doc-filtered-dataset",
)

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/69.8M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/77.1M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/76.7M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/82.0M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/3.40M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/4.49M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1553016 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/15165 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17927 [00:00<?, ? examples/s]

In [11]:
# Display the DatasetDict to inspect the available splits, their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['code', 'docstring', 'func_name', 'language', 'repo', 'path', 'url', 'license'],
        num_rows: 1553016
    })
    validation: Dataset({
        features: ['code', 'docstring', 'func_name', 'language', 'repo', 'path', 'url', 'license'],
        num_rows: 15165
    })
    test: Dataset({
        features: ['code', 'docstring', 'func_name', 'language', 'repo', 'path', 'url', 'license'],
        num_rows: 17927
    })
})

In [12]:
def get_training_corpus(dataset=None, column="docstring", batch_size=1000):
    """Yield the training texts of a dataset split in fixed-size batches.

    Called without arguments it behaves as before: it walks
    ``raw_datasets["train"]`` and yields the ``"docstring"`` column in
    batches of 1000 rows.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping from column name to the values of that slice.
            Defaults to the notebook-level ``raw_datasets["train"]``.
        column: Name of the column whose values are yielded.
        batch_size: Maximum number of rows per yielded batch.

    Yields:
        The values of ``column`` for each consecutive slice of at most
        ``batch_size`` rows; the last batch may be shorter.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration, not at
            call time.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if dataset is None:
        # Looked up lazily so the default keeps relying on the notebook global,
        # exactly as the original zero-argument version did.
        dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), batch_size):
        # Slicing loads only the current batch instead of the whole column.
        yield dataset[start_idx : start_idx + batch_size][column]

In [13]:
from transformers import AutoTokenizer

# Start from the pretrained GPT-2 tokenizer; the new tokenizer is derived from it.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# get_training_corpus() returns a generator, so it can be iterated only once:
# call it again to obtain a fresh corpus before retraining.
training_corpus = get_training_corpus()
new_tokenizer = old_tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=52000,
)

# Kept as the last expression so the cell displays the paths of the written files.
new_tokenizer.save_pretrained("code-search-net-tokenizer")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

('code-search-net-tokenizer/tokenizer_config.json',
 'code-search-net-tokenizer/special_tokens_map.json',
 'code-search-net-tokenizer/vocab.json',
 'code-search-net-tokenizer/merges.txt',
 'code-search-net-tokenizer/added_tokens.json',
 'code-search-net-tokenizer/tokenizer.json')

In [17]:
# Display the retrained tokenizer to check its class, vocabulary size and special tokens.
new_tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=52000, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [None]:
# Sample Python source used to compare the two tokenizers. The attribute read in
# __call__ now matches the one assigned in __init__ (`self.weight`), so the
# sample is itself valid code.
example = """class LinearLayer():
    def __init__(self, input_size, output_size):
        self.weight = torch.randn(input_size, output_size)
        self.bias = torch.zeros(output_size)

    def __call__(self, x):
        return x @ self.weight + self.bias
    """

# Tokenize the same snippet with the original GPT-2 tokenizer and with the
# retrained one so the two token sequences can be compared.
print(old_tokenizer.tokenize(example))
print(new_tokenizer.tokenize(example))