#### Import the required standard-library modules and the `ByteLevelBPETokenizer` class from the `tokenizers` library.

In [1]:
import argparse
import os
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

#### Define the training parameters: `train_data_file`, `output_dir`, `vocab_size`, and `min_freq`. (The training data is the `tokenizer_training.txt` file of the Java dataset.)

In [2]:
# Run configuration: every value a reader might want to tune lives here.
train_data_file = "DATASET"  # a ".txt" file, or a directory searched recursively for ".txt" files
output_dir = "Output"        # directory the trained tokenizer files are written to
vocab_size = 5_000           # passed to tokenizer.train() as vocab_size
min_freq = 2                 # passed to tokenizer.train() as min_frequency

#### creating an instance of the ByteLevelBPETokenizer class from the tokenizers library.

In [3]:
# Create an untrained byte-level BPE tokenizer with the library's default
# constructor arguments; it is trained and saved by the cells below.
tokenizer = ByteLevelBPETokenizer()


#### Resolve `train_data_file` to an absolute path. If it does not end with ".txt", it is assumed to be a directory, and all ".txt" files inside it (searched recursively) are collected with the `glob` method from the `pathlib` module.


In [11]:
# Resolve the training input into a list of absolute ".txt" file paths.
# A value ending in ".txt" is treated as a single training file; anything else
# is treated as a directory that is searched recursively for ".txt" files.
train_data_path = os.path.abspath(train_data_file)

if train_data_file.endswith(".txt"):
    # Always produce a list so `paths` has the same type in both branches.
    paths = [train_data_path]
else:
    # Sort so the file order is reproducible across runs and platforms.
    paths = sorted(str(x) for x in Path(train_data_path).glob("**/*.txt"))
    print(paths)

# Fail early with a clear message instead of silently training on nothing.
missing_files = [p for p in paths if not os.path.isfile(p)]
if not paths or missing_files:
    raise FileNotFoundError(
        f"No usable .txt training files found for {train_data_path!r}"
        + (f"; missing: {missing_files}" if missing_files else "")
    )

['/home/user1-selab3/shradha_test/roberta/DATASET/JAVA/TOKEN/RAW/tokenizer_training.txt']


#### The training involves specifying the input files (files parameter), vocabulary size (vocab_size parameter), minimum frequency (min_frequency parameter), and special tokens.

files: This parameter is set to paths, which is a list of file paths. These files are used as the training data for the tokenizer. It can be a single file or a list of files.


vocab_size: This parameter is set to `vocab_size`, which is the maximum vocabulary size. The tokenizer will learn subword units until the vocabulary size reaches this specified limit.



min_frequency: This parameter is set to `min_freq`, which is the minimum number of times a pair of subword units must occur in the training data for it to be merged into a new vocabulary entry. Pairs with a frequency below this threshold are not merged and therefore do not appear in the vocabulary.



These parameters control the training process and influence the size and composition of the vocabulary that the tokenizer will learn. Adjusting vocab_size and min_frequency allows you to control the granularity and size of the vocabulary based on the characteristics of your training data. After training, the tokenizer will have learned a vocabulary that can be used to tokenize text into subword units. The training process involves iteratively merging the most frequent pairs of consecutive subword units until the vocabulary size reaches the specified limit.

In [12]:
# Train the tokenizer on the collected files, using the size and frequency
# limits from the configuration cell and the listed special tokens.
tokenizer.train(
    files=paths,
    vocab_size=vocab_size,
    min_frequency=min_freq,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)






#### After training, this line adds the additional special tokens `<x>` and `<z>` to the tokenizer. Special tokens are often used to represent certain elements in the text. They are typically declared during the training phase (via the `special_tokens` argument); here they are added afterwards instead.

In [13]:
# Register two extra special tokens on the already-trained tokenizer.
# NOTE(review): the recorded output of the save cell lists only vocab.json and
# merges.txt -- confirm that "<x>" and "<z>" are actually persisted on disk.
tokenizer.add_special_tokens(["<x>","<z>"])

0

#### This block ensures that the specified output directory exists. If it doesn't, it creates the directory. Then, it saves the trained tokenizer model to the output directory using tokenizer.save_model().

In [14]:

# Save files to disk
output_dir = os.path.abspath(output_dir)
# exist_ok=True replaces the check-then-create pattern and is safe on re-runs.
os.makedirs(output_dir, exist_ok=True)
# save_model() writes only the BPE model files (vocab.json and merges.txt).
# Also write the full tokenizer definition so that tokens added after training
# are kept together with the model.
tokenizer.save(os.path.join(output_dir, "tokenizer.json"))
# Kept as the last expression so the cell still displays the model file paths.
tokenizer.save_model(output_dir)

['/home/user1-selab3/shradha_test/roberta/Output/vocab.json',
 '/home/user1-selab3/shradha_test/roberta/Output/merges.txt']