## **Training BPE**

In [None]:
import json
import os
import re
from pathlib import Path

import torch
from tokenizers import ByteLevelBPETokenizer

In [None]:
def fix_apostrophe_space(input_string):
    """Rejoin English contractions that were split after the apostrophe.

    Removes the stray whitespace in sequences such as ``don' t``, ``it' s``,
    ``we' ll``, ``I' ve``, ``they' re`` and ``I' m``.  The suffix has to be a
    complete word, so a possessive followed by an ordinary word
    (``the dogs' tails``) is left untouched.

    Args:
        input_string: Text to clean.

    Returns:
        The text with the split contractions rejoined.
    """
    # (?<=\w)'    apostrophe directly after a word character
    # \s          the single whitespace character to drop
    # (?=...\b)   only when a whole contraction suffix follows
    pattern = re.compile(r"(?<=\w)'\s(?=(?:t|s|ll|ve|re|m)\b)")
    return pattern.sub("'", input_string)

## Loading the DataSet
* Add the Kaggle dataset **Machine Translation Data Set** by Aadish Joshi to the notebook's input data; the cells below read it from `/kaggle/input/machine-translation-data-set/`.

In [None]:
def load_data(path="/kaggle/input/machine-translation-data-set/enlish_data.txt"):
    """Read the training corpus and repair split contractions.

    Args:
        path: Location of the UTF-8 text file to read.  Defaults to the
            dataset file this notebook was written against.

    Returns:
        The whole file as a single string, passed through
        ``fix_apostrophe_space``.
    """
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    return fix_apostrophe_space(text)

## **Generating BPE and Storing**

In [None]:
# Train a byte-level BPE tokenizer directly from the corpus file on disk.
path = ["/kaggle/input/machine-translation-data-set/enlish_data.txt"]
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=path, vocab_size=18000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Create the output directory up front; exist_ok keeps the cell re-runnable.
token_dir = 'bpe'
os.makedirs(token_dir, exist_ok=True)

tokenizer.save_model(token_dir)

## **Loading the vocab file**

In [None]:
def load_bpe(token_dir="bpe"):
    """Load the saved BPE vocabulary and rebuild the tokenizer from disk.

    Args:
        token_dir: Directory holding ``vocab.json`` and ``merges.txt``.
            Defaults to ``"bpe"``.

    Returns:
        A ``(vocab, tokenizer)`` tuple: the parsed contents of ``vocab.json``
        and a ``ByteLevelBPETokenizer`` built from the two saved files.
    """
    vocab_path = os.path.join(token_dir, "vocab.json")
    merges_path = os.path.join(token_dir, "merges.txt")

    # Read as UTF-8 explicitly so token strings are decoded the same way on
    # every platform (a byte-level vocab typically contains non-ASCII symbols).
    with open(vocab_path, encoding="utf-8") as f:
        vocab = json.load(f)

    tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)

    return vocab, tokenizer

## **Tokenizing the whole training dataset**

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer

def tokenize(text, chunk_size=int(100e6)):
    """Encode ``text`` into a 1-D tensor of BPE vocabulary ids.

    The text is encoded in windows of at most ``chunk_size`` characters so the
    tokenizer never receives the whole corpus in one call.  Every window except
    the last is cut just after its final sentence terminator (``.``, ``?``,
    ``!`` or a newline) so that no sentence is split across two windows; a
    window that contains no terminator is cut at the size limit instead.  The
    last window always runs to the end of the text.

    Args:
        text: The corpus to encode.
        chunk_size: Maximum number of characters per window.

    Returns:
        ``(input_ids, vocab_size)``: the ids as a ``torch.long`` tensor and the
        number of entries in the vocabulary returned by ``load_bpe``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of characters")

    vocab, tokenizer = load_bpe()
    total_length = len(text)
    token_ids = []

    start = 0
    while start < total_length:
        end = min(start + chunk_size, total_length)

        if end < total_length:
            # Index just past the last terminator inside text[start:end].
            # rfind returns -1 when a mark is absent, which makes cut == 0.
            cut = max(text.rfind(mark, start, end) for mark in ".?!\n") + 1
            if cut > start:
                end = cut
            # Otherwise keep the hard cut so the loop always advances.

        tokens = tokenizer.encode(text[start:end]).tokens
        # Extend in place: linear-time flattening of the per-window results.
        token_ids.extend(vocab[token] for token in tokens)
        start = end

    # Explicit dtype so an empty input still yields an integer tensor.
    input_ids = torch.tensor(token_ids, dtype=torch.long)

    return input_ids, len(vocab)

In [None]:
# Read and clean the corpus, then encode it with the saved BPE tokenizer.
text = load_data()
input_ids, vocab_size = tokenize(text)

## **Storing Tokenized Data**

In [None]:
# Bundle the encoded corpus with the vocabulary size so both live in one file.
tokenized_data = dict(input_ids=input_ids, vocab_size=vocab_size)

# Replace '<enter_path>' with the destination file, e.g. 'bpe/tokenized_data.pt'.
torch.save(tokenized_data, '<enter_path>')

## **Loading Tokenized Data**

In [None]:
def load_tokenized(path="<enter_path>"):
    """Load a tokenized corpus previously written with ``torch.save``.

    Args:
        path: File to read.  The default is the notebook's ``'<enter_path>'``
            placeholder and has to be replaced by (or overridden with) the
            real location.

    Returns:
        ``(input_ids, vocab_size)`` taken from the stored dictionary.
    """
    tokenized_data = torch.load(path)

    return tokenized_data['input_ids'], tokenized_data['vocab_size']

In [None]:
# Restore the encoded corpus and vocabulary size from the saved file.
input_ids, vocab_size = load_tokenized()