# Imports

In [84]:
import functools
import numpy as np
import pandas as pd
import typing

from tokenizers.processors import TemplateProcessing
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer
from datasets import list_datasets, load_dataset
from datasets import Dataset

# Load Corpus

In [85]:
corpus_excel_path = '/home/philko/Documents/Uni/WiSe2223/Consulting/mlw-consulting-project/data/raw/Lemmaliste.xlsx'
corpus_data_json_path = '/home/philko/Documents/Uni/WiSe2223/Consulting/mlw-consulting-project/data.json'
corpus = pd.read_excel(corpus_excel_path)['Lemmata'].to_frame()

In [86]:
len(corpus)

43472

In [87]:
corpus = corpus.dropna()
len(corpus)

43462

In [88]:
corpus_excel_path = '/home/philko/Documents/Uni/WiSe2223/Consulting/mlw-consulting-project/data/raw/Lemmaliste.xlsx'
corpus_data_json_path = '/home/philko/Documents/Uni/WiSe2223/Consulting/mlw-consulting-project/data.json'
corpus = pd.read_excel(corpus_excel_path)['Lemmata'].to_frame()
# corpus = corpus.dropna()
corpus_data_json = pd.read_json(corpus_data_json_path)
# corpus_data_json = corpus_data_json.dropna()
corpus_data_json = corpus_data_json.rename(columns={'lemma': 'Lemmata'})
corpus['Lemmata'] = corpus['Lemmata'].astype(str)
corpus_data_json['Lemmata'] = corpus_data_json['Lemmata'].astype(str)
corpus_data_json = corpus_data_json.drop("id", axis=1)
corpus_data_json = corpus_data_json.drop_duplicates()
# corpus.join(corpus_data_json, on = "Lemmata", how = "inner")
corpus = pd.concat([corpus, corpus_data_json])

rm_list: list = []
for i, e in enumerate(corpus['Lemmata'].values):
    if isinstance(e, float):
        rm_list.append(e)
corpus = corpus[~corpus['Lemmata'].isin(rm_list)]
corpus['list'] = list(map(lambda e: list(e), corpus['Lemmata'].values))
corpus['text'] = list(map(lambda e: " ".join(e), corpus['list'].values))
corpus.head(5)


Unnamed: 0,Lemmata,list,text
0,acarus,"[a, c, a, r, u, s]",a c a r u s
1,a,[a],a
2,aano,"[a, a, n, o]",a a n o
3,aardum,"[a, a, r, d, u, m]",a a r d u m
4,aardus,"[a, a, r, d, u, s]",a a r d u s


In [89]:
corpus

Unnamed: 0,Lemmata,list,text
0,acarus,"[a, c, a, r, u, s]",a c a r u s
1,a,[a],a
2,aano,"[a, a, n, o]",a a n o
3,aardum,"[a, a, r, d, u, m]",a a r d u m
4,aardus,"[a, a, r, d, u, s]",a a r d u s
...,...,...,...
114419,nimbus,"[n, i, m, b, u, s]",n i m b u s
114423,nimietas,"[n, i, m, i, e, t, a, s]",n i m i e t a s
114431,nimirum,"[n, i, m, i, r, u, m]",n i m i r u m
114458,nimis,"[n, i, m, i, s]",n i m i s


In [90]:
corpus['Lemmata'].values

array(['acarus', 'a', 'aano', ..., 'nimirum', 'nimis', 'nimium'],
      dtype=object)

In [91]:
np.savetxt("test.txt", corpus['Lemmata'].values, fmt='%s')

In [92]:
pd.read_csv("test.txt")

Unnamed: 0,acarus
0,a
1,aano
2,aardum
3,aardus
4,aoo
...,...
46973,nimbus
46974,nimietas
46975,nimirum
46976,nimis


In [93]:
dataset = Dataset.from_pandas(corpus)
dataset

Dataset({
    features: ['Lemmata', 'list', 'text', '__index_level_0__'],
    num_rows: 46979
})

# Train Tokenizer

## Constants

In [94]:
BATCH_SIZE = 1000 # Check effect of modifying this parameter

In [95]:
def batch_iterator():
    for i in range(0, len(dataset), BATCH_SIZE):
        yield dataset[i : i + BATCH_SIZE]["text"]

In [96]:
# Initialize tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

special_tokens_map = {
    'cls_token': '<CLS>',
    'pad_token':'<PAD>',
    'sep_token': '<SEP>',
    'bos_token': '<|begoftext|>',
    'eos_token': '<|endoftext|>',
    'unk_token': '<UNK>'}

num_added_toks = tokenizer.add_special_tokens(list(special_tokens_map))

In [97]:
alphabet: set = set(functools.reduce(lambda x, y: x + y, corpus['list'], []))
alphabet

{' ',
 '#',
 '*',
 '-',
 '.',
 '>',
 'A',
 'B',
 'C',
 'E',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Z',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|'}

## Function

In [98]:
def train_tokenizer(
        lemma_list_path: str,
        tokenizer_path: str,
        batch_size: int = 1000) -> Tokenizer:
    """Train Tokenizer.
    
    Load corpus, split words to characters and train tokenizer on them.
    Tokenizer will also be saved at a specified ('tokenizer path')
    location.
    
    :param lemma_list_path: Path to lemma .xlsx file.
    :paramm tokenizer_path: Path where tokenizer is to be saved.
    :param batch_size: Batch size for training.
    :returns: Trained tokenizer
    """
    corpus: pd.DataFrame = pd.read_excel(lemma_list_path)['Lemmata'].to_frame()
    rm_list: list = []
    for i, e in enumerate(corpus['Lemmata'].values):
        if isinstance(e, float):
            rm_list.append(e)
    corpus = corpus[~corpus['Lemmata'].isin(rm_list)]
    corpus['list'] = list(map(lambda e: list(e), corpus['Lemmata'].values))
    corpus['text'] = list(map(lambda e: " ".join(e), corpus['list'].values))

    dataset = Dataset.from_pandas(corpus)

    alphabet: set = set(functools.reduce(lambda x, y: x + y, corpus['list'], []))

    tokenizer: Tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

    special_tokens_map = {
        'cls_token': '<CLS>',
        'pad_token':'<PAD>',
        'sep_token': '<SEP>',
        'bos_token': '<|begoftext|>',
        'eos_token': '<|endoftext|>',
        'unk_token': '<UNK>'}

    num_added_toks = tokenizer.add_special_tokens(list(special_tokens_map))

    tokenizer.post_processor = TemplateProcessing(
        single="<|begoftext|> $A <|endoftext|>",
        special_tokens=[("<|begoftext|>", 1), ("<|endoftext|>", 2)],
    )

    def batch_iterator():
        for i in range(0, len(dataset), batch_size):
            yield dataset[i : i + batch_size]["text"]

    # Train tokenizer
    trainer = trainers.BpeTrainer(
        vocab_size=len(alphabet),
        special_tokens=list(special_tokens_map)) # Check recommendations for vocabulary size

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainer)

    # Post-processor and decoder
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=False, )
    tokenizer.decoder = decoders.ByteLevel()

    tokenizer.save(tokenizer_path)
    return tokenizer

In [120]:
tokenizer = train_tokenizer(
    '/home/philko/Documents/Uni/WiSe2223/Consulting/mlw-consulting-project/data/raw/Lemmaliste.xlsx',
    "2-byte-level-BPE.tokenizer.json",
)






In [168]:
a = tokenizer("gardo")

TypeError: 'tokenizers.Tokenizer' object is not callable

# TODO: Check if special chars are necessary

In [123]:
# Train tokenizer
trainer = trainers.BpeTrainer(
    vocab_size=len(alphabet),
    special_tokens=list(special_tokens_map)) # Check recommendations for vocabulary size

tokenizer.train_from_iterator(
    batch_iterator(),
    trainer=trainer)







In [124]:
tokenizer

<tokenizers.Tokenizer at 0x8910dd0>

In [125]:

# Post-processor and decoder
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False, )
tokenizer.decoder = decoders.ByteLevel()

# Sanity check

print(tokenizer.encode("kalane").ids)
print(tokenizer.decode([0,1,2,3,4,5,423], skip_special_tokens = False))

# Save the tokenizer you trained
tokenizer.save("byte-level-BPE.tokenizer.json")

# Load it using transformers (required, otherwise it is not a callable object)
# tokenizer = PreTrainedTokenizerFast(tokenizer_file= path + "byte-level-BPE.tokenizer.json")

[34, 24, 35, 24, 37, 28]
cls_tokenpad_tokensep_tokenbos_tokeneos_tokenunk_token


# Experiments

In [103]:
from transformers import GPT2Config, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel, ViTFeatureExtractor, GPT2Tokenizer, PreTrainedTokenizerFast

In [128]:
# Load it using transformers (required, otherwise it is not a callable object)
tokenizer = PreTrainedTokenizerFast(tokenizer_file="2-byte-level-BPE.tokenizer.json")

In [129]:
tokenizer.pad_token

Using pad_token, but it is not set yet.


In [130]:
tokenizer.add_special_tokens(special_tokens_map)

6

In [131]:
a = tokenizer("g a r d o")

In [132]:
a

{'input_ids': [30, 51, 24, 51, 41, 51, 27, 51, 38], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [134]:
tokenizer.decode(a['input_ids'])

'g a r d o'

In [None]:
a = tokenizer("gardo")

In [163]:
corpus: pd.DataFrame = pd.read_excel('/home/philko/Documents/Uni/WiSe2223/Consulting/mlw-consulting-project/data/raw/Lemmaliste.xlsx')['Lemmata'].to_frame()
rm_list: list = []
for i, e in enumerate(corpus['Lemmata'].values):
    if isinstance(e, float):
        rm_list.append(e)
corpus = corpus[~corpus['Lemmata'].isin(rm_list)]
corpus['list'] = list(map(lambda e: list(e), corpus['Lemmata'].values))
corpus['text'] = list(map(lambda e: " ".join(e), corpus['list'].values))

dataset = Dataset.from_pandas(corpus)

alphabet: set = set(functools.reduce(lambda x, y: x + y, corpus['list'], []))

In [164]:
dataset[0]

{'Lemmata': 'acarus',
 'list': ['a', 'c', 'a', 'r', 'u', 's'],
 'text': 'a c a r u s',
 '__index_level_0__': 0}

In [167]:
# I use tutorial code from https://huggingface.co/docs/tokenizers/quicktour as example
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "[BOS]", "[EOS]"]) #  Adding [BOS] and [EOS] here
tokenizer.pre_tokenizer = Whitespace()

# Please using TemplateProcessing
# https://huggingface.co/docs/tokenizers/api/post-processors#tokenizers.processors.TemplateProcessing
from tokenizers.processors import TemplateProcessing
tokenizer.post_processor = TemplateProcessing(
    single="[BOS] $A [EOS]",
    special_tokens=[("[BOS]", 1), ("[EOS]", 2)],
)
##################################################

files = ["vocab.txt"]
tokenizer.train(files, trainer)

output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
print(output.ids)
# >> [1, 27255, 18, 95, 13, 5099, 7, 7963, 5114, 6220, 0, 37, 2] <-- you can see there are token [1] in the begining and token [2] at the end of the sequence
print(tokenizer.decode(output.ids))
# >> no [BOS] and [EOS] after decoding




[1, 0, 577, 0, 54, 0, 114, 41, 0, 0, 44, 52, 1944, 1970, 50, 0, 0, 2]
ello y al l o w are yo u


In [149]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "[BOS]", "[EOS]"]) #  Adding [BOS] and [EOS] here
tokenizer.pre_tokenizer = Whitespace()

In [150]:
from tokenizers.processors import TemplateProcessing
tokenizer.post_processor = TemplateProcessing(
    single="[BOS] $A [EOS]",
    special_tokens=[("[BOS]", 1), ("[EOS]", 2)],
)

In [160]:
trainer = trainers.BpeTrainer(
    vocab_size=50)

def batch_iterator():
    for i in range(0, len(dataset), 128):
        yield dataset[i : i + 128]["Lemmata"]

In [161]:
tokenizer.train_from_iterator(
    batch_iterator(),
    trainer=trainer)






thread '<unnamed>' panicked at 'Missing additional token', /__w/tokenizers/tokenizers/tokenizers/src/tokenizer/added_vocabulary.rs:292:21
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace


PanicException: Missing additional token

In [158]:
output = tokenizer.encode("gardov")

In [157]:
tokenizer.decode(output.ids)

'g a r d o v'