In [1]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import NFD
from tokenizers.pre_tokenizers import Whitespace, Sequence, Digits, Punctuation
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders
import string

In [17]:
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]",))
tokenizer.normalizer = normalizers.Sequence([NFD()])
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits(individual_digits=False), Punctuation()])
tokenizer.post_processor = TemplateProcessing(
    single="[START] $A [END]",
    special_tokens=[
        ("[START]", 1),
        ("[END]", 2),
    ],
)
trainer = WordPieceTrainer(vocab_size=1000, special_tokens=["[START]", "[END]", "[UNK]", "[SEP]", "[PAD]", "[MASK]"], 
                     show_progress=True, initial_alphabet = list(string.ascii_letters) + list(string.digits) + list(string.punctuation))
tokenizer.decoder = decoders.WordPiece()

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
data = pd.read_csv("/home/rahulvadhyar/Documents/Hilichurlian-Eng/dataset/Hilichurl_Eng -Sheet1.csv")

X_train,X_test,y_train,y_test=train_test_split(data["Hilichurl"],data["English"],test_size=0.2,random_state=1234)

In [19]:
def dataset_interator():
    # for article in X_train:
    #     yield article
    for article in y_train:
        yield article

In [20]:
tokenizer.train_from_iterator(dataset_interator(), trainer=trainer)







In [23]:
tokenizer.save("englighTokenizer.json")

In [9]:
output = tokenizer.encode("Mi Muhe Ye Beru Dada, Mi Valo Dada.")
print(output.ids)
tokenizer.decode(output.ids)

[1, 122, 148, 130, 168, 189, 17, 122, 165, 189, 19, 2]


'Mi Muhe Ye Beru Dada, Mi Valo Dada.'

In [21]:
output = tokenizer.encode("Its me, hi, I'm the problem its me.")
print(output.ids)
tokenizer.decode(output.ids)

[1, 46, 106, 108, 240, 17, 77, 103, 17, 46, 12, 82, 126, 85, 334, 123, 115, 111, 113, 175, 108, 240, 19, 2]


"Its me, hi, I ' m the problem its me."

In [22]:
from tokenizers.tools import EncodingVisualizer
encoding = EncodingVisualizer(tokenizer)
encoding("Its me, hi, I'm the problem its me.")