In [12]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import NFD
from tokenizers.pre_tokenizers import Whitespace, Sequence, Digits, Punctuation
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders
import string

In [13]:
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]",))
tokenizer.normalizer = normalizers.Sequence([NFD()])
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits(individual_digits=False), Punctuation()])
tokenizer.post_processor = TemplateProcessing(
    single="[START] $A [END]",
    special_tokens=[
        ("[START]", 0),
        ("[END]", 1),
    ],
)
trainer = WordPieceTrainer(vocab_size=1000, special_tokens=["[START]", "[END]", "[UNK]", "[SEP]", "[PAD]", "[MASK]"], 
                     show_progress=True, initial_alphabet = list(string.ascii_letters) + list(string.digits) + list(string.punctuation))
tokenizer.decoder = decoders.WordPiece()

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import NFD
from tokenizers.pre_tokenizers import Whitespace, Sequence, Digits, Punctuation
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordPieceTrainer
from tokenizers import decoders
import string



lang=[["gl","en"],
      ["glpt","en"],
      ["pt","en"],
      ['tr','en']]

path="../../dataset/comparisions/"

for cur,target in lang:
    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]",))
    tokenizer.normalizer = normalizers.Sequence([NFD()])
    tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits(individual_digits=False), Punctuation()])
    tokenizer.post_processor = TemplateProcessing(
        single="[START] $A [END]",
        special_tokens=[
            ("[START]", 0),
            ("[END]", 1),
        ],
    )
    trainer = WordPieceTrainer(vocab_size=1000, special_tokens=["[START]", "[END]", "[UNK]", "[SEP]", "[PAD]", "[MASK]"], 
                        show_progress=True, initial_alphabet = list(string.ascii_letters) + list(string.digits) + list(string.punctuation))
    tokenizer.decoder = decoders.WordPiece()

    name=path+cur+"_to_"+target+"train.csv"
    data = pd.read_csv(name)
    columns=list(data.columns)
    X_train=data[columns[1]]
    print(X_train)
    def dataset_interator():
        for article in X_train:
            yield article
    tokenizer.train_from_iterator(dataset_interator(), trainer=trainer)
    tokenizer.save(f"../../models/{cur}_tokeniser.json")

    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]",))
    tokenizer.normalizer = normalizers.Sequence([NFD()])
    tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits(individual_digits=False), Punctuation()])
    tokenizer.post_processor = TemplateProcessing(
        single="[START] $A [END]",
        special_tokens=[
            ("[START]", 0),
            ("[END]", 1),
        ],
    )
    trainer = WordPieceTrainer(vocab_size=1000, special_tokens=["[START]", "[END]", "[UNK]", "[SEP]", "[PAD]", "[MASK]"], 
                        show_progress=True, initial_alphabet = list(string.ascii_letters) + list(string.digits) + list(string.punctuation))
    tokenizer.decoder = decoders.WordPiece()

    name=path+cur+"_to_"+target+"train.csv"
    data = pd.read_csv(name)
    columns=list(data.columns)
    X_train=data[columns[2]]
    def dataset_interator():
        for article in X_train:
            yield article
    tokenizer.train_from_iterator(dataset_interator(), trainer=trainer)
    tokenizer.save(f"../../models/{cur}s_{target}_tokeniser.json")



#X_train,X_test,y_train,y_test=train_test_split(data["Hilichurl"],data["English"],test_size=0.2,random_state=1234)
#data = pd.read_csv("/home/rahulvadhyar/Documents/Hilichurlian-Eng/dataset/Hilichurl_Eng - Sheet1.csv")

0        but that does n't mean that all , or many , or...
1        there 's one called tor , which is another nut...
2                             let us celebrate diversity .
3        and all the evidence from around the world is ...
4                                               whatever .
                               ...                        
10012    here you can see i 've added a light , i 'm tu...
10013    i 'm sorry , you 've got to do a little bit of...
10014                      he sees something in the path .
10015    our modern world is communicating with itself ...
10016                     let 's go through the evidence .
Name: en, Length: 10017, dtype: object






0        and as it rises with the strings , i see that ...
1        but that does n't mean that all , or many , or...
2        my travels to afghanistan began many , many ye...
3        i see the polymerase and the enzymes and so fo...
4                        you 've got to get out of there .
           

In [15]:

    # for article in y_train:
    #     yield article

In [16]:
tokenizer.train_from_iterator(dataset_interator(), trainer=trainer)







In [17]:
tokenizer.save("hilliTokenizer.json")

In [18]:
output = tokenizer.encode("Mi Muhe Ye Beru Dada, Mi Valo Dada.")
print(output.ids)
tokenizer.decode(output.ids)

[0, 126, 228, 165, 237, 246, 17, 126, 236, 246, 19, 1]


'Mi Muhe Ye Beru Dada, Mi Valo Dada.'

In [19]:
output = tokenizer.encode("Its me, hi, I'm the problem its me.")
print(output.ids)
tokenizer.decode(output.ids)

[0, 46, 101, 115, 82, 112, 17, 77, 102, 17, 46, 12, 82, 89, 111, 112, 85, 286, 116, 198, 109, 78, 101, 115, 82, 112, 19, 1]


"Its me, hi, I ' m the problem its me."

In [20]:
from tokenizers.tools import EncodingVisualizer
encoding = EncodingVisualizer(tokenizer)
encoding("Its me, hi, I'm the problem its me.")