In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Make IPython display the value of every top-level expression statement in a
# cell, not only the last one; several cells below rely on this to show more
# than one result.
InteractiveShell.ast_node_interactivity = 'all'


In [None]:
import os 
import src.globals as g
import src.utils as utils

import src.data_handler as handling

In [None]:
# Build the path to the training split inside the project's data folder and
# load it through the project's RawSquadDataset wrapper.
dataset_path = os.path.join(g.DATA_FOLDER,'training_set.json')
squad_dataset = handling.RawSquadDataset(dataset_path)

# Work on a copy of the training frame rather than on the object held by the
# dataset (assumes train_df is a pandas DataFrame — TODO confirm).
df = squad_dataset.train_df.copy()

# Preview the first rows.
df.head()

In [None]:
# Load the embedding model together with its vocabulary. `vocab` is used further
# down to find tokenizer entries it does not contain.
# NOTE(review): `model` is not used by any cell visible here.
model, vocab = utils.load_qa_embedding_model()

In [None]:
from tokenizers import  Tokenizer, Encoding
from tokenizers.models import WordLevel
from tokenizers.normalizers import Lowercase, Strip, StripAccents, NFD, BertNormalizer
from tokenizers.normalizers import Sequence as NormSequence
from tokenizers.pre_tokenizers import Punctuation, Whitespace
from tokenizers.pre_tokenizers import Sequence as PreSequence
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer


# Word-level tokenizer: BERT-style normalisation, then splitting on whitespace
# and punctuation. Unknown words map to the project's UNK token.
tokenizer = Tokenizer(WordLevel(unk_token=g.UNK_TOKEN))
tokenizer.normalizer = BertNormalizer(handle_chinese_chars=False)
tokenizer.pre_tokenizer = PreSequence([Whitespace(), Punctuation()])

# The four special tokens are handed to the trainer so they become part of the
# trained vocabulary; the vocabulary is capped at 40000 entries.
trainer = WordLevelTrainer(
    special_tokens=[g.PAD_TOKEN, g.UNK_TOKEN, g.SOS_TOKEN, g.EOS_TOKEN],
    vocab_size=40000,
)

# The vocabulary is built from the questions only (contexts and answers are not
# included in the training texts).
l = df.question.to_list()
tokenizer.train_from_iterator(l, trainer=trainer)

# Attach the post-processor only after training, so the SOS/EOS strings and ids
# come from the project constants and the trained vocabulary instead of being
# hard-coded literals that must silently match the order of the trainer's
# special_tokens list. The post-processor plays no part in training, so moving
# it here does not change the learned vocabulary.
tokenizer.post_processor = TemplateProcessing(
    single=f"{g.SOS_TOKEN} $A {g.EOS_TOKEN}",
    pair=f"{g.SOS_TOKEN} $A {g.EOS_TOKEN} {g.SOS_TOKEN}:1 $B:1 {g.EOS_TOKEN}:1",
    special_tokens=[
        (g.SOS_TOKEN, tokenizer.token_to_id(g.SOS_TOKEN)),
        (g.EOS_TOKEN, tokenizer.token_to_id(g.EOS_TOKEN)),
    ],
)

# Right-pad batches with the PAD token, using the id it received during training.
tokenizer.enable_padding(direction="right", pad_id=tokenizer.token_to_id(g.PAD_TOKEN), pad_type_id=1, pad_token=g.PAD_TOKEN)


In [None]:
# Show the size of the tokenizer's vocabulary after training.
tokenizer.get_vocab_size()

In [None]:
# Register PAD and UNK as special tokens on the tokenizer; the call's return
# value is displayed as the cell output.
# NOTE(review): both tokens are already in the trainer's special_tokens list in
# the training cell, so this call looks redundant — confirm before removing.
tokenizer.add_special_tokens([g.PAD_TOKEN,g.UNK_TOKEN]) #,g.SOS_TOKEN,g.EOS_TOKEN

In [None]:
# l = df.context.to_list() + df.answer.to_list() 
# Collect the distinct whitespace-separated words of the training texts in `l`
# and show how many there are, as a rough reference for the vocabulary size.
s = set()
for e in l:
    s |= set(e.split())

len(s)
        

#tokenizer.encode('To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?').tokens



In [None]:
# Persist the trained tokenizer as JSON.
# NOTE(review): the path is hard-coded and relative to the working directory,
# whereas the dataset path above is built from g.DATA_FOLDER — confirm whether
# this should use the same folder constant.
tokenizer.save('data/tokenizer.json')

In [None]:
# Print every tokenizer vocabulary entry that is absent from the embedding
# model's `vocab`, then show how many such entries were found.
n = 0
for e in tokenizer.get_vocab():
    if e in vocab:
        continue
    print(e)
    n += 1
n

In [None]:
# Spot-check one unusual vocabulary entry.
# 1) Its id in the tokenizer vocabulary (indexing the vocab mapping directly, so
#    a missing entry would raise rather than return a placeholder).
tokenizer.get_vocab()["tδ"]

# 2) Rows whose *context* contains the string.
df[df['context'].str.contains('tδ')]

# 3) Training texts in `l` (the questions) that contain the string.
for e in l :
    if 'tδ' in e:
        print(e)

In [None]:
# Show which token is stored under id 2, and the id assigned to 'plda'
# (both results are displayed).
tokenizer.id_to_token(2)
tokenizer.token_to_id('plda')

In [None]:
# Pick two context passages by index label to use as tokenization samples,
# and display both.
s1 = df.loc[39660,'context']
s2 = df.loc[39601,'context']
s1
s2

In [None]:
# Encode the two sample passages as one batch and print, for each encoding,
# its ids, its tokens, and its special-tokens mask (one printed list per field).
encodings: list[Encoding] = tokenizer.encode_batch([s1, s2])

for attr in ("ids", "tokens", "special_tokens_mask"):
    print([getattr(enc, attr) for enc in encodings])

In [None]:
# Print the id assigned to 'hokkien'. The vocab mapping is indexed directly,
# so this fails if the word did not make it into the vocabulary.
print(tokenizer.get_vocab()['hokkien'])