# First, try training the Basic Tokenizer on a small text

In [1]:
from minbpe import BasicTokenizer
# Train a BasicTokenizer on a tiny toy string, then round-trip it.
# NOTE: `tokenizer` and `text` are notebook-level names; later cells reuse them.
tokenizer = BasicTokenizer()
text = "aaabdaaabac"
# Second argument is the target vocabulary size (byte tokens + number of merges).
tokenizer.train(text, 256 + 3) # 256 are the byte tokens, then do 3 merges
# Encode the training string itself; expected token ids are shown below.
print(tokenizer.encode(text))
# [258, 100, 258, 97, 99]
# Decode those ids back; expected to reproduce the original string.
print(tokenizer.decode([258, 100, 258, 97, 99]))
# aaabdaaabac
# Persist the trained tokenizer under the prefix "toy".
tokenizer.save("toy")
# writes two files: toy.model (for loading) and toy.vocab (for viewing)


100%|██████████| 3/3 [00:00<?, ?it/s]

[258, 100, 258, 97, 99]
aaabdaaabac





# Now load all Uzbek nasr books in Cyrillic and train the Basic Tokenizer on the big Uzbek nasr text

In [2]:
import re
import pathlib
import pandas as pd
from collections import Counter
from tqdm import tqdm

In [3]:
# Folder that holds the OCR'd book texts.
# NOTE(review): machine-specific absolute path; point it at your local copy of the dataset.
data_folder = pathlib.Path(r"C:\Users\amrul\programming\nlp_related\datasets\whole_ocr_collection-20231208T081731Z-001\whole_ocr_collection")
# Keep regular files whose extension is exactly ".txt". A substring test on the
# suffix would also accept extensions such as ".txtx", and directories are skipped.
# iterdir() yields entries in arbitrary order, so sort to make the file order
# (and therefore the concatenated corpus built from it) the same on every run.
files = sorted(
    file for file in data_folder.iterdir()
    if file.is_file() and file.suffix == ".txt"
)
print(f"There are {len(files)} txt files in {data_folder}")

There are 713 txt files in C:\Users\amrul\programming\nlp_related\datasets\whole_ocr_collection-20231208T081731Z-001\whole_ocr_collection


In [4]:
# Names of the files that must be excluded when selecting the Cyrillic-only books.
non_cyrillic_filenames = [
    "abdulla_chimirzayev_hayot_yog_dulari_hikoyalar_whole_ocr.txt",
    "abdulla_qahhor_hikoyalar_1933_whole_ocr.txt",
    "abdulla_qahhor_qanotsiz_chittak_1937_whole_ocr.txt",
    "abdulla_qahhor_qotilning_tug_ilishi_1933_whole_ocr.txt",
    "abdulla_qodiriy_jinlar_bazmi_hikoyalar_whole_ocr.txt",
    "baxtiyor_omon_boburning_bolaligi_hikoyalar_start_100_whole_ocr.txt",
    "bibi_robia_saidova_parvoz_hikoyalar_start_100_whole_ocr.txt",
    "cho_lpon_kecha_va_kunduz_roman_start_100_whole_ocr.txt",
    "dinora_rahimova_qishloqdagi_buvijonim_qissa_start_100_whole_ocr.txt",
    "erkin_a_zam_ertak_bilan_xayrlashuv_qissalar_va_hikoyalar_start_100_whole_ocr.txt",
    "hasan_muxtorov_armon_qissa_start_200_whole_ocr.txt",
    "https_n_ziyouz_com_https_www_phoca_cz_phocadownload_whole_ocr.txt",
    "husaynxon_orifiy_ayanchli_qismat_qissa_va_hikoyalar_start_200_whole_ocr.txt",
    "ibrohim_rahim_fidoyilar_roman_start_200_whole_ocr.txt",
    "ilhom_zoyir_yuz_tillo_mojarosi_roman_start_200_whole_ocr.txt",
    "inomjon_abdiyev_arslon_yelkasidagi_xazina_qissa_start_200_whole_ocr.txt",
    "kimsan_mashrab_turon_o_g_li_devona_mashrab_badia_start_200_whole_ocr.txt",
    "latif_mahmudiv_sevgi_desam_hikoyalar_start_200_whole_ocr.txt",
    "mamatqul_hazratqulov_eshiklar_ochiq_qissa_start_200_whole_ocr.txt",
    "mirza_karim_mohlaroyim_qissa_start_300_whole_ocr.txt",
    "muhammad_ismoil_bahorning_eng_so_nggi_lolasi_start_300_whole_ocr.txt",
    "muhammad_ismoil_zabarjad_qissa_start_300_whole_ocr.txt",
    "murod_muhammad_do_st_galatepaga_qaytish_qissa_start_300_whole_ocr.txt",
    "muyassar_tilovova_burgutlar_hikoyalar_start_300_whole_ocr.txt",
    "normurod_norqobilov_g_animlar_qissa_start_300_whole_ocr.txt",
    "normurod_norqobilov_temur_g_ori_1999_start_300_whole_ocr.txt",
    "nurulla_chori_bo_ron_tingan_kecha_hikoyalar_start_400_whole_ocr.txt",
    "oqiljon_husan_tog_da_o_sgan_bola_roman_start_400_whole_ocr.txt",
    "oybek_navoiy_roman_start_400_whole_ocr.txt",
    "pirimqul_qodirov_shohruh_va_gavharshod_roman_start_400_whole_ocr.txt",
    "shuhrat_yetim_boshin_silaganlar_hikoya_va_qissalar_start_500_whole_ocr.txt",
    "sotim_avaz_temurg_ozi_to_ra_start_600_whole_ocr.txt",
    "tog_ay_murod_ot_kishnagan_oqshom_qissalar_2006_start_600_whole_ocr.txt",
    "tog_ay_murod_yulduzlar_mangu_yonadi_qissalar_start_600_whole_ocr.txt",
    "xayriddin_sultonov_ko_ngil_ozodadur_qissa_start_600_whole_ocr.txt",
    "xayriddin_sultonov_saodat_sohili_qissa_start_600_whole_ocr.txt",
    "xayriddin_sultonov_saodat_sohili_start_600_whole_ocr.txt",
    "xudoyberdi_to_xtaboyev_qasoskorning_oltin_boshi_roman_start_600_whole_ocr.txt",
    "xurshid_davron_bibixonim_qissasi_hikoya_va_qissalar_start_600_whole_ocr.txt",
    "xurshid_davron_tarixiy_hikoyalar_start_600_whole_ocr.txt",
    "zohir_a_lam_afandining_qirq_bir_pashshasi_qissa_start_700_whole_ocr.txt",
]

# Drop every file whose name appears on the non-Cyrillic exclusion list;
# whatever remains is treated as a Cyrillic book.
cyr_files = list(
    filter(lambda path: path.name not in non_cyrillic_filenames, files)
)
print(f"extracted {len(cyr_files)} cyrillic files out of {len(files)}")

extracted 672 cyrillic files out of 713


In [5]:
from tqdm import tqdm
# Read every Cyrillic book and concatenate them into one training corpus.
# The pieces are collected in a list and joined once at the end: rebuilding the
# accumulated string inside the loop copies everything read so far on each
# iteration, which is quadratic in the total corpus size.
text_parts = []

with tqdm(total=len(cyr_files)) as pbar:
    for file in cyr_files:
        # Each book is prefixed with a newline, so big_text starts with "\n"
        # and consecutive books are separated by "\n".
        text_parts.append(f"\n{file.read_text(encoding='utf-8')}")
        # Show a fixed-width (40 characters) file name next to the progress bar.
        pbar.set_description(f"Read {file.name[:40]:40}")
        pbar.update(1)

big_text = "".join(text_parts)
del text_parts  # free the per-file strings once they have been joined

Read zulfiya_qurolboy_qizi_mashaqqatlar_girdo: 100%|██████████| 672/672 [00:35<00:00, 18.90it/s] 


In [None]:
# Start from a fresh tokenizer (this rebinds the notebook-level `tokenizer`).
tokenizer = BasicTokenizer()

# Moment of truth: train the tokenizer on the big text.
# For a start, let's go with a hundred thousand merges.
num_merges = 100000
tokenizer.train(
    big_text, 256 + num_merges, verbose=True, prompt_interval=1000
)  # 256 are the byte tokens, then do num_merges merges

# Sanity check on the corpus: encode its first 1000 characters.
# This must slice `big_text`; `text` is the 11-character toy string from the
# first cell, so slicing it would only re-encode the toy example.
print(tokenizer.encode(big_text[:1000]))

# Decode a handful of token ids above the 256 byte tokens to see what they map to.
print(tokenizer.decode([258, 259, 260, 261, 262]))

tokenizer.save("uzbek_nasr")
# writes two files: uzbek_nasr.model (for loading) and uzbek_nasr.vocab (for viewing)

  0%|          | 0/100000 [00:00<?, ?it/s]