In [5]:
# !pip install datasets
# !huggingface-cli login

In [8]:
# from datasets import load_dataset
# load_dataset("balochiml/balochi-language-data", data_dir="data", cache_dir="../data")

# Generate the processed data without English characters

In [1]:
import os

def get_txt_file_paths(directory):
    """Recursively collect the paths of all ``.txt`` files under ``directory``.

    Args:
        directory: Root directory to search; every sub-directory is walked.

    Returns:
        A sorted list of file paths, each built by joining the directory in
        which the file was found with the file name. The list is empty when
        no file name ends with ``.txt`` (the match is case-sensitive) or when
        ``directory`` does not exist.
    """
    txt_file_paths = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(directory)
        for file in files
        if file.endswith(".txt")
    ]
    # os.walk yields entries in an arbitrary, filesystem-dependent order; sort
    # so that repeated runs see the files in the same sequence.
    txt_file_paths.sort()
    return txt_file_paths

# Directory holding the raw text files; every .txt file beneath it is an input.
directory_path = "../data/raw_text"
txt_paths = get_txt_file_paths(directory_path)

# os.walk silently yields nothing for a missing directory, so fail loudly here
# instead of carrying an empty file list into the later cells.
assert txt_paths, f"no .txt files found under {directory_path!r}"

len(txt_paths)


4294

In [2]:
import re

def clean_text(file_path):
    """Return the contents of a UTF-8 text file with ASCII letters and digits removed.

    Two substitutions are applied to the whole file, in this order:

    1. Every character in ``a-z``, ``A-Z`` and ``0-9`` is deleted. Characters
       outside those ASCII ranges (including accented Latin letters) are kept.
    2. Each run of whitespace other than a newline is collapsed to a single
       space. Newlines are left as they are, so the line structure survives.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The cleaned text as a single string.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    # Pass 1: drop ASCII alphanumerics.
    without_ascii_alnum = re.sub(r'[a-zA-Z0-9]', '', raw)

    # Pass 2: squeeze horizontal whitespace; "[^\S\n]" is "whitespace that is
    # not a newline".
    return re.sub(r'[^\S\n]+', ' ', without_ascii_alnum)

In [3]:
# Write each cleaned document into '../data/processed_text', keeping the
# original file name.
processed_dir = '../data/processed_text'

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before the loop starts.
os.makedirs(processed_dir, exist_ok=True)

for path in txt_paths:
    cleaned_text = clean_text(path)

    # os.path.basename respects the platform's path separator, unlike
    # splitting on "/".
    # NOTE: only the file name is kept, so raw files that share a name in
    # different sub-directories map to the same output file and the later one
    # overwrites the earlier one.
    output_path = os.path.join(processed_dir, os.path.basename(path))
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_text)


In [4]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Build the tokenizer around a byte-pair-encoding model; "[UNK]" is registered
# as the model's unknown token.
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)

In [5]:
from tokenizers.pre_tokenizers import Whitespace
# Attach the library's Whitespace pre-tokenizer to the tokenizer.
# NOTE(review): no normalizer is set in the visible cells, and the vocabulary
# printed further down contains Arabic presentation-form characters; adding a
# Unicode normalizer may be worth evaluating.
whitespace_splitter = Whitespace()
tokenizer.pre_tokenizer = whitespace_splitter

In [6]:
from tokenizers.trainers import BpeTrainer

# Special tokens handed to the trainer. "[UNK]" is the same string that was
# given to the BPE model as its unknown token.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

# Trainer settings: target vocabulary size of 40000, a minimum frequency of 2,
# and a progress display while training.
trainer = BpeTrainer(
    vocab_size=40000,
    min_frequency=2,
    special_tokens=special_tokens,
    show_progress=True,
)

In [7]:
# Collect every .txt file under '../data/processed_text'; these are the files
# the tokenizer is trained on.
processed_files = get_txt_file_paths(directory="../data/processed_text")

# Expect exactly one processed file per raw input file. A mismatch means the
# processed directory holds a different number of .txt files than were read,
# e.g. because two raw files shared a file name or older files are present.
assert len(txt_paths) == len(processed_files)
len(processed_files)

4294

In [8]:
# Train the tokenizer on the processed text files using the trainer
# configured above.
tokenizer.train(files=processed_files, trainer=trainer)






In [9]:
# Display the tokenizer's model object; the recorded output is the repr of a
# tokenizers.models.BPE instance.
tokenizer.model

<tokenizers.models.BPE at 0x114910dd0>

In [10]:
# Report the vocabulary size after training; the recorded output (40000)
# equals the vocab_size passed to BpeTrainer.
tokenizer.get_vocab_size()

40000

In [11]:
# Preview a small slice of the learned vocabulary (token -> id) instead of
# dumping the whole mapping into the cell output. The entries shown are simply
# the first ones the mapping yields, not a sorted or ranked selection.
vocab = tokenizer.get_vocab()
dict(list(vocab.items())[:50])

{'وداع': 21045,
 'وسیل': 16315,
 'گھنٹہ': 15020,
 'لسانیں': 24958,
 'نیکی': 4830,
 'پیِ': 34528,
 'ہد': 20306,
 'ریموٹ': 39651,
 'ولک': 35099,
 'مّا': 21551,
 'هال': 2349,
 'دیئیں': 39545,
 'همگرنچ': 19468,
 'ربیدگءَ': 26168,
 'ۓِ': 13276,
 'اَہ': 10481,
 'پمیش': 1235,
 'علاقہ': 6854,
 'زمانگءَ': 3377,
 'مزھرا': 10075,
 'مُک': 4909,
 'وتسر': 13772,
 'بندگءَ': 14795,
 'شن': 1353,
 'آنگو': 7345,
 'پْروشی': 29354,
 'كپت': 19130,
 'ﮕﺎﺭ': 30181,
 'کُشان': 30562,
 'لسانی': 5701,
 'لاطاکیه': 38356,
 'قربانیءِ': 37641,
 'وانس': 25383,
 'بےاںت': 7657,
 'گناہ': 4020,
 'ںْ': 14704,
 'بوسگے': 32604,
 'بیسیمہءِ': 22887,
 'śéń': 38410,
 'طلا': 20225,
 'نڈُک': 19500,
 'ٹہینتگ': 20009,
 'ترس': 1897,
 'ﮎ': 376,
 'السلام': 25174,
 'کماتگ': 12586,
 'پدریچءِ': 26446,
 'ﻤﺎﺭ': 27352,
 'تَہار': 22655,
 'دیّت': 30338,
 'کشّگ': 3752,
 'باریں': 1136,
 'نَدر': 22263,
 'سِکّ': 22182,
 'کودکی': 12401,
 'جتاءُ': 31366,
 'لّت': 11285,
 'ستانے': 31050,
 'دێمروی': 21246,
 'ملامت': 5761,
 'آاوں': 24805,
 'پون': 1567,
 

In [12]:
# Serialize the trained tokenizer to a single JSON file.
tokenizer_path = "../models/balochi-tokenizer.json"

# Make sure the target directory exists first, so the save does not fail on a
# fresh checkout where "../models" has not been created yet.
os.makedirs(os.path.dirname(tokenizer_path), exist_ok=True)
tokenizer.save(tokenizer_path)