In [1]:
!pip install datasets
!pip install myanmartools




In [7]:

from datasets import load_dataset
ds = load_dataset("wikipedia", language="my",
                  date="20250120", trust_remote_code=True, split="train")
print(ds)

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 110833
})


In [76]:
import re

cons = "\u1000-\u1021"
independent_vowel = "[\u1023-\u1027\u1029\u102a]"
cons_init = f"[{cons}\u1025\u1026\u1027\u1029\u103f]"
# consonants + independent_vowel (!caution! 103F is need extra requirement)
medial = "[\u103b-\u103e]{,4}"
vowel = "[\u102b-\u1032]{1,2}\u103a?"
asat = "\u103a"
virama = "\u1039"
cons_asat = f"[{cons}](?:{asat}{virama}|[{asat}{virama}])"
kinzi = f"[\u1004\u101b]{asat}{virama}"
tone = "[\u1036\u1037\u1038]{1,2}"
legaund = f"\u104e\u1004{asat}\u1038|\u104e\u1004{asat}"
symbol1 = f"[\u104c\u104d\u104f]|{legaund}|\u104e"
symbol2 = f"\u103b{asat}"
contra1 = "\u103b\u102c"
digit = "[\u1040-\u1049,\.]+"
# others = "(?![a-zA-Z0-9])[\t-~\x0a\x0d\u104a\u104b]"
others = "[ \u104a\u104b]"
# cons + medial* + vowel* + cons_asat? + tone*
# syllable_based_pattern = f"(?<={cons_init}){medial}(?:{vowel})?" + \
#     f"(?:{cons_asat}(?:{contra1})?)?(?:{tone})?"
syllable_based_pattern = f"{cons_init}{medial}(?:{vowel})?" + \
     f"(?:{cons_asat}(?:{contra1})?)?(?:{tone})?"


def extract_syllables(text):
    # Updated regular expression to handle complex Burmese syllable structures
    #text = text.replace(f'{asat}{virama}', asat)
    #text = text.replace(f'{virama}', asat)
    text = text.replace('\u1037\u103a','\u103a\u1037')
    syllable_pattern = (
        rf"{symbol1}|"
        rf"{symbol2}|"
        rf"{syllable_based_pattern}|"
        rf"{independent_vowel}|"
        rf"{others}"
        # rf"{cons_init}"
    )
    syllables = re.findall(syllable_pattern, text)
    #syllables = [syll for syll in syllables if syll != '']
    return list(filter(None, syllables))

syllable_break_pattern = f"(?<={cons_init}){medial}(?:{vowel})?" + \
    f"(?:{cons_asat}(?:{contra1})?)?(?:{tone})?"

def extract_syllable_to_2parts(text):
    # Updated regular expression to handle complex Burmese syllable structures
    #text = text.replace(f'{asat}{virama}', asat)
    #text = text.replace(f'{virama}', asat)
    syllable_pattern = (
        rf"{symbol1}|"
        rf"{symbol2}|"
        rf"{syllable_break_pattern}|"
        rf"{independent_vowel}|"
        rf"{others}|"
        rf"{cons_init}"
    )
    syllables = re.findall(syllable_pattern, text)
    syllables = list(filter(None, syllables))
    return syllables

In [None]:
# Example usage
text1 = "သင်္ကြန်၊ မန္တလေး၊ ယောက်ျား၊ လက်ချ်မီး၊ သင်္ချာ။ ၎င်း၊ ၎င်၊ ၎၊ ၁၀.၁ ၁၀,၃၀၀.၀၀abc124de\t~ဦးဦ"
text2 = "မြန်မာစာကို မြန်မာတစ်ယောက်က စစ်သည်။"
extract_syllables(text1)
extract_syllable_to_2parts(text1)

In [11]:
from myanmartools import ZawgyiDetector

zg_detector = ZawgyiDetector()
score = zg_detector.get_zawgyi_probability('မ္း')
print(score)

0.9997572675217831


In [None]:
syllables = []
from tqdm.notebook import tqdm
syllables_map = {}
i = 0
syllables_count = 0
syllables_vocab = []
syllables_parts_vocab = []
sentences = []
syllables_hash = {}
ds_size = ds.num_rows
#ds_size = 1
for data in tqdm(ds, total=ds_size, desc="Processing records"):
    i = i + 1
    score = zg_detector.get_zawgyi_probability(data["text"])
    if score > 0.80:
        print(f"zawgyi encoded sentence skipped with scoreb {score}")
        continue
    syllables_out = extract_syllable_to_2parts(data["text"])
    for syll in syllables_out:
        if syll in syllables_hash:
            syllables_hash[syll]["count"] += 1
        else:
            syllables_hash[syll] = {"part":syll, "count":1}
    syllables = syllables + syllables_out
    syllables_count= syllables_count + len(syllables_out)
    if i%1000 == 0:
        syllables = sorted(set(syllables))
    if i > ds_size:
        break
if i % 1000 != 0:
    syllables = sorted(set(syllables))

In [15]:
print(f"Unique syllables {len(syllables)}")
print(f"Total syllables count from dataset {syllables_count}")

Unique syllables 2057
Total syllables count from dataset 62762249


In [16]:
import pandas as pd
df = pd.DataFrame.from_dict(syllables_hash)
df = df.transpose()
print(df)
df.to_csv("syllables_part.csv")

          part    count
ဝ            ဝ   441056
ီ            ီ   410612
က            က  2617855
ပ            ပ  1788761
ီး          ီး   167278
...        ...      ...
ျုရ်      ျုရ်        2
ျာာ်      ျာာ်        1
ှိုဏ်း  ှိုဏ်း        1
ေါဒ်      ေါဒ်        2
ှာား      ှာား        1

[2057 rows x 2 columns]


In [267]:
# article to sentence
def article_to_sentence(article):
    sentences = re.split(r'\u104b[\n ]?|\n',article)
    return sentences
    
# sentence to phrase
def sentence_to_phrase(sentence):
    phrases = re.split(r'\s+|\u104a', sentence)
    return phrases

# phrase to syllable
def phrase_to_syllable(phrase):
    sylls = extract_syllables(phrase)
    return sylls

# get trigram from sentence
def get_trigram_from_sentence(sentence):
    phrases = sentence_to_phrase(sentence)
    trigrams = []
    if not phrases:
        return []
    for pidx in range(0,len(phrases)):
        eos = 2 if pidx==len(phrases)-1 else 0
        sylls = phrase_to_syllable(phrases[pidx])
        if len(sylls) < 3 :
            continue
        for i in range(2, len(sylls)):
            eop = 1 if i==len(sylls)-1 else 0
            state = eop | eos
            #trigrams.append(''.join([sylls[i-2], sylls[i-1], sylls[i],str(state)]))
            trigrams.append(''.join([sylls[i-2], sylls[i-1], sylls[i]]))
    return trigrams

In [105]:
test_str = "မြန်မာနိုင်ငံတွင် ပင်းယခေတ်နှင့် ကုန်းဘောင်ခေတ်အတွင်း အဘိဓာန် အစောင်စောင် ပေါ်ထွက်ခဲ့ဘူးသည်။ သို့သော် ယင်းအဘိဓာန်တို့သည် မြန်မာအနက်နှင့် ယှဉ်တွဲဖော်ပြထားသော ပါဠိ ဝေါဟာရ စာရင်းမျိုးသာ ဖြစ်ကြသည်။ ဖျာပုံမြို့နေ အရှင်ဩဘာသသည် ဒုတိယကမ္ဘာစစ်ကြီး မဖြစ်ပွားမီအချိန်က မြန်မာဝေါဟာရကို မြန်မာလို အနက်ဖွင့်သော အဘိဓာန်တစ်စောင်ကို ပြုစုသည်။ သို့သော် ယင်း အဘိဓာန်စာမူတို့သည် စစ်အတွင်းက ပျောက်ဆုံးခဲ့သည်ဟု သိရ၏။ စစ်ပြီးခေတ်တွင် ပြန်လည်ပြုစုရာ၊ ၁၉၄၇ ခုနှစ်တွင် အဘိဓာန်နှစ်တွဲ ထုတ်ဝေနိုင်ခဲ့သည်။\nရန်ကုန်တက္ကသိုလ်တွင် ပြုစုလျက်ရှိသော တက္ကသိုလ်မြန်မာအဘိဓာန်ကိုမူကား ဒုတိယ ကမ္ဘာစစ်ကြီး အတွင်း ၁၉၄၄ ခုနှစ်ကပင် စတင်ခဲ့သည်။ ထိုနှစ်တွင် မြန်မာနိုင်ငံတော် ပညာ့တံခွန်အသင်းကို နိုင်ငံတော် အစိုးရ၏ အကူအညီဖြင့် တည်ထောင်သည်။ အသင်း၏ ရည်ရွယ်ချက်မှာ မြန်မာ့ ယဉ်ကျေးမှုအရပ်ရပ်ကို လေ့လာသုံးသပ် ပြန်လည် ဖော်ထုတ် ရန် ဖြစ်၏။"
sentences = article_to_sentence(test_str)

In [106]:
trigram_output = []
for sentence in sentences:
    trigram_output += get_trigram_from_sentence(sentence)

In [240]:
import re

class ZGDetector:
    zawgyi_regex = r"\u1031\u103b"  # e+medial ra
    zawgyi_regex += r"|^[\u1031\u103b]"  # beginning e or medial ra
    zawgyi_regex += r"|[\u1022-\u1030\u1032-\u1039\u103b-\u103d\u1040-\u104f]\u103b"  # independent vowel, dependent vowel, tone , medial ra wa ha (no ya because of 103a+103b is valid in unicode) , digit , symbol + medial ra
    zawgyi_regex += r"|\u1039$"  # end with asat
    zawgyi_regex += r"|\u103d\u103c"  # medial ha + medial wa
    zawgyi_regex += r"|\u103b\u103c"  # medial ra + medial wa
    zawgyi_regex += r"|[\u1000-\u1021]\u1039[\u101a\u101b\u101d\u101f\u1022-\u102a\u1031\u1037-\u1039\u103b\u1040-\u104f]"  # consonant + asat + ya ra wa ha independent vowel e dot below visarga asat medial ra digit symbol
    zawgyi_regex += r"|\u102e[\u102d\u103e\u1032]"  # II+I II ae
    zawgyi_regex += r"|\u1032[\u102d\u102e]"  # ae + I II
    # zawgyi_regex += r"|[\u102d\u102e][\u102d\u102e]"  # I II , II I, I I, II II  # [FIXED!! It is not so valuable zawgyi pattern ]
    # zawgyi_regex += r"|[\u102f\u1030][\u102f\u1030]"  # U UU + U UU  # [FIXED!! It is not so valuable zawgyi pattern ]
    # zawgyi_regex += r"|[\u102b\u102c][\u102b\u102c]"  # tall aa short aa  # [FIXED!! It is not so valuable zawgyi pattern ]
    zawgyi_regex += r"|[\u1090-\u1099][\u102b-\u1030\u1032\u1037\u103c-\u103e]"  # shan digit + vowel
    zawgyi_regex += r"|[\u1000-\u102a]\u103a[\u102c-\u102e\u1032-\u1036]"  # consonant + medial ya + dependent vowel tone asat
    zawgyi_regex += r"|[\u1023-\u1030\u1032-\u1039\u1040-\u104f]\u1031"  # independent vowel dependent vowel tone digit + e [ FIXED !!! - not include medial ]
    zawgyi_regex += r"|[\u107e-\u1084][\u1001\u1003\u1005-\u100f\u1012-\u1014\u1016-\u1018\u101f]"  # other shapes of medial ra + consonant not in Shan consonant
    zawgyi_regex += r"|\u1025\u1039"  # u + asat
    zawgyi_regex += r"|[\u1081\u1083]\u108f"  # eain-dray
    zawgyi_regex += r"|\u108f[\u1060-\u108d]"  # short na + stack characters
    zawgyi_regex += r"|[\u102d-\u1030\u1032\u1036\u1037]\u1039"  # I II ae dow bolow above + asat typing error
    zawgyi_regex += r"|\u102c\u1039"  # aa + asat awww
    zawgyi_regex += r"|\u101b\u103c"  # ya + medial wa
    zawgyi_regex += r"|[^\u1040-\u1049]\u1040\u102d"  # non digit + zero + \u102d (i vowel) [FIXED!!! rules tested zero + i vowel in numeric usage]
    zawgyi_regex += r"|\u1031?\u1040[\u102b\u105a\u102e-\u1030\u1032\u1036-\u1038]"  # e + zero + vowel
    zawgyi_regex += r"|\u1031?\u1047[\u102c-\u1030\u1032\u1036-\u1038]"  # e + seven + vowel
    # zawgyi_regex += r"|[\u1000-\u1021]\u103A[\u1000-\u1021]\u1039"  # cons + asat + cons + virama  # [ FIXED!!! REMOVED!!! conflict with Mon's Medial ]
    zawgyi_regex += r"|[\u102f\u1030\u1032]\u1094"  # U | UU | AI + (zawgyi) dot below
    zawgyi_regex += r"|\u1039[\u107E-\u1084]"  # virama + (zawgyi) medial ra


    def count_zawgyi_pattern(self, input_string):
        found = 0
        zawgyi_pattern = re.compile(self.zawgyi_regex)
        matcher = zawgyi_pattern.finditer(input_string)
        for match in matcher:
            found += 1
        return found

    def is_zawgyi(self, input_string):
        zawgyi_pattern = re.compile(self.zawgyi_regex)
        matcher = zawgyi_pattern.search(input_string)
        return bool(matcher)

    def count_zawgyi_pattern_and_show_pattern(self, input_string):
        found = 0
        zawgyi_pattern = re.compile(self.zawgyi_regex)
        matcher = zawgyi_pattern.finditer(input_string)

        for match in matcher:
            found += 1
            print(f"found {found} : {match.group()}")

        print(found)
        return found

In [242]:
zg_detector_rgx = ZGDetector()
zg_detector_rgx.is_zawgyi('မျန်')

False

In [269]:
from tqdm.notebook import tqdm
trigrams_hash={}
ds_size = ds.num_rows
#ds_size = 1
skipped = 0
for data in tqdm(ds, total=ds_size, desc="Processing records"):
    score = zg_detector.get_zawgyi_probability(data["text"])
    if score > 0.80:
        #print(f"zawgyi encoded sentence skipped with scoreb {score}")
        skipped += 1
        continue
    sentences = article_to_sentence(data['text'])
    for sentence in sentences:
        zawgyi = zg_detector_rgx.is_zawgyi(sentence)
        if zawgyi:
            #print(f'skipped sentence')
            continue
        trigrams = get_trigram_from_sentence(sentence)
        if len(trigrams)<1:
            continue
        for trigram in trigrams:
            if trigram in trigrams_hash:
                trigrams_hash[trigram]["count"] += 1
            else:
                trigrams_hash[trigram] = {"count":1}
print(f'Total unique trigrams {len(trigrams_hash)}.', f'Skipped {skipped} of {ds_size} articles.')

Processing records:   0%|          | 0/110833 [00:00<?, ?it/s]

Total unique trigrams 1948554. Skipped 81 of 110833 articles.


In [270]:
len(trigrams_hash)

1948554

In [271]:
df = pd.DataFrame.from_dict(trigrams_hash)
df = df.transpose()
#df.to_csv("trigrams.csv")

In [272]:
df = df.sort_index()

In [None]:
import numpy as np
import math
split_size = math.ceil(df.shape[0]/(1024*1024))
print(split_size)
dfs= np.array_split(df,split_size)
print(split_size, dfs)

In [249]:
chidx = 0
for chank in dfs:
    chidx += 1
    chank.to_csv(f'trigrams_chank_{chidx}.csv', encoding='utf-8-sig')

In [250]:
df.loc[df['count']<5].shape[0]

2146015

In [273]:
df_filtered=df.loc[df['count']>5]

In [274]:
df_filtered.shape[0]

262523

In [253]:
df_filtered.to_csv('trigrams_filtered.csv', encoding='utf-8-sig')

In [254]:
syllables_hash = {}
for row in tqdm(df_filtered.itertuples(), total=df_filtered.shape[0], desc="Getting Unique Syllable"):
    syllables_out = extract_syllables(row.Index)
    for syll in syllables_out:
        if syll in syllables_hash:
            syllables_hash[syll]["count"] += 1
        else:
            syllables_hash[syll] = {"part":syll, "count":1}

Getting Unique Syllable:   0%|          | 0/285297 [00:00<?, ?it/s]

In [255]:
df = pd.DataFrame.from_dict(syllables_hash)
df = df.transpose()
df.to_csv("syllables_filtered.csv", encoding='utf-8-sig')

In [256]:
syllables_hash = {}
for row in tqdm(df_filtered.itertuples(), total=df_filtered.shape[0], desc="Getting Unique Syllable"):
    syllables_out = extract_syllable_to_2parts(row.Index)
    for syll in syllables_out:
        if syll in syllables_hash:
            syllables_hash[syll]["count"] += 1
        else:
            syllables_hash[syll] = {"part":syll, "count":1}

Getting Unique Syllable:   0%|          | 0/285297 [00:00<?, ?it/s]

In [257]:
df = pd.DataFrame.from_dict(syllables_hash)
df = df.transpose()
df.to_csv("syllables_part_filtered.csv", encoding='utf-8-sig')

In [258]:
class MyanmarTokenizer:
    def __init__(self):
        cons = "\u1000-\u1021"
        independent_vowel = "[\u1023-\u1027\u1029\u102a]"
        cons_init = f"[{cons}\u1025\u1026\u1027\u1029\u103f]"
        # consonants + independent_vowel (!caution! 103F is need extra requirement)
        medial = "[\u103b-\u103e]{,4}"
        vowel = "[\u102b-\u1032]{1,2}\u103a?"
        asat = "\u103a"
        virama = "\u1039"
        cons_asat = f"[{cons}](?:{asat}{virama}|[{asat}{virama}])"
        kinzi = f"[\u1004\u101b]{asat}{virama}"
        tone = "[\u1036\u1037\u1038]{1,2}"
        legaund = f"\u104e\u1004{asat}\u1038|\u104e\u1004{asat}"
        symbol1 = f"[\u104c\u104d\u104f]|{legaund}|\u104e"
        symbol2 = f"\u103b{asat}"
        contra1 = "\u103b\u102c"
        digit = "[\u1040-\u1049,\.]+"
        # others = "(?![a-zA-Z0-9])[\t-~\x0a\x0d\u104a\u104b]"
        others = "[ \u104a\u104b]"
        syllable_pattern = f"(?<={cons_init}){medial}(?:{vowel})?" + \
            f"(?:{cons_asat}(?:{contra1})?)?(?:{tone})?"
        self.syllable_break_pattern = (
            rf"{symbol1}|"
            rf"{symbol2}|"
            rf"{syllable_pattern}|"
            rf"{independent_vowel}|"
            rf"{others}|"
            rf"{cons_init}"
        )

    def tokenize(self, text):
        syllables = re.findall(self.syllable_break_pattern, text)
        syllables = list(filter(None, syllables))
        return syllables

In [262]:
import heapq
class PatriciaTrieNode:
    def __init__(self, index, word=None):  # Now stores an index
        self.index = index       # Index into the syllable/sound array
        self.word = word         # The complete word if this node is a terminal node
        self.children = {}       # Dictionary for child nodes (index -> node)

class PatriciaTrie:
    def __init__(self, work_frequencies):
        self.root = PatriciaTrieNode(None)  # Root has no index
        self.syllable_index = []  # Array to store unique syllables/sounds
        self.index_map = {} # Dictionary to store syllable to index mapping
        self.myn_tokenizer = MyanmarTokenizer()
        self.word_frequencies = work_frequencies
        
    def insert(self, word):
        syllables = self._tokenize(word)
        node = self.root

        for syllable in syllables:
            index = self._get_or_create_index(syllable) # Get or create index

            if index in node.children:
                node = node.children[index]
            else:
                new_node = PatriciaTrieNode(index, word)
                node.children[index] = new_node
                node = new_node
                return # Important: Stop after creating the new node

    def suggest_with_scores(self, prefix, top_n=3):
        syllables = self._tokenize(prefix)
        node = self.root

        for syllable in syllables:
            index = self._get_index(syllable)
            if index is None:
                return []

            if index in node.children:
                node = node.children[index]
            else:
                return []

        # Use a heap to efficiently get the top-N suggestions with scores
        heap = []
        self._get_words_with_scores_from_node(node, heap)  # Populate the heap

        # Get the top-N from the heap
        top_suggestions = heapq.nlargest(top_n, heap, key=lambda item: item[1])
        return top_suggestions

    def _get_words_with_scores_from_node(self, node, heap, current_score=0):
        if node.word:
            score = self.word_frequencies.get(node.word, 0)  # Default score of 0 if not found
            heapq.heappush(heap, (node.word, score))

        for child in node.children.values():
            self._get_words_with_scores_from_node(child, heap, current_score + 1) # Example score increment

    def _tokenize(self, word):
        # Your existing syllable breaking logic here...
        return self.myn_tokenizer.tokenize(word)
        
    def _get_or_create_index(self, syllable):
        if syllable in self.index_map:
            return self.index_map[syllable]

        self.syllable_index.append(syllable)
        index = len(self.syllable_index) - 1
        self.index_map[syllable] = index
        return index

    def _get_index(self, syllable):
        return self.index_map.get(syllable) #return None if not found


# Example usage:
test_freq = {"မြန်မာနိုင်ငံ": 100, "မြန်မာလူမျိုး": 50, "မြန်မာဘာသာ": 75, "မြန်မာစကား": 120}  # Replace with your actual frequencies

trie = PatriciaTrie(test_freq)
trie.insert("မြန်မာစာ")
trie.insert("မြန်မာလူမျိုး")
trie.insert("မြန်မာဘာသာစကား")
trie.insert("မြန်မာနိုင်ငံ")  # Add more words for testing

suggestions_with_scores = trie.suggest_with_scores("မြန်မာ", top_n=3)
print(suggestions_with_scores)  # Output: [('myanmar', 7), ('myaing', 6), ('myat', 4)] (Example scores)

suggestions_with_scores = trie.suggest_with_scores("မြန်မာ", top_n=2)
print(suggestions_with_scores)  # Output: [('myanmar', 7), ('myaing', 6)] (Example scores)

[('မြန်မာနိုင်ငံ', 100)]
[('မြန်မာနိုင်ငံ', 100)]


In [279]:
trigrams_ds = df_filtered.to_dict().get('count')

In [281]:
trie = PatriciaTrie(trigrams_ds)

In [284]:
for trigram in tqdm(trigrams_ds, total=len(trigrams_ds), desc="Inserting dictionary in Patricia Trie"):
    trie.insert(trigram)

Inserting dictionary in Patricia Trie:   0%|          | 0/262523 [00:00<?, ?it/s]

In [300]:
trie.suggest_with_scores("မြန်မာလ", top_n=4)

[('မြန်မာလူ', 783), ('မြန်မာလို', 160), ('မြန်မာလက်', 124), ('မြန်မာလ', 35)]