In [59]:
import re
import unicodedata


def normalize_duplicate_char(text):
    """Collapse elongated character runs such as "heyyyy" -> "hey".

    The input is coerced to ``str`` and NFC-normalized first, so a base
    letter and its combining marks are compared as a single code point
    wherever a precomposed form exists.  Any non-digit character that
    occurs four or more times in a row is then reduced to one occurrence;
    runs of three or fewer are left untouched.

    Digit runs are deliberately excluded: collapsing them would silently
    change numeric values ("10000" would become "10").

    Args:
        text: Value to clean; non-string values are converted with ``str()``.

    Returns:
        The cleaned string.
    """
    text = unicodedata.normalize("NFC", str(text))
    # [^\d\n] keeps the "any character except newline" behaviour of "."
    # while additionally refusing to treat digits as elongated characters.
    return re.sub(r"([^\d\n])\1{3,}", r"\1", text)

def convert_trieu(text):
    """Expand the shorthand "<number>tr" into "<number> triệu".

    The value is coerced to ``str`` and NFC-normalized before matching.
    A run of digits, optional whitespace, and a case-insensitive "tr" is
    rewritten only when the "tr" ends at a word boundary, so "5tr" and
    "15 TR" are expanded while "5tr5" and an already spelled-out
    "5triệu" are left unchanged.

    Args:
        text: Value to rewrite; non-string values are converted with ``str()``.

    Returns:
        The string with every matching shorthand expanded.
    """
    composed = unicodedata.normalize("NFC", str(text))
    # digits, optional spaces, "tr" in any case, then a word boundary; the
    # trailing lookahead additionally rejects a following "iệu".
    shorthand = re.compile(r"(\d+)\s*[tT][rR]\b(?!iệu)")
    return shorthand.sub(r"\1 triệu", composed)

def normalize_text_list(batch):
    """Normalize one batch of paired 'original' / 'normalized' texts.

    Each 'original' entry is passed through ``normalize_duplicate_char``.
    Each 'normalized' entry is passed through ``normalize_duplicate_char``
    and then ``convert_trieu``.  Entries are paired with ``zip``, so the
    output columns have the length of the shorter input column.

    Args:
        batch: Mapping with 'original' and 'normalized' keys, each holding
            a sequence of values.

    Returns:
        A dict with the same two keys holding the cleaned strings.
    """
    cleaned_pairs = [
        (
            normalize_duplicate_char(source),
            convert_trieu(normalize_duplicate_char(target)),
        )
        for source, target in zip(batch['original'], batch['normalized'])
    ]
    return {
        'original': [source for source, _ in cleaned_pairs],
        'normalized': [target for _, target in cleaned_pairs],
    }

In [7]:
from datasets import load_dataset

# Read train_data.csv with the CSV loader and expose it as the 'train' split.
dataset = load_dataset(
    "csv",
    data_files={"train": "train_data.csv"},
    quotechar='"',
)

Generating train split: 0 examples [00:00, ? examples/s]

In [61]:
# Run normalize_text_list over the dataset in batches. This rebinds
# `dataset`, so every later cell sees the normalized text, not the raw rows.
dataset = dataset.map(normalize_text_list, batched=True)

Map:   0%|          | 0/17224 [00:00<?, ? examples/s]

In [62]:
# Write the train split to normalized_data.csv.
# NOTE(review): the integer echoed under this cell is to_csv's return value,
# presumably the number of characters/bytes written — confirm.
dataset['train'].to_csv('normalized_data.csv')

Creating CSV from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

2022337

In [8]:
import re

def get_strings_with_repeats(batch, min_repeats=2):
    """Keep only the rows whose 'original' text contains a repeated character.

    A row is kept when some character (anything except a newline) appears
    at least ``min_repeats`` times in a row in ``str(original)``.  The
    matching 'normalized' entry is kept alongside it, so the two output
    columns stay aligned.

    Args:
        batch: Mapping with 'original' and 'normalized' keys, each holding
            a sequence of values.
        min_repeats: Minimum length of a same-character run that counts as
            a repeat.  The default of 2 flags any doubled character.

    Returns:
        A dict with 'original' and 'normalized' lists restricted to the
        matching rows (possibly empty).

    Raises:
        ValueError: If ``min_repeats`` is smaller than 2.
    """
    if min_repeats < 2:
        raise ValueError("min_repeats must be at least 2")

    # (.) captures one character and \1{k,} demands k or more further
    # copies, so a run of `min_repeats` needs k = min_repeats - 1.
    pattern = re.compile(r"(.)\1{%d,}" % (min_repeats - 1))

    kept = [
        (ori, nor)
        for ori, nor in zip(batch['original'], batch['normalized'])
        if pattern.search(str(ori))
    ]
    return {
        'original': [ori for ori, _ in kept],
        'normalized': [nor for _, nor in kept],
    }

# Filter via a batched map: get_strings_with_repeats returns only the rows
# it keeps, so new_dataset can have fewer rows than dataset.
new_dataset = dataset.map(get_strings_with_repeats, batched=True)



Map:   0%|          | 0/12455 [00:00<?, ? examples/s]

In [4]:
# Show the splits, column names, and row counts of the filtered dataset.
print(new_dataset)

DatasetDict({
    train: Dataset({
        features: ['original', 'normalized'],
        num_rows: 1335
    })
})


In [9]:
# Write the filtered train split to tmp.csv (presumably a scratch file for
# manual inspection — confirm before relying on it downstream).
new_dataset['train'].to_csv('tmp.csv')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

127743