In [1]:
import re
import string
import unicodedata
# Make sure underthesea is installed: pip install underthesea
from underthesea import word_tokenize
import pandas as pd

# --- Dictionaries and Data for Vietnamese Character Processing ---
bang_nguyen_am = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'],
                  ['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'],
                  ['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'],
                  ['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'],
                  ['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'],
                  ['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'],
                  ['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'],
                  ['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'],
                  ['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'],
                  ['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'],
                  ['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'],
                  ['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']]
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j']

nguyen_am_to_ids = {}
for i in range(len(bang_nguyen_am)):
    for j in range(len(bang_nguyen_am[i]) - 1):
        nguyen_am_to_ids[bang_nguyen_am[i][j]] = (i, j)

def loaddicchar():
    dic = {}
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split('|')
    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split('|')
    for i in range(len(char1252)):
        dic[char1252[i]] = charutf8[i]
    return dic
dicchar = loaddicchar()

# --- Load Teencode Data ---
# Path to the teencode file.
_teencode_file_path = './teencode.txt' # As specified in prompt context
try:
    teencode_df = pd.read_csv(_teencode_file_path, names=['teencode', 'map'], sep='\t', header=None)
    teencode_map_default = pd.Series(teencode_df['map'].values, index=teencode_df['teencode']).to_dict()
except FileNotFoundError:
    print(f"Warning: Teencode file not found at {_teencode_file_path}. Teencode replacement will be limited.")
    teencode_map_default = {}
except Exception as e:
    print(f"Warning: Error loading teencode file '{_teencode_file_path}': {e}. Teencode replacement will be limited.")
    teencode_map_default = {}


# --- Load Stopwords Data ---
_stopwords_file_path = '/data/elo/khanglg/FreeTxt-Flask/vietnamese-stopwords.txt' # Path from existing code
try:
    with open(_stopwords_file_path, 'r', encoding='utf-8') as f:
        stopwords_list_default = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    print(f"Warning: Stopwords file not found at {_stopwords_file_path}. Stopword removal will be limited.")
    stopwords_list_default = []
except Exception as e:
    print(f"Warning: Error loading stopwords file '{_stopwords_file_path}': {e}. Stopword removal will be limited.")
    stopwords_list_default = []


# Emoji pattern from the user's snippet (more comprehensive)
emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # Emoticons
    u"\U0001F300-\U0001F5FF"  # Symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # Transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # Flags
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    "]+", flags=re.UNICODE)

# --- Helper Functions ---

def convert_unicode_legacy(txt): # Renamed to avoid clash if user defines convert_unicode
    return re.sub(
        r'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ',
        lambda x: dicchar[x.group()], txt)

def text_unicode_normalize(text): # From user's snippet
    return unicodedata.normalize('NFC', text)

def is_valid_vietnam_word(word):
    chars = list(word)
    nguyen_am_index = -1
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x != -1:
            if nguyen_am_index == -1:
                nguyen_am_index = index
            else:
                if index - nguyen_am_index != 1:
                    return False
                nguyen_am_index = index
    return True

def chuan_hoa_dau_tu_tieng_viet(word):
    if not is_valid_vietnam_word(word):
        return word
    chars = list(word)
    dau_cau = 0
    nguyen_am_index = []
    qu_or_gi = False
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x == -1: continue
        if x == 9: # u
            if index > 0 and chars[index - 1].lower() == 'q':
                chars[index] = 'u'; qu_or_gi = True
        elif x == 5: # i
            if index > 0 and chars[index - 1].lower() == 'g':
                chars[index] = 'i'; qu_or_gi = True
        if y != 0:
            dau_cau = y; chars[index] = bang_nguyen_am[x][0]
        if not qu_or_gi or index != 1: # Fix: check qu_or_gi correctly
            nguyen_am_index.append(index)

    if not nguyen_am_index: return "".join(chars)

    # Determine which vowel to place the tone mark on
    idx_to_mark = nguyen_am_index[0] # Default to the first vowel in the group
    if len(nguyen_am_index) >= 2:
        # Priority for ê, ơ, ô
        priority_vowel_found = False
        for idx_candidate in nguyen_am_index:
            x_vowel, _ = nguyen_am_to_ids.get(chars[idx_candidate], (-1,-1))
            if x_vowel in [4, 7, 8]: # ê, ô, ơ
                idx_to_mark = idx_candidate
                priority_vowel_found = True
                break
        
        if not priority_vowel_found:
            # Rules for diphthongs/triphthongs (simplified from original logic)
            # If the vowel group is at the end of the word
            if nguyen_am_index[-1] == len(chars) -1:
                # If ends with i, u, y (closed vowels/semivowels), mark the vowel before it
                x_last_vowel, _ = nguyen_am_to_ids.get(chars[nguyen_am_index[-1]], (-1,-1))
                if x_last_vowel in [5, 9, 10, 11]: # i, u, ư, y
                     idx_to_mark = nguyen_am_index[-2] if len(nguyen_am_index) > 1 else nguyen_am_index[-1]
                else: # Otherwise, mark the first vowel of the group (e.g., 'oa', 'oe')
                    idx_to_mark = nguyen_am_index[0]
            else: # Vowel group is followed by consonants (e.g., 'uyen', 'oan')
                if len(nguyen_am_index) == 3: # Triphthongs like 'uye', 'oai' -> mark middle
                    idx_to_mark = nguyen_am_index[1]
                elif len(nguyen_am_index) == 2: # Diphthongs like 'uyê', 'oa' -> mark second
                    idx_to_mark = nguyen_am_index[1]
                # else (single vowel before consonant), default (first vowel) is fine

    # Apply the tone mark
    x_target_vowel, _ = nguyen_am_to_ids.get(chars[idx_to_mark], (-1,-1))
    if x_target_vowel != -1 and dau_cau != 0:
        chars[idx_to_mark] = bang_nguyen_am[x_target_vowel][dau_cau]
    return "".join(chars)


def chuan_hoa_dau_cau_tieng_viet(sentence):
    words = sentence.split()
    for index, word in enumerate(words):
        # Preserve surrounding punctuation by splitting and processing only the word part
        match = re.match(r'(^[\W_]*)([\wÀ-Ỹà-ỹ._]*[\wÀ-Ỹà-ỹ]+)([\W_]*$)', word)
        if match:
            prefix, core_word, suffix = match.groups()
            normalized_core_word = chuan_hoa_dau_tu_tieng_viet(core_word)
            words[index] = prefix + normalized_core_word + suffix
        else: # If word doesn't match (e.g. pure punctuation or malformed), try to normalize if it's a simple word
            words[index] = chuan_hoa_dau_tu_tieng_viet(word) 
    return " ".join(words)

# --- Main Preprocessing Function ---
def preprocess_vietnamese_text_underthesea(
    text,
    custom_teencode_map=None,
    custom_stopwords_list=None,
    use_teencode=True,
    use_stopwords=False,
    remove_all_punctuation=False
    ):
    """
    Comprehensive Vietnamese text preprocessing using underthesea.
    Args:
        text (str): Input Vietnamese text.
        custom_teencode_map (dict, optional): Custom teencode mapping.
                                              Defaults to loaded teencode_map_default.
        custom_stopwords_list (list, optional): Custom list of stopwords.
                                                Defaults to loaded stopwords_list_default.
        use_teencode (bool): Whether to perform teencode replacement.
        use_stopwords (bool): Whether to perform stopword removal.
        remove_all_punctuation (bool): If True, removes all punctuation. Defaults to False.
    Returns:
        str: Processed text.
    """
    if not isinstance(text, str):
        text = str(text)

    # Use custom maps/lists if provided, otherwise use the loaded defaults
    current_teencode_map = custom_teencode_map if custom_teencode_map is not None else teencode_map_default
    current_stopwords_list = custom_stopwords_list if custom_stopwords_list is not None else stopwords_list_default

    # 1. Lowercase
    processed_text = text.lower()

    # 2. Remove URLs, mentions, hashtags
    processed_text = re.sub(r"http\S+|www\S+|@\S+|#\S+", "", processed_text)

    # 3. Legacy Unicode conversion (e.g., Windows-1258 to Unicode)
    processed_text = convert_unicode_legacy(processed_text)

    # 4. Standard Unicode Normalization (NFC)
    processed_text = text_unicode_normalize(processed_text)

    # 5. Remove Emojis
    processed_text = re.sub(emoji_pattern, " ", processed_text) # Replace with space to avoid merging words

    # 6. Reduce repeated alphabetic characters (e.g., "chàoooo" -> "chào")
    # This regex handles Vietnamese characters correctly.
    processed_text = re.sub(r'([a-zàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ])\1+', r'\1', processed_text)

    # 7. Reduce repeated special characters (non-alphanumeric, non-whitespace)
    # This regex handles Vietnamese characters by excluding them from "special characters".
    processed_text = re.sub(r'([^a-z0-9àáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ\s])\1+', r'\1', processed_text)

    # 8. Normalize punctuation spacing
    # Ensure space around punctuation that is between words or at the end/start of a word.
    # This helps the tokenizer.
    escaped_punctuation = re.escape(string.punctuation)
    processed_text = re.sub(r"(\w)\s*([" + escaped_punctuation + r"])\s*(\w)", r"\1 \2 \3", processed_text)
    processed_text = re.sub(r"(\w)\s*([" + escaped_punctuation + r"])", r"\1 \2", processed_text) # Word followed by punctuation
    processed_text = re.sub(r"([" + escaped_punctuation + r"])\s*(\w)", r"\1 \2", processed_text) # Punctuation followed by word


    # 9. Reduce repeated punctuation characters (e.g., "!!!" to "!")
    # This should be done AFTER spacing normalization if we want "!!! " -> "! "
    # The previous regex for repeated special chars might have handled some of this,
    # but this is more specific to string.punctuation.
    processed_text = re.sub(r"([" + escaped_punctuation + r"])\1+", r"\1", processed_text)


    # 10. Vietnamese tone mark normalization
    processed_text = chuan_hoa_dau_cau_tieng_viet(processed_text)

    # 11. Remove all punctuation (optional)
    if remove_all_punctuation:
        # Create a translation table that maps all punctuation to None
        translator = str.maketrans('', '', string.punctuation)
        processed_text = processed_text.translate(translator)

    # 12. Final whitespace cleanup (multiple spaces to single, strip ends)
    processed_text = re.sub(r'\s+', ' ', processed_text).strip()
    
    # 13. Strip leading/trailing punctuation or space robustly (if not all removed)
    if not remove_all_punctuation and processed_text: # Check if processed_text is not empty
        # Characters to strip: all punctuation and whitespace
        strip_chars = string.punctuation + string.whitespace
        # Strip from the end
        while processed_text and processed_text[-1] in strip_chars:
            processed_text = processed_text[:-1]
        # Strip from the beginning
        while processed_text and processed_text[0] in strip_chars:
            processed_text = processed_text[1:]
    
    # If all processing results in an empty string, return it.
    if not processed_text:
        return ""

    # 14. Tokenization using underthesea
    tokens = word_tokenize(processed_text, format="list") # Get list of tokens

    # 15. Teencode Replacement (on tokens)
    if use_teencode and current_teencode_map:
        new_tokens = []
        for token in tokens:
            replacement = current_teencode_map.get(token, token)
            # If teencode replacement results in multiple words, split them.
            # This requires re-tokenization or careful splitting.
            # For simplicity, we assume teencode maps to single words or phrases
            # that underthesea can handle if joined and re-tokenized.
            new_tokens.append(replacement)
        
        # If teencode replacement might introduce multi-word strings, re-tokenize.
        # This ensures that phrases from teencode are properly tokenized.
        if any(" " in t for t in new_tokens): # Check if any token now contains a space
            temp_token_string = " ".join(new_tokens)
            tokens = word_tokenize(temp_token_string, format="list")
        else:
            tokens = new_tokens


    # 16. Stopword Removal (on tokens)
    if use_stopwords and current_stopwords_list:
        tokens = [token for token in tokens if token not in current_stopwords_list and token.strip()]

    # 17. Join tokens to form the final processed string
    return " ".join(tokens)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Example Usage ---

sample_text_1 = "Xiiiiiiin chàooooo, bạn khoẻee khôngggg ??? Tui là   Hùng nè... ghé thăm web https://example.com chơi nhaaa :)))) 😂😂😂 #VuiVe"
sample_text_2 = "Trời ơi CLM, đC k hÙng? Mày bị đIên à????"
sample_text_3 = "Shop bán đồ đắt qúa đi, hok có tiền mua đâuuuu :("
sample_text_4 = "Tuyệt vời!!! sản phẩm này rất tốtttt. highly recommend nha mn. k chê vào đâu đc. @user123"
sample_text_5 = "cuộc sống bất công biết mấy cho vừai định nghĩa thế gian này muôn mặt"
sample_text_6 = "xạo choá quá anh" # from notebook (accent issue) -> xạo chóa quá anh
sample_text_7 = "anh hoà, đang làm.. gì" # from notebook (accent issue) -> anh hòa, đang làm. gì
sample_text_8 = "Đi đâu đó bạn ơi????? kkkk :D :D"

print(f"Original: '{sample_text_1}'")
processed_1 = preprocess_vietnamese_text_underthesea(sample_text_1)
print(f"Processed: '{processed_1}'\n")

print(f"Original: '{sample_text_2}'")
processed_2 = preprocess_vietnamese_text_underthesea(sample_text_2)
print(f"Processed: '{processed_2}'\n")

print(f"Original: '{sample_text_3}'")
processed_3 = preprocess_vietnamese_text_underthesea(sample_text_3, use_stopwords=False)
print(f"Processed (no stopwords): '{processed_3}'\n")

print(f"Original: '{sample_text_4}'")
processed_4 = preprocess_vietnamese_text_underthesea(sample_text_4, remove_all_punctuation=True)
print(f"Processed (all punct removed): '{processed_4}'\n")

print(f"Original: '{sample_text_5}'")
processed_5 = preprocess_vietnamese_text_underthesea(sample_text_5)
print(f"Processed: '{processed_5}'\n")

print(f"Original: '{sample_text_6}'")
processed_6 = preprocess_vietnamese_text_underthesea(sample_text_6)
print(f"Processed: '{processed_6}'\n")

print(f"Original: '{sample_text_7}'")
processed_7 = preprocess_vietnamese_text_underthesea(sample_text_7)
print(f"Processed: '{processed_7}'\n")

print(f"Original: '{sample_text_8}'")
processed_8 = preprocess_vietnamese_text_underthesea(sample_text_8)
print(f"Processed: '{processed_8}'\n")

print(f"Original (no teencode/stopwords): '{sample_text_1}'")
processed_9 = preprocess_vietnamese_text_underthesea(sample_text_1, use_teencode=False, use_stopwords=False)
print(f"Processed (no teencode/stopwords): '{processed_9}'\n")

Original: 'Xiiiiiiin chàooooo, bạn khoẻee khôngggg ??? Tui là   Hùng nè... ghé thăm web https://example.com chơi nhaaa :)))) 😂😂😂 #VuiVe'
Processed: 'xin chào , bạn khỏee không ? tui là hùng nè . ghé thăm web chơi nha'

Original: 'Trời ơi CLM, đC k hÙng? Mày bị đIên à????'
Processed: 'trời ơi cái lồn má , được k hùng ? mày bị điên à'

Original: 'Shop bán đồ đắt qúa đi, hok có tiền mua đâuuuu :('
Processed (no stopwords): 'shop bán đồ đắt quá đi , không có tiền mua đâu'

Original: 'Tuyệt vời!!! sản phẩm này rất tốtttt. highly recommend nha mn. k chê vào đâu đc. @user123'
Processed (all punct removed): 'tuyệt vời sản phẩm này rất tốt highly recomend nha mọi người k chê vào đâu được'

Original: 'cuộc sống bất công biết mấy cho vừai định nghĩa thế gian này muôn mặt'
Processed: 'cuộc sống bất công biết mấy cho vưài định nghĩa thế gian này muôn mặt'

Original: 'xạo choá quá anh'
Processed: 'xạo chóa quá anh'

Original: 'anh hoà, đang làm.. gì'
Processed: 'anh hòa , đang làm . gi'

Original: '

In [3]:
sample_vi_en = "Hello World quá là đẹp lunnn á"
print(f"Original: '{sample_vi_en}'")
processed_vi_en = preprocess_vietnamese_text_underthesea(sample_vi_en)
print(f"Processed: '{processed_vi_en}'\n")


Original: 'Hello World quá là đẹp lunnn á'
Processed: 'helo world quá là đẹp lun á'

