In [None]:
from khmernltk import word_tokenize

# Sample Khmer paragraph used to try out the khmernltk word tokenizer.
khmer_text = "មកទល់បច្ចុប្បន្ន នៅក្នុងប្រទេសកម្ពុជាមានតំបន់សេដ្ឋកិច្ចពិសេសចំនួន៤៤ ដែលកំពុងជួយជំរុញប្រតិបត្តិការនាំចេញផលិតផលក្នុងស្រុកបានរហូតដល់ ២៣ភាគរយ ទៅ ២៥ភាគរយ នៃការនាំចេញសរុបរបស់កម្ពុជាទៅកាន់ទីផ្សារពិភពលោកចាប់ពីឆ្នាំ២៦២២ ដល់ឆ្នាំ២៦២៤។"

# Tokenize first, then print the resulting token list.
demo_tokens = word_tokenize(khmer_text, return_tokens=True)
print(demo_tokens)


| 2025-06-28 16:05:59,064 | [1;32mINFO[0m | khmer-nltk | Loaded model from /media/thareah/New Volume/Font-text-OCR/Test-version/env/lib/python3.12/site-packages/khmernltk/word_tokenize/sklearn_crf_ner_10000.sav |


['មក', 'ទល់', 'បច្ចុប្បន្ន', ' ', 'នៅក្នុង', 'ប្រទេស', 'កម្ពុជា', 'មាន', 'តំបន់', 'សេដ្ឋកិច្ច', 'ពិសេស', 'ចំនួន', '៤៤', ' ', 'ដែល', 'កំពុង', 'ជួយ', 'ជំរុញ', 'ប្រតិបត្តិការ', 'នាំចេញ', 'ផលិតផល', 'ក្នុង', 'ស្រុក', 'បាន', 'រហូតដល់', ' ', '២៣ភាគរយ', ' ', 'ទៅ', ' ', '២៥', 'ភាគរយ', ' ', 'នៃ', 'ការនាំចេញ', 'សរុប', 'របស់', 'កម្ពុជា', 'ទៅកាន់', 'ទីផ្សារ', 'ពិភពលោក', 'ចាប់ពី', 'ឆ្នាំ', '២៦២២', ' ', 'ដល់', 'ឆ្នាំ', '២៦២៤', '។']


In [9]:
import os
import cv2
import random
import pandas as pd
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from khmernltk import word_tokenize
import re # Import regex module for sanitization


# ========== CONFIG ==========
# Source paragraph that is tokenized and rendered into text-line images.
khmer_text = """
មកទល់បច្ចុប្បន្ន នៅក្នុងប្រទេសកម្ពុជាមានតំបន់សេដ្ឋកិច្ចពិសេសចំនួន៤៤ ដែលកំពុងជួយជំរុញប្រតិបត្តិការនាំចេញផលិតផលក្នុងស្រុកបានរហូតដល់ ២៣ភាគរយ ទៅ ២៥ភាគរយ នៃការនាំចេញសរុបរបស់កម្ពុជាទៅកាន់ទីផ្សារពិភពលោក។។
"""

# Directory that receives the generated images and the labels CSV.
output_folder = "output_images"
os.makedirs(output_folder, exist_ok=True)

font_path = "Font/khmerOS.ttf"  # Path to your Khmer font file
font_size = 32

image_size = (600, 80)  # Width, Height

# Seed both random sources used by this notebook (`random` picks the line
# lengths, NumPy draws the pixel noise) so that a fresh "Restart & Run All"
# regenerates exactly the same dataset.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)


# ========== WORD TOKENIZE WITH KHMERNLTK ==========
# Run the khmernltk tokenizer, then discard every token that is empty or
# consists only of whitespace.
raw_tokens = word_tokenize(khmer_text, return_tokens=True)
cleaned_tokens = list(filter(lambda tok: tok.strip(), raw_tokens))
print(f"Total cleaned tokens: {len(cleaned_tokens)}")


# ========== DYNAMIC SPLIT FUNCTION ==========
def split_khmer_paragraph_dynamic(tokens, min_words=3, max_words=8, rng=None):
    """Group consecutive tokens into space-joined lines of random length.

    Walks through ``tokens`` from left to right; for each line a size is drawn
    uniformly from ``[min_words, max_words]`` and that many tokens are joined
    with a single space. The final line simply takes whatever tokens remain,
    so it may contain fewer than ``min_words`` tokens.

    Args:
        tokens: Sequence of token strings.
        min_words: Smallest number of tokens per line. Values below 1 are
            treated as 1, because a size of zero (or less) would never advance
            through the token list.
        max_words: Largest number of tokens per line; must be at least 1 and
            not smaller than ``min_words``.
        rng: Optional object exposing ``randint(a, b)`` (for example a
            ``random.Random(seed)`` instance) for reproducible splits.
            Defaults to the module-level ``random`` generator.

    Returns:
        List of strings, one per generated line, in original token order.

    Raises:
        ValueError: If ``max_words`` is below 1 or below ``min_words``.
    """
    if max_words < 1 or max_words < min_words:
        raise ValueError(
            f"Invalid bounds: min_words={min_words}, max_words={max_words}; "
            "expected max_words >= 1 and max_words >= min_words."
        )

    picker = random if rng is None else rng
    lowest = max(1, min_words)  # guarantees forward progress on every draw

    chunks = []
    start = 0
    while start < len(tokens):
        group_size = picker.randint(lowest, max_words)
        chunks.append(' '.join(tokens[start:start + group_size]))
        start += group_size
    return chunks

# ========== SPLIT TO LINES USING DYNAMIC FUNCTION ==========
# Build the text lines from the cleaned tokens, asking for 3 to 5 tokens each.
lines = split_khmer_paragraph_dynamic(cleaned_tokens, 3, 5)
print(f"Generated {len(lines)} lines with dynamic lengths")

# ========== HELPER FUNCTION TO SANITIZE TEXT FOR FILENAME ==========
def sanitize_filename_text(text, max_len=50):
    """Turn a text line into a compact string usable inside a filename.

    Steps:
      1. Spaces become underscores.
      2. Every character that is neither a word character (``\\w``) nor in the
         Khmer Unicode block U+1780-U+17FF is removed.
      3. Runs of underscores collapse to one; leading/trailing underscores
         are stripped.
      4. If the result is longer than ``max_len`` characters it is cut to
         ``max_len``, any underscore left dangling at the cut is stripped, and
         ``"..."`` is appended to mark the truncation (so a truncated result
         can be up to ``max_len + 3`` characters long).

    Args:
        text: The text to sanitize.
        max_len: Maximum number of characters kept before the ellipsis.
            Negative values are treated as 0.

    Returns:
        The sanitized string; empty if ``text`` contains no allowed characters.
    """
    sanitized_text = text.replace(' ', '_')
    # The explicit Khmer range is kept alongside \w because Khmer dependent
    # vowels and signs are combining marks, which \w is not expected to match.
    # Underscore is already part of \w.
    sanitized_text = re.sub(r'[^\w\u1780-\u17FF]', '', sanitized_text)
    sanitized_text = re.sub(r'_+', '_', sanitized_text).strip('_')

    limit = max(max_len, 0)
    if len(sanitized_text) > limit:
        # Strip again after cutting: the cut may land right after a separator.
        sanitized_text = sanitized_text[:limit].rstrip('_') + "..."

    return sanitized_text

# ========== GENERATE IMAGES FUNCTION ==========
def _add_noise(pixels):
    """Add uniform random noise in [0, 50) to every channel of a uint8 image.

    cv2.add saturates at 255, so already-white pixels stay white and only
    darker pixels (the text strokes) are visibly lightened.
    """
    noise = np.random.randint(0, 50, pixels.shape, dtype='uint8')
    return cv2.add(pixels, noise)


def _blur(pixels):
    """Apply a 5x5 Gaussian blur with sigma 1."""
    return cv2.GaussianBlur(pixels, (5, 5), 1)


def generate_text_image(text, filename, variant="clean"):
    """Render ``text`` centered on a white canvas and save it as an image.

    Uses the module-level ``image_size``, ``font_path``, ``font_size`` and
    ``output_folder`` settings. The canvas size is fixed, so text wider than
    ``image_size[0]`` is clipped at the borders.

    NOTE(review): correct Khmer rendering needs complex text shaping, which
    Pillow only provides when built with libraqm — confirm with
    ``PIL.features.check("raqm")`` in the target environment.

    Args:
        text: The text line to draw (black on white).
        filename: File name (inside ``output_folder``) to write.
        variant: One of ``"clean"``, ``"blurred"``, ``"noisy"`` or
            ``"noise_blur"`` (noise first, then blur).

    Raises:
        ValueError: If ``variant`` is not one of the supported names
            (previously such a call silently wrote no file).
        OSError: If the font cannot be loaded or the image cannot be written.
    """
    known_variants = ("clean", "blurred", "noisy", "noise_blur")
    if variant not in known_variants:
        raise ValueError(
            f"Unknown variant {variant!r}; expected one of {known_variants}."
        )

    try:
        font = ImageFont.truetype(font_path, font_size)
    except OSError as err:
        # Keep the original cause attached instead of hiding it.
        raise OSError(
            f"Font file not found or unreadable: {font_path!r}. Check the font_path."
        ) from err

    img = Image.new('RGB', image_size, color=(255, 255, 255))
    draw = ImageDraw.Draw(img)

    # textbbox returns the ink box relative to the drawing origin; its left/top
    # are usually non-zero, so they must be subtracted for true centering.
    left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    x = (image_size[0] - (right - left)) / 2 - left
    y = (image_size[1] - (bottom - top)) / 2 - top

    draw.text((x, y), text, fill=(0, 0, 0), font=font)

    if variant != "clean":
        # Blur and noise act on each channel independently, so the RGB array
        # can be processed directly without a BGR round trip.
        pixels = np.array(img)
        if variant in ("noisy", "noise_blur"):
            pixels = _add_noise(pixels)
        if variant in ("blurred", "noise_blur"):
            pixels = _blur(pixels)
        img = Image.fromarray(pixels)

    # Always save through Pillow: it raises on failure, whereas cv2.imwrite
    # only returns False (and can fail on non-ASCII paths on some platforms).
    img.save(os.path.join(output_folder, filename))


# ========== LOOP TO GENERATE ==========
data = []
variants = ['clean', 'blurred', 'noisy', 'noise_blur']

for idx, line in enumerate(lines):
    # Sanitize the line text for the filename
    sanitized_line_text = sanitize_filename_text(line, max_len=40) # Adjust max_len as desired

    for variant in variants:
        # The line index makes every filename unique: the sanitized text alone
        # is truncated, so repeated or long similar lines would otherwise
        # overwrite each other's images and produce duplicate CSV rows.
        filename = f"khmerOS_{idx:04d}_{sanitized_line_text}_{variant}.png"
        generate_text_image(line, filename, variant)
        data.append({
            'filename': filename,
            'label': line,
            'variant': variant
        })

print("✅ Image generation completed!")


# ========== SAVE CSV ==========
# One row per generated image; utf-8-sig writes a BOM so spreadsheet tools
# detect the encoding of the Khmer labels.
df = pd.DataFrame(data)
df.to_csv(os.path.join(output_folder, "labels.csv"), index=False, encoding='utf-8-sig')
print("✅ CSV labels saved!")

Total cleaned tokens: 37
Generated 10 lines with dynamic lengths
✅ Image generation completed!
✅ CSV labels saved!


In [1]:
from paddleocr import PaddleOCR

ModuleNotFoundError: No module named 'paddleocr'