In [18]:
from PIL import Image, ImageDraw, ImageFont, ImageFilter 
import numpy as np
import os
import random

# --- Configuration ---
# Folder that receives the rendered PNG files; created if it does not exist.
OUTPUT_DIR = "Gen-Image"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Khmer sample strings to render (every string is rendered once per font/variant).
khmer_texts = ["មាន តំបន់ សេដ្ឋកិច្ច ពិសេស"]

# Font files used for rendering. Each must be a font that supports Khmer;
# append further paths to render the same strings with more typefaces, e.g.
#   "Font/YourKhmerFont2.ttf", "Font/YourKhmerFont3.ttf", ...
font_paths = ["Font/khmerOS.ttf"]

# Canvas size in pixels.
IMAGE_WIDTH, IMAGE_HEIGHT = 500, 80

# --- Function to generate image with text and optional augmentations ---
def generate_text_image(text, font_path, font_size, add_noise=False, noise_ratio=0.001, add_blur=False, blur_radius=0.5):
    """Render ``text`` centred on a white IMAGE_WIDTH x IMAGE_HEIGHT RGB canvas.

    Args:
        text: String to draw (black on white).
        font_path: Path to a TrueType font file.
        font_size: Font size passed to ``ImageFont.truetype``.
        add_noise: When True, scatter small black specks over the image.
        noise_ratio: Number of specks as a fraction of the canvas pixel count.
        add_blur: When True, apply a Gaussian blur after drawing (and after
            the noise, if both are enabled).
        blur_radius: Radius handed to ``ImageFilter.GaussianBlur``.

    Returns:
        The rendered ``PIL.Image.Image``, or ``None`` when the font file
        could not be loaded (a message is printed in that case).
    """
    try:
        font = ImageFont.truetype(font_path, font_size)
    except IOError:
        print(f"Error: Could not load font from {font_path}. Skipping.")
        return None

    image = Image.new("RGB", (IMAGE_WIDTH, IMAGE_HEIGHT), "white")
    draw = ImageDraw.Draw(image)

    # "km" is the BCP 47 / ISO 639-1 language code for Khmer; the previously
    # used "kh" is the country code for Cambodia, not a language tag.
    # NOTE(review): Pillow documents `language` as requiring libraqm.
    text_kwargs = {"font": font, "language": "km"}

    # Measure the ink box at the origin; its left/top are usually non-zero,
    # so they are subtracted again to centre the visible glyphs.
    left, top, right, bottom = draw.textbbox((0, 0), text, **text_kwargs)
    x = (IMAGE_WIDTH - (right - left)) / 2 - left
    y = (IMAGE_HEIGHT - (bottom - top)) / 2 - top

    draw.text((x, y), text, fill="black", **text_kwargs)

    # Add ink-like noise (small black shapes to mimic printing imperfections)
    if add_noise:
        num_specks = int(IMAGE_WIDTH * IMAGE_HEIGHT * noise_ratio)
        for _ in range(num_specks):
            # Box corners are inclusive, so a speck covers size + 1 pixels
            # per side; pick the size first so the whole speck stays on canvas.
            size = random.randint(1, 3)
            x_noise = random.randint(0, IMAGE_WIDTH - 1 - size)
            y_noise = random.randint(0, IMAGE_HEIGHT - 1 - size)
            box = [x_noise, y_noise, x_noise + size, y_noise + size]
            # Randomly draw either a square or a round speck.
            draw_speck = random.choice([draw.rectangle, draw.ellipse])
            draw_speck(box, fill="black")

    # Add Gaussian blur (subtle)
    if add_blur:
        image = image.filter(ImageFilter.GaussianBlur(blur_radius))

    return image

# --- Generate Images ---
# Font size used for every rendered sample.
FONT_SIZE = 40

# (filename suffix, augmentation keyword arguments) for each variant,
# listed in the order the files are written.
variant_specs = [
    ("clean", {}),
    ("noisy", {"add_noise": True, "noise_ratio": 0.001}),
    ("blurred", {"add_blur": True, "blur_radius": 0.7}),
    ("noise_blur", {"add_noise": True, "noise_ratio": 0.001,
                    "add_blur": True, "blur_radius": 0.7}),
]

file_counter = 0
for font_path in font_paths:
    # splitext removes only the final extension, so a font file such as
    # "Noto.Sans.Khmer.ttf" keeps its full stem instead of collapsing to "Noto".
    font_name = os.path.splitext(os.path.basename(font_path))[0]

    for text_content in khmer_texts:
        # Replace spaces with underscores in text for filename
        text_for_filename = text_content.replace(" ", "_")

        for suffix, aug_kwargs in variant_specs:
            img = generate_text_image(text_content, font_path, FONT_SIZE, **aug_kwargs)
            if img is None:
                # Font could not be loaded; generate_text_image already reported it.
                continue
            img.save(os.path.join(OUTPUT_DIR, f"{font_name}_{text_for_filename}_{suffix}.png"))
            file_counter += 1

print(f"Generated {file_counter} images in '{OUTPUT_DIR}' folder.")

Generated 4 images in 'Gen-Image' folder.


In [5]:
import os
import cv2
import random
import pandas as pd
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from khmernltk import word_tokenize
import re

# ========== CONFIG ==========
# Folder scanned for .txt source files, and folder that receives the images
# and labels.csv.
text_input_folder = "text"
output_folder = "output_images"

# Create both folders up front so later reads/writes cannot fail on a
# missing directory.
for _folder in (output_folder, text_input_folder):
    os.makedirs(_folder, exist_ok=True)

# Candidate Khmer fonts (UPDATE THIS WITH YOUR ACTUAL FONT FILES);
# add more Khmer font paths to this list as needed.
font_paths = [
    "Font/khmerOS.ttf",
    "Font/Battambang-Regular.ttf",
    "Font/Moul-Regular.ttf",
]

# Inclusive (min, max) font size range, and canvas size as (width, height).
font_size_range = (28, 36)
image_size = (600, 80)

# ========== FUNCTION TO READ TEXT FROM MULTIPLE FILES ==========
def read_khmer_texts_from_folder(folder_path):
    """Concatenate the contents of every ``.txt`` file in ``folder_path``.

    Files are read as UTF-8 in sorted filename order, so the combined text
    (and everything derived from it) is the same on every run;
    ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        The file contents joined with newlines; an empty string when the
        folder holds no readable ``.txt`` files. Files that cannot be read
        or decoded are reported with ``print`` and skipped.
    """
    all_khmer_text = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                all_khmer_text.append(f.read())
        except (OSError, UnicodeError) as e:
            # Best effort: one unreadable or mis-encoded file must not abort the run.
            print(f"Error reading file {file_path}: {e}")
    return "\n".join(all_khmer_text)

# ========== Read all Khmer text from the specified folder ==========
khmer_text = read_khmer_texts_from_folder(text_input_folder)

if not khmer_text.strip():
    # Raise SystemExit rather than calling exit(): exit() is a helper meant for
    # the interactive shell, and inside a Jupyter kernel it asks the kernel to
    # shut down. SystemExit only stops this run and reports the reason.
    raise SystemExit(
        f"No Khmer text found in '{text_input_folder}'. Please add .txt files with content to this folder."
    )

# ========== WORD TOKENIZE WITH KHMERNLTK ==========
raw_tokens = word_tokenize(khmer_text, return_tokens=True)
# Drop tokens that are empty or whitespace-only.
cleaned_tokens = [t for t in raw_tokens if t.strip()]
print(f"Total cleaned tokens: {len(cleaned_tokens)}")

# ========== DYNAMIC SPLIT FUNCTION ==========
def split_khmer_paragraph_dynamic(tokens, min_words=3, max_words=8):
    """Group ``tokens`` into space-joined chunks of random length.

    Each chunk holds between ``min_words`` and ``max_words`` consecutive
    tokens (both inclusive, drawn with ``random.randint``); the final chunk
    may be shorter when the tokens run out.

    Args:
        tokens: Sequence of string tokens, in reading order.
        min_words: Smallest chunk size; must be at least 1.
        max_words: Largest chunk size; must be >= ``min_words``.

    Returns:
        List of strings, each the tokens of one chunk joined by a space.

    Raises:
        ValueError: If the size bounds are invalid. A chunk size of 0 would
            never advance through ``tokens`` and loop forever.
    """
    if min_words < 1 or max_words < min_words:
        raise ValueError(
            f"Require 1 <= min_words <= max_words, got min_words={min_words}, max_words={max_words}"
        )

    chunks = []
    i = 0
    while i < len(tokens):
        group_size = random.randint(min_words, max_words)
        # i < len(tokens) and group_size >= 1, so the slice is never empty.
        chunks.append(' '.join(tokens[i:i + group_size]))
        i += group_size
    return chunks

# ========== SPLIT TO LINES USING DYNAMIC FUNCTION ==========
# Every entry of `lines` is one text sample: 3 to 5 consecutive tokens joined by spaces.
lines = split_khmer_paragraph_dynamic(
    cleaned_tokens,
    min_words=3,
    max_words=5,
)
print(f"Generated {len(lines)} lines with dynamic lengths")

# ========== HELPER FUNCTION TO SANITIZE TEXT FOR FILENAME ==========
def sanitize_filename_text(text, max_len=50):
    """Turn ``text`` into a fragment that is safe to embed in a filename.

    Spaces become underscores, every character that is neither a word
    character nor in the Khmer block U+1780-U+17FF is removed, runs of
    underscores collapse to one, and leading/trailing underscores are
    stripped. A result longer than ``max_len`` characters is cut to
    ``max_len`` and suffixed with "...".
    """
    cleaned = re.sub(r'[^\w\u1780-\u17FF_]', '', text.replace(' ', '_'))
    cleaned = re.sub(r'_+', '_', cleaned).strip('_')

    if len(cleaned) <= max_len:
        return cleaned
    return cleaned[:max_len] + "..."

# ========== GENERATE IMAGES FUNCTION ==========
def generate_text_image(text, filename, variant="clean", font_path=None, font_size=None):
    """Render ``text`` centred on a white canvas and save it under ``output_folder``.

    Args:
        text: String to draw (black on white, canvas size ``image_size``).
        filename: File name (not path) of the image inside ``output_folder``.
        variant: One of "clean", "blurred", "noisy" or "noise_blur".
        font_path: Font file to use; a random entry of ``font_paths`` when None.
        font_size: Font size to use; a random value from ``font_size_range``
            (inclusive) when None.

    Returns:
        ``(font_path, font_size)`` actually used for rendering, which differs
        from the request only when the fallback font had to be loaded.

    Raises:
        ValueError: If ``variant`` is not one of the supported names
            (previously such a call silently wrote no file).
    """
    if variant not in ("clean", "blurred", "noisy", "noise_blur"):
        raise ValueError(f"Unknown variant: {variant!r}")

    img = Image.new('RGB', image_size, color=(255, 255, 255))
    draw = ImageDraw.Draw(img)

    # Use the caller's font/size when given, otherwise pick them at random.
    selected_font_path = font_path if font_path is not None else random.choice(font_paths)
    selected_font_size = (
        font_size if font_size is not None
        else random.randint(font_size_range[0], font_size_range[1])
    )

    try:
        font = ImageFont.truetype(selected_font_path, selected_font_size)
    except Exception as e:
        print(f"Error loading font {selected_font_path} with size {selected_font_size}: {e}")
        # Fallback to a default or raise if no fonts work
        if len(font_paths) > 0:
            selected_font_path, selected_font_size = font_paths[0], font_size_range[0]
            font = ImageFont.truetype(selected_font_path, selected_font_size)
        else:
            raise Exception("No valid font files found!")

    # The ink box measured at the origin usually has a non-zero left/top
    # offset; subtract it so the visible glyphs are what gets centred.
    left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    x = (image_size[0] - (right - left)) / 2 - left
    y = (image_size[1] - (bottom - top)) / 2 - top

    draw.text((x, y), text, fill=(0, 0, 0), font=font)

    img_path = os.path.join(output_folder, filename)

    if variant != "clean":
        # Noise and blur treat all channels alike, so the RGB array is
        # processed directly (no BGR round trip needed).
        pixels = np.array(img)
        if variant in ("noisy", "noise_blur"):
            # Signed noise in a wider dtype, then clip: a saturating uint8 add
            # of positive-only noise leaves the white (255) background unchanged.
            noise = np.random.randint(-50, 50, pixels.shape, dtype=np.int16)
            pixels = np.clip(pixels.astype(np.int16) + noise, 0, 255).astype(np.uint8)
        if variant in ("blurred", "noise_blur"):
            pixels = cv2.GaussianBlur(pixels, (5, 5), 1)
        img = Image.fromarray(pixels)

    # Save every variant through PIL; the original ignored cv2.imwrite's
    # return value, so a failed write went unnoticed.
    img.save(img_path)
    return selected_font_path, selected_font_size


# ========== LOOP TO GENERATE ==========
data = []
variants = ['clean', 'blurred', 'noisy', 'noise_blur']

for idx, line in enumerate(lines):
    sanitized_line_text = sanitize_filename_text(line, max_len=40)

    for variant in variants:
        # Choose the font here (still independently per image) so the filename
        # and CSV name the font really used instead of a fixed "khmerOS".
        chosen_font_path = random.choice(font_paths)
        chosen_font_size = random.randint(font_size_range[0], font_size_range[1])
        font_name = os.path.splitext(os.path.basename(chosen_font_path))[0]

        # idx keeps filenames unique even when two lines sanitize to the same text.
        filename = f"{font_name}_{sanitized_line_text}_{variant}_{idx}.png"
        used_font_path, used_font_size = generate_text_image(
            line, filename, variant,
            font_path=chosen_font_path, font_size=chosen_font_size,
        )
        data.append({
            'filename': filename,
            'label': line,
            'variant': variant,
            'font': os.path.basename(used_font_path),
            'font_size': used_font_size,
        })

print("✅ Image generation completed!")


# ========== SAVE CSV ==========
df = pd.DataFrame(data)
df.to_csv(os.path.join(output_folder, "labels.csv"), index=False, encoding='utf-8-sig')
print("✅ CSV labels saved!")

Total cleaned tokens: 1672
Generated 421 lines with dynamic lengths
✅ Image generation completed!
✅ CSV labels saved!
