In [4]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

output_folder = "output_images" # Your existing output folder
labels_csv_path = os.path.join(output_folder, "labels.csv")

df = pd.read_csv(labels_csv_path, encoding='utf-8-sig')

# Keep only the image variants used for recognition training (currently all four).
df_rec = df[df['variant'].isin(['clean', 'blurred', 'noisy', 'noise_blur'])]

# Every distinct label string is treated as one class when stratifying.
num_unique_labels = len(df_rec['label'].unique())
print(f"Number of unique labels: {num_unique_labels}")

current_test_size_ratio = 0.15 # Your desired ratio
min_test_samples_needed = num_unique_labels # At least one sample per class
total_samples = len(df_rec)

# Fail early with a readable message instead of a ZeroDivisionError below.
if total_samples == 0:
    raise ValueError(f"No rows left in {labels_csv_path} after filtering by 'variant'; nothing to split.")

# Smallest ratio that still leaves room for one validation sample per label.
min_test_size_ratio_required = min_test_samples_needed / total_samples

# Defaults: the desired ratio, no stratification. They are only upgraded below
# when a stratified split is actually possible.
actual_test_size = current_test_size_ratio
stratify_labels = None

if num_unique_labels > 1: # Only try to stratify if there's more than one class
    stratified_test_size = max(current_test_size_ratio, min_test_size_ratio_required)

    # A stratified split needs at least two samples of every label (one per side)
    # and a training part that can still hold one sample of every label.
    smallest_class_size = df_rec['label'].value_counts(dropna=False).min()
    train_samples_left = total_samples - total_samples * stratified_test_size

    if smallest_class_size >= 2 and train_samples_left >= num_unique_labels:
        actual_test_size = stratified_test_size
        stratify_labels = df_rec['label']
    else:
        # Ways out: generate more lines per label, or accept an unstratified split.
        print("Warning: Dataset might be too small for effective stratification with current test_size. Disabling stratification.")
else: # If only one unique label, stratification is not applicable and will error
    print("Only one unique label found. Stratification disabled.")

# Single split call; random_state is fixed so the split is reproducible.
train_df, val_df = train_test_split(df_rec, test_size=actual_test_size, random_state=42, stratify=stratify_labels)


print(f"Total samples: {total_samples}")
print(f"Calculated test_size ratio for split: {actual_test_size:.2f}")
print(f"Train set size: {len(train_df)}, Val set size: {len(val_df)}")


# PaddleOCR-compatible label files: one "<image path><TAB><label>" entry per line.
train_rec_path = os.path.join(output_folder, "train_rec.txt")
val_rec_path = os.path.join(output_folder, "val_rec.txt")

# Training labels are written first, then validation labels, with the same layout.
for rec_path, split_df in ((train_rec_path, train_df), (val_rec_path, val_df)):
    with open(rec_path, 'w', encoding='utf-8') as rec_file:
        rec_file.writelines(
            f"{os.path.join('output_images', image_name)}\t{text}\n"
            for image_name, text in zip(split_df['filename'], split_df['label'])
        )

print(f"Generated {len(train_df)} training samples in {train_rec_path}")
print(f"Generated {len(val_df)} validation samples in {val_rec_path}")

# IMPORTANT: the character dictionary holds every distinct character that
# occurs in any selected label, sorted, one character per line.
dict_path = os.path.join(output_folder, "khmer_char_dict.txt")
char_set = sorted({ch for text in df_rec['label'] for ch in text})
with open(dict_path, 'w', encoding='utf-8') as dict_file:
    dict_file.writelines(ch + '\n' for ch in char_set)
print(f"Generated character dictionary with {len(char_set)} unique characters in {dict_path}")

Number of unique labels: 4372
Total samples: 17744
Calculated test_size ratio for split: 0.25
Train set size: 13372, Val set size: 4372
Generated 13372 training samples in output_images/train_rec.txt
Generated 4372 validation samples in output_images/val_rec.txt
Generated character dictionary with 168 unique characters in output_images/khmer_char_dict.txt


In [5]:
import os
import shutil

def arrange_images(list_file, target_subdir):
    """
    Move the images listed in a label file into 'output_images/<target_subdir>'.

    Only the text before the first TAB of each line is used (the image path);
    anything after it is ignored. Blank lines are reported as malformed.
    Paths that already point inside the target directory are skipped, so a
    list that already references the target folder is left alone.

    Note: the list file is only read, never rewritten, so after a successful
    run its paths still refer to the pre-move locations.

    Args:
        list_file (str): The path to the text file containing image paths and labels.
        target_subdir (str): The name of the subdirectory where images should be moved
                             (e.g., 'train', 'val').

    Returns:
        tuple: (moved_count, skipped_count, error_count) for the processed list.
    """

    target_dir = os.path.join('output_images', target_subdir)
    os.makedirs(target_dir, exist_ok=True)
    print(f"\nProcessing '{list_file}' and moving images to '{target_dir}'...")

    moved_count = 0
    skipped_count = 0
    error_count = 0

    with open(list_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            # str.split always returns at least one element, so a malformed
            # (blank) line shows up as an empty first field, not an empty list.
            original_path = line.strip().split('\t')[0]

            if not original_path:
                print(f"Warning (line {line_num}): Line format incorrect, skipping: '{line.strip()}'")
                skipped_count += 1
                continue

            # Check if the original_path is already in the target_dir structure
            # This helps prevent errors if the script is run multiple times
            if original_path.startswith(target_dir + os.sep):
                print(f"Skipping (already in target dir): '{original_path}'")
                skipped_count += 1
                continue

            filename = os.path.basename(original_path)
            new_path = os.path.join(target_dir, filename)

            try:
                if os.path.exists(original_path):
                    shutil.move(original_path, new_path)
                    moved_count += 1
                else:
                    print(f"Warning (line {line_num}): File not found at '{original_path}'. Skipping.")
                    skipped_count += 1
            except Exception as e:
                # Best effort: report the failure and keep going with the next line.
                print(f"Error (line {line_num}) moving '{original_path}': {e}")
                error_count += 1

    print(f"Finished processing '{list_file}'.")
    print(f"  Moved: {moved_count} files")
    print(f"  Skipped: {skipped_count} files (e.g., not found, already moved, or malformed lines)")
    print(f"  Errors: {error_count} files (due to unexpected issues)")

    return moved_count, skipped_count, error_count


# --- Main execution ---
if __name__ == "__main__":
    # Each label list is paired with the subfolder its images belong in;
    # the training set is processed first, then the validation set.
    for list_file, subdir in (
        ('output_images/train_rec.txt', 'train'),
        ('output_images/val_rec.txt', 'val'),
    ):
        arrange_images(list_file, subdir)

    print("\nAll file arrangements complete.")


Processing 'output_images/train_rec.txt' and moving images to 'output_images/train'...
Finished processing 'output_images/train_rec.txt'.
  Moved: 13372 files
  Skipped: 0 files (e.g., not found, already moved, or malformed lines)
  Errors: 0 files (due to unexpected issues)

Processing 'output_images/val_rec.txt' and moving images to 'output_images/val'...
Finished processing 'output_images/val_rec.txt'.
  Moved: 4372 files
  Skipped: 0 files (e.g., not found, already moved, or malformed lines)
  Errors: 0 files (due to unexpected issues)

All file arrangements complete.


In [8]:
import pandas as pd
import os
import shutil # For moving files
from sklearn.model_selection import train_test_split
import hashlib # For encryption (hashing)

# --- Configuration ---
# NOTE: base_project_dir is a machine-specific absolute path; adjust it when
# running this notebook elsewhere.
base_project_dir = "/media/thareah/New Volume1/Font-text-OCR/Test-version"
output_folder_name = "output_images" # This is the folder name, not the full path
labels_csv_file = "labels.csv" # Name of your CSV file

output_images_full_path = os.path.join(base_project_dir, output_folder_name)
labels_csv_path = os.path.join(output_images_full_path, labels_csv_file)

# Define the new train/val image subdirectories
train_images_dir = os.path.join(output_images_full_path, "train")
val_images_dir = os.path.join(output_images_full_path, "val")

# Ensure the new directories exist
os.makedirs(train_images_dir, exist_ok=True)
os.makedirs(val_images_dir, exist_ok=True)

# Define the paths for the PaddleOCR-compatible label files
train_rec_path = os.path.join(output_images_full_path, "train_rec.txt")
val_rec_path = os.path.join(output_images_full_path, "val_rec.txt")
dict_path = os.path.join(output_images_full_path, "khmer_char_dict.txt")

print(f"Reading labels from: {labels_csv_path}")
df = pd.read_csv(labels_csv_path, encoding='utf-8-sig')

# Filter for images to be used for recognition training (e.g., all variations)
df_rec = df[df['variant'].isin(['clean', 'blurred', 'noisy', 'noise_blur'])].copy() # Use .copy() to avoid SettingWithCopyWarning

# --- Stratified split logic ---
# Every distinct label string is treated as one class when stratifying.
num_unique_labels = len(df_rec['label'].unique())
print(f"Number of unique labels: {num_unique_labels}")

current_test_size_ratio = 0.15
total_samples = len(df_rec)

# Fail early with a readable message when the filter left nothing to split.
if total_samples == 0:
    raise ValueError(f"No rows left in {labels_csv_path} after filtering by 'variant'; nothing to split.")

# Defaults are set up front so that every branch below (including the
# single-label one) leaves actual_test_size defined for the summary print.
actual_test_size = current_test_size_ratio
stratify_labels = None

if num_unique_labels > 1:
    min_test_samples_needed = num_unique_labels # At least one sample per class for stratification
    min_test_size_ratio_required = min_test_samples_needed / total_samples

    stratified_test_size = max(current_test_size_ratio, min_test_size_ratio_required)

    # A stratified split needs at least two samples of every label (one per side)
    # and a training part that can still hold one sample of every label.
    smallest_class_size = df_rec['label'].value_counts(dropna=False).min()
    train_samples_left = total_samples - total_samples * stratified_test_size

    if smallest_class_size >= 2 and train_samples_left >= num_unique_labels:
        actual_test_size = stratified_test_size
        stratify_labels = df_rec['label']
    else:
        print("Warning: Dataset might be too small for effective stratification with current test_size. Disabling stratification.")
else:
    print("Only one unique label found. Stratification disabled.")

# Single split call; random_state is fixed so the split is reproducible.
train_df, val_df = train_test_split(df_rec, test_size=actual_test_size, random_state=42, stratify=stratify_labels)

print(f"Total samples: {total_samples}")
print(f"Calculated test_size ratio for split: {actual_test_size:.2f}")
print(f"Train set size: {len(train_df)}, Val set size: {len(val_df)}")


# --- Process and write labels, and move/rename images ---

def _move_and_label_split(split_df, source_dir, images_dir, label_subdir, rec_path, split_description, chars):
    """Move one split's images to hashed filenames and write its label file.

    For every row, the image '<source_dir>/<filename>' is moved to
    '<images_dir>/<sha256 of filename><original extension>' and a line
    '<label_subdir>/<hashed name><TAB><label>' is written to rec_path.

    Args:
        split_df: DataFrame with 'filename' and 'label' columns.
        source_dir (str): Folder the original images are read from.
        images_dir (str): Folder the renamed images are moved into.
        label_subdir (str): Path prefix written into the label file.
        rec_path (str): Label file to (over)write.
        split_description (str): Word used in the "not found" warning.
        chars (set): Updated in place with every character of the written labels.
    """
    with open(rec_path, 'w', encoding='utf-8') as f:
        for _, row in split_df.iterrows():
            original_filename = row['filename']
            original_image_path = os.path.join(source_dir, original_filename)

            # The hash is derived from the original filename only, so it is
            # stable across runs. Keep the source extension rather than forcing
            # ".png"; fall back to ".png" when the name has no extension.
            extension = os.path.splitext(original_filename)[1] or ".png"
            hashed_filename = hashlib.sha256(original_filename.encode('utf-8')).hexdigest() + extension

            # New destination for the image
            new_image_path = os.path.join(images_dir, hashed_filename)

            # Move and rename the image file
            try:
                shutil.move(original_image_path, new_image_path)
            except FileNotFoundError:
                # If the renamed file is already in place (an earlier run moved
                # it), keep its label entry; otherwise re-running this cell
                # would leave the freshly truncated label file empty.
                if not os.path.exists(new_image_path):
                    print(f"Warning: Original image not found for {split_description}: {original_image_path}. Skipping.")
                    continue # Skip this entry if the file doesn't exist

            # Path to write into the label file (relative to data_dir in config)
            label_file_image_path = os.path.join(label_subdir, hashed_filename)

            f.write(f"{label_file_image_path}\t{row['label']}\n")
            chars.update(row['label']) # Add characters to set


# Set to store unique characters for dictionary
char_set = set()

# Process training data
print(f"Writing {len(train_df)} training samples to {train_rec_path} and moving images...")
_move_and_label_split(train_df, output_images_full_path, train_images_dir, 'train', train_rec_path, "training", char_set)

# Process validation data
print(f"Writing {len(val_df)} validation samples to {val_rec_path} and moving images...")
_move_and_label_split(val_df, output_images_full_path, val_images_dir, 'val', val_rec_path, "validation", char_set)

# Generate character dictionary file (sorted, one character per line)
sorted_char_set = sorted(list(char_set))
with open(dict_path, 'w', encoding='utf-8') as f:
    for char in sorted_char_set:
        f.write(char + '\n')

print(f"Generated character dictionary with {len(sorted_char_set)} unique characters in {dict_path}")
print("Image files have been moved and renamed to encrypted filenames within 'train/' and 'val/' subdirectories.")
print("Label files (train_rec.txt, val_rec.txt) now contain paths relative to 'output_images/' (e.g., 'train/hashed_name.png').")

Reading labels from: /media/thareah/New Volume1/Font-text-OCR/Test-version/output_images/labels.csv
Number of unique labels: 418
Total samples: 1684
Calculated test_size ratio for split: 0.25
Train set size: 1266, Val set size: 418
Writing 1266 training samples to /media/thareah/New Volume1/Font-text-OCR/Test-version/output_images/train_rec.txt and moving images...
Writing 418 validation samples to /media/thareah/New Volume1/Font-text-OCR/Test-version/output_images/val_rec.txt and moving images...
Generated character dictionary with 132 unique characters in /media/thareah/New Volume1/Font-text-OCR/Test-version/output_images/khmer_char_dict.txt
Image files have been moved and renamed to encrypted filenames within 'train/' and 'val/' subdirectories.
Label files (train_rec.txt, val_rec.txt) now contain paths relative to 'output_images/' (e.g., 'train/hashed_name.png').
