In [None]:
# Mongolian OCR Training on Google Colab
# Run each cell in order by clicking the play button or pressing Shift+Enter

# ===== CELL 1: Install Dependencies =====
print("Installing Kraken and dependencies...")
!pip install -q kraken pillow


In [None]:

# ===== CELL 2: Mount Google Drive =====
# Attach Google Drive at /content/drive, then tell the user where to look.
from google.colab import drive

drive.mount('/content/drive')
print(
    "\nGoogle Drive mounted!",
    "Your files should be in /content/drive/MyDrive/",
    sep="\n",
)



In [None]:
# ===== CELL 3: Upload your files =====
# Prints the two ways of getting the dataset into this session:
#   OPTION A - upload directly to Colab (faster, for this session only)
#   OPTION B - copy from Google Drive (better for repeated use)
print(
    "\nOption A: Upload files directly to Colab",
    "Run the next cell to upload a zip file of your project",
    "\nOption B: Use files from Google Drive",
    "1. Upload your 'synthetic_mongolian' folder to Google Drive",
    "2. Update the path in Cell 5 to point to your Drive folder",
    sep="\n",
)



In [None]:
# ===== CELL 4: Upload ZIP file (if using Option A) =====
from google.colab import files
import zipfile
import os

print("Upload your project ZIP file (should contain synthetic_mongolian folder)")
uploaded = files.upload()

# Extract every uploaded archive into /content/. A file that is not a valid ZIP
# is reported and skipped, so one wrong upload does not abort the whole cell
# with zipfile.BadZipFile.
for filename in uploaded.keys():
    if not zipfile.is_zipfile(filename):
        print(f"Skipping {filename}: not a valid ZIP archive")
        continue
    print(f"Extracting {filename}...")
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall('/content/')
    print("Extracted to /content/")



In [None]:
# ===== CELL 5: Verify files =====
import glob
import os

# Check that the dataset exists and that every image has a ground-truth file.
os.chdir('/content')
images = glob.glob('/content/drive/MyDrive/synthetic_mongolian/images/*.png')
gt_files = glob.glob('/content/drive/MyDrive/synthetic_mongolian/images/*.gt.txt')

print(f"Found {len(images)} PNG images")
print(f"Found {len(gt_files)} ground truth files")

# Pair files by name (<name>.png <-> <name>.gt.txt) rather than by count:
# equal counts do not prove that each image has its own transcription.
image_stems = {path[:-len('.png')] for path in images}
gt_stems = {path[:-len('.gt.txt')] for path in gt_files}
images_without_gt = sorted(image_stems - gt_stems)
gt_without_image = sorted(gt_stems - image_stems)

if not images:
    # 0 images and 0 ground-truth files must not be reported as success.
    print("WARNING: No PNG images found - check the dataset path above!")
elif images_without_gt or gt_without_image:
    print("WARNING: Images and ground truth files don't match!")
    print(f"  Images without ground truth: {len(images_without_gt)}")
    print(f"  Ground truth files without image: {len(gt_without_image)}")
    for stem in (images_without_gt + gt_without_image)[:5]:
        print(f"  e.g. {stem}")
else:
    print("✓ All files present and matched!")



In [None]:
# ===== DIAGNOSTIC CELL: Find your files =====
# Lists the usual locations and counts PNG files under several candidate paths
# to help locate the dataset.
import os
import glob
import subprocess


def show_command(command):
    """Run a shell command and print what it wrote to stdout and stderr.

    os.system writes directly to the kernel process's output streams, which a
    notebook front end does not always display; capturing the output and
    printing it makes the result appear in the cell output.
    """
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    print(result.stdout, end="")
    print(result.stderr, end="")


print("=" * 60)
print("SEARCHING FOR YOUR FILES...")
print("=" * 60)

print("\n1. Contents of /content/:")
show_command('ls -la /content/')

print("\n2. Contents of /content/drive/MyDrive/ (first 20 items):")
show_command('ls -la /content/drive/MyDrive/ | head -20')

print("\n3. Searching for 'synthetic_mongolian' folder:")
show_command('find /content -name "synthetic_mongolian" -type d 2>/dev/null')
show_command('find /content/drive/MyDrive -name "synthetic_mongolian" -type d 2>/dev/null')

print("\n4. Searching for PNG files:")
patterns = [
    'synthetic_mongolian/images/*.png',
    '/content/synthetic_mongolian/images/*.png',
    '/content/drive/MyDrive/synthetic_mongolian/images/*.png',
    '/content/drive/MyDrive/Gemini/synthetic_mongolian/images/*.png',
]

for pattern in patterns:
    # Named `matches`, not `files`, so the `files` module imported from
    # google.colab in the upload cell is not overwritten by a list.
    matches = glob.glob(pattern)
    print(f"   {pattern}")
    print(f"   → Found {len(matches)} files")

print("\n5. What's in MyDrive root?")
show_command('ls /content/drive/MyDrive/')

print("=" * 60)

In [None]:
# # NEW CELL: Resize all images before training
# import os
# from PIL import Image
# import glob
# from tqdm import tqdm

# print("Starting image resize operation...")

# # Paths
# original_dir = '/content/drive/MyDrive/synthetic_mongolian'
# temp_new_dir = '/content/drive/MyDrive/synthetic_mongolian_resized'
# backup_dir = '/content/drive/MyDrive/synthetic_mongolian_large_images'

# # Create new directory
# os.makedirs(temp_new_dir, exist_ok=True)
# os.makedirs(os.path.join(temp_new_dir, 'images'), exist_ok=True)

# # Target dimensions
# TARGET_WIDTH = 48
# TARGET_HEIGHT = 120

# # Get all images
# image_files = sorted(glob.glob(os.path.join(original_dir, 'images', '*.png')))
# gt_files = sorted(glob.glob(os.path.join(original_dir, 'images', '*.gt.txt')))

# print(f"Found {len(image_files)} images to resize")
# print(f"Target size: {TARGET_WIDTH}x{TARGET_HEIGHT} pixels")
# print("This will take about 10-15 minutes...\n")

# # Resize images
# for img_path in tqdm(image_files, desc="Resizing images"):
#     # Load image
#     img = Image.open(img_path).convert('L')  # Grayscale

#     # Resize maintaining aspect ratio
#     original_w, original_h = img.size

#     # Calculate scaling to fit within target dimensions
#     scale_w = TARGET_WIDTH / original_w
#     scale_h = TARGET_HEIGHT / original_h
#     scale = min(scale_w, scale_h)

#     new_w = int(original_w * scale)
#     new_h = int(original_h * scale)

#     # Resize
#     img_resized = img.resize((new_w, new_h), Image.LANCZOS)

#     # Create canvas with target dimensions (white background)
#     canvas = Image.new('L', (TARGET_WIDTH, TARGET_HEIGHT), 255)

#     # Paste resized image centered on canvas
#     paste_x = (TARGET_WIDTH - new_w) // 2
#     paste_y = (TARGET_HEIGHT - new_h) // 2
#     canvas.paste(img_resized, (paste_x, paste_y))

#     # Save
#     filename = os.path.basename(img_path)
#     canvas.save(os.path.join(temp_new_dir, 'images', filename))

# print("\nCopying ground truth files...")
# # Copy .gt.txt files
# for gt_path in tqdm(gt_files, desc="Copying GT files"):
#     filename = os.path.basename(gt_path)
#     import shutil
#     shutil.copy(gt_path, os.path.join(temp_new_dir, 'images', filename))

# print("\nRenaming directories...")
# # Rename original to backup
# os.rename(original_dir, backup_dir)
# print(f"✓ Renamed {original_dir} → {backup_dir}")

# # Rename new to original
# os.rename(temp_new_dir, original_dir)
# print(f"✓ Renamed {temp_new_dir} → {original_dir}")

# print("\n" + "="*60)
# print("RESIZE COMPLETE!")
# print("="*60)
# print(f"Original large images backed up to: {backup_dir}")
# print(f"Resized images now at: {original_dir}")
# print(f"All {len(image_files)} images resized to {TARGET_WIDTH}×{TARGET_HEIGHT}")
# print("\nYou can now run Cell 6 and Cell 7 with the resized images!")
# print("Training should be ~25x faster!")

In [None]:
# CELL 6 — Batchwise OCR training for Mongolian (Kraken)

import os
import glob
import traceback
from kraken.lib.train import RecognitionModel, KrakenTrainer

def train_mongolian_model(
    data_path='/content/drive/MyDrive/synthetic_mongolian',
    total_epochs=50,
    mini_batch_size=32,
    learning_rate=0.001,
    early_stopping_patience=10,
    images_per_batch=2500
):

    # logging
    log_path = os.path.join(data_path, 'training_log.txt')
    log_file = open(log_path, 'w')

    def log(msg):
        print(msg)
        log_file.write(msg + '\n')
        log_file.flush()

    try:
        log("="*60)
        log("TRAINING with EPOCH-BY-EPOCH SAVING")
        log("="*60)

        # locate all images directly in Drive
        img_pattern = os.path.join(data_path, 'images', '*.png')
        all_imgs = sorted(glob.glob(img_pattern))
        if not all_imgs:
            log(f"ERROR: no images found at {img_pattern}")
            return None

        total = len(all_imgs)
        log(f"Found {total} total training images")

        # chunk image list into batches
        batches = [all_imgs[i:i+images_per_batch] for i in range(0, total, images_per_batch)]
        log(f"Processing {len(batches)} batches of up to {images_per_batch} images each")

        # Define checkpoint directory
        checkpoints_dir = os.path.join(data_path, 'checkpoints')
        os.makedirs(checkpoints_dir, exist_ok=True)
        log(f"Checkpoints will be saved to: {checkpoints_dir}")

        # Initial model (start fresh or load latest)
        model = None
        latest_checkpoint = sorted(glob.glob(os.path.join(checkpoints_dir, 'mongolian_model_epoch_*.mlmodel')))
        if latest_checkpoint:
            model_path = latest_checkpoint[-1]
            log(f"Loading latest checkpoint: {model_path}")
            model = RecognitionModel(file=model_path)
        else:
            log("Initializing new model.")

        # training loop across epochs
        for epoch in range(1, total_epochs + 1):
            log(f"\n===== EPOCH {epoch}/{total_epochs} ====")
            for b, batch_imgs in enumerate(batches, start=1):
                log(f"\n--- Training batch {b}/{len(batches)} ({len(batch_imgs)} images) ---")

                model = RecognitionModel(
                    training_data=batch_imgs,
                    format_type='path',
                    hyper_params={
                        'epochs': total_epochs,
                        'lag': early_stopping_patience,
                        'min_epochs': 10,
                        'quit': 'dumb',
                        'freq': 1.0,
                        'partition': 0.9,
                        'lrate': learning_rate,
                        'load_threads': 2,
                        'batch_size': mini_batch_size,
                    },
                    output='mongolian_model_temp.mlmodel',
                )

                trainer = KrakenTrainer(
                    enable_progress_bar=True,
                    enable_checkpointing=False,
                    accelerator='auto',
                )

                trainer.fit(model)

            # save after each full epoch
            epoch_path = os.path.join(
                data_path, f"checkpoints/mongolian_model_epoch_{epoch:02d}.mlmodel"
            )
            os.makedirs(os.path.dirname(epoch_path), exist_ok=True)
            if os.path.exists('mongolian_model_temp.mlmodel'):
                os.replace('mongolian_model_temp.mlmodel', epoch_path)
                log(f"Saved model checkpoint: {epoch_path}")
            else:
                log("WARNING: Temporary model file not found after epoch")

        log("="*60)
        log("Training complete!")
        log("="*60)
        log_file.close()
        return True

    except Exception as e:
        log("\n" + "="*60)
        log("EXCEPTION OCCURRED:")
        log("="*60)
        log(f"{type(e).__name__}: {e}")
        log(traceback.format_exc())
        log("="*60)
        log_file.close()
        return None

print("Batchwise training function defined — will read/write directly to Google Drive.")


In [None]:
# CELL 7 — launch training
# Start the run on the Drive dataset and report the outcome; the training
# function returns a falsy value when it fails.
success = train_mongolian_model(
    data_path='/content/drive/MyDrive/synthetic_mongolian',
    total_epochs=50,        # adjust for longer runs if needed
    mini_batch_size=32,
    images_per_batch=2500,  # Increased given smaller image size
)

print(
    "Training completed successfully! Check checkpoints folder in Google Drive."
    if success
    else "Training failed — see training_log.txt in your Drive folder for details."
)

In [None]:
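#===== CELL 7.5: Delete old checkpoints =====
# NOTE: before uncommenting, define `checkpoints_dir` and `keep_last_n` in this cell;
# neither name exists at notebook level (`checkpoints_dir` is local to train_mongolian_model).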
#old_ckpts = sorted(glob.glob(os.path.join(checkpoints_dir, "mongolian_model_epoch_*.mlmodel")))
#if len(old_ckpts) > keep_last_n:
#    for ckpt in old_ckpts[:-keep_last_n]:
#        os.remove(ckpt)


In [None]:
# ===== CELL 8: Test the model =====
# Test on a sample image
test_image = 'synthetic_mongolian/images/line_0500-1.png'

print(f"Testing model on: {test_image}")
!kraken -i {test_image} output.txt segment ocr -m mongolian_model.mlmodel

print("\nGround truth:")
!cat synthetic_mongolian/images/line_0500-1.gt.txt

print("\nModel prediction:")
!cat output.txt



In [None]:
# ===== CELL 9: Download the trained model =====
# The model path is relative to the current working directory.
from google.colab import files

trained_model = 'mongolian_model.mlmodel'
print("Downloading trained model...")
files.download(trained_model)
print("Model downloaded! You can now use it for OCR.")



In [None]:
# ===== CELL 10: (Optional) Save model to Google Drive =====
# Run this cell to keep a copy of the model on Drive for later use.

import shutil

local_model = 'mongolian_model.mlmodel'
drive_model = '/content/drive/MyDrive/mongolian_model.mlmodel'
shutil.copy(local_model, drive_model)
print("Model saved to Google Drive!")