# OCR-Devanagari-CRNN — Dataset Analysis & Training Pipeline

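This notebook builds an OCR model for Devanagari script using a CRNN (CNN feature extractor + bidirectional LSTM) trained with CTC loss. Words extracted from the HuggingFace dataset "Sakonii/nepalitext-language-model-dataset" are rendered with HarfBuzz shaping to synthesize the training images.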

In [None]:
# One-time environment setup; presumably kept commented out so that "Run All"
# does not reinstall packages on every execution.
# NOTE(review): the saved output of this cell shows pip refusing to upgrade
# itself when invoked as `!pip ...`; it asks for
# `python -m pip install --upgrade pip setuptools` instead.
# !pip install --upgrade pip setuptools
# !pip install -r requirements.txt

Collecting pip
  Using cached pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Using cached pip-25.3-py3-none-any.whl (1.8 MB)


ERROR: To modify pip, please run the following command:
C:\Users\gaurav\miniconda3\envs\ocr\python.exe -m pip install --upgrade pip setuptools


Collecting uharfbuzz (from -r requirements.txt (line 19))
  Downloading uharfbuzz-0.52.0-cp310-abi3-win_amd64.whl.metadata (3.5 kB)
Downloading uharfbuzz-0.52.0-cp310-abi3-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   -------- ------------------------------- 0.3/1.2 MB ? eta -:--:--
   --------------------------------- ------ 1.0/1.2 MB 3.9 MB/s eta 0:00:01
   ---------------------------------------- 1.2/1.2 MB 3.5 MB/s  0:00:00
Installing collected packages: uharfbuzz
Successfully installed uharfbuzz-0.52.0


In [None]:
from datasets import load_dataset
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import random
import re
import os
import glob
import torch
import torch.nn as nn
import freetype
import uharfbuzz as hb
import cv2
from collections import Counter

In [None]:
# Fetch the NepaliText corpus through `datasets.load_dataset`.
dataset = load_dataset("Sakonii/nepalitext-language-model-dataset")

# Later cells only read the "text" column of the "train" split.
train_split = dataset["train"]
train_texts = train_split["text"]

print(f"Loaded dataset with {len(train_texts)} training samples")

In [None]:
def clean_char(c):
    """Return ``c`` when it should be kept, otherwise an empty string.

    Dropped characters:
      * newline, tab and carriage return;
      * the invisible code points U+200B..U+200F (decimal 8203-8207);
      * anything outside the Devanagari block that ``str.isprintable``
        rejects.

    Kept characters: the Devanagari block U+0900..U+097F (decimal 2304-2431)
    and every other printable character.

    Args:
        c: A single-character string.

    Returns:
        ``c`` unchanged, or ``""`` when the character is filtered out.
    """
    line_controls = ("\n", "\t", "\r")
    invisible_codepoints = (8203, 8204, 8205, 8206, 8207)
    devanagari_first, devanagari_last = 2304, 2431

    if c in line_controls:
        return ""

    codepoint = ord(c)
    if codepoint in invisible_codepoints:
        return ""

    in_devanagari = devanagari_first <= codepoint <= devanagari_last
    keep = in_devanagari or c.isprintable()
    return c if keep else ""

# Character frequency analysis: count every character that survives
# `clean_char`, skipping rows that are not strings.
char_freq = Counter()
for text in train_texts:
    if isinstance(text, str):
        cleaned = "".join(clean_char(c) for c in text)
        char_freq.update(cleaned)

print(f"✓ Unique cleaned characters: {len(char_freq)}")

# Top 50 characters
top_50 = char_freq.most_common(50)

if top_50:
    # Split the (char, count) pairs into parallel tuples for plotting.
    chars, freqs = zip(*top_50)

    # Explicit figure/axes interface so the cell does not depend on
    # whatever figure pyplot currently considers "active".
    fig, ax = plt.subplots(figsize=(14, 6))
    ax.bar(chars, freqs)
    ax.tick_params(axis="x", labelrotation=90, labelsize=12)
    ax.set_title("Top 50 Characters (Cleaned)")
    ax.set_xlabel("Character")
    ax.set_ylabel("Occurrences")
    fig.tight_layout()
    plt.show()
else:
    # `zip(*[])` cannot be unpacked into two names, so an empty corpus
    # would otherwise crash here instead of reporting the problem.
    chars, freqs = (), ()
    print("No characters found; skipping the frequency plot")

## Step 1: Extract 5000 Nepali Words

Extract the unique Devanagari words (2–30 characters long) from the dataset, then randomly sample 5000 of them to use as training labels.

In [None]:
def extract_nepali_words(text):
    """Return the Devanagari runs found in ``text`` that are 2-30 characters long.

    A "word" is a maximal run of code points in U+0900..U+097F; length is
    measured in code points. Runs keep the order in which they appear and
    duplicates are not removed.

    Args:
        text: The text to scan. Any non-string value yields an empty list.

    Returns:
        A list of matching runs (possibly empty).
    """
    if isinstance(text, str):
        devanagari_runs = re.findall(r"[\u0900-\u097F]+", text)
        return list(filter(lambda run: 1 < len(run) < 31, devanagari_runs))
    return []

# Fixed seed and sample size so the same words are selected on every run.
WORD_SAMPLE_SEED = 42
TRAINING_WORD_COUNT = 5000

unique_words = set()
print("Extracting Nepali words from dataset...")
for i, text in enumerate(train_texts):
    unique_words.update(extract_nepali_words(text))
    if (i + 1) % 10000 == 0:
        print(f"  Processed {i + 1} texts, found {len(unique_words)} unique words")

# Sample 5000 words for training.
# Iteration order of a set of strings changes between interpreter runs (hash
# randomization), so sort first and shuffle with a dedicated seeded generator;
# otherwise a kernel restart would select different training words.
all_words = sorted(unique_words)
random.Random(WORD_SAMPLE_SEED).shuffle(all_words)
training_words = all_words[:TRAINING_WORD_COUNT]

print(f"\n✓ Total unique words: {len(all_words)}")
print(f"✓ Using {len(training_words)} words for training")
print(f"✓ Sample words: {training_words[:10]}")

## Step 2: Synthetic Dataset Generator

HarfBuzz-based generator for proper Devanagari shaping.

In [None]:
class SyntheticHarfBuzzOCRDatasetGenerator:
    """Generate synthetic OCR dataset with HarfBuzz shaping.

    Each input string is shaped with HarfBuzz, rasterised glyph by glyph with
    FreeType onto a light background, optionally augmented (blur, rotation,
    perspective warp, noise) and written to ``output_dir`` as ``NNNNN.png``
    next to a ``NNNNN.txt`` file that holds the label text.

    All random choices use the global ``random`` and ``np.random`` generators,
    so seed both before calling ``generate_dataset`` for reproducible output.
    """

    def __init__(
        self,
        strings,
        fonts_dir="fonts",
        output_dir="data/word_images",
        font_size_range=(40, 56),
        random_blur=True,
        random_noise=True,
        random_rotate=True,
        random_distortion=True,
        background_mode="random",
        max_image_size=1024
    ):
        """Collect the fonts and prepare the output directory.

        Args:
            strings: Sequence of label strings to render, one image each.
            fonts_dir: Directory searched recursively for ``*.ttf`` files.
            output_dir: Directory that receives the images and label files;
                created if it does not exist.
            font_size_range: Inclusive ``(min, max)`` font size used by
                ``random.randint``.
            random_blur: Apply a Gaussian blur to roughly half of the images.
            random_noise: Enable the noise augmentation (see ``add_noise``).
            random_rotate: Rotate each image by a random angle in [-7, 7] degrees.
            random_distortion: Apply a random perspective warp.
            background_mode: ``"white"``, ``"lightgray"``, or anything else
                for a random light-coloured background.
            max_image_size: Upper bound for image width and height in pixels.

        Raises:
            ValueError: If no ``.ttf`` file is found under ``fonts_dir``.
        """
        self.strings = strings
        self.fonts = glob.glob(os.path.join(fonts_dir, "**/*.ttf"), recursive=True)
        if not self.fonts:
            raise ValueError(f"No fonts found in {fonts_dir}")

        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

        self.font_size_range = font_size_range
        self.random_blur = random_blur
        self.random_noise = random_noise
        self.random_rotate = random_rotate
        self.random_distortion = random_distortion
        self.background_mode = background_mode
        self.MAX_SIZE = max_image_size

    def _clamp_image_size(self, img):
        """Shrink ``img`` in place so neither side exceeds ``self.MAX_SIZE``; return it."""
        w, h = img.size
        if w > self.MAX_SIZE or h > self.MAX_SIZE:
            # thumbnail() resizes in place and preserves the aspect ratio.
            img.thumbnail((self.MAX_SIZE, self.MAX_SIZE), Image.LANCZOS)
        return img

    def generate_dataset(self):
        """Render every string and write ``NNNNN.png`` / ``NNNNN.txt`` pairs.

        Files are numbered from 1 in the order of ``self.strings``; existing
        files with the same names are overwritten. Progress is printed every
        500 images.
        """
        for idx, text in enumerate(self.strings, start=1):
            img = self.render_text_image(text)
            image_path = os.path.join(self.output_dir, f"{idx:05d}.png")
            label_path = os.path.join(self.output_dir, f"{idx:05d}.txt")
            img.save(image_path)
            with open(label_path, "w", encoding="utf-8") as f:
                f.write(text)
            if idx % 500 == 0:
                print(f"  [{idx}/{len(self.strings)}] Generated images")

    def render_text_image(self, text, padding=20):
        """Render ``text`` with a random font and size, then augment it.

        Args:
            text: The string to shape and draw.
            padding: Margin in pixels added on every side of the text box.

        Returns:
            A PIL image containing the rendered (and possibly augmented) text.
        """
        # Function-scope import: only `Image` is imported from PIL at file level.
        from PIL import ImageFilter

        font_path = random.choice(self.fonts)
        font_size = random.randint(*self.font_size_range)
        face = freetype.Face(font_path)
        face.set_char_size(font_size * 64)

        # HarfBuzz shaping
        hb_blob = hb.Blob.from_file_path(font_path)
        hb_face = hb.Face(hb_blob, 0)
        hb_font = hb.Font(hb_face)
        # The scale is the size of one em in output units. Using the em size in
        # 1/64 px makes advances and offsets match the size FreeType rasterises
        # at; the ascender is a different metric and gave inconsistent spacing.
        em_scale = font_size * 64
        hb_font.scale = (em_scale, em_scale)

        buf = hb.Buffer()
        buf.add_str(text)
        buf.guess_segment_properties()
        hb.shape(hb_font, buf)

        infos = buf.glyph_infos
        positions = buf.glyph_positions

        # Advances are in 1/64 px, hence the division by 64.
        width = sum(pos.x_advance for pos in positions) // 64 + 2 * padding
        height = font_size + 2 * padding

        if self.background_mode == "white":
            img = Image.new("RGB", (width, height), "white")
        elif self.background_mode == "lightgray":
            img = Image.new("RGB", (width, height), "lightgray")
        else:
            arr = np.random.randint(200, 255, (height, width, 3), dtype=np.uint8)
            img = Image.fromarray(arr)

        # Pen position: the baseline sits `font_size` px below the top padding.
        pen_x, pen_y = padding, padding + font_size

        for info, pos in zip(infos, positions):
            # After shaping, `codepoint` holds the glyph index in the font.
            face.load_glyph(info.codepoint, freetype.FT_LOAD_RENDER)
            bitmap = face.glyph.bitmap

            if bitmap.width > 0 and bitmap.rows > 0:
                glyph_mask = Image.frombytes("L", (bitmap.width, bitmap.rows), bytes(bitmap.buffer))
                ink = Image.new("RGB", glyph_mask.size, "black")
                # Apply the per-glyph offsets produced by shaping (mark
                # positioning). HarfBuzz's y axis points up, image rows grow
                # downwards, so the y offset is subtracted.
                dest_x = pen_x + pos.x_offset / 64 + face.glyph.bitmap_left
                dest_y = pen_y - pos.y_offset / 64 - face.glyph.bitmap_top
                img.paste(ink, (int(dest_x), int(dest_y)), glyph_mask)

            pen_x += pos.x_advance / 64
            pen_y -= pos.y_advance / 64

        img = self._clamp_image_size(img)

        if self.random_blur and random.random() < 0.5:
            # Image.filter() only accepts ImageFilter objects; the previous
            # extra call with a resampling constant raised TypeError.
            img = img.filter(ImageFilter.GaussianBlur(radius=random.uniform(0.5, 1.5)))

        if self.random_rotate:
            angle = random.randint(-7, 7)
            img = img.rotate(angle, expand=True, fillcolor="white")
            img = self._clamp_image_size(img)

        if self.random_distortion:
            img = self.perspective_distortion(img)

        if self.random_noise:
            img = self.add_noise(img)

        return img

    def perspective_distortion(self, img):
        """Warp ``img`` by moving each corner up to 10% of the shorter side.

        The output keeps the input size; uncovered areas are filled with white.
        """
        img = self._clamp_image_size(img)
        w, h = img.size
        arr = np.array(img)
        shift = min(w, h) * 0.1

        pts1 = np.float32([[0,0],[w,0],[0,h],[w,h]])
        pts2 = np.float32([
            [random.uniform(-shift, shift), random.uniform(-shift, shift)],
            [w + random.uniform(-shift, shift), random.uniform(-shift, shift)],
            [random.uniform(-shift, shift), h + random.uniform(-shift, shift)],
            [w + random.uniform(-shift, shift), h + random.uniform(-shift, shift)],
        ])
        matrix = cv2.getPerspectiveTransform(pts1, pts2)
        warped = cv2.warpPerspective(arr, matrix, (w,h), borderMode=cv2.BORDER_CONSTANT, borderValue=(255,255,255))
        return Image.fromarray(warped)

    @staticmethod
    def _salt_and_pepper(arr, amount=0.02):
        """Set random elements of ``arr`` to 255 (salt) and 0 (pepper) in place.

        ``amount`` is the fraction of array elements touched, split evenly
        between salt and pepper. Returns ``arr``.
        """
        if arr.size == 0:
            return arr
        num_salt = int(arr.size * amount * 0.5)
        num_pepper = int(arr.size * amount * 0.5)
        for count, value in ((num_salt, 255), (num_pepper, 0)):
            # randint's upper bound is exclusive, so pass the full dimension;
            # `dim - 1` skipped the last row/column/channel and failed on
            # dimensions of size 1.
            coords = tuple(np.random.randint(0, dim, count) for dim in arr.shape)
            arr[coords] = value
        return arr

    def add_noise(self, img):
        """Add Gaussian noise and/or salt-and-pepper noise, each with probability 0.5."""
        arr = np.array(img).astype(np.float32)
        if random.random() < 0.5:
            arr += np.random.normal(0, 10, arr.shape)
        if random.random() < 0.5:
            arr = self._salt_and_pepper(arr, amount=0.02)
        arr = np.clip(arr, 0, 255)
        return Image.fromarray(arr.astype(np.uint8))

## Step 3: Generate Dataset

In [None]:
# Seed the global generators used for font choice, backgrounds and
# augmentations, so the image set is the same on every run.
GENERATION_SEED = 42
random.seed(GENERATION_SEED)
np.random.seed(GENERATION_SEED)

print("=" * 70)
# Report the real sample count rather than a hard-coded 5000.
print(f"GENERATING SYNTHETIC DATASET ({len(training_words)} WORD SAMPLES)")
print("=" * 70)

generator = SyntheticHarfBuzzOCRDatasetGenerator(
    strings=training_words,
    fonts_dir="fonts",
    output_dir="data/word_images",
    font_size_range=(40, 56),
    random_blur=True,
    random_noise=True,
    random_rotate=True,
    random_distortion=True,
    background_mode="random",
    max_image_size=1024
)

generator.generate_dataset()

print("\n✓ DATASET GENERATION COMPLETE!")
print("=" * 70)

# Verify
output_dir = "data/word_images"
image_files = [f for f in os.listdir(output_dir) if f.endswith(".png")]
print(f"✓ Generated {len(image_files)} images")

# The folder is never cleared, so images left by an earlier, larger run would
# be counted as well; surface the mismatch instead of hiding it.
if len(image_files) != len(training_words):
    print(f"⚠ Expected {len(training_words)} images; {output_dir} may contain files from an earlier run")

## Step 4: Create Charset

In [None]:
# Collect every distinct character used by the training words and store them
# in sorted (code point) order.
unique_chars = {ch for word in training_words for ch in word}
charset = sorted(unique_chars)

# The characters are written back to back on a single line.
with open("charset.txt", "w", encoding="utf-8") as charset_file:
    charset_file.write("".join(charset))

print(f"✓ Charset: {len(charset)} unique characters")
print(f"✓ num_classes = {len(charset) + 1} (including CTC blank)")

## Step 5: Model Architecture

In [None]:
class CRNNFeatureExtractor(nn.Module):
    """CNN backbone for CRNN.

    Seven 3x3/2x2 convolutions with ReLU, two batch-norm layers and four
    max-pool layers. The first two pools halve both height and width; the last
    two halve only the height, so horizontal resolution is kept for the
    sequence model. The final feature map has 512 channels.

    Args:
        img_channels: Number of channels of the input images.
    """
    def __init__(self, img_channels=1):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(img_channels, 64, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(256),
            nn.Conv2d(256, 256, 3, stride=1, padding=1),
            nn.ReLU(),
            # (2, 1) pooling: halve the height, keep the width.
            nn.MaxPool2d((2, 1), (2, 1)),
            nn.Conv2d(256, 512, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(512),
            nn.Conv2d(512, 512, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2, 1), (2, 1)),
            # Unpadded 2x2 convolution: shrinks height and width by one.
            nn.Conv2d(512, 512, 2, stride=1, padding=0),
            nn.ReLU()
        )

    def forward(self, x):
        """Turn a batch of images into a width-major feature sequence.

        Args:
            x: Tensor of shape (batch, channels, height, width).

        Returns:
            Tensor of shape (width', batch, 512), where width' is the width of
            the final feature map. The remaining height is averaged out.

        Raises:
            ValueError: If ``x`` is not a 4D tensor.
        """
        if x.dim() != 4:
            raise ValueError(
                f"expected a 4D (batch, channels, height, width) tensor, got {x.dim()}D input"
            )
        feature_map = self.cnn(x)
        # Collapse the height axis, then move width first: (B, C, W) -> (W, B, C).
        pooled = feature_map.mean(dim=2)
        return pooled.permute(2, 0, 1)


class BidirectionalLSTM(nn.Module):
    """Single-layer bidirectional LSTM followed by a linear projection.

    Args:
        input_size: Feature size of each input step.
        hidden_size: Hidden size of the LSTM per direction.
        output_size: Feature size of each output step after the projection.
    """
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            bidirectional=True,
        )
        # Forward and backward states are concatenated, hence twice the hidden size.
        self.embedding = nn.Linear(
            in_features=2 * hidden_size,
            out_features=output_size,
        )

    def forward(self, x):
        """Run the LSTM over ``x`` and project every step to ``output_size``."""
        # nn.LSTM returns (output, (h_n, c_n)); only the per-step output is used.
        sequence_states = self.rnn(x)[0]
        projected = self.embedding(sequence_states)
        return projected


class OCRModel(nn.Module):
    """CRNN + BiLSTM + CTC.

    Chains the CNN feature extractor with two stacked bidirectional LSTM
    blocks; the second block projects to ``num_classes`` scores per step.
    Class index 0 is used as the CTC blank.

    Args:
        num_classes: Size of the output layer, including the blank class.
        img_channels: Number of channels of the input images.
        hidden_size: Hidden size shared by both LSTM blocks.
    """
    def __init__(self, num_classes, img_channels=1, hidden_size=256):
        super().__init__()
        self.cnn = CRNNFeatureExtractor(img_channels)
        # 512 is the channel count produced by the feature extractor.
        encoder = BidirectionalLSTM(512, hidden_size, hidden_size)
        classifier = BidirectionalLSTM(hidden_size, hidden_size, num_classes)
        self.rnn = nn.Sequential(encoder, classifier)
        self.ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)

    def forward(self, x):
        """Return the unnormalised per-step class scores for the images ``x``."""
        return self.rnn(self.cnn(x))

    def compute_ctc_loss(self, preds, targets, pred_lengths, target_lengths):
        """Compute the CTC loss for raw model outputs.

        ``preds`` is normalised with log-softmax over its last (class) axis
        before being passed to ``nn.CTCLoss`` with the given targets and
        lengths.
        """
        log_probs = preds.log_softmax(dim=2)
        loss = self.ctc_loss(log_probs, targets, pred_lengths, target_lengths)
        return loss

print("✓ Model classes defined")

## Step 6: Create Config & Start Training

In [None]:
import subprocess
import sys

# Create config.
# - `weight_decay` is written as `1.0e-5`: YAML 1.1 loaders such as PyYAML only
#   recognise exponent notation as a float when the mantissa contains a dot, so
#   a bare `1e-5` would be loaded as the string "1e-5".
# - `train_samples` is taken from the sampled word list so the file cannot
#   disagree with the data that was actually generated.
config_content = f"""# CRNN OCR Configuration
num_classes: {len(charset) + 1}
num_channels: 1
hidden_size: 256

img_height: 32
img_width: 256

batch_size: 64
epochs: 50
learning_rate: 0.001
weight_decay: 1.0e-5
scheduler_step: 15
scheduler_gamma: 0.5

train_samples: {len(training_words)}
samples_per_word: 1
fonts_dir: "fonts"
output_dir: "data/word_images"
charset_path: "charset.txt"
"""

# Explicit encoding, matching how charset.txt is written, so the result does
# not depend on the platform's default encoding.
with open("config.yaml", "w", encoding="utf-8") as f:
    f.write(config_content)

print("✓ config.yaml created")
print(f"✓ Charset size: {len(charset)} → {len(charset) + 1} classes")
print("\nTo start training, run: python scripts/train_word_ocr.py")

## Summary

Pipeline complete:
- ✅ Extracted 5000 unique Nepali words
- ✅ Generated synthetic images (HarfBuzz + augmentations)
- ✅ Created charset.txt
- ✅ Defined OCR model (CRNN + BiLSTM + CTC)
- ✅ Ready for training