# OCR-Devanagari-CRNN — Dataset Analysis & Training Pipeline

This notebook creates an OCR for Devanagari using CRNN + LSTM, leveraging the HuggingFace dataset "Sakonii/nepalitext-language-model-dataset" to synthesize training images.

In [None]:
# !pip install --upgrade pip setuptools
# !pip install -r requirements.txt

Collecting pip
  Using cached pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Using cached pip-25.3-py3-none-any.whl (1.8 MB)


ERROR: To modify pip, please run the following command:
C:\Users\gaurav\miniconda3\envs\ocr\python.exe -m pip install --upgrade pip setuptools


Collecting uharfbuzz (from -r requirements.txt (line 19))
  Downloading uharfbuzz-0.52.0-cp310-abi3-win_amd64.whl.metadata (3.5 kB)
Downloading uharfbuzz-0.52.0-cp310-abi3-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   -------- ------------------------------- 0.3/1.2 MB ? eta -:--:--
   --------------------------------- ------ 1.0/1.2 MB 3.9 MB/s eta 0:00:01
   ---------------------------------------- 1.2/1.2 MB 3.5 MB/s  0:00:00
Installing collected packages: uharfbuzz
Successfully installed uharfbuzz-0.52.0


In [1]:
from datasets import load_dataset
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import random
import re
import os
import glob
import torch
import torch.nn as nn
import freetype
import uharfbuzz as hb
import cv2
from collections import Counter

In [2]:
# Load the NepaliText dataset
dataset = load_dataset("Sakonii/nepalitext-language-model-dataset")
train_texts = dataset["train"]["text"]
print(f"Loaded dataset with {len(train_texts)} training samples")

Loaded dataset with 13141222 training samples


In [3]:
# Clean character function
def clean_char(c):
    if c in ["\n", "\t", "\r"]:
        return ""
    if ord(c) in [8203, 8204, 8205, 8206, 8207]:
        return ""
    if 2304 <= ord(c) <= 2431:  # Devanagari block
        return c
    if c.isprintable():
        return c
    return ""

# Character frequency analysis
char_freq = Counter()
for text in train_texts:
    if isinstance(text, str):
        cleaned = "".join(clean_char(c) for c in text)
        char_freq.update(cleaned)

print(f"✓ Unique cleaned characters: {len(char_freq)}")

# Top 50 characters
top_50 = char_freq.most_common(50)
chars, freqs = zip(*top_50)

plt.figure(figsize=(14, 6))
plt.bar(chars, freqs)
plt.xticks(rotation=90, fontsize=12)
plt.title("Top 50 Characters (Cleaned)")
plt.tight_layout()
plt.show()

KeyboardInterrupt: 

## Step 1: Extract 5000 Nepali Words

Extract unique Devanagari words from the dataset for training.

In [4]:
# Extract Devanagari words
def extract_nepali_words(text):
    """Extract Devanagari words from text."""
    if not isinstance(text, str):
        return []
    matches = re.findall(r"[\u0900-\u097F]+", text)
    return [w for w in matches if 2 <= len(w) <= 30]

all_words = set()
print("Extracting Nepali words from dataset...")
for i, text in enumerate(train_texts):
    words = extract_nepali_words(text)
    all_words.update(words)
    if (i + 1) % 10000 == 0:
        print(f"  Processed {i + 1} texts, found {len(all_words)} unique words")

# Sample 5000 words for training
all_words = list(all_words)
random.shuffle(all_words)
training_words = all_words[:5000]

print(f"\n✓ Total unique words: {len(all_words)}")
print(f"✓ Using {len(training_words)} words for training")
print(f"✓ Sample words: {training_words[:10]}")

Extracting Nepali words from dataset...
  Processed 10000 texts, found 49361 unique words
  Processed 20000 texts, found 77417 unique words
  Processed 30000 texts, found 100764 unique words
  Processed 40000 texts, found 119936 unique words
  Processed 50000 texts, found 137609 unique words
  Processed 60000 texts, found 153427 unique words
  Processed 70000 texts, found 167843 unique words
  Processed 80000 texts, found 182163 unique words
  Processed 90000 texts, found 195560 unique words
  Processed 100000 texts, found 208380 unique words
  Processed 110000 texts, found 220954 unique words
  Processed 120000 texts, found 232162 unique words
  Processed 130000 texts, found 243317 unique words
  Processed 140000 texts, found 254115 unique words
  Processed 150000 texts, found 264583 unique words
  Processed 160000 texts, found 274815 unique words
  Processed 170000 texts, found 285196 unique words
  Processed 180000 texts, found 294661 unique words
  Processed 190000 texts, found 305

## Step 2: Synthetic Dataset Generator

HarfBuzz-based generator for proper Devanagari shaping.

In [7]:
from PIL import ImageFilter


class SyntheticHarfBuzzOCRDatasetGenerator:
    """Generate synthetic OCR dataset with HarfBuzz shaping."""

    def __init__(
        self,
        strings,
        fonts_dir="fonts",
        output_dir="data/word_images",
        font_size_range=(40, 56),
        random_blur=True,
        random_noise=True,
        random_rotate=True,
        random_distortion=True,
        background_mode="random",
        max_image_size=1024
    ):
        self.strings = strings
        self.fonts = glob.glob(os.path.join(fonts_dir, "**/*.ttf"), recursive=True)
        if not self.fonts:
            raise ValueError(f"No fonts found in {fonts_dir}")

        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

        self.font_size_range = font_size_range
        self.random_blur = random_blur
        self.random_noise = random_noise
        self.random_rotate = random_rotate
        self.random_distortion = random_distortion
        self.background_mode = background_mode
        self.MAX_SIZE = max_image_size

    def _clamp_image_size(self, img):
        w, h = img.size
        if w > self.MAX_SIZE or h > self.MAX_SIZE:
            img.thumbnail((self.MAX_SIZE, self.MAX_SIZE), Image.LANCZOS)
        return img

    def generate_dataset(self):
        for idx, text in enumerate(self.strings, start=1):
            img = self.render_text_image(text)
            image_path = os.path.join(self.output_dir, f"{idx:05d}.png")
            label_path = os.path.join(self.output_dir, f"{idx:05d}.txt")
            img.save(image_path)
            with open(label_path, "w", encoding="utf-8") as f:
                f.write(text)
            if idx % 500 == 0:
                print(f"  [{idx}/{len(self.strings)}] Generated images")

    def render_text_image(self, text, padding=20):
        font_path = random.choice(self.fonts)
        font_size = random.randint(*self.font_size_range)
        face = freetype.Face(font_path)
        face.set_char_size(font_size * 64)

        # HarfBuzz shaping
        hb_blob = hb.Blob.from_file_path(font_path)
        hb_face = hb.Face(hb_blob, 0)
        hb_font = hb.Font(hb_face)
        hb_font.scale = (face.size.ascender, face.size.ascender)

        buf = hb.Buffer()
        buf.add_str(text)
        buf.guess_segment_properties()
        hb.shape(hb_font, buf)

        infos = buf.glyph_infos
        positions = buf.glyph_positions

        width = sum(pos.x_advance for pos in positions) // 64 + 2*padding
        height = font_size + 2*padding

        if self.background_mode == "white":
            img = Image.new("RGB", (width, height), "white")
        elif self.background_mode == "lightgray":
            img = Image.new("RGB", (width, height), "lightgray")
        else:
            arr = np.random.randint(200, 255, (height, width, 3), dtype=np.uint8)
            img = Image.fromarray(arr)

        x, y = padding, padding + font_size

        for info, pos in zip(infos, positions):
            glyph_index = info.codepoint
            face.load_glyph(glyph_index, freetype.FT_LOAD_RENDER)
            bitmap = face.glyph.bitmap
            top = face.glyph.bitmap_top
            left = face.glyph.bitmap_left

            if bitmap.width > 0 and bitmap.rows > 0:
                glyph_img = Image.frombytes("L", (bitmap.width, bitmap.rows), bytes(bitmap.buffer))
                colored_glyph = Image.new("RGB", glyph_img.size, "black")
                img.paste(colored_glyph, (int(x + left), int(y - top)), glyph_img)

            x += pos.x_advance / 64
            y -= pos.y_advance / 64

        img = self._clamp_image_size(img)

        # --- FIXED BLUR SECTION ---
        from PIL import ImageFilter

        if self.random_blur and random.random() < 0.5:
            img = img.filter(ImageFilter.GaussianBlur(radius=random.uniform(0.5, 1.5)))


        if self.random_rotate:
            angle = random.randint(-7, 7)
            img = img.rotate(angle, expand=True, fillcolor="white")
            img = self._clamp_image_size(img)

        if self.random_distortion:
            img = self.perspective_distortion(img)

        if self.random_noise:
            img = self.add_noise(img)

        return img

    def perspective_distortion(self, img):
        img = self._clamp_image_size(img)
        w, h = img.size
        arr = np.array(img)
        shift = min(w, h) * 0.1

        pts1 = np.float32([[0,0],[w,0],[0,h],[w,h]])
        pts2 = np.float32([
            [random.uniform(-shift, shift), random.uniform(-shift, shift)],
            [w + random.uniform(-shift, shift), random.uniform(-shift, shift)],
            [random.uniform(-shift, shift), h + random.uniform(-shift, shift)],
            [w + random.uniform(-shift, shift), h + random.uniform(-shift, shift)],
        ])
        matrix = cv2.getPerspectiveTransform(pts1, pts2)
        warped = cv2.warpPerspective(arr, matrix, (w,h), borderMode=cv2.BORDER_CONSTANT, borderValue=(255,255,255))
        return Image.fromarray(warped)

    def add_noise(self, img):
        arr = np.array(img).astype(np.float32)
        if random.random() < 0.5:
            arr += np.random.normal(0, 10, arr.shape)
        if random.random() < 0.5:
            amount = 0.02
            num_salt = int(arr.size * amount * 0.5)
            num_pepper = int(arr.size * amount * 0.5)
            coords = [np.random.randint(0, i - 1, num_salt) for i in arr.shape]
            arr[tuple(coords)] = 255
            coords = [np.random.randint(0, i - 1, num_pepper) for i in arr.shape]
            arr[tuple(coords)] = 0
        arr = np.clip(arr, 0, 255)
        return Image.fromarray(arr.astype(np.uint8))

## Step 3: Generate Dataset

In [8]:
print("=" * 70)
print("GENERATING SYNTHETIC DATASET (5000 WORD SAMPLES)")
print("=" * 70)

generator = SyntheticHarfBuzzOCRDatasetGenerator(
    strings=training_words,
    fonts_dir="fonts",
    output_dir="data/word_images",
    font_size_range=(40, 56),
    random_blur=True,
    random_noise=True,
    random_rotate=True,
    random_distortion=True,
    background_mode="random",
    max_image_size=1024
)

generator.generate_dataset()

print("\n✓ DATASET GENERATION COMPLETE!")
print("=" * 70)

# Verify
output_dir = "data/word_images"
image_files = [f for f in os.listdir(output_dir) if f.endswith(".png")]
print(f"✓ Generated {len(image_files)} images")

GENERATING SYNTHETIC DATASET (5000 WORD SAMPLES)
  [500/5000] Generated images
  [1000/5000] Generated images
  [1500/5000] Generated images
  [2000/5000] Generated images
  [2500/5000] Generated images
  [3000/5000] Generated images
  [3500/5000] Generated images
  [4000/5000] Generated images
  [4500/5000] Generated images
  [5000/5000] Generated images

✓ DATASET GENERATION COMPLETE!
✓ Generated 5000 images


## Step 4: Create Charset

In [9]:
# Extract unique characters
charset = set()
for word in training_words:
    charset.update(word)
charset = sorted(list(charset))

with open("charset.txt", "w", encoding="utf-8") as f:
    f.write("".join(charset))

print(f"✓ Charset: {len(charset)} unique characters")
print(f"✓ num_classes = {len(charset) + 1} (including CTC blank)")

✓ Charset: 80 unique characters
✓ num_classes = 81 (including CTC blank)


## Step 5: Model Architecture

In [10]:
class CRNNFeatureExtractor(nn.Module):
    """CNN backbone for CRNN."""
    def __init__(self, img_channels=1):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(img_channels, 64, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(256),
            nn.Conv2d(256, 256, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2, 1), (2, 1)),
            nn.Conv2d(256, 512, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(512),
            nn.Conv2d(512, 512, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d((2, 1), (2, 1)),
            nn.Conv2d(512, 512, 2, stride=1, padding=0),
            nn.ReLU()
        )

    def forward(self, x):
        conv_output = self.cnn(x)
        b, c, h, w = conv_output.size()
        conv_output = conv_output.mean(2)
        return conv_output.permute(2, 0, 1)


class BidirectionalLSTM(nn.Module):
    """BiLSTM layer."""
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers=1, bidirectional=True)
        self.embedding = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        recurrent, _ = self.rnn(x)
        return self.embedding(recurrent)


class OCRModel(nn.Module):
    """CRNN + BiLSTM + CTC."""
    def __init__(self, num_classes, img_channels=1, hidden_size=256):
        super().__init__()
        self.cnn = CRNNFeatureExtractor(img_channels)
        self.rnn = nn.Sequential(
            BidirectionalLSTM(512, hidden_size, hidden_size),
            BidirectionalLSTM(hidden_size, hidden_size, num_classes)
        )
        self.ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)

    def forward(self, x):
        features = self.cnn(x)
        return self.rnn(features)

    def compute_ctc_loss(self, preds, targets, pred_lengths, target_lengths):
        preds_log = preds.log_softmax(2)
        return self.ctc_loss(preds_log, targets, pred_lengths, target_lengths)

print("✓ Model classes defined")

✓ Model classes defined


## Step 6: Create Config & Start Training

In [11]:
import subprocess
import sys

# Create config
config_content = f"""# CRNN OCR Configuration
num_classes: {len(charset) + 1}
num_channels: 1
hidden_size: 256

img_height: 32
img_width: 256

batch_size: 64
epochs: 50
learning_rate: 0.001
weight_decay: 1e-5
scheduler_step: 15
scheduler_gamma: 0.5

train_samples: 5000
samples_per_word: 1
fonts_dir: "fonts"
output_dir: "data/word_images"
charset_path: "charset.txt"
"""

with open("config.yaml", "w") as f:
    f.write(config_content)

print("✓ config.yaml created")
print(f"✓ Charset size: {len(charset)} → {len(charset) + 1} classes")
print("\nTo start training, run: python scripts/train_word_ocr.py")

✓ config.yaml created
✓ Charset size: 80 → 81 classes

To start training, run: python scripts/train_word_ocr.py


## Summary

Pipeline complete:
- ✅ Extracted 5000 unique Nepali words
- ✅ Generated synthetic images (HarfBuzz + augmentations)
- ✅ Created charset.txt
- ✅ Defined OCR model (CRNN + BiLSTM + CTC)
- ✅ Ready for training

In [22]:
import yaml

with open("config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print("Loaded config:", cfg)


Loaded config: {'num_classes': 81, 'num_channels': 1, 'hidden_size': 256, 'img_height': 32, 'img_width': 256, 'batch_size': 64, 'epochs': 50, 'learning_rate': 0.001, 'weight_decay': '1e-5', 'scheduler_step': 15, 'scheduler_gamma': 0.5, 'train_samples': 5000, 'samples_per_word': 1, 'fonts_dir': 'fonts', 'output_dir': 'data/word_images', 'charset_path': 'charset.txt'}


In [28]:
with open("config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# FIX the incorrect type
cfg["weight_decay"] = float(cfg["weight_decay"])


# 1. OCR Dataset Class

In [29]:
class OCRDataset(Dataset):
    def __init__(self, root, charset_path, img_h, img_w):
        self.root = root
        self.img_h = img_h
        self.img_w = img_w

        self.samples = []
        for f in os.listdir(root):
            if f.endswith(".png"):
                idx = f[:-4]
                txt = os.path.join(root, idx + ".txt")
                if os.path.exists(txt):
                    self.samples.append((os.path.join(root, f), txt))

        with open(charset_path, "r", encoding="utf-8") as f:
            self.charset = ["blank"] + [c.strip() for c in f]

        self.char_to_idx = {c: i for i, c in enumerate(self.charset)}

    def encode(self, text):
        return torch.tensor([self.char_to_idx[c] for c in text if c in self.char_to_idx])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_path, txt_path = self.samples[idx]

        img = Image.open(img_path).convert("L")
        img = img.resize((self.img_w, self.img_h), Image.LANCZOS)
        img = torch.tensor(img, dtype=torch.float32).unsqueeze(0) / 255.0

        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read().strip()

        return img, self.encode(text)


# Collate Function (for CTC)

In [30]:
def ocr_collate(batch):
    imgs, texts = zip(*batch)
    imgs = torch.stack(imgs)

    flat_targets = torch.cat(texts)
    target_lengths = torch.tensor([len(t) for t in texts])
    pred_lengths = torch.full((len(imgs),), imgs.shape[-1] // 4, dtype=torch.long)

    return imgs, flat_targets, pred_lengths, target_lengths


# 3. Load Data

In [31]:
dataset = OCRDataset(
    root=cfg["output_dir"],
    charset_path=cfg["charset_path"],
    img_h=cfg["img_height"],
    img_w=cfg["img_width"]
)

train_loader = DataLoader(
    dataset,
    batch_size=cfg["batch_size"],
    shuffle=True,
    collate_fn=ocr_collate,
    num_workers=2
)


# 4. Training Script

In [32]:
num_classes = cfg["num_classes"]
model = OCRModel(
    num_classes=num_classes,
    img_channels=cfg["num_channels"],
    hidden_size=cfg["hidden_size"]
).to(device)


# 6. Optimizer + Scheduler

In [33]:
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=cfg["learning_rate"],
    weight_decay=cfg["weight_decay"]
)

scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=cfg["scheduler_step"],
    gamma=cfg["scheduler_gamma"]
)


# 7. Full Training Loop (CTC)

In [None]:
loss_history = []
EPOCHS = cfg["epochs"]

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0

    for imgs, targets, pred_lens, target_lens in train_loader:
        imgs = imgs.to(device)
        targets = targets.to(device)
        pred_lens = pred_lens.to(device)
        target_lens = target_lens.to(device)

        preds = model(imgs)
        loss = model.compute_ctc_loss(preds, targets, pred_lens, target_lens)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    loss_history.append(avg_loss)
    scheduler.step()

    print(f"Epoch {epoch}/{EPOCHS} - Loss: {avg_loss:.4f}")


# Saving the model

In [None]:
os.makedirs("checkpoints", exist_ok=True)
save_path = "checkpoints/nepali_crnn_ctc.pth"
torch.save(model.state_dict(), save_path)
print("✓ Model saved at", save_path)


# Plot the curve

In [None]:
plt.figure(figsize=(8,4))
plt.plot(loss_history, label="Training Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("CRNN-CTC Training Curve")
plt.grid()
plt.legend()
plt.show()
