# Nepali Handwriting Recognition Model Training Notebook

This notebook prepares a copy of the Nepali handwritten-word recognition CNN (`NepaliWordCNN`) from the Nepali-STT-Model repository for training within the notebook environment. Follow the steps below to set up, inspect, and prepare the model for training.

## 1. Clone the Repository
Clone the Nepali-STT-Model repository (if not already present in your environment).

In [None]:
# If running in Colab or a fresh environment, uncomment below:
# !git clone https://github.com/sandeshbhatta495/Nepali-STT-Model.git
# %cd Nepali-STT-Model

## 2. Install Required Dependencies
Install all necessary Python packages for model training and data processing.

In [None]:
# If running in Colab or a fresh environment, uncomment below:
# !pip install torch torchvision pillow numpy matplotlib

## 3. Load and Inspect the Model Code
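Read and review the model definition (`handwriting_recognition/cnn_model/model.py`) to understand the architecture and dependencies. The path is relative, so run this cell from the root of the cloned repository.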

In [None]:
# Read the model definition from the repository and echo it for reference.
# The path is relative to the current working directory.
with open('handwriting_recognition/cnn_model/model.py', mode='r', encoding='utf-8') as f:
    model_source = f.read()
print(model_source)

"""
Lightweight CNN classifier for Nepali word recognition.

Architecture:
  Input:  1 × 64 × 192   (grayscale, height × width)
  Conv → BN → ReLU → Pool  ×3
  Global Average Pool
  FC → Dropout → FC (num_classes)
  CrossEntropy loss

Total params: ~200K–500K  →  < 2 MB saved model.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F


class NepaliWordCNN(nn.Module):
    """
    3-layer CNN for word-level classification.

    Image flows:
        1×64×192
      → Conv 32 (3×3) → BN → ReLU → MaxPool(2) → 32×32×96
      → Conv 64 (3×3) → BN → ReLU → MaxPool(2) → 64×16×48
      → Conv 128(3×3) → BN → ReLU → MaxPool(2) → 128×8×24
      → AdaptiveAvgPool(1) → 128
      → FC 128 → ReLU → Dropout → FC num_classes
    """

    def __init__(self, num_classes: int, dropout: float = 0.3):
        super().__init__()

        self.features = nn.Sequential(
            # Block 1
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn

## 4. Copy Model Definition to Notebook
Copy the NepaliWordCNN model class and any required helper functions directly into this notebook for training.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NepaliWordCNN(nn.Module):
    """
    3-layer CNN for word-level classification.

    Layer stack (shapes shown for a 1×64×192 grayscale input):
        Conv 32 (3×3, pad 1) → BN → ReLU → MaxPool(2) → 32×32×96
        Conv 64 (3×3, pad 1) → BN → ReLU → MaxPool(2) → 64×16×48
        Conv 128 (3×3, pad 1) → BN → ReLU → MaxPool(2) → 128×8×24
        AdaptiveAvgPool(1) → 128
        FC 128 → ReLU → Dropout → FC num_classes

    Args:
        num_classes: Number of output classes (size of the final linear layer).
        dropout: Dropout probability applied between the two linear layers.
    """

    def __init__(self, num_classes: int, dropout: float = 0.3):
        super().__init__()
        self.features = nn.Sequential(
            # Block 1: 1 → 32 channels
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            # Block 2: 32 → 64 channels
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            # Block 3: 64 → 128 channels
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
        )
        # Collapses each of the 128 feature maps to a single value, so the
        # classifier input width does not depend on the spatial size.
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Linear(128, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw class logits of shape (batch, num_classes).

        Args:
            x: Image batch with a single channel, shape (batch, 1, H, W).
        """
        x = self.features(x)
        x = self.global_pool(x)
        # (batch, 128, 1, 1) -> (batch, 128)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

    def predict_topk(self, x: torch.Tensor, k: int = 5):
        """Return the ``k`` most probable classes for each sample.

        ``k`` is clamped to the number of classes, so asking for more
        candidates than the model has outputs returns all classes instead of
        raising. This method does not switch the module to eval mode and does
        not disable gradient tracking; callers do that when needed.

        Args:
            x: Image batch with a single channel, shape (batch, 1, H, W).
            k: Maximum number of candidates to return per sample.

        Returns:
            A ``(topk_probs, topk_idx)`` pair, each of shape
            (batch, min(k, num_classes)), with probabilities sorted in
            descending order.
        """
        # Call the module rather than forward() so registered hooks run.
        logits = self(x)
        probs = F.softmax(logits, dim=-1)
        # torch.topk raises when k exceeds the size of the reduced dimension.
        k = min(k, probs.size(-1))
        topk_probs, topk_idx = torch.topk(probs, k, dim=-1)
        return topk_probs, topk_idx

def count_parameters(model: nn.Module) -> int:
    """Return the total element count of the trainable parameters of *model*.

    Parameters whose ``requires_grad`` flag is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        total += param.numel()
    return total

## 5. Verify Model Instantiation
Instantiate the model to confirm it runs without errors and is ready for training.

In [None]:
# Build the network with a placeholder class count, then report its layer
# layout and trainable-parameter total.
VOCAB_SIZE = 237  # Replace with actual vocab size if different
model = NepaliWordCNN(num_classes=VOCAB_SIZE)
print(model)
n_trainable = count_parameters(model)
print(f"Trainable parameters: {n_trainable:,}")

NepaliWordCNN(
  (features): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU(inplace=True)
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (global_pool): AdaptiveAvgPool2d(output_size=1)
  (classifier): Sequential(
    (0): Linear(in_features=128, out_features=