In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import cv2
import os
import numpy as np
import pytesseract
from PIL import Image
from sklearn.metrics import precision_score, recall_score, f1_score
import Levenshtein
import matplotlib.pyplot as plt

In [None]:
# ====================
# 1. Loading and Preparing Dataset
# ====================
class HandwrittenTextDataset(Dataset):
    def __init__(self, root_dir, transform=None, min_images=1500):
        self.root_dir = root_dir
        self.transform = transform
        self.image_files = []

        # Recursively search for images in subdirectories
        for subdir, _, files in os.walk(root_dir):
            for file in files:
                if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                    self.image_files.append(os.path.join(subdir, file))

        # Shuffle and select min_images
        np.random.shuffle(self.image_files)
        self.image_files = self.image_files[:min_images]  # Limit dataset

        if len(self.image_files) == 0:
            raise ValueError(f"No images found in dataset directory '{root_dir}'!")

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = self.image_files[idx]
        image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    
        if image is None:
            raise ValueError(f"Could not read image: {img_path}")
    
        image = cv2.resize(image, (128, 32))  # Resize for consistency
        image = np.expand_dims(image, axis=-1)  # Add channel dimension (H, W, 1)
        
        # Convert NumPy to PIL image properly
        image = Image.fromarray(image.squeeze(), mode='L')  # 'L' is for grayscale
    
        if self.transform:
            image = self.transform(image)
    
        return image, img_path


In [None]:
# ====================
# 2. CRNN Model Definition
# ====================
class CRNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.rnn = nn.LSTM(input_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)  # Bi-directional LSTM

    def forward(self, x):
        x = self.cnn(x)  # CNN output shape: (batch, channels, height, width)
        
        if x.shape[2] != 1:  
            x = torch.mean(x, dim=2)  # Reduce height dimension safely
        
        x = x.permute(0, 2, 1)  # Now it should be (batch, width, channels)
        
        x, _ = self.rnn(x)  # Pass through LSTM
        x = self.fc(x)  # Fully connected layer
        
        return x


In [None]:
# ====================
# 3. Training the Model
# ====================
def train_model(train_loader, model, criterion, optimizer, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for images, _ in train_loader:
            images = images.to(device)
            optimizer.zero_grad()
            outputs = model(images)

            # Generate dummy labels (since dataset has no labels)
            labels = torch.randint(0, 26, (images.size(0), 10)).to(device)
            input_lengths = torch.full((images.size(0),), outputs.size(1), dtype=torch.long)
            target_lengths = torch.full((images.size(0),), labels.size(1), dtype=torch.long)

            loss = criterion(outputs.permute(1, 0, 2), labels, input_lengths, target_lengths)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}")


In [None]:
# ====================
# 4. Testing the Model
# ====================
def test_model(model, image_path):
    model.eval()
    image = cv2.imread(image_path)

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    image = cv2.resize(image, (128, 32))
    image = np.expand_dims(image, axis=0)  # Channel dimension
    image = torch.tensor(image, dtype=torch.float32).unsqueeze(0).to(device)

   
    
    # Show processed image
    plt.imshow(gray, cmap='gray')
    plt.title("Processed Image for OCR")
    plt.axis("off")
    plt.show()

    with torch.no_grad():
        output = model(image)

    # Use OCR as a backup since dataset lacks labels
    extracted_text = pytesseract.image_to_string(image_path, config='--psm 6')
    return extracted_text

def calculate_metrics(model, image_path):
    """
    Calculates OCR metrics (Accuracy, Precision, Recall, F1-Score) for a single test image.

    Args:
        model: The OCR model.
        image_path: Path to the test image.

    Returns:
        Tuple of (accuracy, precision, recall, f1_score)
    """
    # Fixed ground truth text
    ground_truth_text = "We start with Good Because all business should be doing something good"

    # Get the predicted text from the model
    predicted_text = test_model(model, image_path).strip()

    # Character-level Accuracy using Levenshtein Distance
    total_distance = Levenshtein.distance(ground_truth_text, predicted_text)
    total_chars = len(ground_truth_text)
    accuracy = 1 - (total_distance / total_chars) if total_chars > 0 else 0

    # Convert text to character-level lists
    y_true_chars = list(ground_truth_text)
    y_pred_chars = list(predicted_text)

    # Ensure same length by trimming the longer list
    min_length = min(len(y_true_chars), len(y_pred_chars))
    y_true_chars = y_true_chars[:min_length]
    y_pred_chars = y_pred_chars[:min_length]

    # Convert characters to labels (a-z mapped to 0-25, space = 26, others = 27)
    def char_to_label(c):
        if c.isalpha():
            return ord(c.lower()) - ord('a')  # a-z -> 0-25
        elif c == ' ':
            return 26  # Space
        else:
            return 27  # Other characters

    y_true_labels = [char_to_label(c) for c in y_true_chars]
    y_pred_labels = [char_to_label(c) for c in y_pred_chars]

    # Compute Precision, Recall, and F1-score
    precision = precision_score(y_true_labels, y_pred_labels, average='macro', zero_division=0)
    recall = recall_score(y_true_labels, y_pred_labels, average='macro', zero_division=0)
    f1 = f1_score(y_true_labels, y_pred_labels, average='macro', zero_division=0)

    # Print metrics
    print(f"Predicted Text: {predicted_text}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return accuracy, precision, recall, f1


In [None]:
# ====================
# 5. Final Execution of the Script
# ====================
# Configurations
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_dataset_path = "dataset/data"
batch_size = 8
num_epochs = 5
learning_rate = 0.001

# Data Loader
transform = transforms.Compose([transforms.ToTensor()])
train_dataset = HandwrittenTextDataset(train_dataset_path, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model Initialization
model = CRNN(input_dim=128, hidden_dim=256, num_classes=27).to(device)
criterion = nn.CTCLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Check first batch
data_iter = iter(train_loader)
images, paths = next(data_iter)
print("Batch shape:", images.shape)

# Train Model
train_model(train_loader, model, criterion, optimizer, num_epochs)

# Test on Sample Image
test_image_path = "sample.png"
output_text = test_model(model, test_image_path)
print("\nExtracted Text:\n", output_text)
calculate_metrics(model, test_image_path)

Batch shape: torch.Size([8, 1, 32, 128])
Epoch [1/5], Loss: 628.6978
Epoch [2/5], Loss: 608.0101
Epoch [3/5], Loss: 608.8273
