In [1]:
# Install required dependencies
!pip install tensorflow keras opencv-python matplotlib scikit-learn



In [2]:
import os
import numpy as np
import cv2
import glob
import string
import zipfile
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, Dense, Reshape, Input, Dropout,
    Bidirectional, LSTM, BatchNormalization, TimeDistributed
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import tensorflow.keras.backend as K
from google.colab import files
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

In [3]:
# Upload and extract dataset
# `files.upload()` returns a mapping keyed by uploaded file name; the first
# uploaded file is treated as the dataset archive.
uploaded = files.upload()
if not uploaded:
    # A cancelled upload dialog would otherwise surface as an opaque IndexError.
    raise RuntimeError("No file was uploaded; re-run this cell and select the dataset .zip archive.")
zip_filename = next(iter(uploaded))

# Everything is unpacked below this directory; later cells build paths from it.
extract_path = "/content/data/"
with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print("✅ Dataset extracted successfully!")

Saving archive.zip to archive.zip
✅ Dataset extracted successfully!


In [4]:
# Collect every .jpg below the extracted IAM folder, searching sub-folders too
jpg_pattern = f"{extract_path}/IAM/**/*.jpg"
image_files = glob.glob(jpg_pattern, recursive=True)
print(f"✅ Found {len(image_files)} JPG images.")


✅ Found 2915 JPG images.


In [5]:
# Augmentation generator (random geometric perturbations).
# NOTE(review): `datagen` is not referenced by any other cell visible here —
# confirm whether it is meant to be wired into training.
datagen = ImageDataGenerator(
    # rotation / shear
    rotation_range=10,
    shear_range=0.1,
    # translation
    width_shift_range=0.1,
    height_shift_range=0.1,
    # scale
    zoom_range=0.2,
)

def preprocess_image(image_path):
    """Load an image as grayscale and fit it onto a white 32x128 canvas.

    The image is scaled (aspect ratio preserved) so that it fits inside
    128x32 pixels, pasted into the top-left corner of a white canvas, and
    normalised to the range [0, 1].

    Args:
        image_path: Path of the image file to load.

    Returns:
        Float array of shape (32, 128, 1) with values in [0, 1].

    Raises:
        ValueError: If the file cannot be read/decoded as an image.
    """
    target_width, target_height = 128, 32

    # Load image in grayscale. cv2.imread signals failure by returning None
    # instead of raising, so check explicitly to give a useful error.
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    h, w = img.shape

    # Largest scale at which both dimensions still fit in the target canvas.
    scaling_factor = min(target_width / w, target_height / h)

    # int() truncates, so a very elongated image could end up with a 0-pixel
    # side (which the resize call cannot handle); clamp to [1, target].
    new_w = min(target_width, max(1, int(w * scaling_factor)))
    new_h = min(target_height, max(1, int(h * scaling_factor)))

    img_resized = cv2.resize(img, (new_w, new_h))

    # Paste onto a white (255) canvas of exactly (32, 128); unused area stays white.
    padded_img = np.ones((target_height, target_width), dtype=np.uint8) * 255
    padded_img[:new_h, :new_w] = img_resized

    # Normalize pixel values to range [0, 1]
    padded_img = padded_img / 255.0

    # Add a trailing channel axis -> (32, 128, 1)
    padded_img = np.expand_dims(padded_img, axis=-1)

    return padded_img

In [6]:
# Alphabet the recogniser can emit: a-z followed by 0-9 (36 symbols).
all_characters = string.ascii_lowercase + string.digits

# Index -> character and the inverse character -> index lookup.
int_to_char = dict(enumerate(all_characters))
char_to_int = {symbol: index for index, symbol in int_to_char.items()}

# The index right after the last real character (36) is reserved for the CTC blank.
blank_index = len(all_characters)

In [13]:
# CTC Loss Function
def ctc_loss_lambda(y_true, y_pred):
    """CTC loss wrapper with the (y_true, y_pred) signature Keras expects.

    Every sample in the batch is given the same input length (the number of
    time steps in ``y_pred``) and the same label length (the width of
    ``y_true``).

    NOTE(review): the label length passed here is the *padded* width of
    ``y_true``, not the number of real characters per sample. Confirm that the
    backend treats the trailing padding values as end-of-label.
    """
    pred_shape = tf.shape(y_pred)
    true_shape = tf.shape(y_true)

    # Column vectors of shape (batch, 1), one length entry per sample.
    input_length = tf.fill([pred_shape[0], 1], pred_shape[1])
    label_length = tf.fill([true_shape[0], 1], true_shape[1])

    return K.ctc_batch_cost(y_true, y_pred, input_length, label_length)

In [7]:
# Locations of the ground-truth file and of the image directory it refers to
gt_path = os.path.join(extract_path, "IAM", "gt_test.txt")
image_folder = os.path.join(extract_path, "IAM", "image")

# Parallel lists: image_paths[i] is the image whose transcription is labels[i]
labels = []
image_paths = []
with open(gt_path, "r") as f:
    for line in f:
        # Each usable line is "<relative image path>\t<label>"
        parts = line.strip().split("\t")
        if len(parts) != 2:
            continue  # malformed line: skip

        img_rel_path, label = parts
        full_img_path = os.path.join(image_folder, img_rel_path)
        if not os.path.exists(full_img_path):
            continue  # ground truth refers to an image that is not on disk

        image_paths.append(full_img_path)
        labels.append(label)

In [8]:
# Preprocess every labelled image, keeping each image paired with its own label.
# (No augmentation is applied in this cell.)
image_list = []
kept_labels = []
skipped_paths = []
for path, label in zip(image_paths, labels):
    try:
        img = preprocess_image(path)
    except Exception:
        # Best effort: an image that cannot be loaded/processed is skipped
        # (and recorded) rather than aborting the whole run.
        skipped_paths.append(path)
        continue
    if img.shape != (32, 128, 1):
        skipped_paths.append(path)
        continue
    # Append image and label together so a skipped image can never shift
    # the labels relative to the images.
    image_list.append(img)
    kept_labels.append(label)

if skipped_paths:
    print(f"⚠️ Skipped {len(skipped_paths)} images that could not be preprocessed.")
if not image_list:
    raise ValueError("No images could be preprocessed; check the dataset paths.")

image_data = np.stack(image_list)

# Encode each label as a list of character indices.
# NOTE: characters outside `char_to_int` (e.g. uppercase, punctuation) are dropped.
label_sequences = [[char_to_int[char] for char in label if char in char_to_int] for label in kept_labels]
assert len(image_data) == len(label_sequences)


In [9]:
# Train/Validation Split: images and their label sequences are split together
X_train, X_val, y_train, y_val = train_test_split(
    image_data,
    label_sequences,
    random_state=42,  # fixed seed so the split is reproducible
    test_size=0.2,    # 20% of the samples go to validation
)

In [10]:
# Padding: bring every label sequence to one common length, filling the tail
# with the CTC blank index.
# The length is taken over BOTH splits: pad_sequences truncates any sequence
# longer than `maxlen`, so sizing from the training split alone could silently
# cut validation labels.
max_label_length = max(len(seq) for seq in [*y_train, *y_val])
y_train = pad_sequences(y_train, maxlen=max_label_length, padding="post", value=blank_index)
y_val = pad_sequences(y_val, maxlen=max_label_length, padding="post", value=blank_index)


In [11]:
# Model Definition
def build_model(input_shape=(32, 128, 1), output_units=len(all_characters) + 1):
    """Build the CNN + bidirectional-LSTM network used with the CTC loss.

    Two conv/batch-norm/pool stages shrink the image height by 4 while keeping
    the width, the feature map is turned into one feature vector per image
    column, and two bidirectional LSTMs followed by a per-step softmax produce
    the class distribution for every column.

    Args:
        input_shape: (height, width, channels) of the input images. The width
            becomes the number of time steps of the output sequence.
        output_units: Number of output classes per time step.

    Returns:
        An uncompiled Keras ``Model`` mapping images to a
        (width, output_units) sequence of softmax distributions.
    """
    time_steps = input_shape[1]

    inputs = Input(shape=input_shape)

    # Pooling with (2, 1) halves the height only, so the width (time axis)
    # keeps its full resolution.
    x = Conv2D(64, (3, 3), activation="relu", padding="same")(inputs)
    x = BatchNormalization()(x)
    x = MaxPooling2D(pool_size=(2, 1))(x)

    x = Conv2D(128, (3, 3), activation="relu", padding="same")(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D(pool_size=(2, 1))(x)

    # The feature map is (height, width, channels). Move width to the front
    # before flattening; reshaping directly would mix values from different
    # columns into one "time step" instead of giving one vector per column.
    x = tf.keras.layers.Permute((2, 1, 3))(x)
    x = Reshape(target_shape=(time_steps, -1))(x)

    x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.3))(x)
    x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.3))(x)

    # Same dense softmax applied independently at every time step.
    outputs = TimeDistributed(Dense(output_units, activation="softmax"))(x)

    model = Model(inputs, outputs)
    return model

In [14]:
# Build the network with its default shapes and attach optimizer + CTC loss
model = build_model()
model.compile(
    loss=ctc_loss_lambda,
    optimizer=Adam(learning_rate=0.001),
)
print("✅ Model successfully compiled!")


✅ Model successfully compiled!


In [None]:
# Training callbacks, both driven by the validation loss:
#  - stop after 10 epochs without improvement and restore the best weights
#  - halve the learning rate after 5 stagnant epochs, never going below 1e-6
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=5,
    min_lr=1e-6,
    verbose=1,
)

# Fit on the training split, evaluating on the validation split every epoch
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=16,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)


Epoch 1/50
[1m 34/146[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m2:47[0m 1s/step - loss: 175.9680