# Improving HTR systems with  Attention mechanisms to CRNN
>CSC 8851\
>Ana Costa, Srikar Pottabathula

### Import Statements

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization, Dense, Dropout, Reshape, Lambda, Bidirectional, LSTM
import numpy as np
import pandas as pd
import cv2
import os

Mounted at /content/drive


### Parameters and Vocabulary

In [2]:
target_img_height = 32     # Fixed image height after resizing
target_img_width = 128     # Fixed image width after padding/cropping
num_channels = 1           # Grayscale images

# Vocabulary and related parameters
vocab = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
num_classes = len(vocab)   # 62 characters
max_label_length = 64      # Maximum number of characters per label
time_steps = 64            # Sequence length output from CNN (fixed)

# Create mappings between characters and indices
char_to_num = {ch: i for i, ch in enumerate(vocab)}
num_to_char = {i: ch for i, ch in enumerate(vocab)}

### CTC Loss Lambda Function

In [3]:
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    return tf.keras.backend.ctc_batch_cost(labels, y_pred, input_length, label_length)

### Data Loading and Preprocessing

In [4]:
def load_gt_file(gt_filename="/content/drive/MyDrive/Colab Notebooks/CSC 8851/Final Project/gt_test.txt"):
    """
    Reads the ground truth text file and returns lists of:
      - image_paths: full paths to image files (assuming images folder is in the same directory)
      - labels: corresponding text labels.
    """
    image_paths = []
    labels = []
    with open(gt_filename, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) >= 2:
                filename = parts[0]
                label = parts[1]
                image_paths.append(os.path.join("/content/drive/MyDrive/Colab Notebooks/CSC 8851/Final Project/images", filename))
                labels.append(label)
    return image_paths, labels

def preprocess_image_tf(image_path):
    """
    Reads an image file, decodes it, scales the height to target_img_height while preserving aspect ratio,
    then pads or crops the width to target_img_width.
    """
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=num_channels)
    img = tf.image.convert_image_dtype(img, tf.float32)  # Scale pixels to [0, 1]

    # Preserve aspect ratio: resize height to target_img_height
    orig_shape = tf.shape(img)
    orig_height = tf.cast(orig_shape[0], tf.float32)
    orig_width = tf.cast(orig_shape[1], tf.float32)
    scale = tf.cast(target_img_height, tf.float32) / orig_height
    new_width = tf.cast(tf.math.round(orig_width * scale), tf.int32)
    img = tf.image.resize(img, [target_img_height, new_width])

    # Pad or crop the width to target_img_width
    img = tf.image.resize_with_crop_or_pad(img, target_img_height, target_img_width)
    return img

def process_label(label_str):
    """
    Converts a label string into an integer sequence using char_to_num mapping,
    then pads the sequence to max_label_length using -1.
    """
    label_seq = [char_to_num.get(ch, 0) for ch in label_str]
    if len(label_seq) < max_label_length:
        label_seq += [-1] * (max_label_length - len(label_seq))
    else:
        label_seq = label_seq[:max_label_length]
    return np.array(label_seq, dtype=np.int32)

def parse_function(image_path, label_str):
    """
    Wrapper function for the tf.data pipeline. Preprocesses the image and converts the label.
    """
    img = preprocess_image_tf(image_path)
    label = tf.py_function(func=lambda s: process_label(s.numpy().decode('utf-8')),
                           inp=[label_str], Tout=tf.int32)
    label.set_shape([max_label_length])

    # Determine the actual label length (number of characters before padding)
    label_length = tf.py_function(func=lambda s: np.array(len(s.numpy().decode('utf-8')), dtype=np.int32),
                                  inp=[label_str], Tout=tf.int32)
    label_length.set_shape([])

    # Input length is fixed (the number of time steps from CNN)
    input_length = tf.constant(time_steps, dtype=tf.int32)

    inputs = {
        'input_img': img,
        'labels': label,
        'input_length': tf.expand_dims(input_length, axis=0),
        'label_length': tf.expand_dims(label_length, axis=0)
    }
    return inputs, tf.constant(0, dtype=tf.float32)

# Load data from ground truth file
all_image_paths, all_labels = load_gt_file("/content/drive/MyDrive/Colab Notebooks/CSC 8851/Final Project/gt_test.txt")

label_lengths = [len(label) for label in all_labels]
print("Max label length:", max(label_lengths))
print("Min label length:", min(label_lengths))
print("Average label length:", np.mean(label_lengths))
print("Median label length:", np.median(label_lengths))

# Shuffle and split data: 90% for training, 10% for validation
num_samples = len(all_image_paths)
indices = np.arange(num_samples)
np.random.shuffle(indices)
split_idx = int(0.9 * num_samples)
train_image_paths = np.array(all_image_paths)[indices[:split_idx]]
train_labels = np.array(all_labels)[indices[:split_idx]]
val_image_paths = np.array(all_image_paths)[indices[split_idx:]]
val_labels = np.array(all_labels)[indices[split_idx:]]

# Create tf.data datasets
train_ds = tf.data.Dataset.from_tensor_slices((train_image_paths, train_labels))
train_ds = train_ds.map(parse_function, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.batch(32).prefetch(tf.data.AUTOTUNE)

val_ds = tf.data.Dataset.from_tensor_slices((val_image_paths, val_labels))
val_ds = val_ds.map(parse_function, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.batch(32).prefetch(tf.data.AUTOTUNE)

Max label length: 93
Min label length: 5
Average label length: 43.051114922813035
Median label length: 43.0


### Model Architecture Definition

In [5]:
def build_crnn_model():
    # Input layer for preprocessed image
    input_img = Input(shape=(target_img_height, target_img_width, num_channels), name='input_img')

    # --- CNN BLOCK ---
    x = Conv2D(32, (3, 3), padding='same', activation='relu', name='conv1')(input_img)
    x = BatchNormalization(name='bn1')(x)
    x = MaxPooling2D(pool_size=(1, 1), name='pool1')(x)

    x = Conv2D(64, (3, 3), padding='same', activation='relu', name='conv2')(x)
    x = BatchNormalization(name='bn2')(x)
    x = MaxPooling2D(pool_size=(2, 2), name='pool2')(x)

    x = Conv2D(64, (3, 3), padding='same', activation='relu', name='conv3')(x)
    x = BatchNormalization(name='bn3')(x)
    x = MaxPooling2D(pool_size=(2, 1), name='pool3')(x)

    # Reshape to convert CNN feature maps to a sequence for the RNN.
    # Expected CNN output shape: (target_img_height/1/2/2, target_img_width/1/2/2, channels)
    # Given target_img_height=32, target_img_width=128, after pooling we assume shape ~ (32, 8, 64)
    x = Reshape(target_shape=(time_steps, 8 * 64), name='reshape')(x)

    # Fully Connected layer followed by Dropout
    x = Dense(64, activation='relu', name='dense1')(x)
    x = Dropout(0.5, name='dropout')(x)

    # --- RNN BLOCK (Bidirectional LSTMs) ---
    x = Bidirectional(LSTM(128, return_sequences=True), name='bilstm1')(x)
    x = Bidirectional(LSTM(64, return_sequences=True), name='bilstm2')(x)

    # Final Dense layer: projects to (num_classes+1) outputs for each time step
    y_pred = Dense(num_classes + 1, activation='softmax', name='dense2')(x)

    return Model(inputs=input_img, outputs=y_pred, name='crnn_model')

base_model = build_crnn_model()
base_model.summary()

# Build the training model by adding additional inputs for labels and lengths.
labels = Input(name='labels', shape=(max_label_length,), dtype='int32')
input_length = Input(name='input_length', shape=(1,), dtype='int32')
label_length = Input(name='label_length', shape=(1,), dtype='int32')

loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
    [base_model.output, labels, input_length, label_length])

training_model = Model(inputs=[base_model.input, labels, input_length, label_length],
                       outputs=loss_out, name='crnn_training_model')
training_model.compile(optimizer=tf.keras.optimizers.Adam(),
                       loss={'ctc': lambda y_true, y_pred: y_pred})


### Training the Model

In [6]:
epochs = 100
training_model.fit(train_ds,
                   validation_data=val_ds,
                   epochs=epochs)

Epoch 1/100
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 1s/step - loss: inf - val_loss: inf
Epoch 2/100
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 55ms/step - loss: inf - val_loss: inf
Epoch 3/100
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - loss: inf - val_loss: inf
Epoch 4/100
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - loss: inf - val_loss: inf
Epoch 5/100
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 54ms/step - loss: inf - val_loss: inf
Epoch 6/100
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - loss: inf - val_loss: inf
Epoch 7/100
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 54ms/step - loss: inf - val_loss: inf
Epoch 8/100
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 54ms/step - loss: inf - val_loss: inf
Epoch 9/100
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 53ms/step - 

<keras.src.callbacks.history.History at 0x7a203659a250>

### Testing Functions

In [7]:
def decode_predictions(pred):
    """
    Performs greedy CTC decoding on the model predictions.
    """
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    decoded, _ = tf.keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)
    return tf.keras.backend.get_value(decoded[0])

def test_sample_image(image_path):
    """
    Loads a sample image from disk, preprocesses it, and runs inference using base_model.
    The decoded prediction is printed.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise ValueError("Image not found: " + image_path)

    # Resize while preserving aspect ratio: scale height to target_img_height and then pad/crop width to target_img_width.
    orig_h, orig_w = img.shape
    scale = target_img_height / orig_h
    new_w = int(round(orig_w * scale))
    img = cv2.resize(img, (new_w, target_img_height))
    if new_w < target_img_width:
        pad_width = target_img_width - new_w
        img = cv2.copyMakeBorder(img, 0, 0, 0, pad_width, cv2.BORDER_CONSTANT, value=255)
    else:
        img = img[:, :target_img_width]
    img = img.astype(np.float32) / 255.0
    img = np.expand_dims(img, axis=-1)  # add channel dimension
    img = np.expand_dims(img, axis=0)    # add batch dimension

    y_pred = base_model.predict(img)
    decoded = decode_predictions(y_pred)
    out_str = ""
    for idx in decoded[0]:
        if idx == -1 or idx == num_classes:
            continue
        out_str += num_to_char.get(idx, '')
    print("Predicted text:", out_str)
    return out_str

### Testing/Experimenting

In [12]:
# Uncomment and update the sample_image_path below to test on a sample image:
sample_image_path = "/content/drive/MyDrive/Colab Notebooks/CSC 8851/Final Project/images/p03-029-07.jpg"
test_sample_image(sample_image_path)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Predicted text: naeaoathalaindaoaa


'naeaoathalaindaoaa'