In [None]:
!pip install huggingface_hub tensorflow matplotlib numpy

import os, zipfile, random
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import StringLookup
import matplotlib.pyplot as plt
from huggingface_hub import snapshot_download
import pandas as pd
from glob import glob




In [None]:
# Fetch the dataset repository from the Hugging Face Hub into ./HCI_Dataset,
# skipping the repository's .gitattributes file.
dataset_dir = snapshot_download(
    repo_type="dataset",
    repo_id="YCAI3/HCI_P2",
    ignore_patterns=[".gitattributes"],
    local_dir="./HCI_Dataset",
)
print("Dataset downloaded to:", dataset_dir)

# The code expects an archive named HCI_Dataset.zip inside the snapshot and
# unpacks it into an "unzipped" folder next to it.
zip_path = os.path.join(dataset_dir, "HCI_Dataset.zip")
extract_dir = os.path.join(dataset_dir, "unzipped")
with zipfile.ZipFile(zip_path, mode="r") as zip_ref:
    zip_ref.extractall(path=extract_dir)

print("✅ Extracted to:", extract_dir)
print("Subfolders:", os.listdir(extract_dir))


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Dataset extracted to: ./HCI_Dataset/unzipped/HCI_Dataset
Sample subfolders: ['50', '10', '198', '53', '195']


In [None]:
# Root folder that is searched for images.
base_path = "./HCI_Dataset/unzipped/HCI_Dataset"

# Collect every .jpg at any depth below base_path ("**" + recursive=True).
jpg_pattern = os.path.join(base_path, "**", "*.jpg")
image_paths = glob(jpg_pattern, recursive=True)

print("Total images found:", len(image_paths))
print("Example paths:\n", image_paths[:5])

# Extract label from filename (the word between underscores)
def get_label_from_path(path):
    """Return the label embedded in an image filename.

    The filename (directory part stripped) is split on "_". When that yields
    at least three pieces, the second piece is returned as the label;
    otherwise the function returns None.
    """
    name_pieces = os.path.basename(path).split("_")
    return name_pieces[1] if len(name_pieces) >= 3 else None

# Parse a label out of every filename and pair it with its path.
labels = [get_label_from_path(p) for p in image_paths]
data = pd.DataFrame({"path": image_paths, "label": labels})

# get_label_from_path returns None for filenames that do not have at least
# three "_"-separated parts. Those rows carry no transcription, so drop them
# here instead of letting a missing label reach the vocabulary / tf.data
# steps. The index is rebuilt as 0..N-1 so integer lookups such as
# data["path"][i] keep working.
missing_label = data["label"].isna()
if missing_label.any():
    print(f"Dropping {int(missing_label.sum())} files without a parsable label")
data = data[~missing_label].reset_index(drop=True)

print("\nRandom samples:")
# Guard against frames with fewer than 5 rows (sample(5) would raise).
print(data.sample(min(5, len(data))))


Total images found: 609656
✅ Using subset of 15000 samples
                                                   path          label
7550  ./HCI_Dataset/unzipped/HCI_Dataset/25/2/301_TU...        TUCUMAN
120   ./HCI_Dataset/unzipped/HCI_Dataset/56/4/310_su...  supernumerary
1833  ./HCI_Dataset/unzipped/HCI_Dataset/92/2/464_LO...          LOOPY
6714  ./HCI_Dataset/unzipped/HCI_Dataset/87/2/493_Ut...        Utterly
3740  ./HCI_Dataset/unzipped/HCI_Dataset/151/2/21_in...      inoculate


In [None]:
# --- Cell 4: Visualize a Random Sample ---
# Target size every image is resized to (height, width); wider than tall for text.
IMG_HEIGHT = 64
IMG_WIDTH = 256

def load_and_preprocess_image(path):
    """Read a JPEG from `path` and return it as a scaled single-channel tensor.

    The file is decoded with one channel, resized to
    (IMG_HEIGHT, IMG_WIDTH), and divided by 255.0.
    """
    raw_bytes = tf.io.read_file(path)
    gray = tf.image.decode_jpeg(raw_bytes, channels=1)
    resized = tf.image.resize(gray, (IMG_HEIGHT, IMG_WIDTH))
    return resized / 255.0

# Pick one row at random, load its image and show it next to the parsed label
# as a quick sanity check of both the loader and the filename parsing.
sample_idx = random.randint(0, len(data) - 1)
sample_path = data.loc[sample_idx, "path"]
sample_label = data.loc[sample_idx, "label"]
img = load_and_preprocess_image(sample_path)

# Drop the channel axis so matplotlib receives a 2-D grayscale array.
plt.imshow(tf.squeeze(img), cmap="gray")
plt.title(f"Label: {sample_label}")
plt.axis("off")
plt.show()

print("Sample path:", sample_path)
print("Extracted label:", sample_label)
print("File exists:", os.path.exists(sample_path))


Vocabulary sample: ['0', '1', '2', '3', '4', '5', '6', '7', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K']
Total unique characters (including blank): 62
Original: Ernst
Encoded: [14 53 49 54 55]
Decoded: Ernst
char_to_num vocab: ['[UNK]', np.str_('0'), np.str_('1'), np.str_('2'), np.str_('3'), np.str_('4'), np.str_('5'), np.str_('6'), np.str_('7'), np.str_('9'), np.str_('A'), np.str_('B'), np.str_('C'), np.str_('D'), np.str_('E')]
num_to_char vocab: ['[UNK]', np.str_('0'), np.str_('1'), np.str_('2'), np.str_('3'), np.str_('4'), np.str_('5'), np.str_('6'), np.str_('7'), np.str_('9'), np.str_('A'), np.str_('B'), np.str_('C'), np.str_('D'), np.str_('E')]
Blank token index: tf.Tensor(62, shape=(), dtype=int64)


In [None]:
# --- Cell 5: Character Encoding ---
from tensorflow.keras.layers import StringLookup

# Character inventory: concatenate every label (stringified) and keep the
# distinct characters in sorted order.
all_text = "".join(data["label"].astype(str))
unique_chars = sorted(set(all_text))

print("Unique characters:", unique_chars)
print("Total unique characters:", len(unique_chars))

# Forward table (character -> integer id) and its inverse, which reuses the
# forward table's full vocabulary so both agree on every index.
char_to_num = StringLookup(vocabulary=unique_chars, oov_token="")
num_to_char = StringLookup(vocabulary=char_to_num.get_vocabulary(), invert=True)

# Round-trip the first label through both tables as a smoke test.
example = data["label"].iloc[0]
example_chars = tf.strings.unicode_split(example, input_encoding="UTF-8")
encoded = char_to_num(example_chars)
decoded_bytes = tf.strings.reduce_join(num_to_char(encoded)).numpy()
decoded = decoded_bytes.decode("utf-8")
print(f"Original: {example}\nDecoded : {decoded}")


In [None]:
# --- Cell 6: Dataset Creation and Preprocessing ---

# Same target size as the single-image loader defined earlier.
IMG_HEIGHT = 64
IMG_WIDTH = 256

def preprocess(path, label):
    """Turn one (image path, label string) pair into model-ready tensors.

    Returns a tuple (image, char_ids): the image is decoded as single-channel
    JPEG, resized to (IMG_HEIGHT, IMG_WIDTH) and divided by 255.0; the label
    is split into UTF-8 characters and mapped to integer ids by `char_to_num`.
    """
    raw_bytes = tf.io.read_file(path)
    gray = tf.image.decode_jpeg(raw_bytes, channels=1)
    scaled = tf.image.resize(gray, (IMG_HEIGHT, IMG_WIDTH)) / 255.0

    char_ids = char_to_num(tf.strings.unicode_split(label, "UTF-8"))
    return scaled, char_ids

# Pair each image path with its label string, then apply `preprocess`
# element-wise with a parallelism level chosen by tf.data.
paths = data["path"].tolist()
labels = data["label"].tolist()

dataset = tf.data.Dataset.from_tensor_slices((paths, labels)).map(
    preprocess, num_parallel_calls=tf.data.AUTOTUNE
)


In [None]:
# --- Cell 7: Split Dataset and Batch ---
# 80% train / 10% validation / remainder test, taken in dataset order.
total_size = len(data)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)

BATCH_SIZE = 64

train_ds = dataset.take(train_size)
val_ds = dataset.skip(train_size).take(val_size)
test_ds = dataset.skip(train_size + val_size)  # left unbatched here

# Images have a fixed shape; label sequences are padded to the longest one in
# each batch.
padded_shapes = ([64, 256, 1], [None])

train_ds = train_ds.padded_batch(BATCH_SIZE, padded_shapes=padded_shapes).prefetch(
    tf.data.AUTOTUNE
)
val_ds = val_ds.padded_batch(BATCH_SIZE, padded_shapes=padded_shapes).prefetch(
    tf.data.AUTOTUNE
)


In [None]:
# --- Cell 8: Visualize a Batch and Check Encoding ---

# Pull a single training batch and display its first image together with the
# label text recovered from the integer ids.
for images, labels in train_ds.take(1):
    idx = 0  # position within the batch to display
    label_chars = num_to_char(labels[idx])
    decoded_label = tf.strings.reduce_join(label_chars).numpy().decode("utf-8")

    plt.imshow(tf.squeeze(images[idx]), cmap="gray")
    plt.title(f"Decoded: {decoded_label}")
    plt.axis("off")
    plt.show()

# Shapes of the batch fetched by the loop above.
print("Image shape:", images.shape)
print("Label shape:", labels.shape)


In [None]:
# --- Cell 9: Define CTC Loss Function ---

def ctc_loss_func(y_true, y_pred):
    """CTC loss for batches of zero-padded label sequences.

    Args:
        y_true: (batch, max_label_len) tensor of character ids. Rows shorter
            than the batch maximum are expected to be right-padded with 0,
            which is what `padded_batch` produces by default. Real characters
            are assumed to never map to 0 (with the lookup table built in this
            notebook, id 0 is the out-of-vocabulary slot).
        y_pred: (batch, time_steps, num_classes) softmax output of the model.

    Returns:
        The per-sample loss tensor returned by
        `tf.keras.backend.ctc_batch_cost`.
    """
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_len = tf.cast(tf.shape(y_pred)[1], dtype="int64")

    # Every sample uses all time steps emitted by the network.
    input_length = input_len * tf.ones(shape=(batch_len, 1), dtype="int64")

    # True length of each label = number of non-padding ids in its row.
    # Using the padded width for every sample (as before) makes CTC treat the
    # trailing zeros as real target symbols.
    label_length = tf.math.count_nonzero(
        y_true, axis=1, keepdims=True, dtype="int64"
    )

    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)


In [None]:
# --- Cell 10: Build CRNN Architecture ---
def build_crnn(rnn_type="lstm"):
    """Build a CNN + bidirectional-RNN text recogniser for 64x256 grayscale images.

    Args:
        rnn_type: "gru" (case-insensitive) selects GRU layers; any other value
            selects LSTM layers.

    Returns:
        An uncompiled Keras model named "CRNN_<RNN_TYPE>" whose output has
        shape (batch, 32, len(char_to_num.get_vocabulary()) + 1): one softmax
        distribution per image column group. Reads the module-level
        `char_to_num` lookup, which must already be defined.
    """
    img_height, img_width = 64, 256
    input_img = layers.Input(shape=(img_height, img_width, 1), name="image")

    # --- CNN feature extractor: three conv + 2x2 max-pool stages ---
    x = input_img
    for filters in (32, 64, 128):
        x = layers.Conv2D(filters, (3, 3), activation="relu", padding="same")(x)
        x = layers.MaxPooling2D((2, 2))(x)

    # After three 2x2 poolings the feature map is (height/8, width/8, 128).
    feat_h, feat_w, feat_c = img_height // 8, img_width // 8, 128

    # Text runs left to right, so the sequence (time) axis must be the image
    # width. Move width in front of height, then flatten height x channels
    # into the per-step feature vector. Reshaping directly to
    # (feat_h, feat_w * feat_c) would instead yield only 8 steps along the
    # image height, too few for CTC to align labels longer than 8 characters.
    x = layers.Permute((2, 1, 3))(x)
    x = layers.Reshape(target_shape=(feat_w, feat_h * feat_c))(x)

    # --- RNN feature sequence ---
    rnn_layer = layers.GRU if rnn_type.lower() == "gru" else layers.LSTM
    x = layers.Bidirectional(rnn_layer(128, return_sequences=True))(x)
    x = layers.Bidirectional(rnn_layer(64, return_sequences=True))(x)

    # --- Dense + Softmax output ---
    # One unit per vocabulary entry plus one extra class (presumably the CTC
    # blank expected by the loss).
    num_classes = len(char_to_num.get_vocabulary()) + 1
    x = layers.Dense(num_classes, activation="softmax")(x)

    # `models` is imported from tensorflow.keras at the top of the notebook;
    # the bare name `keras` is never imported and raised NameError here.
    model = models.Model(inputs=input_img, outputs=x, name=f"CRNN_{rnn_type.upper()}")
    return model

# Build one model of each recurrent flavour up front; the training cells
# below rebuild their own fresh copies before compiling.
crnn_lstm = build_crnn("lstm")
crnn_gru  = build_crnn("gru")

# Print the layer-by-layer summary of the LSTM variant as a shape check.
crnn_lstm.summary()


In [None]:
# --- Cell 11: Train CRNN (LSTM + Adam) ---
# Start from a freshly built LSTM model for this run.
crnn_lstm = build_crnn("lstm")

adam_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
crnn_lstm.compile(loss=ctc_loss_func, optimizer=adam_optimizer)

# Train for 10 epochs, validating on val_ds after each one.
history_lstm_adam = crnn_lstm.fit(train_ds, epochs=10, validation_data=val_ds)

# Save the trained model to disk.
crnn_lstm.save("crnn_lstm_adam.h5")

In [None]:
# --- Cell 12: Train CRNN (LSTM + SGD) ---
# Start from a freshly built LSTM model for this run.
crnn_lstm_sgd = build_crnn("lstm")

sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3, momentum=0.9)
crnn_lstm_sgd.compile(loss=ctc_loss_func, optimizer=sgd_optimizer)

# Train for 10 epochs, validating on val_ds after each one.
history_lstm_sgd = crnn_lstm_sgd.fit(train_ds, epochs=10, validation_data=val_ds)

# Save the trained model to disk.
crnn_lstm_sgd.save("crnn_lstm_sgd.h5")

In [None]:
# --- Cell 13: Train CRNN (GRU + Adam) ---
# Start from a freshly built GRU model for this run.
crnn_gru = build_crnn("gru")

adam_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
crnn_gru.compile(loss=ctc_loss_func, optimizer=adam_optimizer)

# Train for 10 epochs, validating on val_ds after each one.
history_gru_adam = crnn_gru.fit(train_ds, epochs=10, validation_data=val_ds)

# Save the trained model to disk.
crnn_gru.save("crnn_gru_adam.h5")

In [None]:
# --- Cell 14: Train CRNN (GRU + SGD) ---
# Start from a freshly built GRU model for this run.
crnn_gru_sgd = build_crnn("gru")

sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3, momentum=0.9)
crnn_gru_sgd.compile(loss=ctc_loss_func, optimizer=sgd_optimizer)

# Train for 10 epochs, validating on val_ds after each one.
history_gru_sgd = crnn_gru_sgd.fit(train_ds, epochs=10, validation_data=val_ds)

# Save the trained model to disk.
crnn_gru_sgd.save("crnn_gru_sgd.h5")

In [None]:
# --- Cell 15: Compare All Models' Training Histories ---
def plot_history(history, title):
    """Plot training and validation loss curves from a Keras History object.

    Args:
        history: object whose `.history` dict holds "loss" and "val_loss"
            sequences (one value per epoch).
        title: text shown above the figure.
    """
    for metric_key, legend_name in (("loss", "train"), ("val_loss", "val")):
        plt.plot(history.history[metric_key], label=legend_name)

    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel("CTC Loss")
    plt.legend()
    plt.show()

# One figure per training run, in the order the runs were trained above.
# Each call needs the corresponding training cell to have been executed.
plot_history(history_lstm_adam, "LSTM + Adam")
plot_history(history_lstm_sgd, "LSTM + SGD")
plot_history(history_gru_adam, "GRU + Adam")
plot_history(history_gru_sgd, "GRU + SGD")