In [22]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, SimpleRNN, Reshape, Dropout
from tensorflow.keras.utils import to_categorical
import cv2
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import accuracy_score, precision_score, recall_score
import numpy as np
import pandas as pd

In [14]:
## From openCV documentation

def preprocess_image(img_path, bbox): ## bbox is bounding box dimensions provided in the words.txt labels file 
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) ##read in grayscale since works well for our database as the images are in grayscale anyway
    if img is None: # handling images that may be hard to read
        print(f"Failed to load image: {img_path}")
        return None
    height, width = img.shape
    x, y, w, h = bbox

    if img.size == 0:
        print(f"Empty cropped image for bbox {bbox} in image: {img_path}")
        return None
    thresh_val, bin_img = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU) #thresh_binary specifies binary threshold.
    resized_image = cv2.resize(bin_img, (128, 32)) ## resizing image to normalize
    return resized_image

In [15]:
def load_labels_and_images(word_txt_path, img_folder_path):
    labels = []
    images = []
    
    with open(word_txt_path, 'r') as file:
        lines = file.readlines() ## reading txt file that has the labels
    bad_load_count = 0
    for line in lines:
        if line.startswith('#') or not line.strip(): #based on format of txt file 
            continue
        parts = line.strip().split()
        if parts[1]!='ok':
            bad_load_count+=1
            continue
        word_id = parts[0]
        bbox = list(map(int, parts[3:7]))
        label = parts[-1]
        
        subfolder = word_id.split('-')[0] #traversing through folder
        img_folder = os.path.join(img_folder_path, subfolder, (word_id.split('-')[0]+'-'+word_id.split('-')[1]))
        img_name = f"{word_id}.png"
        img_path = os.path.join(img_folder, img_name)
        
        img = preprocess_image(img_path, bbox)
        if img is not None:
            images.append(img)
            labels.append(label)
    print("# of images not loaded due to bad data: ", bad_load_count)
    return np.array(images), np.array(labels)

images, labels = load_labels_and_images('words.txt', 'words')


Failed to load image: words/a01/a01-117/a01-117-05-02.png
Failed to load image: words/r06/r06-022/r06-022-03-05.png
# of images not loaded due to bad data:  18864


In [16]:
## Prep data for NNs
X = np.array(images).astype('float32')/255.0 #normalizing pixel values
X = np.expand_dims(X, axis=-1) #adding channel dimension (indicating color) to img array
y = pd.factorize(labels)[0] 
y = to_categorical(y) 

print(y.shape)


(96454, 12214)


## CNN Model

In [7]:
def create_cnn_model(input_shape, num_classes):
    model = Sequential([ #sequential model 
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape), 
        MaxPooling2D((2, 2)), 
        Conv2D(64, (3, 3), activation='relu'), 
        MaxPooling2D((2, 2)),
        Flatten(), 
        Dense(128, activation='relu'), #Adds a fully connected layer with 128 neurons
        Dense(num_classes, activation='softmax')  #Adds a fully connected layer with 128 neurons. Softmax to output probabilities
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

input_shape = (32, 128, 1)
num_classes = y.shape[1] 
cnn_model = create_cnn_model(input_shape, num_classes)

cnn_model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 50ms/step - accuracy: 0.2260 - loss: 5.9819 - val_accuracy: 0.3615 - val_loss: 4.9131
Epoch 2/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 45ms/step - accuracy: 0.4000 - loss: 4.1570 - val_accuracy: 0.4184 - val_loss: 4.8971
Epoch 3/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 45ms/step - accuracy: 0.4595 - loss: 3.4353 - val_accuracy: 0.4360 - val_loss: 5.0650
Epoch 4/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 42ms/step - accuracy: 0.5066 - loss: 2.9063 - val_accuracy: 0.4446 - val_loss: 5.3792
Epoch 5/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 44ms/step - accuracy: 0.5581 - loss: 2.4507 - val_accuracy: 0.4341 - val_loss: 6.0483
Epoch 6/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 48ms/step - accuracy: 0.6001 - loss: 2.0876 - val_accuracy: 0.4480 - val_loss: 6.781

<keras.src.callbacks.history.History at 0x2c4b1e110>

### RNN Model

In [8]:
def create_rnn_model(input_shape, num_classes):
    model = tf.keras.models.Sequential([
        Reshape((32, 128), input_shape=input_shape), #reshape converts into format needed for RNN
        SimpleRNN(128, return_sequences=True), 
        SimpleRNN(128),  
        Dense(128, activation='relu'),
        Dense(num_classes, activation='softmax') 
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

rnn_model = create_rnn_model(input_shape, num_classes)

rnn_model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

  super().__init__(**kwargs)


Epoch 1/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 21ms/step - accuracy: 0.0661 - loss: 7.0438 - val_accuracy: 0.1039 - val_loss: 6.7216
Epoch 2/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 23ms/step - accuracy: 0.1054 - loss: 6.3312 - val_accuracy: 0.1067 - val_loss: 6.8774
Epoch 3/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 20ms/step - accuracy: 0.1158 - loss: 6.1522 - val_accuracy: 0.1130 - val_loss: 6.9663
Epoch 4/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 19ms/step - accuracy: 0.1174 - loss: 6.1217 - val_accuracy: 0.1240 - val_loss: 7.0452
Epoch 5/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 20ms/step - accuracy: 0.1273 - loss: 6.0056 - val_accuracy: 0.1344 - val_loss: 7.2289
Epoch 6/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 19ms/step - accuracy: 0.1416 - loss: 5.9209 - val_accuracy: 0.0690 - val_loss: 7.4858
Epoc

<keras.src.callbacks.history.History at 0x2c4bec940>

In [11]:
cnn_loss, cnn_accuracy = cnn_model.evaluate(X, y)
print(f"CNN Accuracy: {cnn_accuracy * 100}%")

rnn_loss, rnn_accuracy = rnn_model.evaluate(X, y)
print(f"RNN Accuracy: {rnn_accuracy * 100}%")

[1m3015/3015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 11ms/step - accuracy: 0.7870 - loss: 1.1362
CNN Accuracy: 72.43038415908813%
[1m3015/3015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 6ms/step - accuracy: 0.1220 - loss: 6.1132
RNN Accuracy: 12.273208051919937%


### Combined CNN and RNN

In [20]:
def create_cnn_rnn_model(input_shape, num_classes):
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(32*128, activation='relu'), #flatten makes it 1-D layer, but we need a 2D layer of input shape to go into the reshape function.
        Reshape((32, 128)),  ## reshaping for RNN layer
        SimpleRNN(128, return_sequences=True),
        SimpleRNN(128),
        Dense(num_classes, activation='softmax')
    ])
    return model

input_shape = (32, 128, 1)
num_classes = y.shape[1]
cnn_rnn_model = create_cnn_rnn_model(input_shape, num_classes)

cnn_rnn_model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])
cnn_rnn_model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

cnn_rnn_loss, cnn_rnn_accuracy = cnn_rnn_model.evaluate(X, y)

Epoch 1/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 50ms/step - accuracy: 0.1550 - loss: 6.3673 - val_accuracy: 0.3191 - val_loss: 5.0510
Epoch 2/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 54ms/step - accuracy: 0.3458 - loss: 4.3293 - val_accuracy: 0.3793 - val_loss: 4.6553
Epoch 3/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 57ms/step - accuracy: 0.4153 - loss: 3.5608 - val_accuracy: 0.4072 - val_loss: 4.5847
Epoch 4/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 60ms/step - accuracy: 0.4676 - loss: 3.0580 - val_accuracy: 0.4210 - val_loss: 4.6679
Epoch 5/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 54ms/step - accuracy: 0.5069 - loss: 2.6753 - val_accuracy: 0.4437 - val_loss: 4.6519
Epoch 6/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 56ms/step - accuracy: 0.5398 - loss: 2.3798 - val_accuracy: 0.4409 - val_loss: 4.714

In [21]:
print(f"Combined Model 1 Accuracy: {cnn_rnn_accuracy * 100}%")

Accuracy: 64.6950900554657%


#### CNN-RNN Combined Model with Dropout 

In [24]:
def create_cnn_rnn_model_with_dropout(input_shape, num_classes):
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.2),  ## dropout is regularization technique to reduce overfitting; does so by randomly deactivating certain neurons
        Dense(32*128, activation='relu'),
        Reshape((32, 128)),
        SimpleRNN(128, return_sequences=True),
        Dropout(0.2),  ## dropout for RNN
        SimpleRNN(128),
        Dense(num_classes, activation='softmax')
    ])
    return model


input_shape = (32, 128, 1)
num_classes = y.shape[1]
cnn_rnn_model_dropout = create_cnn_rnn_model_with_dropout(input_shape, num_classes)

cnn_rnn_model_dropout.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])
cnn_rnn_model_dropout.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

cnn_rnn_dropout_loss, cnn_rnn_dropout_accuracy = cnn_rnn_model_dropout.evaluate(X, y)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 62ms/step - accuracy: 0.1547 - loss: 6.3510 - val_accuracy: 0.3126 - val_loss: 5.0444
Epoch 2/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 60ms/step - accuracy: 0.3189 - loss: 4.5124 - val_accuracy: 0.3671 - val_loss: 4.7837
Epoch 3/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 78ms/step - accuracy: 0.3688 - loss: 3.9642 - val_accuracy: 0.3876 - val_loss: 4.6952
Epoch 4/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m225s[0m 93ms/step - accuracy: 0.4052 - loss: 3.5734 - val_accuracy: 0.4106 - val_loss: 4.5984
Epoch 5/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m226s[0m 94ms/step - accuracy: 0.4264 - loss: 3.3002 - val_accuracy: 0.4179 - val_loss: 4.7247
Epoch 6/10
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 99ms/step - accuracy: 0.4524 - loss: 3.0681 - val_accuracy: 0.4220 - val_loss: 4.807

In [25]:
print(f"Combined Model 2 (with dropout) Accuracy: {cnn_rnn_dropout_accuracy * 100}%")

Combined Model 2 (with dropout) Accuracy: 55.0936222076416%
