# Importing required libraries

In [1]:
import os

import numpy as np
from PIL import Image

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Function for loading the data.

In [6]:
def load_data(data_dir):
    images = []
    labels = []

    for filename in os.listdir(data_dir):
        if filename.endswith(".png"):
            img_path = os.path.join(data_dir, filename)
            label_str = filename.split(".")[0]
            label = [int(digit) for digit in label_str]

            img = Image.open(img_path).convert('L')  # Convert to grayscale
            img = np.array(img) / 255.0  # Normalize pixel values to the range [0, 1]

            images.append(img)
            labels.append(label)
    return np.array(images), np.array(labels)

# Function containing the model architecture

In [7]:
def ocr(image_height, image_width):
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(image_height, image_width, 1)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Flatten())
    model.add(layers.RepeatVector(6))  # Repeat the vector 6 times to match the sequence length
    model.add(layers.LSTM(64, return_sequences=True))
    model.add(layers.TimeDistributed(layers.Dense(10, activation='softmax')))
    return model

# Printing the Summary of the model

# Loading the images and logits using load_data() funtion.

In [8]:
data_path = 'captcha_assignment_images/Gujarat_Rural_Captchas/'
images, labels = load_data(data_path)

# Creating ImageDataGenerator object for Image Data Augmentation.

In [10]:
datagen = ImageDataGenerator(
    rotation_range=10,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=False
)

# Splitting the data into train, test and val set

In [9]:
# Convert labels to padded sequences
# labels = preprocessing.sequence.pad_sequences(labels, padding='post', value=-1, maxlen=7)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

X_train.shape, X_test.shape, X_val.shape, y_train.shape, y_test.shape, y_val.shape

((1419, 80, 190),
 (177, 80, 190),
 (178, 80, 190),
 (1419, 6),
 (177, 6),
 (178, 6))

# Converting Y logits to one hot encoding

In [10]:
# Convert labels to one-hot encoding
y_train_one_hot = tf.keras.utils.to_categorical(y_train, num_classes=10)
y_test_one_hot = tf.keras.utils.to_categorical(y_test, num_classes=10)
y_val_one_hot = tf.keras.utils.to_categorical(y_val, num_classes=10)

# Defining batch_size, reshaping X and Y data and creating generator objects for train, test and val.

In [88]:
# Train the model using tf.data.Dataset
batch_size = 16
X_train = X_train.reshape(-1, 80, 190, 1)
X_test = X_test.reshape(-1, 80, 190, 1)
X_val = X_val.reshape(-1, 80, 190, 1)

y_train_one_hot = y_train_one_hot.reshape(-1, 6, 10)
y_test_one_hot = y_test_one_hot.reshape(-1, 6, 10)
y_val_one_hot = y_val_one_hot.reshape(-1, 6, 10)

train_generator = datagen.flow(X_train, y_train_one_hot, batch_size=batch_size)
test_generator = datagen.flow(X_test, y_test_one_hot, batch_size=batch_size)
val_generator = datagen.flow(X_val, y_val_one_hot, batch_size=batch_size)

# Initializing the model, saving it's architecture to a json object, initializing optimizer, compiling model

In [None]:
model = ocr(X_train.shape[1], X_train.shape[2])

# Converting model to json object for saving model architecture.
model_json = model.to_json()

# Saving model architecture.
with open('ocr_model.json', 'w') as json_file:
    json_file.write(model_json)

# Initializing Optimizer.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
# Initializing EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Compile the model
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 188, 78, 32)       320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 94, 39, 32)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 92, 37, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 46, 18, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 44, 16, 128)       73856     
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 22, 8, 128)        0

# Training the model

In [118]:
# Train the model
model.fit(train_generator, epochs=100, batch_size=batch_size, validation_data=val_generator, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100


<keras.src.callbacks.History at 0x7f4428502d40>

# saving the model weights

In [136]:
model.save_weights('models/ocr_100_epoch_0.001_lr.h5')

# Evaluating the model

In [119]:
test_loss, test_acc = model.evaluate(test_generator)
print('test loss, test acc: ', np.round(test_loss, 3), np.round(test_acc, 3))

test loss, test acc:  0.154 0.961


# Making prediction on test data

In [120]:
prediction = model.predict(X_test)



# Extracting logits from the prediction probabilities

In [121]:
logits = np.argmax(prediction, axis=-1)

array([1, 0, 5, 0, 0, 5])

# Flattening the logits and y_test for generating metrices

In [122]:
flatten_logits = logits.flatten()
flatten_y_test = y_test.flatten()

In [23]:
flatten_logits = logits.flatten()
flatten_y_test = y_test.flatten()

# Generating evaluation metrices.

In [66]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

accuracy = accuracy_score(flatten_logits, flatten_y_test)
confusion_mat = confusion_matrix(flatten_logits, flatten_y_test)

# Calculate precision, recall, and F1 score for each class
precision = precision_score(flatten_y_test, flatten_logits, average=None)
recall = recall_score(flatten_y_test, flatten_logits, average=None)
f1 = f1_score(flatten_y_test, flatten_logits, average=None)

print('confusion_mat: ', confusion_mat)
print('accuracy: ', np.round(accuracy, 3))
print("precision: ", np.round(precision, 3))
print('recall: ', np.round(recall, 3))
print('f1: ', np.round(f1, 3))

confusion_mat:  [[168   0   1   0   0   0   0   0   0   0]
 [  0 328   1   0   1   0   0   0   0   0]
 [  0   0  64   5   0   0   0   1   0   0]
 [  0   1   0  62   0   0   0   2   0   0]
 [  0   0   0   0  67   0   0   0   0   0]
 [  0   0   0   0   0  74   0   0   0   0]
 [  0   0   0   0   0   0  77   0   0   0]
 [  1   0   0   0   1   0   0  67   0   0]
 [  0   1   0   0   0   0   0   0  70   0]
 [  0   0   0   0   1   0   0   0   0  69]]
accuracy:  0.985
precision:  [0.994 0.994 0.914 0.954 1.    1.    1.    0.971 0.986 0.986]
recall:  [0.994 0.994 0.97  0.925 0.957 1.    1.    0.957 1.    1.   ]
f1:  [0.994 0.994 0.941 0.939 0.978 1.    1.    0.964 0.993 0.993]
