In [29]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import pandas as pd 
import tensorflow as tf
from keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, BatchNormalization, Input, Conv2D, MaxPool2D, Lambda, Bidirectional, Dropout, LSTM
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam
from keras import backend as K
from itertools import groupby
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Characters the recogniser can emit; a character's position in this string
# is the class index used by label_to_num / ctc_decoder.
alphabets = (
    "0123456789"
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    " "
)
# Fixed width that label_to_num pads every encoded label to.
max_str_len = 19
# One class per character plus one extra index (len(alphabets)), which
# label_to_num uses as padding and ctc_decoder skips when decoding.
num_of_characters = 1 + len(alphabets)
# NOTE(review): not referenced anywhere else in the visible notebook.
num_of_timestamps = 64
# Prefix prepended to every 'Fpath' entry to build the image path.
default_path = "IAM/img/"
# Batch size applied to both tf.data pipelines.
batch_size = 512

def label_to_num(txt):
    """Encode a text label as a fixed-length vector of class indices.

    Every character of ``txt`` that occurs in ``alphabets`` is replaced by its
    position in that string; characters outside ``alphabets`` are skipped.
    The index list is then right-padded to ``max_str_len`` entries with
    ``len(alphabets)``.

    Args:
        txt: Label text to encode.

    Returns:
        The first (only) row produced by ``pad_sequences``: a 1-D array of
        ``max_str_len`` class indices.
    """
    # str.find yields -1 for characters that are not in the alphabet. This
    # replaces the previous bare ``except: pass``, which skipped unknown
    # characters but also silently hid any unrelated error.
    positions = (alphabets.find(char) for char in txt)
    dig_lst = [pos for pos in positions if pos >= 0]
    # NOTE(review): labels longer than max_str_len are truncated by
    # pad_sequences using its default truncation mode — confirm that the
    # retained part of the label is the intended one.
    return pad_sequences([dig_lst], maxlen=max_str_len, padding='post', value=len(alphabets))[0]

def ctc_decoder(predictions):
    """Greedily decode per-timestep class scores into strings.

    For every sample the highest-scoring class is taken at each step along
    axis 1, consecutive duplicates are collapsed, and the index
    ``len(alphabets)`` is dropped; the remaining indices are looked up in
    ``alphabets``.

    Args:
        predictions: Array-like indexed as [sample, step, class].

    Returns:
        List with one decoded string per sample.
    """
    skip_index = len(alphabets)
    best_path = np.argmax(predictions, axis=2)

    decoded = []
    for steps in best_path:
        # groupby keeps one key per run of identical consecutive indices.
        collapsed = (key for key, _ in groupby(steps))
        decoded.append(
            "".join(alphabets[int(key)] for key in collapsed if key != skip_index)
        )
    return decoded

def num_to_label(num):
    """Decode a sequence of class indices back into text.

    Decoding stops at the first index that does not address a character of
    ``alphabets``. That covers the historical ``-1`` terminator as well as
    the ``len(alphabets)`` padding value written by ``label_to_num``, which
    previously raised ``IndexError``.

    Args:
        num: Iterable of integer class indices.

    Returns:
        The decoded string (empty if the first index is already padding).
    """
    chars = []
    for ch in num:
        # Anything outside [0, len(alphabets)) is padding / a terminator.
        # Negative values must be rejected explicitly, otherwise Python
        # would index from the end of the string.
        if ch < 0 or ch >= len(alphabets):
            break
        chars.append(alphabets[int(ch)])
    return "".join(chars)

def process_single_sample(img_path, label):
    """Load one image file and pair it with its label for the model.

    The file is read, decoded to a single channel, converted to float32 via
    ``tf.image.convert_image_dtype`` and resized to 32x128, matching the
    ``(32, 128, 1)`` input declared for the network.

    Errors from reading or decoding are no longer swallowed: the previous
    bare ``except`` printed "file not found" for *any* failure and returned
    ``None``, which only moved the crash to the caller. The underlying
    TensorFlow error now propagates instead.

    Args:
        img_path: Path of the image file (string or string tensor).
        label: Label passed through unchanged.

    Returns:
        Dict with keys ``"image"`` (the preprocessed image tensor) and
        ``"label"`` (the label as given).
    """
    img = tf.io.read_file(img_path)
    img = tf.io.decode_png(img, channels=1)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, [32, 128])
    return {"image": img, "label": label}
    

In [3]:
# Load the annotation sheet and keep only the image-path and transcription columns.
data = pd.read_excel('data.xlsx')
data = pd.DataFrame(data, columns=['Fpath', 'Identify'])
# Drop incomplete rows *before* casting to str: once a missing value has been
# cast it becomes the literal string 'nan' and dropna() can no longer see it.
data = data.dropna(axis=0).astype(str)

# Reproducible 90/10 split; the validation set is everything not sampled for training.
train = data.sample(frac=0.9, random_state=42)
unique_train = train['Fpath'].unique()
valid = data.drop(train.index)

print(train.shape)
print(valid.shape)
print(data.shape)

(103785, 2)
(11532, 2)
(115317, 2)


In [4]:
# Cap both splits at a fixed number of rows and renumber the rows from zero
# so they can be addressed by position-like labels afterwards.
train = train.iloc[:80000].reset_index(drop=True)
valid = valid.iloc[:8000].reset_index(drop=True)

# Print the distinct characters in the validation labels, then in the
# training labels (vocab ends up holding the training character set).
vocab = set("".join(valid['Identify'].astype(str)))
print(sorted(vocab))
vocab = set("".join(train['Identify'].astype(str)))
print(sorted(vocab))

[' ', '!', '#', '&', '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
[' ', '!', '#', '&', '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
# Take the split sizes from the frames themselves rather than repeating the
# literals used when slicing them: the later per-row loops index
# 0..size-1 and would fail if a frame held fewer rows than a hard-coded size.
train_size = len(train)
valid_size = len(valid)
print(valid_size)

8000


In [6]:
# Image paths for each split: the 'Fpath' entry prefixed with default_path.
valid_x = [default_path + valid.loc[row, 'Fpath'] for row in range(valid_size)]
train_x = [default_path + train.loc[row, 'Fpath'] for row in range(train_size)]

# Encoded label vectors, in the same row order as the path lists above.
valid_y = [label_to_num(valid.loc[row, 'Identify']) for row in range(valid_size)]
train_y = [label_to_num(train.loc[row, 'Identify']) for row in range(train_size)]
    

In [7]:
def _build_dataset(paths, labels, shuffle=False):
    """Build a batched, prefetched tf.data pipeline of preprocessed samples.

    Args:
        paths: Sequence of image file paths.
        labels: Sequence of encoded labels, aligned with ``paths``.
        shuffle: When True, reshuffle the (path, label) pairs on every pass.

    Returns:
        A ``tf.data.Dataset`` yielding batches of the dicts produced by
        ``process_single_sample``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((paths, labels))
    if shuffle:
        # Shuffle before decoding so the buffer only holds paths and label
        # vectors, not images. max(..., 1) keeps the buffer size valid for
        # an empty input.
        dataset = dataset.shuffle(buffer_size=max(len(paths), 1))
    return (
        dataset.map(
            process_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
        )
        .batch(batch_size)
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    )


# Keras ignores fit(shuffle=...) for tf.data inputs, so the training order has
# to be randomised here. Validation order is left untouched because the
# evaluation cell pairs predictions with labels by position.
train_dataset = _build_dataset(train_x, train_y, shuffle=True)
valid_dataset = _build_dataset(valid_x, valid_y)

In [8]:
# Inspect the first encoded validation label. Entries equal to len(alphabets)
# are the padding appended by label_to_num, not characters.
valid_y[0]

array([17, 10, 28, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63,
       63, 63])

In [9]:
class CTCLayer(layers.Layer):
    """Pass-through layer that registers the CTC loss on the model.

    The layer receives the labels and the per-timestep predictions, adds the
    result of ``K.ctc_batch_cost`` through ``add_loss`` and returns the
    predictions unchanged.
    """

    def __init__(self, name=None, **kwargs):
        """Create the layer.

        Args:
            name: Optional layer name.
            **kwargs: Standard ``Layer`` keyword arguments (for example
                ``trainable`` or ``dtype``). Accepting them lets the layer be
                rebuilt from its config; without them reconstruction fails
                with a ``TypeError`` on the unexpected keywords.
        """
        super().__init__(name=name, **kwargs)
        self.loss_fn = K.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Add the CTC loss for this batch and return ``y_pred`` unchanged.

        Args:
            y_true: Label tensor indexed as [sample, label position].
            y_pred: Prediction tensor indexed as [sample, step, class].

        Returns:
            ``y_pred``, untouched.
        """
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # ctc_batch_cost expects one length per sample, shaped (batch, 1).
        # NOTE(review): every sample is declared to use the full width of
        # y_true, including any padding positions — confirm the loss handles
        # the padding value as intended.
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        return y_pred

In [10]:
# Network definition: a convolutional feature extractor over a 32x128
# single-channel image, a stack of bidirectional LSTMs over the resulting
# column sequence, and a per-step softmax whose output feeds the CTC loss layer.
input_data = Input(shape=(32, 128, 1), name='image')
# Second input carrying the encoded label; it is consumed only by CTCLayer.
labels = layers.Input(name="label", shape=(None,), dtype="float32")

# Two conv + 2x2 pooling stages: each pooling halves both spatial dimensions.
inner = Conv2D(32, (3, 3), padding='same', name='conv1', activation='selu')(input_data)
inner = MaxPool2D(pool_size=(2, 2), name='max1')(inner)

inner = Conv2D(64, (3, 3), padding='same', name='conv2', activation='selu')(inner)
inner = MaxPool2D(pool_size=(2, 2), name='max2')(inner)

# 'same'-padded convolutions: spatial size is unchanged, only channels grow.
inner = Conv2D(128, (3, 3), padding='same', name='conv3', activation='selu')(inner)
inner = Conv2D(128, (3, 3), padding='same', name='conv4', activation='selu')(inner)

inner = Conv2D(512, (3, 3), padding='same', name='conv5', activation='selu')(inner)
inner = Conv2D(512, (3, 3), padding='same', name='conv6', activation='selu')(inner)
inner = Dropout(0.2)(inner)

inner = Conv2D(512, (3, 3), padding='same', name='conv7', activation='selu')(inner)
inner = Conv2D(512, (3, 3), padding='same', name='conv8', activation='selu')(inner)
# (2, 1) pooling reduces only the first spatial axis (height), keeping the width.
inner = MaxPool2D(pool_size=(2, 1), name='max8')(inner)

inner = Conv2D(256, (3, 3), padding='same', name='conv9',  activation='selu')(inner)
inner = BatchNormalization()(inner)
inner = Dropout(0.2)(inner)

inner = Conv2D(256, (3, 3), padding='same', name='conv10', activation='selu')(inner)
inner = BatchNormalization()(inner)
inner = MaxPool2D(pool_size=(2, 1), name='max10')(inner)
inner = Dropout(0.2)(inner)

# No padding argument here, unlike the other convolutions.
# NOTE(review): presumably this shrinks the map to height 1 so that the
# squeeze below is valid — confirm against train_model.summary().
inner = Conv2D(64, (2,2), name='conv11', activation='selu')(inner)
inner = Dropout(0.2)(inner)

# Remove axis 1 so the remaining axes form a sequence for the LSTMs.
squeezed = Lambda(lambda x: K.squeeze(x, 1))(inner)

# Recurrent stack; return_sequences=True keeps one output per step, which the
# per-step classifier below requires.
inner = Bidirectional(LSTM(128, return_sequences=True), name='lstm1')(squeezed)
inner = Bidirectional(LSTM(512, return_sequences=True), name='lstm2')(inner)
inner = Bidirectional(LSTM(512, return_sequences=True), name='lstm3')(inner)
inner = Bidirectional(LSTM(512, return_sequences=True), name='lstm4')(inner)
inner = Bidirectional(LSTM(128, return_sequences=True), name='lstm5')(inner)

# Per-step class distribution over num_of_characters classes.
dense_= Dense(128,activation = 'relu')(inner)
y_pred = Dense(num_of_characters,activation = 'softmax', name='dense2')(dense_)
# CTCLayer adds the loss and passes y_pred through unchanged.
output = CTCLayer(name="ctc_loss",)(labels, y_pred)

In [28]:
# Two models over the same layer graph:
#  - `model` maps an image to per-step predictions (used for inference);
#  - `train_model` additionally takes the labels and ends in the CTC loss layer.
model = Model(inputs=input_data, outputs=y_pred)
train_model = Model(inputs=[input_data, labels], outputs=output)
train_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 image (InputLayer)             [(None, 32, 128, 1)  0           []                               
                                ]                                                                 
                                                                                                  
 conv1 (Conv2D)                 (None, 32, 128, 32)  320         ['image[0][0]']                  
                                                                                                  
 max1 (MaxPooling2D)            (None, 16, 64, 32)   0           ['conv1[0][0]']                  
                                                                                                  
 conv2 (Conv2D)                 (None, 16, 64, 64)   18496       ['max1[0][0]']             

In [None]:
# No loss or metrics are passed to compile(): the CTC loss is attached inside
# the graph by CTCLayer.add_loss, and the datasets yield no separate targets,
# so a Keras `Accuracy` metric would have nothing to compare against.
train_model.compile(
    optimizer=Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, clipnorm=1.0)
)

filepath = "best_model.h5"

# Keep the weights of the epoch with the lowest validation loss on disk.
checkpoint = ModelCheckpoint(filepath=filepath,
                             monitor='val_loss',
                             verbose=1, save_best_only=True, save_weights_only=True, mode='auto')

# restore_best_weights rolls the model back to its best epoch when early
# stopping triggers, so the model saved below is not simply the last epoch.
earlyStopping = EarlyStopping(monitor='val_loss', mode='auto', patience=15,
                              restore_best_weights=True)

callbacks_list = [checkpoint, earlyStopping]

# `shuffle` is intentionally not passed: Keras ignores it for tf.data inputs,
# so any shuffling has to happen in the dataset pipeline itself.
# NOTE(review): with epochs=1 the early-stopping patience of 15 can never
# take effect — raise epochs for a real training run.
history = train_model.fit(train_dataset,
                          epochs=1,
                          validation_data=valid_dataset,
                          verbose=1,
                          callbacks=callbacks_list)

train_model.save('my_model.h5')

In [39]:
# Load the weights from the file written by train_model.save(...) into the
# inference model.
model.load_weights('my_model.h5')

In [46]:
# Decode every validation batch with the inference model.
prediction = []
for batch in valid_dataset.as_numpy_iterator():
    # The inference model has a single "image" input; pass only that instead
    # of the whole batch dict, which also carries the labels.
    preds = model.predict(batch["image"])
    prediction.extend(ctc_decoder(preds))

# .iloc is end-exclusive. The previous .loc[0:valid_size] slice was inclusive
# and selected one row too many whenever such a row existed.
y_true = valid['Identify'].iloc[:valid_size]
correct_char = 0
total_char = 0
correct = 0
for pr, tr in zip(prediction, y_true):
    total_char += len(tr)
    # Characters are compared position by position over the shorter string.
    correct_char += sum(p == t for p, t in zip(pr, tr))
    if pr == tr:
        correct += 1

# max(..., 1) avoids a ZeroDivisionError on an empty validation set.
print('Correct characters predicted : %.2f%%' % (correct_char * 100 / max(total_char, 1)))
print('Correct words predicted      : %.2f%%' % (correct * 100 / max(valid_size, 1)))

Correct characters predicted : 81.32%
Correct words predicted      : 68.17%


In [47]:
# Preprocess one standalone image and run it through the inference model as
# a batch containing a single sample.
img = process_single_sample("would.jpg", "would")
single_batch = np.asarray([img["image"]])
pred = model.predict(single_batch)



In [48]:
# Decode the single-image prediction; ctc_decoder returns one string per sample.
ctc_decoder(pred)

['would']