In [14]:
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPool2D, CuDNNLSTM, CuDNNGRU, Bidirectional, TimeDistributed, Reshape, Lambda
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from tensorflow.keras.backend import ctc_decode, get_session, ctc_batch_cost
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence, multi_gpu_model
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from itertools import groupby
from glob import iglob
import editdistance
import numpy as np
import sys

In [2]:
ngpus = 4

image_list = list(iglob('../dataset/*/*'))
image_list.sort()
print(f'=> Found {len(image_list)} images <=')

max_w = 1928
max_h = 64
print(f'=> Max width of images {max_w} <=')

labels = ' '.join([x.split('/')[-1].split('_')[0] for x in image_list])
vocab = sorted(list(set(labels)))
vocab_size = len(vocab)
print(f'=> Vocab size of dataset {vocab_size} <=')

letter_idx = {x: idx for idx, x in enumerate(vocab)}
idx_letter = {v: k for k, v in letter_idx.items()}
idx_letter[len(idx_letter)] = ''

string_lens = [len(x) for x in [x.split('/')[-1].split('_')[0] for x in image_list]]
max_string_len = max(string_lens)
print(f'=> Max string len {max_string_len} <=')

=> Found 102000 images <=
=> Max width of images 1928 <=
=> Vocab size of dataset 68 <=
=> Max string len 80 <=


In [3]:
h, w = max_w, max_h

def ctc_loss(tensor_list):
    y_pred, y_true, input_length, label_length = tensor_list
    y_pred = y_pred[:,2:,:]
    return ctc_batch_cost(y_true, y_pred,input_length,label_length)

def dummy_loss(y_true, y_pred):
    return y_pred

downscale_factor = 4
print('=> Building model <=')
base_model = ResNet50(weights=None, include_top=False, input_shape=(h,w,1))
conv_features = base_model.get_layer('activation_9').output
conv_features = Conv2D(filters=128, kernel_size=3, padding='same', kernel_initializer='he_normal')(conv_features)
y = Reshape(target_shape=(h // downscale_factor, w // downscale_factor * 128), name='reshape')(conv_features)
y = Bidirectional(CuDNNLSTM(units=512, return_sequences=True),
                  name='biLSTM_1')(y)
y = Bidirectional(CuDNNLSTM(units=512, return_sequences=True),
                  name='biLSTM_2')(y)
output_layer = TimeDistributed(Dense(
    units=vocab_size+1, kernel_initializer='he_normal', activation='softmax'), name='char_output')(y)

labels = Input(shape=(max_string_len, ))
label_length = Input(shape=(1,))
input_length = Input(shape=(1,))
loss_layer = Lambda(ctc_loss, output_shape=(1,), name='loss_layer')(
    [output_layer, labels, input_length, label_length])
input_tensors = [base_model.input, labels, label_length, input_length]
train_model = Model(inputs=input_tensors, outputs=loss_layer)
print('=> Build completed successfully <=')
print(f'=> Creating model replicas for distributed training across {ngpus} gpus <=')

pmodel = multi_gpu_model(train_model, ngpus)
pmodel.load_weights('weights_backup/top_weights.h5')

model = Model(inputs=base_model.input, outputs=output_layer)
sess = get_session()

=> Building model <=
Instructions for updating:
Colocations handled automatically by placer.




Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.
=> Build completed successfully <=
=> Creating model replicas for distributed training across 4 gpus <=


In [4]:
test_images = list(iglob('../dataset/val/*'))
test_images.sort()
print(f'Found {len(test_images)} test images')

Found 1000 test images


In [5]:
def prepare_input(x):
    h, w, _ = np.array(load_img(x)).shape
    new_h = 64
    new_w = int(np.round((64/h) * w, 0))
    img = img_to_array(load_img(x, color_mode='grayscale', target_size=(new_h, new_w)))[:,:,0]/255.
    blank_img = np.zeros((64, max_w))
    blank_img[:img.shape[0], :img.shape[1]] = img
    blank_img = blank_img.T
    blank_img = np.expand_dims(blank_img, axis=-1)
    blank_img = np.expand_dims(blank_img, axis=0)
    return blank_img, img
labels = [x.split('/')[-1].split('_')[0] for x in test_images]

In [11]:
p = []
for idx,i in tqdm_notebook(enumerate(test_images)):
    inp_img, img = prepare_input(i)
    ctc_matrix = model.predict(inp_img)
    input_len = np.expand_dims((img.shape[1]//4) - 2, 0)
    preds = model.predict(inp_img)
    out = np.argmax(preds[0][2:input_len[0]], axis=-1)
    decoded_labels = ''.join([idx_letter[i] for i,x in groupby(out)])
#     preds = sess.run(ctc_decode(ctc_matrix[:,2:,:], input_len, greedy=True, beam_width=100, top_paths=1))
#     decoded_labels = ''.join([idx_letter[i] for i,x in groupby(preds[0][0][0])])
    p.append(decoded_labels)
    plt.figure()   
    plt.axis('off')
    plt.title(f'Predicted: {decoded_labels}\nActual: {labels[idx]}', wrap=True, loc='left')
    plt.imshow(img, cmap='gray')
    plt.savefig(f'outputs/{idx}.png')
    plt.close()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [19]:
x = 0
distances = []
for i in range(len(p)):
    if p[i] == labels[i]:
        x+=1
    distances.append(editdistance.distance(p[i], labels[i]))
print(f'Accuracy (exact match ) : {x/len(p)}')
print(f'Average editdistance    : {sum(distances)/len(p)}')

Accuracy (exact match ) : 0.43
Average editdistance    : 3.815
