### Optical character recognition using RNNs

In [1]:
!pip install --upgrade numpy
!pip install --upgrade tensorflow

Requirement already up-to-date: numpy in /usr/local/lib/python2.7/site-packages
Requirement already up-to-date: tensorflow in /usr/local/lib/python2.7/site-packages
Requirement already up-to-date: six>=1.10.0 in /usr/local/lib/python2.7/site-packages (from tensorflow)
Requirement already up-to-date: numpy>=1.12.1 in /usr/local/lib/python2.7/site-packages (from tensorflow)
Requirement already up-to-date: tensorflow-tensorboard<0.5.0,>=0.4.0rc1 in /usr/local/lib/python2.7/site-packages (from tensorflow)
Requirement already up-to-date: mock>=2.0.0 in /usr/local/lib/python2.7/site-packages (from tensorflow)
Requirement already up-to-date: enum34>=1.1.6 in /usr/local/lib/python2.7/site-packages (from tensorflow)
Requirement already up-to-date: protobuf>=3.3.0 in /usr/local/lib/python2.7/site-packages (from tensorflow)
Requirement already up-to-date: wheel in /usr/local/lib/python2.7/site-packages (from tensorflow)
Requirement already up-to-date: backports.weakref>=1.0rc1 in /usr/local/lib/p

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [3]:
import os
import gzip
import csv

In [4]:
import numpy as np
import tensorflow as tf

In [5]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

In [6]:
from six.moves import urllib

In [7]:
print(np.__version__)
print(tf.__version__)

1.13.3
1.4.1


In [64]:
URL_PATH = 'http://ai.stanford.edu/~btaskar/ocr/letter.data.gz'
DOWNLOADED_FILENAME = 'letter.data.gz'

def download_data():
    """Fetch the gzipped OCR dataset unless a local copy already exists.

    Downloads ``URL_PATH`` to ``DOWNLOADED_FILENAME`` only when no file with
    that name is present, then prints the source URL and the local filename.
    """
    already_present = os.path.exists(DOWNLOADED_FILENAME)
    if not already_present:
        urllib.request.urlretrieve(URL_PATH, DOWNLOADED_FILENAME)

    print('Found and verified file from this path: ', URL_PATH)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

In [65]:
download_data()

Found and verified file from this path:  http://ai.stanford.edu/~btaskar/ocr/letter.data.gz
Downloaded file:  letter.data.gz


In [66]:
def read_lines():
    """Return every row of the downloaded dataset as a list of string fields.

    The gzip archive named by ``DOWNLOADED_FILENAME`` is opened in text mode
    and parsed as tab-separated values.
    """
    with gzip.open(DOWNLOADED_FILENAME, 'rt') as archive:
        return [row for row in csv.reader(archive, delimiter='\t')]

In [67]:
lines = read_lines()

### Format of every line

* id
* letter
* next_id
* word_id
* position
* fold
* 128 columns of pixel values (one flattened 16x8 image)

In [68]:
lines[0][:8]

['1', 'o', '2', '1', '1', '0', '0', '0']

In [69]:
len(lines)

52152

In [70]:
def get_features_labels(lines):
    """Group per-letter rows into words of pixel grids and letter labels.

    Rows are processed in ascending order of their integer id (column 0).
    Columns 6..133 of each row are parsed as integers and reshaped into a
    16x8 array; column 1 is the letter. A row whose next_id (column 2) is -1
    closes the current word.

    Returns:
        (data, target): parallel lists with one entry per completed word.
        ``data[i]`` is the list of 16x8 arrays and ``target[i]`` the list of
        letters of word ``i``. Rows after the last ``-1`` terminator are not
        emitted.
    """
    data, target = [], []
    letters, images = [], []

    for row in sorted(lines, key=lambda fields: int(fields[0])):
        # next_id == -1 marks the last letter of a word.
        ends_word = int(row[2]) == -1

        image = np.array([int(value) for value in row[6:134]]).reshape((16, 8))
        images.append(image)
        letters.append(row[1])

        if not ends_word:
            continue

        data.append(images)
        target.append(letters)
        letters, images = [], []

    return data, target

In [71]:
data, target = get_features_labels(lines)

In [72]:
def pad_features_labels(data, target):
    """Pad every word to the length of the longest word and stack as arrays.

    Short words are extended with all-zero 16x8 images in ``data`` and with
    empty strings in ``target``. The input lists are not modified.

    Returns:
        (np.ndarray, np.ndarray): the padded data and the padded target.
    """
    longest = max(len(word) for word in target)
    blank_image = np.zeros((16, 8))

    padded_data = []
    for word_images in data:
        missing = longest - len(word_images)
        padded_data.append(word_images + [blank_image] * missing)

    padded_target = []
    for word_letters in target:
        missing = longest - len(word_letters)
        padded_target.append(word_letters + [''] * missing)

    return np.array(padded_data), np.array(padded_target)

In [73]:
padded_data, padded_target = pad_features_labels(data, target)

In [74]:
padded_target[:10]

array([['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', ''],
       ['o', 'm', 'm', 'a', 'n', 'd', 'i', 'n', 'g', '', '', '', '', '']],
      dtype='|S1')

#### The length of each sequence

We've padded all words so that their lengths are all equal to the length of the longest word

In [75]:
sequence_length = len(padded_target[0])

In [76]:
sequence_length

14

In [77]:
padded_data.shape

(6877, 14, 16, 8)

In [78]:
padded_data.shape[:2] + (-1,)

(6877, 14, -1)

In [79]:
reshaped_data = padded_data.reshape(padded_data.shape[:2] + (-1,))

In [80]:
reshaped_data.shape

(6877, 14, 128)

In [81]:
padded_target.shape

(6877, 14)

In [82]:
padded_target.shape + (26,)

(6877, 14, 26)

In [83]:
one_hot_target = np.zeros(padded_target.shape + (26,))

In [84]:
# Set a 1 at the alphabet position of every real letter; padded positions
# (empty strings) keep their all-zero rows.
first_letter_code = ord('a')
for position, symbol in np.ndenumerate(padded_target):
    if not symbol:
        continue
    one_hot_target[position][ord(symbol) - first_letter_code] = 1

#### One-hot representation of the letter 'o'

* The letter 'o' is represented by a 1 at index 14 (`ord('o') - ord('a')`)
* Index positions start at 0

In [85]:
one_hot_target[0][0]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [86]:
# Shuffle the words with a fixed-seed generator so that the train/test split
# derived from this ordering is the same on every run. One permutation is
# applied to both arrays so that data and labels stay aligned.
SHUFFLE_SEED = 42
shuffled_indices = np.random.RandomState(SHUFFLE_SEED).permutation(len(reshaped_data))

shuffled_data = reshaped_data[shuffled_indices]
shuffled_target = one_hot_target[shuffled_indices]

In [87]:
# The first 66% of the shuffled words are used for training; the remainder is
# held out for testing.
split = int(0.66 * len(shuffled_data))

train_data, test_data = shuffled_data[:split], shuffled_data[split:]
train_target, test_target = shuffled_target[:split], shuffled_target[split:]

In [88]:
train_data.shape

(4538, 14, 128)

In [89]:
_, num_steps, num_inputs = train_data.shape

In [90]:
train_target.shape

(4538, 14, 26)

In [91]:
num_classes = train_target.shape[2]

In [92]:
tf.reset_default_graph()

In [93]:
X = tf.placeholder(tf.float64, [None, num_steps, num_inputs])

y = tf.placeholder(tf.float64, [None, num_steps, num_classes])

#### Sequence length calculation

In [94]:
# A time step counts as "used" when any of its pixels is non-zero; padded
# steps are all-zero images, so summing the flags gives each word's length.
# NOTE(review): a real letter image with no set pixels would also be counted
# as unused -- presumably that never occurs in this dataset; verify.
# `axis` replaces the deprecated `reduction_indices` argument.
used = tf.sign(tf.reduce_max(tf.abs(X), axis=2))

length = tf.reduce_sum(used, axis=1)
# Rebinds `sequence_length` (until now the Python int holding the padded word
# length) to a per-example int64 tensor.
sequence_length = tf.cast(length, tf.int64)

In [95]:
sequence_length

<tf.Tensor 'Cast:0' shape=(?,) dtype=int64>

#### RNN for training and prediction

In [96]:
num_neurons = 300

In [97]:
# Two separate GRU cells: one for the forward pass over each word and one for
# the backward pass. `sequence_length` tells the RNN how many steps of each
# example are real letters.
forward_cell = tf.nn.rnn_cell.GRUCell(num_neurons)
backward_cell = tf.nn.rnn_cell.GRUCell(num_neurons)
output, _ = tf.nn.bidirectional_dynamic_rnn(
    cell_fw=forward_cell,
    cell_bw=backward_cell,
    inputs=X,
    sequence_length=sequence_length,
    dtype=tf.float64)

In [98]:
output

(<tf.Tensor 'bidirectional_rnn/fw/fw/transpose:0' shape=(?, 14, 300) dtype=float64>,
 <tf.Tensor 'ReverseSequence:0' shape=(?, 14, 300) dtype=float64>)

In [99]:
output = tf.concat([output[0], output[1]], axis=2)

In [100]:
output.shape

TensorShape([Dimension(None), Dimension(14), Dimension(600)])

#### Shared softmax layer

In [101]:
weight = tf.Variable(tf.truncated_normal([num_neurons * 2, num_classes], stddev=0.01, dtype=tf.float64))

In [102]:
bias = tf.Variable(tf.constant(0.1, shape=[num_classes], dtype=tf.float64))

In [103]:
flattened_output = tf.reshape(output, [-1, num_neurons * 2])

In [104]:
flattened_output

<tf.Tensor 'Reshape:0' shape=(?, 600) dtype=float64>

In [105]:
logits = tf.matmul(flattened_output, weight) + bias

In [106]:
logits_reshaped = tf.reshape(logits, [-1, num_steps, num_classes])

#### Cost calculation

In [107]:
# Use the logits reshaped to [batch, steps, classes] so that logits and labels
# have the same shape, as the op's contract requires. Padded steps have
# all-zero label rows and therefore contribute zero cross-entropy.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits_reshaped, labels=y)

In [108]:
loss = tf.reduce_mean(cross_entropy)

#### Error calculation

In [109]:
# 1.0 at every step where the predicted class differs from the labelled class.
predicted_class = tf.argmax(logits_reshaped, 2)
labelled_class = tf.argmax(y, 2)
mistakes = tf.cast(tf.not_equal(labelled_class, predicted_class), tf.float64)
# Padded steps have all-zero one-hot rows, so the mask is 0 there and those
# steps are never counted as mistakes. `axis` replaces the deprecated
# `reduction_indices` argument.
mask = tf.sign(tf.reduce_max(tf.abs(y), axis=2))
mistakes *= mask

In [110]:
# Per-word error rate: number of wrong letters divided by the word's length.
# `axis` replaces the deprecated `reduction_indices` argument.
mistakes = tf.reduce_sum(mistakes, axis=1)
mistakes /= tf.cast(sequence_length, tf.float64)

In [111]:
error = tf.reduce_mean(mistakes)

#### Optimizer

In [112]:
optimizer = tf.train.RMSPropOptimizer(0.002)

In [113]:
gradient = optimizer.compute_gradients(loss)

In [114]:
optimize = optimizer.apply_gradients(gradient)

In [115]:
def batched(data, target, batch_size):
    """Yield mini-batches forever, reshuffling between epochs.

    NOTE: this notebook defines `batched` again in a later cell; keep the two
    definitions in sync or delete one of them.

    Args:
        data: array of examples, indexed along axis 0.
        target: array of labels aligned with ``data`` along axis 0.
        batch_size: positive number of examples per batch.

    Yields:
        (batch_data, batch_target, epoch) tuples. Every epoch visits each
        example exactly once, in consecutive slices starting at index 0; the
        last batch of an epoch is shorter when the number of examples is not a
        multiple of ``batch_size``. ``epoch`` starts at 0 and is incremented
        after each full pass, at which point data and target are reshuffled
        with one shared permutation.

    Raises:
        ValueError: on first iteration, if ``batch_size`` is not positive or
            there are no examples (either would make the generator spin
            without ever yielding).
    """
    num_examples = target.shape[0]
    if batch_size <= 0:
        raise ValueError('batch_size must be positive')
    if num_examples == 0:
        raise ValueError('cannot build batches from an empty dataset')

    epoch = 0
    while True:
        # Slice first, advance afterwards, so the batch at offset 0 is included.
        for start in range(0, num_examples, batch_size):
            stop = start + batch_size
            yield data[start:stop], target[start:stop], epoch

        # Full pass finished: new epoch, reshuffle data and labels together.
        shuffled_indices = np.random.permutation(num_examples)
        data = data[shuffled_indices]
        target = target[shuffled_indices]
        epoch += 1

In [116]:
batch_size = 10
batches = batched(train_data, train_target, batch_size)

In [117]:
epochs = 5

In [118]:
def batched(data, target, batch_size):
    """Yield mini-batches forever, reshuffling between epochs.

    NOTE: this cell redefines `batched`, which is also defined in an earlier
    cell; this later definition is the one in effect when training runs.

    Args:
        data: array of examples, indexed along axis 0.
        target: array of labels aligned with ``data`` along axis 0.
        batch_size: positive number of examples per batch.

    Yields:
        (batch_data, batch_target, epoch) tuples. Every epoch visits each
        example exactly once, in consecutive slices starting at index 0; the
        last batch of an epoch is shorter when the number of examples is not a
        multiple of ``batch_size``. ``epoch`` starts at 0 and is incremented
        after each full pass, at which point data and target are reshuffled
        with one shared permutation.

    Raises:
        ValueError: on first iteration, if ``batch_size`` is not positive or
            there are no examples (either would make the generator spin
            without ever yielding).
    """
    num_examples = target.shape[0]
    if batch_size <= 0:
        raise ValueError('batch_size must be positive')
    if num_examples == 0:
        raise ValueError('cannot build batches from an empty dataset')

    epoch = 0
    while True:
        # Slice first, advance afterwards, so the batch at offset 0 is included.
        for start in range(0, num_examples, batch_size):
            stop = start + batch_size
            yield data[start:stop], target[start:stop], epoch

        # Full pass finished: new epoch, reshuffle data and labels together.
        shuffled_indices = np.random.permutation(num_examples)
        data = data[shuffled_indices]
        target = target[shuffled_indices]
        epoch += 1

In [119]:
batch_size = 20
batches = batched(train_data, train_target, batch_size)

In [120]:
epochs = 5

In [121]:
with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    # `batches` never terminates on its own; stop as soon as the generator
    # reports that the requested number of epochs has been completed.
    for index, (batch_data, batch_target, epoch) in enumerate(batches):
        if epoch >= epochs:
            break

        feed = {X: batch_data, y: batch_target}
        train_error, _ = sess.run([error, optimize], feed)

        print('{}: {:3.6f}%'.format(index + 1, 100 * train_error))

    # Evaluation only: fetching `optimize` here would apply a gradient update
    # computed from the held-out test data.
    test_feed = {X: test_data, y: test_target}
    test_error = sess.run(error, test_feed)

    print('Test error: {:3.6f}%'.format(100 * test_error))

1: 96.591880%
2: 98.402778%
3: 95.646465%
4: 93.698718%
5: 94.077020%
6: 94.426587%
7: 94.125000%
8: 92.110570%
9: 91.864899%
10: 89.974206%
11: 93.627414%
12: 92.371767%
13: 91.695665%
14: 93.224206%
15: 90.912698%
16: 92.336053%
17: 93.003968%
18: 89.807540%
19: 94.540113%
20: 90.549423%
21: 86.911436%
22: 92.821068%
23: 90.055861%
24: 87.051587%
25: 79.185440%
26: 94.232143%
27: 86.541667%
28: 90.994048%
29: 87.370796%
30: 87.408675%
31: 82.702686%
32: 86.501623%
33: 90.232393%
34: 84.767857%
35: 90.095849%
36: 85.377595%
37: 88.787338%
38: 83.493506%
39: 83.766067%
40: 85.995310%
41: 88.962302%
42: 87.807789%
43: 84.727633%
44: 86.423701%
45: 85.247780%
46: 86.421190%
47: 92.932540%
48: 87.899365%
49: 90.291250%
50: 88.349817%
51: 85.021895%
52: 83.829185%
53: 88.438131%
54: 83.968504%
55: 87.948413%
56: 80.659812%
57: 85.378913%
58: 86.255411%
59: 86.369658%
60: 89.474817%
61: 79.494908%
62: 90.469877%
63: 84.325397%
64: 88.576659%
65: 83.682179%
66: 89.366703%
67: 90.588925%
68: 